Scraping Data at Scale

The scraper works in three steps: collect the channel (category) page links, collect the individual item links from each channel's listing pages, and fetch each item's details, storing the results in MongoDB.
# Get the channel links, e.g. the phone and home-appliance category pages
import requests
from bs4 import BeautifulSoup

start_url = 'http://bj.58.com/sale.shtml'

def get_channel_url(url):
    # Fetch the category index page and print every channel link
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    links = soup.select('ul.ym-submnu > li > b > a')
    for link in links:
        page_url = 'http://bj.58.com' + link.get('href')
        print(page_url)
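For the later steps it is handier to collect these channel URLs than to print them. A minimal sketch of a collecting variant, reusing the imports above (the get_channel_list name and channel_list variable are my own, not from the source):

def get_channel_list(url=start_url):
    # Same selector as get_channel_url, but return the URLs as a list
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    return ['http://bj.58.com' + link.get('href')
            for link in soup.select('ul.ym-submnu > li > b > a')]

channel_list = get_channel_list()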
# Get the links to individual items
import time
import pymongo

# MongoDB connection and collections
client = pymongo.MongoClient(host='localhost', port=27017)
ceshi = client['ceshi']
url_list = ceshi['url_lists']
item_info = ceshi['item_info']

def get_links_from(channel, pages, who_sells=0):
    # Parameters: channel link, page number, merchant-or-individual flag.
    # Listing URLs follow the pattern http://bj.58.com/shouji/0/pn5/ :
    # the channel link, then 0 or 1 for merchant or individual, then the page number.
    link_view = '{}{}/pn{}/'.format(channel, str(who_sells), str(pages))
    wb_data = requests.get(link_view)
    time.sleep(1)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    if soup.find('td', 't'):
        for link in soup.select('td.t > a.t'):
            item_link = link.get('href').split('?')[0]
            url_list.insert_one({'url': item_link})  # store in the database
            print(item_link)
    else:
        pass
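get_links_from only fetches one listing page, so a driver has to walk the page numbers for every channel; with many channels, fanning the work out over worker processes speeds things up. A minimal sketch, assuming at most 100 pages per channel (the page cap, the get_all_links_from name, and the Pool driver are my own additions, not from the source):

from multiprocessing import Pool

def get_all_links_from(channel):
    # Walk one channel's paginated listing; get_links_from stores
    # each item URL in MongoDB as a side effect
    for page in range(1, 101):
        get_links_from(channel, page)

if __name__ == '__main__':
    pool = Pool()
    pool.map(get_all_links_from, channel_list)  # channel_list from the sketch above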
# Get the item details
def get_item_info(url):
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    # Check whether the listing is gone: expired pages load a script
    # whose src path contains '404'
    no_longer_exist = '404' in soup.find('script', type='text/javascript').get('src').split('/')
    if no_longer_exist:
        pass
    else:
        # Selector paths copied straight from the browser's inspector
        title = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.box_left_top > h1')[0].text
        price = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.info_massege.left > div.price_li > span > i')[0].text
        # area = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.info_massege.left > div.palce_li > span > i')[0].text  # disabled in the source
        item_info.insert_one({'title': title, 'price': price})  # store in the database
        print({'title': title, 'price': price})
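The last step is to replay the stored URLs through get_item_info. A minimal sketch of that loop, reading back from the url_lists collection (the loop itself is my own, not from the source):

for record in url_list.find():
    get_item_info(record['url'])

In practice you would also want to skip URLs whose details are already in item_info, so an interrupted run can resume where it left off.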

Reposted from www.cnblogs.com/szhao0823/p/9178589.html