今天为大家介绍用selenium自动化工具抓取淘宝美食
1.直接上代码
# coding: utf-8
"""Scrape Taobao search results for "美食" with Selenium and store them in MongoDB.

The script drives a PhantomJS browser: it submits the search on the Taobao
home page, walks through the result pages via the pager's "jump to page"
box, parses every product card with PyQuery and inserts one document per
product into the collection configured in ``config.py``.
"""
import re

import pymongo
from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from config import MONGO_DB, MONGO_TABLE, MONGO_URL, SERVICE_ARGS

client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]

# PhantomJS runs without a visible window; swap in webdriver.Chrome() or
# webdriver.Firefox() here to watch the run in a real browser.
browser = webdriver.PhantomJS(
    r'D:\phantomjs\bin\phantomjs.exe',
    service_args=SERVICE_ARGS,
)
wait = WebDriverWait(browser, 10)
browser.set_window_size(1400, 900)


def search():
    """Submit the "美食" search and scrape the first result page.

    Returns:
        The text of the ``.total`` pager element, from which ``main``
        extracts the page count.

    A Selenium wait timeout restarts the whole search from the home page.
    """
    print('正在搜索')
    try:
        browser.get('https://www.taobao.com')
        # Wait until the search box exists and the button can be clicked.
        search_box = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "#q"))
        )
        submit = wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, ".btn-search"))
        )
        search_box.send_keys("美食")
        submit.click()
        total = wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, ".total"))
        )
        get_products()
        return total.text
    except TimeoutException:
        # WebDriverWait raises selenium's TimeoutException, not the builtin
        # TimeoutError, so this is the exception that must trigger the retry.
        return search()


def next_page(page_number):
    """Jump to result page ``page_number`` and scrape it.

    Args:
        page_number: Page number typed into the pager's input box.

    A Selenium wait timeout retries the same page.
    """
    print('正在翻页', page_number)
    try:
        page_box = wait.until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, "input.input:nth-child(2)")
            )
        )
        submit = wait.until(
            EC.element_to_be_clickable(
                (By.CSS_SELECTOR, "span.btn:nth-child(4)")
            )
        )
        page_box.clear()
        page_box.send_keys(str(page_number))
        submit.click()
        # The jump is complete once the highlighted pager item shows the
        # requested page number.
        wait.until(
            EC.text_to_be_present_in_element(
                (By.CSS_SELECTOR, 'li.active > span'), str(page_number)
            )
        )
        get_products()
    except TimeoutException:
        next_page(page_number)


def get_products():
    """Parse every product card on the current page and save each one."""
    wait.until(
        EC.presence_of_element_located(
            (By.CSS_SELECTOR, ".m-itemlist .items .item")
        )
    )
    doc = pq(browser.page_source)
    for item in doc(".m-itemlist .items .item").items():
        product = {
            'image': item.find('.pic .img').attr('src'),
            'price': item.find('.price').text(),
            # Drops the last three characters of the deal counter text,
            # presumably a "人付款" suffix -- TODO confirm against the page.
            'deal': item.find('.deal-cnt').text()[:-3],
            'title': item.find('.title').text(),
            'shop': item.find('.shop').text(),
            'location': item.find('.location').text(),
        }
        save_to_mongo(product)


def save_to_mongo(result):
    """Insert one product document into the configured collection.

    Args:
        result: Product dict built by ``get_products``.

    Failures are reported and swallowed so that one bad record does not
    stop the crawl.
    """
    try:
        if db[MONGO_TABLE].insert_one(result):
            print("存储到MONGODB成功", result)
    except Exception as exc:
        print('存储到数据库失败', result, exc)


def main():
    """Run the crawl: search, visit every remaining page, then shut down."""
    try:
        total = search()
        # The pager text contains the page count; take its first integer.
        total = int(re.search(r'(\d+)', total).group(1))
        for page in range(2, total + 1):
            next_page(page)
    except Exception as exc:
        print("出错啦", exc)
    finally:
        # Shut the driver down exactly once, on success and failure alike.
        browser.quit()


if __name__ == '__main__':
    main()
2.config文件(config.py),用于定义MongoDB连接信息和PhantomJS启动参数等配置
# -*- coding: utf-8 -*-
"""Settings shared by the Taobao scraper: MongoDB target and PhantomJS flags."""

MONGO_URL = 'localhost'
MONGO_DB = 'taobao'  # database name
MONGO_TABLE = 'product'  # collection ("table") name

# PhantomJS command-line switches: do not load images, enable the disk cache.
SERVICE_ARGS = ['--load-images=false', '--disk-cache=true']