爬虫参考资料

https://www.cnblogs.com/dayouzi/archive/2018/12/02/10054913.html

https://www.yuque.com/kouss/taoke/wwkwwr

https://my.oschina.net/codingDog/blog/983302

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from pyquery import PyQuery
import random
import time
from selenium.webdriver.support.ui import WebDriverWait
# Set to True to run Firefox without a visible window. Keep it False for the
# normal flow: loginTaobao() waits for the user to log in by hand in the
# browser window.
HEADLESS = False

firefoxOptions = webdriver.FirefoxOptions()
if HEADLESS:
    firefoxOptions.add_argument('-headless')  # headless browser

# Single shared driver instance used by every function in this script.
browser = webdriver.Firefox(options=firefoxOptions)

# JavaScript run after page loads to make navigator.webdriver report false,
# which the site may otherwise use to detect browser automation.
jswd = "Object.defineProperties(navigator,{webdriver:{get:() => false}})"
def _scrapeCurrentPage():
    """Scroll through the current result page, then parse and save its items."""
    # Scroll down in 100px steps with a short pause between steps --
    # presumably so that lazily rendered items are present in page_source.
    for ieHeight in range(1, 98):
        js = 'window.scrollTo(0,' + str(ieHeight * 100) + ')'
        browser.execute_script(js)
        time.sleep(0.05)

    tbHtml = browser.page_source
    dataList = parsHtml(html=tbHtml)
    saveData(dataList)


def searchTaobao(searchName, maxPages=99):
    """Search Taobao for ``searchName`` and scrape the result pages.

    Opens the Taobao home page, types the keyword into the search box,
    scrapes the first result page, then jumps to pages 2..``maxPages`` through
    the pager's page-number input and scrapes each of them.

    Args:
        searchName: Keyword typed into the search box.
        maxPages: Last result page to visit (inclusive). The default of 99
            matches the previously hard-coded range.
    """
    pagerInput = '#mainsrp-pager > div > div > div > div.form > input'
    pagerSubmit = '#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit'

    tbWait()
    browser.get(url='https://www.taobao.com')
    browser.execute_script(jswd)

    text = browser.find_element_by_css_selector("#q")
    text.send_keys(searchName)
    text.send_keys(Keys.ENTER)
    tbWait()

    _scrapeCurrentPage()

    for pageSize in range(2, maxPages + 1):
        # Type the target page number into the pager and submit it.
        pageBox = browser.find_element_by_css_selector(pagerInput)
        pageBox.clear()
        pageBox.send_keys(str(pageSize))
        browser.find_element_by_css_selector(pagerSubmit).click()

        tbWait()
        _scrapeCurrentPage()
    print('[+] 搜索完毕,数据获取完成')

def loginTaobao():
    """Open the Taobao login page and block until the user confirms login.

    The login itself is done by hand in the browser window. This function
    only opens the page, switches to password login when the switch link is
    present, and then prompts on the console until the user enters ``e``.
    """
    # Function-scope import: only this function needs the exception type.
    from selenium.common.exceptions import TimeoutException

    browser.get('https://login.taobao.com/member/login.jhtml')  # open the login page

    # The login page sometimes shows QR-code login, in which case the link
    # that switches to password login has to be clicked. until() needs a
    # callable; it retries the lookup for up to 10 seconds.
    try:
        switchLink = WebDriverWait(browser, 10).until(
            lambda driver: driver.find_element_by_class_name('J_Quick2Static')
        )
    except TimeoutException:
        # No switch link appeared; carry on with whatever form is shown.
        switchLink = None
    if switchLink is not None:
        switchLink.click()

    browser.execute_script(jswd)

    # Keep prompting until the user reports that login is complete.
    loginMsg = input('[-]登录:e is login w is wait :').strip()
    while loginMsg != 'e':
        loginMsg = input('[-]登录:e is login w is wait :').strip()
    print('[+]登录,用户登录成功,数据开始爬取....')
def tbWait(minSeconds=5, maxSeconds=10):
    """Sleep for a random whole number of seconds.

    Args:
        minSeconds: Smallest possible delay, inclusive.
        maxSeconds: Largest possible delay, inclusive.

    Raises:
        ValueError: If ``minSeconds`` is greater than ``maxSeconds``
            (raised by ``random.randint``).
    """
    waitTime = random.randint(minSeconds, maxSeconds)
    time.sleep(waitTime)
def parsHtml(html):
    """Extract the listed items from a search-result page.

    Args:
        html: HTML source of a result page.

    Returns:
        A list of dicts with the keys ``'name'`` and ``'price'``, one per
        element matched under ``#mainsrp-itemlist``. Each dict is also
        printed as it is collected.
    """
    dataList = []
    doc = PyQuery(html)
    # '#mainsrp-itemlist' is the id selector of the item list container.
    items = doc('#mainsrp-itemlist > div > div > div > div').items()
    for item in items:
        temp = {
            'name': item('div:nth-child(2) > div:nth-child(2) > a').text(),
            'price': item('div:nth-child(2) > div > div').text(),
        }
        dataList.append(temp)
        print(temp)
    return dataList
def saveData(datas, path='taobao_data.jsonl'):
    """Append scraped records to a text file, one JSON object per line.

    Args:
        datas: Iterable of JSON-serializable records (the dicts built by
            parsHtml).
        path: File to append to. It is created if missing and opened as
            UTF-8 so that non-ASCII item names are stored readably.
    """
    # Function-scope import: only this function needs json.
    import json

    if not datas:
        # Nothing to write; do not create an empty file.
        return

    # Open once for the whole batch instead of once per record.
    with open(path, 'a', encoding='utf-8') as fo:
        for data in datas:
            fo.write(json.dumps(data, ensure_ascii=False) + '\n')

# Script entry point: log in by hand, scrape the search results, and always
# shut the browser down afterwards.
if __name__ == '__main__':
    try:
        loginTaobao()
        searchTaobao('thinkpad笔记本')
    finally:
        # quit() closes every window and ends the driver session, even when
        # the scrape fails part-way; close() would only close the window.
        browser.quit()

猜你喜欢

转载自www.cnblogs.com/codetree/p/10269984.html