Python爬虫 —— 知乎之selenium模拟登陆+requests.Session()获取cookies

代码如下:

 1 # coding:utf-8
 2 from selenium import webdriver
 3 import requests
 4 import sys
 5 import time
 6 from lxml import etree
 7 # reload(sys)
 8 # sys.setdefaultencoding('utf-8')
 9 
class Zhihu:
    """Crawl a Zhihu page by logging in with a Selenium-driven Chrome,
    then reusing the browser cookies in a requests.Session."""

    def __init__(self, homeurl, username="13060882373", password="XXXXXX"):
        # homeurl: page whose answer feed will be crawled.
        # username / password: credentials typed into the login form.
        # Defaults preserve the values that were previously hard-coded.
        self.homeurl = homeurl
        self.username = username
        self.password = password

    def GetCookies(self):
        """Log in through a Chrome window and return the session cookies.

        Returns the list of cookie dicts produced by Selenium's
        get_cookies() (each has at least 'name' and 'value' keys).
        """
        # Local import: find_element_by_* was removed in Selenium 4;
        # find_element(By.CSS_SELECTOR, ...) works on Selenium 3 and 4.
        from selenium.webdriver.common.by import By

        browser = webdriver.Chrome()
        try:
            browser.get("https://www.zhihu.com/signin")
            browser.find_element(
                By.CSS_SELECTOR, ".SignFlow-accountInput.Input-wrapper input"
            ).send_keys(self.username)
            browser.find_element(
                By.CSS_SELECTOR, ".SignFlow-password input"
            ).send_keys(self.password)
            browser.find_element(
                By.CSS_SELECTOR, ".Button.SignFlow-submitButton"
            ).click()
            # Crude wait for the post-login redirect to complete.
            # TODO(review): prefer WebDriverWait on a post-login element.
            time.sleep(3)
            return browser.get_cookies()
        finally:
            # Always release the browser, even if a selector lookup raises.
            browser.quit()

    def Crawl(self):
        """Fetch homeurl with the logged-in cookies and print
        '<author>回答了:<title>' for every answer item found on the page."""
        import json  # local so this edit needs no change to the file header

        session = requests.Session()
        session.headers.clear()
        for cookie in self.GetCookies():
            session.cookies.set(cookie["name"], cookie["value"])
        html = session.get(self.homeurl).text
        tree = etree.HTML(html)
        # Each matched node's data-zop attribute holds a JSON object with
        # (at least) authorName and title fields.
        items = tree.xpath(
            '//*[@id="root"]/div/main/div/div/div[1]/div[2]/div'
            '//div[@class="ContentItem AnswerItem"]/@data-zop'
        )
        for item in items:
            # json.loads instead of eval: the attribute is page-controlled
            # text, so eval() would execute arbitrary Python expressions.
            content = json.loads(item)
            authorName = content['authorName']
            title = content['title']
            print(authorName + "回答了:" + title)
42 
43 
if __name__ == "__main__":
    # Guard the crawl behind __main__ so importing this module does not
    # launch a browser and hit the network as a side effect.
    zhihu = Zhihu('https://www.zhihu.com/')
    zhihu.Crawl()

猜你喜欢

转载自www.cnblogs.com/DOLFAMINGO/p/9170429.html