之前网络上很多的教程,包括视频教程,都用了很老的方式,老师也说过爬虫和反爬是一场永不停息的战斗。我在这次体会得淋漓尽致:前一天知乎的登录刚刚告破,第二天知乎就改版了。现在知乎的登录方式是通过文件上传的方式,而且每次登录的验证码也是加密的 js 方式,我才疏学浅搞不定,恨啊。我因为这件事彻夜难眠,搞了 3 天还是没搞定。平常还得上班,晚上回来也没搞定。最后在不断深入学习的过程中,我发现了一个很牛的方式,可以直接攻破知乎的登录。
from scrapy.http import HtmlResponse
import re
import time


class JSPageMiddleware(object):
    """Downloader middleware that renders zhihu pages with Selenium.

    Non-question zhihu URLs are fetched through the spider's shared
    ``browser`` (a Selenium WebDriver created by the spider); when the
    signin page is requested, the login form is filled and submitted
    automatically.  Question detail pages fall through to Scrapy's
    normal downloader (``process_request`` returns ``None``).
    """

    # Compiled once instead of re-parsed on every request; raw string
    # avoids the invalid "\d" escape in a plain string literal.
    QUESTION_URL_RE = re.compile(r"(.*zhihu.com/question/(\d+))(/|$).*")

    def process_request(self, request, spider):
        """Render zhihu pages in the browser; return an HtmlResponse.

        Returning an HtmlResponse short-circuits Scrapy's downloader;
        returning None lets the request proceed normally.
        """
        if spider.name == 'zhihu' and not self.QUESTION_URL_RE.match(request.url):
            spider.browser.get(request.url)
            time.sleep(3)  # crude wait for client-side JS to finish rendering
            if request.url == 'https://www.zhihu.com/signin':
                # SECURITY NOTE(review): credentials are hard-coded here;
                # they should be moved to settings or environment variables.
                spider.browser.find_element_by_css_selector(
                    ".Login-content input[name='username']").send_keys("13460688542")
                spider.browser.find_element_by_css_selector(
                    ".Login-content input[name='password']").send_keys("3989441qwe")
                spider.browser.find_element_by_css_selector(
                    ".Button.SignFlow-submitButton.Button--primary.Button--blue").click()
                time.sleep(3)  # wait for the login round-trip to complete
            print("访问:{0}".format(request.url))
            return HtmlResponse(url=spider.browser.current_url,
                                body=spider.browser.page_source,
                                encoding="utf-8",
                                request=request)
middlewares.py 中加入
# Register the Selenium rendering middleware; the low priority value (1)
# places it at the very start of the downloader middleware chain.
DOWNLOADER_MIDDLEWARES = {
    # 'ArticleSpider.middlewares.RandomUserAgentMiddleware': 543,
    'ArticleSpider.middlewares.JSPageMiddleware': 1,
}
# MySQL connection parameters consumed by the Twisted-based pipeline
# (read via from_settings in pipelines.py).
MYSQL_HOST = ''
MYSQL_DBNAME = 'scrapy'
MYSQL_USER = 'root'
MYSQL_PASSWORD = '123'
MYSQL_CHARSET = 'utf8'
MYSQL_USE_UNICODE = True
settings.py加入
import scrapy
import json
import time
import os
import re
from scrapy.loader import ItemLoader
from ..items import ZhihuAnswerItem, ZhihuQuestionItem
from selenium import webdriver
# NOTE(review): scrapy.xlib.pydispatch is deprecated in newer Scrapy
# releases; `from pydispatch import dispatcher` is the replacement.
from scrapy.xlib.pydispatch import dispatcher
from scrapy import signals

try:
    import urlparse as parse  # Python 2
except ImportError:  # was a bare `except:`; catch only the import failure
    from urllib import parse  # Python 3


class ZhihuSpider(scrapy.Spider):
    """Crawl zhihu question pages, entering through the signin page.

    The spider owns a shared Selenium Chrome instance that the
    JSPageMiddleware uses to render pages and perform the login.
    """

    name = 'zhihu'
    allowed_domains = ['www.zhihu.com']
    # start_urls = ['https://www.zhihu.com/question/263432973']
    start_urls = ['https://www.zhihu.com/signin']
    xsrf = ''
    header = {
        "Host": "www.zhihu.com",
        "Referer": "https://www.zhihu.com/",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        "Cache-Control": "max-age=0",
        "Connection": "keep-alive",
        "USER_AGENT": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36"
    }

    # Matches a question URL; group(1) is the canonical question URL,
    # group(2) the numeric question id.  Compiled once, raw string.
    question_url_re = re.compile(r"(.*zhihu.com/question/(\d+))(/|$).*")

    def __init__(self):
        # Shared browser instance, driven by JSPageMiddleware.
        self.browser = webdriver.Chrome(
            executable_path="D:/Program Files/selenium_brower/chromedriver.exe")
        super(ZhihuSpider, self).__init__()
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_closed(self, spider):
        # Ensure the Chrome process does not outlive the crawl.
        self.browser.quit()

    def parse(self, response):
        """Extract question links; recursively follow everything else."""
        # yield scrapy.Request("https://www.zhihu.com/question/263432973",headers=self.header,callback=self.parse_question)
        all_urls = response.css("a::attr(href)").extract()
        all_urls = [parse.urljoin(response.url, url) for url in all_urls]
        # Keep only absolute https links (drops javascript:, mailto:, relative leftovers).
        all_urls = [url for url in all_urls if url.startswith("https")]
        for url in all_urls:
            match_obj = self.question_url_re.match(url)
            if match_obj:
                # A question page: download it and hand over to the
                # dedicated extraction callback.
                request_url = match_obj.group(1)
                print(request_url)
                yield scrapy.Request(request_url, headers=self.header,
                                     callback=self.parse_question)
            else:
                print("没有符合的")
                # Not a question page: keep following its links.
                yield scrapy.Request(url, headers=self.header, callback=self.parse)

    def parse_question(self, response):
        """Populate a ZhihuQuestionItem from a question detail page."""
        match_obj = re.match(r".*www.zhihu.com/question/(\d.*)", response.url)
        if match_obj:
            zhihu_id = match_obj.group(1)
            item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
            item_loader.add_css("title", "h1.QuestionHeader-title::text")
            item_loader.add_css("content", ".QuestionHeader-detail")
            item_loader.add_value("url", response.url)
            item_loader.add_value("zhihu_id", zhihu_id)
            item_loader.add_css("answer_num", ".QuestionAnswers-answers span::text")
            item_loader.add_css("comments_num", ".QuestionHeader-Comment button::text")
            item_loader.add_css("watch_user_num",
                                ".QuestionFollowStatus .QuestionFollowStatus-counts div strong::text")
            item_loader.add_xpath("topics", "//*[@class='TopicLink']/div/div/text()")
            question_item = item_loader.load_item()
            yield question_item
zhihu.py
class ZhihuQuestionItem(scrapy.Item):
    """Item for one zhihu question, with its upsert SQL for MySQL."""

    zhihu_id = scrapy.Field()
    topics = scrapy.Field()
    url = scrapy.Field()
    title = scrapy.Field()
    content = scrapy.Field()
    # NOTE: "creat_time" typo is kept on purpose — it matches the
    # zhihu_question table's column name.
    creat_time = scrapy.Field()
    update_time = scrapy.Field()
    answer_num = scrapy.Field()
    comments_num = scrapy.Field()
    watch_user_num = scrapy.Field()
    click_num = scrapy.Field()

    def get_insert_sql(self):
        """Return ``(sql, params)`` for an upsert into zhihu_question.

        The ON DUPLICATE KEY clause refreshes only the counters that
        change over time (comments / watchers / clicks).
        """
        insert_sql = """
            insert into zhihu_question (zhihu_id,topics,url,title,content,creat_time,update_time,answer_num,comments_num,watch_user_num,click_num,crawl_time,crawl_update_time)
            VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) ON DUPLICATE KEY UPDATE comments_num=VALUES(comments_num),watch_user_num=VALUES(watch_user_num),click_num=VALUES(click_num)
        """
        # Single timestamp for all four time columns — the original
        # called datetime.now() five times, so one row could carry
        # slightly different "now" values.
        now = datetime.datetime.now().strftime(SQL_DATETIME_FORMAT)
        zhihu_id = self["zhihu_id"][0]
        topics = ",".join(self["topics"])
        url = "".join(self["url"])
        title = "".join(self["title"])
        content = "".join(self["content"])
        creat_time = now
        update_time = now
        answer_num = self["answer_num"][0]
        comments_num = get_nums(self["comments_num"][0])
        watch_user_num = self["watch_user_num"][0]
        # Second <strong> of the follow-status block is the view count —
        # presumably; verify against the page template.
        click_num = self["watch_user_num"][1]
        crawl_time = now
        crawl_update_time = now
        params = (zhihu_id, topics, url, title, content, creat_time,
                  update_time, answer_num, comments_num, watch_user_num,
                  click_num, crawl_time, crawl_update_time)
        return insert_sql, params
items.py 加入
class MysqlTwistedZhihuPipline(object):
    """Asynchronous MySQL insert pipeline backed by a Twisted pool.

    (Class name keeps the original "Pipline" spelling so the
    ITEM_PIPELINES setting that references it still resolves.)
    """

    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        # Build the connection pool from the MYSQL_* keys in settings.py.
        dbparms = dict(
            host=settings["MYSQL_HOST"],
            db=settings["MYSQL_DBNAME"],
            user=settings["MYSQL_USER"],
            passwd=settings["MYSQL_PASSWORD"],
            charset=settings["MYSQL_CHARSET"],
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=settings["MYSQL_USE_UNICODE"],
        )
        dbpool = adbapi.ConnectionPool("MySQLdb", **dbparms)
        return cls(dbpool)

    def process_item(self, item, spider):
        # Run the insert on the pool so crawling is not blocked.
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrback(self.handle_error, item, spider)
        # BUG FIX: Scrapy pipelines must return the item so any later
        # pipelines in ITEM_PIPELINES still receive it; the original
        # implicitly returned None, silently dropping the item.
        return item

    def handle_error(self, failure, item, spider):
        # Surface async insert failures instead of dropping them silently.
        print(failure)

    def do_insert(self, cursor, item):
        # Each item class supplies its own upsert statement.
        insert_sql, params = item.get_insert_sql()
        cursor.execute(insert_sql, params)
pipelines.py中加入
数据库创建sql