# Scrapy downloader middleware: common patterns (random User-Agent, proxy rotation).

import user_agent
import requests


class UA_midd(object):
    """Downloader middleware that sets a random User-Agent and a Referer header."""

    def process_request(self, request, spider):
        # Rotate the User-Agent on every outgoing request.
        request.headers['User-Agent'] = user_agent.generate_user_agent()
        # Use the request's own URL as Referer (a simple anti-bot measure).
        url = request.url
        if url:
            request.headers['Referer'] = url


class Proxy_midd(object):
    """Downloader middleware that attaches a proxy fetched from a proxy pool.

    A fresh proxy is fetched from the pool on the first request and again
    once the usage counter reaches 20; error paths bump the counter in
    larger steps so a refresh happens sooner after failures.
    """

    def __init__(self):
        self.ip = ''    # currently active proxy ("host:port"); empty until fetched
        self.url = 'http://188.131.212.24:5010/get/'  # proxy-pool endpoint
        self.count = 0  # usage counter; 0 or >= 20 triggers a refresh

    def process_request(self, request, spider):
        # Refresh the proxy on the very first request or after ~20 uses.
        if self.count == 0 or self.count >= 20:
            res = requests.get(url=self.url).content.decode()
            # The pool presumably answers with a message containing "no"
            # when it has no proxy to hand out — keep the old one in that case.
            if 'no' not in res:
                self.ip = res
            self.count = 1

        if self.ip:
            request.meta['proxy'] = 'http://' + self.ip
            self.count += 1
        else:
            # No proxy available: advance faster so the pool is retried sooner.
            self.count += 5

    def process_exception(self, request, exception, spider):
        # BUG FIX: the original tested isinstance(request, TimeoutError),
        # which can never be true — the exception object must be inspected.
        if isinstance(exception, TimeoutError):
            # Force a proxy refresh on the next request and reschedule this one.
            self.count += 20
            return request

# The classes above only handle User-Agent and proxy (IP) rotation.

# If you maintain cookies via a cookies pool, switch to a fresh cookie set
# when a request fails (original note was garbled — intent reconstructed).

# Note: cookies set in a middleware must be supplied as a dict.
import json
import requests

class cookies_mid(object):
    """Downloader middleware that attaches cookies fetched from a cookies pool."""

    def __init__(self):
        # BUG FIX: original wrote "slef.cookies_url", which raised NameError
        # as soon as the middleware was instantiated.
        self.cookies_url = '你维护的cookies池'  # URL of your cookies-pool service

    def process_request(self, request, spider):
        # Scrapy expects request.cookies to be a dict.
        request.cookies = self.get_cookies()

    def get_cookies(self):
        """Fetch one cookie set from the pool; return it as a dict, or None if empty."""
        cookies = requests.get(self.cookies_url).content.decode()
        if cookies:
            return json.loads(cookies)
    

# Cookie rotation middleware (above).



# Further reading: https://blog.csdn.net/sc_lilei/article/details/80702449

# Originally published at: www.cnblogs.com/zengxm/p/11094666.html