【Python】使用代理绕过IP限制获取URL数据

版权声明:原创 https://blog.csdn.net/hangvane123/article/details/82953707
#-*- coding:utf-8 -*-
import datetime
import queue
import threading
import time
from random import choice

import requests
import urllib3

urllib3.disable_warnings()

class Scraping:
    '''
    Multi-threaded checker that requests self.targetUrl + <data> through proxies.

    Data lines are read from the input file into self.dataQ, proxies are
    fetched from self.getProxyUrl into self.proxyQ, worker threads pair one
    of each per request, and a handler thread records the outcome or puts
    the data back for a retry.
    '''
    # Seconds a consumer waits on an empty queue before re-checking self.status
    queuePollTime = 0.5
    # Timeout in seconds for one request to the proxy API
    proxyApiTimeout = 5
    # Pause in seconds for the data feeder while self.dataQ is full
    feedIdleTime = 0.05

    def __init__(self):
        # Browser User-Agent strings; one is chosen at random for every request
        self.uas = [
            "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:17.0; Baiduspider-ads) Gecko/17.0 Firefox/17.0",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9b4) Gecko/2008030317 Firefox/3.0b4",
            "Mozilla/5.0 (Windows; U; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727; BIDUBrowser 7.6)",
            "Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko",
            "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0",
            "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.99 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.3; Win64; x64; Trident/7.0; Touch; LCJB; rv:11.0) like Gecko",
            ]
        # Items completed since the last tips report
        self.mainCounter = 0
        # Start time formatted as HH:MM:SS
        self.startTimeStr = datetime.datetime.now().strftime('%H:%M:%S')
        # Start time
        self.startTime = datetime.datetime.now()
        # Message queue (results produced by the workers)
        self.msgQ = queue.Queue()
        # Data queue (values appended to the target url)
        self.dataQ = queue.Queue()
        # Proxy queue
        self.proxyQ = queue.Queue()
        # Interval in seconds between tips reports
        self.tipsTime = 5
        # Run state; every thread loops while it equals 'running'
        self.status = 'stoped'
        # File handle of the data appended to the target url
        self.readHwnd = open(r'源.txt')
        # Most recently fetched proxy
        self.presentProxy = ''
        # Maximum size of proxyQ
        self.maxProxyQ = 20
        # Delay in seconds between attempts to add a proxy to proxyQ
        self.getProxyQSpeed = 0.6
        # Proxy API url
        self.getProxyUrl = 'http://dynamic.goubanjia.com/dynamic/get/xxxxxxx.html?sep=3'
        # Number of worker threads (also the target length of dataQ)
        self.maxThreadNum = 15
        # Target url
        self.targetUrl = 'https://xxx.xxx.com/?regnamesugg&username='

    def getData(self):
        '''
        Read the next non-empty line of url data from self.readHwnd.

        Returns the line without its trailing newline, or '' once the file
        is exhausted. Empty lines are skipped so that a blank line in the
        middle of the file is not mistaken for end-of-file.
        '''
        while True:
            line = self.readHwnd.readline()
            if not line:
                return ''
            line = line.strip('\n')
            if line:
                return line

    def addDataThread(self):
        '''
        Keep self.dataQ topped up to self.maxThreadNum entries via self.getData().

        Returns when the input is exhausted or self.status leaves 'running'.
        '''
        while self.status == 'running':
            if self.dataQ.qsize() >= self.maxThreadNum:
                # Queue is full: yield instead of spinning at full CPU
                time.sleep(self.feedIdleTime)
                continue
            data = self.getData()
            if data == '':
                # All data has been read
                print('addData:\t数据读取完毕')
                return
            self.dataQ.put(data)

    def getProxy(self):
        '''
        Fetch a proxy "ip:port" from the proxy API self.getProxyUrl.

        Returns the first non-blank line of the response with surrounding
        whitespace removed. On a request error (printed) or a response with
        no usable line, returns self.presentProxy so the caller sees no
        change.
        '''
        try:
            ipReq = requests.get(self.getProxyUrl, timeout=self.proxyApiTimeout)
            body = ipReq.text
        except Exception as e:
            print('getProxy:\t'+str(e))
            return self.presentProxy
        for ip in body.split('\n'):
            ip = ip.strip()
            if ip:
                return ip
        return self.presentProxy

    def addProxyThread(self):
        '''
        Maintain the proxy queue self.proxyQ by polling self.getProxy().

        A proxy is enqueued only when it differs from the previously fetched one.
        '''
        while self.status == 'running':
            time.sleep(self.getProxyQSpeed)
            if self.proxyQ.qsize() < self.maxProxyQ:
                proxy = self.getProxy()
                if proxy != self.presentProxy:
                    self.proxyQ.put(proxy)
                    self.presentProxy = proxy
                    print('addProxy:\t添加新proxy '+proxy)

    def tipsThread(self):
        '''
        Print run time, throughput and queue sizes every self.tipsTime seconds.

        Resets self.mainCounter after each report.
        '''
        while self.status == 'running':
            time.sleep(self.tipsTime)
            nowTime = datetime.datetime.now()
            # total_seconds() keeps counting past one day; .seconds would wrap
            runTime = int((nowTime-self.startTime).total_seconds())
            print('tips:\t运行时间:'+str(runTime)+'s\t速度:'+str(self.mainCounter/self.tipsTime)+
            '\tmsgQ.qsize:'+str(self.msgQ.qsize())+'\tdataQ.qsize:'+str(self.dataQ.qsize())+'\tproxyQ.qsize:'+str(self.proxyQ.qsize()))
            self.mainCounter = 0

    def workThread(self):
        '''
        Worker thread.

        Takes one proxy from self.proxyQ and one data item from self.dataQ,
        requests self.targetUrl + data through the proxy and puts
        {'data':data,'result':result} on self.msgQ, where result is:
             1 : id already exists
             0 : id does not exist
            -1 : ip access restricted
            -2 : special error (illegal id)
            -3 : connection refused / timed out
        '''
        while self.status == 'running':
            time.sleep(0.1)
            # Timed gets so queue.Empty can actually occur and status is re-checked
            try:
                proxy = self.proxyQ.get(timeout=self.queuePollTime)
            except queue.Empty:
                continue
            try:
                data = self.dataQ.get(timeout=self.queuePollTime)
            except queue.Empty:
                # Give the proxy back and retry; there is nothing to request yet
                self.proxyQ.put(proxy)
                continue
            headers = { "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
                        "Accept-Encoding":"gzip, deflate, br",
                        "Accept-Language":"zh-CN,zh;q=0.9",
                        "User-Agent":choice(self.uas),
                        }
            try:
                req = requests.get(self.targetUrl+data,timeout=3,proxies={'https':proxy,'http':proxy},headers=headers,verify=False)
                text = req.text
            except Exception:
                # Connection error: the proxy is not returned, the data is
                # reported with -3 so the handler re-queues it
                self.msgQ.put({
                    'data':data,
                    'result':-3
                    })
            else:
                self.msgQ.put({
                    'data':data,
                    'result':self.checkRawText(text,proxy)
                    })

    def checkRawText(self,text,proxy):
        '''
        Classify the response text of the target url.

        Returns 1 (exists), 0 (does not exist), -1 (ip restricted) or
        -2 (special error). The proxy is put back on self.proxyQ in every
        case except -1.
        '''
        if '"errno":0' in text:
            # No error: return the proxy
            self.proxyQ.put(proxy)
            if '"userexsit":1' in text:
                # Already exists
                return 1
            # Does not exist
            return 0
        if '"errno":500010' in text:
            # ip restricted: do not return the proxy
            return -1
        # Special error: return the proxy
        self.proxyQ.put(proxy)
        return -2

    def _appendLine(self,path,data):
        '''Append data plus a newline to the file at path and close it.'''
        with open(path,'a+') as f:
            f.write(data+'\n')

    def _handleMsg(self,msg):
        '''Record one worker result, or put its data back on self.dataQ for a retry.'''
        data = msg['data']
        result = msg['result']
        if result == 1:
            self.mainCounter += 1
            print('handleMsg:\t'+data+'×')
            self._appendLine(r'存在.txt',data)
        elif result == 0:
            self.mainCounter += 1
            print('handleMsg:\t'+data+'√')
            self._appendLine(r'不存在.txt',data)
        elif result == -1:
            print('handleMsg:\t'+data+'ip限制')
            self.dataQ.put(data)
        elif result == -2:
            self.mainCounter += 1
            print('handleMsg:\t'+data+'特殊error')
        elif result == -3:
            print('handleMsg:\t'+data+'连接错误')
            self.dataQ.put(data)

    def handleMsgThread(self):
        '''
        Result handling thread.

        Reads self.msgQ; normal results (1, 0) are appended to the result
        files, -2 is only counted, and -1 / -3 send the data back to
        self.dataQ to be requested again.
        '''
        while self.status == 'running':
            try:
                # Timed get: the wait doubles as the idle sleep
                msg = self.msgQ.get(timeout=self.queuePollTime)
            except queue.Empty:
                continue
            self._handleMsg(msg)

    def start(self):
        '''
        Set self.status to 'running' and start, in order: the tips thread,
        the proxy queue thread, the data queue thread, the result handling
        thread and self.maxThreadNum worker threads.
        '''
        self.status = 'running'
        targets = [self.tipsThread,self.addProxyThread,self.addDataThread,self.handleMsgThread]
        targets += [self.workThread]*self.maxThreadNum
        for target in targets:
            threading.Thread(target=target).start()
# Script entry: build the scraper (its constructor opens the input file) and launch all threads.
scr = Scraping()
scr.start()

运行结果:

addProxy:       添加新proxy 119.96.195.76:58269
handleMsg:      一争×
addProxy:       添加新proxy 117.63.204.66:25444
tips:   		运行时间:5s     速度:0.2        msgQ.qsize:0    dataQ.qsize:2   proxyQ.qsize:0
handleMsg:      一从连接错误
handleMsg:      一但连接错误
addProxy:       添加新proxy 144.123.71.189:53086
tips:   		运行时间:10s    速度:0.0        msgQ.qsize:0    dataQ.qsize:3   proxyQ.qsize:0
handleMsg:      一冼×
addProxy:       添加新proxy 106.112.171.133:33564
handleMsg:      一别×
handleMsg:      一从×
handleMsg:      一但×
tips:   		运行时间:15s    速度:0.8        msgQ.qsize:0    dataQ.qsize:0   proxyQ.qsize:0
addProxy:       添加新proxy 122.4.28.184:22336
addProxy:       添加新proxy 123.180.71.236:63368
tips:   		运行时间:20s    速度:0.0        msgQ.qsize:0    dataQ.qsize:0   proxyQ.qsize:0
addProxy:       添加新proxy 123.163.131.188:43554
addProxy:       添加新proxy 121.228.52.101:62493
tips:   		运行时间:25s    速度:0.0        msgQ.qsize:0    dataQ.qsize:0   proxyQ.qsize:0
addProxy:       添加新proxy 183.147.252.249:19525
addProxy:       添加新proxy 110.88.127.24:56712

猜你喜欢

转载自blog.csdn.net/hangvane123/article/details/82953707