# 版权声明:原创 https://blog.csdn.net/hangvane123/article/details/82953707
#-*- coding:utf-8 -*-
import datetime
import queue
import threading
import time
from random import choice
import requests
import urllib3
# Silence urllib3 warnings; the target requests in this file are sent with
# verify=False, which would otherwise emit a warning on every call.
urllib3.disable_warnings()
class Scraping:
    """Multi-threaded availability checker that rotates through proxies.

    Candidates are read line by line from ``源.txt``; each one is appended to
    ``targetUrl`` and requested through a proxy taken from ``proxyQ``.  The
    outcome is posted to ``msgQ`` and then either appended to ``存在.txt`` /
    ``不存在.txt`` or pushed back onto ``dataQ`` for another attempt.

    Every thread loops while ``self.status == 'running'``.  Queue reads use
    a timeout, so a thread re-checks ``status`` regularly instead of blocking
    forever on an empty queue.
    """

    # Seconds a worker waits on an empty queue before re-checking ``status``.
    _QUEUE_WAIT = 0.5
    # Seconds to wait for an HTTP response (target site and proxy API).
    _REQUEST_TIMEOUT = 3

    def __init__(self):
        """Initialise queues and settings, and open the input file ``源.txt``.

        Raises:
            OSError: if ``源.txt`` cannot be opened.
        """
        # Browser User-Agent strings; one is chosen at random per request.
        self.uas = [
            "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:17.0; Baiduspider-ads) Gecko/17.0 Firefox/17.0",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9b4) Gecko/2008030317 Firefox/3.0b4",
            "Mozilla/5.0 (Windows; U; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727; BIDUBrowser 7.6)",
            "Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko",
            "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0",
            "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.99 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.3; Win64; x64; Trident/7.0; Touch; LCJB; rv:11.0) like Gecko",
        ]
        # Lookups completed since the last tips line was printed.
        self.mainCounter = 0
        # Start time formatted as HH:MM:SS.
        self.startTimeStr = datetime.datetime.now().strftime('%H:%M:%S')
        # Start time as a datetime, used to compute the running time.
        self.startTime = datetime.datetime.now()
        # Result messages: {'data': ..., 'result': ...}.
        self.msgQ = queue.Queue()
        # Candidates waiting to be checked.
        self.dataQ = queue.Queue()
        # Proxies ("ip:port") available to the workers.
        self.proxyQ = queue.Queue()
        # Seconds between two tips lines.
        self.tipsTime = 5
        # Run state; the threads only loop while this equals 'running'.
        self.status = 'stoped'
        # Handle of the file holding the values appended to the target URL.
        self.readHwnd = open(r'源.txt')
        # Most recently queued proxy.
        self.presentProxy = ''
        # Upper bound for the size of proxyQ.
        self.maxProxyQ = 20
        # Seconds between two proxy API polls.
        self.getProxyQSpeed = 0.6
        # Proxy provider API.
        self.getProxyUrl = 'http://dynamic.goubanjia.com/dynamic/get/xxxxxxx.html?sep=3'
        # Number of worker threads (also the dataQ fill level).
        self.maxThreadNum = 15
        # Target URL; the candidate is appended to it.
        self.targetUrl = 'https://xxx.xxx.com/?regnamesugg&username='

    def getData(self):
        """Return the next non-blank line of the input file.

        Blank lines are skipped so that they cannot be mistaken for the end
        of the file.  The handle is closed once the file is exhausted.

        Returns:
            str: the line without its trailing newline, or ``''`` when no
            data is left.
        """
        if self.readHwnd.closed:
            return ''
        # readline() yields '' only at end of file; blank lines are '\n'.
        for line in iter(self.readHwnd.readline, ''):
            line = line.strip('\n')
            if line:
                return line
        self.readHwnd.close()
        return ''

    def addDataThread(self):
        """Keep ``dataQ`` topped up to ``maxThreadNum`` entries via getData().

        Returns when the input is exhausted or ``status`` leaves 'running'.
        """
        while self.status == 'running':
            if self.dataQ.qsize() >= self.maxThreadNum:
                # Queue is full: yield briefly instead of spinning.
                time.sleep(0.05)
                continue
            data = self.getData()
            if data == '':
                # All input has been read.
                print('addData:\t数据读取完毕')
                return
            self.dataQ.put(data)

    def getProxy(self):
        """Fetch one "ip:port" proxy from the provider API ``getProxyUrl``.

        Returns:
            str: the first non-empty line of the response, or the current
            ``presentProxy`` when the request fails or the response is empty.
        """
        try:
            # A timeout keeps a stalled API from blocking the proxy thread.
            ipReq = requests.get(self.getProxyUrl, timeout=self._REQUEST_TIMEOUT)
            lines = ipReq.text.splitlines()
        except Exception as e:
            print('getProxy:\t' + str(e))
            return self.presentProxy
        for ip in lines:
            ip = ip.strip()
            if ip:
                return ip
        # Empty response: report "nothing new" rather than an empty proxy.
        return self.presentProxy

    def addProxyThread(self):
        """Poll getProxy() and queue every proxy that differs from the last."""
        while self.status == 'running':
            time.sleep(self.getProxyQSpeed)
            if self.proxyQ.qsize() >= self.maxProxyQ:
                continue
            proxy = self.getProxy()
            if proxy != self.presentProxy:
                self.proxyQ.put(proxy)
                self.presentProxy = proxy
                print('addProxy:\t添加新proxy ' + proxy)

    def tipsThread(self):
        """Print run time, speed and queue sizes every ``tipsTime`` seconds."""
        while self.status == 'running':
            time.sleep(self.tipsTime)
            elapsed = datetime.datetime.now() - self.startTime
            # total_seconds() keeps counting past 24 hours; .seconds wraps.
            runTime = int(elapsed.total_seconds())
            print('tips:\t运行时间:' + str(runTime) + 's\t速度:' + str(self.mainCounter / self.tipsTime) +
                  '\tmsgQ.qsize:' + str(self.msgQ.qsize()) + '\tdataQ.qsize:' + str(self.dataQ.qsize()) +
                  '\tproxyQ.qsize:' + str(self.proxyQ.qsize()))
            self.mainCounter = 0

    def workThread(self):
        """Worker loop: pair one proxy with one candidate and query the target.

        Posts ``{'data': data, 'result': result}`` to ``msgQ`` where result is:
            1  : id already exists
            0  : id does not exist
            -1 : IP rate-limited
            -2 : special error (illegal id)
            -3 : connection refused / timed out
        """
        while self.status == 'running':
            time.sleep(0.1)
            try:
                # A timeout is required for queue.Empty to ever be raised.
                proxy = self.proxyQ.get(timeout=self._QUEUE_WAIT)
            except queue.Empty:
                continue
            try:
                data = self.dataQ.get(timeout=self._QUEUE_WAIT)
            except queue.Empty:
                # Nothing to check: hand the proxy back and try again.
                self.proxyQ.put(proxy)
                continue
            self.msgQ.put({
                'data': data,
                'result': self._fetch(data, proxy),
            })

    def _fetch(self, data, proxy):
        """Request ``targetUrl + data`` through ``proxy`` and return the result code."""
        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "User-Agent": choice(self.uas),
        }
        try:
            req = requests.get(self.targetUrl + data, timeout=self._REQUEST_TIMEOUT,
                               proxies={'https': proxy, 'http': proxy},
                               headers=headers, verify=False)
            return self.checkRawText(req.text, proxy)
        except Exception:
            # Connection error: the proxy is deliberately not returned.
            return -3

    def checkRawText(self, text, proxy):
        """Classify the response body of the target URL.

        The proxy is put back on ``proxyQ`` unless the response reports an
        IP restriction.

        Args:
            text: response body returned by the target URL.
            proxy: proxy that was used for the request.

        Returns:
            int: 1 (exists), 0 (does not exist), -1 (IP restricted) or
            -2 (any other error).
        """
        if '"errno":0' in text:
            # No error: the proxy is still usable.
            self.proxyQ.put(proxy)
            return 1 if '"userexsit":1' in text else 0
        if '"errno":500010' in text:
            # IP restricted: drop the proxy.
            return -1
        # Special error unrelated to the proxy: keep the proxy.
        self.proxyQ.put(proxy)
        return -2

    def handleMsgThread(self):
        """Consume ``msgQ``: record final results, requeue retryable ones."""
        while self.status == 'running':
            try:
                msg = self.msgQ.get(timeout=0.2)
            except queue.Empty:
                continue
            self._handleMsg(msg)

    def _handleMsg(self, msg):
        """Process a single ``{'data': ..., 'result': ...}`` message."""
        data = msg['data']
        result = msg['result']
        if result == 1:
            self.mainCounter += 1
            print('handleMsg:\t' + data + '×')
            self._appendLine(r'存在.txt', data)
        elif result == 0:
            self.mainCounter += 1
            print('handleMsg:\t' + data + '√')
            self._appendLine(r'不存在.txt', data)
        elif result == -1:
            print('handleMsg:\t' + data + 'ip限制')
            self.dataQ.put(data)
        elif result == -2:
            self.mainCounter += 1
            print('handleMsg:\t' + data + '特殊error')
        elif result == -3:
            print('handleMsg:\t' + data + '连接错误')
            self.dataQ.put(data)

    @staticmethod
    def _appendLine(path, line):
        """Append ``line`` plus a newline to ``path``, closing the file afterwards."""
        with open(path, 'a+') as out:
            out.write(line + '\n')

    def start(self):
        """Set ``status`` to 'running' and start all threads.

        Starts the tips thread, the proxy-queue and data-queue maintenance
        threads, the message handler and ``maxThreadNum`` worker threads.
        """
        self.status = 'running'
        tipsThread = threading.Thread(target=self.tipsThread)
        tipsThread.start()
        proxyThread = threading.Thread(target=self.addProxyThread)
        proxyThread.start()
        dataThread = threading.Thread(target=self.addDataThread)
        dataThread.start()
        handleMsgThread = threading.Thread(target=self.handleMsgThread)
        handleMsgThread.start()
        for _ in range(self.maxThreadNum):
            tmpThread = threading.Thread(target=self.workThread)
            tmpThread.start()
# Script entry: build the scraper (this opens the input file) and launch
# every thread.
scr = Scraping()
scr.start()
运行结果:
addProxy: 添加新proxy 119.96.195.76:58269
handleMsg: 一争×
addProxy: 添加新proxy 117.63.204.66:25444
tips: 运行时间:5s 速度:0.2 msgQ.qsize:0 dataQ.qsize:2 proxyQ.qsize:0
handleMsg: 一从连接错误
handleMsg: 一但连接错误
addProxy: 添加新proxy 144.123.71.189:53086
tips: 运行时间:10s 速度:0.0 msgQ.qsize:0 dataQ.qsize:3 proxyQ.qsize:0
handleMsg: 一冼×
addProxy: 添加新proxy 106.112.171.133:33564
handleMsg: 一别×
handleMsg: 一从×
handleMsg: 一但×
tips: 运行时间:15s 速度:0.8 msgQ.qsize:0 dataQ.qsize:0 proxyQ.qsize:0
addProxy: 添加新proxy 122.4.28.184:22336
addProxy: 添加新proxy 123.180.71.236:63368
tips: 运行时间:20s 速度:0.0 msgQ.qsize:0 dataQ.qsize:0 proxyQ.qsize:0
addProxy: 添加新proxy 123.163.131.188:43554
addProxy: 添加新proxy 121.228.52.101:62493
tips: 运行时间:25s 速度:0.0 msgQ.qsize:0 dataQ.qsize:0 proxyQ.qsize:0
addProxy: 添加新proxy 183.147.252.249:19525
addProxy: 添加新proxy 110.88.127.24:56712