# Multithreading and coroutines can greatly speed up crawling. With multithreading the requests really do run concurrently, so you can hit problems such as duplicated work and conflicting writes to the output file (a lock-based sketch after the threaded script below shows one way to guard against this); coroutines switch cooperatively at I/O points, which sidesteps most of that.
Two modules are used, one per version: threading for the multithreaded version and gevent for the coroutine version.
1. Multithreaded version (threading)
*** The main issue with multithreading is how to hand each thread its share of the work. There is a small trick for this, which sits in the last few lines of the script: distribute the items with a modulo operation. A minimal sketch of that allocation follows; the full crawler comes after it.
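The sketch below is a self-contained illustration of the modulo allocation only; the task list and worker function are placeholders, not part of the crawler. Item i goes into bucket i % N, and each of the N threads processes one bucket.

import threading

def worker(bucket):
    # each thread handles only the items assigned to its bucket
    for item in bucket:
        print(threading.current_thread().name, "processing", item)

if __name__ == "__main__":
    tasks = list(range(23))          # placeholder task list
    N = 10                           # number of threads
    buckets = [[] for _ in range(N)]
    for i, task in enumerate(tasks):
        buckets[i % N].append(task)  # modulo allocation: item i -> bucket i % N

    threads = [threading.Thread(target=worker, args=(b,)) for b in buckets]
    for t in threads:
        t.start()
    for t in threads:
        t.join()                     # wait for all threads to finish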
import re
from lxml import etree
import requests
import json
import pandas as pd
import time
import csv
import threading
s=requests.Session()
cookie="++++++++++++++++++++++++++++++"
headers2 = {
    "Accept": "*/*",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Connection": "keep-alive",
    "Content-Type": "application/x-www-form-urlencoded",
    "Cookie": cookie,
    "Host": "weibo.com",
    "Referer": "https://weibo.com/u/1549364094?profile_ftype=1&is_all=1",
    "User-Agent": "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "X-Requested-With": "XMLHttpRequest",
}
headers1 = {
    "Connection": "keep-alive",
    "Cookie": cookie,
    "Host": "weibo.com",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36",
    "Upgrade-Insecure-Requests": "1",
}
def request1(url):
    html = s.get(url, headers=headers2)
    # print(html.text)
    json1 = json.loads(html.text)['data']
    return etree.HTML(json1)

def request2(url):
    html = s.get(url, headers=headers1)
    # print(html.text)
    return html.text
lists_all=[]
# url_id="210926262"
content_all = pd.read_csv(r"C:\Users\Lavector\Desktop\百事小红书\lists2.csv", engine='python', header=None).values.tolist()  # engine='python' avoids "OSError: Initializing from file failed"
print(content_all)
def download(content):
    # fetch one blogger's homepage and profile page, parse the personal info,
    # and rewrite the cumulative CSV after each successful row
    try:
        url_id = content[1]
        name = content[0]
        home_url = "https://weibo.com/{}?profile_ftype=1&is_all=1#_0".format(url_id)
        print(home_url)
        time.sleep(10)
        a = request2(home_url)
        # print(a)
        content_id = re.findall("page_id']='(.*?)';", a)[0]
        domain_id = re.findall("domain']='(.*?)';", a)[0]
        MyProfileFeed_id = re.findall(r"Pl_Official_MyProfileFeed__(\d+)", a)[0]
        print(content_id)
        print(domain_id)
        print(MyProfileFeed_id)
        # personal profile / bio
        username = re.findall('<title>(.*?)的微博', a)[0]  # re.findall('<h1.*?>(.*?)<', a)
        # username = aa.xpath('//h1/h')
        print(username)
        info = re.findall(',(.*?)的微博主页.*?description', a)[0]
        print(info)
        person_url = "https://weibo.com/p/{}/info?mod=pedit_more".format(content_id)
        print(person_url)
        time.sleep(10)
        try:
            b = request2(person_url)
            if b:
                info_html = re.findall('domid":"Pl_Official_PersonalInfo__.*?"html":"(.*?)"}', b)[0].strip().replace("\\r", "").replace("\\n", "").replace("\\", "")
                print(info_html)
                info_html = etree.HTML(info_html)
                information = {}
                for i in range(len(info_html.xpath('//span[contains(@class,"pt_title")]'))):
                    bb = info_html.xpath('//span[contains(@class,"pt_title")]/text()')[i].strip()
                    cc = ""  # default so the field is still written if the xpath below fails
                    try:
                        if bb == "博客:":
                            cc = info_html.xpath('//li[contains(@class,"li_1")][{}]/a/text()'.format(i + 1))[0].strip()
                        elif bb == "个性域名:":
                            cc = info_html.xpath('//li[contains(@class,"li_1")][{}]/span[2]/a/text()'.format(i + 1))[0].strip()
                        elif bb == "标签:":
                            cc = info_html.xpath('//a[@node-type="tag"]/text()')
                            print(cc)
                            # cc = dd.xpath('string(.)').strip()
                        else:
                            cc = info_html.xpath('//li[contains(@class,"li_1")][{}]/span[2]/text()'.format(i + 1))[0].strip()
                    except:
                        pass
                    information["{}".format(bb)] = cc
                print(information)
                lists_all.append([name, username, info, information])
                # rewrite the whole CSV from the rows collected so far
                with open("lists24.csv", "w", encoding="utf-8", newline="") as f:
                    k = csv.writer(f, dialect="excel")
                    k.writerow(["名字", "昵称", "info", "简介"])
                    for list1 in lists_all:
                        k.writerow(list1)
        except:
            pass
    except:
        # on failure, dump whatever has been collected so far; set() cannot hash
        # the nested lists, so de-duplicate manually instead
        lists_all_set = []
        for row in lists_all:
            if row not in lists_all_set:
                lists_all_set.append(row)
        with open("lists25.csv", "w", encoding="utf-8", newline="") as f:
            k = csv.writer(f, dialect="excel")
            k.writerow(["名字", "昵称", "info", "简介"])
            for list1 in lists_all_set:
                k.writerow(list1)
# *** multithreaded entry point
if __name__ == "__main__":
    length = len(content_all)
    xclist = [[], [], [], [], [], [], [], [], [], []]  # one bucket of requests per thread
    N = len(xclist)
    for i in range(length):
        xclist[i % N].append(content_all[i])  # modulo allocation: row i -> bucket i % N

    def worker(bucket):
        # each thread works through its own bucket of rows
        for content in bucket:
            download(content)

    threads = []
    for i in range(N):  # number of threads
        t = threading.Thread(target=worker, args=(xclist[i],))
        t.start()
        threads.append(t)
    for t in threads:
        t.join()
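As noted at the top, several threads appending to lists_all and rewriting the same CSV can conflict. A minimal sketch of one way to guard the shared state with a lock; the lock and the write_rows/save_result helpers are additions for illustration, not part of the original script:

import csv
import threading

lists_all = []
lock = threading.Lock()              # hypothetical lock guarding the shared state

def write_rows(path):
    # rewrite the CSV from the accumulated rows, as the script above does
    with open(path, "w", encoding="utf-8", newline="") as f:
        w = csv.writer(f, dialect="excel")
        w.writerow(["名字", "昵称", "info", "简介"])
        w.writerows(lists_all)

def save_result(row):
    # only one thread at a time may touch lists_all or the output file
    with lock:
        lists_all.append(row)
        write_rows("lists24.csv")

In the crawler above, download() would call save_result([name, username, info, information]) instead of appending and writing directly.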
2. Coroutine version (gevent)
*** When using coroutines, put these three lines at the very top of the file: monkey.patch_all() has to run before blocking libraries such as requests are imported so that their socket calls become cooperative, and keeping them first avoids errors.
import gevent
import gevent.monkey
gevent.monkey.patch_all()
With coroutines the main question is again how the links are distributed: spawn one greenlet per row and join them all (a pool-based sketch at the end shows how to cap how many run at once).
import gevent
import gevent.monkey
gevent.monkey.patch_all()
import re
from lxml import etree
import requests
import json
import pandas as pd
import time
import csv
import threading
s=requests.Session()
cookie="++++++++"
headers2 = {
    "Accept": "*/*",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Connection": "keep-alive",
    "Content-Type": "application/x-www-form-urlencoded",
    "Cookie": cookie,
    "Host": "weibo.com",
    "Referer": "https://weibo.com/u/1549364094?profile_ftype=1&is_all=1",
    "User-Agent": "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "X-Requested-With": "XMLHttpRequest",
}
headers1 = {
    "Connection": "keep-alive",
    "Cookie": cookie,
    "Host": "weibo.com",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36",
    "Upgrade-Insecure-Requests": "1",
}
def request1(url):
    html = s.get(url, headers=headers2)
    # print(html.text)
    json1 = json.loads(html.text)['data']
    return etree.HTML(json1)

def request2(url):
    html = s.get(url, headers=headers1)
    # print(html.text)
    return html.text
lists_all=[]
# url_id="210926262"
content_all = pd.read_csv(r"C:\Users\Lavector\Desktop\百事小红书\lists2.csv", engine='python', header=None).values.tolist()  # engine='python' avoids "OSError: Initializing from file failed"
print(content_all)
def download(content):
    # fetch one blogger's homepage and profile page, parse the personal info,
    # and rewrite the cumulative CSV after each successful row
    try:
        url_id = content[1]
        name = content[0]
        home_url = "https://weibo.com/{}?profile_ftype=1&is_all=1#_0".format(url_id)
        print(home_url)
        time.sleep(10)
        a = request2(home_url)
        # print(a)
        content_id = re.findall("page_id']='(.*?)';", a)[0]
        domain_id = re.findall("domain']='(.*?)';", a)[0]
        MyProfileFeed_id = re.findall(r"Pl_Official_MyProfileFeed__(\d+)", a)[0]
        print(content_id)
        print(domain_id)
        print(MyProfileFeed_id)
        # personal profile / bio
        username = re.findall('<title>(.*?)的微博', a)[0]  # re.findall('<h1.*?>(.*?)<', a)
        # username = aa.xpath('//h1/h')
        print(username)
        info = re.findall(',(.*?)的微博主页.*?description', a)[0]
        print(info)
        person_url = "https://weibo.com/p/{}/info?mod=pedit_more".format(content_id)
        print(person_url)
        time.sleep(10)
        try:
            b = request2(person_url)
            if b:
                info_html = re.findall('domid":"Pl_Official_PersonalInfo__.*?"html":"(.*?)"}', b)[0].strip().replace("\\r", "").replace("\\n", "").replace("\\", "")
                print(info_html)
                info_html = etree.HTML(info_html)
                information = {}
                for i in range(len(info_html.xpath('//span[contains(@class,"pt_title")]'))):
                    bb = info_html.xpath('//span[contains(@class,"pt_title")]/text()')[i].strip()
                    cc = ""  # default so the field is still written if the xpath below fails
                    try:
                        if bb == "博客:":
                            cc = info_html.xpath('//li[contains(@class,"li_1")][{}]/a/text()'.format(i + 1))[0].strip()
                        elif bb == "个性域名:":
                            cc = info_html.xpath('//li[contains(@class,"li_1")][{}]/span[2]/a/text()'.format(i + 1))[0].strip()
                        elif bb == "标签:":
                            cc = info_html.xpath('//a[@node-type="tag"]/text()')
                            print(cc)
                            # cc = dd.xpath('string(.)').strip()
                        else:
                            cc = info_html.xpath('//li[contains(@class,"li_1")][{}]/span[2]/text()'.format(i + 1))[0].strip()
                    except:
                        pass
                    information["{}".format(bb)] = cc
                print(information)
                lists_all.append([name, username, info, information])
                # rewrite the whole CSV from the rows collected so far
                with open("lists24.csv", "w", encoding="utf-8", newline="") as f:
                    k = csv.writer(f, dialect="excel")
                    k.writerow(["名字", "昵称", "info", "简介"])
                    for list1 in lists_all:
                        k.writerow(list1)
        except:
            pass
    except:
        # on failure, dump whatever has been collected so far; set() cannot hash
        # the nested lists, so de-duplicate manually instead
        lists_all_set = []
        for row in lists_all:
            if row not in lists_all_set:
                lists_all_set.append(row)
        with open("lists25.csv", "w", encoding="utf-8", newline="") as f:
            k = csv.writer(f, dialect="excel")
            k.writerow(["名字", "昵称", "info", "简介"])
            for list1 in lists_all_set:
                k.writerow(list1)
# coroutine (gevent) entry point
if __name__ == "__main__":
    length = len(content_all)
    xclist = []  # build the list of greenlets, one per row
    for i in range(length):
        xclist.append(gevent.spawn(download, content_all[i]))
    print(xclist)
    gevent.joinall(xclist)  # wait for every greenlet to finish
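The version above spawns one greenlet per row all at once. If the list of links is large, gevent.pool.Pool can cap how many run concurrently. A minimal sketch under assumptions: the pool size of 10 is an arbitrary choice, and the download function and content_all rows here are placeholders standing in for the ones defined in the script above.

import gevent
import gevent.monkey
gevent.monkey.patch_all()
from gevent.pool import Pool

def download(content):
    # placeholder standing in for the download() defined in the script above
    print("processing", content)

content_all = [["name{}".format(i), str(1000 + i)] for i in range(25)]  # placeholder rows

pool = Pool(10)  # at most 10 greenlets are active at the same time
jobs = [pool.spawn(download, content) for content in content_all]
gevent.joinall(jobs)  # wait for every job to finish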