# Multithreading and coroutines can greatly speed up crawling. With multithreading the requests really do run concurrently, so you can hit problems such as duplicated work and conflicting writes to the output file (a lock-based sketch after the threaded script below shows one way to guard against this); coroutines switch cooperatively at I/O points, which sidesteps most of that.
Two modules are used, one per version: threading for the multithreaded version and gevent for the coroutine version.
1. Multithreaded version (threading)
*** The main issue with multithreading is how to hand each thread its share of the work. There is a small trick for this, which sits in the last few lines of the script: distribute the items with a modulo operation. A minimal sketch of that allocation follows; the full crawler comes after it.
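The sketch below is a self-contained illustration of the modulo allocation only; the task list and worker function are placeholders, not part of the crawler. Item i goes into bucket i % N, and each of the N threads processes one bucket.

import threading

def worker(bucket):
    # each thread handles only the items assigned to its bucket
    for item in bucket:
        print(threading.current_thread().name, "processing", item)

if __name__ == "__main__":
    tasks = list(range(23))          # placeholder task list
    N = 10                           # number of threads
    buckets = [[] for _ in range(N)]
    for i, task in enumerate(tasks):
        buckets[i % N].append(task)  # modulo allocation: item i -> bucket i % N

    threads = [threading.Thread(target=worker, args=(b,)) for b in buckets]
    for t in threads:
        t.start()
    for t in threads:
        t.join()                     # wait for all threads to finish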
import re
from lxml import etree
import requests
import json
import pandas as pd
import time
import csv
import threading
s=requests.Session()
cookie="++++++++++++++++++++++++++++++"
headers2 = {
    "Accept": "*/*",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Connection": "keep-alive",
    "Content-Type": "application/x-www-form-urlencoded",
    "Cookie": cookie,
    "Host": "weibo.com",
    "Referer": "https://weibo.com/u/1549364094?profile_ftype=1&is_all=1",
    "User-Agent": "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "X-Requested-With": "XMLHttpRequest",
}
headers1 = {
    "Connection": "keep-alive",
    "Cookie": cookie,
    "Host": "weibo.com",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36",
    "Upgrade-Insecure-Requests": "1",
}
def request1(url):
    html = s.get(url, headers=headers2)
    # print(html.text)
    json1 = json.loads(html.text)['data']
    return etree.HTML(json1)

def request2(url):
    html = s.get(url, headers=headers1)
    # print(html.text)
    return html.text
lists_all=[]
# url_id="210926262"
content_all = pd.read_csv(r"C:\Users\Lavector\Desktop\百事小红书\lists2.csv", engine='python', header=None).values.tolist()  # engine='python' avoids "OSError: Initializing from file failed"
print(content_all)
def download(content):
    # fetch one blogger's homepage and profile page, parse the personal info,
    # and rewrite the cumulative CSV after each successful row
    try:
        url_id = content[1]
        name = content[0]
        home_url = "https://weibo.com/{}?profile_ftype=1&is_all=1#_0".format(url_id)
        print(home_url)
        time.sleep(10)
        a = request2(home_url)
        # print(a)
        content_id = re.findall("page_id']='(.*?)';", a)[0]
        domain_id = re.findall("domain']='(.*?)';", a)[0]
        MyProfileFeed_id = re.findall(r"Pl_Official_MyProfileFeed__(\d+)", a)[0]
        print(content_id)
        print(domain_id)
        print(MyProfileFeed_id)
        # personal profile / bio
        username = re.findall('<title>(.*?)的微博', a)[0]  # re.findall('<h1.*?>(.*?)<', a)
        # username = aa.xpath('//h1/h')
        print(username)
        info = re.findall(',(.*?)的微博主页.*?description', a)[0]
        print(info)
        person_url = "https://weibo.com/p/{}/info?mod=pedit_more".format(content_id)
        print(person_url)
        time.sleep(10)
        try:
            b = request2(person_url)
            if b:
                info_html = re.findall('domid":"Pl_Official_PersonalInfo__.*?"html":"(.*?)"}', b)[0].strip().replace("\\r", "").replace("\\n", "").replace("\\", "")
                print(info_html)
                info_html = etree.HTML(info_html)
                information = {}
                for i in range(len(info_html.xpath('//span[contains(@class,"pt_title")]'))):
                    bb = info_html.xpath('//span[contains(@class,"pt_title")]/text()')[i].strip()
                    cc = ""  # default so the field is still written if the xpath below fails
                    try:
                        if bb == "博客:":
                            cc = info_html.xpath('//li[contains(@class,"li_1")][{}]/a/text()'.format(i + 1))[0].strip()
                        elif bb == "个性域名:":
                            cc = info_html.xpath('//li[contains(@class,"li_1")][{}]/span[2]/a/text()'.format(i + 1))[0].strip()
                        elif bb == "标签:":
                            cc = info_html.xpath('//a[@node-type="tag"]/text()')
                            print(cc)
                            # cc = dd.xpath('string(.)').strip()
                        else:
                            cc = info_html.xpath('//li[contains(@class,"li_1")][{}]/span[2]/text()'.format(i + 1))[0].strip()
                    except:
                        pass
                    information["{}".format(bb)] = cc
                print(information)
                lists_all.append([name, username, info, information])
                # rewrite the whole CSV from the rows collected so far
                with open("lists24.csv", "w", encoding="utf-8", newline="") as f:
                    k = csv.writer(f, dialect="excel")
                    k.writerow(["名字", "昵称", "info", "简介"])
                    for list1 in lists_all:
                        k.writerow(list1)
        except:
            pass
    except:
        # on failure, dump whatever has been collected so far; set() cannot hash
        # the nested lists, so de-duplicate manually instead
        lists_all_set = []
        for row in lists_all:
            if row not in lists_all_set:
                lists_all_set.append(row)
        with open("lists25.csv", "w", encoding="utf-8", newline="") as f:
            k = csv.writer(f, dialect="excel")
            k.writerow(["名字", "昵称", "info", "简介"])
            for list1 in lists_all_set:
                k.writerow(list1)
# *** multithreaded entry point
if __name__ == "__main__":
    length = len(content_all)
    xclist = [[], [], [], [], [], [], [], [], [], []]  # one bucket of requests per thread
    N = len(xclist)
    for i in range(length):
        xclist[i % N].append(content_all[i])  # modulo allocation: row i -> bucket i % N

    def worker(bucket):
        # each thread works through its own bucket of rows
        for content in bucket:
            download(content)

    threads = []
    for i in range(N):  # number of threads
        t = threading.Thread(target=worker, args=(xclist[i],))
        t.start()
        threads.append(t)
    for t in threads:
        t.join()
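As noted at the top, several threads appending to lists_all and rewriting the same CSV can conflict. A minimal sketch of one way to guard the shared state with a lock; the lock and the write_rows/save_result helpers are additions for illustration, not part of the original script:

import csv
import threading

lists_all = []
lock = threading.Lock()              # hypothetical lock guarding the shared state

def write_rows(path):
    # rewrite the CSV from the accumulated rows, as the script above does
    with open(path, "w", encoding="utf-8", newline="") as f:
        w = csv.writer(f, dialect="excel")
        w.writerow(["名字", "昵称", "info", "简介"])
        w.writerows(lists_all)

def save_result(row):
    # only one thread at a time may touch lists_all or the output file
    with lock:
        lists_all.append(row)
        write_rows("lists24.csv")

In the crawler above, download() would call save_result([name, username, info, information]) instead of appending and writing directly.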
2. Coroutine version (gevent)
*** When using coroutines, put these three lines at the very top of the file: monkey.patch_all() has to run before blocking libraries such as requests are imported so that their socket calls become cooperative, and keeping them first avoids errors.
import gevent
import gevent.monkey
gevent.monkey.patch_all()
With coroutines the main question is again how the links are distributed: spawn one greenlet per row and join them all (a pool-based sketch at the end shows how to cap how many run at once).
import gevent
import gevent.monkey
gevent.monkey.patch_all()
import re
from lxml import etree
import requests
import json
import pandas as pd
import time
import csv
import threading
s=requests.Session()
cookie="++++++++"
headers2 = {
    "Accept": "*/*",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Connection": "keep-alive",
    "Content-Type": "application/x-www-form-urlencoded",
    "Cookie": cookie,
    "Host": "weibo.com",
    "Referer": "https://weibo.com/u/1549364094?profile_ftype=1&is_all=1",
    "User-Agent": "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "X-Requested-With": "XMLHttpRequest",
}
headers1 = {
    "Connection": "keep-alive",
    "Cookie": cookie,
    "Host": "weibo.com",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36",
    "Upgrade-Insecure-Requests": "1",
}
def request1(url):
    html = s.get(url, headers=headers2)
    # print(html.text)
    json1 = json.loads(html.text)['data']
    return etree.HTML(json1)

def request2(url):
    html = s.get(url, headers=headers1)
    # print(html.text)
    return html.text
lists_all=[]
# url_id="210926262"
content_all = pd.read_csv(r"C:\Users\Lavector\Desktop\百事小红书\lists2.csv", engine='python', header=None).values.tolist()  # engine='python' avoids "OSError: Initializing from file failed"
print(content_all)
def download(content):
    # fetch one blogger's homepage and profile page, parse the personal info,
    # and rewrite the cumulative CSV after each successful row
    try:
        url_id = content[1]
        name = content[0]
        home_url = "https://weibo.com/{}?profile_ftype=1&is_all=1#_0".format(url_id)
        print(home_url)
        time.sleep(10)
        a = request2(home_url)
        # print(a)
        content_id = re.findall("page_id']='(.*?)';", a)[0]
        domain_id = re.findall("domain']='(.*?)';", a)[0]
        MyProfileFeed_id = re.findall(r"Pl_Official_MyProfileFeed__(\d+)", a)[0]
        print(content_id)
        print(domain_id)
        print(MyProfileFeed_id)
        # personal profile / bio
        username = re.findall('<title>(.*?)的微博', a)[0]  # re.findall('<h1.*?>(.*?)<', a)
        # username = aa.xpath('//h1/h')
        print(username)
        info = re.findall(',(.*?)的微博主页.*?description', a)[0]
        print(info)
        person_url = "https://weibo.com/p/{}/info?mod=pedit_more".format(content_id)
        print(person_url)
        time.sleep(10)
        try:
            b = request2(person_url)
            if b:
                info_html = re.findall('domid":"Pl_Official_PersonalInfo__.*?"html":"(.*?)"}', b)[0].strip().replace("\\r", "").replace("\\n", "").replace("\\", "")
                print(info_html)
                info_html = etree.HTML(info_html)
                information = {}
                for i in range(len(info_html.xpath('//span[contains(@class,"pt_title")]'))):
                    bb = info_html.xpath('//span[contains(@class,"pt_title")]/text()')[i].strip()
                    cc = ""  # default so the field is still written if the xpath below fails
                    try:
                        if bb == "博客:":
                            cc = info_html.xpath('//li[contains(@class,"li_1")][{}]/a/text()'.format(i + 1))[0].strip()
                        elif bb == "个性域名:":
                            cc = info_html.xpath('//li[contains(@class,"li_1")][{}]/span[2]/a/text()'.format(i + 1))[0].strip()
                        elif bb == "标签:":
                            cc = info_html.xpath('//a[@node-type="tag"]/text()')
                            print(cc)
                            # cc = dd.xpath('string(.)').strip()
                        else:
                            cc = info_html.xpath('//li[contains(@class,"li_1")][{}]/span[2]/text()'.format(i + 1))[0].strip()
                    except:
                        pass
                    information["{}".format(bb)] = cc
                print(information)
                lists_all.append([name, username, info, information])
                # rewrite the whole CSV from the rows collected so far
                with open("lists24.csv", "w", encoding="utf-8", newline="") as f:
                    k = csv.writer(f, dialect="excel")
                    k.writerow(["名字", "昵称", "info", "简介"])
                    for list1 in lists_all:
                        k.writerow(list1)
        except:
            pass
    except:
        # on failure, dump whatever has been collected so far; set() cannot hash
        # the nested lists, so de-duplicate manually instead
        lists_all_set = []
        for row in lists_all:
            if row not in lists_all_set:
                lists_all_set.append(row)
        with open("lists25.csv", "w", encoding="utf-8", newline="") as f:
            k = csv.writer(f, dialect="excel")
            k.writerow(["名字", "昵称", "info", "简介"])
            for list1 in lists_all_set:
                k.writerow(list1)
# coroutine (gevent) entry point
if __name__ == "__main__":
    length = len(content_all)
    xclist = []  # build the list of greenlets, one per row
    for i in range(length):
        xclist.append(gevent.spawn(download, content_all[i]))
    print(xclist)
    gevent.joinall(xclist)  # wait for every greenlet to finish
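The version above spawns one greenlet per row all at once. If the list of links is large, gevent.pool.Pool can cap how many run concurrently. A minimal sketch under assumptions: the pool size of 10 is an arbitrary choice, and the download function and content_all rows here are placeholders standing in for the ones defined in the script above.

import gevent
import gevent.monkey
gevent.monkey.patch_all()
from gevent.pool import Pool

def download(content):
    # placeholder standing in for the download() defined in the script above
    print("processing", content)

content_all = [["name{}".format(i), str(1000 + i)] for i in range(25)]  # placeholder rows

pool = Pool(10)  # at most 10 greenlets are active at the same time
jobs = [pool.spawn(download, content) for content in content_all]
gevent.joinall(jobs)  # wait for every job to finish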