Python爬取糗事百科-多进程方法

#正则表达式法
import requests
import re
import time
from multiprocessing import Pool
# Request headers sent with every page fetch: a fixed desktop-Chrome
# User-Agent string, passed to requests.get by spider().
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; WOW64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/65.0.3325.146 Safari/537.36"
    ),
}
def _parse_page(text):
    """Extract one record per post from a listing page's HTML.

    Args:
        text: The HTML source of a listing page.

    Returns:
        A list of dicts with the keys 'name', 'content', 'laugth' and
        'comment'. The two counters are kept as the digit strings captured
        from the markup.
    """
    # Raw strings keep ``\d`` a regex token instead of an invalid string
    # escape (a SyntaxWarning on recent Python versions).
    names = re.findall(r'<h2>(.*?)</h2>', text, re.S)
    contents = re.findall(
        r'<div class="content">.*?<span>(.*?)</span>', text, re.S)
    laughts = re.findall(
        r'<span class="stats-vote">.*?<i class="number">(\d+)</i>.*?</span>',
        text, re.S)
    comments = re.findall(
        r'<span class="stats-comments">.*?<i class="number">(\d+)</i>.*?</span>',
        text, re.S)

    # The four lists are paired purely by position and zip() stops at the
    # shortest one.
    # NOTE(review): if a post lacks one of the fields, later records would be
    # misaligned -- confirm against the real page markup.
    return [
        {
            'name': name.strip(),
            'content': content.strip(),
            # The misspelled key is kept as-is so the printed output format
            # does not change.
            'laugth': laught,
            'comment': comment,
        }
        for name, content, laught, comment
        in zip(names, contents, laughts, comments)
    ]


def spider(url):
    """Fetch one listing page and print a dict for every post found on it.

    Args:
        url: Address of the listing page to download.

    Returns:
        None. Each record is written to standard output with ``print``.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    # Without a timeout a stalled connection would block this worker, and
    # therefore the pool.map() call driving it, indefinitely.
    req = requests.get(url=url, headers=headers, timeout=10)
    if not req.ok:
        # An error page contains no posts; report it rather than silently
        # parsing it and printing nothing.
        print('skipped {} (HTTP {})'.format(url, req.status_code))
        return

    for data in _parse_page(req.text):
        print(data)

if __name__ == '__main__':
    # Crawl listing pages 1-13 with four worker processes and print the
    # total elapsed wall-clock time in seconds.
    time1 = time.time()
    urls = ['https://www.qiushibaike.com/8hr/page/{}/'.format(i)
            for i in range(1, 14)]
    # The context manager shuts the worker processes down once map() has
    # returned, instead of leaving the pool open until interpreter exit.
    with Pool(processes=4) as pool:
        pool.map(spider, urls)
    time2 = time.time()
    print(time2 - time1)

猜你喜欢

转载自 https://blog.csdn.net/weixin_42080280/article/details/80855162