Python Data Crawler Study Notes (14): Scraping Qiushibaike Data (Single-Threaded + Multi-Threaded)

I. Requirement: scrape the popular jokes from the Qiushibaike front pages.

II. Observing the URLs:

1. The page URLs follow a simple pattern: https://www.qiushibaike.com/8hr/page/N, where N is the page number.

2. The joke text in the page source: each joke is wrapped in a <div class="content"> element, with the text itself inside a <span> tag.
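
As a quick check of that structure, the sketch below runs the same regular expression used in the scripts against a small hand-written HTML fragment; the fragment and its joke text are made up purely for illustration.

import re

# Hypothetical fragment mimicking the structure seen in the page source
sample = '''
<div class="content">
<span>This is a sample joke used only to test the pattern.</span>
</div>
'''

# Same pattern as in the scripts below; re.S lets "." also match newlines
pat = '<div class="content">.*?<span>(.*?)</span>.*?</div>'
print(re.compile(pat, re.S).findall(sample))
# Expected output: ['This is a sample joke used only to test the pattern.']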

III. Writing the code:

(1) Single-threaded version:

import urllib.request
import urllib.error
import re
import sys

# Spoof a regular browser User-Agent so the site does not reject the requests
headers=("User-Agent","Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6726.400 QQBrowser/10.2.2265.400")
opener=urllib.request.build_opener()
opener.addheaders=[headers]
urllib.request.install_opener(opener)

# If a joke contains emoji, printing it in the default shell raises an encoding
# error. Mapping every character above the Basic Multilingual Plane to U+FFFD
# avoids this; the emoji are shown as "�". Wrapping print() in a try/except
# block would work as well.
non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd)

for i in range(1,11):
    url="https://www.qiushibaike.com/8hr/page/"+str(i)
    pagedata=urllib.request.urlopen(url).read().decode("utf-8","ignore")
    # Each joke sits inside <div class="content"> ... <span>text</span> ... </div>
    pat='<div class="content">.*?<span>(.*?)</span>.*?</div>'
    datalist=re.compile(pat,re.S).findall(pagedata)
    for j in range(0,len(datalist)):
        print("Page "+str(i)+", joke "+str(j)+":")
        print(datalist[j].translate(non_bmp_map))
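
The translation-table trick can also be checked on its own. The snippet below is a minimal sketch with a made-up joke string: any character above U+FFFF (an emoji, for example) is replaced with U+FFFD before printing, which is what prevents the encoding error in the default shell.

import sys

# Map every code point above the Basic Multilingual Plane to U+FFFD
non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd)

text = "Funny joke \U0001F602 with an emoji"   # hypothetical joke text
print(text.translate(non_bmp_map))             # prints: Funny joke � with an emoji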

(2) Multi-threaded version: two threads split the work, one crawling the odd-numbered pages and the other the even-numbered pages.

import urllib.request
import urllib.error
import re
import sys
import threading

# Spoof a regular browser User-Agent so the site does not reject the requests
headers=("User-Agent","Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6726.400 QQBrowser/10.2.2265.400")
opener=urllib.request.build_opener()
opener.addheaders=[headers]
urllib.request.install_opener(opener)

# Map characters above the Basic Multilingual Plane (emoji, etc.) to U+FFFD so
# that printing them in the default shell does not raise an encoding error
non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd)

class One(threading.Thread):
    def __init__(self):  # extra parameters could be accepted here, e.g. (self, a)
        threading.Thread.__init__(self)
    def run(self):
        # Crawl the odd-numbered pages
        for i in range(1,11,2):
            url="https://www.qiushibaike.com/8hr/page/"+str(i)
            pagedata=urllib.request.urlopen(url).read().decode("utf-8","ignore")
            pat='<div class="content">.*?<span>(.*?)</span>.*?</div>'
            datalist=re.compile(pat,re.S).findall(pagedata)
            for j in range(0,len(datalist)):
                print("Page "+str(i)+", joke "+str(j)+":")
                print(datalist[j].translate(non_bmp_map))

class Two(threading.Thread):
    def __init__(self):
        threading.Thread.__init__(self)
    def run(self):
        # Crawl the even-numbered pages
        for i in range(2,11,2):
            url="https://www.qiushibaike.com/8hr/page/"+str(i)
            pagedata=urllib.request.urlopen(url).read().decode("utf-8","ignore")
            pat='<div class="content">.*?<span>(.*?)</span>.*?</div>'
            datalist=re.compile(pat,re.S).findall(pagedata)
            for j in range(0,len(datalist)):
                print("Page "+str(i)+", joke "+str(j)+":")
                print(datalist[j].translate(non_bmp_map))

# Start both threads; they run concurrently
one=One()
one.start()
two=Two()
two.start()
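
The main program reaches its end as soon as the two threads have been started, while the threads keep running until they finish. If any later step needed to wait for all ten pages (writing the results to a file, for example), the standard join() call could be added after the two start() calls, a minimal sketch:

# Block the main thread until both crawler threads have finished
one.join()
two.join()
print("All 10 pages have been crawled.")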

Thanks to Mr. Wei Wei (韦玮) for his guidance.

Reposted from blog.csdn.net/Smart3S/article/details/82925105