一、需求:爬取糗事百科主页的热门段子。
二、观察URL阶段:
1、网页的URL很简单:https://www.qiushibaike.com/8hr/page/页码 ,其中页码从1开始依次递增。
2、网页源代码中的段子内容:
观察到每个段子的正文都被 <div class="content"> 和 <span> 标签所包围。
三、编写代码:
(1)单线程方式:
import urllib.request
import re
import urllib.error
import sys
# Create the opener first, then attach a browser-style User-Agent header to it.
opener = urllib.request.build_opener()
headers = (
    "User-Agent",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6726.400 QQBrowser/10.2.2265.400",
)
opener.addheaders = [headers]

# Install the opener globally so that every later urllib.request.urlopen()
# call goes through it and therefore sends the header above.
urllib.request.install_opener(opener)
# Captures the text of the first <span> inside each <div class="content">
# element. re.S lets "." match newlines, which the page markup places around
# the joke text.
JOKE_PATTERN = re.compile('<div class="content">.*?<span>(.*?)</span>.*?</div>', re.S)

# Every code point above U+FFFF, i.e. outside the Basic Multilingual Plane
# (emoji and similar). A compiled character class replaces the original
# million-entry translate table that was rebuilt for every single joke.
NON_BMP_PATTERN = re.compile("[\U00010000-\U0010FFFF]")


def extract_jokes(pagedata):
    """Return the joke texts found in one page of HTML.

    Args:
        pagedata: Decoded HTML source of a listing page.

    Returns:
        A list with the raw contents of each matched <span>, in page order;
        an empty list when nothing matches.
    """
    return JOKE_PATTERN.findall(pagedata)


def replace_non_bmp(text):
    """Return *text* with every non-BMP character replaced by U+FFFD.

    Per the original author's note, printing emoji in the bundled Python
    shell raises an error; substituting the replacement character avoids it.
    """
    return NON_BMP_PATTERN.sub("\ufffd", text)


def crawl_pages(pages):
    """Fetch each listing page in *pages* and print the jokes it contains.

    Args:
        pages: Iterable of page numbers appended to the listing URL.

    A page that cannot be fetched is reported on stderr and skipped so that
    one failed request does not abort the remaining pages.
    """
    for i in pages:
        url = "https://www.qiushibaike.com/8hr/page/" + str(i)
        try:
            # The context manager closes the HTTP response once it is read.
            with urllib.request.urlopen(url) as response:
                pagedata = response.read().decode("utf-8", "ignore")
        except urllib.error.URLError as err:
            print(f"failed to fetch {url}: {err}", file=sys.stderr)
            continue
        for j, joke in enumerate(extract_jokes(pagedata)):
            print(f"第{i}页第{j}个段子的内容是:")
            print(replace_non_bmp(joke))


if __name__ == "__main__":
    # Pages 1 through 10, as in the original loop.
    crawl_pages(range(1, 11))
(2)多线程方式:
扫描二维码关注公众号,回复:
3755418 查看本文章
import urllib.request
import re
import urllib.error
import sys
import threading
# Create the opener first, then attach a browser-style User-Agent header to it.
opener = urllib.request.build_opener()
headers = (
    "User-Agent",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6726.400 QQBrowser/10.2.2265.400",
)
opener.addheaders = [headers]

# Install the opener globally; urllib.request.urlopen() calls made after this
# point use it and therefore send the header above.
urllib.request.install_opener(opener)
class One(threading.Thread):
    """Thread that crawls the odd-numbered listing pages (1, 3, 5, 7, 9)."""

    # Captures the text of the first <span> inside each <div class="content">
    # element; re.S lets "." match the newlines around the joke text.
    _JOKE_PATTERN = re.compile('<div class="content">.*?<span>(.*?)</span>.*?</div>', re.S)

    # Every code point above U+FFFF (emoji and similar). Compiled once here
    # instead of rebuilding a million-entry translate table for each joke.
    _NON_BMP_PATTERN = re.compile("[\U00010000-\U0010FFFF]")

    def __init__(self):  # extra parameters may be added here, e.g. (self, a)
        super().__init__()

    @classmethod
    def _extract_jokes(cls, pagedata):
        """Return the joke texts found in the decoded HTML *pagedata*."""
        return cls._JOKE_PATTERN.findall(pagedata)

    @classmethod
    def _replace_non_bmp(cls, text):
        """Return *text* with every non-BMP character replaced by U+FFFD.

        Per the original author's note, printing emoji in the bundled Python
        shell raises an error; substituting U+FFFD avoids it.
        """
        return cls._NON_BMP_PATTERN.sub("\ufffd", text)

    def run(self):
        """Fetch the odd-numbered pages and print every joke found on them.

        A page that cannot be fetched is reported on stderr and skipped, so
        one failed request does not end the thread early.
        """
        for i in range(1, 11, 2):
            url = "https://www.qiushibaike.com/8hr/page/" + str(i)
            try:
                # The context manager closes the HTTP response once read.
                with urllib.request.urlopen(url) as response:
                    pagedata = response.read().decode("utf-8", "ignore")
            except urllib.error.URLError as err:
                print(f"failed to fetch {url}: {err}", file=sys.stderr)
                continue
            for j, joke in enumerate(self._extract_jokes(pagedata)):
                print(f"第{i}页第{j}个段子的内容是:")
                print(self._replace_non_bmp(joke))
class Two(threading.Thread):
    """Thread that crawls the even-numbered listing pages (2, 4, 6, 8, 10)."""

    # Captures the text of the first <span> inside each <div class="content">
    # element; re.S lets "." match the newlines around the joke text.
    _JOKE_PATTERN = re.compile('<div class="content">.*?<span>(.*?)</span>.*?</div>', re.S)

    # Every code point above U+FFFF (emoji and similar). Compiled once here
    # instead of rebuilding a million-entry translate table for each joke.
    _NON_BMP_PATTERN = re.compile("[\U00010000-\U0010FFFF]")

    def __init__(self):
        super().__init__()

    @classmethod
    def _extract_jokes(cls, pagedata):
        """Return the joke texts found in the decoded HTML *pagedata*."""
        return cls._JOKE_PATTERN.findall(pagedata)

    @classmethod
    def _replace_non_bmp(cls, text):
        """Return *text* with every non-BMP character replaced by U+FFFD."""
        return cls._NON_BMP_PATTERN.sub("\ufffd", text)

    def run(self):
        """Fetch the even-numbered pages and print every joke found on them.

        A page that cannot be fetched is reported on stderr and skipped, so
        one failed request does not end the thread early.
        """
        for i in range(2, 11, 2):
            url = "https://www.qiushibaike.com/8hr/page/" + str(i)
            try:
                # The context manager closes the HTTP response once read.
                with urllib.request.urlopen(url) as response:
                    pagedata = response.read().decode("utf-8", "ignore")
            except urllib.error.URLError as err:
                print(f"failed to fetch {url}: {err}", file=sys.stderr)
                continue
            for j, joke in enumerate(self._extract_jokes(pagedata)):
                print(f"第{i}页第{j}个段子的内容是:")
                print(self._replace_non_bmp(joke))
# Create both crawler threads, then start them: `one` handles the odd pages
# and `two` the even pages, so the two halves are fetched concurrently.
one = One()
two = Two()
one.start()
two.start()