![在这里插入图片描述](https://img-blog.csdnimg.cn/cover3/453357222947979978.jpg?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,image_MjAyMDA3MTUxNjIxMDEzOC5wbmc=,size_16,color_FFFFFF,t_70,image/resize,m_lfit,w_962#pic_center)
1、爬取准备
爬取目标
https://www.doutula.com/article/list/
批量爬取
温馨提示:爬取过程中保持网络通畅,不然会爬取失败!
2、完整代码
import os
import random
import time
from queue import Empty
from queue import Queue
from threading import Thread

import requests
from lxml import etree
# HTTP request headers: a desktop Chrome User-Agent so the target site
# serves normal pages instead of rejecting an obvious bot client.
headers = {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36"
}
# Root folder where downloaded images are saved ("图片" = "images").
path = "./图片"
# Data collection
class CrawlInfo(Thread):
    """Downloader thread: fetches listing-page URLs from ``url_queue`` and
    puts the raw HTML of each successful response onto ``html_queue``.

    Several instances run concurrently, so all queue and filesystem access
    must be race-free.
    """

    def __init__(self, url_queue, html_queue):
        Thread.__init__(self)
        self.url_queue = url_queue    # URLs still to be fetched
        self.html_queue = html_queue  # fetched HTML documents for the parsers

    def run(self):
        # makedirs(exist_ok=True) replaces the exists()+mkdir pair, which
        # could raise FileExistsError when two workers raced on the check.
        os.makedirs(path, exist_ok=True)
        while True:
            # get_nowait() + Empty closes the empty()-then-get() race: with
            # multiple workers another thread may drain the queue between
            # the emptiness check and the get() call.
            try:
                url = self.url_queue.get_nowait()
            except Empty:
                break
            # A timeout prevents one stalled connection from hanging the
            # worker (and the final join()) indefinitely.
            response = requests.get(url, headers=headers, timeout=10)
            # Forward only pages that were fetched successfully.
            if response.status_code == 200:
                self.html_queue.put(response.text)
# Data parsing and saving
class ParseCrawl(Thread):
    """Parser thread: takes raw HTML off ``html_queue``, extracts every
    article's title and image URLs with XPath, and saves each image under
    ``path/<article title>/``.
    """

    def __init__(self, html_queue):
        Thread.__init__(self)
        self.html_queue = html_queue  # raw HTML documents produced by CrawlInfo

    def run(self):
        while True:
            # get_nowait() + Empty closes the empty()-then-get() race that
            # existed between the emptiness check and the blocking get().
            try:
                html = self.html_queue.get_nowait()
            except Empty:
                break
            data = etree.HTML(html)
            # Each <a> under this container is one article (see figure 1).
            a_list = data.xpath("//div[@class='col-sm-9 center-wrap']/a")
            for a_singer in a_list:
                # Article title (see figure 2); text() yields the node text.
                name = a_singer.xpath(".//div[@class='random_title']/text()")[0]
                # Strip ALL zero-width spaces: the original replaced the
                # exact triple "\u200b\u200b\u200b" and missed other counts.
                name = str(name).replace("\u200b", "")
                new_path = path + "/" + name
                # exist_ok avoids the race when two threads hit the same title.
                os.makedirs(new_path, exist_ok=True)
                # data-original holds the real image URL; src is only the
                # lazy-load placeholder (see figure 3).
                img_url_list = a_singer.xpath(
                    ".//div[@class='random_article']//img[@class='lazy image_dtb img-responsive']/@data-original"
                )
                for index, img in enumerate(img_url_list):
                    # Keep the original extension (jpg / png / gif).
                    suffix = "." + str(img).split(".")[-1]
                    content = requests.get(img, headers=headers, timeout=10).content
                    # Sequential index names replace random.randint(1, 500),
                    # whose collisions silently overwrote earlier images.
                    with open(new_path + "/" + str(index) + suffix, "wb") as f:
                        f.write(content)
# Entry point
if __name__ == '__main__':
    start = time.time()
    url_queue = Queue()
    html_queue = Queue()
    base_url = "https://www.doutula.com/article/list/?page={}"
    # Enqueue the 13 listing pages we crawl.
    for page in range(1, 14):
        print("正在爬取第{}页".format(page))
        url_queue.put(base_url.format(page))
    # Fan out three downloader threads and wait for all of them, so the
    # HTML queue is fully populated before parsing begins.
    crawlers = [CrawlInfo(url_queue, html_queue) for _ in range(3)]
    for worker in crawlers:
        worker.start()
    for worker in crawlers:
        worker.join()
    # Fan out three parser/saver threads and wait for completion.
    parsers = [ParseCrawl(html_queue) for _ in range(3)]
    for worker in parsers:
        worker.start()
    for worker in parsers:
        worker.join()
    end = time.time()
    print("爬取时间:{}".format(end - start))
3、图片辅助分析
图1
图2
图3
4、运行结果
博主会持续更新,有兴趣的小伙伴可以点赞、关注和收藏下哦,你们的支持就是我创作最大的动力!
本文爬虫源码已由 GitHub https://github.com/2335119327/PythonSpider 收录(内含更多本博文没有的爬虫,有兴趣的小伙伴可以看看),之后会持续更新,欢迎Star。