learning record
learning record
1. Goals
aHR0cHM6Ly9waWMubmV0Ymlhbi5jb20vDQo=
For each picture on a listing page, get its href and title, follow the link to the picture's detail page to extract the image download address, then request that address and save the image to local disk.
2. Code implementation
"""
CSDN: 抄代码抄错的小牛马
mailbox:[email protected]
"""
import os
import time
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
import requests
from lxml import etree
from userAgentPooL import userAgent
from ipPooL import IP
# User-Agent string fetched once at import time from the project's userAgent
# helper; the same value is sent with every listing-page and detail-page request.
UA = userAgent.get_ua()
def get_data(url):
    """Crawl the listing pages in *url* and download every wallpaper they link to.

    For each listing page the HTML is fetched through a proxy taken from
    ``IP.get_ip()`` and decoded as GBK. Every ``<li>`` entry of the
    ``slist`` block is followed to its detail page, where the image ``src``
    and its ``alt`` title are read. The actual image download is handed to a
    thread pool via ``download(src=..., title=...)``. After all pages have
    been scanned the function waits for every download and prints the
    elapsed time.

    Entries whose listing link, image source or title cannot be found are
    skipped with a message instead of aborting the whole crawl.

    Args:
        url: Iterable of listing-page URLs to crawl.

    Returns:
        None.

    Raises:
        requests.RequestException: If a listing or detail page request fails
            or times out.
        Exception: Whatever a ``download`` task raised; it is re-raised when
            its future is collected.
    """
    # Local import keeps this block self-contained.
    from urllib.parse import urljoin

    base_url = 'https://pic.netbian.com/'
    # Seconds to wait on connect / between bytes, so a dead proxy cannot
    # block the crawl forever.
    request_timeout = 30

    start = time.time()
    futures = []
    # One pool of up to 20 worker threads; the with-block shuts it down
    # automatically, so no manual close is needed.
    with ThreadPoolExecutor(max_workers=20) as pool:
        for page_url in url:
            headers = {
                'User-Agent': UA,
                'Referer': 'https://www.igdcc.com/'
            }
            print(f'主页:{headers["User-Agent"]}')
            proxies = IP.get_ip()
            resp = requests.get(
                url=page_url,
                headers=headers,
                proxies=proxies,
                timeout=request_timeout,
            )
            print(resp)
            resp.encoding = 'gbk'
            tree = etree.HTML(resp.text)
            if tree is None:
                # NOTE(review): whether etree.HTML returns None or raises for
                # an empty body may depend on the lxml version - confirm.
                print(f'skipped listing page (unparseable): {page_url}')
                continue
            li_list = tree.xpath(
                '//div[@class="slist"]/ul[@class="clearfix"]/li'
            )

            # Detail-page headers are the same for every entry of this page.
            detail_headers = {
                'User-Agent': UA,
                'Referer': 'https://pic.netbian.com/'
            }
            for item in li_list:
                links = item.xpath('./a/@href')
                if not links:
                    print(f'skipped entry without link on: {page_url}')
                    continue
                # urljoin avoids the double slash that plain concatenation
                # produces when the href already starts with "/".
                href = urljoin(base_url, links[0])

                proxies = IP.get_ip()
                img_html = requests.get(
                    url=href,
                    headers=detail_headers,
                    proxies=proxies,
                    timeout=request_timeout,
                )
                img_html.encoding = 'gbk'
                new_tree = etree.HTML(img_html.text)
                if new_tree is None:
                    print(f'skipped detail page (unparseable): {href}')
                    continue

                srcs = new_tree.xpath(
                    '//div[@class="photo-pic"]/a/img/@src'
                )
                titles = new_tree.xpath(
                    '//div[@class="view"]/div[2]/a/img/@alt'
                )
                if not srcs or not titles:
                    print(f'skipped detail page (no image found): {href}')
                    continue
                src = urljoin(base_url, srcs[0])
                title = titles[0]

                # Hand the slow download to the pool so the crawl of the
                # next entry can proceed meanwhile.
                futures.append(pool.submit(download, src=src, title=title))

        # Collect results so an exception raised in a worker surfaces here.
        for f in futures:
            f.result()
    print('一共花费时间', time.time() - start)
def download(src, title):
    """Fetch the image at *src* and save it as ``./ThreadpicPool/<title>.jpg``.

    Args:
        src: URL of the image to download.
        title: Text used as the file name (without extension).

    Returns:
        None.

    Raises:
        requests.RequestException: If the request fails or times out.
        OSError: If the directory or file cannot be written.
    """
    # Timeout (seconds) keeps a stalled connection from pinning a pool
    # worker forever.
    img_content = requests.get(url=src, timeout=30).content

    # get_data submits this function to a thread pool, so several calls can
    # reach this point at once. exist_ok=True makes directory creation safe
    # under that race, unlike a separate exists() check followed by mkdir().
    os.makedirs('./ThreadpicPool', exist_ok=True)

    # 'wb': the image payload is binary.
    with open('./ThreadpicPool/%s.jpg' % title, 'wb') as fp:
        fp.write(img_content)
    print('{%s}下载成功!!!' % title)
def main():
    """Prompt for a page range, build the listing-page URLs and start the crawl.

    Page 1 of the site is ``index.html``; page N (N > 1) is
    ``index_N.html``. The start page is clamped to 1 so that no
    ``index_0.html``-style URLs are generated for non-positive input. If
    the start page is greater than the end page the URL list is empty and
    nothing is downloaded.

    Raises:
        ValueError: If either answer is not an integer.
    """
    print('----------------彼岸图网壁纸采集----------------')
    start_page = int(input('请输入起始页:'))
    # Prompt corrected: it asks for the END page (the original text was garbled).
    end_page = int(input('请输入终止页:'))

    first_page = max(start_page, 1)
    url = []
    for page in range(first_page, end_page + 1):
        if page == 1:
            url.append('https://pic.netbian.com/index.html')
        else:
            # No leading space inside the URL literal.
            url.append(f'https://pic.netbian.com/index_{page}.html')
    get_data(url)
# Run the scraper only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()
Effect: