爬虫学得好,私货少不了

import urllib.request
import urllib.parse
from lxml import etree
import time
import os
def handle_request(url: str, page: int) -> urllib.request.Request:
    """Build the HTTP request for one listing page.

    Args:
        url: URL template containing one ``{}`` placeholder that receives
            the page suffix.
        page: Page number. Pages 0 and 1 get an empty suffix; any other
            page ``n`` gets the suffix ``_n``.

    Returns:
        A ``urllib.request.Request`` for the formatted URL, carrying the
        fixed ``User-Agent`` and ``Cookie`` headers defined below.
    """
    # The first page is addressed without a numeric suffix.
    suffix = '' if page in (0, 1) else '_' + str(page)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36',
        'Cookie': 'UM_distinctid=172bafc075930f-0b3214295ea2fd-f313f6d-1fa400-172bafc075a521; __gads=ID=7343205fec19267e:T=1592274993:S=ALNI_MZuCx78VBx2WBiIEBOXsKZoldvefg'
    }
    return urllib.request.Request(url=url.format(suffix), headers=headers)
def parse_content(content):
    """Find the image sources in a listing page and download each one.

    Args:
        content: HTML text of a listing page.
    """
    tree = etree.HTML(content)
    # The image address is read from the ``src2`` attribute of each <img>
    # (presumably a lazy-loading attribute on this site -- confirm if the
    # page markup changes).
    image_list = tree.xpath('//div[@id="container"]/div/div/a/img/@src2')
    for image_src in image_list:
        download_image(image_src)
def download_image(image_src):
    """Download one image and save it under the local output directory.

    The file is stored in the relative directory ``站长美女图片爬取`` (created
    on demand) under the base name of ``image_src``. After each download the
    function sleeps for two seconds before returning.

    Args:
        image_src: Image address taken from the page. A protocol-relative
            address (``//host/path``) gets ``https:`` prepended; an address
            that already carries a scheme is used unchanged.
    """
    dirpath = '站长美女图片爬取'
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(dirpath, exist_ok=True)
    filename = os.path.basename(image_src)
    print(filename)
    filepath = os.path.join(dirpath, filename)
    print(filepath)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36',
        'Cookie': 'UM_distinctid=172bafc075930f-0b3214295ea2fd-f313f6d-1fa400-172bafc075a521; __gads=ID=7343205fec19267e:T=1592274993:S=ALNI_MZuCx78VBx2WBiIEBOXsKZoldvefg'
    }
    # Only protocol-relative sources need a scheme; prepending one to an
    # already absolute URL would produce an invalid address.
    if image_src.startswith('//'):
        image_url = 'https:' + image_src
    else:
        image_url = image_src
    request = urllib.request.Request(url=image_url, headers=headers)
    # Context managers make sure both the HTTP response and the output file
    # are closed even if reading or writing fails.
    with urllib.request.urlopen(request) as response:
        with open(filepath, 'wb') as fp:
            print(f'正在爬取{filename}')
            fp.write(response.read())
            print(f'{filename}爬取完毕')
    # Pause between downloads to keep the request rate low.
    time.sleep(2)
def main():
    """Prompt for a page range and crawl every listing page in it.

    Reads the first and last page numbers from standard input (both
    inclusive), fetches each listing page, and hands its HTML to
    ``parse_content``.

    Raises:
        ValueError: If either entered page number is not an integer.
    """
    url = 'http://sc.chinaz.com/tupian/xingganmeinvtupian{}.html'
    start_page = int(input('请输入起始页码:'))
    end_page = int(input('请输入结束页码:'))
    for page in range(start_page, end_page + 1):
        request = handle_request(url, page)
        print('开始爬取第{}页'.format(page))
        # Close the connection as soon as the page body has been read.
        with urllib.request.urlopen(request) as response:
            content = response.read().decode()
        parse_content(content)
        print('第{}页爬取结束'.format(page))
if __name__ == '__main__':
    # Script entry point: run the crawl and report the elapsed wall-clock
    # time in seconds.
    start = time.time()
    main()
    print('蜘蛛结网完毕,收工')
    end = time.time()
    print(f'爬取所有妹子图片用时: {end-start}s')

- 在这个星球上,你很重要,请珍惜你的珍贵!~~~夜斗小神社
