20200311_最新爬取mzitu

废话不多说，直接上代码（Python 3.6）：
import requests
from bs4 import BeautifulSoup
import os
import time;
import random

#pip install BeautifulSoup4 -i  https://pypi.douban.com/simple
#pip install requests -i  https://pypi.douban.com/simple

# HTTP request headers sent with every HTML page request.
Hostreferer = {
    'Referer': 'http://www.mzitu.com',
    'Upgrade-Insecure-Requests': '1',
    # Adjacent string literals are concatenated by the parser; the resulting
    # value is a single User-Agent string.
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/69.0.3497.100 Safari/537.36'
    ),
}

# Headers used when downloading the image files themselves. The Referer is
# what gets past the image host's hotlink protection.
# Alternatives that were tried earlier and are no longer used:
#   User-Agent 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)'
#   User-Agent 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36
#              (KHTML, like Gecko) Chrome/73.0.3679.0 Safari/537.36'
#   Referer 'http://i.meizitu.net'
# Example of an album page URL: https://www.mzitu.com/224497/3
Picreferer = {
    'Referer': 'http://www.mzitu.com',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/69.0.3497.100 Safari/537.36'
    ),
}
all_url = 'https://www.mzitu.com'
# Request the site's home page and keep the returned HTML for parsing.
# A timeout is set so that a stalled connection cannot hang the script forever.
start_html = requests.get(all_url, headers=Hostreferer, timeout=30)
# Stop here with a clear HTTP error (e.g. 403 when the request is blocked)
# instead of failing later while indexing an empty pagination list.
start_html.raise_for_status()

soup = BeautifulSoup(start_html.text, "html.parser")
page = soup.find_all('a', class_='page-numbers')
# The second-to-last pagination link is read as the highest page number
# (presumably the last link is the "next page" link -- verify against the site).
if len(page) < 2:
    raise IndexError(
        'expected at least two "page-numbers" links on %s, found %d'
        % (all_url, len(page))
    )
# Highest listing page number, kept as text; converted with int() where used.
max_page = page[-2].text
# Root directory that receives one sub-directory per album.
path = 'D:/mzitu/'
# Translation table that deletes the characters Windows does not allow in
# file or directory names; applied to album titles before they are used as
# directory names.
invalid_name_chars = str.maketrans('', '', '\\/:*?"<>|')
# Seconds to wait on any single request, so a stalled connection cannot hang
# the crawl forever.
request_timeout = 30

# Walk every listing page, then every album linked from it, then every image
# page of that album, saving each image under <path>/<album title>/.
for n in range(1, int(max_page) + 1):
    # Page 1 is the bare home page; later listing pages live under /page/<n>/.
    all_url = 'https://www.mzitu.com'
    if n != 1:
        all_url = all_url + "/page/" + str(n) + "/"
    print('开始爬第 %s 页, 网址是 %s' % (n , all_url))
    start_html = requests.get(all_url, headers=Hostreferer, timeout=request_timeout)
    soup = BeautifulSoup(start_html.text, "html.parser")

    pins = soup.find(id='pins')
    if pins is None:
        # No album list on this page (blocked request or changed layout).
        continue
    hrefs = pins.find_all('a', target='_blank')

    for href in hrefs:
        imgs = href.find('img', class_='lazy')
        if imgs is None:
            # This link carries no thumbnail; skip just this link instead of
            # abandoning the rest of the listing page.
            continue
        url = href.get('href')
        if not url:
            continue

        # Album title -> directory name. Fall back to the last URL segment when
        # the title is missing or consists only of invalid characters.
        alt = imgs.get('alt') or ''
        album_name = alt.translate(invalid_name_chars).strip()
        if not album_name:
            album_name = url.rstrip('/').split('/')[-1]

        start_html2 = requests.get(url, headers=Hostreferer, timeout=request_timeout)
        soup2 = BeautifulSoup(start_html2.text, "html.parser")
        pagenavi = soup2.find('div', class_='pagenavi')
        page2 = pagenavi.find_all('a') if pagenavi is not None else []
        if len(page2) < 2:
            # Cannot determine how many image pages the album has.
            continue
        # Second-to-last navigation link is read as the last image page number.
        max_page2 = page2[-2].text

        # Build the album directory from the fixed root each time, so one
        # album's name never leaks into the next album's path.
        album_dir = path + album_name
        os.makedirs(album_dir, exist_ok=True)

        # Image pages are numbered 1..max_page2 inclusive.
        for m in range(1, int(max_page2) + 1):
            # Random pause between requests to avoid hammering the server.
            time.sleep(random.randint(1, 5))
            url3 = url + '/' + str(m) + '/'
            print('开始爬→%s' % url3)
            start_html3 = requests.get(url3, headers=Hostreferer, timeout=request_timeout)
            soup3 = BeautifulSoup(start_html3.text, "html.parser")

            # Locate <div class="main-image"> -> <a> -> <img src=...>.
            main_image = soup3.find('div', class_='main-image')
            link = main_image.find('a') if main_image is not None else None
            img_tag = link.find('img') if link is not None else None
            pic_src = img_tag.get('src') if img_tag is not None else None
            if not pic_src:
                continue

            html = requests.get(pic_src, headers=Picreferer, timeout=request_timeout)
            if html.status_code != 200:
                # Do not save an error page under an image file name.
                print('下载失败(HTTP %s): %s' % (html.status_code, pic_src))
                continue

            # The file name is the last path segment of the image URL.
            file_name = album_dir + '/' + pic_src.split('/')[-1]
            # Write the image; the context manager closes the file even if
            # the write fails.
            with open(file_name, 'wb') as f:
                f.write(html.content)
            print('图片保存到%s' % file_name)
20200311_最新爬取mzitu

猜你喜欢