Crawling Baidu Images page by page in Python (two implementations)

First, a note: some examples online use this URL:

r'http://image.baidu.com/search/flip?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1497491098685_R&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&ctd=1497491098685%5E00_1519X735&word={}'.format(word)
# this is the older, flip-style (paginated) version

(The address bar no longer shows this URL; it is what Baidu Images used before its page structure changed.)

Others use the following URL:

url="https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm= -1&st=-1&fm=index&fr=&hs=0&xthttps=111111&sf=1&fmq=&pv=\
    &ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word={}&pn={}".format(word, page)

(This is the URL that appears in the address bar.)

The first URL is the old one and the second is the current one. Both can be used for crawling, but the way pagination works differs between them.
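
To make the difference concrete, here is a minimal sketch (the helper names and the trimmed parameter lists are my own; real behaviour may depend on the full query string) that builds both URL variants for a keyword; only word (and, for the index URL, pn) actually changes:

from urllib.parse import quote


def build_flip_url(word):
    # Old "flip" URL: the server returns an HTML page containing a "下一页" (next page) link.
    base = ('http://image.baidu.com/search/flip?tn=baiduimage&ipn=r&ct=201326592'
            '&cl=2&lm=-1&st=-1&fm=result&sf=1&ie=utf-8&word=')
    return base + quote(word, encoding='utf-8')


def build_index_url(word, page):
    # Current "index" URL: pagination is driven by the pn parameter.
    base = ('https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592'
            '&cl=2&lm=-1&st=-1&fm=index&sf=1&ie=utf-8&word={}&pn={}')
    return base.format(quote(word, encoding='utf-8'), page)


print(build_flip_url('成都'))
print(build_index_url('成都', 1))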

Second URL (code): first, type the kind of images you want to batch-download into Baidu Images. When the search keyword changes, only the value after word in the address bar changes; the code then iterates over every page number and downloads the results. (Some images seem to fail to download along the way.)

# coding=utf-8
import os
import re
import time
import requests
import urllib.request
from urllib.parse import quote


def generate_path(path, word):
    """Create the output directory <path>/<word> if it does not exist yet."""
    if not os.path.exists(path + '/' + word):
        os.mkdir(path + '/' + word)


def dowload_pic(path, pic_url, page):
    """Download every image URL found on one result page."""
    num = 0
    for string in pic_url:
        ID = (page - 1) * 30 + num   # global index of the image across pages
        num += 1
        time.sleep(1)                # throttle requests a little
        try:
            print("Downloading " + string)
            f_req = urllib.request.Request(string, headers=headers)
            f_url = urllib.request.urlopen(f_req).read()
            generate_path(path, wd)
            fs = open(path + "/" + wd + '/' + str(ID) + ".jpg", "wb+")
            fs.write(f_url)
            fs.close()
            print(ID, "downloaded successfully")
        except Exception:
            print(ID, "download failed")
            continue


headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36",
    "referer": "https://image.baidu.com"
}
path = r'F:\国科大\pachong\pic'   # this base directory must already exist
wd = '成都'
word = quote(wd, encoding="utf-8")
page = 1
while True:
    print(page)
    url = "https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm= -1&st=-1&fm=index&fr=&hs=0&xthttps=111111&sf=1&fmq=&pv=\
    &ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word={}&pn={}".format(word, page)
    data = requests.get(url, timeout=500)
    data.encoding = 'utf-8'
    da = data.text
    # Each image's real address appears in the page source as "objURL":"..."
    pic_url = re.findall('"objURL":"(.*?)",', da, re.S)
    if pic_url == []:
        # No more results: we have reached the last page.
        break
    else:
        dowload_pic(path, pic_url, page)
    page += 1

First URL (code): here, instead of counting page numbers, each result page is parsed for the link to the next page ("下一页") and the crawler follows it until no such link remains.

import os
import re
import urllib.parse
import requests


def fanye(onepageurl):
    """获取单个翻页的所有图片的urls+当前翻页的下一翻页的url"""
    if not onepageurl:
        print('Reached the last page, stopping')
        return [], ''
    try:
        html = requests.get(onepageurl)
        html.encoding = 'utf-8'
        html = html.text
    except Exception as e:
        print(e)
        pic_urls = []
        fanye_url = ''
        return pic_urls, fanye_url
    pic_urls = re.findall('"objURL":"(.*?)",', html, re.S)
    fanye_urls = re.findall(re.compile(r'<a href="(.*)" class="n">下一页</a>'), html, flags=0)
    fanye_url = 'http://image.baidu.com' + fanye_urls[0] if fanye_urls else ''
    return pic_urls, fanye_url


def down_pic(pic_urls):
    """Download every image in the given list of URLs."""
    for i, pic_url in enumerate(pic_urls):
        try:
            pic = requests.get(pic_url, timeout=15)
            string = str(i + 1) + '.jpg'
            # The target directory must already exist before chdir.
            os.chdir(r'F:\国科大\pachong\pic\fanye')
            with open(string, 'wb') as f:
                f.write(pic.content)
                print('Downloaded image %s successfully' % (i + 1))
        except Exception as e:
            print('Failed to download image %s:' % (i + 1))
            print(e)
            continue


if __name__ == '__main__':
    word = '成都' 
    url = r'http://image.baidu.com/search/flip?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1497491098685_R&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&ctd=1497491098685%5E00_1519X735&word='
    url_init = url + urllib.parse.quote(word)
    pic_urls = []
    page_url, fanye_url = fanye(url_init)
    pic_urls.extend(page_url)

    fanye_count = 0  # number of pages fetched so far
    while 1:
        page_url, fanye_url = fanye(fanye_url)
        fanye_count += 1
        # print('Page %s' % fanye_count)
        if fanye_url == '' and page_url == []:
            break
        pic_urls.extend(page_url)
 
    down_pic(pic_urls)

Because of Baidu's anti-crawler measures, the code may raise errors while running.
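
One way to soften this (a minimal sketch of my own, not from the original post; the get_with_retry helper and its parameter defaults are assumptions) is to send a browser-like User-Agent/Referer and retry failed requests with a short back-off:

import time
import requests

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36",
    "Referer": "https://image.baidu.com",
}


def get_with_retry(url, retries=3, delay=2, timeout=15):
    """Hypothetical helper: GET a URL with headers and simple retries."""
    for attempt in range(retries):
        try:
            resp = requests.get(url, headers=HEADERS, timeout=timeout)
            resp.raise_for_status()
            return resp
        except Exception as e:
            print('Attempt %d failed: %s' % (attempt + 1, e))
            time.sleep(delay)
    return None

# Usage: swap the plain requests.get(...) calls in fanye() / down_pic()
# for get_with_retry(...), and check for None before using the response.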
