# 通过 https://baike.baidu.com/view/<数字> 的方式尽可能遍历百科词条

# -*- coding: utf-8 -*-
# @time : 2019/7/1  14:56
import requests
import time

# Crawl Baidu Baike by enumerating numeric ids: each id is appended to
# baseUrl, the request is sent, and the final (post-redirect) URL reported
# by `response.url` is recorded.
baseUrl = 'http://baike.baidu.com/view/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36'}
# Requests remaining before the next short rest; reset to 300 after each rest.
countToSleep = 300

# Output files, opened in append mode so earlier results are kept:
#   itemUrl.txt            - URLs of entries that could be reached
#   filedItemUrl.txt       - ids whose request failed (presumably because of
#                            Baidu's anti-crawling measures); meant to be
#                            re-assembled into URLs and retried afterwards
#   errorNumberItemUrl.txt - ids whose assembled URL does not exist
# The `with` block guarantees the files are flushed and closed even when the
# crawl is interrupted.
with open("itemUrl.txt", "a+", encoding="utf8") as writer, \
        open("filedItemUrl.txt", "a+", encoding="utf8") as filedWriter, \
        open("errorNumberItemUrl.txt", "a+", encoding="utf8") as errorNumber:
    for i in range(1, 15500000):
        try:
            # A timeout keeps one stalled connection from hanging the whole
            # crawl; a timed-out id is recorded as failed like any other
            # network error.
            response = requests.get(baseUrl + str(i), headers=headers,
                                    timeout=30)
            if 'error' in response.url:
                # The final URL contains 'error': treat the id as nonexistent.
                errorNumber.write(str(i) + '\n')
            else:
                writer.write(response.url + '\n')
                print("第" + str(i) + "个;当前url:" + response.url)
        except requests.RequestException:
            # Only network-level failures are recorded for retry;
            # KeyboardInterrupt and programming errors propagate.
            filedWriter.write(str(i) + '\n')
            print("服务端断开连接,重新连接爬取...")
            time.sleep(4)

        # Count the id only after it has been processed, so that no id is
        # skipped when the counter reaches zero.
        countToSleep -= 1
        if countToSleep <= 0:
            time.sleep(2)
            print("休息-------------------")
            countToSleep = 300

            # Persist progress at every rest point.
            writer.flush()
            errorNumber.flush()
            filedWriter.flush()

  

# 猜你喜欢

# 转载自 www.cnblogs.com/dhName/p/11115696.html