# -*- coding: utf-8 -*-
# @time : 2019/7/1 14:56
"""Crawl Baidu Baike entry URLs by enumerating http://baike.baidu.com/view/<n>.

Output files (all opened in append mode):
  itemUrl.txt            -- entry URLs that resolved successfully
  filedItemUrl.txt       -- ids that failed (server anti-crawl / connection drop);
                            re-crawl these later by rebuilding the URL from the id
  errorNumberItemUrl.txt -- ids whose rebuilt URL does not exist (error page)
"""
import time

import requests

baseUrl = 'http://baike.baidu.com/view/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36'}

# Number of requests to issue before pausing briefly (crude rate limiting).
BATCH_SIZE = 300
countToSleep = BATCH_SIZE

# Context managers guarantee the files are closed even on an unexpected exit;
# the original left all three handles open forever.
with open("itemUrl.txt", "a+", encoding="utf8") as writer, \
        open("filedItemUrl.txt", "a+", encoding="utf8") as filedWriter, \
        open("errorNumberItemUrl.txt", "a+", encoding="utf8") as errorNumber:
    for i in range(1, 15500000):
        countToSleep -= 1
        try:
            # BUG FIX: the original only issued the request when
            # countToSleep > 0, so the iteration where the counter ran out
            # silently dropped one id per batch. Every id is now requested;
            # the rest happens in `finally` after the request.
            # A timeout keeps a stalled connection from hanging the crawler;
            # requests.Timeout is a RequestException, so it lands in the
            # retry file like any other failure.
            response = requests.get(baseUrl + str(i), headers=headers, timeout=30)
            if 'error' in response.url:
                # Rebuilt URL does not exist -- record the bad id.
                errorNumber.write(str(i) + '\n')
            else:
                writer.write(response.url + '\n')
            print("第" + str(i) + "个;当前url:" + response.url)
        except requests.RequestException:
            # Narrowed from a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit and made the crawler unstoppable.
            # Remember the id for a later retry pass, back off, continue.
            filedWriter.write(str(i) + '\n')
            print("服务端断开连接,重新连接爬取...")
            time.sleep(4)
        finally:
            if countToSleep <= 0:
                # End of a batch: pause, reset the counter, and flush all
                # three files so progress survives a crash.
                time.sleep(2)
                print("休息-------------------")
                countToSleep = BATCH_SIZE
                writer.flush()
                errorNumber.flush()
                filedWriter.flush()