Python multithreaded crawler for downloading Chinese Library Classification (中图分类号) codes

Target website

The target is http://www.ztflh.com/, which lists Chinese Library Classification codes. Crawling it means visiting 45,836 pages, and visiting them one by one is slow; fortunately, the site has no anti-crawling measures.

Single-threaded crawler

import requests
from bs4 import BeautifulSoup
import traceback

dic = {}
url = 'http://www.ztflh.com/?c='

def visit(i):
    # Fetch page i and store every (code, name) pair into the global dict.
    u = url + str(i)
    try:
        print(i, end='')
        html = requests.get(u)
        html.encoding = 'utf8'
        bs = BeautifulSoup(html.text, 'html.parser')
        lis = bs.find('ul', {'id': 'list'}).findAll('li')
        for li in lis:
            dic[li.span.text.strip('[]')] = li.a.text
        print()
    except Exception:
        # traceback.print_exc()
        print('failed', u)

for i in range(45837):
    visit(i)

print(dic)

But crawling like this, one page at a time, would be expected to take several hours.
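For a rough sense of why, here is a minimal timing sketch (the single sample URL and the extrapolation are only illustrative; actual per-request latency will vary):

import time
import requests

# Time one sample request and extrapolate to all 45837 pages fetched sequentially.
start = time.time()
requests.get('http://www.ztflh.com/?c=1')
per_request = time.time() - start

print('one request took about %.2f s' % per_request)
print('estimated sequential total: about %.1f hours' % (per_request * 45837 / 3600))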

Multithreaded crawler

With 100 threads, all the pages are crawled in under 10 minutes.

import threading
import requests
from bs4 import BeautifulSoup
from queue import Queue, Empty

url = 'http://www.ztflh.com/?c='

class Producer(threading.Thread):
    # Worker thread: pulls page URLs from page_queue and pushes
    # (code, name) pairs onto result_queue.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'
    }

    def __init__(self, page_queue, result_queue, *args, **kwargs):
        super(Producer, self).__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.result_queue = result_queue

    def run(self):
        # Keep taking URLs until the page queue is drained.
        # get_nowait() avoids blocking forever if another thread grabs
        # the last URL between an empty() check and a blocking get().
        while True:
            try:
                u = self.page_queue.get_nowait()
            except Empty:
                print('bye')
                break
            print('pages remaining:', self.page_queue.qsize())
            self.parse_page(u)

    def parse_page(self, u):
        try:
            html = requests.get(u, headers=self.headers)
            html.encoding = 'utf8'
            bs = BeautifulSoup(html.text, 'html.parser')
            lis = bs.find('ul', {'id': 'list'}).findAll('li')
            dic = {}
            for li in lis:
                # Each <li> holds the classification code in <span> and its name in <a>.
                self.result_queue.put((li.span.text.strip('[]'), li.a.text))
                dic[li.span.text.strip('[]')] = li.a.text
            print(u, dic)
        except Exception:
            print('failed', u)


N = 45836
N_threads = 100
page_queue = Queue(N)
result_queue = Queue()   # unbounded: each page yields several entries, so a cap of N could block

for i in range(N):
    page_queue.put(url + str(i))

# Start the workers and wait for all of them to finish
# before reading the collected results.
threads = [Producer(page_queue, result_queue) for _ in range(N_threads)]
for t in threads:
    t.start()
for t in threads:
    t.join()

result = list(result_queue.queue)
result.sort()

with open('中图分类号.csv', 'w', encoding='utf_8_sig') as file:
    for k, v in result:
        file.write(k + ',' + v + '\n')
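To sanity-check the output, the CSV can be read back into a Python dict. A minimal sketch (assuming the 中图分类号.csv file written above; the 'TP3' lookup is just a hypothetical example key):

# Load 中图分类号.csv back into a {code: name} dict.
clc = {}
with open('中图分类号.csv', encoding='utf_8_sig') as f:
    for line in f:
        # Split only on the first comma, in case a name itself contains commas.
        code, name = line.rstrip('\n').split(',', 1)
        clc[code] = name

print(len(clc), 'entries loaded')
print(clc.get('TP3'))  # hypothetical lookup; 'TP3' is assumed to be one of the crawled codes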

Crawl results

Please don't everyone go and crawl the site again; it's hard on the server. You can try the data I've already crawled first:
https://download.csdn.net/download/itnerd/12836734

Reposted from blog.csdn.net/itnerd/article/details/108527439