Crawling a novel with a multithreaded spider

import re
import threading

import requests

url = "https://xx.com/11/11947/"  # the novel's table-of-contents page (placeholder domain)

headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
    # "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.9",
    # "Host": "httpbin.org",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36"
}

# Placeholder proxy -- 347.x.x.x is not a valid IP; substitute your own (see the pool sketch below)
proxy = {'http': 'http://347.99.91.112:3128'}
resp = requests.get(url, proxies=proxy, headers=headers).content
resp = resp.decode(encoding="gbk")  # this site serves GBK-encoded pages
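If the site's charset weren't known up front, requests can detect it instead of hard-coding gbk; a brief sketch using requests' apparent_encoding attribute:

r = requests.get(url, proxies=proxy, headers=headers)
r.encoding = r.apparent_encoding  # charset guessed by charset_normalizer/chardet
resp = r.text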

t = re.compile('<a href="(/11.*?)">.*?</a>')   # captures each chapter's relative URL
t2 = re.compile('<a href="/11.*?">(.*?)</a>')  # captures each chapter's title

arr1_src = t.findall(resp)
arr1_title = t2.findall(resp)
print(arr1_title)
print(arr1_src)
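The two findall passes only stay aligned if both patterns match exactly the same set of anchors. A single pattern with two capture groups (a sketch, not the author's code) keeps each title paired with its link by construction:

pairs = re.findall(r'<a href="(/11.*?)">(.*?)</a>', resp)
arr1_src = [href for href, title in pairs]
arr1_title = [title for href, title in pairs]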


class Spider(threading.Thread):
    base_path = "https://xx.com"

    def __init__(self, list_title, list_src):
        super().__init__()
        self.list_title = list_title  # chapter titles for this thread's slice
        self.list_src = list_src      # matching relative chapter URLs

    def run(self) -> None:
        # The chapter body sits inside <div id="content">; [\s\S] also matches newlines
        thandler = re.compile(r'<div id="content">([\s\S]*?)</div>')
        for i in range(len(self.list_title)):
            url = self.base_path + self.list_src[i]
            print(url)
            try:
                resp = requests.get(url, headers=headers, proxies=proxy).content.decode(encoding="gbk")
            except Exception as e:
                print(e)
                break

            arr = thandler.findall(resp)
            txt = ""
            for chunk in arr:
                # Strip the <br /> line breaks and &nbsp; entities left over from the HTML
                chunk = chunk.replace("<br />\r\n<br />\r\n", "\r\n")
                chunk = chunk.replace("&nbsp;", " ")
                txt = txt + chunk

            try:
                # Note: titles containing characters Windows forbids in filenames will make open() fail
                with open(r"F:\妹子\book\example" + "\\" + self.list_title[i] + ".txt", "w", encoding='utf-8') as f:
                    f.write(txt)
            except Exception as e:
                print(e)
                break

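Because chapter titles flow straight into filenames, a title containing a character like ? or : will break the write on Windows. A minimal sanitizer (safe_filename is a hypothetical helper, not part of the original) could wrap self.list_title[i] before the open():

def safe_filename(title: str) -> str:
    # Swap out the characters Windows forbids in filenames
    return re.sub(r'[\\/:*?"<>|]', '_', title)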

# Chunk size via ceiling division, so five slices always cover the whole list
seg = -(-len(arr1_title) // 5)

threads = []
for i in range(5):
    list_title = arr1_title[i * seg:(i + 1) * seg]
    list_src = arr1_src[i * seg:(i + 1) * seg]
    if list_title:  # a slice is never None, but the last one may be empty
        spyder = Spider(list_title, list_src)
        spyder.start()
        threads.append(spyder)

# Wait for every worker to finish
for spyder in threads:
    spyder.join()

Find your own proxy IPs -- ideally build an IP pool and give each thread a different IP. I won't post the site's link; it wouldn't be right to share it.
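A minimal sketch of that pool idea, assuming a hand-collected list (the 203.0.113.x addresses below are documentation-range placeholders); each Spider would take its own proxy instead of sharing the module-level one:

import itertools

PROXY_POOL = itertools.cycle([
    {'http': 'http://203.0.113.10:3128'},  # placeholder
    {'http': 'http://203.0.113.11:3128'},  # placeholder
    {'http': 'http://203.0.113.12:3128'},  # placeholder
])

# e.g. hand next(PROXY_POOL) to each Spider and use it in its requests.get calls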
The approach: first open the table of contents and use regexes to extract every chapter title and link into list_title and list_src, then start several threads, each crawling its own segment of the novel.

Then open a single chapter's page and look at which tag the article sits in. On the site I used, the body of every chapter is wrapped in <div id="content">, so a regex with the group ([\s\S]*?) extracts it. Note that a plain . won't work here, because by default . does not match carriage returns and newlines.
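Equivalently, . can be made to match newlines with the re.DOTALL flag; a quick illustration:

import re

html = '<div id="content">line one\r\nline two</div>'
a = re.findall(r'<div id="content">([\s\S]*?)</div>', html)
b = re.findall(r'<div id="content">(.*?)</div>', html, re.DOTALL)
assert a == b  # [\s\S] and . with re.DOTALL behave the same here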
