# Fetch the novel's chapter-index page and regex out every chapter link and
# title; the Spider threads below consume these two parallel lists.
import re
import threading

import requests

url = "https://xx.com/11/11947/"
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
    # "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.9",
    # "Host": "httpbin.org",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36"
}
# NOTE(review): 347.99.91.112 is not a valid IPv4 address (347 > 255) — swap in
# a working proxy; ideally keep a pool and give each thread its own entry.
proxy = {'http': "347.99.91.112:3128"}
# timeout so a dead/slow proxy cannot hang the whole script indefinitely
resp = requests.get(url, proxies=proxy, headers=headers, timeout=30).content
resp = resp.decode(encoding="gbk")  # the site serves GBK-encoded HTML
t = re.compile('<a href="(/11.*?)">.*?</a>')   # captures relative chapter hrefs
t2 = re.compile('<a href="/11.*?">(.*?)</a>')  # captures chapter titles
arr1_src = t.findall(resp)
arr1_title = t2.findall(resp)
print(arr1_title)
print(arr1_src)
class Spider(threading.Thread):
    """Worker thread that downloads one slice of chapters and writes each
    chapter as a UTF-8 ``<title>.txt`` file.

    Relies on the module-level ``headers`` and ``proxy`` for every request.
    """

    # Chapter-body extractor. Hoisted to a class attribute so it is compiled
    # once, not once per downloaded page; [\s\S] (instead of '.') is needed
    # because the body spans CR/LF line breaks.
    _content_re = re.compile(r'<div id="content">([\s\S]*?)</div>')

    base_path = "https://xx.com"  # prepended to the relative hrefs from the index

    def __init__(self, list_title, list_src):
        """``list_title``/``list_src``: parallel lists of chapter titles and
        relative chapter URLs (same length, same order)."""
        super().__init__()
        self.list_title = list_title
        self.list_src = list_src

    def run(self) -> None:
        # Iterate titles and hrefs in lockstep instead of indexing by range().
        for title, src in zip(self.list_title, self.list_src):
            url = self.base_path + src
            print(url)
            try:
                resp = requests.get(url, headers=headers, proxies=proxy).content.decode(encoding="gbk")
            except Exception as e:
                # Network/decoding failure: report it and stop this worker.
                print(e)
                break
            # Collapse double <br/> paragraph breaks into plain CRLF and map
            # the space-like entity to a regular space.
            # NOTE(review): the first .replace() argument is presumably a
            # non-breaking space (U+00A0) — confirm against the live HTML.
            parts = [
                m.replace("<br />\r\n<br />\r\n", "\r\n").replace(" ", " ")
                for m in self._content_re.findall(resp)
            ]
            txt = "".join(parts)  # join once instead of quadratic += in a loop
            try:
                with open(r"F:\妹子\book\example" + "\\" + title + ".txt", "w", encoding='utf-8') as f:
                    f.write(txt)
            except Exception as e:
                # Unwritable path (e.g. illegal characters in the title, or a
                # missing directory): report and stop this worker.
                print(e)
                break
# Fan the chapter list out to worker threads: five roughly-equal segments
# plus a sixth slice that picks up the len % 5 remainder.
seg = len(arr1_title) // 5
for i in range(6):  # 6th iteration covers indices 5*seg .. len-1 (the remainder)
    try:
        list_title = arr1_title[i * seg:(i + 1) * seg]
        list_src = arr1_src[i * seg:(i + 1) * seg]
        # A slice is never None, so the original `is not None` check was
        # always true; test for emptiness instead so we never start a thread
        # that has no chapters to fetch (e.g. when len % 5 == 0).
        if list_title and list_src:
            spyder = Spider(list_title, list_src)
            spyder.start()
    except Exception as e:
        print(e)
# 代理 ip 自己找, 最好整个 ip 池, 然后每个线程用不同的 ip; 网站链接就不贴了, 不好意思贴出来
# 思路: 先点进目录 (正则提取出所有的标题和链接), 放入 list_title 和 list_src,
# 然后开几个线程去分段爬小说;
# 然后进入一篇小说的网页, 分析 article 所在的标签,
# 我点的这个网站的小说全在 <div id="content"> 里,
# 用 ([\s\S]*?) 提取. 注意: 不能用 . , 因为 . 不匹配回车换行