Example code:
import asyncio
import aiohttp

async def fetch_url(session, semaphore, url, retry=3):
    try:
        async with semaphore:  # the shared semaphore caps how many requests run at once
            async with session.post(url, headers={
                'Accept': 'application/json, text/javascript, */*; q=0.01',
                'X-Requested-With': 'XMLHttpRequest',
                'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36'
            }) as response:
                if response.status == 200:
                    html = await response.text()
                    print(f"Fetched {url} successfully, content length: {len(html)}")
                    return html
                else:
                    print(f"Failed to fetch {url}, status code: {response.status}")
        # Reaching this point means the request did not succeed; the semaphore slot
        # has already been released, so retrying here does not block other tasks.
        if retry > 0:
            print(f"Retrying {url}, attempts left: {retry}")
            await asyncio.sleep(1)  # wait 1 second before retrying
            return await fetch_url(session, semaphore, url, retry - 1)
        else:
            print(f"Retrying {url} failed, maximum attempts reached")
            return None
    except aiohttp.ClientError as e:
        print(f"Error while fetching {url}: {e}")
        if retry > 0:
            print(f"Retrying {url}, attempts left: {retry}")
            await asyncio.sleep(5)  # wait 5 seconds before retrying
            return await fetch_url(session, semaphore, url, retry - 1)
        else:
            print(f"Retrying {url} failed, maximum attempts reached")
            return None

async def fetch_all_urls(url):
    """
    Fetch the same URL 30 times concurrently.
    :param url: the URL to fetch
    :return: list of fetched page contents
    """
    semaphore = asyncio.Semaphore(25)  # limit concurrency to 25, shared by all tasks
    # Create a connector that does not verify SSL certificates
    connector = aiohttp.TCPConnector(ssl=False)
    async with aiohttp.ClientSession(connector=connector) as session:
        tasks = [fetch_url(session, semaphore, url) for _ in range(30)]  # build the task list
        results = await asyncio.gather(*tasks)  # run all tasks concurrently
        return results

async def main():
    url = 'https://example.com'
    # Fetch the URL concurrently
    results = await fetch_all_urls(url)
    # Print the results
    for content in results:
        if content:
            print(f"{url} fetched successfully, content length: {len(content)}")
        else:
            print(f"{url} fetch failed")

if __name__ == "__main__":
    asyncio.run(main())
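
If you actually need to fetch several different URLs, as the docstring of fetch_all_urls hints, a minimal variant could look like the sketch below. It reuses fetch_url and the same shared-semaphore pattern; the fetch_many name and the sample URLs are placeholders, not part of the original code:

async def fetch_many(urls, concurrency=25):
    """Sketch: fetch a list of distinct URLs concurrently, reusing fetch_url defined above."""
    semaphore = asyncio.Semaphore(concurrency)   # one shared limiter for all tasks
    connector = aiohttp.TCPConnector(ssl=False)  # skip SSL verification, as in the example
    async with aiohttp.ClientSession(connector=connector) as session:
        tasks = [fetch_url(session, semaphore, u) for u in urls]
        return await asyncio.gather(*tasks)

# Example usage (hypothetical URLs):
# asyncio.run(fetch_many(['https://example.com', 'https://example.org']))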