This example is for learning and reference only; if it infringes any rights, please contact us and it will be removed!
Suppose we want to grab the "2024 must-see films" ("2024必看热片") section.
Right-click the page and choose "View page source".
Locate this block in the source.
Then write regular expressions to match it:
obj1 = re.compile(r"2024必看热片.*?<ul>(?P<ul>.*?)</ul>", re.S)
obj2 = re.compile(r"<a href='(?P<href>.*?)'")
obj3 = re.compile(r'◎片 名(?P<movie>.*?)<br />.*?<td '
                  r'style="WORD-WRAP: break-word" bgcolor="#fdfddf"><a href="(?P<download>.*?)">', re.S)
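To see how the named groups behave, here is a minimal sketch run against a made-up HTML fragment (the fragment is purely illustrative and not copied from the real page):

import re

obj1 = re.compile(r"2024必看热片.*?<ul>(?P<ul>.*?)</ul>", re.S)
obj2 = re.compile(r"<a href='(?P<href>.*?)'")

# hypothetical snippet shaped like the list on the index page
sample = "2024必看热片<ul><li><a href='/i/108888.html'>Movie A</a></li><li><a href='/i/108889.html'>Movie B</a></li></ul>"

for m in obj1.finditer(sample):
    ul = m.group('ul')              # everything between <ul> and </ul>
    for a in obj2.finditer(ul):
        print(a.group('href'))      # /i/108888.html, then /i/108889.html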
Full code:
# http://dytt89.com/
# 1. Locate the "2024 must-see films" section
# 2. Extract the links to the child pages from that section
# 3. Request each child page and grab the download link we want
import requests
import re
import csv
import pandas as pd
import time
url = 'http://dytt89.com/'
resp = requests.get(url, verify=False)  # verify=False skips the SSL certificate check
resp.encoding = 'gb2312'  # set the character set used by the site
# print(resp.text)
# grab the <li> entries inside the <ul>
obj1 = re.compile(r"2024必看热片.*?<ul>(?P<ul>.*?)</ul>", re.S)
obj2 = re.compile(r"<a href='(?P<href>.*?)'")
obj3 = re.compile(r'◎片 名(?P<movie>.*?)<br />.*?<td '
                  r'style="WORD-WRAP: break-word" bgcolor="#fdfddf"><a href="(?P<download>.*?)">', re.S)
result1 = obj1.finditer(resp.text)
child_href_list = []
for it in result1:
    ul = it.group('ul')
    # print(ul)
    # extract the href of each child page
    result2 = obj2.finditer(ul)
    for itt in result2:
        child_href = url + itt.group('href').strip('/')
        child_href_list.append(child_href)  # save for later
# write the results to a CSV file
with open('data_movie.csv', mode='w', encoding='utf-8', newline='') as f:
    csvwriter = csv.writer(f)
    csvwriter.writerow(["电影名称", "电影下载链接"])  # write the header row
    # pull the data out of each child page
    for href in child_href_list:
        try:
            child_resp = requests.get(href, verify=False)
            child_resp.encoding = 'gb2312'
            result3 = obj3.search(child_resp.text)
            if result3:
                movie = result3.group('movie').strip()
                download = result3.group('download')
                print(movie, download)
                # write this movie's info
                csvwriter.writerow([movie, download])
            time.sleep(1)  # pause between requests so we don't hit the site too often
        except Exception as e:
            print(f"处理链接 {href} 失败: {e}")
# read the CSV back and convert it to an Excel file
df = pd.read_csv('data_movie.csv')
df.to_excel('data_movie.xlsx', index=False)
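Two practical notes, based on a typical environment rather than anything stated in the original post: df.to_excel needs the openpyxl package installed to write .xlsx files, and verify=False makes requests print an InsecureRequestWarning for every call. An optional snippet to silence that warning, placed next to the imports:

# Optional: requests warns about the skipped certificate check when verify=False is used.
# urllib3 is installed alongside requests, so this call is normally available.
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)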
Results:
Copy any of the extracted links and it will open in Thunder (迅雷) to start the download.
Congratulations, you've got it, now go try it yourself!