import requests
from lxml import etree
# Site root; prepended to the relative links scraped from the list pages.
# (Fixed: the original used curly “smart quotes”, which are a syntax error.)
Basic_main = 'http://www.ygdy8.net'
# Browser-like User-Agent so the site serves the normal HTML to the scraper.
Headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36',
}
# Parse one list ("total") page of the site.
def parse_tatol_data(url):
    """Fetch one movie-list page and return the absolute detail-page URLs on it.

    url: a list page such as http://www.ygdy8.net/html/gndy/dyzz/list_23_1.html
    Returns a list of absolute URLs (site root + relative href).
    """
    response = requests.get(url, headers=Headers)
    # The site is GBK-encoded; decode the raw bytes explicitly instead of
    # trusting response.text, whose guessed charset produced mojibake
    # (the original comment noted exactly this problem).
    text = response.content.decode('gbk', errors='ignore')
    # etree.HTML builds an XPath-capable tree and auto-repairs unclosed tags.
    html = etree.HTML(text)
    # Relative link of each movie's detail page.
    # (Fixed: curly quotes inside the XPath string broke the predicate.)
    part_detail_urls = html.xpath("//table[@class='tbspan']//a/@href")
    # Return a list rather than a lazy map() so callers can iterate it
    # more than once.
    return [Basic_main + part for part in part_detail_urls]
# Parse one detail ("branch") page of the site.
def parse_branch_data(detail_url):
    """Scrape one movie detail page into a dict.

    Keys produced when present on the page: title, cover, screenshot,
    year, place, director, actors (list), profile, download_url.
    """
    movie = {}
    response = requests.get(detail_url, headers=Headers)
    # Page is GBK-encoded; decode from the raw bytes.
    text = response.content.decode('gbk', errors='ignore')
    html = etree.HTML(text)
    # [0] unwraps the single-element list xpath returns; text() yields strings.
    # (Fixed: curly quotes inside the XPath strings broke the predicates.)
    title = html.xpath("//div[@class='title_all']//font[@color='#07519a']/text()")[0]
    movie['title'] = title
    # Cover image, screenshot, cast and synopsis all live in the Zoom div.
    zoom = html.xpath("//div[@id='Zoom']")[0]
    imgs = zoom.xpath(".//img/@src")  # leading '.' = relative to zoom
    # Guard the indexing: some pages carry fewer than two images.
    if len(imgs) > 0:
        movie['cover'] = imgs[0]
    if len(imgs) > 1:
        movie['screenshot'] = imgs[1]

    def parse_info(info, rule):
        # Strip the field label `rule` and surrounding whitespace.
        return info.replace(rule, '').strip()

    infos = zoom.xpath(".//text()")
    for index, info in enumerate(infos):
        if info.startswith('◎年 代'):
            movie['year'] = parse_info(info, '◎年 代')
        elif info.startswith('◎产 地'):
            movie['place'] = parse_info(info, '◎产 地')
        elif info.startswith('◎导 演'):
            movie['director'] = parse_info(info, '◎导 演')
        elif info.startswith('◎主 演'):
            # The lead actor is on this line; further actors follow one per
            # text node until the next '◎标 签' field.
            actors = [parse_info(info, '◎主 演')]
            for x in range(index + 1, len(infos)):
                actor = infos[x].strip()
                if actor.startswith('◎标 签'):
                    break
                actors.append(actor)
            movie['actors'] = actors
        elif info.startswith('◎简 介'):
            # BUG FIX (1): the original matched '◎简 介 ' with a trailing
            # space, which silently skipped pages without that exact spacing.
            # BUG FIX (2): the original overwrote movie['profile'] on every
            # iteration, keeping only the last line of the synopsis; collect
            # all lines up to the download section and join them instead.
            profile_lines = []
            for x in range(index + 1, len(infos)):
                profile = infos[x].strip()
                if profile.startswith('【下载地址】'):
                    break
                if profile:
                    profile_lines.append(profile)
            movie['profile'] = '\n'.join(profile_lines)
    # Guard against pages with no download link instead of raising IndexError.
    download_urls = html.xpath("//td[@bgcolor='#fdfddf']//a/@href")
    if download_urls:
        movie['download_url'] = download_urls[0]
    return movie
# Collect the data of the first 7 list pages.
def main():
    """Scrape list pages 1-7 and every detail page they link to.

    Returns the list of movie dicts produced by parse_branch_data.
    """
    # (Fixed: curly quotes made this string literal a syntax error.)
    basic_url = 'http://www.ygdy8.net/html/gndy/dyzz/list_23_{}.html'
    movies = []
    # Pages 1..7 of the movie list.
    for page in range(1, 8):
        url = basic_url.format(page)
        detail_urls = parse_tatol_data(url)
        # Scrape every detail page found on this list page.
        for detail_url in detail_urls:
            movies.append(parse_branch_data(detail_url))
    return movies


# (Fixed: the original `if name == 'main':` never fires — the dunder
# names __name__ / '__main__' are required for the script entry guard.)
if __name__ == '__main__':
    main()