Python web-scraping case study: downloading videos from ibaotu.com (包图网)

# coding=utf-8
import requests
from lxml import etree

# Script: scrape the ibaotu.com video listing page and download every video.
# Fix vs. original paste: the loop body had lost its indentation (IndentationError),
# and the HTTP calls had no timeout or status check.

# Fetch the listing page; a timeout keeps the script from hanging forever.
response = requests.get("https://ibaotu.com/shipin/", timeout=10)
response.raise_for_status()
# Parse the HTML text into an lxml element tree.
html = etree.HTML(response.text)
# Extract all video titles and all video source URLs via XPath.
title_list = html.xpath('//span[@class="video-title"]/text()')
print(title_list)
src_list = html.xpath('//div[@class="video-play"]/video/@src')
print(src_list)
# Pair each title with its source URL and download the video.
for tit, src in zip(title_list, src_list):
    # The scraped src is protocol-relative ("//host/..."), so prepend the scheme.
    video = requests.get("http:" + src, timeout=30)
    video.raise_for_status()
    # Build the output file name from the title.
    file_name = tit + ".mp4"
    print('保存视频文件: {}'.format(file_name))
    # Write the binary payload to disk.
    with open(file_name, 'wb') as f:
        f.write(video.content)


from urllib import *
import requests
from lxml import etree

class Spider:
    """Scrape ibaotu.com's video listing and download each video as an .mp4.

    Fix vs. original paste: method bodies had lost their indentation, and
    ``run`` iterated ``zip(info)`` over the (titles, srcs) tuple, which
    called ``saveinfo`` twice and downloaded every video twice.
    """

    def geturl(self, url):
        """Fetch *url* and return its decoded HTML text."""
        response = requests.get(url, timeout=10)
        # apparent_encoding guesses the charset from the body bytes,
        # avoiding mojibake on pages with a wrong/missing header charset.
        response.encoding = response.apparent_encoding
        return response.text

    def getinfo(self, url):
        """Parse HTML text (*url* is the page source, not an address).

        Returns a ``(titles, srcs)`` tuple of two parallel lists.
        """
        html = etree.HTML(url)
        tit = html.xpath('//span[@class="video-title"]/text()')
        src = html.xpath('//div[@class="video-play"]/video/@src')
        return tit, src

    def saveinfo(self, name, data):
        """Download each (title, src) pair and write it as '<title>.mp4'."""
        for n, l in zip(name, data):
            # src values are protocol-relative, so prepend the scheme.
            responce = requests.get("http:" + l, timeout=30)
            file_name = n + ".mp4"
            print("正在下载:{}".format(file_name))
            with open(file_name, 'wb') as f:
                f.write(responce.content)

    def run(self, firsturl):
        """Entry point: fetch *firsturl*, extract info, download all videos."""
        html = self.geturl(firsturl)
        # Unpack once; the original `for date in zip(info)` looped over the
        # 2-tuple itself and triggered the whole download twice.
        name, src = self.getinfo(html)
        self.saveinfo(name, src)


# Script entry point: kick off a full scrape-and-download run.
if __name__ == '__main__':
    Spider().run("https://ibaotu.com/shipin/")

猜你喜欢

转载自（reposted from）: https://www.cnblogs.com/smilyroy/p/11436261.html
今日推荐