Python3.x使用requests库将爬取数据存储到MySQL
豆瓣电影排名前250链接 https://movie.douban.com/top250
1.导入模块库
import requests
from lxml import etree #灵活地处理 XML 和 HTML页面的库
import time
import pymysql #PyMySQL是在 Python3.x 版本中用于连接 MySQL 服务器的一个库
2.定义处理类
class Douban:
    """Crawl the Douban Top 250 movie list and store each movie in MySQL.

    The crawler walks every list page, opens each movie's detail page,
    extracts a handful of fields with XPath and inserts one row per movie
    into the ``test3`` table of the ``python_test`` database.
    """

    def __init__(self):
        # Browser-like request headers sent with every HTTP request.
        self.header = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Host": "movie.douban.com",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.92 Safari/537.36",
        }
        # Lazily opened MySQL connection, shared by all database methods.
        self._db = None

    @staticmethod
    def _first(nodes, default=""):
        """Return the first XPath result, or *default* when nothing matched."""
        return nodes[0] if nodes else default

    def get_html(self, url):
        """Download *url* and return its body as text.

        Returns an empty string when the request fails or the server
        answers with an HTTP error status.
        """
        try:
            response = requests.get(url, headers=self.header, timeout=10)
            # Turn 4xx/5xx answers into exceptions instead of parsing an
            # error page as if it were a movie page.
            response.raise_for_status()
            response.encoding = "utf-8"
            return response.text
        except requests.RequestException as e:
            # str() is required: concatenating the exception object itself
            # raises TypeError and would mask the real failure.
            print("页面获取失败" + str(e))
            return ""

    def detail_url(self, html):
        """Process one list page given as HTML text.

        Scrapes every movie linked from the page (pausing two seconds
        after each one), then follows the "next page" link if present.
        Does nothing when *html* is empty.
        """
        if not html:
            # The download failed; there is nothing to parse.
            return
        tree = etree.HTML(html)
        if tree is None:
            return
        links = tree.xpath('//ol[@class="grid_view"]/li//div[@class="pic"]/a/@href')
        for link in links:
            self.detail_html(link)
            time.sleep(2)
        self.next_html(tree)

    def next_html(self, html):
        """Follow the "next page" link of a parsed list page.

        *html* is the parsed document (anything exposing ``xpath``). The
        last page has no such link, in which case the crawl simply stops.
        """
        hrefs = html.xpath('//span[@class="next"]/a/@href')
        if not hrefs:
            return
        next_url = "https://movie.douban.com/top250" + hrefs[0]
        print("=" * 1000, next_url)
        self.detail_url(self.get_html(next_url))

    def detail_html(self, url):
        """Scrape one movie detail page and insert the result into MySQL.

        Fields missing from the page are stored as empty strings. The
        page is skipped entirely when it cannot be downloaded or parsed.
        """
        page = self.get_html(url)
        if not page:
            return
        tree = etree.HTML(page)
        if tree is None:
            return
        name = "".join(tree.xpath('//div[@id="content"]/h1//span/text()'))  # title
        img_url = self._first(tree.xpath('//div[@id="mainpic"]/a/img/@src'))  # poster URL
        daoyan = self._first(tree.xpath('//div[@id="info"]/span[1]/span[2]/a/text()'))  # director
        bianju = "".join(tree.xpath('//div[@id="info"]/span[2]/span[2]//a/text()'))  # screenwriters
        zhuyan = "".join(
            tree.xpath('//div[@id="info"]/span[3]/span[2]//text()')
        ).replace("/", ",")  # leading actors
        genre = "".join(tree.xpath('//span[@property="v:genre"]/text()'))  # genres
        score = self._first(tree.xpath('//strong[contains(@class,"rating_num")]/text()'))  # rating
        zu = (name, img_url, daoyan, bianju, zhuyan, genre, score)
        print(zu)
        self.insert_table(zu)

    def lian(self):
        """Return the shared MySQL connection, opening it on first use.

        One connection is reused so that the cursor, ``commit`` and
        ``rollback`` all act on the same session.
        """
        db = getattr(self, "_db", None)
        if db is None:
            # Keyword arguments: PyMySQL 1.0+ no longer accepts positional
            # connection parameters.
            db = pymysql.connect(
                host="localhost",
                user="root",
                password="root",
                database="python_test",
                charset="utf8mb4",
            )
            self._db = db
        return db

    def create_table(self):
        """Drop and recreate the ``test3`` table."""
        sql = """
            create table test3(
            id int primary key auto_increment,
            name varchar(255),
            img_url varchar(255),
            daoyan varchar(255),
            bianju varchar(255),
            zhuyan text,
            type varchar(255),
            score varchar(255)
            )character set utf8
        """
        with self.lian().cursor() as cursor:
            cursor.execute("drop table if exists test3")
            cursor.execute(sql)

    def insert_table(self, zu):
        """Insert one movie tuple into ``test3``.

        *zu* is ``(name, img_url, daoyan, bianju, zhuyan, type, score)``.
        A database error rolls the transaction back and is reported, but
        does not stop the crawl.
        """
        sql = "insert into test3(name,img_url,daoyan,bianju,zhuyan,type,score) value(%s,%s,%s,%s,%s,%s,%s) "
        db = self.lian()
        try:
            with db.cursor() as cursor:
                cursor.execute(sql, zu)
            db.commit()
        except pymysql.MySQLError as e:
            db.rollback()
            print("添加失败", e)

    def run(self):
        """Recreate the table, crawl all pages, then close the connection."""
        self.create_table()
        try:
            html = self.get_html("https://movie.douban.com/top250")
            self.detail_url(html)
        finally:
            # Close the one shared connection even if the crawl fails.
            self.lian().close()
            self._db = None
# Script entry point: build the crawler and run the full crawl.
if __name__ == '__main__':
    Douban().run()
3.结果省略。。。
如果对您有帮助,麻烦点个赞。您的鼓励就是我的动力!