Python3.x使用requests库将爬取数据存储到MySQL

其他 2021-01-31 14:45:59 阅读次数: 0

Python3.x使用requests库将爬取数据存储到MySQL

豆瓣电影排名前250链接 https://movie.douban.com/top250

1.导入模块库

import requests 
from lxml import etree #灵活地处理 XML 和 HTML页面的库
import time
import pymysql #PyMySQL是在 Python3.x 版本中用于连接 MySQL 服务器的一个库

2.定义处理类

class Douban:
    """Crawl the Douban Top 250 movie list and store each movie in MySQL.

    The crawl starts at the list page, visits every movie detail page linked
    from it, inserts one row per movie into the ``test3`` table and then
    follows the "next page" link until there is none left.

    A single MySQL connection is opened lazily by :meth:`lian` and shared by
    all database methods, so that statements are committed (or rolled back)
    on the same connection that executed them.
    """

    def __init__(self):
        # Browser-like request headers sent with every HTTP request.
        self.header = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Host": "movie.douban.com",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.92 Safari/537.36"
        }
        # Shared MySQL connection; created on first use by lian().
        self._db = None

    @staticmethod
    def _first(nodes, default=""):
        """Return the first xpath result in *nodes*, or *default* if empty."""
        return nodes[0] if nodes else default

    def get_html(self, url):
        """Fetch *url* and return the response body decoded as UTF-8.

        Returns an empty string when the request fails or the server answers
        with an HTTP error status; the failure is reported on stdout.
        """
        try:
            response = requests.get(url, headers=self.header, timeout=10)
            # Treat 4xx/5xx answers as failures instead of parsing error pages.
            response.raise_for_status()
            response.encoding = "utf-8"
            return response.text
        except Exception as e:
            # Format the exception; concatenating str + Exception would raise.
            print(f"页面获取失败: {e}")
            return ""

    def detail_url(self, html):
        """Process one list page given as HTML text.

        Every movie linked from the page is scraped via :meth:`detail_html`
        (with a 2 second pause between requests), then the crawl continues
        with the next list page via :meth:`next_html`. An empty or
        unparsable page ends the crawl quietly.
        """
        if not html:
            return
        tree = etree.HTML(html)
        if tree is None:
            return
        links = tree.xpath('//ol[@class="grid_view"]/li//div[@class="pic"]/a/@href')
        for url in links:
            self.detail_html(url)
            time.sleep(2)
        self.next_html(tree)

    def next_html(self, html):
        """Follow the "next page" link of a parsed list page, if there is one.

        *html* is the parsed element tree of the current list page. When the
        page has no next link (the last page) nothing happens.
        """
        hrefs = html.xpath('//span[@class="next"]/a/@href')
        if not hrefs:
            return
        # The href is a relative query string appended to the list URL.
        next_url = "https://movie.douban.com/top250" + hrefs[0]
        print("="*1000, next_url)
        page = self.get_html(next_url)
        self.detail_url(page)

    def detail_html(self, url):
        """Scrape one movie detail page at *url* and insert it into MySQL.

        Pages that cannot be fetched or that contain no title are skipped;
        other missing fields are stored as empty strings.
        """
        htmls = self.get_html(url)
        if not htmls:
            return
        html = etree.HTML(htmls)
        if html is None:
            return
        name = "".join(html.xpath('//div[@id="content"]/h1//span/text()'))  # title
        if not name:
            # No title means this is not a regular movie page; skip it.
            print(f"详情页解析失败: {url}")
            return
        img_url = self._first(html.xpath('//div[@id="mainpic"]/a/img/@src'))  # poster URL
        daoyan = self._first(html.xpath('//div[@id="info"]/span[1]/span[2]/a/text()'))  # director
        bianju = "".join(html.xpath('//div[@id="info"]/span[2]/span[2]//a/text()'))  # writers
        zhuyan = "".join(html.xpath('//div[@id="info"]/span[3]/span[2]//text()')).replace("/", "，")  # cast
        genre = "".join(html.xpath('//span[@property="v:genre"]/text()'))  # genres
        score = self._first(html.xpath('//strong[contains(@class,"rating_num")]/text()'))  # rating
        zu = (name, img_url, daoyan, bianju, zhuyan, genre, score)
        print(zu)
        self.insert_table(zu)

    def lian(self):
        """Return the shared MySQL connection, opening it on first use."""
        db = getattr(self, "_db", None)
        if db is None:
            # Keyword arguments: PyMySQL 1.0+ no longer accepts positional ones.
            db = pymysql.connect(
                host="localhost",
                user="root",
                password="root",
                database="python_test",
            )
            self._db = db
        return db

    def create_table(self):
        """Drop and recreate the ``test3`` table that stores the movies."""
        sql = """
        create table test3(
        id int primary key auto_increment,
        name varchar(255),
        img_url varchar(255),
        daoyan varchar(255),
        bianju varchar(255),
        zhuyan text,
        type varchar(255),
        score varchar(255)
        )character set utf8
        """
        cursor = self.lian().cursor()
        try:
            cursor.execute("drop table if exists test3")
            cursor.execute(sql)
        finally:
            cursor.close()

    def insert_table(self, zu):
        """Insert one movie row *zu* into ``test3``.

        *zu* is the 7-tuple (name, img_url, daoyan, bianju, zhuyan, type,
        score). On failure the transaction is rolled back and the error is
        reported on stdout; the crawl continues.
        """
        sql = "insert into test3(name,img_url,daoyan,bianju,zhuyan,type,score) value(%s,%s,%s,%s,%s,%s,%s) "
        # Execute, commit and roll back on the SAME connection.
        db = self.lian()
        try:
            cursor = db.cursor()
            try:
                cursor.execute(sql, zu)
            finally:
                cursor.close()
            db.commit()
        except Exception as e:
            db.rollback()
            print("添加失败", e)

    def run(self):
        """Recreate the table, crawl the whole Top 250 list, then disconnect."""
        url = "https://movie.douban.com/top250"
        try:
            self.create_table()
            html = self.get_html(url)
            self.detail_url(html)
        finally:
            # Close the connection that was actually used, if one was opened.
            db = getattr(self, "_db", None)
            if db is not None:
                db.close()
                self._db = None



# Script entry point: build the crawler and run the full crawl.
if __name__ == "__main__":
    crawler = Douban()
    crawler.run()