6. SQLite database (data persistence)

Since Python 3, the sqlite3 module is included in the standard library by default. To keep the whole crawler project lightweight and efficient, we use the SQLite database for persistence.
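A quick way to confirm that the module is available (no separate installation is needed):

import sqlite3

# sqlite3 ships with the Python 3 standard library; this prints the bundled SQLite engine version
print(sqlite3.sqlite_version)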

SQLite storage classes (basic data types)

Storage class: description
NULL: the value is a NULL value.
INTEGER: the value is a signed integer, stored in 1, 2, 3, 4, 6, or 8 bytes depending on the magnitude of the value.
REAL: the value is a floating-point value, stored as an 8-byte IEEE floating-point number.
TEXT: the value is a text string, stored using the database encoding (UTF-8, UTF-16BE, or UTF-16LE).
BLOB: the value is a blob of data, stored exactly as it was input (a binary large object).

For affinity data types, please refer to the Runoob tutorial: SQLite Data Types.


(Screenshot of the SQLite affinity data types table: https://img-blog.csdnimg.cn/20210129183339102.png)
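To make the affinity idea concrete, here is a small sketch (an illustrative in-memory table of my own, not part of the tutorial) showing that the declared column type only sets an affinity, while typeof() reports the storage class actually used:

import sqlite3

conn = sqlite3.connect(":memory:")  # throwaway in-memory database
c = conn.cursor()
c.execute("create table demo (n int, s char(50))")
# n has INTEGER affinity, s has TEXT affinity; SQLite converts values where it can
c.execute("insert into demo values ('42', 123)")
c.execute("select n, typeof(n), s, typeof(s) from demo")
print(c.fetchone())  # (42, 'integer', '123', 'text')
conn.close()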

Basic operations

Import the package
import sqlite3
1. Connect to or create a database
conn = sqlite3.connect("test.db")  # open or create the database file
2. Create a table
    c = conn.cursor()  # cursor
    # the triple quotes preserve the multi-line formatting of the SQL
    sql = '''
        create table company
            (id int primary key not null,
            name text not null,
            age int not null,
            address char(50),
            salary real);
    '''

    c.execute(sql)  # execute the sql
    conn.commit()  # commit the database operation
    conn.close()  # close the database connection
3. Insert data
    conn = sqlite3.connect("test.db")  # open or create the database file
    c = conn.cursor()  # cursor
    sql = '''
        insert into company(id, name, age, address, salary)
            values (1, "阿强", 32, "北京", 15000),
            (2, "阿斗", 31, "上海", 10000);
    '''
    c.execute(sql)  # execute the sql
    conn.commit()  # commit the database operation
    conn.close()  # close the database connection
4. Query data
    conn = sqlite3.connect("test.db")  # open or create the database file
    c = conn.cursor()  # cursor
    sql = "select id, name, address, salary from company"
    cursor = c.execute(sql)  # execute the sql
    for row in cursor:
        print('id = ', row[0])
        print('name = ', row[1])
        print('address = ', row[2])
        print('salary = ', row[3], '\n')
    conn.close()  # close the database connection

To create a table or insert data, execute the SQL statement, commit the operation, and then close the database connection. Queries do not need a commit: a query returns a sqlite3.Cursor object, and the results are read by iterating over it. For long SQL statements it is recommended to wrap them in a pair of triple quotes (''') so that the multi-line formatting is preserved and the statement does not break. Modifying and updating data follow the same steps, so they are not demonstrated at length here.
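For reference, here is a minimal sketch (my own illustration against the test.db / company table above; the extra row is made-up sample data) that uses ? placeholders so sqlite3 quotes the values itself, plus fetchall() to collect a query result in one call:

import sqlite3

conn = sqlite3.connect("test.db")
c = conn.cursor()

# parameterized insert: sqlite3 fills the ? placeholders and handles the quoting
c.execute("insert into company(id, name, age, address, salary) values (?, ?, ?, ?, ?)",
          (3, "阿花", 28, "广州", 12000))

# update follows the same execute / commit pattern as insert
c.execute("update company set salary = ? where id = ?", (16000, 3))
conn.commit()

# fetchall() returns the remaining query results as a list of tuples
rows = c.execute("select id, name, salary from company").fetchall()
for row in rows:
    print(row)

conn.close()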


Task-driven development: store the data crawled from the Douban movie Top 250 into the database

import re  # regular expressions, for text matching
# import bs4  # only BeautifulSoup from bs4 is needed, so it can be imported as below:
from bs4 import BeautifulSoup  # web page parsing and data extraction
import xlwt  # Excel operations
import sqlite3  # SQLite database operations
import urllib.request, urllib.error  # build the url request and fetch the page data


def main():
    # the page to crawl
    baseurl = "https://movie.douban.com/top250?start="
    # save paths
    savepath = ".\\豆瓣电影Top250.xls"  # use \\ in the path, or prefix the whole string with r, e.g. r".\豆瓣电影Top250.xls"
    savepath2Db = "movies.db"
    # 1. crawl the pages
    # print(askURL(baseurl))
    datalist = getData(baseurl)
    print(datalist)
    # 3. save the data (to Excel or to the database)
    saveData(datalist, savepath)
    saveData2Db(datalist, savepath2Db)


# rule for the movie detail page link
findLink = re.compile(r'<a href="(.*?)">')  # create the regular expression object
# rule for the movie poster image link
findImgSrc = re.compile(r'<img alt=".*src="(.*?)"', re.S)  # re.S lets . match newlines as well
# movie title
findTitle = re.compile(r'<span class="title">(.*)</span>')
# movie rating
findRating = re.compile(r'<span class="rating_num" property="v:average">(.*)</span>')
# number of ratings
# findJudge = re.compile(r'<span>(\d*)(.*)人评价</span>')
findJudge = re.compile(r'<span>(\d*)人评价</span>')
# one-sentence summary
findInq = re.compile(r'<span class="inq">(.*)</span>')
# other movie details
findBd = re.compile(r'<p class="">(.*?)</p>', re.S)  # the content contains <br/>, so . must match newlines too


# crawl the pages
def getData(baseurl):
    datalist = []
    for i in range(0, 10):  # 25 movies per page
        url = baseurl + str(i*25)
        html = askURL(url)  # the fetched page source
        # print(html)
        # 2. parse the data (page by page)
        soup = BeautifulSoup(html, "html.parser")  # parse the html document with html.parser into a tree structure
        for item in soup.find_all("div", class_="item"):  # find all matching tags and return them as a list
            # print(item)
            data = []  # holds the information of one movie
            item = str(item)
            # movie detail page link
            link = re.findall(findLink, item)[0]
            data.append(link)
            # poster image
            img = re.findall(findImgSrc, item)[0]
            data.append(img)
            # title
            titles = re.findall(findTitle, item)
            if len(titles) == 2:
                ctitle = titles[0]  # Chinese title
                data.append(ctitle)
                otitle = titles[1].replace("/", "")
                data.append(otitle)  # foreign title
            else:
                data.append(titles[0])
                data.append(' ')  # leave the foreign title blank
            # data.append(title)
            # rating
            rating = re.findall(findRating, item)[0]
            data.append(rating)
            # number of ratings
            judgeNum = re.findall(findJudge, item)[0]
            # print(judgeNum)
            data.append(judgeNum)
            # one-sentence summary
            inq = re.findall(findInq, item)
            if len(inq) == 0:
                data.append(" ")
            else:
                data.append(inq[0].replace("。", ""))  # drop the trailing Chinese full stop
            # other movie details
            bd = re.findall(findBd, item)[0]
            bd = re.sub(r'<br(\s+)?/>(\s+)?', " ", bd)  # remove <br/>
            bd = re.sub('/', " ", bd)  # replace /
            data.append(bd.strip())  # strip leading and trailing whitespace

            datalist.append(data)  # save the processed information of one movie
        # for it in datalist:
        #     print(it)
    return datalist


# fetch the page source for the given url
def askURL(url):
    # request headers; the user agent disguises the request as a normal browser visit
    head = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/87.0.4280.88 Safari/537.36"}
    req = urllib.request.Request(url, headers=head)
    html = ""  # the fetched page source
    try:
        response = urllib.request.urlopen(req)
        html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):  # has attribute
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html


def saveData(datalist, savepath):
    book = xlwt.Workbook(encoding="utf-8", style_compression=0)  # style_compression: whether styles are compressed
    sheet = book.add_sheet("豆瓣电影top250", cell_overwrite_ok=True)  # allow cell contents to be overwritten
    col = ("电影详情链接", "图片链接", "影片中文名", "影片外文名", "评分", "评价数", "概述", "相关信息")  # tuple of column headers
    for i in range(8):  # write the header row (column names)
        sheet.write(0, i, col[i])

    for i in range(1, len(datalist)+1):
        for j in range(len(datalist[i-1])):
            sheet.write(i, j, datalist[i-1][j])
    book.save(savepath)


def saveData2Db(datalist, savepath2Db):
    init_db(savepath2Db)
    conn = sqlite3.connect(savepath2Db)
    cur = conn.cursor()
    for data in datalist:
        for index in range(len(data)):
            if index == 4 or index == 5:
                continue
            data[index] = '"'+str(data[index])+'"'
        sql = '''
            insert into movie250(
                info_link, pic_link, cname, ename, score, rated, introduction, info)
                values(%s)
        '''%",".join(data)
        # print(sql)
        cur.execute(sql)
        conn.commit()
    cur.close()
    conn.close()


def init_db(dbpath):
    sql = '''
        create table movie250(
            id integer primary key autoincrement,
            info_link text,
            pic_link text,
            cname varchar,
            ename varchar,
            score numeric,
            rated numeric,
            introduction text,
            info text
        )
    '''
    conn = sqlite3.connect(dbpath)  # create the database connection
    cursor = conn.cursor()  # create a cursor
    cursor.execute(sql)  # execute the sql statement
    conn.commit()  # commit
    # cursor.close()
    conn.close()  # close the database connection


if __name__ == '__main__':
    # init_db("movieTest.db")  # test the database initialization
    main()
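One caveat: init_db raises sqlite3.OperationalError if movie250 already exists, so the script can only be run once against the same file. A re-runnable variant (my own adjustment, not part of the original code) only needs CREATE TABLE IF NOT EXISTS:

def init_db(dbpath):
    # same schema as above, but creation is skipped when the table already exists
    sql = '''
        create table if not exists movie250(
            id integer primary key autoincrement,
            info_link text,
            pic_link text,
            cname varchar,
            ename varchar,
            score numeric,
            rated numeric,
            introduction text,
            info text
        )
    '''
    conn = sqlite3.connect(dbpath)
    conn.execute(sql)  # a connection can execute sql directly without an explicit cursor
    conn.commit()
    conn.close()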

When storing data into the database, make sure every field has a value; even a missing value must be written as an empty string, otherwise the insert into the database may raise an error.
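The root cause is that saveData2Db builds the INSERT by string concatenation, so every value has to arrive pre-quoted and non-empty. A safer variant (my own sketch, assuming the same movie250 schema and the 8-element rows produced by getData) hands the raw values to sqlite3 through ? placeholders, which also copes with quotes inside titles:

def saveData2Db(datalist, savepath2Db):
    init_db(savepath2Db)
    conn = sqlite3.connect(savepath2Db)
    sql = '''
        insert into movie250(
            info_link, pic_link, cname, ename, score, rated, introduction, info)
            values (?, ?, ?, ?, ?, ?, ?, ?)
    '''
    # executemany inserts one row per movie and lets sqlite3 do all the quoting
    conn.cursor().executemany(sql, datalist)
    conn.commit()
    conn.close()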

The movie Top 250 data is saved to the database correctly as well.




Origin blog.csdn.net/qq_43808700/article/details/113595705