# Python crawler + MySQL data storage
#
# Copyright notice: original article by the blogger; reproduction without
# permission is prohibited. https://blog.csdn.net/qq_34272093/article/details/82663147
# import urllib.request
# import http.cookiejar
# import pymysql
# conn = pymysql.connect("localhost", "root", "123456", "test")
# cursor = conn.cursor()
# cursor.execute("DROP TABLE IF EXISTS employee")
# sql = """CREATE TABLE employee(first_name CHAR(20) NOT NULL,
#          last_name CHAR(20),
#          age INT,
#          sex CHAR(1))"""
# cursor.execute(sql)
# sqlInsert = """INSERT INTO employee(first_name,last_name,age,sex) VALUES('李白','白居易',20,'男')"""
# try:
#     cursor.execute(sqlInsert)
#     cursor.execute(sqlInsert)
#     conn.commit()
# except:
#     conn.rollback()
# conn.close()


# Crawler
import requests
from bs4 import BeautifulSoup
import pymysql

# Host of the MySQL server (local development database).
sql_host = 'localhost'
# Database user name.
sql_user = 'root'
# Database password.
sql_password = '123456'
# Name of the database (schema) to use.
sql_name = 'test'
# Parameterized INSERT for one scraped item; %s placeholders are bound by
# the PyMySQL driver, which also escapes the values.
SQL_INSERT = """INSERT INTO user_data(author,page,sex,age,vote) VALUES(%s,%s,%s,%s,%s)"""

def download_page(http_url):
    """Fetch *http_url* and return the response body as text.

    A browser-like User-Agent is sent because the target site rejects the
    default ``requests`` UA.

    Raises:
        requests.HTTPError: if the server responds with a 4xx/5xx status,
            so callers do not silently parse an error page.
        requests.Timeout: if no response arrives within 10 seconds —
            the original call had no timeout and could hang forever.
    """
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0"}
    response = requests.get(http_url, headers=headers, timeout=10)
    response.raise_for_status()
    return response.text


def get_page_content(html, page):
    """Parse one qiushibaike listing page and store each item in MySQL.

    Args:
        html: Raw HTML text of the listing page.
        page: 1-based page number, stored alongside each scraped row.
    """
    # Keyword arguments: the positional form of pymysql.connect is
    # deprecated and easy to get wrong.
    conn = pymysql.connect(host=sql_host, user=sql_user,
                           password=sql_password, database=sql_name)
    try:
        cursor = conn.cursor()
        soup = BeautifulSoup(html, 'html.parser')
        con = soup.find(id='content-left')
        if con is None:
            # Layout changed or the page carried no listing — nothing to store.
            return
        for item in con.find_all('div', class_='article'):
            author = item.find('h2').string
            content = item.find('div', class_='content').find('span').get_text()
            stats = item.find('div', class_='stats')
            vote = stats.find('span', class_='stats-vote').find('i', class_='number').get_text()
            # Scraped but not persisted: the user_data table has no comments column.
            comments = stats.find('span', class_='stats-comments').find('i', class_='number').string
            author_info = item.find('div', class_='articleGender')
            if author_info is not None:
                class_list = author_info['class']
                age = author_info.string
                if 'womenIcon' in class_list:
                    gender = '女'
                elif 'manIcon' in class_list:
                    gender = '男'
                else:
                    gender = ''
            else:
                # Anonymous posts carry no gender/age badge.
                gender = ''
                age = ''

            print(author, page, gender, age, vote, content)
            # Bug fix: the original executed a literal INSERT with hard-coded
            # placeholder strings ("name","data",...), so the scraped values
            # were never stored. Use the parameterized statement instead
            # (also avoids SQL injection via scraped text).
            cursor.execute(SQL_INSERT, (author, page, gender, age, vote))
        # Commit once per page instead of once per row.
        conn.commit()
    except Exception:
        conn.rollback()
        raise
    finally:
        # The original leaked the connection (close was commented out).
        conn.close()

def main():
    """Crawl the text section (page 1 only) and persist each page's items."""
    for page_no in range(1, 2):
        url = 'https://qiushibaike.com/text/page/{}'.format(page_no)
        page_html = download_page(url)
        get_page_content(page_html, page_no)


if __name__ == '__main__':
    main()

# ("You may also like" section removed — blog-scrape residue.)
# Reposted from blog.csdn.net/qq_34272093/article/details/82663147