爬虫--爬取网页信息并存储到 MySQL 数据库

存储包含特殊字符的内容时，有两种处理方法：

第一种：使用 repr 函数

repr() 函数将对象转化为供解释器读取的形式。

import json
import requests
import pymysql


class mysql_conn(object):
    """Thin wrapper around one pymysql connection and its cursor.

    The connection is opened in ``__init__`` and released in ``__del__``.
    """

    def __init__(self, host='127.0.0.1', user='root', password='123456',
                 port=3306, database='python'):
        """Open the connection and create a cursor.

        Args:
            host: MySQL server address.
            user: Login user name.
            password: Login password.
            port: MySQL server port.
            database: Name of the database to use.

        The defaults reproduce the local demo settings, so ``mysql_conn()``
        keeps working unchanged.
        """
        self.db = pymysql.connect(host=host, user=user, password=password,
                                  port=port, database=database)
        self.cursor = self.db.cursor()

    def execute_modify_mysql(self, sql):
        """Execute a data-modifying statement and commit it.

        Args:
            sql: The complete SQL statement to run.

        Raises:
            Exception: Anything raised by ``cursor.execute`` or ``commit`` is
                re-raised after the pending transaction has been rolled back.
        """
        try:
            self.cursor.execute(sql)
            self.db.commit()
        except Exception:
            # Do not leave a half-applied transaction open on the connection.
            self.db.rollback()
            raise

    def __del__(self):
        """Close the cursor and the connection if they were created."""
        # __init__ can fail before these attributes are assigned (for example
        # when the connection attempt raises), so look them up defensively.
        for resource in (getattr(self, 'cursor', None), getattr(self, 'db', None)):
            if resource is not None:
                resource.close()

# Database handle shared by every insert in the loop below.
mc = mysql_conn()

headers = {
    'Cookie': '***********',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
}

# Xueqiu public-timeline JSON endpoint, fetched with requests.
url = 'https://xueqiu.com/v4/statuses/public_timeline_by_category.json?since_id=-1&max_id=-1&count=10&category=111'

# The timeout keeps the script from hanging forever on a stalled connection;
# raise_for_status() stops on an HTTP error instead of parsing an error page.
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()

res_dict = json.loads(response.text)

for list_item_dict in res_dict['list']:
    # The 'data' field of each entry is itself a JSON-encoded string.
    data_dic = json.loads(list_item_dict['data'])

    # Named item_id so the builtin id() is not shadowed.
    item_id = int(data_dic['id'])
    title = data_dic['title']
    description = data_dic['description']
    target = data_dic['target']

    # Method one: repr() supplies the quotes around each text value.
    # NOTE(review): repr() produces Python quoting, not SQL escaping. Unusual
    # values (e.g. None, which renders as the bare word None) can break or
    # alter the statement. Binding parameters through
    # cursor.execute(sql, args) is the safer approach.
    sql = "insert into xue_sql(id,title,description,target) values (%d,%s,%s,%s);"%(item_id,repr(title),repr(description),repr(target))
    mc.execute_modify_mysql(sql)

第二种方法：

以元组的形式把参数值传给 cursor.execute()，由数据库驱动完成转义和 SQL 拼接

import json
import requests
import pymysql


class mysql_conn(object):
    """Thin wrapper around one pymysql connection and its cursor.

    The connection is opened in ``__init__`` and released in ``__del__``.
    """

    def __init__(self, host='127.0.0.1', user='root', password='123456',
                 port=3306, database='python'):
        """Open the connection and create a cursor.

        Args:
            host: MySQL server address.
            user: Login user name.
            password: Login password.
            port: MySQL server port.
            database: Name of the database to use.

        The defaults reproduce the local demo settings, so ``mysql_conn()``
        keeps working unchanged.
        """
        self.db = pymysql.connect(host=host, user=user, password=password,
                                  port=port, database=database)
        self.cursor = self.db.cursor()

    def execute_modify_mysql(self, sql, data=None):
        """Execute a data-modifying statement and commit it.

        Args:
            sql: SQL statement, optionally containing ``%s`` placeholders.
            data: Optional values handed to ``cursor.execute`` as its second
                argument; ``None`` runs the statement without parameters.

        Raises:
            Exception: Anything raised by ``cursor.execute`` or ``commit`` is
                re-raised after the pending transaction has been rolled back.
        """
        try:
            self.cursor.execute(sql, data)
            self.db.commit()
        except Exception:
            # Do not leave a half-applied transaction open on the connection.
            self.db.rollback()
            raise

    def __del__(self):
        """Close the cursor and the connection if they were created."""
        # __init__ can fail before these attributes are assigned (for example
        # when the connection attempt raises), so look them up defensively.
        for resource in (getattr(self, 'cursor', None), getattr(self, 'db', None)):
            if resource is not None:
                resource.close()

# Database handle shared by every insert in the loop below.
mc = mysql_conn()

headers = {
    'Cookie': '*******',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
}

# Xueqiu public-timeline JSON endpoint, fetched with requests.
url = 'https://xueqiu.com/v4/statuses/public_timeline_by_category.json?since_id=-1&max_id=-1&count=10&category=111'

# The timeout keeps the script from hanging forever on a stalled connection;
# raise_for_status() stops on an HTTP error instead of parsing an error page.
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()

res_dict = json.loads(response.text)

# The statement is constant, so build it once outside the loop.
sql = "insert into new_xue(id,title,description,target) values (%s,%s,%s,%s);"

for list_item_dict in res_dict['list']:
    # The 'data' field of each entry is itself a JSON-encoded string.
    data_dic = json.loads(list_item_dict['data'])

    # Named item_id so the builtin id() is not shadowed.
    item_id = int(data_dic['id'])
    title = data_dic['title']
    description = data_dic['description']
    target = data_dic['target']

    # Method two: pass the raw values as a tuple and let the driver quote and
    # escape them. Wrapping them in repr() as well would store the Python
    # quotes and backslash escapes as part of the text.
    # NOTE(review): assumes title/description/target are strings or None —
    # confirm against the API response.
    data = (item_id, title, description, target)

    mc.execute_modify_mysql(sql,data = data)

爬虫--爬取网页信息存储到Mysql数据库

猜你喜欢