Python数据存储的四种方法(txt,json,csv,数据库[关系型、非关系型])

版权声明:来自 https://blog.csdn.net/Leesoar521yt/article/details/81452130

代码使用requests与bs4,以爬取CSDN博客文章标题为例:

爬取前需安装requests与bs4

pip install requests
pip install beautifulsoup4
将数据存储在.txt

下例将数据保存在csdn.txt,直接上代码。

#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
"""Scrape CSDN front-page article entries and append them to csdn.txt."""

from bs4 import BeautifulSoup
import requests


url = 'https://www.csdn.net/'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36'}
html = requests.get(url, headers=headers, timeout=3).text
soup = BeautifulSoup(html, 'html.parser')
html_list = soup.find_all('div', {'class': 'list_con'})

# Open the output file once instead of reopening it for every article.
with open('./csdn.txt', 'a+', encoding='UTF-8') as f:
    for sub in html_list:
        title = sub.find('div', {'class': 'title'}).find('h2').get_text().strip()   # title
        link = sub.find('div', {'class': 'title'}).find('h2').a['href']   # link
        summary = sub.find('div', {'class': 'summary oneline'}).get_text().strip()   # summary
        author = sub.find('dl', {'class': 'list_userbar'}).find('dd', {'class': 'name'}).get_text().strip()   # author
        pub_time = sub.find('dl', {'class': 'list_userbar'}).find('dd', {'class': 'time'}).get_text().strip()   # publish time
        # The fields below may be absent for some entries: an intermediate
        # find() then returns None and the chained call raises AttributeError.
        try:   # tag
            tag = sub.find('dl', {'class': 'list_userbar'}).find('dd', {'class': 'tag'}).find('a').get_text().strip()
        except AttributeError:
            tag = 'None'
        try:   # read count
            # BUG FIX: attrs must be a dict {'class': 'num'}, not a set {'class', 'num'}
            read_num = sub.find('dd', {'class': 'read_num'}).find('span', {'class': 'num'}).get_text()
        except AttributeError:
            read_num = 'None'
        try:   # comment count
            # BUG FIX: dropped the trailing space from the class name 'common_num '
            common_num = sub.find('dd', {'class': 'common_num'}).find('span', {'class': 'num'}).get_text()
        except AttributeError:
            common_num = 'None'
        f.write('\n'.join([title, link, summary, author, pub_time, tag, read_num, common_num]))
        f.write('\n================================================================================\n')
print('Finished.')
将数据存储在.json

在上面代码的基础上,只需修改添加几处即可。要注意的是,在json存储中文时,需要设置 encoding='UTF-8' 与 ensure_ascii=False

#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
"""Scrape CSDN front-page article entries and save them as a JSON array in csdn.json."""

from bs4 import BeautifulSoup
import requests
from json import dumps


url = 'https://www.csdn.net/'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36'}
html = requests.get(url, headers=headers, timeout=3).text
soup = BeautifulSoup(html, 'html.parser')
html_list = soup.find_all('div', {'class': 'list_con'})
data_list = []
for sub in html_list:
    title = sub.find('div', {'class': 'title'}).find('h2').get_text().strip()   # title
    link = sub.find('div', {'class': 'title'}).find('h2').a['href']   # link
    summary = sub.find('div', {'class': 'summary oneline'}).get_text().strip()   # summary
    author = sub.find('dl', {'class': 'list_userbar'}).find('dd', {'class': 'name'}).get_text().strip()   # author
    pub_time = sub.find('dl', {'class': 'list_userbar'}).find('dd', {'class': 'time'}).get_text().strip()   # publish time
    # The fields below may be absent for some entries: an intermediate
    # find() then returns None and the chained call raises AttributeError.
    try:   # tag
        tag = sub.find('dl', {'class': 'list_userbar'}).find('dd', {'class': 'tag'}).find('a').get_text().strip()
    except AttributeError:
        tag = 'None'
    try:   # read count
        # BUG FIX: attrs must be a dict {'class': 'num'}, not a set {'class', 'num'}
        read_num = sub.find('dd', {'class': 'read_num'}).find('span', {'class': 'num'}).get_text()
    except AttributeError:
        read_num = 'None'
    try:   # comment count
        # BUG FIX: dropped the trailing space from the class name 'common_num '
        common_num = sub.find('dd', {'class': 'common_num'}).find('span', {'class': 'num'}).get_text()
    except AttributeError:
        common_num = 'None'
    data_list.append({
        '标题': title,
        '链接': link,
        '摘要': summary,
        '作者': author,
        '发表时间': pub_time,
        '标签': tag,
        '阅读数': read_num,
        '评论数': common_num,
    })

# BUG FIX: 'w' instead of 'a+' -- appending a second dump would leave two
# concatenated top-level values in the file, which is not valid JSON.
# ensure_ascii=False keeps the Chinese text readable in the output file.
with open('./csdn.json', 'w', encoding='UTF-8') as f:
    f.write(dumps(data_list, indent=2, ensure_ascii=False))
print('Finished.')
将数据存储在.csv

csv,相当于一个结构化表的纯文本形式,比Excel简洁。

方法1:使用数组写入csv

#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
"""Scrape CSDN front-page article entries and write them to csdn.csv as list rows."""

from bs4 import BeautifulSoup
import requests
from csv import writer


url = 'https://www.csdn.net/'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36'}
html = requests.get(url, headers=headers, timeout=3).text
soup = BeautifulSoup(html, 'html.parser')
html_list = soup.find_all('div', {'class': 'list_con'})
data_list = []
for sub in html_list:
    title = sub.find('div', {'class': 'title'}).find('h2').get_text().strip()   # title
    link = sub.find('div', {'class': 'title'}).find('h2').a['href']   # link
    summary = sub.find('div', {'class': 'summary oneline'}).get_text().strip()   # summary
    author = sub.find('dl', {'class': 'list_userbar'}).find('dd', {'class': 'name'}).get_text().strip()   # author
    pub_time = sub.find('dl', {'class': 'list_userbar'}).find('dd', {'class': 'time'}).get_text().strip()   # publish time
    # The fields below may be absent for some entries: an intermediate
    # find() then returns None and the chained call raises AttributeError.
    try:   # tag
        tag = sub.find('dl', {'class': 'list_userbar'}).find('dd', {'class': 'tag'}).find('a').get_text().strip()
    except AttributeError:
        tag = 'None'
    try:   # read count
        # BUG FIX: attrs must be a dict {'class': 'num'}, not a set {'class', 'num'}
        read_num = sub.find('dd', {'class': 'read_num'}).find('span', {'class': 'num'}).get_text()
    except AttributeError:
        read_num = 'None'
    try:   # comment count
        # BUG FIX: dropped the trailing space from the class name 'common_num '
        common_num = sub.find('dd', {'class': 'common_num'}).find('span', {'class': 'num'}).get_text()
    except AttributeError:
        common_num = 'None'

    data_list.append([title, link, summary, author, pub_time, tag, read_num, common_num])

# BUG FIX: the csv module requires newline='' on the file object, otherwise
# every row is followed by a blank line on Windows.
with open('./csdn.csv', 'w', encoding='GB18030', newline='') as csvfile:
    csv_writer = writer(csvfile)   # renamed so the imported `writer` is not shadowed
    csv_writer.writerow(['标题', '链接', '摘要', '作者', '发表时间', '标签', '阅读数', '评论数'])
    csv_writer.writerows(data_list)

print('Finished.')

方法2:使用字典写入csv

#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
"""Scrape CSDN front-page article entries and append them to csdn.csv via DictWriter."""

from bs4 import BeautifulSoup
import requests
from csv import DictWriter


url = 'https://www.csdn.net/'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36'}
html = requests.get(url, headers=headers, timeout=3).text
soup = BeautifulSoup(html, 'html.parser')
html_list = soup.find_all('div', {'class': 'list_con'})

fieldnames = ['标题', '链接', '摘要', '作者', '发表时间', '标签', '阅读数', '评论数']
# Open the file once instead of reopening it for every row; newline='' is
# required by the csv module to avoid blank lines on Windows.
with open('./csdn.csv', 'a+', encoding='GB18030', newline='') as csvfile:
    csv_writer = DictWriter(csvfile, fieldnames=fieldnames)
    # BUG FIX: the original never wrote a header row. Write it only when the
    # file is empty so repeated runs don't duplicate it.
    if csvfile.tell() == 0:
        csv_writer.writeheader()
    for sub in html_list:
        title = sub.find('div', {'class': 'title'}).find('h2').get_text().strip()   # title
        link = sub.find('div', {'class': 'title'}).find('h2').a['href']   # link
        summary = sub.find('div', {'class': 'summary oneline'}).get_text().strip()   # summary
        author = sub.find('dl', {'class': 'list_userbar'}).find('dd', {'class': 'name'}).get_text().strip()   # author
        pub_time = sub.find('dl', {'class': 'list_userbar'}).find('dd', {'class': 'time'}).get_text().strip()   # publish time
        # The fields below may be absent for some entries: an intermediate
        # find() then returns None and the chained call raises AttributeError.
        try:   # tag
            tag = sub.find('dl', {'class': 'list_userbar'}).find('dd', {'class': 'tag'}).find('a').get_text().strip()
        except AttributeError:
            tag = 'None'
        try:   # read count
            # BUG FIX: attrs must be a dict {'class': 'num'}, not a set {'class', 'num'}
            read_num = sub.find('dd', {'class': 'read_num'}).find('span', {'class': 'num'}).get_text()
        except AttributeError:
            read_num = 'None'
        try:   # comment count
            # BUG FIX: dropped the trailing space from the class name 'common_num '
            common_num = sub.find('dd', {'class': 'common_num'}).find('span', {'class': 'num'}).get_text()
        except AttributeError:
            common_num = 'None'
        csv_writer.writerow({
            '标题': title,
            '链接': link,
            '摘要': summary,
            '作者': author,
            '发表时间': pub_time,
            '标签': tag,
            '阅读数': read_num,
            '评论数': common_num,
        })

print('Finished.')
将数据存储在数据库
存储在关系型数据库
存储在非关系型数据库

猜你喜欢

转载自blog.csdn.net/Leesoar521yt/article/details/81452130