版权声明:来自 https://blog.csdn.net/Leesoar521yt/article/details/81452130
代码使用requests与bs4,以爬取CSDN博客文章标题为例:
爬取前需安装 requests 与 BeautifulSoup(官方包名为 beautifulsoup4,安装 bs4 亦会自动安装它):
pip install requests
pip install beautifulsoup4
将数据存储在.txt
下例将数据保存在csdn.txt,直接上代码。
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
"""Scrape article metadata from the CSDN homepage and append it to csdn.txt."""
from bs4 import BeautifulSoup
import requests

url = 'https://www.csdn.net/'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36'}
html = requests.get(url, headers=headers, timeout=3).text
soup = BeautifulSoup(html, 'html.parser')
html_list = soup.find_all('div', {'class': 'list_con'})

# Open the output file once, instead of re-opening it for every article.
with open('./csdn.txt', 'a+', encoding='UTF-8') as f:
    for sub in html_list:
        title_div = sub.find('div', {'class': 'title'})
        title = title_div.find('h2').get_text().strip()  # article title
        link = title_div.find('h2').a['href']  # article URL
        summary = sub.find('div', {'class': 'summary oneline'}).get_text().strip()  # summary
        userbar = sub.find('dl', {'class': 'list_userbar'})
        author = userbar.find('dd', {'class': 'name'}).get_text().strip()  # author
        # Renamed from `time`: that name shadows the stdlib module.
        post_time = userbar.find('dd', {'class': 'time'}).get_text().strip()  # publish time
        try:  # the tag may be absent; find() then returns None -> AttributeError
            tag = userbar.find('dd', {'class': 'tag'}).find('a').get_text().strip()
        except AttributeError:
            tag = 'None'
        try:  # read count
            # BUG FIX: the original passed the set {'class', 'num'} as the attrs
            # filter; BeautifulSoup expects a dict {'class': 'num'}.
            read_num = sub.find('dd', {'class': 'read_num'}).find('span', {'class': 'num'}).get_text()
        except AttributeError:
            read_num = 'None'
        try:  # comment count
            # NOTE(review): 'common_num ' has a trailing space — presumably it
            # matches the site's markup at the time; verify against live HTML.
            common_num = sub.find('dd', {'class': 'common_num '}).find('span', {'class': 'num'}).get_text()
        except AttributeError:
            common_num = 'None'
        f.write('\n'.join([title, link, summary, author, post_time, tag, read_num, common_num]))
        f.write('\n================================================================================\n')
print('Finished.')
将数据存储在.json
在上面代码的基础上,只需修改添加几处即可。要注意的是,在 json 存储中文时,需要设置 encoding='UTF-8' 与 ensure_ascii=False。
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
"""Scrape article metadata from the CSDN homepage and store it as JSON."""
from bs4 import BeautifulSoup
import requests
from json import dumps

url = 'https://www.csdn.net/'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36'}
html = requests.get(url, headers=headers, timeout=3).text
soup = BeautifulSoup(html, 'html.parser')
html_list = soup.find_all('div', {'class': 'list_con'})

data_list = []
for sub in html_list:
    title_div = sub.find('div', {'class': 'title'})
    title = title_div.find('h2').get_text().strip()  # article title
    link = title_div.find('h2').a['href']  # article URL
    summary = sub.find('div', {'class': 'summary oneline'}).get_text().strip()  # summary
    userbar = sub.find('dl', {'class': 'list_userbar'})
    author = userbar.find('dd', {'class': 'name'}).get_text().strip()  # author
    # Renamed from `time`: that name shadows the stdlib module.
    post_time = userbar.find('dd', {'class': 'time'}).get_text().strip()  # publish time
    try:  # the tag may be absent; find() then returns None -> AttributeError
        tag = userbar.find('dd', {'class': 'tag'}).find('a').get_text().strip()
    except AttributeError:
        tag = 'None'
    try:  # read count
        # BUG FIX: the original passed the set {'class', 'num'} as the attrs
        # filter; BeautifulSoup expects a dict {'class': 'num'}.
        read_num = sub.find('dd', {'class': 'read_num'}).find('span', {'class': 'num'}).get_text()
    except AttributeError:
        read_num = 'None'
    try:  # comment count
        common_num = sub.find('dd', {'class': 'common_num '}).find('span', {'class': 'num'}).get_text()
    except AttributeError:
        common_num = 'None'
    data_list.append({
        '标题': title,
        '链接': link,
        '摘要': summary,
        '作者': author,
        '发表时间': post_time,
        '标签': tag,
        '阅读数': read_num,
        '评论数': common_num
    })

# BUG FIX: mode 'w' instead of 'a+'. Appending a second full dump would leave
# two concatenated JSON arrays in the file, which no parser accepts.
# ensure_ascii=False keeps the Chinese text readable in the output file.
with open('./csdn.json', 'w', encoding='UTF-8') as f:
    f.write(dumps(data_list, indent=2, ensure_ascii=False))
print('Finished.')
将数据存储在.csv
csv,相当于一个结构化表的纯文本形式,比Excel简洁。
方法1:使用数组写入csv
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
"""Scrape article metadata from the CSDN homepage and store it as CSV rows."""
from bs4 import BeautifulSoup
import requests
from csv import writer

url = 'https://www.csdn.net/'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36'}
html = requests.get(url, headers=headers, timeout=3).text
soup = BeautifulSoup(html, 'html.parser')
html_list = soup.find_all('div', {'class': 'list_con'})

data_list = []
for sub in html_list:
    title_div = sub.find('div', {'class': 'title'})
    title = title_div.find('h2').get_text().strip()  # article title
    link = title_div.find('h2').a['href']  # article URL
    summary = sub.find('div', {'class': 'summary oneline'}).get_text().strip()  # summary
    userbar = sub.find('dl', {'class': 'list_userbar'})
    author = userbar.find('dd', {'class': 'name'}).get_text().strip()  # author
    # Renamed from `time`: that name shadows the stdlib module.
    post_time = userbar.find('dd', {'class': 'time'}).get_text().strip()  # publish time
    try:  # the tag may be absent; find() then returns None -> AttributeError
        tag = userbar.find('dd', {'class': 'tag'}).find('a').get_text().strip()
    except AttributeError:
        tag = 'None'
    try:  # read count
        # BUG FIX: the original passed the set {'class', 'num'} as the attrs
        # filter; BeautifulSoup expects a dict {'class': 'num'}.
        read_num = sub.find('dd', {'class': 'read_num'}).find('span', {'class': 'num'}).get_text()
    except AttributeError:
        read_num = 'None'
    try:  # comment count
        common_num = sub.find('dd', {'class': 'common_num '}).find('span', {'class': 'num'}).get_text()
    except AttributeError:
        common_num = 'None'
    data_list.append([title, link, summary, author, post_time, tag, read_num, common_num])

# BUG FIX: newline='' is required by the csv module; without it every row is
# followed by a blank line on Windows. GB18030 keeps Excel-compatibility for
# Chinese text. Also renamed the writer object so it no longer clobbers the
# imported csv.writer factory.
with open('./csdn.csv', 'w', encoding='GB18030', newline='') as csvfile:
    csv_writer = writer(csvfile)
    csv_writer.writerow(['标题', '链接', '摘要', '作者', '发表时间', '标签', '阅读数', '评论数'])
    csv_writer.writerows(data_list)
print('Finished.')
方法2:使用字典写入csv
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
"""Scrape article metadata from the CSDN homepage and store it as CSV via DictWriter."""
from bs4 import BeautifulSoup
import requests
from csv import DictWriter

url = 'https://www.csdn.net/'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36'}
html = requests.get(url, headers=headers, timeout=3).text
soup = BeautifulSoup(html, 'html.parser')
html_list = soup.find_all('div', {'class': 'list_con'})

fieldnames = ['标题', '链接', '摘要', '作者', '发表时间', '标签', '阅读数', '评论数']
# BUG FIXES: open the file once instead of once per row; newline='' is required
# by the csv module (otherwise blank lines appear on Windows); and write the
# header row — the original declared fieldnames but never emitted them.
with open('./csdn.csv', 'a+', encoding='GB18030', newline='') as csvfile:
    writer = DictWriter(csvfile, fieldnames=fieldnames)
    if csvfile.tell() == 0:  # header only when the file is brand new
        writer.writeheader()
    for sub in html_list:
        title_div = sub.find('div', {'class': 'title'})
        title = title_div.find('h2').get_text().strip()  # article title
        link = title_div.find('h2').a['href']  # article URL
        summary = sub.find('div', {'class': 'summary oneline'}).get_text().strip()  # summary
        userbar = sub.find('dl', {'class': 'list_userbar'})
        author = userbar.find('dd', {'class': 'name'}).get_text().strip()  # author
        # Renamed from `time`: that name shadows the stdlib module.
        post_time = userbar.find('dd', {'class': 'time'}).get_text().strip()  # publish time
        try:  # the tag may be absent; find() then returns None -> AttributeError
            tag = userbar.find('dd', {'class': 'tag'}).find('a').get_text().strip()
        except AttributeError:
            tag = 'None'
        try:  # read count
            # BUG FIX: the original passed the set {'class', 'num'} as the attrs
            # filter; BeautifulSoup expects a dict {'class': 'num'}.
            read_num = sub.find('dd', {'class': 'read_num'}).find('span', {'class': 'num'}).get_text()
        except AttributeError:
            read_num = 'None'
        try:  # comment count
            common_num = sub.find('dd', {'class': 'common_num '}).find('span', {'class': 'num'}).get_text()
        except AttributeError:
            common_num = 'None'
        writer.writerow({
            '标题': title,
            '链接': link,
            '摘要': summary,
            '作者': author,
            '发表时间': post_time,
            '标签': tag,
            '阅读数': read_num,
            '评论数': common_num
        })
print('Finished.')