# Spider that crawls Baidu Tieba (Baidu Post Bar) for a given keyword.
import requests
from lxml import etree
import json
class TiebaSpider(object):
    """Spider for Baidu Tieba (Post Bar).

    Crawls every list page for one bar keyword via the legacy mobile site,
    extracts post titles and links, and appends each record as one JSON
    object per line to '<name>.jsonlines'.
    """

    def __init__(self, name):
        # Bar keyword to crawl (e.g. a celebrity's name).
        self.name = name
        # First list page for the keyword (legacy mobile endpoint).
        self.url = 'http://tieba.baidu.com/mo/q----,sz@320_240-1-3---2/m?kw=' + name + '&lp=5011&lm=&pinf=1&pn=0'
        # Mobile User-Agent so the site serves the simple mobile markup.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1'
        }
        # Relative hrefs in the page are resolved against this prefix.
        self.prefix_url = 'http://tieba.baidu.com/mo/q----,sz@320_240-1-3---2/'

    def get_page_from_url(self, url):
        """Send a GET request and return the raw response body as bytes."""
        response = requests.get(url, headers=self.headers)
        return response.content

    def get_data_from_page(self, page):
        """Extract post records and the next-page link from one page.

        Returns a tuple ``(data_list, next_url)`` where ``data_list`` is a
        list of ``{'title': ..., 'url': ...}`` dicts and ``next_url`` is the
        absolute URL of the following page, or ``None`` on the last page.
        """
        element = etree.HTML(page)
        a_s = element.xpath('//div[contains(@class,"i")]/a')
        data_list = []
        for a in a_s:
            title = a.xpath('./text()')
            href = a.xpath('./@href')
            # Skip malformed anchors missing a title or an href; the
            # original unconditional [0] indexing raised IndexError there.
            if not title or not href:
                continue
            data_list.append({
                'title': title[0],
                'url': self.prefix_url + href[0],
            })
        # "下一页" is the "next page" link text; the anchor is absent on the
        # last page, in which case None ends the crawl loop.
        next_url = element.xpath('//a[text()="下一页"]/@href')
        if next_url:
            next_url = self.prefix_url + next_url[0]
        else:
            next_url = None
        return data_list, next_url

    def save_data(self, data_list):
        """Append each record as one JSON line to '<name>.jsonlines'."""
        file_name = "{}.jsonlines".format(self.name)
        with open(file_name, 'a', encoding='utf8') as f:
            for data in data_list:
                json.dump(data, f, ensure_ascii=False)
                f.write('\n')

    def run(self):
        """Crawl all pages: fetch, extract, save, then follow the next link."""
        url = self.url
        # Loop over every list page until no next-page link is found.
        while url:
            page = self.get_page_from_url(url)
            data_list, url = self.get_data_from_page(page)
            self.save_data(data_list)
if __name__ == '__main__':
    # Crawl one bar; '刘亦菲' can be replaced by any other star's name.
    spider = TiebaSpider('刘亦菲')
    spider.run()
# Please leave a message if you have any questions.