Scraper for Phoenix News (ifeng.com)

import requests
import re,json,pymysql,time
#获取页码ID
article_id_list=[
http://shankapi.ifeng.com/shanklist//getColumnInfo//default/6429514672495399578/1532918315000/20/5-35059-/getColumnInfoCallback?callback=getColumnInfoCallback&=154536371661415",
"http://shankapi.ifeng.com/shanklist/
/getColumnInfo//default/6432933109712293888/1533730982000/20/5-35059-/getColumnInfoCallback?callback=getColumnInfoCallback&=154536371298113”,
http://shankapi.ifeng.com/shanklist//getColumnInfo//default/6442555267656712337/1536025048000/20/5-35059-/getColumnInfoCallback?callback=getColumnInfoCallback&=154536360691311",
"http://shankapi.ifeng.com/shanklist/
/getColumnInfo//default/6447733192223883264/1537259554000/20/5-35059-/getColumnInfoCallback?callback=getColumnInfoCallback&=15453636020029”,
http://shankapi.ifeng.com/shanklist//getColumnInfo//default/6463590392771121594/1541039520000/20/5-35059-/getColumnInfoCallback?callback=getColumnInfoCallback&=15453635950277",
"http://shankapi.ifeng.com/shanklist/
/getColumnInfo//default/6469909334296887381/1542546120000/20/5-35059-/getColumnInfoCallback?callback=getColumnInfoCallback&=15453635905465”,
http://shankapi.ifeng.com/shanklist//getColumnInfo//default/6471911734469727016/1543024466000/20/5-35059-/getColumnInfoCallback?callback=getColumnInfoCallback&=15453635857703",
"http://shankapi.ifeng.com/shanklist/
/getColumnInfo//default/6471911734469727016/1542546120000/20/5-35059-/getColumnInfoCallback?callback=getColumnInfoCallback&=15453635905465”
]

headers = {
“User-Agent”: ‘Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36’,}
for url in article_id_list:
try:
response=requests.get(url,headers=headers)
text=response.text
re_text=re.findall(’’‘getColumnInfoCallback((.))’’’,text)
m=json.loads(re_text[0])
ID=m[“data”][“newsstream”]
for one in ID:
#标题
try:
title=str(one[“title”])
except:
title=“空”
#详情页URL
try:
info_url=str(one[“url”])
except:
info_url=“空”
#来源
try:
source=str(one[“source”])
except:
source=“空”
#创建时间
try:
creat_time=str(one[“newsTime”])
except:
creat_time=“空”
#概述
try:
Central_idea=str(one[“summary”])
except:
Central_idea=“空”
author_kind=“空”
response_info = requests.get(info_url, headers=headers)
text_info=response_info.text
print(text)
m = re.findall("var allData =(.
}}?);", text_info)
print(m)
tt = json.loads((m[0]))
article = tt[“docData”][“contentData”][“contentList”][0][“data”]
# print(repr(tt[“docData”][“contentData”][“contentList”][0][“data”]))
# 匹配文章内容
contentan_1 = re.findall(’’’>(.*?)<’’’, article)
try:
real_content = ‘’.join(contentan_1).replace(’\r’, ‘’).replace(’\n’, ‘’).replace(’\t’, ‘’).replace(’ ‘, ‘’)
except:
real_content=“空”
#文章内容
try:
real_content = str(real_content)
except:
real_content=“空”
# 文章长度
try:
article_count = len(real_content)
except:
article_count=“空”
#文章属性标签
try:
label_local = []
for label in tt[“docData”][“breadCrumbdata”]:
label_local.append(label[“title”])
except:
label_local=“空”
try:
img_counts = len(tt[“docData”][“imagesInContent”])
except:
img_counts=“空”
try:
author=tt[“docData”][“editorName”]
except:
author=“空”
#存入数据库
try:
db = pymysql.connect(host=‘127.0.0.1’, user=‘root’, password=‘123456’, database=‘key_word’, charset=‘utf8’)
# 创建游标对象
print(1)
cursor = db.cursor()
# 数据添加到数据库的语句
# sql = “insert into title_1 values(null,{},{},{},{})”.format(article_title,source,comment,datetime)
sql = ‘’‘insert into ifeng values(null,"%s","%s","%s","%s","%s","%s","%s","%s","%s","%s")’’’ % (title, source, creat_time, info_url, Central_idea, real_content, article_count, author,img_counts,label_local)
# 执行添加过程
cursor.execute(sql)
# 提交
db.commit()
# 关闭游标
cursor.close()
db.close()
print("==" * 60)
except:
print(‘出错了1’)
except:
print(“error”)

You may also like

Adapted from blog.csdn.net/chengjintao1121/article/details/85330209