钛媒体的抓取

import requests
import re,json,pymysql
# Scraper for TMTPost (tmtpost.com) category 3189960.
#
# Walks the paginated JSON list API, downloads each article page, derives the
# plain-text body, its length and the number of images, and stores one row per
# article in the MySQL table ``tmtpost_new``.

headers = {
    "User-Agent": 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
}

# Value stored whenever a field is missing from the API response.
PLACEHOLDER = "空"

# Seconds to wait for any single HTTP request before giving up.
REQUEST_TIMEOUT = 10

# List endpoint; ``{}`` receives the paging offset (15 posts per page).
LIST_URL = 'http://www.tmtpost.com/ajax/common/get?url=%2Fv1%2Fposts%2Flist%2Fcategory%2F3189960&data=limit%3D15%26offset%3D{}%26fields%3Dthumb_image%3Bsummary%3Bnumber_of_comments%3Btags%3Bauthors%26thumb_image_size%3D["200_150"]'

ARTICLE_URL = "http://www.tmtpost.com/{}.html"

# NOTE(review): the article-body pattern in the published listing was mangled
# (the HTML tags inside the regex were stripped by the page renderer), so the
# <article> container below is an assumption -- confirm against a live
# article page and adjust if the body lives in a different element.
ARTICLE_RE = re.compile(r'<article[^>]*>([\s\S]*?)</article>')
IMG_SRC_RE = re.compile(r'src="(.*?)"')
TEXT_RE = re.compile(r'>(.*?)<')

# Whitespace removed from the raw article HTML before any further matching.
_WHITESPACE_TABLE = {ord(ch): None for ch in "\r\n\t "}

# Fragments removed from the extracted text. The listing showed the literal
# characters; since plain spaces are already stripped beforehand, the original
# most likely targeted the HTML entities, so both spellings are removed.
_TEXT_NOISE = (
    "”", "&rdquo;",
    "“", "&ldquo;",
    " ", "&nbsp;",
    "…", "&hellip;",
)

# Parameterized so that quotes inside titles/bodies cannot break the statement.
INSERT_SQL = (
    "insert into tmtpost_new "
    "values(null,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
)


def _get_str(post, key):
    """Return ``str(post[key])``, or the placeholder if the key is unusable."""
    try:
        return str(post[key])
    except (KeyError, TypeError):
        return PLACEHOLDER


def parse_post(one):
    """Extract the metadata fields of one entry of the list API response.

    Returns a dict with the keys ``title``, ``creat_time``, ``central_idea``,
    ``label``, ``source`` and ``info_url``. Any field that cannot be read is
    set to the placeholder string; ``label`` is otherwise a list of tag names.
    """
    try:
        label = [tag["tag"] for tag in one["tags"]]
    except (KeyError, TypeError):
        label = PLACEHOLDER

    try:
        source = str(one["authors"][0]["username"])
    except (KeyError, IndexError, TypeError):
        source = PLACEHOLDER

    try:
        info_url = ARTICLE_URL.format(one["post_guid"])
    except (KeyError, TypeError):
        info_url = PLACEHOLDER

    return {
        "title": _get_str(one, "title"),
        "creat_time": _get_str(one, "time_created"),
        "central_idea": _get_str(one, "summary"),
        "label": label,
        "source": source,
        "info_url": info_url,
    }


def extract_article(page_html):
    """Derive ``(text, text_length, image_count)`` from an article page.

    The article container is located with ``ARTICLE_RE``, all whitespace is
    dropped, ``src="..."`` attributes are counted as images, and the text
    found between tags is concatenated and cleaned of quote/ellipsis/nbsp
    noise. A page without a matching container yields ``("", 0, 0)``.
    """
    body = "".join(ARTICLE_RE.findall(page_html)).translate(_WHITESPACE_TABLE)
    img_count = len(IMG_SRC_RE.findall(body))

    text = "".join(TEXT_RE.findall(body))
    for noise in _TEXT_NOISE:
        text = text.replace(noise, "")
    return text, len(text), img_count


def fetch_article(info_url):
    """Download one article page and return ``extract_article`` of its HTML.

    Raises ``requests.RequestException`` on any transport failure, including
    an invalid URL such as the placeholder.
    """
    response_info = requests.get(info_url, headers=headers, timeout=REQUEST_TIMEOUT)
    return extract_article(response_info.text)


def save_post(row):
    """Insert one 10-value row into ``tmtpost_new`` and commit it.

    The connection is always closed, even when the insert fails.
    Raises ``pymysql.MySQLError`` on connection or statement errors.
    """
    db = pymysql.connect(host='127.0.0.1', user='root', password='123456', database='key_word', charset='utf8')
    try:
        print(1)
        with db.cursor() as cursor:
            cursor.execute(INSERT_SQL, row)
        db.commit()
    finally:
        db.close()


def main():
    """Scrape every list page (offsets 0..150 in steps of 15) and store the posts."""
    ss = 0  # number of list pages completed so far
    for num in range(0, 165, 15):
        print(ss)
        response = requests.get(LIST_URL.format(num), headers=headers, timeout=REQUEST_TIMEOUT)
        text1 = json.loads(response.text)["data"]
        nn = 0  # rows successfully inserted for this page
        for one in text1:
            post = parse_post(one)
            print(post["title"])
            print(post["creat_time"])
            print(post["central_idea"])
            print(post["label"])
            author_kind = PLACEHOLDER

            # Reset per article so a failed download can never reuse the
            # previous article's body and counters.
            real_contents, article_count, img_count = PLACEHOLDER, 0, 0
            try:
                real_contents, article_count, img_count = fetch_article(post["info_url"])
                print(real_contents)
                print(article_count)
            except requests.RequestException:
                print("error")

            # Store in the database.
            row = (
                post["title"],
                post["source"],
                post["creat_time"],
                post["info_url"],
                post["central_idea"],
                real_contents,
                article_count,
                author_kind,
                img_count,
                str(post["label"]),  # stored as the list's text form, as before
            )
            try:
                save_post(row)
                nn += 1
                print(nn)
            except pymysql.MySQLError:
                print('出错了1')
        # NOTE(review): indentation was lost in the listing; the page counter
        # is assumed to advance once per list page -- confirm.
        ss += 1
        print(ss, "–" * 10)


if __name__ == "__main__":
    main()

猜你喜欢

转载自blog.csdn.net/chengjintao1121/article/details/85330258