Python web crawling: a newbie messing around
Scraping news from the school's official website
import requests
from bs4 import BeautifulSoup

# All news items from ** University's general-news section
with open('D:/抓的数据/**综合新闻.txt', 'w', encoding='utf-8') as f:
    f.write('**大学综合新闻页面所有新闻如下:\n')
    # Page 1 of the listing
    res = requests.get('不告诉你')
    res.encoding = 'GB2312'
    soup = BeautifulSoup(res.text, 'html.parser')
    news = soup.select('.newsgridlist')
    for xiaoxi in news[0].select('li'):
        time = xiaoxi.contents[2]  # the date text node inside the list item
        for a in xiaoxi.select('a'):
            path = a['href']
            base = "http://news.xtu.edu.cn"  # renamed from `str`, which shadowed the built-in
            path = base + path
            title = a['title']
            f.write(title + ' ')
            f.write(path + ' ')
            f.write(time + ' ')
            f.write("\n\n")
    # Pages 2-87 follow the index_<n>.html pattern
    for i in range(2, 88):
        res = requests.get("不告诉你/zonghexw/index_%d.html" % i)
        res.encoding = 'GB2312'
        soup = BeautifulSoup(res.text, 'html.parser')
        news = soup.select('.newsgridlist')
        for xiaoxi in news[0].select('li'):
            time = xiaoxi.contents[2]
            for a in xiaoxi.select('a'):
                path = a['href']
                base = "不告诉你"
                path = base + path
                title = a['title']
                f.write(title + ' ')
                f.write(path + ' ')
                f.write(time)
                f.write("\n\n")
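The page-1 block and the pages-2-to-87 loop repeat the same parsing logic. A minimal refactor sketch that pulls it into one helper; the scrape_page name is my own, and the page-1 URL stays the same placeholder used above:

import requests
from bs4 import BeautifulSoup

def scrape_page(url, f, base="不告诉你"):
    # Fetch one listing page and append title / link / date for each item.
    res = requests.get(url)
    res.encoding = 'GB2312'
    soup = BeautifulSoup(res.text, 'html.parser')
    for xiaoxi in soup.select('.newsgridlist')[0].select('li'):
        time = xiaoxi.contents[2]
        for a in xiaoxi.select('a'):
            f.write(a['title'] + ' ' + base + a['href'] + ' ' + time + '\n\n')

with open('D:/抓的数据/**综合新闻.txt', 'w', encoding='utf-8') as f:
    scrape_page('不告诉你', f)  # page 1
    for i in range(2, 88):      # pages 2-87
        scrape_page('不告诉你/zonghexw/index_%d.html' % i, f)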
Saving each article's content to a local file
# Full text of each news item linked from the front page
res = requests.get('不告诉你')
res.encoding = 'GB2312'
soup = BeautifulSoup(res.text, 'html.parser')
news = soup.select('.newsgridlist')
for xiaoxi in news[0].select('li'):
    time = xiaoxi.contents[2]
    for a in xiaoxi.select('a'):
        path = a['href']
        base = "不告诉你"
        path = base + path
        # Fetch the article page itself
        res2 = requests.get(path)
        res2.encoding = 'GB2312'
        soup2 = BeautifulSoup(res2.text, 'html.parser')
        ntitle = soup2.select('.title')
        ntitle1 = ntitle[0].text
        # One .txt file per article, named after its title
        with open('D:/抓的数据/详细新闻/' + ntitle1 + '.txt', 'w', encoding='utf-8') as f1:
            f1.write('\t\t\t\t\t\t\t')  # crude centering of the title
            f1.write(ntitle1)
            for content1 in soup2.select('.content p'):
                f1.write(content1.text + '\n\n')
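One pitfall with naming files after article titles: Windows forbids the characters \ / : * ? " < > | in filenames, so open() will fail on any title containing them (a colon or question mark in a headline is common). A small helper sketch; the safe_filename name is my own:

import re

def safe_filename(title):
    # Replace characters Windows disallows in filenames with underscores.
    return re.sub(r'[\\/:*?"<>|]', '_', title).strip()

# Usage: open('D:/抓的数据/详细新闻/' + safe_filename(ntitle1) + '.txt', 'w', encoding='utf-8')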
You can also scrape things like user information, save it as a CSV file, and open it in Excel to work with it.
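For the CSV route, the standard-library csv module is enough. A minimal sketch; the rows below are made-up placeholders, not real scraped data. Writing with the utf-8-sig encoding adds a BOM so Excel recognizes the file as UTF-8 and displays Chinese text correctly:

import csv

# Placeholder rows; in practice they would come from a scraping loop like the ones above.
rows = [
    ('某条新闻标题', 'http://example.com/1', '2018-01-01'),
]

with open('D:/抓的数据/news.csv', 'w', encoding='utf-8-sig', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['title', 'url', 'date'])  # header row
    writer.writerows(rows)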
Python really does feel incredibly powerful.