python网络爬虫之新手胡乱玩

python网络爬虫之新手胡乱玩

抓学校官网新闻

import requests
from bs4 import BeautifulSoup

#Scrape every entry from the university's comprehensive-news listing pages
#and record "title url date" triples (blank-line separated) in one text file.

def _write_news_page(f, list_url, base_url):
    """Fetch one listing page and append each item's title, URL and date to f.

    f        -- open text file handle to append records to
    list_url -- URL of the listing page to fetch
    base_url -- prefix joined onto each item's relative href
    """
    res = requests.get(list_url)
    # the site serves GB-family encoded pages; set encoding before .text decodes
    res.encoding = 'GB2312'
    soup = BeautifulSoup(res.text, 'html.parser')
    news = soup.select('.newsgridlist')
    for item in news[0].select('li'):
        # contents[2] is the bare text node holding the publish date
        # (renamed from `time`, which shadowed the stdlib module name)
        date = item.contents[2]
        for a in item.select('a'):
            # renamed from `str`, which shadowed the builtin
            full_url = base_url + a['href']
            f.write(a['title'] + ' ')
            f.write(full_url + ' ')
            # both the first page and later pages now write the same
            # "date + space" format (the originals were inconsistent)
            f.write(date + ' ')
            f.write("\n\n")

with open('D:/抓的数据/**综合新闻.txt', 'w', encoding='utf-8') as f:
    f.write('**大学综合新闻页面所有新闻如下:\n')
    # first listing page
    _write_news_page(f, '不告诉你', "http://news.xtu.edu.cn")
    # paginated listing pages 2..87
    for i in range(2, 88):
        _write_news_page(f, "不告诉你/zonghexw/index_%d.html" % (i), "不告诉你")

将每一个文件的内容存到本地

#Detailed news from the front page: for each article linked on the listing,
#fetch the article page and save its full text to a per-article file named
#after the article title.
#NOTE(review): this fragment is indented one level — it appears to continue
#the earlier `with open(...)` block; the enclosing scope is not visible here.
    res = requests.get('不告诉你')
    # pages are served in a GB-family encoding; set before .text decodes
    res.encoding='GB2312'
    soup =BeautifulSoup(res.text,'html.parser')
    news=soup.select('.newsgridlist')
    for xiaoxi in news[0].select('li'):
        # contents[2] is the bare text node carrying the publish date
        # (assigned but never used below; also shadows the stdlib `time` module)
        time=xiaoxi.contents[2]
        for a in xiaoxi.select('a'):
            path=a['href']
            # NOTE(review): `str` shadows the builtin — harmless here, worth renaming
            str="不告诉你"
            # build the article's absolute URL from base + relative href
            path=str+path
            res2 =requests.get(path)
            res2.encoding='GB2312'
            soup2=BeautifulSoup(res2.text,'html.parser')
            # article title element; assumes at least one '.title' match,
            # otherwise ntitle[0] raises IndexError — TODO confirm
            ntitle=soup2.select('.title')
            ntitle1=ntitle[0].text
            # NOTE(review): titles may contain characters illegal in Windows
            # filenames (e.g. ? : " /), which would make open() fail
            with open('D:/抓的数据/详细新闻/'+ntitle1+'.txt','w',encoding='utf-8') as f1:
                # indent the title with tabs, then one paragraph per '.content p'
                f1.write('\t\t\t\t\t\t\t')
                f1.write(ntitle1)
                for content1 in soup2.select('.content p'):
                    f1.write(content1.text+'\n\n')

还可以爬一些用户信息之类的,保存为csv文件,用excel打开处理。

感觉就是python太强大了

猜你喜欢

转载自blog.csdn.net/qq_29914229/article/details/80032703