爬取 某学校教务处发布的含有申报的通知的正文、标题、日期、链接

一、首先爬取所有的申报通知的网址链接

import requests
import re
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as ec
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

# Step 1: crawl all 33 list pages of the school's academic-affairs site and
# collect the title + link of every announcement whose title contains BOTH
# "申报" (application) and "通知" (notice). Results go to 标题链接.csv.
DRIVER_PATH = 'E:/python/chromedriver.exe'
LIST_URL = 'http://jgxy.jhc.cn/jxkysy/list.htm'
KEYWORDS = ('申报', '通知')
NUM_PAGES = 33

# Selenium 4 API: the positional executable-path argument was removed;
# the old find_element_by_* helpers were removed as well.
browser = webdriver.Chrome(service=Service(DRIVER_PATH))
wait = WebDriverWait(browser, 10)
browser.get(LIST_URL)

titles = []
links = []  # renamed from `url` so it no longer shadows the page-URL string
try:
    for page in range(1, NUM_PAGES + 1):
        # Jump to page `page` via the paging box. Clear the box first:
        # without clear(), send_keys appends digits ("1", "12", "123", ...)
        # and every jump after page 9 lands on the wrong page.
        page_box = wait.until(
            ec.presence_of_element_located((By.CLASS_NAME, 'pageNum')))
        page_box.clear()
        page_box.send_keys(page)
        browser.find_element(By.CLASS_NAME, 'pagingJump').click()
        # Wait for the article list to load before scraping it (the original
        # imported WebDriverWait/ec but never used them, racing the page load).
        wait.until(ec.presence_of_all_elements_located(
            (By.CLASS_NAME, 'Article_Title')))
        for item in browser.find_elements(By.CLASS_NAME, 'Article_Title'):
            href = item.find_element(By.TAG_NAME, 'a').get_attribute('href')
            text = item.text
            if all(kw in text for kw in KEYWORDS):
                titles.append(text)
                links.append(href)
finally:
    browser.quit()  # always release the browser, even if a page fails

the_url = pd.DataFrame({'标题': titles, '链接': links})
# utf-8-sig keeps the Chinese headers readable when opened in Excel;
# index=False avoids an unnamed index column in the output.
the_url.to_csv('标题链接.csv', index=False, encoding='utf-8-sig')

二、读取刚刚生成的网址链接 CSV 文件,进行爬取

import pandas as pd
import requests
from bs4 import BeautifulSoup
import time

# Step 2: read the link list produced by step 1 and download the title,
# publish date, and body text of each notice. Pages that fail (network error,
# HTTP error, or missing markup — typically login-protected pages) are
# reported and skipped. Results go to 作业.csv.
url = pd.read_csv('标题链接.csv')
title_list = []
data = []       # publish dates ('日期' column)
the_text = []   # article bodies ('正文' column)
the_url = []    # links that were fetched successfully
for link in url['链接']:
    try:
        time.sleep(0.5)  # be polite: throttle requests to the server
        # timeout so one dead server cannot hang the whole crawl;
        # raise_for_status turns 403/404 pages into a caught exception
        # instead of silently parsing an error page.
        resp = requests.get(link, timeout=10)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.content, 'lxml')
        the_title = soup.find(name='h1', class_='actitle').text
        publish_date = soup.find(name='span', class_='Article_PublishDate').text
        body = soup.find(name='div', class_='Article_Content').text
        # Append all four together only after every lookup succeeded,
        # so the columns always stay the same length.
        title_list.append(the_title)
        data.append(publish_date)
        the_text.append(body)
        the_url.append(link)
    except (requests.RequestException, AttributeError):
        # RequestException: network/HTTP failure.
        # AttributeError: a .find() returned None because the page lacks the
        # expected elements (typically a page that requires a login).
        # Narrowed from a bare `except:`, which also swallowed
        # KeyboardInterrupt and hid real bugs.
        print('需要权限的网址:' + link)

work = pd.DataFrame({
    '链接': the_url,
    '标题': title_list,
    '日期': data,
    '正文': the_text,
})
# utf-8-sig keeps the Chinese text readable when opened in Excel.
work.to_csv('作业.csv', index=False, encoding='utf-8-sig')
print(work)

猜你喜欢

转载自blog.csdn.net/sgsdsdd/article/details/112793354