# Scrape web articles with Python (fetch index -> fetch pages -> parse -> save to CSV)

# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import requests
import urllib.request
from requests.exceptions import RequestException
import csv
import pandas as pd


import random


def getUrl():
    """Collect article URLs from the site's index page.

    Fetches the listing page, parses every ``#list li`` entry, and builds an
    absolute URL from each entry's first ``<a href>``.

    Returns:
        list[str]: absolute article URLs; empty list when the request fails
        or no links are found.
    """
    data = []
    try:
        # timeout so a hung connection cannot block the whole crawl
        res = requests.get('https://xxx.com/', timeout=10)
        res.raise_for_status()
    except RequestException:
        print('===request exception===')
        return data
    res.encoding = 'utf-8'  # force UTF-8 to avoid mojibake in the parsed text

    soup = BeautifulSoup(res.text, 'html.parser')

    for news in soup.select('#list li'):
        link = news.find('a')
        # skip list items without a usable link instead of raising AttributeError
        if link is None or not link.get('href'):
            continue
        # NOTE(review): index is fetched from xxx.com but links are prefixed
        # with xxx.org — confirm this mismatch is intentional
        data.append('https://xxx.org' + link.get('href'))
    return data

# Module-level crawl of the index page: `urls` is consumed by article() below.
# NOTE(review): this runs network I/O at import time — confirm that is intended.
urls = getUrl()

# 获取页面内容
def getHtml(url):
    """Fetch a page and return its HTML text.

    Args:
        url: absolute URL to download.

    Returns:
        str | None: the response body on HTTP 200; ``None`` on any other
        status code or on a request-level failure.
    """
    try:
        # timeout so one slow page cannot stall the whole crawl
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            return response.text
        # explicit None instead of silently falling off the end of the function
        return None
    except RequestException:
        print('===request exception===')
        return None

# 解析网页
def parse_html(html):
    """Extract (title, content) from an article page.

    Looks for the article under ``#entry``: the title comes from its ``<h1>``,
    the body text from ``#entrybody``.

    Args:
        html: page HTML, or ``None`` when the fetch failed.

    Returns:
        tuple[str, str] | None: ``(title, content)``, or ``None`` when the
        page is missing, does not match the expected layout, or parsing fails.
    """
    if html is None:
        return None
    try:
        soup = BeautifulSoup(html, 'html.parser')

        # Pre-initialize so a page with no #entry yields None instead of
        # raising NameError on the return statement (the original bug).
        title = None
        content = None

        for tag in soup.select('#entry'):
            heading = tag.find('h1')
            if heading is not None:
                title = heading.get_text()

            for art in tag.select('#entrybody'):
                content = art.get_text()

        if title is None or content is None:
            return None
        return title, content
    except Exception:
        # best-effort parser: report and let the caller fall back
        print('===parseHtml exception===')
        return None

# 保存到csv表中
def save2csv(title, content):
    with open('xx.csv', 'a+', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['title', 'content'])
        writer.writerow([title, content])
        pd.read_csv('xx.csv')

def article():
    """Crawl every URL in the module-level ``urls`` list and save each to CSV.

    For each URL: fetch the page, parse out (title, content), and append a row
    via ``save2csv``. When a page cannot be fetched or parsed, the URL itself
    is recorded in both columns as a placeholder.
    """
    for url in urls:
        html = getHtml(url)
        info = parse_html(html)

        # `is None` (identity), not `== None` (equality), per PEP 8
        if info is None:
            # fall back to recording the URL so the failure is visible in the CSV
            title = url
            content = url
        else:
            title, content = info

        save2csv(title, content)


# Run the crawl only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    article()

# Adapted from: blog.csdn.net/for_get_love/article/details/88865195