21jingji.com Crawler Example

This post documents how to use Scrapy to collect the news articles under the finance channel of 21jingji.com.

items.py

import scrapy

class NewsItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()         # headline
    summary = scrapy.Field()       # abstract
    origin = scrapy.Field()        # URL of the original article
    cover = scrapy.Field()         # cover image of the article
    author = scrapy.Field()        # author
    publish_date = scrapy.Field()  # publication date
    publish_time = scrapy.Field()  # publication date with time
    publish_from = scrapy.Field()  # source the article was published from
    category = scrapy.Field()      # category the article belongs to, e.g. report
    type = scrapy.Field()          # content type: image (img) or text (txt)
    html = scrapy.Field()          # HTML content
    text = scrapy.Field()          # plain-text content
    image_urls = scrapy.Field()    # URLs of images in the article body
    images = scrapy.Field()        # local paths of downloaded body images
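
The image_urls and images fields follow the field-naming convention of Scrapy's built-in ImagesPipeline, so images referenced in the article body can be downloaded automatically once that pipeline is enabled. The original post does not show settings.py, so the fragment below is only a minimal sketch; the storage directory and priority numbers are assumptions.

settings.py (fragment)

# Enable Scrapy's built-in image downloader together with the MongoDB
# pipeline defined later in pipelines.py. The priority values are placeholders.
ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 1,
    'news.pipelines.NewsPipeline': 300,
}
IMAGES_STORE = 'images'  # assumed local directory for downloaded images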

21FinanceSpider.py

# -*- coding: utf-8 -*-
import re
import scrapy
import logging
import datetime
from news.items import NewsItem

# configure the logging output format
logging.basicConfig(level=logging.INFO,
                    format='[%(asctime)-15s] [%(levelname)8s] [%(name)10s ] - %(message)s (%(filename)s:%(lineno)s)',
                    datefmt='%Y-%m-%d %H:%M:%S'
                    )
logger = logging.getLogger(__name__)


class Finance21Spider(scrapy.Spider):
    name = '21Finance'
    allowed_domains = ['www.21jingji.com']
    start_urls = ['http://www.21jingji.com/channel/finance/']
    current_page = 1
    today = str(datetime.date.today())
    base_url = "http://www.21jingji.com/channel/finance/{}.html"
    has_next = True

    def parse(self, response):  # extract data from the latest-news listing page
        logger.info('Current page: ' + str(self.current_page))
        self.current_page += 1
        news_list = response.xpath("//div[@id='data_list']/div[@class='li']")
        for news in news_list:
            tlist = news.xpath("./div[@class='Tlist']")
            content = tlist[0] if tlist else None
            if content is not None:
                item = NewsItem()
                item['category'] = '资讯'  # category value kept as used on the site ("news/information")
                item['type'] = 'TXT'
                item['origin'] = news.xpath("./a/@href").extract_first()
                item['cover'] = news.xpath("./a/img/@src").extract_first()
                item['title'] = content.xpath("./a[@class='listTit']/@title").extract_first()
                item['publish_from'] = content.xpath("./p[@class='shuoming']/child::span[1]/text()").extract_first()
                item['author'] = content.xpath("./p[@class='shuoming']/child::span[2]/text()").extract_first()
                # follow the link to crawl the article detail page
                yield scrapy.Request(
                    item['origin'],
                    callback=self.parse_detail,
                    meta={'item': item})

        # pagination: the listing shows a "查看更多" (view more) link while more pages exist
        more_divs = response.xpath("//div[@class='moreD']")
        moreD = more_divs[0] if more_divs else None
        if moreD is not None:
            moreP = moreD.xpath("p[@class='moreP']/a[text()='查看更多']").extract_first()
            if moreP is None:
                self.has_next = False

        if self.has_next:
            # build the URL of the next listing page
            next_url = self.base_url.format(str(self.current_page))
            yield scrapy.Request(next_url, callback=self.parse)

    def parse_detail(self, response):  # extract data from the article detail page
        item = response.meta['item']
        wh = response.xpath("//p[@class='Wh']")[0]
        retstr = wh.xpath("./span[1]/text()").extract_first()
        match = re.search(r"(\d{4}-\d{2}-\d{2})", retstr)
        if match:
            date_str = match.group(0)
            tm = re.search(r"(\s\d{2}:\d{2})", retstr)
            time_str = tm.group(0) if tm else ''
        else:
            # convert dates like 2020年03月18日 into 2020-03-18
            date_str = retstr.replace('年', '-').replace('月', '-').replace('日', '')
            time_str = wh.xpath("./span[2]/text()").extract_first()
        # if date_str == self.today:
        if True:  # originally meant to keep only articles published today (see the commented condition above)
            item['publish_date'] = date_str
            item['publish_time'] = date_str + " " + time_str
            item['summary'] = response.xpath("//p[@class='abstract backg']/text()").extract_first()
            news_html = response.xpath("//div[@class='detailCont']")[0]
            texts = news_html.xpath("normalize-space(string(.))").extract()
            for i in range(0, len(texts)):
                texts[i] = re.sub(r'\s', ' ', texts[i])  # collapse whitespace characters into plain spaces
            item['text'] = ''.join(texts)
            item['html'] = news_html.extract().replace('http://www.21jingji.com/', 'javascript:void(0);')  # neutralize links back to the source site in the stored HTML
            yield item
            # logger.info(item)
        else:
            self.has_next = False
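
The spider can be started from the project root with the usual scrapy crawl 21Finance command. Alternatively, a small runner script can launch it from Python; the sketch below assumes the standard Scrapy project layout, and the file name run.py is only illustrative.

run.py

# -*- coding: utf-8 -*-
# Minimal runner: starts the 21Finance spider with the project's settings.
# Assumes this script lives next to scrapy.cfg in the "news" project.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

if __name__ == '__main__':
    process = CrawlerProcess(get_project_settings())
    process.crawl('21Finance')  # spider name defined in 21FinanceSpider.py
    process.start()             # blocks until the crawl finishes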

pipelines.py

# -*- coding: utf-8 -*-

from pymongo import MongoClient
from news.items import NewsItem

class NewsPipeline(object):

    @classmethod
    def from_crawler(cls, crawler):
        return cls()

    def open_spider(self, spider):
        # connect to MongoDB and authenticate against the "test" database
        # (Database.authenticate was removed in PyMongo 4; pass credentials to MongoClient instead)
        self.client = MongoClient(host='172.16.250.238', port=27017,
                                  username='bigdata', password='bigdata',
                                  authSource='test')
        self.db = self.client['test']

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        collection = self.db['statsNews']
        if isinstance(item, NewsItem):
            collection.insert_one(dict(item))
        return item  # return the item so any later pipelines can still process it
        # with open('C:\\Users\\pjli\\Desktop\\pyworks\\news.txt', 'w', encoding='utf-8') as f:
        #     json.dump(dict(item), f, ensure_ascii=False, indent=2)
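
After a crawl finishes, the documents written by NewsPipeline can be checked directly with pymongo. The snippet below is only a verification sketch; it reuses the host, credentials, and collection name from pipelines.py above, so adjust them to your own environment.

check_mongo.py

# -*- coding: utf-8 -*-
# Quick look at what the pipeline stored in MongoDB (connection details
# copied from pipelines.py above).
from pymongo import MongoClient

client = MongoClient(host='172.16.250.238', port=27017,
                     username='bigdata', password='bigdata',
                     authSource='test')
collection = client['test']['statsNews']
print('documents stored:', collection.count_documents({}))
# print the publish time and title of the five most recently inserted articles
for doc in collection.find({}, {'title': 1, 'publish_time': 1}).sort('_id', -1).limit(5):
    print(doc.get('publish_time'), doc.get('title'))
client.close()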

Reposted from blog.csdn.net/pengjunlee/article/details/104940085