Scraping CSDN with Scrapy

1. Open a cmd window, cd to the Desktop, and run scrapy startproject csdn_spider. Then cd into the csdn_spider directory and run scrapy genspider csdn csdn.net. The project is now created.

2. The specific data to crawl is as follows:

(1) From the starting page of recommended columns, crawl every column's URL, cover-image URL, and column title on each page, and save the data to a database and an Excel spreadsheet.

(2) For each column URL, crawl every blog post's title, URL, summary, creation date, and view count on each page.

(3) Follow each article link, fetch the page's HTML source and save it, and also convert the article data to JSON.

3. Next, implement the extraction of the required data in the spider. The code is as follows:

import scrapy
from scrapy.http import Request
from ..items import ImgItem, ArticleItem

class CsdnSpider(scrapy.Spider):
    name = 'csdn'
    allowed_domains = ['csdn.net']
    start_urls = ['https://blog.csdn.net/column.html']
    base_url = 'https://blog.csdn.net'

    def parse(self, response):

        columns = response.xpath('//div[contains(@class,"column_list  ")]')

        for c in columns:

            bg_img = c.xpath('div[@class="column_bg"]/@style').extract_first('')
            # Cover image: the URL is embedded in the inline background style
            img_src = bg_img.split('(')[-1].strip(')')

            # Link to the column detail page
            link = c.xpath('a[@class="column_list_link"]/@href').extract_first('')
            # Build the absolute URL
            url = self.base_url + link
            # Column title
            title = c.xpath('a[@class="column_list_link"]/div/p/text()').extract_first('')
            # Number of posts in the column
            article_nums = c.xpath('a[@class="column_list_link"]/div/div/div[1]/span/text()').extract_first('')
            # View count
            look_nums = c.xpath('a[@class="column_list_link"]/div/div/div[2]/span/text()').extract_first('')

            # Item holding the cover image and column summary
            item = ImgItem()
            item['src'] = [img_src]
            item['url'] = url
            item['title'] = title
            item['article_nums'] = article_nums
            item['look_nums'] = look_nums
            item['referer'] = response.url
            # Yield the listing-page item
            yield item

            # Request the column detail page, passing the title along in meta
            yield Request(
                url=url,
                callback=self.parse_detail,
                meta={
                    'title': title
                }
            )
            # Only the first column is crawled here; remove this break to crawl them all
            break

    def parse_detail(self, response):

        # Column the articles belong to, passed along in meta
        a_type = response.meta.get('title')

        # All blog posts listed on this page
        lis = response.xpath('//ul[@class="detail_list"]/li')

        for li in lis:
            # Post title
            title = li.xpath('h4/a/text()').extract_first('')
            # Post link
            href = li.xpath('h4/a/@href').extract_first('')
            # Post summary
            description = li.xpath('p/text()').extract_first('')
            # Creation date
            create_time = li.xpath('div/span/text()').extract_first('')
            # View count
            look_nums = li.xpath('div/em/text()').extract_first('')

            article = ArticleItem()
            article['a_type'] = a_type
            article['title'] = title
            article['href'] = [href]
            article['description'] = description
            article['create_time'] = create_time
            article['look_nums'] = look_nums
            article['referer'] = response.url

            yield article
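
Note that parse() above stops after the first column (the break) and parse_detail() only handles the first page of a column. If pagination is needed, something along the following lines could be appended at the end of parse_detail(); this is only a sketch, and the next-page XPath is a hypothetical placeholder that has to be checked against the real page markup:

        # Hypothetical pagination step (append at the end of parse_detail);
        # the XPath is a guess and must be verified against the actual page
        next_href = response.xpath('//a[contains(@class, "next")]/@href').extract_first('')
        if next_href:
            yield Request(
                url=response.urljoin(next_href),
                callback=self.parse_detail,
                # keep passing the column title down to the next page
                meta={'title': a_type}
            )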

4. The code needed in items.py is as follows:

import scrapy

class CsdnSpiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass

# Item for a column on the listing page
class ImgItem(scrapy.Item):

    src = scrapy.Field()
    url = scrapy.Field()
    title = scrapy.Field()
    article_nums = scrapy.Field()
    look_nums = scrapy.Field()
    referer = scrapy.Field()

    def save(self, cursor):
        # Parameterized query, so quotes in the title cannot break the SQL
        sql = "INSERT INTO section(title, article_nums, look_nums, url, src) VALUES(?, ?, ?, ?, ?)"
        cursor.execute(sql, (self['title'], self['article_nums'], self['look_nums'], self['url'], self['src'][0]))


# Item for a blog post on a column's detail page
class ArticleItem(scrapy.Item):
    a_type = scrapy.Field()
    title = scrapy.Field()
    description = scrapy.Field()
    create_time = scrapy.Field()
    look_nums = scrapy.Field()
    href = scrapy.Field()
    referer = scrapy.Field()

    def save(self, cursor):
        # Parameterized query, same as ImgItem.save()
        sql = "INSERT INTO article(title, a_type, look_nums, description, create_time, href) VALUES(?, ?, ?, ?, ?, ?)"
        cursor.execute(sql, (self['title'], self['a_type'], self['look_nums'], self['description'], self['create_time'], self['href'][0]))
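
To sanity-check the save() methods outside of Scrapy, they can be exercised against an in-memory SQLite database. The sketch below uses made-up sample values and assumes it is run from the project root so that ImgItem is importable:

import sqlite3
from csdn_spider.items import ImgItem

conn = sqlite3.connect(':memory:')
cursor = conn.cursor()
cursor.execute('CREATE TABLE section(id INTEGER PRIMARY KEY, title CHAR, '
               'article_nums INTEGER, look_nums INTEGER, src CHAR, url CHAR)')

item = ImgItem()
item['title'] = 'demo column'          # sample values for illustration only
item['article_nums'] = 10
item['look_nums'] = 100
item['url'] = 'https://blog.csdn.net/column/details/demo.html'
item['src'] = ['https://img.blog.csdn.net/demo.jpg']
item.save(cursor)

cursor.execute('SELECT title, article_nums FROM section')
print(cursor.fetchall())               # [('demo column', 10)]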

5. The site's images are protected against hotlinking, so a random User-Agent header is set in middlewares.py (the Referer needed for the images is added later in the image pipeline):

from fake_useragent import UserAgent

class RandomUAMiddleware(object):
    """This middleware allows spiders to override the user_agent"""

    def __init__(self):
        self.ua = UserAgent()

    def process_request(self, request, spider):
        # Attach a random User-Agent to every outgoing request
        request.headers.setdefault(b'User-Agent', self.ua.random)
        print(request)  # debug output; remove once the crawl is stable
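
fake_useragent may need to fetch its browser database the first time it runs and can fail when that lookup is unavailable. If that becomes a problem, a self-contained alternative is to rotate through a small hard-coded list; the class name and the UA strings below are just illustrative, not part of the original project:

import random

class SimpleRandomUAMiddleware(object):
    """Fallback middleware: pick a User-Agent from a hard-coded list."""

    UA_LIST = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:62.0) Gecko/20100101 Firefox/62.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Safari/605.1.15',
    ]

    def process_request(self, request, spider):
        # Overwrite (rather than setdefault) so every request gets a fresh UA
        request.headers['User-Agent'] = random.choice(self.UA_LIST)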

6. The pipelines write the data to the SQLite database, Excel spreadsheets, and other outputs. The code is as follows:

import sqlite3
import os
import json
import codecs

from scrapy.pipelines.images import ImagesPipeline
from scrapy.http import Request
import requests
import xlwt

from .items import ImgItem, ArticleItem


class CsdnSpiderPipeline(object):

    def process_item(self, item, spider):
        return item


class SaveToSqlite(object):

    def open_spider(self, spider):

        self.create_table()

    def create_table(self):
        self.connect_sql()
        # Table for the columns on the listing page
        sql = 'CREATE TABLE IF NOT EXISTS section(id INTEGER PRIMARY KEY, title CHAR, article_nums INTEGER, look_nums INTEGER, src CHAR, url CHAR)'
        self.cursor.execute(sql)
        # Table for the blog posts
        sql = 'CREATE TABLE IF NOT EXISTS article(id INTEGER PRIMARY KEY, title CHAR, a_type CHAR, look_nums INTEGER, description CHAR, href CHAR, create_time CHAR)'
        self.cursor.execute(sql)

        self.close_sql()

    def connect_sql(self):
        self.conn = sqlite3.connect('csdn.db')
        self.cursor = self.conn.cursor()

    def close_sql(self):
        self.conn.commit()
        self.cursor.close()
        self.conn.close()

    def process_item(self, item, spider):
        self.connect_sql()
        # Both ImgItem and ArticleItem implement save(), so each item
        # writes itself into the right table
        item.save(self.cursor)
        # Commit and close the database connection
        self.close_sql()

        return item


class CustomFilesPipeline(object):

    def __init__(self):
        if not os.path.exists('files'):
            os.mkdir('files')
        # Browser-like request headers (copied from a real session) so the
        # article pages can be fetched outside of Scrapy's downloader
        self.headers = {
            'Host': 'blog.csdn.net',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:56.0) Gecko/20100101 Firefox/56.0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
            'Accept-Encoding': 'gzip, deflate, br',
            'Cookie': 'TY_SESSION_ID=0be1faa9-44b5-46c6-a7e1-91c25b2b9ac7; uuid_tt_dd=10_19734560250-1536220718353-140611; uuid=b15cfa66-0320-4b76-8327-3a854d6ab2fe; Hm_lvt_6bcd52f51e9b3dce32bec4a3997715ac=1536220725,1536227113,1536282369,1536289663; dc_tos=peo2gz; dc_session_id=10_1536289653416.952275; Hm_lpvt_6bcd52f51e9b3dce32bec4a3997715ac=1536291972; scvh=2017-11-16+16%3a54%3a14+004',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Cache-Control': 'max-age=0, no-cache',
            'Pragma': 'no-cache'
        }

    def process_item(self, item, spider):
        # Download each article page ourselves with requests
        if isinstance(item, ArticleItem):
            href = item['href'][0]
            title = item['title']
            response = requests.get(href, headers=self.headers)
            # Save the HTML source under files/; titles containing characters
            # such as "/" would need sanitizing before use as filenames
            with open('files/' + title + '.html', 'w+', encoding='utf-8') as f:
                f.write(response.text)
        return item


class CustomImagesPipeline(ImagesPipeline):

    def get_media_requests(self, item, info):
        # Only ImgItem carries an image: build a Request per image URL and return them
        if isinstance(item, ImgItem):
            reqs = [Request(x, meta={'item': item}) for x in item.get(self.images_urls_field, [])]
            for r in reqs:
                # The Referer and Host headers are what get past the hotlink protection
                r.headers.setdefault('Referer', item['referer'])
                r.headers.setdefault('Host', 'img.blog.csdn.net')

            return reqs

        else:
            return []

    def file_path(self, request, response=None, info=None):

        item = request.meta.get('item')
        title = item['title']

        return title+'.jpg'


class ExcelWritePipeline(object):

    def __init__(self):
        self.section = xlwt.Workbook(encoding='utf-8')
        self.article = xlwt.Workbook(encoding='utf-8')
        self.sheet1 = self.section.add_sheet('section')
        self.sheet2 = self.article.add_sheet('article')
        self.img_count = 0
        self.article_count = 0

    def open_spider(self, spider):

        self.sheet1.write(0, 0, 'title')
        self.sheet1.write(0, 1, 'src')
        self.sheet1.write(0, 2, 'url')
        self.sheet1.write(0, 3, 'look_nums')
        self.sheet1.write(0, 4, 'article_nums')

        self.sheet2.write(0, 0, 'title')
        self.sheet2.write(0, 1, 'a_type')
        self.sheet2.write(0, 2, 'description')
        self.sheet2.write(0, 3, 'look_nums')
        self.sheet2.write(0, 4, 'href')
        self.sheet2.write(0, 5, 'create_time')

    def process_item(self, item, spider):

        if isinstance(item, ImgItem):
            self.img_count += 1
            self.sheet1.write(self.img_count, 0, item['title'])
            self.sheet1.write(self.img_count, 1, item['src'][0])
            self.sheet1.write(self.img_count, 2, item['url'])
            self.sheet1.write(self.img_count, 3, item['look_nums'])
            self.sheet1.write(self.img_count, 4, item['article_nums'])
            self.section.save('section.xls')

        elif isinstance(item, ArticleItem):
            self.article_count += 1
            self.sheet2.write(self.article_count, 0, item['title'])
            self.sheet2.write(self.article_count, 1, item['a_type'])
            self.sheet2.write(self.article_count, 2, item['description'])
            self.sheet2.write(self.article_count, 3, item['look_nums'])
            self.sheet2.write(self.article_count, 4, item['href'][0])
            self.sheet2.write(self.article_count, 5, item['create_time'])
            self.article.save('article.xls')

        return item

    def close_spider(self, spider):

        self.article.save('article.xls')
        self.section.save('section.xls')

class JsonWritePipeline(object):
    def __init__(self):
        # Open the output file for writing
        self.file = codecs.open('article.json', 'w+', encoding='utf-8')
        # Write the opening bracket; the final file will look like [{},{},{}]
        self.file.write('[')

    def process_item(self, item, spider):
        # Serialize each ArticleItem to a JSON string
        if isinstance(item, ArticleItem):
            # Convert the item to a plain dict
            item_dict = dict(item)
            # Dump to JSON and append a trailing comma
            json_str = json.dumps(item_dict) + ','
            self.file.write(json_str)

        return item

    def close_spider(self, spider):
        # seek(offset, whence) moves the file cursor; with os.SEEK_END,
        # seek(-1, os.SEEK_END) lands just before the last character,
        # i.e. on the trailing comma
        self.file.seek(-1, os.SEEK_END)
        # Cut off everything after the cursor (the comma)
        self.file.truncate()
        # Close the JSON array and the file
        self.file.write(']')
        self.file.close()
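
The seek/truncate dance above is easy to get wrong (for instance, it would strip the opening bracket if no articles were scraped). A simpler variant, shown here as a sketch that would live in the same pipelines.py, buffers the items in memory and writes the whole array to a separate file once when the spider closes:

class SimpleJsonWritePipeline(object):
    """Alternative: buffer ArticleItem dicts and dump them once on close."""

    def open_spider(self, spider):
        self.articles = []

    def process_item(self, item, spider):
        if isinstance(item, ArticleItem):
            self.articles.append(dict(item))
        return item

    def close_spider(self, spider):
        # ensure_ascii=False keeps Chinese titles readable in the output file
        with codecs.open('article_simple.json', 'w', encoding='utf-8') as f:
            json.dump(self.articles, f, ensure_ascii=False, indent=2)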

7. Finally, configure all of the above in settings.py so that it takes effect.

(1) Set ROBOTSTXT_OBEY (line 22 of settings.py) to False:

ROBOTSTXT_OBEY = False

(2) Disable Scrapy's built-in user-agent middleware and enable the custom random User-Agent middleware:

DOWNLOADER_MIDDLEWARES = {
   # Enable the custom random User-Agent middleware
   'csdn_spider.middlewares.RandomUAMiddleware': 555,
   # Disable Scrapy's built-in UserAgentMiddleware
   'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None
}

(3) Set the order in which the pipelines run, and configure where images and files are stored:

ITEM_PIPELINES = {
   'csdn_spider.pipelines.SaveToSqlite': 300,
   'csdn_spider.pipelines.CustomFilesPipeline': 301,
   'csdn_spider.pipelines.CustomImagesPipeline': 302,
   'csdn_spider.pipelines.ExcelWritePipeline': 303,
   'csdn_spider.pipelines.JsonWritePipeline': 304,
}
# Item field holding the image URLs, and the image storage directory
IMAGES_URLS_FIELD = 'src'
IMAGES_STORE = 'images'

# File download settings: item field holding the file URLs, and the storage directory
FILES_URLS_FIELD = 'href'
FILES_STORE = 'files'
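
With everything configured, the crawl is normally started with scrapy crawl csdn from the project directory. It can also be launched from a small Python script placed next to scrapy.cfg (a minimal sketch; the file name run.py is arbitrary):

# run.py - start the csdn spider programmatically
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

if __name__ == '__main__':
    process = CrawlerProcess(get_project_settings())
    process.crawl('csdn')   # spider name, as defined in CsdnSpider.name
    process.start()         # block until the crawl finishes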


Reposted from blog.csdn.net/qq_42598133/article/details/82500027