Web Crawler (4): Advanced Usage of Scrapy

After mastering the basics of Scrapy, you can put it to use in more powerful ways.
By further configuring the items.py and pipelines.py files, the data downloaded from web pages can be organized by field and saved.
As a hands-on example, we will crawl NetEase News (news.163.com) for the article body, publication time, source, and so on.
items.py
Code:

import scrapy
class NewsItem(scrapy.Item):
	# one field for each piece of data saved per article
	news_thread = scrapy.Field()   # article ID taken from the URL
	news_title = scrapy.Field()    # page title
	news_url = scrapy.Field()      # URL of the article page
	news_time = scrapy.Field()     # publication time
	news_source = scrapy.Field()   # news source (publisher)
	source_url = scrapy.Field()    # link to the source site
	news_text = scrapy.Field()     # article body text

pipelines.py
Code:

from scrapy.exporters import CsvItemExporter
class NewsPipeline(object):
	def __init__(self):
		# create a file named news_data.csv and a CSV exporter that writes into it
		self.file = open('news_data.csv', 'wb')
		self.exporter = CsvItemExporter(self.file, encoding='gbk')
		self.exporter.start_exporting()

	def process_item(self, item, spider):
		# write every item returned by the spider as one CSV row
		self.exporter.export_item(item)
		return item

	def close_spider(self, spider):
		# finish exporting and close the file when the spider closes
		self.exporter.finish_exporting()
		self.file.close()

Purpose: organize the scraped data by field and save it into the .csv file created above.
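
Note that Scrapy only runs this pipeline if it is enabled in settings.py. A minimal sketch, assuming the project (and therefore the Python package) is named news, matching the spider's import below:

# settings.py
ITEM_PIPELINES = {
    'news.pipelines.NewsPipeline': 300,  # the number is the pipeline's priority (lower runs first)
}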

Main spider code:
The filename of this spider is determined by the name given when the spider was created (for example with scrapy genspider).
Code:

import scrapy
from news.items import NewsItem
from scrapy.linkextractors import LinkExtractor
# link extractor
from scrapy.spiders import CrawlSpider,Rule


class News163Spider(CrawlSpider):  # inherits from CrawlSpider
	name = 'news163'
	allowed_domains = ['news.163.com']
	start_urls = ['http://news.163.com/']
	rules = (
		Rule(LinkExtractor(allow=r"/18/09\d+/*"),
			callback="parse_news", follow=True),
	)
	# any followed URL matching allow=r"/18/09\d+/*" (a regular expression)
	# is handed to parse_news (see the quick regex check after the code)

	def parse_news(self,response):
		item = NewsItem()  # instantiate the item; it can be used like a dict
		item['news_thread'] = response.url.strip().split('/')[-1][:-5]  # last URL segment without the '.html' extension
		self.get_title(response,item)
		self.get_time(response,item)
		self.get_source(response,item)
		self.get_url(response,item)
		self.get_source_url(response,item)
		self.get_text(response,item)
		return item

	def get_title(self,response,item):
		title = response.css('title::text').extract()
		print('*'*20)
		if title:  # make sure the selector returned something
			print('title:{}'.format(title[0][:-5]))
			item['news_title'] = title[0][:-5]  # drop the trailing 5 characters (site-name suffix in the page title)
	def get_time(self,response,item):
		time = response.css('.post_time_source::text').extract()
		if time:
			print('time:{}'.format(time[0][:-5]))
			item['news_time'] = time[0][:-5]
	def get_source(self,response,item):
		source = response.css('#ne_article_source::text').extract()
		if source:
			print('source:{}'.format(source[0]))
			item['news_source'] = source[0]
	def get_source_url(self,response,item):
		source_url = response.css('#ne_article_source::attr(href)').extract()
		# ::attr(href) selects the value of the href attribute
		if source_url:
			print('source_url:{}'.format(source_url[0]))
			item['source_url'] = source_url[0]
	def get_text(self,response,item):
		text = response.css('#endText p::text').extract()
		if text:
			print('text:{}'.format(text))
			item['news_text'] = text  # list of paragraph strings from the article body
	def get_url(self,response,item):
		url = response.url
		if url:
			print('news_url:{}'.format(url))
			item['news_url']=url		
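
To see which links the Rule above will actually follow, the allow pattern can be tested against a URL by hand. A small sketch; the sample URL is only illustrative of the news.163.com article-path format and is not taken from the original post:

import re

pattern = re.compile(r"/18/09\d+/*")  # the allow pattern from the CrawlSpider rule
sample = "http://news.163.com/18/0925/10/SAMPLE0000123456.html"  # illustrative URL (assumed format)
print(bool(pattern.search(sample)))  # True -> this link would be followed and passed to parse_news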

Run the spider with scrapy crawl news163, and you will get a neatly organized CSV file named news_data.csv.
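
To quickly verify the exported data, the file can be read back with Python's csv module. A small sketch, assuming the gbk encoding set in the pipeline and the field names defined in items.py:

import csv

with open('news_data.csv', encoding='gbk', newline='') as f:
    for row in csv.DictReader(f):  # column names come from the item fields
        print(row['news_title'], row['news_time'], row['news_url'])
        break  # only inspect the first record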

Reprinted from blog.csdn.net/qq_42785117/article/details/83502968