目标网站介绍
中国煤炭市场网 是集煤炭新闻、煤炭市场分析、煤炭价格、煤炭数据的综合性煤炭资讯平台,涵盖煤炭产量、煤炭销量、煤炭消费、煤炭港口、煤炭库存、煤炭进出口、煤炭运输等核心…
开始Scrapy
数据采集准备
1. 不了解5分钟快速抓网站思路的小伙伴先看
【Scrapy 五分钟撸网站】全站数据必备基础知识
2. 不了解数据抓取业务管理整理的小伙伴先看
【Scrapy 五分钟撸网站】爬虫目标整理和数据准备
3. 不了解Scrapy模板量产的小伙伴先看(必看)
【Scrapy 五分钟撸网站】数据抓取项目框架通用模板
数据整理结果
1. Excel保存截图
模板套用
Spider下的<项目>.py文件
1. 创建spider项目
scrapy genspider www_cctd_com_cn " "
2. 整理全站css样式
先来看下页面的CSS样式,全站统一,暗爽ing。
3. 修改www_cctd_com_cn.py的内容
这里将需要修改的地方进行说明,其他地方参考模板,不需修改。
- 作用域&自定义说明
allowed_domains = []
web_name = "中国煤炭市场"
- 添加抓取数据信息
start_menu = [
# 新闻资讯汇总
[
{
"channel_name": "煤炭资讯-新闻资讯", "url": "https://www.cctd.com.cn/list-10-1.html", },
{
"channel_name": "煤炭资讯-资讯中心", "url": "https://www.cctd.com.cn/list-9-1.html", },
{
"channel_name": "煤炭资讯-CCTD原创", "url": "https://www.cctd.com.cn/list-42-1.html", },
{
"channel_name": "煤炭资讯-宏观经济", "url": "https://www.cctd.com.cn/list-15-1.html", },
{
"channel_name": "煤炭资讯-煤炭行业", "url": "https://www.cctd.com.cn/list-17-1.html", },
{
"channel_name": "煤炭资讯-钢铁行业", "url": "https://www.cctd.com.cn/list-18-1.html", },
{
"channel_name": "煤炭资讯-焦炭行业", "url": "https://www.cctd.com.cn/list-139-1.html", },
{
"channel_name": "煤炭资讯-电力行业", "url": "https://www.cctd.com.cn/list-19-1.html", },
{
"channel_name": "煤炭资讯-建材行业", "url": "https://www.cctd.com.cn/list-20-1.html", },
{
"channel_name": "煤炭资讯-交通行业", "url": "https://www.cctd.com.cn/list-23-1.html", },
{
"channel_name": "煤炭资讯-煤化工行业", "url": "https://www.cctd.com.cn/list-21-1.html", },
{
"channel_name": "煤炭资讯-煤炭综合", "url": "https://www.cctd.com.cn/list-176-1.html", },
{
"channel_name": "煤炭资讯-煤炭运行", "url": "https://www.cctd.com.cn/list-361-1.html", },
{
"channel_name": "煤炭资讯-煤炭进出口", "url": "https://www.cctd.com.cn/list-114-1.html", },
{
"channel_name": "煤炭资讯-国际煤炭", "url": "https://www.cctd.com.cn/list-113-1.html", },
{
"channel_name": "煤炭资讯-煤炭政策", "url": "https://www.cctd.com.cn/list-11-1.html", },
{
"channel_name": "煤炭资讯-煤炭企业", "url": "https://www.cctd.com.cn/list-22-1.html", },
{
"channel_name": "煤炭资讯-煤炭安全", "url": "https://www.cctd.com.cn/list-108-1.html", },
{
"channel_name": "煤炭资讯-煤炭资源", "url": "https://www.cctd.com.cn/list-109-1.html", },
{
"channel_name": "煤炭资讯-煤炭科技", "url": "https://www.cctd.com.cn/list-112-1.html", },
{
"channel_name": "煤炭资讯-节能环保", "url": "https://www.cctd.com.cn/list-115-1.html", },
{
"channel_name": "煤炭资讯-煤炭市场分析与评论", "url": "https://www.cctd.com.cn/list-14-1.html", },
{
"channel_name": "煤炭资讯-煤炭市场周报", "url": "https://www.cctd.com.cn/list-87-1.html", },
{
"channel_name": "煤炭资讯-煤炭市场月报", "url": "https://www.cctd.com.cn/list-88-1.html", },
{
"channel_name": "煤炭资讯-煤炭价格分析", "url": "https://www.cctd.com.cn/list-91-1.html", },
{
"channel_name": "煤炭资讯-煤炭生产情况", "url": "https://www.cctd.com.cn/list-98-1.html", },
{
"channel_name": "煤炭资讯-港口煤炭市场", "url": "https://www.cctd.com.cn/list-93-1.html", },
{
"channel_name": "煤炭资讯-煤炭销售情况", "url": "https://www.cctd.com.cn/list-138-1.html", },
{
"channel_name": "煤炭资讯-钢焦煤市场", "url": "https://www.cctd.com.cn/list-92-1.html", },
{
"channel_name": "煤炭资讯-煤炭海运市场", "url": "https://www.cctd.com.cn/list-125-1.html", },
{
"channel_name": "煤炭资讯-煤炭直供电厂", "url": "https://www.cctd.com.cn/list-122-1.html", },
{
"channel_name": "煤炭资讯-国际煤市点评", "url": "https://www.cctd.com.cn/list-95-1.html", },
{
"channel_name": "市场分析-煤炭市场分析&评论 ", "url": "https://www.cctd.com.cn/list-13-1.html", },
{
"channel_name": "市场分析-煤炭市场快报", "url": "https://www.cctd.com.cn/list-128-1.html", },
{
"channel_name": "市场分析-煤炭市场分析&评论", "url": "https://www.cctd.com.cn/list-13-1.html", },
{
"channel_name": "煤炭分析-价格", "url": "https://www.cctd.com.cn/list-91-1.html", },
{
"channel_name": "煤炭分析-钢焦煤", "url": "https://www.cctd.com.cn/list-92-1.html", },
{
"channel_name": "煤炭分析-港口", "url": "https://www.cctd.com.cn/list-93-1.html", },
{
"channel_name": "煤炭分析-无烟煤", "url": "https://www.cctd.com.cn/list-94-1.html", },
{
"channel_name": "煤炭分析-点评", "url": "https://www.cctd.com.cn/list-95-1.html", },
{
"channel_name": "煤炭分析-扫描", "url": "https://www.cctd.com.cn/list-96-1.html", },
{
"channel_name": "煤炭分析-上旬", "url": "https://www.cctd.com.cn/list-89-1.html", },
{
"channel_name": "煤炭分析-中旬", "url": "https://www.cctd.com.cn/list-90-1.html", },
{
"channel_name": "煤炭分析-生产", "url": "https://www.cctd.com.cn/list-98-1.html", },
{
"channel_name": "煤炭分析-运输", "url": "https://www.cctd.com.cn/list-99-1.html", },
{
"channel_name": "煤炭分析-电力", "url": "https://www.cctd.com.cn/list-100-1.html", },
{
"channel_name": "煤炭分析-钢铁", "url": "https://www.cctd.com.cn/list-101-1.html", },
{
"channel_name": "煤炭分析-焦炭", "url": "https://www.cctd.com.cn/list-102-1.html", },
{
"channel_name": "煤炭分析-建材", "url": "https://www.cctd.com.cn/list-103-1.html", },
{
"channel_name": "煤炭分析-化工", "url": "https://www.cctd.com.cn/list-104-1.html", },
{
"channel_name": "煤炭分析-煤炭动态", "url": "https://www.cctd.com.cn/list-116-1.html", },
{
"channel_name": "煤炭分析-动力煤", "url": "https://www.cctd.com.cn/list-133-1.html", },
{
"channel_name": "煤炭分析-综述", "url": "https://www.cctd.com.cn/list-118-1.html", },
{
"channel_name": "煤炭分析-国际", "url": "https://www.cctd.com.cn/list-135-1.html", },
{
"channel_name": "煤炭分析-炼焦煤", "url": "https://www.cctd.com.cn/list-134-1.html", },
{
"channel_name": "煤炭分析-库存", "url": "https://www.cctd.com.cn/list-120-1.html", },
{
"channel_name": "煤炭分析-价格", "url": "https://www.cctd.com.cn/list-121-1.html", },
{
"channel_name": "煤炭分析-进出口", "url": "https://www.cctd.com.cn/list-123-1.html", },
{
"channel_name": "煤炭分析-直供电厂", "url": "https://www.cctd.com.cn/list-122-1.html", },
{
"channel_name": "煤炭分析-进出口", "url": "https://www.cctd.com.cn/list-123-1.html", },
{
"channel_name": "煤炭分析-港口", "url": "https://www.cctd.com.cn/list-124-1.html", },
{
"channel_name": "煤炭分析-运价", "url": "https://www.cctd.com.cn/list-125-1.html", },
{
"channel_name": "煤炭分析-冶金", "url": "https://www.cctd.com.cn/list-126-1.html", },
{
"channel_name": "煤炭分析-生产", "url": "https://www.cctd.com.cn/list-136-1.html", },
{
"channel_name": "煤炭分析-运输", "url": "https://www.cctd.com.cn/list-137-1.html", },
{
"channel_name": "煤炭分析-销售", "url": "https://www.cctd.com.cn/list-138-1.html", },
{
"channel_name": "煤炭分析-市场观察员", "url": "https://www.cctd.com.cn/list-44-1.html", },
{
"channel_name": "煤炭分析-指数报告", "url": "https://www.cctd.com.cn/list-45-1.html", },
{
"channel_name": "煤炭分析-全国", "url": "https://www.cctd.com.cn/list-91-1.html", },
{
"channel_name": "煤炭分析-CCTD秦皇岛", "url": "https://www.cctd.com.cn/list-463-1.html", },
{
"channel_name": "煤炭分析-内蒙古", "url": "https://www.cctd.com.cn/list-47-1.html", },
{
"channel_name": "煤炭分析-山西", "url": "https://www.cctd.com.cn/list-49-1.html", },
{
"channel_name": "煤炭分析-陕西", "url": "https://www.cctd.com.cn/list-48-1.html", },
{
"channel_name": "煤炭分析-湖北", "url": "https://www.cctd.com.cn/list-642-1.html", },
{
"channel_name": "煤炭分析-重庆", "url": "https://www.cctd.com.cn/list-556-1.html", },
{
"channel_name": "煤炭分析-榆林", "url": "https://www.cctd.com.cn/list-543-1.html", },
{
"channel_name": "煤炭分析-长江口", "url": "https://www.cctd.com.cn/list-423-1.html", },
{
"channel_name": "动力煤期货-新闻资讯 ", "url": "https://www.cctd.com.cn/list-609-1.html", },
{
"channel_name": "动力煤期货-投研观点 ", "url": "https://www.cctd.com.cn/list-610-1.html", },
{
"channel_name": "动力煤期货-交割情况 ", "url": "https://www.cctd.com.cn/list-611-1.html", },
{
"channel_name": "动力煤期货-动力煤期货高级分析师 ", "url": "https://www.cctd.com.cn/list-625-1.html", },
{
"channel_name": "动力煤期货-活动动态", "url": "https://www.cctd.com.cn/list-607-1.html", },
{
"channel_name": "动力煤期货-公告与通知 ", "url": "https://www.cctd.com.cn/list-606-1.html", },
{
"channel_name": "资讯中心-新闻资讯-热点数据 ", "url": "https://www.cctd.com.cn/list-576-1.html", },
]
]
- 样式整理
整体网站的数据列表有多少种样式,就要写多少个parseX,并添加到parse_list中:
parse_list = [
self.parse1,
]
- 标题&链接&封面
由于整体网站的内容列表没有图片,因此不使用Item_thumbImg。
Item_title = response.xpath('//td[@style="padding-left: 10px"]/li/a/text()').extract() # 文章标题列表
Item_url = response.xpath('//td[@style="padding-left: 10px"]/li/a/@href').extract() # 文章链接列表
# Item_thumbImg = response.xpath('//标签[@class="属性"]/li/a/img/@src').extract() # 文章封面图片列表
Spider下的parse_detail.py文件
1. 抓取详情页内容
修改列表数据详情页的CSS抓取样式
# 处理详情页带格式,这里整个页面进行抓取
item['content'] = ""
if 'class="news_show"' in response.text and len(None2Str(item['content'])) < 5:
item['content'] = response.xpath('//div[@class="news_show"]').extract_first()
if 'id="Zoom"' in response.text and len(None2Str(item['content'])) < 5:
item['content'] = response.xpath('//div[@id="Zoom"]').extract_first()
2. 特别说明
有些网站的程序员丧心病狂到一定程度,10个页面能有9种样式。由于我们不可能把每个页面都打开看一下详情页的CSS格式,因此这里提供一个通用的解决办法。
- 第一次抓取完内容之后,打开MongoDB数据库执行下面的命令,会把content中包含body的页面数据筛选出来。这些数据没有按照指定样式抓取,而是直接抓取了整个页面。
db.你的表名.find({content:/body/})
- 打开筛选结果中任意一条的link,循环处理详情页的内容,直到mongo命令筛选不出内容为止即可。