启动文件 main.py（运行此文件即可在 IDE 中启动爬虫）
# Programmatic launcher: equivalent to running
# `scrapy crawl bili_gr_xx` on the command line.
from scrapy.cmdline import execute

execute(['scrapy', 'crawl', 'bili_gr_xx'])
spiders 目录下的爬虫文件
# -*- coding: utf-8 -*-
import scrapy,json
from .. import items
class BiliGrXxSpider(scrapy.Spider):
    """Crawl basic profile info for bilibili member ids 1..200.

    POSTs to the space GetInfo ajax endpoint once per member id and
    yields one ``items.bili_mysql`` item per successful response.
    """
    name = 'bili_gr_xx'
    allowed_domains = ['bilibili.com']

    def start_requests(self):
        """Yield one POST form request per member id (1..200).

        The endpoint checks the Referer header, so each request carries
        the member's own space-page URL as Referer.
        """
        url = 'https://space.bilibili.com/ajax/member/GetInfo'
        for mid in range(1, 201):
            form = {
                'mid': str(mid),
                'csrf': '',
            }
            # FormRequest already defaults to POST; no explicit method needed.
            req = scrapy.FormRequest(url=url, formdata=form, callback=self.parse)
            req.headers['referer'] = 'https://space.bilibili.com/{}/'.format(mid)
            yield req

    def parse(self, response):
        """Parse the GetInfo JSON payload into a bili_mysql item.

        Fix: the original indexed ``html['data'][...]`` unguarded and
        raised KeyError/TypeError on error payloads (the API can return a
        non-dict ``data`` for missing/blocked ids); such responses are
        now skipped with a warning instead of killing the callback.
        """
        print('--' * 20)
        payload = json.loads(response.text)
        data = payload.get('data')
        if not isinstance(data, dict):
            # NOTE(review): on failures the endpoint appears to put an error
            # string in "data" — skip rather than crash; confirm against API.
            self.logger.warning('unexpected GetInfo payload for %s', response.url)
            return
        item = items.bili_mysql()
        item['name'] = data.get('name')
        item['ID'] = data.get('mid')
        item['sex'] = data.get('sex')
        item['tx_img'] = data.get('face')
        item['gr_biaoq'] = data.get('sign')
        # official_verify may be absent entirely; treat that like no desc.
        item['chao'] = (data.get('official_verify') or {}).get('desc')
        # Normalise empty strings to None so the MySQL pipeline stores NULL.
        for key in item:
            if item[key] == '':
                item[key] = None
        yield item
items文件
class bili_mysql(scrapy.Item):
    """Item carrying one bilibili member profile row for the MySQL pipeline."""
    name = scrapy.Field()      # member display name
    ID = scrapy.Field()        # member id ("mid")
    sex = scrapy.Field()       # gender string
    tx_img = scrapy.Field()    # avatar image URL ("face")
    gr_biaoq = scrapy.Field()  # personal signature ("sign")
    chao = scrapy.Field()      # official-verify description ("desc")
settings.py 配置文件（以下仅列出需要修改的部分）
#导包
# Wildcard import of a project-local helper package; its contents are not
# visible from this file — TODO confirm what names it actually provides.
from ..piaot import *
# Do not honour robots.txt (Scrapy's default is True).
ROBOTSTXT_OBEY = False
# Headers sent with every request: a fixed desktop Chrome User-Agent.
DEFAULT_REQUEST_HEADERS = {
    "User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
}
# Enable the MySQL storage pipeline; the number is its priority
# (lower runs earlier when multiple pipelines are configured).
ITEM_PIPELINES = {
    'bilibili_wj.pipelines.bilibili_mysql': 300,
}
pipelines.py存储文件
# -*- coding: utf-8 -*-
import pymysql
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
class BilibiliWjPipeline(object):
    """Default no-op pipeline generated by Scrapy's project template."""

    def process_item(self, item, spider):
        # Nothing to do here — pass the item through unchanged.
        return item
class bilibili_mysql(object):
    """Persist each scraped profile item into the MySQL table ``xq_2``.

    NOTE(review): connection parameters are hard-coded and a fresh
    connection is opened per item — slow, but kept for behavioural parity;
    consider pooling/settings-driven config later.
    """

    def process_item(self, item, spider):
        # Fix: the original interpolated scraped text into the SQL string
        # with str.format — broken by any quote in the data and an SQL
        # injection hole. Use a parameterized query instead; the driver
        # also handles None -> NULL correctly.
        sql = "insert into xq_2 values(NULL,%s,%s,%s,%s,%s,%s)"
        params = (item['name'], item['ID'], item['sex'],
                  item['tx_img'], item['gr_biaoq'], item['chao'])
        print(sql)
        db = pymysql.connect(host="192.168.43.128", user="root",
                             password="123456", database="xq", charset='utf8')
        try:
            with db.cursor() as cursor:
                cursor.execute(sql, params)
            db.commit()
        finally:
            # Fix: the original leaked the connection when execute() raised.
            db.close()
        # Fix: pipelines must return the item so later pipelines receive it.
        return item