Scraping Bilibili video information into an Excel spreadsheet + saving it to MySQL (revised)

I. Project Overview

1. Search Bilibili for videos and scrape each result's title, link, view count, danmaku (bullet comment) count, and upload time, then store them in an Excel file (a short sketch of the view-count conversion follows this list).

2. Libraries used: time, requests, xlwt, re, lxml
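
The only non-obvious parsing step is the view count: Bilibili renders counts such as "3.4万" (万 = ten thousand), so the scraper converts them to plain numbers before writing them out. Below is a minimal sketch of that conversion as a standalone helper; the function name normalize_count is only for illustration, and the full script below does the same thing inline.

def normalize_count(text):
    """Convert a Bilibili count string such as '3.4万' or '987' to a float."""
    value = text.strip()               # drop surrounding whitespace/newlines
    if value.endswith('万'):            # 万 = 10,000
        return float(value[:-1]) * 10000
    return float(value)

print(normalize_count('3.4万'))   # 34000.0
print(normalize_count('987'))     # 987.0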

II. Code

# python
# -*- coding:utf-8 -*-
# author:Only time:2019/9/11

'''
1. Scrape Bilibili "python3" video links, view counts, and titles
2. Parse the pages with XPath
3. Save the results to an Excel (.xls) file
'''
import xlwt
import re
import time
import requests
from lxml import etree


def save_excel(keyword):
    # Scrape the data, then write it to an .xls workbook
    alllists = get_parse_html(keyword)
    f = xlwt.Workbook(encoding='utf-8')
    sheet = f.add_sheet('b站爬虫_python教学视频', cell_overwrite_ok=True)
    # Header row
    alllists.insert(0, ("标题", "链接", "观看次数", "弹幕", "上传时间"))
    for row, row_list in enumerate(alllists):
        for column, column_list in enumerate(row_list):
            sheet.write(row, column, str(column_list))
    f.save('b站爬虫_python教学视频' + '.xls')

# Build the search-result page URLs (first two pages only)
def get_urllist(keyword):
    urllist = []
    for page in range(1,3):
        url = 'https://search.bilibili.com/all?keyword=' + keyword +'&page=' + str(page)
        urllist.append(url)
    return urllist

def get_parse_html(keyword):
    header = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    }
    alllists = []
    for url in get_urllist(keyword):
        response = requests.get(url, headers = header).text
        html = etree.HTML(response)
        # Titles
        titles = html.xpath('//div[@class="info"]//a//@title')
        # Links
        links = html.xpath('//li[@class="video-item matrix"]/a/@href')
        # View counts, e.g. "3.4万"; convert to plain numbers
        people = html.xpath('//div[@class="tags"]/span[@title="观看"]/text()')
        peoples = []
        for i in people:
            value = i.strip()               # drop surrounding whitespace/newlines
            if value[-1:] == '万':           # 万 = 10,000
                value = float(value[:-1]) * 10000
            else:
                value = float(value)
            peoples.append(value)

        # Danmaku (bullet comment) counts
        barrage = html.xpath('//div[@class="tags"]/span[@title="弹幕"]/text()')
        barrages = [i.strip() for i in barrage]
        # Upload times
        data = html.xpath('//div[@class="tags"]/span[@title="上传时间"]/text()')
        datas = [i.strip() for i in data]

        # time.sleep(5)  # optional: wait 5 seconds between pages

        alllists.extend(zip(titles, links, peoples, barrages, datas))
    return alllists

if __name__ == "__main__":
    keyword = "python3"     # or: input("Enter a search keyword: ")
    save_excel(keyword)
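
To double-check the output, the generated .xls can be read back with xlrd. This is a minimal sketch, assuming xlrd is installed and the file name matches the one used in save_excel above.

import xlrd

book = xlrd.open_workbook('b站爬虫_python教学视频.xls')
sheet = book.sheet_by_index(0)
for r in range(min(sheet.nrows, 5)):   # print the header plus the first few rows
    print(sheet.row_values(r))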

 

III. Results

I. Project Scope

1. Scrape Bilibili search results and store them in MySQL (a query sketch follows this list)

2. Libraries used: re, lxml, requests, time, pymysql

3. Python 3

4. Notes on MySQL basics
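
Once the rows are in MySQL, they can be queried directly, for example to list the most-viewed videos. This is a minimal sketch, assuming the same connection settings and the blibli table created by the script below.

import pymysql

conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                       password='123qwe', database='only', charset='utf8')
cursor = conn.cursor()
# Top 10 results by view count
cursor.execute('select title, people, link from blibli order by people desc limit 10')
for title, people, link in cursor.fetchall():
    print(title, people, link)
conn.close()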

II. Code

# python
# -*- coding:utf-8 -*-
# author:Only time:2019/9/13

import pymysql
import requests
import re
import time
from lxml import etree


# Build the search-result page URLs (50 pages)
def get_urllist(keyword):
    urllist = []
    for page in range(1,51):
        url = 'https://search.bilibili.com/all?keyword=' + keyword +'&page=' + str(page)
        urllist.append(url)
    return urllist

    
def save_mysql(url_list):

    # Connect to the database
    conn = pymysql.connect(
        host = '127.0.0.1',
        port = 3306,
        user = 'root',
        password = '123qwe',
        database = 'only',
        charset = 'utf8'
    )

    cursor = conn.cursor()

    # Create the table on the first run; skip it if it already exists.
    # people is widened to float(12,2) so view counts above a million do not overflow.
    sql_1 = 'create table if not exists blibli(id int primary key auto_increment not null, title varchar(120), link varchar(120), people float(12,2), barrage varchar(120), data varchar(120))'

    cursor.execute(sql_1)

    try:
        # Scrape the data
        header = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
        }
        for url in url_list:

            time.sleep(3)  # pause between requests to avoid getting the IP banned

            response = requests.get(url, headers=header).text
            html = etree.HTML(response)

            # Titles
            titles = html.xpath('//div[@class="info"]//a//@title')
            # Links
            links = html.xpath('//li[@class="video-item matrix"]/a/@href')
            # View counts, e.g. "3.4万"; convert to plain numbers
            people = html.xpath('//div[@class="tags"]/span[@title="观看"]/text()')
            peoples = []
            for i in people:
                value = i.strip()               # drop surrounding whitespace/newlines
                if value[-1:] == '万':           # 万 = 10,000
                    value = float(value[:-1]) * 10000
                else:
                    value = float(value)
                peoples.append(value)
            # Danmaku (bullet comment) counts
            barrage = html.xpath('//div[@class="tags"]/span[@title="弹幕"]/text()')
            barrages = [i.strip() for i in barrage]
            # Upload times
            data = html.xpath('//div[@class="tags"]/span[@title="上传时间"]/text()')
            datas = [i.strip() for i in data]

            for i in range(len(datas)):
                print(titles[i])

                # Insert one row; a parameterized query avoids quoting/escaping problems
                sql_2 = 'insert into blibli(title, link, people, barrage, data) values (%s, %s, %s, %s, %s)'
                cursor.execute(sql_2, (titles[i], links[i], peoples[i], barrages[i], datas[i]))
            conn.commit()
            print("Page committed")

    except Exception as e:
        conn.rollback()
        print("Rolled back")
        print(e)
    conn.close()


if __name__ == "__main__":
    keyword = 'python3'
    url_list = get_urllist(keyword)
    save_mysql(url_list)
    

III. Execution Results


Reposted from blog.csdn.net/weixin_43930694/article/details/100735823