使用 Python 爬取豆瓣热门电影

使用 Python 爬取豆瓣热门电影所依赖的库:

import requests
import json
import string
import urllib.parse
from urllib.parse import urlencode
from bs4 import BeautifulSoup

爬取出这些信息,比如电影名称、电影的评分等等。废话不多说,直接上代码。

爬取方法

def douban(name, count):
    """Fetch one page of Douban "hot" movies and append parsed records to a file.

    Args:
        name: URL-encoded tag name. NOTE(review): currently unused — the tag
            is hard-coded into `url`; kept for interface compatibility.
        count: paging offset, sent as the `page_start` query parameter.

    Side effects:
        Appends one JSON line per parsed movie to the output text file and
        prints progress messages.
    """
    # AJAX endpoint; tag=热门 is already percent-encoded in the string.
    url = "https://movie.douban.com/j/search_subjects?type=movie&tag=%E7%83%AD%E9%97%A8&"
    # Request headers — the Cookie placeholder must be replaced by the user.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36',
        'Cookie': '你的cookie信息'
    }
    # Query parameters: sort=time&page_limit=20&page_start=<count>
    parm = {
        'sort': 'time',
        'page_limit': 20,
        'page_start': count,
    }
    # Build the final request URL.
    ajax_url = url + urlencode(parm)
    # Fetch the movie list as JSON.
    data_json = requests.get(ajax_url, headers=headers).json()
    # Open with an explicit utf-8 encoding so Chinese titles are written
    # correctly regardless of the Windows locale; the with-statement closes
    # the file, so no explicit close() is needed.
    with open('D:\\HOME\\python\\Test02\\data\\豆瓣数据.txt', 'a', encoding='utf-8') as output:
        for data in data_json['subjects']:
            # Rating
            rate = data['rate']
            # Title
            title = data['title']
            # Poster image URL
            cover = data['cover']
            # Fetch the movie's detail page for the remaining fields.
            detail_html = requests.get(data['url'], headers=headers).content.decode()
            print("正在准备文件写入:" + title)
            # Parse the detail page.
            soup_data = BeautifulSoup(detail_html, 'lxml')
            wrap = soup_data.find(class_='subjectwrap clearfix')
            info = wrap.find(attrs={'id': 'info'})
            try:
                # The detail page layout varies: count the 'pl' field labels
                # once and dispatch to the matching parser.
                label_count = len(info.find_all(class_='pl'))
                if label_count == 10:
                    output.write(ten(rate, title, cover, info) + "\n")
                    output.flush()
                elif label_count == 7:
                    output.write(seven(rate, title, cover, info) + "\n")
                    output.flush()
                print("成功向文件写入:" + title)
            except Exception:
                # Best-effort scraping: skip pages whose layout doesn't match.
                print("格式解析异常:" + title)

解析方法:

def seven(rate, title, cover, info):
    """Parse a detail page whose #info block carries 7 'pl' field labels.

    Args:
        rate: rating string from the list API.
        title: movie title from the list API.
        cover: poster URL from the list API.
        info: BeautifulSoup tag for the page's #info element.

    Returns:
        A JSON string with all extracted fields.
    """
    # Directors
    directors = [s.string for s in info.find_all(attrs={'rel': 'v:directedBy'})]
    # Leading actors
    protagonists = [s.string for s in info.find_all(attrs={'rel': 'v:starring'})]
    # Genres
    types = [s.string for s in info.find_all(attrs={'property': 'v:genre'})]
    # Collect the field labels once instead of re-querying per field.
    labels = info.find_all(class_='pl')
    # Production country — the plain text node following its label.
    products_country = labels[3].next_sibling
    # Language
    language = labels[4].next_sibling
    # Release date
    date = info.find(attrs={'property': 'v:initialReleaseDate'}).string
    # Runtime
    runtime = info.find(attrs={'property': 'v:runtime'}).string
    # Assemble the record (renamed from `list` to avoid shadowing the builtin).
    record = {'rate': rate, 'title': title, 'cover': cover, 'directors': directors,
              'protagonists': protagonists, 'types': types,
              'ProductsCountry': products_country,
              'language': language, 'date': date, 'runtime': runtime}
    # ensure_ascii=False keeps Chinese text readable instead of \uXXXX escapes.
    return json.dumps(record, ensure_ascii=False)
def ten(rate, title, cover, info):
    """Parse a detail page whose #info block carries 10 'pl' field labels.

    Same contract as `seven`, but the label offsets differ and the page also
    carries an alternate-title field.

    Returns:
        A JSON string with all extracted fields.
    """
    # Directors
    directors = [s.string for s in info.find_all(attrs={'rel': 'v:directedBy'})]
    # Leading actors
    protagonists = [s.string for s in info.find_all(attrs={'rel': 'v:starring'})]
    # Genres
    types = [s.string for s in info.find_all(attrs={'property': 'v:genre'})]
    # Collect the field labels once instead of re-querying per field.
    labels = info.find_all(class_='pl')
    # Production country — the plain text node following its label.
    products_country = labels[4].next_sibling
    # Language
    language = labels[5].next_sibling
    # Release date
    date = info.find(attrs={'property': 'v:initialReleaseDate'}).string
    # Runtime
    runtime = info.find(attrs={'property': 'v:runtime'}).string
    # Alternate title(s)
    alternate_name = labels[8].next_sibling
    # Assemble the record (renamed from `list` to avoid shadowing the builtin).
    record = {'rate': rate, 'title': title, 'cover': cover, 'directors': directors,
              'protagonists': protagonists, 'types': types,
              'ProductsCountry': products_country,
              'language': language, 'date': date, 'runtime': runtime,
              'alternateName': alternate_name}
    # ensure_ascii=False keeps Chinese text readable instead of \uXXXX escapes.
    return json.dumps(record, ensure_ascii=False)

主函数:

if __name__ == '__main__':
    # Percent-encode the tag "热门", leaving ASCII letters untouched.
    tag = urllib.parse.quote("热门", safe=string.ascii_letters)
    # Scrape ten pages, 20 movies per page (page_start = 0, 20, ..., 180).
    for page in range(10):
        douban(name=tag, count=page * 20)

好了,整体的代码就在这里,来让我们一起看一下爬取的结果。

控制数据的信息

 由于我调用的 json 转换方法默认把中文转成了 Unicode 转义,但是不影响我们使用;随便复制一条到 JSON 解析器里查看一下,确认数据是否正确。

 

 去官网校验一下

好了,今天就分享到这里了,喜欢的点赞加关注。

发布了44 篇原创文章 · 获赞 47 · 访问量 18万+

猜你喜欢

转载自blog.csdn.net/qq_43791724/article/details/104698053