爬取豆瓣电影top250(python3)

从别人的博客上看到的，做了一点点修改。具体网址找不到了，就这样吧。

代码如下:

# 爬取豆瓣电影top250
# -*-coding:utf-8-*-

import requests
from bs4 import BeautifulSoup

DownLoad_URL = 'https://movie.douban.com/top250'


def parse_html_info(html):
    """
    Fetch a movie's detail page and extract its full title.

    :param html: URL of the movie detail page (despite the name this is a
        URL, not raw HTML — it is passed straight to download_page).
    :return: the title string from the page header, or None when the
        expected markup is missing (caller falls back to the list-page title).
    """
    page = download_page(html)
    soup = BeautifulSoup(page, "lxml")
    try:
        # Navigate #wrapper > #content > h1 > span; any missing node
        # raises AttributeError (find() returns None), which we treat as
        # "no title found" instead of crashing the whole crawl.
        detail = soup.find('div', attrs={'id': 'wrapper'})
        detail = detail.find('div', attrs={'id': 'content'})
        return detail.h1.span.string
    except AttributeError:
        return None


def parse_html(html):
    """
    Parse one Top-250 list page.

    :param html: raw HTML (bytes or str) of a list page.
    :return: (movie_name_list, next_url) — a list of "title score" strings
        and the absolute URL of the next list page, or None on the last page
        (callers unpack two values, so both branches must return a pair).
    """
    soup = BeautifulSoup(html, "lxml")
    # The ranked list lives in <ol class="grid_view">.
    movie_list_soup = soup.find('ol', attrs={'class': 'grid_view'})
    movie_name_list = []
    for movie_li in movie_list_soup.find_all('li'):
        # Title block: <div class="hd"> holds the name and the detail link.
        hd = movie_li.find('div', attrs={'class': 'hd'})
        movie_name = hd.find('span', attrs={'class': 'title'}).getText()
        detail_url = hd.find('a')['href']
        # Bug fix: the full title must be reset on EVERY iteration.
        # Previously the variable survived across loop passes, so a movie
        # with a falsy detail link reused (or crashed on) the previous value.
        full_title = None
        if detail_url:
            # Follows the detail page for the full title — slow, per the
            # original author this is deliberate experimentation.
            full_title = parse_html_info(detail_url)
        # Rating block: <div class="bd"> holds the numeric score.
        bd = movie_li.find('div', attrs={'class': 'bd'})
        score = bd.find('span', attrs={'class': 'rating_num'}).getText()
        if full_title is not None:
            text = full_title + ' ' + score
        else:
            text = movie_name + ' ' + score
            print(movie_name)
        movie_name_list.append(text)
    # Pagination: <span class="next"> contains an <a> except on the last page.
    next_page = soup.find('span', attrs={'class': 'next'}).find('a')
    if next_page:
        return movie_name_list, DownLoad_URL + next_page['href']
    return movie_name_list, None


def download_page(url):
    """
    Download a page, spoofing a browser User-Agent (douban rejects the
    default requests UA).

    :param url: page URL to fetch.
    :return: raw response body as bytes.
    :raises requests.HTTPError: on a non-2xx response.
    :raises requests.Timeout: if the server stalls past the timeout.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36'
    }
    # timeout keeps a stalled connection from hanging the crawl forever;
    # raise_for_status surfaces HTTP errors here instead of letting the
    # parser choke on an error page downstream.
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()
    return response.content


def main():
    """
    Crawl every Top-250 list page and append one "title score" line per
    movie to movies.txt, following pagination until parse_html returns
    None for the next URL.
    """
    url = DownLoad_URL
    with open("movies.txt", "w", encoding='utf-8') as f:
        while url:
            html = download_page(url)
            # parse_html returns (movies, next_url); next_url is None on
            # the last page, which ends the loop.
            movies, url = parse_html(html)
            f.write('\n'.join(movies) + '\n')


if __name__ == "__main__":
    main()

猜你喜欢

转载自my.oschina.net/u/2672404/blog/1609129