Python 爬虫:用正则表达式爬取猫眼电影 TOP100 榜

思路分析:首先分析猫眼电影 TOP100 榜的网页结构特点,然后根据网页信息编写用于匹配的正则表达式,最后提取对应的信息即可。

接下来是对存储文件的操作:将最终的结果存储在 csv 文件中。因为在编写过程中需要反复调试运行,为了不必每次都手动删除文件,程序在开始运行之前会先判断文件是否已经存在,如果存在则删除,然后再开始爬取信息。代码如下:

import re
import requests
import csv
import os



class MaoYan():
    """Scrape the Maoyan TOP100 movie board with a regex and store it as CSV.

    Typical use is ``MaoYan().main()``, which removes a stale output file,
    creates a fresh one with a header row, downloads the ten board pages,
    extracts one record per movie and appends each record to the CSV.

    Args:
        filename: Path of the CSV file to (re)create. Defaults to ``my.csv``
            in the current working directory.
    """

    # Compiled once at class creation instead of once per downloaded page.
    # Groups: rank, title, cast line, release line, score integer part,
    # score fractional part. re.S lets ``.`` span the newlines inside a <dd>.
    _ITEM_RE = re.compile(
        r'<dd>.*?board-index.*?>(.*?)</i>.*?class="name".*?><a.*?>(.*?)</a>'
        r'.*?"star".*?>(.*?)</p>.*?releasetime.*?>(.*?)</p>'
        r'.*?integer.*?>(.*?)</i>.*?fraction.*?>(.*?)</i>.*?</dd>',
        re.S,
    )
    # Separator between a label and its value; both the ASCII colon and the
    # full-width colon (U+FF1A) are accepted defensively.
    _LABEL_SEP_RE = re.compile('[:\uff1a]')

    def __init__(self, filename='my.csv'):
        self.url = "http://maoyan.com/board/4"
        self.fieldnames = ['index', 'title', 'actor', 'time', 'score']
        self.filename = filename
        # Both stay None until file_cre() has been called.
        self.file = None
        self.writer = None
        # NOTE(review): a browser-like User-Agent is sent because some sites
        # reject the default python-requests one -- confirm it is still
        # needed for this site.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/120.0 Safari/537.36',
        }
        # Seconds to wait for each HTTP request before giving up.
        self.timeout = 10

    def file_del(self):
        """Delete the output file if it exists.

        Only ``self.filename`` is removed; other CSV files in the directory
        are left untouched. A missing file is not an error.
        """
        try:
            os.remove(self.filename)
        except FileNotFoundError:
            return
        print(self.filename, "文件已经被删除")

    def file_cre(self):
        """Open the output file for appending and write the CSV header row."""
        # An explicit encoding keeps non-ASCII titles writable regardless of
        # the platform's default locale encoding.
        self.file = open(self.filename, 'a', newline='', encoding='utf-8')
        self.writer = csv.DictWriter(self.file, fieldnames=self.fieldnames)
        self.writer.writeheader()

    def url_cre(self):
        """Yield the ten board page URLs (offset 0, 10, ..., 90)."""
        for offset in range(0, 100, 10):
            yield self.url + "?offset=" + str(offset)

    def html_get(self):
        """Yield the HTML text of every board page that downloads cleanly.

        A page that times out, fails to connect or answers with an HTTP error
        status is reported and skipped so the remaining pages are still
        processed.
        """
        for url in self.url_cre():
            try:
                response = requests.get(
                    url, headers=self.headers, timeout=self.timeout)
                response.raise_for_status()
            except requests.RequestException as exc:
                print("request failed:", url, exc)
                continue
            yield response.text

    @classmethod
    def _after_label(cls, text):
        """Return the part of *text* after its last colon, label removed."""
        return cls._LABEL_SEP_RE.split(text.strip())[-1]

    @classmethod
    def _parse_page(cls, html):
        """Extract all movie records from one page of board HTML.

        Args:
            html: Markup of a single board page.

        Returns:
            A list of dicts keyed by ``index``, ``title``, ``actor``,
            ``time`` and ``score``; empty when nothing matches.
        """
        return [
            {
                'index': rank,
                'title': title,
                'actor': cls._after_label(star),
                'time': cls._after_label(release),
                # The score is split across two <i> tags, e.g. "9." + "5".
                'score': integer + fraction,
            }
            for rank, title, star, release, integer, fraction
            in cls._ITEM_RE.findall(html)
        ]

    def html_parse(self):
        """Download every page, print each extracted record and save it."""
        for html in self.html_get():
            for info in self._parse_page(html):
                print(info)
                self.save_to_csv(info)

    def save_to_csv(self, info):
        """Append one record to the CSV file opened by file_cre().

        Args:
            info: Mapping whose keys are exactly ``self.fieldnames``.
        """
        self.writer.writerow(info)
        print("保存成功")

    def file_close(self):
        """Close the output file if it has been opened."""
        if self.file is not None:
            self.file.close()

    def main(self):
        """Run the whole job: reset the file, scrape, and always close."""
        self.file_del()      # remove a stale output file first
        self.file_cre()      # create the file and write the header
        try:
            self.html_parse()    # parse pages and store records
        finally:
            # Close even when scraping raises, so buffered rows are flushed
            # and the handle is not leaked.
            self.file_close()
           

if __name__ == "__main__":
    # Run the scraper only when this file is executed as a script,
    # not when it is imported as a module.
    MaoYan().main()

猜你喜欢

转载自blog.csdn.net/qq_34246164/article/details/81489230