爬取豆瓣影评《少年的你》的评论信息

爬取用户名，评论标题，内容，发表时间，点赞数量

直接上代码了，大部分地方都有注释，可以自己看一下。

# coding=utf-8
import json
import re
import threading
import time
from queue import Empty, Queue

import lxml.html
import requests

# Stop flag polled by the crawl threads' run() loop; main() sets it to True
# once the page-number queue has been drained.
CRAWL_EXIT = False
# Stop flag polled by the parse threads' run() loop; main() sets it to True
# once the HTML data queue has been drained.
PARSE_EXIT = False
# Short alias for the etree module exposed by lxml.html (used for etree.HTML).
etree = lxml.html.etree
#爬取网页源代码的类
class ThreadCrawls(threading.Thread):
    """Worker thread that downloads review list pages.

    Page numbers are taken from ``pageQueue``; the HTML of each page, with
    tabs and line breaks stripped, is put on ``dataQueue``. The thread keeps
    polling until the module-level ``CRAWL_EXIT`` flag becomes true.

    Args:
        threadName: Label used in the start/stop/error messages.
        pageQueue: Queue of 1-based page numbers still to download.
        dataQueue: Queue that receives the downloaded HTML strings.
    """

    # Review list URL; the ``start`` offset is appended per page.
    REVIEWS_URL = "https://movie.douban.com/subject/30166972/reviews?start="
    # Offset step between two consecutive pages.
    PAGE_SIZE = 20
    # Seconds before an HTTP request is abandoned, so a stalled connection
    # cannot keep the thread (and whoever joins it) waiting forever.
    REQUEST_TIMEOUT = 10
    # Seconds to block on an empty page queue before re-checking CRAWL_EXIT.
    POLL_TIMEOUT = 0.2

    def __init__(self, threadName, pageQueue, dataQueue):
        super().__init__()
        self.threadName = threadName
        self.pageQueue = pageQueue
        self.dataQueue = dataQueue
        # Browser-like User-Agent so the site is less likely to reject us.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"}

    def run(self):
        """Download queued pages until ``CRAWL_EXIT`` is set."""
        print("开启采集线程"+self.threadName)
        while not CRAWL_EXIT:
            try:
                # Block briefly instead of spinning on a non-blocking get();
                # the timeout bounds how long a set CRAWL_EXIT goes unnoticed.
                page = self.pageQueue.get(timeout=self.POLL_TIMEOUT)
            except Empty:
                continue
            try:
                content = self._fetch(page)
            except Exception as exc:
                # Worker boundary: report the failed page and keep serving
                # the queue rather than silently dropping it or dying.
                print(self.threadName + " 抓取第" + str(page) + "页失败: " + repr(exc))
                continue
            self.dataQueue.put(content)
        print("结束采集线程"+self.threadName)

    def _fetch(self, page):
        """Return the HTML of 1-based review page ``page`` without tabs/newlines.

        Raises:
            requests.RequestException: On connection errors, timeouts, or an
                HTTP error status.
        """
        url = self.REVIEWS_URL + str((page - 1) * self.PAGE_SIZE)
        response = requests.get(url, headers=self.headers,
                                timeout=self.REQUEST_TIMEOUT)
        # Do not hand error pages (4xx/5xx) to the parsers as if they were data.
        response.raise_for_status()
        response.encoding = "utf-8"
        return re.sub("[\t\r\n]", "", response.text)

#解析网页源代码抽取数据的类
class ThreadParses(threading.Thread):
    """Worker thread that extracts reviews from queued HTML pages.

    HTML strings are taken from ``dataQueue``; every review found is written
    to ``localFile`` as one JSON object per line. The thread keeps polling
    until the module-level ``PARSE_EXIT`` flag becomes true.

    Args:
        threadName: Label used in the start/stop/error messages.
        dataQueue: Queue of HTML strings to parse.
        localFile: Open, writable text file shared by all parse threads.
        lock: Lock guarding writes to ``localFile``.
    """

    # Seconds to block on an empty data queue before re-checking PARSE_EXIT.
    POLL_TIMEOUT = 0.2

    def __init__(self, threadName, dataQueue, localFile, lock):
        super().__init__()
        self.threadName = threadName
        self.dataQueue = dataQueue
        self.localFile = localFile
        self.lock = lock

    def run(self):
        """Parse queued pages until ``PARSE_EXIT`` is set."""
        print("开启解析线程"+self.threadName)
        while not PARSE_EXIT:
            try:
                # Block briefly instead of spinning on a non-blocking get().
                html = self.dataQueue.get(timeout=self.POLL_TIMEOUT)
            except Empty:
                continue
            try:
                self.parse(html)
            except Exception as exc:
                # Worker boundary: report the bad page and keep going instead
                # of hiding the failure behind a bare ``except: pass``.
                print(self.threadName + " 解析失败: " + repr(exc))
        print("结束解析线程" + self.threadName)

    @staticmethod
    def _first_text(node, path):
        """Return ``.text`` of the first element matching ``path``, or None."""
        matches = node.xpath(path)
        return matches[0].text if matches else None

    def parse(self, html):
        """Extract every review in ``html`` and append it to the output file.

        A field whose element is missing is stored as ``null`` so one
        incomplete review does not discard the rest of the page.
        """
        tree = etree.HTML(html)
        if tree is None:
            # Defensive: without a parsed tree there is nothing to extract.
            return

        lines = []
        for node in tree.xpath(".//div[@class='main review-item']"):
            item = {
                "username": self._first_text(node, ".//a[@class='name']"),
                "title": self._first_text(node, ".//div[@class='main-bd']/h2/a"),
                # Original author's note: this usually comes back empty because
                # the review body is loaded dynamically, which is not done here.
                "content": self._first_text(node, ".//div[@class='short-content']"),
                "time": self._first_text(node, ".//span[@class='main-meta']"),
                "zan": self._first_text(node, ".//div[@class='action']/a/span"),
            }
            lines.append(json.dumps(item, ensure_ascii=False) + "\n")

        if not lines:
            return
        # Take the lock once per page so a page's records are written together
        # and lines from different threads cannot interleave.
        with self.lock:
            self.localFile.writelines(lines)
#抓取数据和提取数据 保存数据到文件
def main():
    """Crawl 20 review pages with three crawl threads and three parse threads.

    Page numbers 1-20 are queued for the crawl threads, which feed page HTML
    to the parse threads; the parse threads append one JSON line per review
    to ``dbduanping.json``. The function returns after all workers have been
    joined and the output file has been closed.
    """
    global CRAWL_EXIT, PARSE_EXIT
    # Reset the stop flags so main() also works when called more than once.
    CRAWL_EXIT = False
    PARSE_EXIT = False

    # 1. Queue of page numbers (FIFO), pre-loaded with pages 1..20.
    pageQueue = Queue(20)
    for page in range(1, 21):
        pageQueue.put(page)
    # 2. Queue of downloaded HTML documents.
    dataQueue = Queue(20)
    # Guards writes to the shared output file.
    lock = threading.Lock()

    crawlList = ['采集1号线程', '采集2号线程', '采集3号线程']
    parseList = ['解析1号线程', '解析2号线程', '解析3号线程']
    threadCrawls = []
    threadParses = []

    # Append mode; ``with`` closes the file even if something below fails.
    with open("dbduanping.json", "a", encoding="utf-8") as localFile:
        try:
            # 3. Start the crawl threads (start() invokes their run()).
            for threadName in crawlList:
                thread = ThreadCrawls(threadName, pageQueue, dataQueue)
                thread.start()
                threadCrawls.append(thread)
            # 4. Start the parse threads.
            for threadName in parseList:
                thread = ThreadParses(threadName, dataQueue, localFile, lock)
                thread.start()
                threadParses.append(thread)

            # Wait for every page number to be taken; sleep between checks
            # instead of spinning at full CPU.
            while not pageQueue.empty():
                time.sleep(0.1)
            CRAWL_EXIT = True
            print("pageQueue 为空")

            # Crawl threads finish the page they are on before exiting, so
            # after this join all downloaded HTML is in dataQueue.
            for thread in threadCrawls:
                thread.join()

            while not dataQueue.empty():
                time.sleep(0.1)
            print("dataQueue 为空")
        finally:
            # Always release the workers, including on Ctrl-C or an error:
            # they are non-daemon threads and would otherwise keep the process
            # alive. Joining an already finished thread is a no-op.
            CRAWL_EXIT = True
            PARSE_EXIT = True
            for thread in threadCrawls + threadParses:
                thread.join()

# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

AppWhite_Star

发布了18 篇原创文章 · 获赞 2 · 访问量 1493

私信关注

python Queue，threading多线程实战

Python3 Queue 与 Threading 实战

爬取豆瓣影评少年的你评论信息

爬取用户名，评论标题，内容，发表时间，点赞数量

直接上代码了，大部分地方都有注释，可以自己看一下。

猜你喜欢

python Queue，threading多线程实战

Python3Queue与Threading实战

爬取豆瓣影评少年的你评论信息

爬取 用户名，评论标题，内容，发表时间，点赞数量

直接上代码了 大部分注释都有 自己可以看下

猜你喜欢

爬取用户名，评论标题，内容，发表时间，点赞数量

直接上代码了大部分注释都有自己可以看下