Python实现爬取知乎热榜

版权声明:Author:ljc https://blog.csdn.net/Jiacheng_Liu/article/details/83240670

本文提供了一种用Selenium库自动爬取知乎热榜中回答数小于30的问题,并使用SMTP将内容发送到用户邮箱的方法。

// A highlighted block
var foo = 'bar';
# -*- coding: utf-8 -*-
"""
Created on Wed Oct 17 07:57:51 2018

@author: ljc
"""

import numpy as np
import re
from selenium import webdriver
import time
from selenium.common.exceptions import StaleElementReferenceException
import random


def smtp_sender(mail_content='你好,我是来自知乎的[邓旭东HIT] ,现在在进行一项用python登录qq邮箱发邮件的测试'):
    """Send ``mail_content`` as a plain-text UTF-8 e-mail via QQ Mail's SMTP-over-SSL server.

    The account values below are masked placeholders; they have to be
    replaced with a real QQ number, authorization code and addresses before
    the login can succeed.

    Args:
        mail_content: Body text of the message.

    Returns:
        None.

    Raises:
        Any error raised by ``smtplib`` (or the underlying socket) while
        connecting, logging in or sending propagates to the caller. The
        connection is closed before the error leaves this function.
    """
    from email.header import Header
    from email.mime.text import MIMEText
    from smtplib import SMTP_SSL

    # QQ Mail SMTP server.
    host_server = 'smtp.qq.com'
    # QQ number of the sender (used as the login name).
    sender_qq = '145*1****'
    # Authorization code of the QQ mailbox (used as the login password).
    pwd = '*******'
    # Sender address.
    sender_qq_mail = '*******[email protected]'
    # Recipient address.
    receiver = '2*******@qq.com'
    # Subject line.
    mail_title = '热榜检测'

    # Build the message first so no connection is held open while composing.
    msg = MIMEText(mail_content, "plain", 'utf-8')
    msg["Subject"] = Header(mail_title, 'utf-8')
    msg["From"] = sender_qq_mail
    msg["To"] = receiver

    # The context manager issues QUIT and closes the socket even when login
    # or sendmail raises, so a failed send no longer leaks the connection.
    with SMTP_SSL(host_server) as smtp:
        # 1 turns smtplib's protocol debugging output on, 0 turns it off.
        smtp.set_debuglevel(0)
        smtp.ehlo(host_server)
        smtp.login(sender_qq, pwd)
        smtp.sendmail(sender_qq_mail, receiver, msg.as_string())






            
# Lists used when gene_Hotstr() is called without explicit state.  They live
# for the whole process, so argument-less calls keep remembering which titles
# were already collected (the same cross-call memory the former ``[]``
# defaults provided, now spelled out explicitly).
_REMEMBERED_LOW_ANSWER_TITLES = []
_REMEMBERED_TITLES = []


def _hot_item_xpath(rank):
    """Return the XPath selecting the rank badge of hot-list entry ``rank`` (1-based)."""
    # The selectors match a two-digit rank text ("01" ... "50"); ranks 1-3 are
    # looked up with the additional "HotList-itemIndexHot" class.
    if rank <= 3:
        css_class = "HotList-itemIndex HotList-itemIndexHot"
    else:
        css_class = "HotList-itemIndex"
    return '//div[contains(text(), "%02d") and @class="%s"]' % (rank, css_class)


def gene_Hotstr(question_no=None, q_len=0, question_no_list=None, q_list_len=0):
    """Sweep the Zhihu hot list once and e-mail the question titles not seen before.

    For each of the 50 ranks the entry is clicked, the digits found in the
    "List-headerText" elements are read as the answer count, and the text of
    the "QuestionHeader-title" elements is taken as the title.  New titles
    are appended to ``question_no_list``; new titles whose answer count is
    below 30 are also appended to ``question_no``.  Both lists are extended
    in place.

    Args:
        question_no: Titles with fewer than 30 answers that were already
            reported.  ``None`` selects a process-wide list shared by all
            argument-less calls; pass your own list to isolate state.
        q_len: Length of ``question_no`` after the previous sweep.
        question_no_list: All titles that were already reported.  ``None``
            selects a process-wide list, as for ``question_no``.
        q_list_len: Length of ``question_no_list`` after the previous sweep.

    Returns:
        A 6-tuple ``(question_no, q_len, questionset, question_no_list,
        q_list_len, questions_list_set)`` where the two lengths are the
        current list lengths and the two ``*set`` strings hold the titles
        added during this sweep, each preceded by a newline.  The same shape
        is returned when the sweep is cut short by a stale element.
    """
    if question_no is None:
        question_no = _REMEMBERED_LOW_ANSWER_TITLES
    if question_no_list is None:
        question_no_list = _REMEMBERED_TITLES

    option = webdriver.ChromeOptions()
    option.add_argument("headless")
    driver = webdriver.Chrome(r"C:\Users\ljc14\Desktop\chromedriver.exe",chrome_options=option)
    questionset = ''
    questions_list_set = ''
    try:
        driver.get('https://www.zhihu.com/billboard')
        for rank in range(1, 51):
            badges = driver.find_elements_by_xpath(_hot_item_xpath(rank))
            if not badges:
                continue
            try:
                webdriver.ActionChains(driver).move_to_element(badges[0]).click(badges[0]).perform()
                header_texts = [x.text for x in driver.find_elements_by_class_name("List-headerText")]
                # Keep only the digits of the header text(s) as the answer count.
                digits = re.sub(r"\D", "", str(header_texts))
                if not digits:
                    # NOTE(review): as before, driver.back() is not called on
                    # this path - confirm whether the browser should return
                    # to the hot list here.
                    continue
                numbers = int(digits)
                print(numbers)

                # The title is kept as the str() of the list of element
                # texts; that exact representation is what the seen-lists
                # store and compare against.
                title = str([x.text for x in driver.find_elements_by_class_name("QuestionHeader-title")])
                print(title)
                if title not in question_no_list:
                    question_no_list.append(title)
                    questions_list_set = questions_list_set + '\n' + title

                if numbers < 30 and title not in question_no:
                    question_no.append(title)
                    print(title)
                    questionset = questionset + '\n' + title
                driver.back()
            except StaleElementReferenceException:
                print(1)
                time.sleep(0.5)
                # Abort the sweep but fall through to the normal reporting
                # and 6-tuple return, so callers that unpack six values keep
                # working and titles gathered so far are not lost.
                break
    finally:
        # Every call starts its own browser; always shut it down again.
        driver.quit()

    # Require fresh content as well, so a stale length argument can never
    # trigger a blank e-mail.
    if questionset and len(question_no) > q_len:
        smtp_sender(questionset)
    if questions_list_set and len(question_no_list) > q_list_len:
        smtp_sender(questions_list_set)
    q_len = len(question_no)
    q_list_len = len(question_no_list)
    return question_no,q_len,questionset,question_no_list,q_list_len,questions_list_set

#driver = webdriver.Chrome(r"C:\Users\ljc14\Desktop\chromedriver.exe")
#driver.get('https://www.zhihu.com/billboard')
##action3 = driver.find_elements_by_xpath('//div[contains(text(), "01") and @class="HotList-itemIndex HotList-itemIndexHot"]')
#action3 = driver.find_elements_by_xpath('//div[contains(text(), "04") and @class="HotList-itemIndex"]')
##<div class="HotList-itemIndex HotList-itemIndexHot">01</div>
#print(len(action3))
#getHot(action3)


def time_pa():
    """Run hot-list sweeps forever, pausing 0-9 seconds (chosen at random) between them.

    The seen-title lists and their lengths are created once and handed to
    every ``gene_Hotstr`` call, so titles that were already reported are
    remembered from the first sweep on, including while ``q_len`` is still 0.

    This function never returns normally.
    """
    question_no = []
    q_len = 0
    question_no_list = []
    q_list_len = 0
    while True:
        if q_len == 0:
            print(0)
        else:
            print('Nothing new, so sad')

        result = gene_Hotstr(question_no, q_len, question_no_list, q_list_len)
        if len(result) == 6:
            question_no, q_len, _, question_no_list, q_list_len, _ = result
        else:
            # A shorter tuple means the sweep was aborted.  The lists are
            # extended in place by gene_Hotstr, so resynchronise the stored
            # lengths instead of crashing on the unpack.
            q_len = len(question_no)
            q_list_len = len(question_no_list)

        time.sleep(random.randint(0, 9))
        

# Start polling as soon as the module is executed or imported; time_pa()
# contains an endless loop, so this call does not return normally.
time_pa()



猜你喜欢

转载自blog.csdn.net/Jiacheng_Liu/article/details/83240670