Python 爬取百家号文章

#!/usr/bin/env python
# -*- coding:utf-8 -*-
import hashlib
import time

import requests
import xlwt
from lxml import etree
from selenium import webdriver
# from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.by import By

class Bjh():
    """Scrape an author's article and video lists with Chrome and save them to .xls.

    The browser runs in mobile emulation. Each list is loaded by scrolling
    to the bottom of the page until the page's loader element reports the
    terminal state, then the rendered HTML is parsed with lxml and the
    extracted text fields are written to one worksheet per content type.
    """

    # Chrome profile directory used when the caller does not supply one.
    # NOTE(review): machine-specific default kept for backward compatibility.
    DEFAULT_USER_DATA_DIR = r"C:\Users\redhat\AppData\Local\Google\Chrome\User Data"
    # CSS class of the loader element polled while scrolling.
    LOADER_CLASS = "s-loader"
    # Loader state that ends the scroll loop (presumably "no more content"
    # -- TODO confirm against the page markup).
    LOADED_STATE = "2"

    def __init__(self, user_data_dir=DEFAULT_USER_DATA_DIR):
        """Create the output workbook and start Chrome in mobile emulation.

        Args:
            user_data_dir: Chrome profile directory passed as
                ``user-data-dir``; defaults to the path the script
                originally hard-coded.
        """
        self.wb = xlwt.Workbook()
        chrome_options = webdriver.ChromeOptions()
        mobile_emulation = {'deviceName': 'iPhone 6/7/8 Plus'}
        chrome_options.add_experimental_option('mobileEmulation', mobile_emulation)
        chrome_options.add_argument("user-data-dir=" + user_data_dir)
        # `options=` replaces the removed `chrome_options=` keyword, and the
        # explicit executable_path="chromedriver" is dropped because that is
        # already the default lookup (PATH) on Selenium 3 and is handled by
        # the driver service on Selenium 4.
        self.driver = webdriver.Chrome(options=chrome_options)

    def open(self, url):
        """Navigate the browser to *url*."""
        self.driver.get(url=url)

    def hua(self):
        """Wait one second, then scroll to the bottom of the page."""
        time.sleep(1)
        print("滑动加载中")
        self.driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")

    @staticmethod
    def _parse_state(inner_html):
        """Return the state character embedded in a loader's innerHTML.

        The state is the single character third from the end of the second
        line of the markup. Raises IndexError when the markup has fewer
        than two lines.
        """
        return str(inner_html).split("\n")[1][-3:-2]

    def check_state(self):
        """Return the state of the first loader element on the page."""
        loader = self.driver.find_element(By.CLASS_NAME, self.LOADER_CLASS)
        return self._parse_state(loader.get_attribute('innerHTML'))

    def check_state_video(self):
        """Return the state of the second loader element on the page."""
        loaders = self.driver.find_elements(By.CLASS_NAME, self.LOADER_CLASS)
        return self._parse_state(loaders[1].get_attribute('innerHTML'))

    def _extract_rows(self, items_xpath, title_xpaths, read_xpaths):
        """Parse the current page and return one list of text fields per item.

        Args:
            items_xpath: XPath selecting the item containers.
            title_xpaths: (primary, fallback) XPaths, relative to an item,
                for the title text.
            read_xpaths: (primary, fallback) XPaths, relative to an item,
                for the remaining span texts.

        Returns:
            A list with one entry per item: title texts followed by span
            texts. An entry is empty when neither layout matched.
        """
        tree = etree.HTML(self.driver.page_source)
        rows = []
        for item in tree.xpath(items_xpath):
            # Items come in two layouts; fall back to the second set of
            # paths when the first one yields nothing.
            title = item.xpath(title_xpaths[0]) or item.xpath(title_xpaths[1])
            read = item.xpath(read_xpaths[0]) or item.xpath(read_xpaths[1])
            rows.append(title + read)
        return rows

    def get_article(self):
        """Return the text fields of every item under the ``article`` container."""
        return self._extract_rows(
            '//*[@id="article"]/div/div/div',
            ('div/div/div/div/div/div/div[2]/div[1]/text()',
             'div/div/div/div/div[1]/text()'),
            ('div/div/div/div/div[3]/span/text()',
             'div/div/div/div/div/div/div[2]/div[2]/span/text()'),
        )

    def tab_video(self):
        """Click the fourth tab of the page header to switch to the video list."""
        tab = self.driver.find_element(
            By.XPATH, '//*[@id="app"]/div/div[3]/div/div[1]/div/div/div[4]')
        tab.click()

    def get_video(self):
        """Return the text fields of every item under the ``video`` container."""
        return self._extract_rows(
            '//*[@id="video"]/div/div/div',
            ('div/div/div[1]/div[2]/div/div[1]/text()',
             'div/div/div/div/div[1]/text()'),
            ('div/div/div[1]/div[2]/div/div[3]/span/text()',
             'div/div/div/div/div/div/div[2]/div[2]/span/text()'),
        )

    def write(self, title, type, data):
        """Add a worksheet named *type* and write *data* into it row by row.

        Args:
            title: Unused; kept so existing callers keep working.
            type: Sheet name. ``"article"`` rows get up to four columns,
                any other type up to three.
            data: Iterable of rows (lists of cell values). Empty rows are
                skipped without leaving a blank line in the sheet.
        """
        ws = self.wb.add_sheet(type)
        max_cols = 4 if type == "article" else 3
        row_index = 0
        for row in data:
            if not row:
                continue
            # Write only the fields that were actually scraped: indexing a
            # fixed number of columns raised IndexError on short rows.
            for col, value in enumerate(row[:max_cols]):
                ws.write(row_index, col, value)
            row_index += 1

    def _scroll_until_loaded(self, read_state):
        """Keep scrolling until *read_state()* returns the terminal state."""
        while True:
            self.hua()
            if read_state() == self.LOADED_STATE:
                break

    def run(self, url, title):
        """Scrape articles and videos from *url* and save ``<title>.xls``.

        The browser is always shut down, even when scraping fails part-way.
        """
        try:
            self.open(url)

            self._scroll_until_loaded(self.check_state)
            self.write(title, "article", self.get_article())

            # Reload the page several times before switching tabs, with the
            # same pauses as the original script.
            for pause in (1, 1, 3):
                self.driver.refresh()
                time.sleep(pause)

            # The video tab may not be clickable right after a reload:
            # retry, refreshing the page after every failed attempt.
            while True:
                try:
                    time.sleep(1)
                    self.tab_video()
                    break
                except Exception as e:
                    print(e)
                    self.driver.refresh()

            self._scroll_until_loaded(self.check_state_video)
            self.write(title, "video", self.get_video())
            self.wb.save(title + ".xls")
        finally:
            # quit() closes every window and ends the chromedriver process.
            self.driver.quit()


if __name__ == '__main__':
    # Ask for the output workbook name (without extension) and the page to
    # scrape. input() already returns str; strip stray whitespace from
    # pasted values.
    title = input("请输入标题:").strip()
    url = input("请输入url:").strip()
    bjh = Bjh()
    bjh.run(url, title)



猜你喜欢

转载自www.cnblogs.com/yzre/p/12695956.html