Pythonは何百もの記事をクロールします

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import xlwt
from selenium import webdriver
#from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import time
import requests
import hashlib
from lxml import etree

class Bjh():
    """Scrape an author's article and video lists with Chrome and save them to .xls.

    The browser is driven through Selenium in mobile-emulation mode. Each list
    page is scrolled until its ``s-loader`` element reports the stop state,
    the rendered HTML is parsed with lxml, and the rows are written to one
    worksheet per list ("article" and "video") of a single xlwt workbook.

    NOTE(review): the ``find_element(s)_by_class_name`` / ``find_element_by_xpath``
    calls and the ``executable_path`` / ``chrome_options`` keyword arguments are
    the older Selenium API style used by the original code; confirm they match
    the installed Selenium version.
    """

    # Value read from the loader markup at which scrolling stops.
    # NOTE(review): presumably means "no more content to load" -- confirm
    # against the live page markup.
    _DONE_STATE = "2"

    def __init__(self,
                 driver_path="chromedriver",
                 user_data_dir=r"C:\Users\redhat\AppData\Local\Google\Chrome\User Data"):
        """Create the output workbook and start Chrome.

        Args:
            driver_path: Path (or command name) of the chromedriver executable.
            user_data_dir: Chrome profile directory passed to the browser.
                The default is the path hard-coded in the original script.
        """
        self.wb = xlwt.Workbook()
        chrome_options = webdriver.ChromeOptions()
        # Render pages with Chrome's built-in mobile device profile.
        mobile_emulation = {'deviceName': 'iPhone 6/7/8 Plus'}
        chrome_options.add_experimental_option('mobileEmulation', mobile_emulation)
        # Optional: chrome_options.add_argument('-headless')
        chrome_options.add_argument("user-data-dir=" + user_data_dir)
        self.driver = webdriver.Chrome(executable_path=driver_path,
                                       chrome_options=chrome_options)

    def open(self, url):
        """Navigate the browser to ``url``."""
        self.driver.get(url=url)

    def hua(self):
        """Wait one second, then scroll the window to the bottom of the page."""
        time.sleep(1)
        print("滑动加载中")
        self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")

    @staticmethod
    def _parse_loader_state(inner_html):
        """Return the state character encoded in a loader's innerHTML.

        The state is the third-from-last character of the second line of the
        markup. Raises ``IndexError`` if the markup has no second line.
        """
        return str(inner_html).split("\n")[1][-3:-2]

    def check_state(self):
        """Return the state of the first ``s-loader`` element on the page."""
        data = self.driver.find_element_by_class_name("s-loader").get_attribute('innerHTML')
        return self._parse_loader_state(data)

    def check_state_video(self):
        """Return the state of the second ``s-loader`` element on the page."""
        data = self.driver.find_elements_by_class_name("s-loader")[1].get_attribute('innerHTML')
        return self._parse_loader_state(data)

    @staticmethod
    def _first_match(node, paths):
        """Evaluate ``paths`` on ``node`` in order; return the first non-empty result.

        If every path yields nothing, the (empty) result of the last path is
        returned.
        """
        result = []
        for path in paths:
            result = node.xpath(path)
            if result:
                break
        return result

    def _extract_rows(self, container_xpath, title_paths, read_paths):
        """Parse the current page and return one ``title + read`` list per item.

        Args:
            container_xpath: XPath selecting the item nodes.
            title_paths: Candidate relative XPaths for the title text; the
                later ones are fallbacks for alternative item layouts.
            read_paths: Candidate relative XPaths for the read-count text.
        """
        tree = etree.HTML(self.driver.page_source)
        rows = []
        for node in tree.xpath(container_xpath):
            title = self._first_match(node, title_paths)
            read = self._first_match(node, read_paths)
            rows.append(title + read)
        return rows

    def get_article(self):
        """Return the rows scraped from the ``#article`` list of the current page."""
        return self._extract_rows(
            '//*[@id="article"]/div/div/div',
            title_paths=(
                'div/div/div/div/div/div/div[2]/div[1]/text()',
                'div/div/div/div/div[1]/text()',
            ),
            read_paths=(
                'div/div/div/div/div[3]/span/text()',
                'div/div/div/div/div/div/div[2]/div[2]/span/text()',
            ),
        )

    def tab_video(self):
        """Click the tab element that switches the page to the video list."""
        ele = self.driver.find_element_by_xpath(
            '//*[@id="app"]/div/div[3]/div/div[1]/div/div/div[4]')
        ele.click()

    def get_video(self):
        """Return the rows scraped from the ``#video`` list of the current page."""
        return self._extract_rows(
            '//*[@id="video"]/div/div/div',
            title_paths=(
                'div/div/div[1]/div[2]/div/div[1]/text()',
                'div/div/div/div/div[1]/text()',
            ),
            read_paths=(
                'div/div/div[1]/div[2]/div/div[3]/span/text()',
                'div/div/div/div/div/div/div[2]/div[2]/span/text()',
            ),
        )

    def write(self, title, type, data):
        """Add a worksheet named ``type`` and write ``data`` into it row by row.

        Args:
            title: Unused here; kept so existing callers keep working.
            type: Worksheet name. ``"article"`` rows use up to four columns,
                any other value up to three.
            data: Iterable of rows (lists of cell values). Empty rows are
                skipped and do not consume a worksheet row.

        Rows shorter than the column limit are written as far as they go
        instead of raising ``IndexError``, so one incomplete item does not
        abort the whole export.
        """
        ws = self.wb.add_sheet(type)
        max_cols = 4 if type == "article" else 3
        index = 0
        for row in data:
            if not row:
                continue
            for col, value in enumerate(row[:max_cols]):
                ws.write(index, col, value)
            index += 1

    def _scroll_until_done(self, check):
        """Keep scrolling until ``check()`` returns the stop state.

        NOTE: there is no timeout; this loops for as long as the loader never
        reports the stop state.
        """
        while True:
            self.hua()
            if check() == self._DONE_STATE:
                break

    def run(self, url, title):
        """Scrape articles and videos from ``url`` and save them to ``title + ".xls"``.

        The browser is closed when the method exits, including when a step
        raises.
        """
        try:
            self.open(url)

            self._scroll_until_done(self.check_state)
            self.write(title, "article", self.get_article())

            # Reload the page several times before switching tabs, with the
            # same pauses as the original script.
            self.driver.refresh()
            time.sleep(1)
            self.driver.refresh()
            time.sleep(1)
            self.driver.refresh()
            time.sleep(3)

            # Retry clicking the video tab until it succeeds, refreshing the
            # page after every failure.
            while True:
                try:
                    time.sleep(1)
                    self.tab_video()
                    break
                except Exception as e:
                    print(e)
                    self.driver.refresh()

            self._scroll_until_done(self.check_state_video)
            self.write(title, "video", self.get_video())
            self.wb.save(title + ".xls")
        finally:
            self.driver.close()
            self.driver.quit()


# Script entry point: ask for the output title and the page URL, then scrape.
# The guard compares against the exact string '__main__' (no padding spaces),
# otherwise the block would never run.
if __name__ == '__main__':
    title = input("请输入标题:")  # used as the .xls file name stem
    url = input("请输入url:")
    bjh = Bjh()
    bjh.run(url, title)



おすすめ

転載: www.cnblogs.com/yzre/p/12695956.html