版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/weixin_39777626/article/details/82010860
动态爬虫1:爬取影评信息
网页下载器
import requests
from http import cookiejar
import urllib
class HtmlDownloader():
    """Downloads pages from movie.mtime.com with browser-like headers."""

    def cookie(self):
        """Parse 'cookie.txt' ("name=value; name2=value2; ...") into a dict.

        Returns:
            dict: {cookie_name: cookie_value}. Values may themselves
            contain '=' (split only happens on the first one).

        Note: the original definition was ``def cookie():`` without
        ``self``, which made it uncallable as an instance method.
        """
        with open('cookie.txt', 'r') as f:
            cookies = {}
            for line in f.read().split(';'):
                # maxsplit=1 so values containing '=' are kept intact
                name, value = line.strip().split('=', 1)
                cookies[name] = value
        return cookies

    def download(self, url):
        """Fetch *url* and return its body decoded as UTF-8.

        Returns None when *url* is None or the response status is not 200.
        """
        if url is None:
            return None
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.90 Safari/537.36 2345Explorer/9.3.2.17331',
            'Referer': r'http://movie.mtime.com',
            'Connection': 'keep-alive'
        }
        # Prime a cookie jar with an initial urllib fetch, then reuse the
        # jar for the real requests call (requests accepts a CookieJar).
        # NOTE(review): this fetches the URL twice; presumably done to
        # collect server-set cookies first — confirm it is still needed.
        cookie = cookiejar.CookieJar()
        opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie))
        response = opener.open(url)
        r = requests.get(url, headers=headers, cookies=cookie)
        if r.status_code == 200:
            r.encoding = 'utf-8'
            return r.text
        return None
网页解析器
import re
import json
class HtmlParser():
    """Extracts movie-page links and rating JSON from mtime responses."""

    def parser_url(self, page_url, response):
        """Return unique ``(movie_url, movie_id)`` tuples found in *response*.

        Fixes vs. original: ``findall`` never returns None, so the old
        ``!= None`` branch was unreachable; worse, its fallback returned a
        string, which callers would then iterate character by character.
        Now an empty list is returned when nothing matches. Dots in the
        domain are escaped so they match literally.
        """
        pattern = re.compile(r'(http://movie\.mtime\.com/(\d+)/)')
        urls = pattern.findall(response or '')
        return list(set(urls))

    def parser_json(self, page_url, response):
        """Parse the JSONP-style rating payload and dispatch to a parser.

        Returns the tuple produced by ``_parser_release`` /
        ``_parser_no_release``, or None when the payload is missing or
        malformed (the original indexed ``findall(...)[0]`` unconditionally,
        raising IndexError on empty responses).
        """
        if response is None:
            return None
        pattern = re.compile(r'=(.*?);')
        matches = pattern.findall(response)
        if not matches:
            return None
        try:
            value = json.loads(matches[0])
        except ValueError as e:
            print(e)
            return None
        isRelease = value.get('value')
        if isRelease:
            # Released films carry box-office data and no 'hotValue' key;
            # unreleased (or hot-listed) ones carry 'hotValue'.
            if value.get('value').get('hotValue') == None:
                return self._parser_release(page_url, value)
            else:
                return self._parser_no_release(page_url, value, isRelease=2)
        else:
            return self._parser_no_release(page_url, value)

    def _parser_release(self, page_url, value):
        """Build the 14-tuple for a released movie (isRelease == 1)."""
        try:
            isRelease = 1
            movieRating = value.get('value').get('movieRating')
            boxOffice = value.get('value').get('boxOffice')
            movieTitle = value.get('value').get('movieTitle')
            RPictureFinal = movieRating.get('RPictureFinal')
            RStoryFinal = movieRating.get('RStoryFinal')
            RDirectorFinal = movieRating.get('RDirectorFinal')
            ROtherFinal = movieRating.get('ROtherFinal')
            RatingFinal = movieRating.get('RatingFinal')
            MovieId = movieRating.get('MovieId')
            Usercount = movieRating.get('Usercount')
            AttitudeCount = movieRating.get('AttitudeCount')
            if boxOffice != None:
                TotalBoxOffice = boxOffice.get('TotalBoxOffice')
                TotalBoxOfficeUnit = boxOffice.get('TotalBoxOfficeUnit')
                TodayBoxOffice = boxOffice.get('TodayBoxOffice')
                TodayBoxOfficeUnit = boxOffice.get('TodayBoxOfficeUnit')
                ShowDays = boxOffice.get('ShowDays')
                # .get never raises; keep a default of 0 when Rank is absent
                Rank = boxOffice.get('Rank') or 0
                return (MovieId, movieTitle, RatingFinal, ROtherFinal, RPictureFinal, RDirectorFinal, RStoryFinal, Usercount, AttitudeCount,
                        TotalBoxOffice + TotalBoxOfficeUnit, TodayBoxOffice + TodayBoxOfficeUnit, Rank, ShowDays, isRelease)
            else:
                Rank = 0
                return (MovieId, movieTitle, RatingFinal, ROtherFinal, RPictureFinal, RDirectorFinal, RStoryFinal, Usercount, AttitudeCount, u'无', u'无', Rank, 0, isRelease)
        except Exception as e:
            print(e, page_url, value)
            return None

    def _parser_no_release(self, page_url, value, isRelease=0):
        """Build the 14-tuple for an unreleased / hot-listed movie."""
        try:
            movieRating = value.get('value').get('movieRating')
            movieTitle = value.get('value').get('movieTitle')
            RPictureFinal = movieRating.get('RPictureFinal')
            RStoryFinal = movieRating.get('RStoryFinal')
            RDirectorFinal = movieRating.get('RDirectorFinal')
            ROtherFinal = movieRating.get('ROtherFinal')
            RatingFinal = movieRating.get('RatingFinal')
            MovieId = movieRating.get('MovieId')
            Usercount = movieRating.get('Usercount')
            AttitudeCount = movieRating.get('AttitudeCount')
            try:
                Rank = value.get('value').get('hotValue').get('Ranking')
            except Exception as e:
                # hotValue may be absent -> AttributeError on None
                Rank = 0
            return (MovieId, movieTitle, RatingFinal, ROtherFinal, RPictureFinal, RDirectorFinal, RStoryFinal, Usercount, AttitudeCount, u'无', u'无', Rank, 0, isRelease)
        except Exception as e:
            print(e, page_url, value)
            return None
数据存储器
import sqlite3
class DataOutput():
    """Buffers parsed movie tuples and flushes them into a SQLite table."""

    def __init__(self, db_path='/home/as/test.db'):
        """Open (or create) the database and ensure table 'MTime' exists.

        Args:
            db_path: SQLite file path. Added as a parameter (the original
                hard-coded it); default preserves the original behavior.
        """
        self.cx = sqlite3.connect(db_path)
        self.create_table('MTime')
        self.datas = []  # in-memory buffer of pending row tuples

    def create_table(self, table_name):
        """Create *table_name* with the 14-column movie schema if missing."""
        values = '''
        id integer primary key,
        MovieId integer,
        MovieTitle varchar(40) NOT NULL,
        RatingFinal REAL NOT NULL DEFAULT 0.0,
        ROtherFinal REAL NOT NULL DEFAULT 0.0,
        RPictureFinal REAL NOT NULL DEFAULT 0.0,
        RDirectorFinal REAL NOT NULL DEFAULT 0.0,
        RStoryFinal REAL NOT NULL DEFAULT 0.0,
        Usercount integer NOT NULL DEFAULT 0,
        AttitudeCount integer NOT NULL DEFAULT 0,
        TotalBoxOffice varchar(20) NOT NULL,
        TodayBoxOffice varchar(20) NOT NULL,
        Rank integer NOT NULL DEFAULT 0,
        ShowDays integer NOT NULL DEFAULT 0,
        isRelease integer NOT NULL
        '''
        # (fixed typo: 'varchat' -> 'varchar'; SQLite ignores the type name
        # anyway, but the DDL now reads correctly)
        self.cx.execute('CREATE TABLE IF NOT EXISTS %s ( %s ) ' % (table_name, values))

    def store_data(self, data):
        """Buffer one row tuple; flush to the DB once more than 10 pile up."""
        if data is None:
            return
        self.datas.append(data)
        if len(self.datas) > 10:
            self.output_db('MTime')

    def output_db(self, table_name):
        """Insert every buffered row, then clear the buffer and commit.

        The original removed items from ``self.datas`` while iterating it,
        which silently skipped every other row; executemany + reset avoids
        that entirely.
        """
        self.cx.executemany(
            'INSERT INTO %s (MovieId,MovieTitle,RatingFinal,'
            'ROtherFinal,RPictureFinal,RDirectorFinal,RStoryFinal,'
            'Usercount,AttitudeCount,TotalBoxOffice,TodayBoxOffice,'
            'Rank,ShowDays,isRelease) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)' % table_name,
            self.datas)
        self.datas = []
        self.cx.commit()

    def output_end(self):
        """Flush any remaining buffered rows and close the connection."""
        if len(self.datas) > 0:
            self.output_db('MTime')
        self.cx.close()
爬虫调度器
import time
class SpiderMan():
    """Orchestrates download -> parse -> store for mtime movie ratings."""

    def __init__(self):
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        """Crawl *root_url*, fetch each movie's rating JSON, store rows.

        Each ``url`` from the parser is a ``(movie_url, movie_id)`` tuple.
        """
        content = self.downloader.download(root_url)
        urls = self.parser.parser_url(root_url, content)
        for url in urls:
            try:
                # Timestamp cache-buster: the original format string
                # '%Y%-m%-d%H%M%S3282' used invalid '%-m'/'%-d' directives
                # (they raise ValueError on some platforms); the intended
                # form is zero-padded date-time plus the literal '3282'.
                t = time.strftime('%Y%m%d%H%M%S3282', time.localtime())
                rank_url = 'http://service.library.mtime.com/Movie.api'\
                    '?Ajax_CallBack=true'\
                    '&Ajax_CallBackType=Mtime.Library.Services'\
                    '&Ajax_CallBackMethod=GetMovieOverviewRating'\
                    '&Ajax_CrossDomain=1'\
                    '&Ajax_RequestUrl=%s'\
                    '&t=%s'\
                    '&Ajax_CallBackArgument0=%s' % (url[0], t, url[1])
                rank_content = self.downloader.download(rank_url)
                data = self.parser.parser_json(rank_url, rank_content)
                self.output.store_data(data)
            except Exception as e:
                print(e)
        # Flush the (<= 10) rows still buffered and close the DB; the
        # original never did this, losing the tail of the crawl.
        self.output.output_end()
# Guard the crawl behind __main__ so importing this module has no side
# effects (the original ran the spider at import time).
if __name__ == '__main__':
    spider = SpiderMan()
    spider.crawl('http://theater.mtime.com/China_Beijing/')
页面操作
1.页面交互与填充表单
2.元素拖拽
3.窗口和页面frame的切换
4.弹窗处理
5.历史记录
6.Cookie处理
7.设置phantomJS请求头中User-Agent
等待
1.显式等待
内置方法 | 功能 |
---|---|
title_is | 判断当前页面标题是否与预期完全一致 |
title_contains | 判断当前页面包含的字符串 |
presence_of_element_located | 判断某个元素是否被加到DOM树 |
visibility_of_element_located | 判断元素是否可见 |
visibility_of | 判断元素是否可见 |
presence_of_all_elements_located | 判断是否至少有1个元素在DOM树中 |
text_to_be_present_in_element | 判断text包含的字符串 |
text_to_be_present_in_element_value | 判断value属性包含的字符串 |
frame_to_be_available_and_switch_to_it | 判断frame是否可以切换 |
invisibility_of_element_located | 判断元素是否不存在于DOM树或不可见 |
element_to_be_clickable | 判断元素是否可见并且是enable |
staleness_of | 等待某个元素从DOM树中移除 |
element_to_be_selected | 判断某个元素是否被选中,一般用于下拉列表 |
element_located_to_be_selected | 判断某个元素是否被选中,一般用于下拉列表 |
element_selection_state_to_be | 判断元素的选中状态 |
element_located_selection_state_to_be | 判断元素的选中状态 |
alert_is_present | 判断页面是否存在alert框 |
2.隐式等待
3.线程休眠
动态爬虫2:爬取去哪儿网
# Snippet: fill in the hotel search form, then page through the results.
# (This is the inline excerpt of QunaerSpider.get_hotel; three defects
# fixed: the nonexistent find_element_by_class -> find_element_by_class_name,
# a missing 'in' keyword in the inner comprehension (SyntaxError), and the
# bare 'break' statements, which need an enclosing loop to be legal.)
ele_toCity = driver.find_element_by_name('toCity')
ele_fromDate = driver.find_element_by_id('fromDate')
ele_toDate = driver.find_element_by_id('toDate')
ele_search = driver.find_element_by_class_name('search-btn')
ele_toCity.clear()
ele_toCity.send_keys(to_city)
ele_toCity.click()
ele_fromDate.clear()
ele_fromDate.send_keys(fromdate)
ele_toDate.clear()
ele_toDate.send_keys(todate)
while True:
    try:
        WebDriverWait(driver, 10).until(
            EC.title_contains(unicode(to_city))
        )
    except Exception as e:
        print(e)
        break
    time.sleep(5)
    # Scroll to the bottom so lazy-loaded hotel entries render.
    js = "window.scrollTo(0,document.body.scrollHeight);"
    driver.execute_script(js)
    time.sleep(5)
    htm_const = driver.page_source
    soup = BeautifulSoup(htm_const, 'html.parser', from_encoding='utf-8')
    infos = soup.find_all(class_='item_hotel_info')
    f = codecs.open(unicode(to_city) + unicode(fromdate) + u'.html', 'a', 'utf-8')
    for info in infos:
        f.write(str(page_num) + '--' * 50)
        content = info.get_text().replace(" ", "").replace("\t", "").strip()
        for line in [ln for ln in content.splitlines() if ln.strip()]:
            f.write(line)
            f.write('\r\n')
    f.close()
    next_page = WebDriverWait(driver, 10).until(
        EC.visibility_of(driver.find_element_by_css_selector(".item.next"))
    )
    next_page.click()
import datetime
class QunaerSpider():
    """Selenium-driven scraper for hotel listings on hotel.qunar.com."""

    def get_hotel(self, driver, to_city, fromdate, todate):
        """Fill the search form, then page through results.

        Each page's hotel info is appended to '<city><fromdate>.html'.
        Loops until the title check or the next-page button fails.
        """
        ele_toCity = driver.find_element_by_name('toCity')
        ele_fromDate = driver.find_element_by_id('fromDate')
        ele_toDate = driver.find_element_by_id('toDate')
        ele_search = driver.find_element_by_class_name('search-btn')
        ele_toCity.clear()
        ele_toCity.send_keys(to_city)
        ele_toCity.click()
        ele_fromDate.clear()
        ele_fromDate.send_keys(fromdate)
        ele_toDate.clear()
        ele_toDate.send_keys(todate)
        ele_search.click()
        page_num = 0
        while True:
            try:
                # str() instead of unicode(): 'unicode' does not exist on
                # Python 3, which this file otherwise targets (print() calls)
                WebDriverWait(driver, 10).until(
                    EC.title_contains(str(to_city))
                )
            except Exception as e:
                print(e)
                break
            time.sleep(5)
            # Scroll to the bottom so lazy-loaded hotel entries render.
            js = "window.scrollTo(0,document.body.scrollHeight);"
            driver.execute_script(js)
            time.sleep(5)
            htm_const = driver.page_source
            soup = BeautifulSoup(htm_const, 'html.parser', from_encoding='utf-8')
            infos = soup.find_all(class_='item_hotel_info')
            f = codecs.open(str(to_city) + str(fromdate) + u'.html', 'a', 'utf-8')
            for info in infos:
                f.write(str(page_num) + '--' * 50)
                content = info.get_text().replace(" ", "").replace("\t", "").strip()
                for line in [ln for ln in content.splitlines() if ln.strip()]:
                    f.write(line)
                    f.write('\r\n')
            f.close()
            try:
                next_page = WebDriverWait(driver, 10).until(
                    EC.visibility_of(driver.find_element_by_css_selector(".item.next"))
                )
                next_page.click()
                page_num += 1
                time.sleep(10)
            except Exception as e:
                # No clickable next-page button: we are on the last page.
                print(e)
                break

    def crawl(self, root_url, to_city):
        """Open *root_url* in Firefox and scrape hotels for today..tomorrow."""
        today = datetime.date.today().strftime('%Y-%m-%d')
        tomorrow = datetime.date.today() + datetime.timedelta(days=1)
        tomorrow = tomorrow.strftime('%Y-%m-%d')
        driver = webdriver.Firefox()
        driver.set_page_load_timeout(50)
        driver.get(root_url)
        driver.maximize_window()
        driver.implicitly_wait(10)
        self.get_hotel(driver, to_city, today, tomorrow)
# Guard behind __main__ so importing this module does not launch a browser.
if __name__ == '__main__':
    spider = QunaerSpider()
    spider.crawl('http://hotel.qunar.com/', u'厦门')