Selenium + PhantomJS 小案例——爬取豆瓣网所有电影（Python 代码）

#coding=utf-8
from selenium import webdriver

def _extract_movie(item_tag):
    """Read cover URL, title and rating text from one ``a.item`` element."""
    img_tag = item_tag.find_element_by_tag_name("img")
    return {
        "cover": img_tag.get_attribute("src"),
        "title": img_tag.get_attribute("alt"),
        "rating": item_tag.find_element_by_xpath(".//p/strong").text,
    }


def _format_movie(movie):
    """Render one movie dict as a ``{key:value,...}`` text line (no newline).

    ``None`` values (an attribute that was absent on the element) are written
    as an empty string instead of raising ``TypeError`` during concatenation.
    """
    fields = [
        u"%s:%s" % (key, u"" if movie[key] is None else movie[key])
        for key in movie
    ]
    # join() puts separators only between fields, so a value that itself ends
    # with "," is kept intact (the previous strip(",") would have removed it).
    return u"{" + u",".join(fields) + u"}"


def crawMovie(output_path="e:\\movie_list.txt"):
    """Crawl the Douban movie listing and dump every movie to a text file.

    Opens https://movie.douban.com/ in PhantomJS, follows the first
    "more-link" anchor, then repeatedly clicks the "more" button, collecting
    the cover URL, title and rating of each ``a.item`` element that has not
    been seen yet. The result is written to ``output_path`` as one
    ``{key:value,...}`` line per movie, encoded as UTF-8.

    Args:
        output_path: Destination file; defaults to the original hard-coded
            location ``e:\\movie_list.txt``.

    Returns:
        None.

    Raises:
        Exceptions raised by Selenium (for example when an expected element
        is not found) propagate to the caller; the browser is shut down
        first. In that case no output file is written.

    NOTE(review): ``webdriver.PhantomJS`` and the ``find_element(s)_by_*``
    helpers are legacy Selenium APIs; this block keeps them because the rest
    of the file targets that Selenium generation.
    """
    # Local import keeps this block self-contained; io.open accepts an
    # explicit encoding on both Python 2 and Python 3.
    import io

    movie_list = []
    driver = webdriver.PhantomJS()
    try:
        driver.get("https://movie.douban.com/")
        more_btn = driver.find_element_by_xpath('(//a[@class="more-link"])[1]')
        more_btn.click()

        while True:
            # Read the "finished" state BEFORE collecting items: if the last
            # batch arrives between the two steps it is still picked up below.
            # A non-empty inline style is treated as "button hidden, nothing
            # left to load" -- assumption carried over from the original code.
            load_more_btn = driver.find_element_by_xpath('//a[@class="more"]')
            finished = bool(load_more_btn.get_attribute("style"))

            # Only look at items beyond the ones already collected.
            start_index = len(movie_list)
            xpath_str = '//a[@class="item"][position()>%d]' % start_index
            item_tags = driver.find_elements_by_xpath(xpath_str)
            print("start_index: %d" % start_index)
            print(item_tags)
            print("number: %d" % len(item_tags))

            for item_tag in item_tags:
                movie_list.append(_extract_movie(item_tag))
            print("--" * 20)

            if finished:
                break
            load_more_btn.click()
    finally:
        # Always terminate the PhantomJS process, even when scraping fails.
        driver.quit()

    # Explicit UTF-8 so non-ASCII titles cannot fail on the default codec.
    with io.open(output_path, "w", encoding="utf-8") as fp:
        for movie in movie_list:
            fp.write(_format_movie(movie) + u"\n")

if __name__ == "__main__":
    # Script entry point: start the crawl only when this file is executed
    # directly, not when it is imported as a module.
    crawMovie()

猜你喜欢

转载自www.cnblogs.com/reyinever/p/9250467.html