Selenium + PhantomJS + Python: a dynamic crawler for an overseas site

1. Tools:

  • PyCharm
  • Selenium
  • PhantomJS
  • Python

2. Code:

# -*- coding: utf-8 -*-
import time

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException


# Point Selenium at the local PhantomJS binary (a headless WebKit browser)
driver = webdriver.PhantomJS(executable_path='/Users/test/Downloads/phantomjs-2.1.1-macosx/bin/phantomjs')
# Post URLs are collected here so they can be visited in a loop later
loveurl_real = []

# Files that store the crawled content
image_file = open('/Users/test/Documents/imageurls.txt','w+')
love_file = open('/Users/test/Documents/lovecounts.txt','w+')


def init():
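    # Open the hashtag feed that the crawl starts from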
    driver.get("https://www.instagram.com/explore/tags/selfie/")



def crawler():
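    # The class names below are Instagram's machine-generated CSS classes at
    # the time of writing; they change often and must be re-checked on the page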
    imglist = driver.find_elements_by_class_name("_icyx7")
    for imgurl in imglist:
        image_file.write(imgurl.get_attribute("src")+'\n')


    lovelist = driver.find_elements_by_css_selector("._8mlbc._vbtk2._t5r8b")

    for loveurl in lovelist:
        loveurl_real.append(loveurl.get_attribute("href"))


# Visit each collected page in turn
def skip():
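    # On a post page the like count may sit under either of two class names,
    # so both are tried before falling back to 0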
    for url in loveurl_real:
        driver.get(url)
        if isElementExist(driver, "_tf9x3"):
            love = driver.find_element_by_class_name("_tf9x3").text
            love_file.write(''.join(c for c in love if c.isdigit()) + '\n')
        elif isElementExist(driver, "_9jphp"):
            love = driver.find_element_by_class_name("_9jphp").text
            love_file.write(''.join(c for c in love if c.isdigit()) + '\n')
        else:
            love_file.write("0" + '\n')



# Helper that checks whether the page contains an element with the given class name
def isElementExist(driver, element):
    try:
        driver.find_element_by_class_name(element)
        return True
    except NoSuchElementException:
        return False


# Remember to close the files and the browser window
def drop():
    driver.quit()
    image_file.close()
    love_file.close()

# Click the "Load more" button on first entry, then scroll repeatedly to load additional content
def click_more():
    more_button = driver.find_element_by_css_selector("._8imhp._glz1g")
    more_button.click()
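    # js1 reads the current page height; js2 jumps to the bottom, which
    # triggers the page's infinite scroll and appends more posts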
    js1 = 'return document.body.scrollHeight'
    js2 = 'window.scrollTo(0, document.body.scrollHeight)'
    old_scroll_height = 0
    while driver.execute_script(js1) > old_scroll_height:
        old_scroll_height = driver.execute_script(js1)
        driver.execute_script(js2)
        time.sleep(2)



if __name__ == '__main__':
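    # Full pipeline: open the page, expand the feed, collect URLs,
    # visit each post for its like count, then clean up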
    init()
    click_more()
    crawler()
    skip()
    drop()

3. Summary

  • Always inspect the page you want to navigate to first: if the link is an <a> tag there is no need to click() it; just read its href attribute and call driver.get(url).
  • When checking whether a page contains a given element, a direct lookup raises an exception and aborts the run, so write a helper that wraps the lookup in try/except and branch on its return value (see the sketch after this list).
  • The script simulates both browser clicks and mouse scrolling.
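
For reference, the same existence check can also be written with Selenium's explicit-wait API instead of a hand-rolled try/except. A minimal sketch, assuming a Selenium 3-era setup like the one above (wait_for_element is an illustrative helper name, not part of the original script):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException


def wait_for_element(driver, class_name, timeout=5):
    # Poll the DOM for up to `timeout` seconds; return the element,
    # or None if it never appears
    try:
        return WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.CLASS_NAME, class_name))
        )
    except TimeoutException:
        return None

Unlike a one-shot find_element, this also tolerates elements that are still being rendered when the page first loads.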

4. Crawled page and content

The images and like counts on https://www.instagram.com/explore/tags/selfie/

Reposted from blog.csdn.net/qq_21460525/article/details/71541250