# -*- coding: utf-8 -*-
"""
Created on Mon Nov 26 11:52:34 2018
@author: jwjiang
"""
import os
import re
import time
import urllib
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup as bs
from selenium import webdriver
#****************************************************
# Fixed prefix and suffix of the Google image-search URL; these never need
# to be changed.
base_url_part1 = 'https://www.google.com/search?q='
base_url_part2 = '&source=lnms&tbm=isch'

# Search keyword; replace it with whatever you want to look up.
search_query = 'Andy Cloth'

# Location of the ChromeDriver executable on this machine.
location_driver = r'C:\Users\jwjiang\Desktop\pythonFile\chromedriver.exe'
class Crawler:
    """Google Images thumbnail crawler driven by a Selenium-controlled Chrome.

    The search URL is assembled from the module-level ``base_url_part1``,
    ``search_query`` and ``base_url_part2`` values; the ChromeDriver binary
    is taken from the module-level ``location_driver``.
    """

    def __init__(self):
        self.url = self._build_url(base_url_part1, search_query, base_url_part2)

    @staticmethod
    def _build_url(prefix, query, suffix):
        """Return ``prefix + <URL-encoded query> + suffix``.

        The query is expected to be the raw keyword text: spaces become
        ``+`` and reserved or non-ASCII characters are percent-encoded so
        that they cannot corrupt the query string.
        """
        return prefix + urllib.parse.quote_plus(query) + suffix

    def start_brower(self):
        """Launch Chrome, maximise its window and open the search page.

        Returns:
            The Selenium Chrome driver with ``self.url`` loaded. The caller
            owns the driver and is responsible for shutting it down.
        """
        # NOTE(review): passing the driver path positionally is the
        # Selenium 3 calling style; newer Selenium releases may require a
        # Service object instead - confirm against the installed version.
        driver = webdriver.Chrome(location_driver)
        try:
            # Maximise the window because each crawl pass can only see the
            # images that are inside the viewport.
            driver.maximize_window()
            driver.get(self.url)
        except Exception:
            # Do not leave a browser running if the page cannot be opened.
            driver.quit()
            raise
        return driver

    def downloadImg(self, driver):
        r"""Scroll the result page and save every new thumbnail as a .jpg.

        Files go to ``C:\ImageDownload\<year>-<month>-<day>`` (created on
        demand) and are numbered consecutively starting at 300.

        Args:
            driver: Selenium driver that already has the result page open.

        An ``<img>`` element without a ``src`` attribute, or a download
        that fails, is reported with an ``ERROR!`` line and skipped; the
        crawl continues with the next image.
        """
        today = time.localtime()
        # Month and day are deliberately not zero-padded so that the folder
        # naming stays the same as before.
        folder_name = "%d-%d-%d" % (today.tm_year, today.tm_mon, today.tm_mday)
        picpath = 'C:\\ImageDownload\\%s' % folder_name
        os.makedirs(picpath, exist_ok=True)

        # URLs that were already handled; the page is re-parsed after every
        # scroll step, so the same thumbnails show up repeatedly.
        seen_urls = set()
        file_index = 300

        for step in range(7, 12):  # scroll range; adjust to crawl more or less
            scroll_top = step * 500  # scroll position advances 500px per step
            driver.execute_script(
                "document.documentElement.scrollTop=%d" % scroll_top
            )
            time.sleep(2)  # give the page time to load the newly visible images

            soup = bs(driver.page_source, "html.parser")
            for img in soup.find_all('img', {'class': 'rg_ic rg_i'}):
                print(file_index, end=' ')
                src = img.get('src')
                if src is None:
                    # No usable address on this element: skip only this
                    # image instead of abandoning the rest of the page.
                    print("ERROR!")
                    continue
                if src in seen_urls:
                    continue
                seen_urls.add(src)

                target = os.path.join(picpath, '%s.jpg' % file_index)
                try:
                    urllib.request.urlretrieve(src, target)
                except (OSError, ValueError) as err:
                    # URLError/HTTPError derive from OSError; ValueError is
                    # raised for malformed URLs. One bad image must not
                    # abort the whole crawl.
                    print("ERROR! %s" % err)
                    continue
                time.sleep(1)  # pause between downloads
                file_index += 1

    def run(self):
        """Open the browser, download the images and always shut Chrome down."""
        print(
            " *************************************\n"
            "** Welcome to use Spider **\n"
            "*************************************"
        )
        driver = self.start_brower()
        try:
            self.downloadImg(driver)
        finally:
            # quit() ends the whole driver session, even when the download
            # step raised.
            driver.quit()
        print("Download has finished.")
# Script entry point: build a crawler and perform one complete crawl.
if __name__ == '__main__':
    Crawler().run()
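# Google image crawler code
# (Web-page residue: "You may also like")
# Reposted from blog.csdn.net/sun830910/article/details/84541378
# (Web-page residue: "Today's recommendations")
# (Web-page residue: "Weekly ranking")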