selenium应用中的坑
- selenium库的使用能够很好地让你绕过反爬机制,因为程序在运行的过程中完全符合浏览器的行为;既然完全符合浏览器的行为,那么也就不会被轻易地挡在外面。但是在应用过程中还是存在问题的。
- spider在获取数据的时候,拿到的就是它能看到的、并且是你指定的数据。只要程序员爸爸给了合适的定位操作,spider就能通过定位拿到数据;但是前端的大佬们在写页面的时候,标签的使用有时会发生变化,这就让我们的定位无法准确命中,这时候,你的spider就会出现宕机的危机。
- 解决方法:
- 在开始编写代码之前,事先打开网页,进行查看,主要观察的地方是,你所获取内容在页面上的展示方式和地方。
- 异常处理:利用异常处理来保证你的spider不会死亡,而且通过异常的抛出,你也能够发现定位标记的错误,及时进行代码的优化。
- 访问时适当的加入等待机制,合适的等待机制,爬取的效率虽然会稍微的降低,但是稳定的数据获取,能够减少你的返工次数。
应用实例:
猎聘网的有关python的职位信息的获取(仅用于练习和代码测试)
利用chrome的自动控制,进行数据的获取
主要获取的字段为,公司名称、职位名称、薪资、应聘要求。
最后将数据存放到数据库中。
import pymysql
import sys
def save(table):
    """Open the MySQL connection and make sure the target schema exists.

    Connects to the MySQL server on 127.0.0.1, creates the ``lp`` database
    and the table named ``table`` if they are missing, and publishes the
    connection and cursor as the module-level globals ``conn`` and ``cur``
    so later inserts can reuse them.  Schema-creation errors are printed and
    otherwise ignored, so the function is safe to call on every run.

    Args:
        table: Name of the table that will receive the scraped rows.  It is
            quoted as a MySQL identifier, so it may contain spaces or other
            characters that are not valid in a bare identifier.
    """
    global conn
    global cur
    print('------------------------------')
    # NOTE(review): MySQL normally listens on 3306; port 8080 presumably
    # matches the author's local setup -- confirm before reuse.
    conn = pymysql.connect(host='127.0.0.1',
                           user='root',
                           passwd='XXX',
                           port=8080,
                           charset='utf8')
    cur = conn.cursor()
    print('获取游标')
    try:
        cur.execute("create database if not exists lp character set utf8;")
    except Exception as e:
        print(e)
    cur.execute('use lp;')
    # Identifiers cannot be sent as query parameters, so quote the name with
    # backticks and double any embedded backtick instead of concatenating
    # the raw user-supplied text into the statement.
    quoted_table = '`' + str(table).replace('`', '``') + '`'
    try:
        cur.execute(
            "create table if not exists " + quoted_table +
            "(id int,company char(100),job char(200),"
            "address char(100),salary char(100),ask varchar(5000))"
            "character set utf8;"
        )
    except Exception as e:
        print(e)
    print('创建表完成')
def inser_data(table,id,company,job,address,salary,ask):
    """Insert one scraped job posting into ``table`` and commit.

    Uses the module-level ``cur`` and ``conn`` that ``save`` sets up.  A
    failed insert is printed and skipped so that one bad row does not stop
    the crawl.

    Args:
        table: Target table name; quoted as a MySQL identifier.
        id: Row number of the posting.
        company: Company name.
        job: Job title.
        address: Work location.
        salary: Salary text.
        ask: Job requirements text.
    """
    # Column values go through query parameters; the table name cannot, so
    # it is backtick-quoted (embedded backticks doubled) instead of being
    # concatenated raw.
    quoted_table = '`' + str(table).replace('`', '``') + '`'
    sql_insert = ('insert into ' + quoted_table +
                  '(id,company,job,address,salary,ask) values (%s,%s,%s,%s,%s,%s);')
    try:
        cur.execute(sql_insert, [id, company, job, address, salary, ask])
    except Exception as e:
        print(e)
    conn.commit()
def my_txt(table, ask):
    """Append ``ask`` to the UTF-8 text file ``<table>.txt``.

    The file is created on first use.  The ``with`` block guarantees the
    handle is closed even when the write fails.

    Args:
        table: File name without the ``.txt`` extension.
        ask: Text to append.
    """
    with open(table + '.txt', 'a+', encoding='utf-8') as f:
        f.write(ask)
'''
职位要求的数据全部存储在本地txt文档,用于制作词云
公司名称、职位名称和薪资字段全部存放于数据库
由于薪资字段的数据显示方式为“XX-XX”的范围,所以全部以字符串的形式进行存放
'''
from selenium import webdriver
from time import sleep
import random
import re
from lp_spider import save_data
# from lp_spider import py_cloud
# Liepin job-search landing page; open_url() navigates the browser here.
start_url = 'https://www.liepin.com/zhaopin/'
def open_url():
    """Start a Chrome session and load ``start_url`` in a maximized window.

    The new browser is stored in the module-level ``driver`` global, which
    the other functions in this module use for every page interaction.
    """
    global driver
    driver = webdriver.Chrome()
    driver.get(start_url)
    driver.maximize_window()
def get_page(type):
    """Submit ``type`` as the search keyword on the loaded search page.

    Types the keyword into the search box, clicks the search button and
    scrolls the result page down by 500px.

    Args:
        type: Job category / keyword to search for.
    """
    # Implicit wait: every element lookup keeps retrying for up to 20s
    # before it fails, which gives the page time to finish rendering.
    driver.implicitly_wait(20)
    search_form = '//*[@id="sojob"]/div[1]/form/div[1]/div/div/div[1]'
    # find_element('xpath', ...) is used because the find_element_by_*
    # helpers were removed in Selenium 4.3; this form also works on older
    # releases.
    driver.find_element('xpath', search_form + '/input').send_keys(type)
    driver.find_element('xpath', search_form + '/button').click()
    driver.execute_script('window.scrollBy(0, 500)')
# XPath roots of the result list and of the two detail-page layouts.
_LIST_ROOT = '//*[@id="sojob"]/div[2]/div/div[1]/div[1]'
_ENTERPRISE_ROOT = '//*[@id="job-view-enterprise"]/div[1]/div[1]/div[1]/div[1]'
_HUNTER_ROOT = '//*[@id="job-hunter"]/div[1]/div[1]/div[1]/div[1]/div'

# Candidate XPaths per field, tried in order until one resolves.
_COMPANY_XPATHS = (
    _ENTERPRISE_ROOT + '/div[1]/h3/a[@title]',
    _HUNTER_ROOT + '/div[1]/h3',
    _HUNTER_ROOT + '/div[1]/h1[@title]',
)
_JOB_XPATHS = (
    _ENTERPRISE_ROOT + '/div[1]/h1',
    _HUNTER_ROOT + '/div[1]/h1[@title]',
)
_SALARY_XPATHS = (
    _ENTERPRISE_ROOT + '/div[2]/div[1]/div[1]/p[1]',
    _HUNTER_ROOT + '/div[2]/div/div/p[1]',
)
# The former ".../span/text()" candidate is gone: an XPath that selects a
# text node can never be returned as an element, so it always failed.
_ADDRESS_XPATHS = (
    _ENTERPRISE_ROOT + '/div[2]/div[1]/div[1]/p[2]/span/a',
    _HUNTER_ROOT + '/div[2]/div/div/p[2]/span',
    _ENTERPRISE_ROOT + '/div[2]/div[1]/div[1]/p[2]/span',
)
_ASK_XPATHS = (
    _ENTERPRISE_ROOT + '/div[3]/div',
    _HUNTER_ROOT + '/div[3]/div',
)

# Section headings stripped from the requirements text.
_ASK_HEADINGS = ("任职要求:", "岗位职责:", "职位描述:",
                 "岗位要求:", "职责描述:", "任职资格:")

# Pixels to scroll after a list item, keyed by the item's badge text.
_SCROLL_STEP = {'企': 145, '猎': 141, '直': 145, '无': 137, '优': 139}

_CHINESE_RUN = re.compile('[\u4e00-\u9fa5]+')
# The '.' separators are unescaped on purpose: the pattern is kept exactly
# as the salary text was matched before.
_SALARY_RANGE = re.compile('[0-9]*.[0-9]*.[\u4e00-\u9fa5]+')


def _first_text(xpaths):
    """Return the text of the first XPath in ``xpaths`` that resolves.

    Failures of all but the last candidate are printed and skipped; a
    failure of the last candidate propagates to the caller.
    """
    *earlier, last = xpaths
    for xpath in earlier:
        try:
            return driver.find_element('xpath', xpath).text
        except Exception as e:
            print(e)
    return driver.find_element('xpath', last).text


def _parse_salary(text):
    """Reduce the raw salary text to the single string that gets stored."""
    chinese_runs = _CHINESE_RUN.findall(text)
    if chinese_runs and chinese_runs[0] == '面议':
        return '面议'
    if len(text) < 8:
        return text
    ranges = _SALARY_RANGE.findall(text)
    # Fall back to the raw text instead of failing when nothing matches.
    return ranges[0] if ranges else text


def _scrape_detail():
    """Read the detail page in the current window into a field tuple."""
    driver.implicitly_wait(20)
    company = _first_text(_COMPANY_XPATHS)
    job = _first_text(_JOB_XPATHS)
    salary = _parse_salary(_first_text(_SALARY_XPATHS))
    address = _first_text(_ADDRESS_XPATHS)
    driver.execute_script('window.scrollBy(0,400)')
    ask = _first_text(_ASK_XPATHS)
    for heading in _ASK_HEADINGS:
        ask = ask.replace(heading, "")
    return company, job, address, salary, ask


def _scrape_posting(link_xpath):
    """Open one posting from the result list, scrape it, return to the list."""
    list_window = driver.current_window_handle
    driver.find_element('xpath', link_xpath).click()
    print('站点地址:', driver.current_url)
    # Assumes the click opened the posting in a new window, which then is
    # the last handle.
    driver.switch_to.window(driver.window_handles[-1])
    try:
        return _scrape_detail()
    finally:
        # Always close the detail window and refocus the list, even when
        # scraping failed; otherwise every later list lookup would run
        # against the detail page.
        if driver.current_window_handle != list_window:
            driver.close()
        sleep(random.randint(1, 4))
        driver.switch_to.window(list_window)


def get_info(table):
    """Crawl the search results and store every enterprise posting.

    Walks 100 result pages of 40 items each.  Items whose badge text is
    '企' are opened and scraped; the fields are written to the database via
    ``save_data.inser_data`` and the requirements text is appended to
    ``<table>.txt`` via ``save_data.my_txt``.  A posting that cannot be
    scraped is reported and skipped.  The database cursor and connection
    held by ``save_data`` are closed when the crawl ends, including when it
    ends with an error.

    Args:
        table: Table name (and text-file base name) for the scraped rows.
    """
    row_id = 0
    next_page = _LIST_ROOT + '/div/div/a[8]'
    try:
        for j in range(1, 101):
            for i in range(1, 41):
                item = _LIST_ROOT + '/ul/li[' + str(i) + ']'
                try:
                    ty = driver.find_element('xpath', item + '/i/b').text
                except Exception:
                    ty = '无'
                print(ty)
                if ty == '企':
                    try:
                        company, job, address, salary, ask = _scrape_posting(
                            item + '/div/div[1]/h3/a')
                        print(j, i, sep='.')
                        save_data.inser_data(table, str(row_id), company,
                                             job, address, salary, ask)
                        save_data.my_txt(table, ask)
                        row_id += 1
                    except Exception as e:
                        # Skip this posting but make the failure visible.
                        print(e)
                else:
                    print(j, i, sep='.', end='完成')
                # Scroll the next list item into view.
                if i < 40 and ty in _SCROLL_STEP:
                    driver.execute_script(
                        'window.scrollBy(0, {})'.format(_SCROLL_STEP[ty]))
            try:
                driver.find_element('xpath', next_page).click()
            except Exception:
                # Reposition the page at the pager and retry once.
                driver.execute_script('window.scrollTo(0,0)')
                driver.execute_script('window.scrollBy(0,{})'.format(145 * 42))
                driver.find_element('xpath', next_page).click()
            sleep(random.randint(3, 4))
            driver.execute_script('window.scrollBy(0, 500)')
    finally:
        save_data.cur.close()
        save_data.conn.close()
if __name__ == '__main__':
    # Interactive loop: one full crawl per keyword entered.
    while True:
        print('输入爬取职位类别名称,输入后按回车继续-->',end='')
        ty = input()
        if not ty.strip():
            # A blank keyword would be used as an empty table name.
            continue
        save_data.save(ty)
        open_url()
        try:
            get_page(ty)
            get_info(ty)
        finally:
            # Quit the browser after every crawl; otherwise each loop
            # iteration leaves another Chrome instance running.
            driver.quit()
        print('爬取结束')
# Word cloud: render the text stored in lp.txt as an image.
from wordcloud import WordCloud
import cv2
import jieba
import matplotlib.pyplot as plt

with open('lp.txt', 'r',encoding='utf-8') as f:
    text = f.read()
# Segment the text with jieba and join the tokens with spaces.
cut_text = " ".join(jieba.cut(text))
# Image used as the mask of the cloud.
color_mask = cv2.imread('python1.jpg')
cloud = WordCloud(
    # A font must be given, otherwise the words render as garbled glyphs.
    # The path previously started with a space, which made it invalid.
    font_path="C:\\Windows\\Fonts\\STXINGKA.TTF",
    # Background colour.
    background_color='white',
    # Shape of the cloud.
    mask=color_mask,
    # Maximum number of words.
    max_words=10000,
    # Largest font size.
    max_font_size=100,
)
wCloud = cloud.generate(cut_text)
wCloud.to_file('cloud.png')
plt.imshow(wCloud, interpolation='bilinear')
plt.axis('off')
plt.show()