从今天开始学习爬虫,这里展示我的第一个实例(用 Python 3 编写;Python 2 可能需要额外声明源码编码,也可能存在其他不兼容之处,我没有验证过)。分享给大家,希望能有所帮助。
import urllib,re
import urllib.request
import xlwt
# Open the search-results page and fetch its source.
# NOTE: the query string must contain "&degreefrom=99". The pasted original had
# the "&deg" prefix collapsed into a "°" character; that non-ASCII character
# cannot be encoded into the HTTP request line, so urlopen() failed.
_DEFAULT_SEARCH_URL = 'https://search.51job.com/list/170200,000000,0000,00,9,99,Python,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='


def get_content(url=_DEFAULT_SEARCH_URL):
    """Download a page, decode it as GBK, print it and return it.

    Args:
        url: address to fetch. Defaults to the 51job search-results page for
            "Python" that this script was written against.

    Returns:
        str: the response body decoded with the 'gbk' codec.

    Raises:
        urllib.error.URLError: if the page cannot be retrieved.
        UnicodeDecodeError: if the body is not valid GBK.
    """
    # The context manager closes the connection even if read() raises.
    with urllib.request.urlopen(url) as response:
        raw = response.read()
    html = raw.decode('gbk')
    print(html)
    return html
# Extract the job fields from the page source.
def get():
    """Fetch the results page and pull the job listings out of its HTML.

    Returns:
        list: one 5-tuple of strings per match, holding the five captured
        groups in pattern order (the title attributes following the "t1" and
        "t2" markers, then the text of the "t3", "t4" and "t5" spans).
    """
    page = get_content()
    # re.S makes "." match newlines as well, so each ".*?" can bridge the
    # markup between fields (presumably spread over several lines).
    pattern = re.compile(
        r'class="t1 ".*?<a target="_blank" title="(.*?)".*?<span class="t2"><a target="_blank" title="(.*?)".*? <span class="t3">(.*?)</span>.*? <span class="t4">(.*?)</span>.*? <span class="t5">(.*?)</span>',
        re.S,
    )
    items = pattern.findall(page)
    print(items)
    return items
# Write the scraped rows to an Excel workbook.
def excel_write(items):
    """Save the records to 'test.xls' on sheet 'test1' under a bold header row.

    Args:
        items: iterable of indexable records; fields 0-4 of each record are
            written, in order, to columns 0-4 of that record's row.
    """
    workbook = xlwt.Workbook(encoding='utf-8')
    sheet = workbook.add_sheet('test1')

    # Row 0 carries the column titles in bold.
    for col, title in enumerate(['招聘职位', '公司', '地址', '薪资', '日期']):
        sheet.write(0, col, title, xlwt.easyxf('font:bold on'))

    # Data rows start at 1, directly below the header.
    for row, record in enumerate(items, start=1):
        for col in range(5):
            sheet.write(row, col, record[col])

    workbook.save('test.xls')
# Script entry: scrape the listings, then export them to the spreadsheet.
items=get()
excel_write(items)