[爬虫] 爬取POI服务

爬取的数据仅用于科研
高德地图POI服务:http://lbs.amap.com/api/webservice/guide/api/search

结果:爬取厦门的公司企业信息
具体可以获取哪些数据字段,请查看上面的高德地图POI服务文档

(原文此处为爬取结果的截图)

coordinate_conversion文件:https://blog.csdn.net/summer_dew/article/details/80723434

# -*- coding:utf-8 -*-
# function: 爬取高德POI服务数据
import xlwt
import urllib
from bs4 import BeautifulSoup
import coordinate_conversion

# POI category to request. The same text names the worksheet and the output
# file (see outPath below).
TYPE = u"公司企业"

# Destination workbook; the file stem is the category name.
outPath = r"D:\Users\PasserQi\Desktop\GetAMapPOI\%s.xls" % TYPE

# Worksheet columns, in order. Most entries are matched against the tag names
# found under each <poi>; "x" and "y" are filled from the converted
# "location" tag instead.
saveField = [
    "id",
    "name",
    "type",
    "typecode",
    "address",
    "x",
    "y",
    "pname",
    "cityname",
    "business_area",
    "photos",
]

# First page to request and the upper page bound used by the crawl loop.
MIN_PAGE = 1
MAX_PAGE = 100

# AMap web-service API key.
# NOTE(review): the key is hard-coded in the source; consider loading it from
# configuration before sharing this file.
AMAP_API_KEY = "4fac3db866dcc3b8a735651d3a7db1c7"

# Query-string parameters sent with every request; the crawl loop adds the
# "page" entry before each call.
urlParamJson = {
    "city": "厦门",
    "output": "xml",
    "key": AMAP_API_KEY,
    "types": TYPE.encode("utf8"),
    "citylimit": "true",  # only return results inside the given city
    "offset": "20",  # records per page
}


def setStype(name, height, bold=False):
    """Build an xlwt cell style that uses the requested font.

    Args:
        name: Font family name assigned to the font.
        height: Font height, passed to xlwt unchanged
            (presumably in xlwt's 1/20-point units -- confirm against xlwt).
        bold: Whether the font is bold. Defaults to False.

    Returns:
        An ``xlwt.XFStyle`` whose font carries the settings above.
    """
    style = xlwt.XFStyle()
    font = xlwt.Font()
    font.name = name
    font.bold = bold
    # xlwt spells this attribute "colour_index"; assigning "color_index" only
    # creates an unused attribute and leaves the colour at its default.
    font.colour_index = 4
    font.height = height
    # Attach the font to the style. Without this the configured font is
    # discarded and the caller receives a default style.
    style.font = font
    return style

if __name__ == '__main__':
    # Script entry point: request the AMap place-text API page by page and
    # write every returned POI into one worksheet, one row per POI.
    # Single-argument print(...) is used throughout; it behaves the same as
    # the Python 2 print statement this script was written with.

    # Create the workbook and its only sheet (named after the POI category).
    w = xlwt.Workbook(encoding="utf-8")
    sheet = w.add_sheet(TYPE)

    # Row 0 is the header; column order follows saveField.
    for col, field in enumerate(saveField):
        sheet.write(0, col, field)

    cur = 1  # next worksheet row to fill (row 0 holds the header)
    # MAX_PAGE is the last page to request, so the range has to include it.
    for page in range(MIN_PAGE, MAX_PAGE + 1):
        urlParamJson["page"] = page
        print("当前 %s 页..." % page)
        params = urllib.urlencode(urlParamJson)
        url = "http://restapi.amap.com/v3/place/text?%s" % params
        http = urllib.urlopen(url)
        try:
            dom = BeautifulSoup(http)
        finally:
            # Release the connection even if parsing fails.
            http.close()
        poiList = dom.findAll("poi")
        if not poiList:
            # An empty page means there are no more results.
            break
        for poi in poiList:
            for tag in poi:
                name = tag.name  # tag name of this child
                print(name)
                if name == "photos":
                    # Collect the URL of every photo. Looking each <url> up
                    # individually avoids repeating the first one, which is
                    # what reading tag.url inside the loop used to do.
                    index = saveField.index("photos")
                    value = "".join(
                        url_tag.get_text() + ";"
                        for url_tag in tag.findAll("url")
                    )
                    sheet.write(cur, index, value)
                    continue
                if name in saveField:
                    # Plain field: store its text in the matching column.
                    index = saveField.index(name)
                    value = tag.get_text()
                    sheet.write(cur, index, value)
                if name == "location":
                    # "location" holds two comma-separated numbers; convert
                    # them with gcj02towgs84 and store the results in the
                    # x and y columns.
                    value = tag.get_text()
                    x, y = value.split(',')
                    x, y = coordinate_conversion.gcj02towgs84(float(x), float(y))
                    sheet.write(cur, saveField.index('x'), x)
                    sheet.write(cur, saveField.index('y'), y)

            cur = cur + 1
            # cur is the next free row, so cur - 1 POIs have been written.
            saved = cur - 1
            if saved % 10 == 0:
                print("当前已保存 %s 个信息" % saved)

    w.save(outPath)

猜你喜欢

转载自blog.csdn.net/summer_dew/article/details/80723421