爬取携程7天内的全国所有城市航班

  1 #!/usr/bin/env python
  2 # coding: utf-8
  3 
  4 
  5 import requests
  6 import pandas as pd
  7 import json,random,time,datetime
  8 
  9 # userAgent
 10 userAgent = [
 11     "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36",
 12     "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7",
 13     "Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16.0) Gecko/16.0 Firefox/16.0",
 14     "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/534.55.3 (KHTML, like Gecko) Version/5.1.3 Safari/534.53.10",
 15     "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
 16     "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
 17     "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1500.55 Safari/537.36",
 18     "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1309.0 Safari/537.17"
 19     "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
 20     "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
 21     "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
 22     "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"
 23 ]
 24 
 25 # get city
 26 def getCityMsg():
 27     headers = {
 28         'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
 29         "Referer": "https://flights.ctrip.com/itinerary",
 30         "Content-Type": "application/json"
 31     }
 32     url = 'https://flights.ctrip.com/itinerary/api/poi/get'
 33     r = requests.get(url=url,headers=headers).text
 34 #     print(len(r))
 35     # get city msg
 36     city = {}
 37     city_load = json.loads(r).get('data')
 38     for data in city_load.keys():
 39         if data != '热门':
 40             tmpdata = city_load.get(data)
 41             for i in tmpdata:
 42     #             print(i)  # A 
 43                 for k in tmpdata.get(i):   
 44                     name = k.get('data').split('|')
 45                     cityNumId = name[2]
 46                     cityId = name[3]
 47                     cityName = name[1].split('(')[0]
 48                     city[cityName] = [cityId, cityNumId]
 49     return city
 50 
 51 # 生成自今日至往后7天日期
 52 def get_date():
 53     dateList = []  # 存放时间list
 54     formatDate = datetime.datetime.now()  # 生成今日的格式化时间
 55     strDate = formatDate.strftime('%Y-%m-%d')  # 生成字符串日期
 56     stpDate = datetime.datetime.strptime(strDate,'%Y-%m-%d')  # 将字符串转为日期格式的日期
 57     for i in range(7):
 58         stpDate += datetime.timedelta(days=+1)   # 日期叠加1
 59         dateList.append(datetime.datetime.strftime(stpDate,'%Y-%m-%d'))  # 放入list
 60     return dateList
 61 
 62 # get page text:routeList
 63 def  get_routeList(headers, load_json):
 64     response = requests.post(url = "https://flights.ctrip.com/itinerary/api/12808/products",data=json.dumps(load_json), headers = headers).text
 65     result = json.loads(response)["data"].get('routeList')
 66     if result is not None:
 67         return json.loads(response)["data"].get('routeList')
 68     else:
 69         print('Get 【{} --> {}】 Page is failed !'.format(load_json.get('airportParams')[0].get('dcityname'), load_json.get('airportParams')[0].get('acityname')))
 70         print('休息30s后再来……')
 71         time.sleep(30)
 72         get_routeList(headers, load_json)
 73 
 74 # get Data
 75 def get_data(index, df, routeList):
 76     for i, route in enumerate(routeList):
 77         if route.get('routeType') == 'Flight':  # 只要航班
 78             index += 1
 79             # route is dict
 80             # we need route inside legs, legs is list, but its lengths is 1
 81             # so we should legs[0], legs[0] is dict
 82 
 83             # flight
 84             flight = route.get('legs')[0].get('flight')  # dict
 85             # cabins
 86             # cabins 里面又有不同长度的list ,因此考虑将cabins单独作为一个表
 87     #         cabins = route.get('legs')[0].get('cabins')  # list
 88             # characteristic
 89 #             characteristic = route.get('legs')[0].get('characteristic')  # dict
 90 
 91             #### about flight
 92             if flight is not None:
 93                 # common attr
 94                 df.loc[index,'airlineCode'] = flight.get('airlineCode')
 95                 df.loc[index,'AirlineName'] = flight.get('airlineName')
 96                 df.loc[index,'durationDays'] = flight.get('durationDays')
 97                 df.loc[index,'flightNumber'] = flight.get('flightNumber')
 98                 df.loc[index,'mealFlag'] = flight.get('mealFlag')
 99                 df.loc[index,'mealType'] = flight.get('mealType')
100                 df.loc[index,'comfort'] = flight.get('comfort')
101                 df.loc[index,'craftKind'] = flight.get('craftKind')
102                 df.loc[index,'craftTypeCode'] = flight.get('craftTypeCode')
103                 df.loc[index,'craftTypeKindDisplayName'] = flight.get('craftTypeKindDisplayName')
104                 df.loc[index,'craftTypeName'] = flight.get('craftTypeName')
105                 df.loc[index,'delayedTime'] = flight.get('delayedTime')
106                 df.loc[index,'oilFee'] = flight.get('oilFee')
107                 df.loc[index,'punctualityRate'] = flight.get('punctualityRate')
108                 df.loc[index,'sharedFlightName']  = flight.get('sharedFlightName')
109                 df.loc[index,'sharedFlightNumber'] = flight.get('sharedFlightNumber')
110                 df.loc[index,'specialCraft'] = flight.get('specialCraft')
111                 df.loc[index,'stopInfo'] = flight.get('stopInfo')
112                 df.loc[index,'stopTimes'] = flight.get('stopTimes')
113                 df.loc[index,'tax'] = flight.get('tax')
114                 # arrival
115                 df.loc[index,'arrivalairportName'] = flight.get('arrivalAirportInfo').get('airportName')
116                 df.loc[index,'arrivalairportTlc'] = flight.get('arrivalAirportInfo').get('airportTlc')
117                 df.loc[index,'arrivalcityName'] = flight.get('arrivalAirportInfo').get('cityName')
118                 df.loc[index,'arrivalcityTlc'] = flight.get('arrivalAirportInfo').get('cityTlc')
119                 df.loc[index,'arrivalTerminalName'] = flight.get('arrivalAirportInfo').get('terminal').get('name')
120                 df.loc[index,'arrivalDate'] = flight.get('arrivalDate')
121                 # departure 
122                 df.loc[index,'departureairportName'] = flight.get('departureAirportInfo').get('airportName')
123                 df.loc[index,'departureairportTlc'] = flight.get('departureAirportInfo').get('airportTlc')
124                 df.loc[index,'departureCityName'] = flight.get('departureAirportInfo').get('cityName')
125                 df.loc[index,'departureCityTlc'] = flight.get('departureAirportInfo').get('cityTlc')
126                 df.loc[index,'departureTerminalName'] = flight.get('departureAirportInfo').get('terminal').get('name')
127                 df.loc[index,'departureDate'] = flight.get('departureDate')
128 
129             #### characteristic : charactor
130             # characteristic:charactor
131             charactor = route.get('legs')[0].get('characteristic')  # dict
132             if charactor is not None:
133                 df.loc[index, 'businessAircraft'] = charactor.get('businessAircraft')
134                 df.loc[index, 'discountAmount'] = charactor.get('discountAmount')
135                 df.loc[index, 'discountShowType'] = charactor.get('discountShowType')
136                 df.loc[index, 'flyMan'] = charactor.get('flyMan')
137                 df.loc[index, 'groupTicketPrice'] = charactor.get('groupTicketPrice')
138                 df.loc[index, 'hotFlight'] = charactor.get('hotFlight')
139                 df.loc[index, 'hx'] = charactor.get('hx')
140                 df.loc[index, 'infantSoldOut'] = charactor.get('infantSoldOut')
141                 df.loc[index, 'lowPriceDiscount'] = charactor.get('lowPriceDiscount')
142                 df.loc[index, 'lowestBabyCfPrice'] = charactor.get('lowestBabyCfPrice')
143                 df.loc[index, 'lowestBabyPrice'] = charactor.get('lowestBabyPrice')
144                 df.loc[index, 'lowestCfPrice'] = charactor.get('lowestCfPrice')
145                 df.loc[index, 'lowestChildAdultCfPrice'] = charactor.get('lowestChildAdultCfPrice')
146                 df.loc[index, 'lowestChildAdultPrice'] = charactor.get('lowestChildAdultPrice')
147                 df.loc[index, 'lowestChildCfPrice'] = charactor.get('lowestChildCfPrice')
148                 df.loc[index, 'lowestChildPrice'] = charactor.get('lowestChildPrice')
149                 df.loc[index, 'lowestPrice'] = charactor.get('lowestPrice')
150                 df.loc[index, 'promotion'] = charactor.get('promotion')
151                 df.loc[index, 'providerHx'] = charactor.get('providerHx')
152                 df.loc[index, 'roundTripDiscounts'] = charactor.get('roundTripDiscounts')
153                 for i, stdPrice in enumerate(charactor.get('standardPrices')):
154                     diffCabinCla = stdPrice.get('cabinClass')
155                     df.loc[index, 'price' + diffCabinCla] = stdPrice.get('price')
156                 df.loc[index, 'superFlyMan'] = charactor.get('superFlyMan')
157                 df.loc[index, 'weight'] = charactor.get('weight')
158 
159 
160             #### carbins  
161             # 由于cabins 里面又有不同长度的list, 会出现很多的空值列。因此考虑将cabins单独作为一个表
162 #             cabins = route.get('legs')[0].get('cabins')  # list
163 #             for i, cabin in enumerate(cabins):
164 #                 carbin = cabin.get('cabinClass') + str(i) 
165 #                 diffCabin = cabin.get('cabinClass') + str(i)  # carbins is list, have more, we need diff them
166 #                 df.loc[index, 'compositionPrice' + diffCabin] = cabin.get('price').get('compositionPrice')
167 #                 df.loc[index, 'discount' + diffCabin] = cabin.get('price').get('discount')
168 #                 df.loc[index, 'discountAmount' + diffCabin] = cabin.get('price').get('discountAmount')
169 #                 df.loc[index, 'discountShowType' + diffCabin] = cabin.get('price').get('discountShowType')
170 #                 df.loc[index, 'favorablePrice' + diffCabin] = cabin.get('price').get('favorablePrice')
171 #                 df.loc[index, 'fdPrice' + diffCabin] = cabin.get('price').get('fdPrice')
172 #                 df.loc[index, 'originalPrice' + diffCabin] = cabin.get('price').get('originalPrice')
173 #                 df.loc[index, 'pcPrice' + diffCabin] = cabin.get('price').get('pcPrice')
174 #                 df.loc[index, 'price' + diffCabin] = cabin.get('price').get('price')
175 #                 df.loc[index, 'rate' + diffCabin] = cabin.get('price').get('rate')
176 #                 df.loc[index, 'salePrice' + diffCabin] = cabin.get('price').get('salePrice')
177 #                 df.loc[index, 'serviceCharge' + diffCabin] = cabin.get('price').get('serviceCharge')
178 
179 #     #             refundEndorse
180 #                 df.loc[index, 'changeNote' + diffCabin] = cabin.get('refundEndorse').get('changeNote')
181 #                 df.loc[index, 'changeRuleFlag' + diffCabin] = cabin.get('refundEndorse').get('changeRuleFlag')
182 #                 df.loc[index, 'endorseNote' + diffCabin] = cabin.get('refundEndorse').get('endorseNote')
183 #                 df.loc[index, 'minEndorseFee' + diffCabin] = cabin.get('refundEndorse').get('minEndorseFee')
184 #                 df.loc[index, 'minRefundFee' + diffCabin] = cabin.get('refundEndorse').get('minRefundFee')
185 #                 df.loc[index, 'refundNote' + diffCabin] = cabin.get('refundEndorse').get('refundNote')
186 #                 df.loc[index, 'refundRuleFlag' + diffCabin] = cabin.get('refundEndorse').get('refundRuleFlag')
187 #                 df.loc[index, 'remark' + diffCabin] = cabin.get('refundEndorse').get('remark')
188 #                 df.loc[index, 'serviceLevel' + diffCabin] = cabin.get('refundEndorse').get('serviceLevel')
189     return (index, df)
190         
191 
def main(index, df, city, token="3481e1f047cee3eb638f2435b7c1b860"):
    """Crawl one-way flights for every ordered city pair on each target date.

    For each date from ``get_date()`` and each (departure, arrival) pair of
    distinct cities, one search is posted and its flight routes are appended
    to *df*. After each date the accumulated frame is written to a CSV file.

    Args:
        index: last row label already used in *df*.
        df: frame collecting the flight rows.
        city: mapping ``{cityName: [cityId, cityNumId]}``.
        token: value sent as ``token`` in the search payload; replace it when
            the default one stops being accepted.

    Returns:
        tuple: ``(index, df)`` after the whole crawl.
    """
    flightDates = get_date()
    for flightDate in flightDates:  # departure date
        print(flightDate, end='\t')
        for (fromCityName, fromCityId) in city.items():  # departure city
            for (toCityName, toCityId) in city.items():  # arrival city
                if fromCityName == toCityName:
                    # No request is made for a same-city pair, so there is
                    # nothing to throttle: skip without sleeping.
                    continue
                headers = {
                    "User-Agent": random.choice(userAgent),
                    "origin": "https://flights.ctrip.com",
                    "content-type": "application/json"
                }
                load_json = {
                    "airportParams": [
                        {"dcity": fromCityId[0], "dcityname": fromCityName, "acity": toCityId[0], "acityname": toCityName, "date": flightDate, "dcityid": fromCityId[1], "acityid": toCityId[1]}
                    ],
                    "classType": "ALL",
                    "date": flightDate,
                    "flightWay": "Oneway",
                    "hasBaby": False,
                    "hasChild": False,
                    "searchIndex": 1,
                    "token": token
                }
                routeList = get_routeList(headers, load_json)
                # A failed search may yield no list at all; skip it.
                if routeList:
                    (index, df) = get_data(index, df, routeList)
                print(index, df.shape, end='\t')
                # Random pause between searches to keep the request rate low.
                time.sleep(random.choice(range(5, 10)))
            time.sleep(random.choice(range(5, 10)))
            print('【{}】起飞,抓完!'.format(fromCityName))
        time.sleep(random.choice(range(60, 90)))
        print('起飞日期:{},抓完,写入文件!'.format(flightDate))
        # NOTE(review): df is cumulative, so each file also contains the rows
        # of all earlier dates, and the first label in the file name is
        # flightDates[0] (the first departure date), not the crawl date -
        # confirm whether either is intended.
        df.to_csv('【{}】爬取:【{}】起飞航班.csv'.format(flightDates[0], flightDate), index=False)
    return (index, df)
233 
234 
235 
if __name__ == "__main__":
    # Fetch the city catalogue, then crawl starting from an empty result
    # frame and a row counter of zero.
    city = getCityMsg()
    index, df = main(0, pd.DataFrame(), city)

注意:请控制爬取速度,并及时更换 load_json 中的 token。

猜你喜欢

转载自www.cnblogs.com/Alexisbusyblog/p/12580891.html