Without further ado, here is the code; the scraped data is stored directly into MongoDB.
import random
import time

import pymongo
import requests

# MongoDB connection and target database for the scraped job postings.
mycon = pymongo.MongoClient('127.0.0.1', 27017)  # establish a connection
mydb = mycon['lagou_data']  # database name


class LaGouSpider():
    """Scrape job postings from Lagou's AJAX search API into MongoDB.

    Lagou rejects bare POSTs to its JSON endpoint; a session must first
    visit the HTML search page to pick up anti-scraping cookies, which
    are then attached to the data-API request.
    """

    def __init__(self, city, kd):
        # Browser-like headers; the Referer is required by Lagou's anti-bot check.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
            'Referer': 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=',
        }
        self.city = city    # target city, e.g. '北京'
        self.max_pn = 1
        self.kd = kd        # search keyword, e.g. 'python'

    def get_start(self):
        """Crawl result pages 1-9 for the keyword and insert each posting into MongoDB."""
        mycol = mydb[self.kd]  # collection named after the search keyword
        url = "https://www.lagou.com/jobs/positionAjax.json?city=" + self.city + "&needAddtionalResult=false"
        for page in range(1, 10):
            data = {
                'first': 'true',
                'pn': page,
                'kd': self.kd,
            }
            # Fresh session: visit the search page first so the subsequent
            # POST carries the cookies Lagou's anti-scraping layer expects.
            s = requests.Session()
            s.get(
                url="https://www.lagou.com/jobs/list_python%E5%BC%80%E5%8F%91%E5%B7%A5%E7%A8%8B%E5%B8%88?labelWords=&fromSearch=true&suginput=",
                headers=self.headers,
            )
            cookies = s.cookies
            response = s.post(url=url, data=data, cookies=cookies, headers=self.headers).json()
            content = response.get('content')
            if content:
                result = content['positionResult']['result']
                print('position: {}, city: {}, crawling page {}\n'.format(self.kd, self.city, page))
                for i in result:
                    lagou_data = {}
                    lagou_data['positionName'] = i['positionName']            # job title
                    lagou_data['companyFullName'] = i['companyFullName']      # full company name
                    lagou_data['workYear'] = i['workYear']                    # required work experience
                    lagou_data['education'] = i['education']                  # education requirement
                    lagou_data['jobNature'] = i['jobNature']                  # job nature (full-/part-time)
                    lagou_data['salary'] = i['salary']                        # salary range
                    lagou_data['city'] = i['city']                            # city
                    lagou_data['financeStage'] = i['financeStage']            # financing stage
                    lagou_data['industryField'] = i['industryField']          # industry
                    lagou_data['companyShortName'] = i['companyShortName']    # short company name
                    lagou_data['positionAdvantage'] = i['positionAdvantage']  # position perks
                    lagou_data['companySize'] = i['companySize']              # company size
                    lagou_data['companyLabelList'] = i['companyLabelList']    # benefit tags
                    lagou_data['district'] = i['district']                    # district
                    lagou_data['positionLables'] = i['positionLables']        # technology tags
                    lagou_data['firstType'] = i['firstType']                  # job category
                    lagou_data['createTime'] = i['createTime']                # posting time
                    print(lagou_data)
                    # insert() is deprecated since pymongo 3.x; insert_one()
                    # is the supported single-document API.
                    mycol.insert_one(lagou_data)
            time.sleep(random.uniform(3, 7))  # random sleep to reduce ban risk


if __name__ == '__main__':
    lagou = LaGouSpider('北京', 'python')
    lagou.get_start()
Note: Lagou.com's anti-scraping measures are fairly strict. The trick is to first fetch cookies from the HTML search page, then attach them to the request against the JSON data API so it returns real results.