Lianjia new-house web crawler

# Lianjia new-house web crawler
 
**Today's goal**

Crawl the latest listings and extract each listing's region together with its corresponding price.
```python
import csv
import re

import requests

class LianjiaSpider(object):
    """Crawl Lianjia listing pages and append (region, price) rows to a CSV file.

    The pipeline is a simple chain: ``main`` builds each page URL,
    ``get_page`` downloads it, ``parse_page`` extracts the data with a
    regular expression, and ``write_page`` appends the rows to the CSV file.
    """

    # Group 1: the text between ``data-el="region">`` and the next ``</a>``
    # inside a ``houseInfo`` div.  Group 2: the text of the first ``<span>``
    # inside the following ``totalPrice`` div.  ``re.S`` lets ``.`` cross
    # line breaks, since the two divs are on different lines of the page.
    _LISTING_RE = re.compile(
        r'<div class="houseInfo"><span.*?data-el="region">(.*?)</a>'
        r'.*?<div class="totalPrice"><span>(.*?)</span>',
        re.S,
    )

    # Seconds to wait for the server; without a timeout ``requests`` can
    # block forever on a stalled connection.
    _TIMEOUT = 10

    def __init__(self, csv_path='lianjia.csv'):
        """Set up the URL template, request headers and output location.

        Args:
            csv_path: File the extracted rows are appended to.  Defaults to
                ``lianjia.csv`` in the current working directory.
        """
        # ``{}`` is filled with the page number by ``main``.
        self.url = 'https://cq.lianjia.com/ershoufang/pg{}/'
        self.headers = {'User-Agent': 'Mozilla/5.0'}
        self.csv_path = csv_path

    def get_page(self, url):
        """Download ``url`` and hand the decoded HTML to ``parse_page``."""
        res = requests.get(url, headers=self.headers, timeout=self._TIMEOUT)
        html = res.content.decode()
        # Chain straight into the parsing step.
        self.parse_page(html)

    def parse_page(self, html):
        """Extract (region, price) pairs from ``html`` and pass them to ``write_page``."""
        r_list = self._LISTING_RE.findall(html)
        self.write_page(r_list)

    def write_page(self, r_list):
        """Append one CSV row per item of ``r_list``.

        Args:
            r_list: Sequence of items whose element 0 is the region text and
                element 1 is the price text, as produced by ``parse_page``.
        """
        # Strip surrounding whitespace from the region and tag the price with
        # its unit suffix.
        # NOTE(review): 'Wan' is presumably the price unit (ten thousand
        # yuan) -- confirm the exact suffix wanted in the output.
        rows = [(item[0].strip(), item[1] + 'Wan') for item in r_list]
        # newline='' is what the csv module requires to avoid blank lines on
        # Windows; the explicit encoding keeps non-ASCII region names
        # portable across platforms.
        with open(self.csv_path, 'a', newline='', encoding='utf-8') as f:
            # Write each row exactly once, after the list is complete.
            csv.writer(f).writerows(rows)

    def main(self):
        """Crawl pages 1 through 10, reporting progress after each page."""
        for page in range(1, 11):
            url = self.url.format(page)
            self.get_page(url)
            print('Finished crawling page {}'.format(page))


# Run the crawler only when this file is executed as a script, not on import.
if __name__ == '__main__':
    spider = LianjiaSpider()
    spider.main()
```

 

Guess you like

Original source: www.cnblogs.com/cxiaolong/p/11234872.html