爬虫基础教程 —— 4

 
 

这个单线程小爬虫以携程为例,爬取携程热门游的一些简单信息

import re
import time

from bs4 import BeautifulSoup
from lxml import etree

class XieCheng(object):
    """Single-threaded crawler that scrapes basic information about hot
    package tours from Ctrip (http://vacations.ctrip.com/).

    Flow: homepage -> hot-destination links -> product-list pages ->
    per-product detail pages -> extracted data dict.
    """

    def __init__(self):
        # Start URL plus a browser-like User-Agent so the site serves
        # the normal desktop page.
        self.url = 'http://vacations.ctrip.com/'
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
        }

    def send_request(self, link):
        """GET ``link`` and return the response body as text."""
        # Local import: `requests` is a third-party dependency the module
        # header does not (and cannot safely) pull in at import time.
        import requests
        # Bug fix: the original built self.headers but never sent it;
        # pass it so the request carries the browser User-Agent.
        response_html = requests.get(url=link, headers=self.headers).text
        # time.sleep(2)  # optional politeness delay between requests
        return response_html

    def parse_place_link(self, ParseHtml):
        """Yield absolute URLs of the hot-destination links found in the
        homepage HTML ``ParseHtml``.

        Yields nothing (instead of raising IndexError, as the original
        did) when the hot-destination section is absent.
        """
        # First isolate the "hot destinations" <dt> section, then pull
        # every href out of that fragment only.
        section_re = re.compile(
            r' <dt>热门目的地旅游</dt>(.*?)<a target="_blank" href="/tours">更多目的地</a>',
            re.S,
        )
        sections = section_re.findall(ParseHtml)
        if not sections:
            # Robustness fix: page layout changed / section missing.
            return
        for link in re.findall(r'href="(.*?)"', sections[0]):
            # Links in this section are site-relative paths.
            yield 'http://vacations.ctrip.com' + str(link)

    def parse_detail_page_link(self, ArticleHtml):
        """Yield absolute URLs of the product detail pages linked from a
        destination listing page ``ArticleHtml``."""
        for page_link in re.findall(r'<h2 class="product_title"><a href="(.*?)"', ArticleHtml):
            # hrefs here are protocol-relative ("//host/path").
            yield "http:" + str(page_link)

    def parse_page_info(self, Response_html):
        """Extract feature, itinerary, price and title from one product
        detail page.

        Returns a dict with keys FEATURE/DETAILED/PRICE/TITLE, or None
        when any required fragment is missing (any exception during
        parsing is swallowed, matching the original best-effort design).
        """
        try:
            soup = BeautifulSoup(Response_html, 'lxml')
            if 'product_feature' in Response_html:
                product_feature = soup.select('.product_feature')[0].get_text().replace('\n', '').replace('\t', '')
            else:
                product_feature = ''
            # Keep only CJK characters and digits from the itinerary block.
            detailed = ','.join(re.findall(r'[\u4e00-\u9fa50-9]+', re.findall(r'<!--详细行程Start-->(.*?)<!--详细行程End-->', Response_html, re.S)[0]))
            if "minPrice" in Response_html:
                price = re.findall(r'"minPrice":(.*?),', Response_html)[0] + '元'
            elif 'ProductMinPrice' in Response_html:
                # Bug fix: original pattern was '"ProductMinPrice:"(.*?)",'
                # (quote/colon transposed) and could never match the
                # '"ProductMinPrice":"..."' JSON key.
                price = re.findall(r'"ProductMinPrice":"(.*?)",', Response_html)[0] + '元'
            else:
                price = ''
            title = re.findall(r'<h1 itemprop="name">(.*?)<', Response_html, re.S)[0].strip()
            data = {
                'FEATURE': product_feature,
                'DETAILED': detailed,
                'PRICE': price,
                'TITLE': title,
            }
            return data
        except Exception:
            # Best-effort page: malformed/unexpected pages yield None.
            # print('----error----', error)
            return None

    def main(self):
        """Crawl homepage -> destinations -> detail pages, printing each
        extracted data dict (or None for unparseable pages)."""
        html = self.send_request(self.url)
        for link in self.parse_place_link(html):
            detail_page = self.send_request(link)
            for detail_link in self.parse_detail_page_link(detail_page):
                PageHtml = self.send_request(detail_link)
                data = self.parse_page_info(PageHtml)
                print(data)

if __name__ == "__main__":
    # Script entry point: build the crawler and start crawling.
    spider = XieCheng()
    spider.main()

爬虫比较简单,数据类型也比较单一,但是流程是差不多的


猜你喜欢

转载自blog.csdn.net/redpintings/article/details/80185645
今日推荐