# XPath and BeautifulSoup

# 练习数据抽取

import re, requests, json
from bs4 import BeautifulSoup
import lxml.html

"""
c = re.compile(r"([/]+)(\S+)")
re_data = "GET /index.html HTTP/1.1"

m = c.search(re_data)
print(m.group(2))


c = re.compile(r'(\w+)')

s = c.match('shd327sjahdajhsd87892ehawksd')
print(s.group())

"""

# Small hand-written HTML/XML fragment used as the input for the
# BeautifulSoup and XPath exercises in this file: a <ul> of <li>/<a> items
# followed by three <book> elements, each with a <title> and a <price>.
test_data = """
        <div>
            <ul>
                 <li class="item-0"><a href="link1.html" id="places_neighbours__row">9,596,960first item</a></li>
                 <li class="item-1"><a href="link2.html">second item</a></li>
                 <li class="item-inactive"><a href="link3.html">third item</a></li>
                 <li class="item-1"><a href="link4.html" id="places_neighbours__row">fourth item</a></li>
                 <li class="item-0"><a href="link5.html">fifth item</a></li>
                 <li class="good-0"><a href="link5.html">fifth good</a></li>
             </ul>
             <book>
                    <title lang="eng">Harry Potter</title>
                    <price>29.99</price>
            </book>
            <book>
                <title lang="zh">Learning XML</title>
                <price id="places_neighbours__row">39.95</price>
            </book>
            <book>
                <title>python</title>
                <price>40</price>
            </book>
         </div>
         """


# Default HTTP request headers.
# The previous literal `{""}` built a *set* containing one empty string, which
# is not a usable header mapping. This is a dict carrying the same User-Agent
# that the disabled requests example further down in this file sends.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36",
}






# Inline copy of an HTML country page (the form rows describe China).
# Presumably a snapshot of the page the disabled requests example in this file
# fetches; its links use the same /places/default/view/China-47 path.
# The literal is raw because the page text contains the regex `^(\d{6})$`, and
# `\d` is an invalid escape sequence in a normal (non-raw) string literal. It is
# the only backslash in the text, so the resulting value is unchanged.
r = r"""<!--[if HTML5]><![endif]-->
<!DOCTYPE html>
<!-- paulirish.com/2008/conditional-stylesheets-vs-css-hacks-answer-neither/ -->
<!--[if lt IE 7]><html class="ie ie6 ie-lte9 ie-lte8 ie-lte7 no-js" lang="en-us"> <![endif]-->
<!--[if IE 7]><html class="ie ie7 ie-lte9 ie-lte8 ie-lte7 no-js" lang="en-us"> <![endif]-->
<!--[if IE 8]><html class="ie ie8 ie-lte9 ie-lte8 no-js" lang="en-us"> <![endif]-->
<!--[if IE 9]><html class="ie9 ie-lte9 no-js" lang="en-us"> <![endif]-->
<!--[if (gt IE 9)|!(IE)]><!--> <html class="no-js" lang="en-us"> <!--<![endif]-->
<head>
<title>Example web scraping website</title>
  <!--[if !HTML5]>
      <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
  <![endif]-->
  <!-- www.phpied.com/conditional-comments-block-downloads/ -->
  <!-- Always force latest IE rendering engine
       (even in intranet) & Chrome Frame
       Remove this if you use the .htaccess -->
      
  <meta charset="utf-8" />

  <!-- http://dev.w3.org/html5/markup/meta.name.html -->
  <meta name="application-name" content="places" />

  <!--  Mobile Viewport Fix
        j.mp/mobileviewport & davidbcalhoun.com/2010/viewport-metatag
        device-width: Occupy full width of the screen in its current orientation
        initial-scale = 1.0 retains dimensions instead of zooming out if page height > device height
        user-scalable = yes allows the user to zoom in -->
  <meta name="viewport" content="width=device-width, initial-scale=1.0" />

  <link rel="shortcut icon" href="/places/static/images/favicon.ico" type="image/x-icon">
  <link rel="apple-touch-icon" href="/places/static/images/favicon.png">

  <!-- All JavaScript at the bottom, except for Modernizr which enables
       HTML5 elements & feature detects -->
  <script src="/places/static/js/modernizr.custom.js"></script>

  <!-- include stylesheets -->
  

  <script type="text/javascript"><!--
    // These variables are used by the web2py_ajax_init function in web2py_ajax.js (which is loaded below).
    var w2p_ajax_confirm_message = "Are you sure you want to delete this object?";
    var w2p_ajax_disable_with_message = "Working...";
    var w2p_ajax_date_format = "%Y-%m-%d";
    var w2p_ajax_datetime_format = "%Y-%m-%d %H:%M:%S";
    var ajax_error_500 = 'An error occured, please <a href="/places/default/view/China-47">reload</a> the page'
    //--></script>

<meta name="keywords" content="web2py, python, web scraping" />
<meta name="generator" content="Web2py Web Framework" />
<meta name="author" content="Richard Penman" />
<script src="/places/static/js/jquery.js" type="text/javascript"></script><link href="/places/static/css/calendar.css" rel="stylesheet" type="text/css" /><script src="/places/static/js/calendar.js" type="text/javascript"></script><script src="/places/static/js/web2py.js" type="text/javascript"></script><link href="/places/static/css/web2py.css" rel="stylesheet" type="text/css" /><link href="/places/static/css/bootstrap.min.css" rel="stylesheet" type="text/css" /><link href="/places/static/css/bootstrap-responsive.min.css" rel="stylesheet" type="text/css" /><link href="/places/static/css/style.css" rel="stylesheet" type="text/css" /><link href="/places/static/css/web2py_bootstrap.css" rel="stylesheet" type="text/css" />


  

  <!-- uncomment here to load jquery-ui
       <link rel="stylesheet" href="http://ajax.googleapis.com/ajax/libs/jqueryui/1.10.3/themes/ui-lightness/jquery-ui.css" type="text/css" media="all" />
       <script src="http://ajax.googleapis.com/ajax/libs/jqueryui/1.10.3/jquery-ui.min.js" type="text/javascript"></script>
       uncomment to load jquery-ui //-->
  <noscript><link href="/places/static/css/web2py_bootstrap_nojs.css" rel="stylesheet" type="text/css" /></noscript>
  
</head>

<body>
  <!-- Navbar ================================================== -->
  <div class="navbar navbar-inverse">
    <div class="flash"></div>
    <div class="navbar-inner">
      <div class="container">
        
        <!-- the next tag is necessary for bootstrap menus, do not remove -->
        <button type="button" class="btn btn-navbar" data-toggle="collapse" data-target=".nav-collapse" style="display:none;">
          <span class="icon-bar"></span>
          <span class="icon-bar"></span>
          <span class="icon-bar"></span>
        </button>
        
        <ul id="navbar" class="nav pull-right"><li class="dropdown"><a class="dropdown-toggle" data-toggle="dropdown" href="#" rel="nofollow">Log In</a><ul class="dropdown-menu"><li><a href="/places/default/user/register?_next=/places/default/view/China-47" rel="nofollow"><i class="icon icon-user glyphicon glyphicon-user"></i> Sign Up</a></li><li class="divider"></li><li><a href="/places/default/user/login?_next=/places/default/view/China-47" rel="nofollow"><i class="icon icon-off glyphicon glyphicon-off"></i> Log In</a></li></ul></li></ul>
        <div class="nav">
          
          <ul class="nav"><li class="web2py-menu-first"><a href="/places/default/index">Home</a></li><li class="web2py-menu-last"><a href="/places/default/search">Search</a></li></ul>
          
        </div><!--/.nav-collapse -->
      </div>
    </div>
  </div><!--/top navbar -->

  <div class="container">
    <!-- Masthead ================================================== -->
      
    <header class="mastheader row" id="header">
        <div class="span12">
            <div class="page-header">
                <h1>
                    Example web scraping website
                    <small></small>
                </h1>
            </div>
        </div>
    </header>
   

    <section id="main" class="main row">
        

        <div class="span12">
            
            

<form action="#" enctype="multipart/form-data" method="post"><table><tr id="places_national_flag__row"><td class="w2p_fl"><label class="readonly" for="places_national_flag" id="places_national_flag__label">National Flag: </label></td><td class="w2p_fw"><img src="/places/static/images/flags/cn.png" /></td><td class="w2p_fc"></td></tr><tr id="places_area__row"><td class="w2p_fl"><label class="readonly" for="places_area" id="places_area__label">Area: </label></td><td class="w2p_fw">9,596,960 square kilometres</td><td class="w2p_fc"></td></tr><tr id="places_population__row"><td class="w2p_fl"><label class="readonly" for="places_population" id="places_population__label">Population: </label></td><td class="w2p_fw">1,330,044,000</td><td class="w2p_fc"></td></tr><tr id="places_iso__row"><td class="w2p_fl"><label class="readonly" for="places_iso" id="places_iso__label">Iso: </label></td><td class="w2p_fw">CN</td><td class="w2p_fc"></td></tr><tr id="places_country__row"><td class="w2p_fl"><label class="readonly" for="places_country" id="places_country__label">Country: </label></td><td class="w2p_fw">China</td><td class="w2p_fc"></td></tr><tr id="places_capital__row"><td class="w2p_fl"><label class="readonly" for="places_capital" id="places_capital__label">Capital: </label></td><td class="w2p_fw">Beijing</td><td class="w2p_fc"></td></tr><tr id="places_continent__row"><td class="w2p_fl"><label class="readonly" for="places_continent" id="places_continent__label">Continent: </label></td><td class="w2p_fw"><a href="/places/default/continent/AS">AS</a></td><td class="w2p_fc"></td></tr><tr id="places_tld__row"><td class="w2p_fl"><label class="readonly" for="places_tld" id="places_tld__label">Tld: </label></td><td class="w2p_fw">.cn</td><td class="w2p_fc"></td></tr><tr id="places_currency_code__row"><td class="w2p_fl"><label class="readonly" for="places_currency_code" id="places_currency_code__label">Currency Code: </label></td><td class="w2p_fw">CNY</td><td 
class="w2p_fc"></td></tr><tr id="places_currency_name__row"><td class="w2p_fl"><label class="readonly" for="places_currency_name" id="places_currency_name__label">Currency Name: </label></td><td class="w2p_fw">Yuan Renminbi</td><td class="w2p_fc"></td></tr><tr id="places_phone__row"><td class="w2p_fl"><label class="readonly" for="places_phone" id="places_phone__label">Phone: </label></td><td class="w2p_fw">86</td><td class="w2p_fc"></td></tr><tr id="places_postal_code_format__row"><td class="w2p_fl"><label class="readonly" for="places_postal_code_format" id="places_postal_code_format__label">Postal Code Format: </label></td><td class="w2p_fw">######</td><td class="w2p_fc"></td></tr><tr id="places_postal_code_regex__row"><td class="w2p_fl"><label class="readonly" for="places_postal_code_regex" id="places_postal_code_regex__label">Postal Code Regex: </label></td><td class="w2p_fw">^(\d{6})$</td><td class="w2p_fc"></td></tr><tr id="places_languages__row"><td class="w2p_fl"><label class="readonly" for="places_languages" id="places_languages__label">Languages: </label></td><td class="w2p_fw">zh-CN,yue,wuu,dta,ug,za</td><td class="w2p_fc"></td></tr><tr id="places_neighbours__row"><td class="w2p_fl"><label class="readonly" for="places_neighbours" id="places_neighbours__label">Neighbours: </label></td><td class="w2p_fw"><div><a href="/places/default/iso/LA">LA </a><a href="/places/default/iso/BT">BT </a><a href="/places/default/iso/TJ">TJ </a><a href="/places/default/iso/KZ">KZ </a><a href="/places/default/iso/MN">MN </a><a href="/places/default/iso/AF">AF </a><a href="/places/default/iso/NP">NP </a><a href="/places/default/iso/MM">MM </a><a href="/places/default/iso/KG">KG </a><a href="/places/default/iso/PK">PK </a><a href="/places/default/iso/KP">KP </a><a href="/places/default/iso/RU">RU </a><a href="/places/default/iso/VN">VN </a><a href="/places/default/iso/IN">IN </a></div></td><td class="w2p_fc"></td></tr></table><div style="display:none;"><input name="id" 
type="hidden" value="2714591" /></div></form>

<a href="/places/default/edit/China-47">Edit</a>

            
        </div>

        
    </section><!--/main-->

    <!-- Footer ================================================== -->
    <div class="row">
        <footer class="footer span12" id="footer">
        </footer>
    </div>

  </div> <!-- /container -->

  <!-- The javascript =============================================
       (Placed at the end of the document so the pages load faster) -->
  <script src="/places/static/js/bootstrap.min.js"></script>
  <script src="/places/static/js/web2py_bootstrap.js"></script>
  <!--[if lt IE 7 ]>
      <script src="/places/static/js/dd_belatedpng.js"></script>
      <script> DD_belatedPNG.fix('img, .png_bg'); //fix any <img> or .png_bg background-images </script>
      <![endif]-->
</body>
</html>


"""

"""
/:从根标签开始
//:从当前节点起匹配任意层级的后代节点;写在表达式开头时表示在整个文档范围内查找
*:通配符,选择所有
//div/book[1]/title:选择div下第一个book标签的title元素,注意这里下标是从1开始不是0
//div/book/title[@lang='zh']:选择title属性含有lang且内容是zh的title元素
//div/book/title //book/title //title具有相同的结果,因为使用相对路径最终都指向title
//book/title[@*]:将含有任意属性的title选出来
//book/title/@*:获取所有title的属性值,结果是个列表如:['eng', 'zh']
//book/title/text():内置函数text()直接选中title标签的内容,而不是标签,如['Harry Potter', 'Learning XML', 'python']
//a[@href='link1.html' and @id='places_neighbours__row']:把href属性为link1.html和id属性为places_neighbours__row的a标签选中
//a[@href='link1.html' or @id='places_neighbours__row']:把href属性为link1.html或id属性为places_neighbours__row的a标签选中
//div/book[price>39]/title:将book标签的子标签price的内容大于39的取出来,这里取出来的是个book标签,相当于一个过滤条件,然后把符合要求的book标签下的title标签取出来
//li[starts-with(@class,'item')]/a:选择标签属性class的值的前缀是以'item'开头的所有li标签下的a标签
//title[contains(@lang,'eng')]:选择属性lang的值里包含'eng'的所有title标签,包含就可以,无论前后还有没有其他字符
//div/descendant::* :选择div标签内的所有后代节点(子、孙等各层级)
//book/ancestor::* :选择book标签的所有祖先节点(父、祖父等各层级)
"""



"""
# c = re.compile(r'(\d+[,]\d+[,]\d+)')
# c = re.compile(r'([0-9+,])')
# s = c.findall(test_data)
# print(s)


# 创建对象
# soup = BeautifulSoup(test_data, 'lxml')
# 标签和属性
# print(soup.a)# soup.a是抽取第一个a标签及里面的内容
# print(soup.a["href"])# 获取这个a标签里面的指定属性的值
# print(soup.a.contents)# 获取这个a标签里面的内容,列表形式
# print(soup.a.text)# 获取这个a标签里面的内容

# a_list = soup.find_all("a")  # 获取所有a标签及里面的内容,组成一个列表
# print(a_list)
# for a in a_list:
#     print(a['href'])# 获取所有a标签的'href'属性的值
#     print(a.text)# 获取所有a标签的内容

# 查找所有具有指定属性值的"a"标签
# a_place = soup.find_all("a", id="places_neighbours__row")
# for i in a_place:
#     print(i.text)
# print(a_place)

# 查找具有指定属性的所有标签"id":"places_neighbours__row"
# attrs_place = soup.find_all(attrs={"id": "places_neighbours__row"})
# print(attrs_place)
# print(soup.ul)
"""
# ==================================================================================
# Disabled example: fetch the China country page with requests, then use
# BeautifulSoup to locate the row with id 'places_area__row' and the cell with
# class 'w2p_fw' inside it. It sits inside a bare string expression, so no
# request is made when this file runs.
# NOTE(review): `area.content` on the return line looks like a typo for
# `.contents` or `.text` (both appear in the disabled BeautifulSoup exercise
# earlier in this file) — confirm before re-enabling.
"""
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36",
}

r = requests.get('http://example.webscraping.com/places/default/view/China-47', headers=headers).content
def parse_bs4(html_str):
    soup = BeautifulSoup(html_str, 'lxml')

    tr = soup.find(attrs={'id':'places_area__row'})
    area = tr.find(attrs={'class':'w2p_fw'})
    return area.content
area = parse_bs4(r)
print(area)
"""



# with open('html/china.html','wb') as f:
#     f.write(result.content)

# area = result.find_all(attrs={'class':'w2p_fw'})
# for i in area:
#     a = i.td
#     print(a)
# area = result.find_all('td',class_='w2p_fw')
# print(area)

# print(result.text)
# (web-page residue: "猜你喜欢" / "You may also like")