AI & Big Data, Part 1: Crawling HTML page content with a Python crawler

# -*- coding: utf-8 -*-  

'''Crawl HTML page content with Python.'''

# import requests package
import requests
# BeautifulSoup for page parsing
from bs4 import BeautifulSoup

# Crawl the news listing page, follow every headline link, and print each
# article's title and paragraphs to stdout.
#
# Seconds to wait on each HTTP request. Without an explicit timeout,
# requests can block indefinitely on a stalled connection.
REQUEST_TIMEOUT = 10

# requests.get returns the raw page, which is parsed further below.
req = requests.get(
    'http://finance.eastmoney.com/news/cywjh.html',  # Oriental Fortune Network
    timeout=REQUEST_TIMEOUT,
)
# Parse the listing page into a navigable tree.
bs = BeautifulSoup(req.content, 'html.parser')
# =============================================================================
# page parsing
# =============================================================================
# Collect every <p class="title"> element; on this page those paragraphs hold
# the news headlines. (Which tag/class to select was determined by inspecting
# the page's HTML with the browser's developer tools.)
p_title = bs.find_all('p', attrs={'class': 'title'})
# Visit the detail page behind each headline.
for info in p_title:
    # The article URL is the href of the <a> inside the title paragraph.
    # A paragraph with no link, or a link with no href, cannot be followed,
    # so it is skipped instead of raising AttributeError / passing None on.
    link = info.a
    url = link.get('href') if link is not None else None
    if not url:
        continue
    # Fetch and parse the article detail page.
    req = requests.get(url, timeout=REQUEST_TIMEOUT)
    bs1 = BeautifulSoup(req.content, 'html.parser')
    # =============================================================================
    # Notes:
    # find() returns the first element that matches, or None if none does.
    # find_all() returns every element that matches.
    # =============================================================================
    heading = bs1.find('h1')  # the first h1 tag is taken as the headline
    if heading is None:
        # No headline on this page; skip it rather than crash on None.text.
        continue
    title = heading.text
    p_all = bs1.find_all('p')  # every p tag on the detail page
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(title)
    for p in p_all:
        # Print each paragraph of the article.
        print(p.text)
    print('-------------------------------------')

Guess you like

Origin http://43.154.161.224:23101/article/api/json?id=325588269&siteId=291194637