# -*- coding: utf-8 -*-
"""Crawl news headlines and article bodies from an Eastmoney listing page.

Fetches the listing page, finds every <p class="title"> element (each one
wraps the <a> link to a news story), follows each link, and prints the
article's <h1> headline followed by every <p> paragraph of its body.
"""
import requests                      # HTTP client for the GET requests
from bs4 import BeautifulSoup        # HTML parser for page analysis

# Listing page on Eastmoney (Oriental Fortune Network).
LIST_URL = 'http://finance.eastmoney.com/news/cywjh.html'

# requests.get returns the raw page file; parsing happens below.
# A timeout keeps the script from hanging forever on a dead connection.
list_resp = requests.get(LIST_URL, timeout=10)
soup = BeautifulSoup(list_resp.content, 'html.parser')

# =============================================================================
# Page parsing
# =============================================================================
# The news-report titles live in <p class="title"> elements; this selector
# was determined by inspecting the HTML source with the browser's developer
# tools.
#
# Notes:
#   find()     returns the FIRST element matching the criteria
#   find_all() returns ALL elements matching the criteria
p_title = soup.find_all('p', attrs={'class': 'title'})

# Loop over every news headline on the listing page.
for info in p_title:
    anchor = info.a
    if anchor is None:               # guard: a title <p> without an <a> link
        continue
    url = anchor.get('href')
    if not url:                      # guard: an <a> without an href attribute
        continue

    # Detail-page GET request and parse (own names so the listing-page
    # response above is not clobbered inside the loop).
    detail_resp = requests.get(url, timeout=10)
    detail = BeautifulSoup(detail_resp.content, 'html.parser')

    # Detail-page analysis: the first <h1> is the article headline.
    headline = detail.find('h1')
    if headline is None:             # guard: detail page with a different layout
        continue

    # Print the headline, then every paragraph of the article body.
    print(headline.text)
    for p in detail.find_all('p'):
        print(p.text)
    print('-------------------------------------')
AI&BigData one: Crawling HTML page content using Python crawler
You may also like
Origin http://43.154.161.224:23101/article/api/json?id=325588269&siteId=291194637
Recommended
Ranking