Python 爬取慕课网课程信息

基本要求:

 分析网页源码后,编写爬虫代码如下:

import requests
from bs4 import BeautifulSoup
from bs4 import BeautifulSoup
import requests
import io
import sys
import csv
import re
# Re-wrap stdout so anything printed is encoded as UTF-8.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')

# Output spreadsheet. It is written as gb18030 -- presumably so that
# spreadsheet software on a Chinese-locale system opens it correctly (TODO confirm).
f = open('abc.csv', mode='w', encoding='gb18030', newline='')
csv_writer = csv.writer(f)

# Header row: one label per column, in the order the data rows are written.
csv_writer.writerow([
	"标题", "人物", "岗位",
	"难度", "时长", "学习人数", "综合评分",
	"内容实用", "简洁易懂", "逻辑清晰",
	"姓名", "得分", "评论内容", "点赞数", "时间",
	"简介",
])

# Fetch a page and parse it.
def get_text(url):
	"""Download *url* and return it parsed as a BeautifulSoup document.

	Args:
		url: Absolute URL of the page to fetch.

	Returns:
		A ``BeautifulSoup`` object built with the ``lxml`` parser from the
		response body decoded as UTF-8.

	Raises:
		Whatever ``requests.get`` raises on a network failure.
	"""
	res = requests.get(url)
	# ``encoding`` (not ``encode``) is the attribute ``res.text`` consults when
	# decoding the body; assigning to ``encode`` had no effect at all.
	res.encoding = 'utf-8'
	return BeautifulSoup(res.text, 'lxml')


def get_page1(doc1):
	"""Fetch the course-list page of every category linked from *doc1*.

	Args:
		doc1: Parsed course-list page containing the
			``course-nav-row course-nav-skills clearfix`` navigation row.

	Returns:
		A list of parsed documents, one per navigation link after the first,
		each requested with ``&sort=pop`` appended.
	"""
	nav_row = doc1.find('div', {'class': 'course-nav-row course-nav-skills clearfix'})
	# The first anchor is skipped -- presumably the "all categories" entry
	# (TODO confirm against the live page).
	category_links = nav_row.findAll('a')[1:]
	# Each href loses its first 12 characters (the length of '/course/list')
	# before being appended to the absolute list URL.
	return [
		get_text('https://www.imooc.com/course/list' + link.attrs['href'][12:] + '&sort=pop')
		for link in category_links
	]
		

# Collect the score pages of the first courses on a list page.
def get_page(doc2):
	"""Fetch the ``coursescore`` page for up to three courses listed in *doc2*.

	Args:
		doc2: Parsed course-list page containing ``a.course-card`` links.

	Returns:
		A list of at most three parsed documents, one for each of the first
		course cards whose href starts with ``/learn``.
	"""
	score_docs = []
	for card in doc2.findAll('a', {'class': 'course-card'}):
		link = card.attrs['href']
		if not link.startswith('/learn'):
			continue
		# Swap the '/learn' prefix for '/coursescore', keeping the rest of the path.
		score_docs.append(get_text('https://www.imooc.com/coursescore' + link[6:]))
		if len(score_docs) == 3:
			break
	return score_docs

def get_introduceDoc(doc4):
	"""Fetch the ``/learn`` page for up to three courses listed in *doc4*.

	Args:
		doc4: Parsed course-list page containing ``a.course-card`` links.

	Returns:
		A list of at most three parsed documents, one for each of the first
		course cards whose href starts with ``/learn``.
	"""
	intro_docs = []
	for card in doc4.findAll('a', {'class': 'course-card'}):
		link = card.attrs['href']
		if not link.startswith('/learn'):
			continue
		# Rebuild the absolute URL from the site's '/learn' root plus the path tail.
		intro_docs.append(get_text('https://www.imooc.com/learn' + link[6:]))
		if len(intro_docs) == 3:
			break
	return intro_docs

def get_AjaxSourceData(doc):
	"""Fetch the ``AjaxCourseMembers`` response for up to three courses in *doc*.

	Args:
		doc: Parsed course-list page containing ``a.course-card`` links.

	Returns:
		A list of at most three parsed documents, one for each of the first
		course cards whose href starts with ``/learn``.
	"""
	member_docs = []
	for card in doc.findAll('a', {'class': 'course-card'}):
		link = card.attrs['href']
		if not link.startswith('/learn'):
			continue
		# Dropping the first 7 characters removes '/learn' plus one more
		# character (presumably '/'), leaving what is used as the ``ids`` value.
		member_docs.append(get_text('https://www.imooc.com/course/AjaxCourseMembers?ids=' + link[7:]))
		if len(member_docs) == 3:
			break
	return member_docs
	

def get_page_text(shortTitleDoc ,doc5 ,AjaxData):
	"""Extract one course's details and append them as a row to the CSV.

	The row has sixteen fields: title, teacher name, teacher job, four
	``static-item`` values, three evaluation sub-scores, five fields of the
	first user review, and the course description.

	Args:
		shortTitleDoc: Parsed ``coursescore`` page of the course.
		doc5: Parsed ``/learn`` page of the same course (source of the
			description).
		AjaxData: Parsed ``AjaxCourseMembers`` response; its text is searched
			for a ``numbers":"..."`` field.

	Returns:
		None. The row is written through the module-level ``csv_writer``.

	Raises:
		AttributeError: If an expected element is missing from a page.
		IndexError: If the page has fewer than four ``static-item`` blocks or
			fewer than three evaluation ``li`` entries.
	"""
	# --- Header section: title and teacher ---
	h2 = shortTitleDoc.find('div', {'class': 'hd clearfix'}).find('h2').text
	teacher = shortTitleDoc.find('div', {'class': 'teacher-info l'})
	name1 = teacher.find('span', {'class': 'tit'}).find('a').text
	job1 = teacher.find('span', {'class': 'job'}).text

	# First four static items; they are written under the header columns
	# "难度", "时长", "学习人数", "综合评分" in that order.
	details = shortTitleDoc.findAll('div', {'class': 'static-item'})
	metaValue = [
		details[index].find('span', {'class': 'meta-value'}).text
		for index in range(4)
	]

	# Prefer the number reported by the Ajax endpoint for the third value.
	# If the response carries no "numbers" field, keep the value scraped from
	# the page instead of failing with IndexError.
	numbers = re.search(r'numbers":"(.*?)"', str(AjaxData))
	if numbers is not None:
		metaValue[2] = numbers.group(1)

	# --- Middle section: the three evaluation sub-scores ---
	evaluation = shortTitleDoc.find('div', {'class': 'evaluation-info'})
	scoreItems = evaluation.findAll('li')
	metaValue1 = [scoreItems[index].find('span').text for index in range(3)]

	# --- Review section: only the first review is exported ---
	reviewList = shortTitleDoc.find('div', {'class': 'evaluation-list'})
	reviews = []
	if reviewList is not None:
		reviews = reviewList.findAll('div', {'class': 'evaluation evaluate'})
	if reviews:
		first = reviews[0]
		commentName = first.find('a', {'class': 'username'}).text
		commentScore = first.find('div', {'class': 'star-box'}).find('span').text
		commentContent = first.find('p', {'class': 'content'}).text
		commentPraise = first.find('div', {'class': 'info clearfix'}).find('em').text
		commentTime = first.find('span', {'class': 'time'}).text
	else:
		# A course without any review still gets a row; the review columns
		# are left blank rather than aborting the whole crawl.
		commentName = commentScore = commentContent = commentPraise = commentTime = ''

	# --- Description, taken from the /learn page ---
	content1 = doc5.find('div', {'class': 'course-description course-wrap'}).text

	csv_writer.writerow([
		h2, name1, job1,
		metaValue[0], metaValue[1], metaValue[2], metaValue[3],
		metaValue1[0], metaValue1[1], metaValue1[2],
		commentName, commentScore, commentContent, commentPraise, commentTime,
		content1,
	])

# Crawl every category page and export up to three courses from each.
# The CSV file is closed in ``finally`` so that rows already written are
# flushed to disk even when a request or a page parse fails midway.
try:
	doc3 = get_text('https://www.imooc.com/course/list')
	longPage1 = get_page1(doc3)
	for categoryDoc in longPage1:
		# The three helpers select the same course cards from the same page,
		# so their results line up position by position.
		shortTitleDoc1 = get_page(categoryDoc)
		shortTitleDoc3 = get_introduceDoc(categoryDoc)
		AjaxData = get_AjaxSourceData(categoryDoc)
		for scoreDoc, introDoc, membersDoc in zip(shortTitleDoc1, shortTitleDoc3, AjaxData):
			get_page_text(scoreDoc, introDoc, membersDoc)
finally:
	f.close()

  结果保存在CSV表格当中(部分数据)

猜你喜欢

转载自www.cnblogs.com/Crush999/p/12078305.html