Python 爬取2014年世界杯球队及球员信息

1. 遍历所有国家(http://worldcup.2014.163.com/team_group/),将链接地址保存到 dataframe,再遍历。


2. 遍历到各球队信息页面,读取球队基本信息,保存到表。(如巴西队)


-- Basic profile of each team, scraped from the team's page.
CREATE TABLE Team(
cid INT,						-- team id (row index assigned by the crawler)
country VARCHAR(200),			-- team name (Chinese)
country_en VARCHAR(200),		-- team name (English)
chief_coach VARCHAR(200),
star_player VARCHAR(200),
star_player_his VARCHAR(200),	-- historical star player
best_ever VARCHAR(500)			-- best World Cup result (trailing comma removed: invalid in T-SQL)
)
GO
3. 同一页面往下移动,可看到该球队各场比赛的信息统计,保存到表。


-- Per-match statistics of each team.
CREATE TABLE TeamMatch(
cid INT,					-- team id
country VARCHAR(200),		-- team name
score INT,					-- team's goals in this match
match_country VARCHAR(200),	-- opponent
match_score INT,			-- opponent's goals
jinqiu INT,
shiqiu INT,
shemen INT,
shezheng INT,
jiaoqiu INT,
fangui INT,
chuanqiu INT,
qiangduan INT,
yuewei INT,
huangpai INT,
hongpai INT,
kongqiu DECIMAL(6,2)		-- trailing comma removed: invalid in T-SQL
)
GO
4. 同一页面再往下移动,可看到该球队-队员的信息统计,保存到表。


-- Per-player statistics within each team.
CREATE TABLE TeamMatchPlayer(
cid INT,				-- team id
country VARCHAR(200),	-- team name
num INT,				-- shirt number
name VARCHAR(200),
position VARCHAR(20),
chuchang INT,
shijian INT,
jinqiu INT,
zhugong INT,
shemen INT,
shezheng INT,
fangui INT,
qiangduan INT,
huangpai INT,
hongpai INT,
puqiu INT,				-- renamed from buqiu: the crawler writes column 'puqiu', so the append would fail
shiqiu INT				-- trailing comma removed: invalid in T-SQL
)
GO

5. 球队信息爬取完成,接下来再爬取所有球员的信息(http://worldcup.2014.163.com/player_position/),保存到表。其中 dataframe 也把球员的页面链接地址也保存了。为了不分页爬取,直接爬取“所有”的数据。


-- Aggregate statistics of every player, plus the link to the player's page.
CREATE TABLE PlayerMatch(
country VARCHAR(200),
name VARCHAR(200),
link VARCHAR(500),		-- relative URL of the player's detail page
num INT,				-- shirt number
chuchang INT,
shijian INT,
jinqiu INT,
zhugong INT,
shemen INT,
shemen_s INT,
chuanqiu INT,
chuanqiu_s INT,
chuqiu INT,
qiangduan INT,
qiangduan_s INT,
fangui INT,
huangpai INT,
hongpai INT,
pujiu INT
)
GO
6. 遍历到球员页面,将球员的基本信息保存到表。(如梅西)


-- Basic profile of each player.
CREATE TABLE PlayerInfo(
country VARCHAR(200),
name VARCHAR(200),
english_name VARCHAR(200),
birth_date DATE,
height DECIMAL(6,2),
[weight] DECIMAL(6,2),	-- bracketed: WEIGHT is a reserved word in T-SQL
position VARCHAR(20),
match_infor VARCHAR(500)
)
GO
7. 同一页面往下移动,可以看到该球员在各场比赛的信息,保存到表。


-- Per-match statistics of each player.
CREATE TABLE PlayerMatchDetail(
name VARCHAR(200),			-- player name
country VARCHAR(200),		-- player's team
score INT,					-- team's goals in this match
match_country VARCHAR(200),	-- opponent
match_score INT,			-- opponent's goals
chuchang_t  VARCHAR(20),
chuchang  INT,
jinqiu INT,
zhugong INT,
shemen INT,
shemen_s INT,
jiaoqiu INT,
fangui INT,
qiangduan INT,
huangpai INT,
hongpai INT,
puqiu INT,
shiqiu INT					-- trailing comma removed: invalid in T-SQL
)
GO

完成了,现在分两个脚本运行爬取。一个脚本爬取球队信息,保存到前3张表;另一个脚本爬取球员信息,保存到后3张表。

爬取球队信息:

# -*- coding: utf-8 -*-
# python 3.5

import re
import time
import sqlalchemy
import pandas as pd
import urllib.request
from string import digits
from bs4 import BeautifulSoup

# Site root; all scraped links below are relative to this host.
url = "http://worldcup.2014.163.com"
# SQL Server connection (pymssql driver) used by every to_sql() call below.
engine = sqlalchemy.create_engine("mssql+pymssql://kk:kk@HZC/Myspider") 
# Global index of teams: numeric id, display name and relative page URL.
df_country = pd.DataFrame(columns=['cid','country','rel_url'])

# Collect the URL of every team.
def get_team():
	"""Populate the global df_country with (cid, name, relative URL) for every team."""
	markup = urllib.request.urlopen(url + "/team_continent/").read().decode()
	page = BeautifulSoup(markup, "html.parser")
	items = page.find('div', class_="sect_teamgather sect").find_all('li')
	for idx, item in enumerate(items):
		anchor = item.find('a')
		df_country.loc[idx, 'cid'] = idx
		df_country.loc[idx, 'country'] = anchor.get_text().strip()
		df_country.loc[idx, 'rel_url'] = anchor['href'].strip()
		

# Basic team profile.
def get_team_info(cid,country,soup):
	"""Parse the team profile box on the team page and append one row to Team."""
	info = soup.find('div', class_="tx_box").find('div', class_="sub_info").find_all('p')
	grab = lambda i: info[i].get_text().split('：')[1].strip()
	row = pd.DataFrame(columns=['cid','country','country_en','chief_coach','star_player','star_player_his','best_ever'])
	row.loc[cid, 'cid'] = cid
	row.loc[cid, 'country'] = country
	row.loc[cid, 'country_en'] = grab(0)
	row.loc[cid, 'chief_coach'] = grab(1)
	row.loc[cid, 'star_player'] = grab(2)
	# NOTE(review): the 4th <p> is assumed to be the historical best result and
	# the 5th the historical star player (order taken from the original scrape) — verify against the live page.
	row.loc[cid, 'best_ever'] = grab(3)
	row.loc[cid, 'star_player_his'] = grab(4)
	row.to_sql("Team", engine, index=False, if_exists='append')
	
	
# Per-match statistics of one team.
def get_team_match_info(cid,country,soup):
	"""Parse the first statistics table on the team page and append its rows to TeamMatch."""
	cols = ['cid','country','score','match_country','match_score','jinqiu','shiqiu',
		'shemen','shezheng','jiaoqiu','fangui','chuanqiu','qiangduan','yuewei','huangpai','hongpai','kongqiu']
	table = soup.find_all('table', class_="wctable wctable_statdata")[0]
	records = []
	for tr in table.find('tbody').find_all('tr'):
		if tr.find('th'):
			continue  # skip header rows
		td = tr.find_all('td')
		# First cell looks like "<team><score>-<score><team>"; split on '-' and
		# pick our side by matching the team name.
		halves = td[0].get_text().replace(' ', '').split('-')
		left, right = halves[0], halves[1]
		own_first = bool(re.search(country, left))
		own = left if own_first else right
		other = right if own_first else left
		rec = {
			'cid': cid,
			'country': country,
			'score': int(''.join(filter(str.isdigit, own))),
			'match_country': other.translate(str.maketrans('', '', digits)),
			'match_score': int(''.join(filter(str.isdigit, other))),
		}
		# Remaining 12 cells map 1:1 onto the stat columns.
		values = [td[i].get_text().strip() for i in range(1, 13)]
		rec.update(zip(cols[5:], values))
		records.append(rec)
	pd.DataFrame(records, columns=cols).to_sql("TeamMatch", engine, index=False, if_exists='append')
	
	
# Per-player statistics of one team.
def get_team_match_player_info(cid,country,soup):
	"""Parse the second statistics table on the team page and append its rows to TeamMatchPlayer."""
	cols = ['cid','country','num','name','position','chuchang','shijian',
		'jinqiu','zhugong','shemen','shezheng','fangui','qiangduan','huangpai','hongpai','puqiu','shiqiu']
	table = soup.find_all('table', class_="wctable wctable_statdata")[1]
	records = []
	for tr in table.find('tbody').find_all('tr'):
		td = tr.find_all('td')
		# 15 cells map 1:1 onto the columns after cid/country.
		values = [td[i].get_text().strip() for i in range(15)]
		rec = dict(zip(cols[2:], values))
		rec['cid'] = cid
		rec['country'] = country
		records.append(rec)
	pd.DataFrame(records, columns=cols).to_sql("TeamMatchPlayer", engine, index=False, if_exists='append')
	
	
# Crawl every team page.
def get_team_all_info():
	"""Visit each team page listed in df_country and store its three data sets."""
	if df_country.empty:
		return
	for _, team in df_country.iterrows():
		print("%s  %s" %(team['country'],team['rel_url']))
		markup = urllib.request.urlopen(url + team['rel_url']).read().decode()
		page = BeautifulSoup(markup, "html.parser")
		get_team_info(team['cid'], team['country'], page)
		get_team_match_info(team['cid'], team['country'], page)
		get_team_match_player_info(team['cid'], team['country'], page)
		time.sleep(2)  # throttle requests

# Entry point: build the team index, then crawl every team page.
get_team()
get_team_all_info()
爬取球员信息:
# -*- coding: utf-8 -*-
# python 3.5

import re
import time
import sqlalchemy
import pandas as pd
import urllib.request
from string import digits
from bs4 import BeautifulSoup

# Site root; all scraped links below are relative to this host.
url = "http://worldcup.2014.163.com"
# SQL Server connection (pymssql driver) used by every to_sql() call below.
engine = sqlalchemy.create_engine("mssql+pymssql://kk:kk@HZC/Myspider") 
# Global roster of players: aggregate stats plus the link to each player's page.
player = pd.DataFrame(columns=['country','name','link','num','chuchang','shijian','jinqiu','zhugong','shemen'
,'shemen_s','chuanqiu','chuanqiu_s','chuqiu','qiangduan','qiangduan_s','fangui','huangpai','hongpai','pujiu'])


# Scrape the full player list (one page, no pagination).
def get_all_player_info():
	"""Fill the global `player` frame from the player-position table and append it to PlayerMatch."""
	markup = urllib.request.urlopen(url + "/player_position/").read().decode()
	page = BeautifulSoup(markup, "html.parser")
	rows = page.find('div', class_="panels").find_all('div')[0].find('tbody').find_all('tr')
	for idx, row in enumerate(rows):
		td = row.find_all('td')
		# Some cells hold a "A/B"-shaped pair; [1] and [0] are stored separately
		# (order taken from the site's format — verify against the live page).
		shemen = td[7].get_text().split('/')
		chuanqiu = td[8].get_text().split('/')
		qiangduan = td[10].get_text().split('/')
		player.loc[idx, 'num'] = td[0].get_text().strip()
		player.loc[idx, 'name'] = td[1].get_text().strip()
		player.loc[idx, 'link'] = td[1].find('a')['href'].strip()
		player.loc[idx, 'country'] = td[2].get_text().strip()
		player.loc[idx, 'chuchang'] = td[3].get_text().strip()
		player.loc[idx, 'shijian'] = td[4].get_text().strip()
		player.loc[idx, 'jinqiu'] = td[5].get_text().strip()
		player.loc[idx, 'zhugong'] = td[6].get_text().strip()
		player.loc[idx, 'shemen'] = shemen[1].strip()
		player.loc[idx, 'shemen_s'] = shemen[0].strip()
		player.loc[idx, 'chuanqiu'] = chuanqiu[1].strip()
		player.loc[idx, 'chuanqiu_s'] = chuanqiu[0].strip()
		player.loc[idx, 'chuqiu'] = td[9].get_text().strip()
		player.loc[idx, 'qiangduan'] = qiangduan[1].strip()
		player.loc[idx, 'qiangduan_s'] = qiangduan[0].strip()
		player.loc[idx, 'fangui'] = td[11].get_text().strip()
		player.loc[idx, 'huangpai'] = td[12].get_text().strip()
		player.loc[idx, 'hongpai'] = td[13].get_text().strip()
		player.loc[idx, 'pujiu'] = td[14].get_text().strip()
	player.to_sql("PlayerMatch", engine, index=False, if_exists='append')

def get_player_base_info(name,country,soup):
	"""Parse a player's profile box and append one row to PlayerInfo."""
	box = soup.find('div', class_="tx_box")
	fields = box.find('div', class_="main_info").find_all('p')
	grab = lambda i: fields[i].get_text().split('：')[1].strip()
	summary = box.find('div', class_="sub_info")
	record = pd.DataFrame(columns=['country','name','english_name','birth_date','height','weight','position','match_infor'])
	record.loc[0, 'country'] = country
	record.loc[0, 'name'] = name
	record.loc[0, 'english_name'] = grab(0)
	record.loc[0, 'birth_date'] = grab(1)
	record.loc[0, 'height'] = grab(2)
	record.loc[0, 'weight'] = grab(3)
	record.loc[0, 'position'] = grab(4)
	# Strip whitespace and the label characters, then reformat the separators.
	record.loc[0, 'match_infor'] = re.sub(r"[\n\t\s\比赛信息]*", "", summary.get_text()).replace("/0","; ")
	record.to_sql("PlayerInfo", engine, index=False, if_exists='append')

	
def get_player_match_info(name,country,soup):
	"""Parse a player's per-match statistics table and append the rows to PlayerMatchDetail."""
	cols = ['name','country','score','match_country','match_score','chuchang_t','chuchang',
		'jinqiu','zhugong','shemen','shemen_s','jiaoqiu','fangui','qiangduan','huangpai','hongpai','puqiu','shiqiu']
	table = soup.find_all('table', class_="wctable wctable_statdata")[0]
	records = []
	for tr in table.find('tbody').find_all('tr'):
		if tr.find('th'):
			continue  # skip header rows
		td = tr.find_all('td')
		# First cell looks like "<team><score>-<score><team>"; pick our side by name match.
		halves = td[0].get_text().replace(' ', '').split('-')
		left, right = halves[0], halves[1]
		own_first = bool(re.search(country, left))
		own = left if own_first else right
		other = right if own_first else left
		shemen = td[5].get_text().split('/')
		records.append({
			'name': name,
			'country': country,
			'score': int(''.join(filter(str.isdigit, own))),
			'match_country': other.translate(str.maketrans('', '', digits)),
			'match_score': int(''.join(filter(str.isdigit, other))),
			'chuchang_t': td[1].get_text().strip(),
			'chuchang': td[2].get_text().strip(),
			'jinqiu': td[3].get_text().strip(),
			'zhugong': td[4].get_text().strip(),
			'shemen': shemen[1].strip(),
			'shemen_s': shemen[0].strip(),
			'jiaoqiu': td[6].get_text().strip(),
			'fangui': td[7].get_text().strip(),
			'qiangduan': td[8].get_text().strip(),
			'huangpai': td[9].get_text().strip(),
			'hongpai': td[10].get_text().strip(),
			'puqiu': td[11].get_text().strip(),
			'shiqiu': td[12].get_text().strip(),
		})
	pd.DataFrame(records, columns=cols).to_sql("PlayerMatchDetail", engine, index=False, if_exists='append')
			
	
def get_player_info():
	"""Open each player's page from the global roster and store base + match details."""
	if player.empty:
		return
	for _, entry in player.iterrows():
		print("%s(%s) %s" %(entry['name'],entry['country'],entry['link']))
		markup = urllib.request.urlopen(url + entry['link']).read().decode()
		page = BeautifulSoup(markup, "html.parser")
		get_player_base_info(entry['name'], entry['country'], page)
		get_player_match_info(entry['name'], entry['country'], page)
		time.sleep(2)  # throttle requests
	
# Entry point: build the player roster, then crawl every player page.
get_all_player_info()
get_player_info()

遍历各球队或者各球员信息的时候,每一次暂停2秒钟,避免访问太频繁。不过去掉sleep后实测也没问题。有的信息爬取较久,所以每次爬取一个球队/球员信息后就立即保存下来。



==================================

以下是爬取全世界国家队排名及其近几年的比赛信息。



-- World ranking of all national teams
CREATE TABLE WC_Country(
ranking INT,		-- world ranking
name VARCHAR(100),	-- national team name
score INT,			-- ranking points
link VARCHAR(50)	-- link to the team's detail page
)
GO
-- Recent match results of each team over the last few years
CREATE TABLE WC_CountryMatch(
event_type VARCHAR(100),-- competition
event_date DATE,		-- match date
name VARCHAR(100),		-- national team
goal INT,				-- goals scored by this team
match_name VARCHAR(100),-- opponent
match_goal INT,			-- opponent's goals
is_host TINYINT,		-- 1 if this team was the home side, else 0
results  CHAR(2)		-- match result; trailing comma removed (invalid in T-SQL)
)
GO
# -*- coding: utf-8 -*-
# python 3.5

import re
import time
import sqlalchemy
import pandas as pd
import urllib.request
from bs4 import BeautifulSoup

# Ranking page of the 500.com league site (pages are GBK-encoded).
url = "http://liansai.500.com/paiming/"
# SQL Server connection (pymssql driver) used by every to_sql() call below.
engine = sqlalchemy.create_engine("mssql+pymssql://kk:kk@HZC/Myspider") 
# Global table of national teams: world ranking, name, points, detail-page link.
country = pd.DataFrame(columns=['ranking','name','score','link'])

# Fetch all national teams and their ranking points.
def get_all_country():
	"""Fill the global `country` frame with rank, name, points and link, and append it to WC_Country."""
	markup = urllib.request.urlopen(url).read().decode("gbk")
	page = BeautifulSoup(markup, "lxml")
	rows = page.find('tbody', class_="pm_data").find_all('tr')
	for idx, row in enumerate(rows):
		td = row.find_all('td')
		country.loc[idx, 'ranking'] = td[0].get_text().strip()
		country.loc[idx, 'name'] = td[1].get_text().strip()
		country.loc[idx, 'link'] = "http:" + td[1].find('a')['href'].strip()
		country.loc[idx, 'score'] = td[3].get_text().strip()
	country.to_sql("WC_Country", engine, index=False, if_exists='append')

# Walk every ranked team and scrape its match results.
def get_all_country_macht():
	"""Iterate the global `country` frame and scrape each team's fixture page."""
	if country.empty:
		return
	for _, team in country.iterrows():
		print("%s  %s" %(team['name'],team['link']))
		markup = urllib.request.urlopen(team['link'] + "teamfixture/").read().decode("gbk")
		page = BeautifulSoup(markup, "lxml")
		get_one_country_macht(team['name'], page)

# Parse one team's fixture table and save the results.
def get_one_country_macht(name,soup):
	"""Parse one team's recent results table and append the rows to WC_CountryMatch."""
	cols = ['event_type','event_date','name','goal','match_name','match_goal','is_host','results']
	clean = lambda s: re.sub(r"[\n\t\s]*", "", s)
	rows = soup.find('div', class_="ltab_bd jTabBD").find_all('tbody', class_="jTrInterval his_table")[0].find_all('tr')
	records = []
	for tr in rows:
		td = tr.find_all('td')
		host = clean(td[2].get_text())
		guest = clean(td[4].get_text())
		# Score cell looks like "H:G(...)"; keep only the part before '('.
		parts = td[3].get_text().split('(')[0].split(':')
		goal_host = clean(parts[0])
		goal_guest = clean(parts[1])
		at_home = bool(re.search(name, host))
		records.append({
			'event_type': td[0].get_text().strip(),
			'event_date': td[1].get_text().strip(),
			'name': name,
			'goal': goal_host if at_home else goal_guest,
			'match_name': guest if at_home else host,
			'match_goal': goal_guest if at_home else goal_host,
			'is_host': 1 if at_home else 0,
			'results': td[5].get_text().strip(),
		})
	pd.DataFrame(records, columns=cols).to_sql("WC_CountryMatch", engine, index=False, if_exists='append')
	time.sleep(2)  # throttle before the caller moves on to the next team


# Entry point: build the ranking table, then crawl every team's fixtures.
get_all_country()
get_all_country_macht()


猜你喜欢

转载自blog.csdn.net/kk185800961/article/details/80712505
今日推荐