将《利用Python进行数据分析》书中第2章 1880-2010年间全美婴儿姓名的项目作为练习,name数据可在GitHub中获得。
拓展练习如下:
1.计算指定名字的年度性别比例
2.计算某个名字的相对排名
3.1计算增长或减少最多的名字
3.2计算年度最流行的名字
4.分析名字趋势:长度
# Load the data: one CSV per year (1880-2010), each with name, sex, births.
import pandas as pd

pieces = []
years = range(1880, 2011)
for year in years:
    location = 'data/names/yob%d.txt' % year
    frame = pd.read_csv(location, names=['name', 'sex', 'births'])
    # The files carry no year column, so tag every row with its source year.
    frame['year'] = year
    pieces.append(frame)
# ignore_index=True: every yearly file restarts its row labels at 0, so keeping
# them would leave the combined frame with duplicated index labels.
names = pd.concat(pieces, ignore_index=True)
# Tidy the data
def add_prop(group):
    """Return a copy of *group* with a ``prop`` column added.

    ``prop`` is each row's ``births`` divided by the sum of ``births`` over
    the whole group.

    The input frame is left untouched: mutating the object handed in by
    ``groupby.apply`` is not supported by pandas and can give
    version-dependent results.
    """
    return group.assign(prop=group['births'] / group['births'].sum())
# Share of births within each (year, sex) group. transform('sum') broadcasts the
# group total back onto the original rows, so the index and columns of `names`
# stay as they are regardless of how the installed pandas shapes apply() output.
names['prop'] = names['births'] / names.groupby(['year', 'sex'])['births'].transform('sum')
# Tidy the data
def select_top(group, n=1000):
    """Return the *n* rows of *group* with the most births (default 1000).

    Rows are ordered by ``births`` descending. The sort is stable, so rows
    with equal ``births`` keep their input order, and a group that is already
    sorted by births comes back exactly as a plain ``group[:n]`` would.
    """
    ranked = group.sort_values('births', ascending=False, kind='mergesort')
    return ranked.head(n)
# Keep the top rows of every (year, sex) group. Iterating the groupby yields each
# group with all of its columns, and concat keeps the original row labels, so
# there are no group-key index levels to strip off afterwards.
top1000 = pd.concat([select_top(group) for _, group in names.groupby(['year', 'sex'])])
top1000.head()
1.计算指定名字的年度性别比例
# Yearly sex ratio for one given name.
specific = top1000[top1000['name'] == 'Mary']
# Births per year (rows) and sex (columns); 'sum' is passed as a string, matching
# the pivot_table call in the name-length section.
specific_pivot_table = specific.pivot_table('births', index='year', columns=['sex'], aggfunc='sum')
specific_pivot_table.head()
# Divide each row by its row total so the sexes add up to 1 per year.
specific_prop = specific_pivot_table.div(specific_pivot_table.sum(axis=1), axis=0)
specific_prop.head()
specific_prop.plot(subplots=True)
1.2 计算多个指定名字的年度性别比例
# Yearly sex ratio for several names at once; edit name_list to change the names.
name_list = ['Harry', 'Mary']
name_table = top1000.pivot_table('births', columns=['name'], index=['year', 'sex'], aggfunc='sum')
several_name = name_table[name_list]
# Move sex from the row index into the columns: columns become (name, sex).
several_name = several_name.unstack()
for name in name_list:
    by_sex = several_name[name]      # that name's F and M birth counts per year
    total = by_sex.sum(axis=1)       # F + M births per year for the name
    for sex in ('F', 'M'):
        several_name['%s_%s' % (name, sex)] = by_sex[sex].div(total)
several_name
# Keep only the ratio columns, in name_list order with F before M.
several_name = several_name[['%s_%s' % (name, sex) for name in name_list for sex in ('F', 'M')]]
several_name
several_name.plot(subplots=True)
2.计算某个名字的相对排名
# Relative rank of selected names.
# Births per name (rows) and year (columns); there is no sex dimension here, so
# the births of both sexes are added together for each name.
name_table = top1000.pivot_table('births', columns='year', index='name', aggfunc='sum')
name_table.head()
# Rank within each year column; ascending=False gives rank 1 to the most births.
name_rank = name_table.rank(method='min', axis=0, ascending=False)
myname = name_rank.loc[['Mary', 'Jerry', 'Tom'], :]
myname
mynameT = myname.T
mynameT.to_csv('mynameT.csv')
mynameT
# Negate the ranks so that a better (smaller) rank is drawn higher on the chart.
mynameT = -mynameT
mynameT
mynameT.plot(kind='line')
3.1计算增长或减少最多的名字
# Female names only: births per name (rows) and year (columns).
name_table = top1000[top1000.sex == 'F'].pivot_table('births', columns='year', index='name', aggfunc='sum')
name_rank = name_table.rank(method='min', axis=0, ascending=False)
name_rankT = name_rank.T
name_rankT
rank_diff = name_rankT.diff()  # year-over-year change in rank
rank_diff
# Most negative change = largest drop in rank number = biggest climb that year.
# The first year has no previous year, so its diff row is entirely NaN. idxmin
# over an all-NaN row is deprecated in pandas, so that row is skipped and then
# restored as NaN by reindex, leaving the result with one entry per year.
rank_diff_min = rank_diff.dropna(how='all').idxmin(axis=1).reindex(rank_diff.index)
rank_diff_min.to_csv('rank_diff_min.csv')
rank_diff_min
#year
#1880 NaN
#1881 Isa
#1882 Clementine
#1883 Cathryn
#1884 Belva
# ...
#2006 Ayla
#2007 Abril
#2008 Khloe
#2009 Maliyah
#2010 Maci
#Length: 131, dtype: object
# NOTE(review): `plt` is presumably matplotlib.pyplot; no import for it is
# visible in this file -- confirm it is imported before this point.
plt.figure()
# Year-over-year rank change of the name "Mary".
rank_diff.loc[:, 'Mary'].plot()
3.2计算年度最流行的名字
name_rank
# Most popular name of each year: for every year column, take the name holding
# the smallest rank value (rank 1 = most births).
popular_name = name_rank.idxmin(axis='index')
popular_name.to_csv('popular_name.csv')
popular_name
#year
#1880 Mary
#1881 Mary
#1882 Mary
#1883 Mary
#1884 Mary
# ...
#2006 Emily
#2007 Emily
#2008 Emma
#2009 Isabella
#2010 Isabella
#Length: 131, dtype: object
# How many years each name spent at the top.
popular_name.value_counts()
#Mary 76
#Jennifer 15
#Emily 12
#Jessica 9
#Lisa 8
#Linda 6
#Ashley 2
#Isabella 2
#Emma 1
#dtype: int64
4.分析名字趋势:长度
# Name trend analysis: length of the name.
top1000['len_of_name'] = top1000['name'].str.len()  # number of characters in each name
top1000
# Births per year (rows) and name length (columns).
name_len_table = top1000.pivot_table(
    values='births',
    index='year',
    columns=['len_of_name'],
    aggfunc='sum',
)
# Turn counts into per-year shares: divide every row by its row total.
name_len_table = name_len_table.div(name_len_table.sum(axis='columns'), axis='index')
name_len_table
# One subplot per name length.
name_len_table.plot(subplots=True, figsize=(10, 10))
# All lengths on a single chart to compare the trends.
name_len_table.plot(kind='line', figsize=(8, 8))