# 《利用Python进行数据分析》——美国1880-2010年的婴儿名字
#
# [本次数据分析所用到的数据集链接](http://github.com/wesm/pydata-book)
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Column labels for every yobYYYY.txt file. They are supplied through `names=`,
# so the first line of each file is read as data rather than as a header.
columns = ['name', 'sex', 'births']

# The files are comma-separated text, so pandas.read_csv loads them directly.
names1880 = pd.read_csv('datasets/babynames/yob1880.txt', names=columns)
print(names1880)
# Births summed per sex are used as the total number of births in that year.
print(names1880.groupby('sex').births.sum())

# Read one file per year, tag its rows with the year, and collect the frames.
years = range(1880, 2011)
pieces = []
for year in years:
    path = 'datasets/babynames/yob%d.txt' % year
    frame = pd.read_csv(path, names=columns)
    frame['year'] = year
    pieces.append(frame)

# Stack all years into one DataFrame; ignore_index=True replaces the per-file
# row numbers with a single continuous index.
names = pd.concat(pieces, ignore_index=True)

# Total births per year (rows) and sex (columns). The aggregation is named by
# string: passing the builtin `sum` is deprecated in recent pandas releases.
# NOTE: `total_bitrhs` (sic) keeps its original spelling because it is a
# module-level name.
total_bitrhs = names.pivot_table('births', index='year', columns='sex', aggfunc='sum')
print(total_bitrhs.tail())  # last five rows
total_bitrhs.plot(title='Total births by sex and year')
plt.show()
def add_prop(group):
    """Return *group* with a ``prop`` column holding each row's share of births.

    Parameters
    ----------
    group : pandas.DataFrame
        Frame with a numeric ``births`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows and index as *group* plus a ``prop``
        column equal to ``births`` divided by the sum of ``births`` in *group*.
    """
    # assign() builds a new frame instead of writing into the argument:
    # functions passed to groupby.apply must not mutate the group they receive.
    return group.assign(prop=group.births / group.births.sum())
# Add a `prop` column: each name's share of all births within its (year, sex)
# group. This is the same value add_prop computes for a single group.
# transform('sum') broadcasts every group's total back onto that group's rows,
# so the original index and all columns are kept as they are. The way
# groupby.apply treats group keys and grouping columns has changed between
# pandas releases, which could leave `names` in a shape the later
# groupby(['year', 'sex']) calls cannot handle.
names = names.assign(
    prop=names.births / names.groupby(['year', 'sex']).births.transform('sum')
)
# Sanity check: the proportions within every (year, sex) group should add up
# to 1. The sum() must be called; without the parentheses only the bound
# method object is printed.
print(names.groupby(['year', 'sex']).prop.sum())
def get_top1000(group):
    """Return at most the first 1000 rows of *group* ordered by ``births``.

    The rows are sorted by the ``births`` column in descending order and the
    result is cut off after 1000 rows; a smaller group is returned in full.
    """
    ranked = group.sort_values(by='births', ascending=False)
    return ranked[:1000]
# Top 1000 names for every (year, sex) combination.
# Iterating over the groupby hands get_top1000 complete sub-frames (grouping
# columns included) in sorted key order, and ignore_index=True yields a plain
# running index. This avoids depending on how groupby.apply passes grouping
# columns through, which is deprecated behaviour in recent pandas.
grouped = names.groupby(['year', 'sex'])
top1000 = pd.concat([get_top1000(group) for _, group in grouped], ignore_index=True)
boys = top1000[top1000.sex == 'M']
girls = top1000[top1000.sex == 'F']

# Births per year (rows) and name (columns).
total_bitrhs = top1000.pivot_table('births', index='year', columns='name', aggfunc='sum')
subset = total_bitrhs[['John', 'Harry', 'Mary', 'Marilyn']]
# One subplot per selected name, showing its yearly birth count over time.
subset.plot(subplots=True, figsize=(12, 10), grid=False, title="Number of births per year")
plt.show()

# Share of all births that falls on the top-1000 names, per year and sex.
table = top1000.pivot_table('prop', index='year', columns='sex', aggfunc='sum')
table.plot(title='Sum of table1000.prop by year and sex', yticks=np.linspace(0, 1.2, 13), xticks=range(1880, 2020, 10))
def get_quantile_count(group, q=0.5):
    """Count how many of the largest ``prop`` values are needed to reach *q*.

    The rows of *group* are ordered by ``prop`` from largest to smallest and
    accumulated; the result is the 1-based position of the first running
    total that is greater than or equal to *q*.
    """
    ordered = group.sort_values(by='prop', ascending=False)
    running_total = ordered.prop.cumsum().values
    # searchsorted returns a 0-based insertion position; +1 turns it into a count.
    return running_total.searchsorted(q) + 1
# Diversity measure: for every (year, sex) group, the number of names returned
# by get_quantile_count, reshaped so that each sex becomes its own column.
diversity = (
    top1000.groupby(['year', 'sex'])
    .apply(get_quantile_count)
    .unstack('sex')
)
# Plot the diversity measure over the years.
diversity.plot(title="Number of popular names in top 50%")
plt.show()
def get_last_letter(x):
    """Return the last character of the string *x*."""
    return x[-1]


# Last letter of every name, as a Series that lines up with `names`.
last_letters = names.name.map(get_last_letter)
last_letters.name = 'last_letter'

# Births summed per last letter (rows) and per sex/year (columns).
table = names.pivot_table('births', index=last_letters, columns=['sex', 'year'], aggfunc='sum')
# Keep three representative years.
subtable = table.reindex(columns=[1910, 1960, 2010], level='year')
print(subtable.head())  # first five rows

# Divide every (sex, year) column by its own total to get the share of births
# per last letter.
letter_prop = subtable / subtable.sum()
fig, axes = plt.subplots(2, 1, figsize=(10, 8))
# Bar charts of the last-letter shares: boys on top, girls below.
letter_prop['M'].plot(kind='bar', rot=0, ax=axes[0], title='Male')
letter_prop['F'].plot(kind='bar', rot=0, ax=axes[1], title='Female', legend=False)

# Same normalisation over all years, then pick the letters d/n/y for boys and
# transpose so that each letter becomes one time series (one column).
letter_prop = table / table.sum()
dny_ts = letter_prop.loc[['d', 'n', 'y'], 'M'].T
dny_ts.plot()  # share of boys' names ending in d/n/y over time
plt.show()
all_names = pd.Series(top1000.name.unique())
# Case-insensitive substring match: every name that contains "lesl".
lesley_like = all_names[all_names.str.lower().str.contains('lesl')]
# Keep only the rows whose name is one of those matches.
filtered = top1000[top1000.name.isin(lesley_like)]
# Total births per matching name. It is printed because a bare expression
# produces no output when the file is run as a script.
print(filtered.groupby('name').births.sum())

# Births per year (rows) and sex (columns) for the matching names.
table = filtered.pivot_table('births', index='year', columns='sex', aggfunc='sum')
# Normalise within each year so that every row sums to 1.
table = table.div(table.sum(axis=1), axis=0)
# Solid black line for boys, dashed black line for girls. ('k_' would select
# a marker instead of a solid line.)
table.plot(style={'M': 'k-', 'F': 'k--'})
plt.show()
# 《利用Python进行数据分析》——美国1880-2010年的婴儿名字
#
# 猜你喜欢