介绍
Pandas 是基于 NumPy 的一个非常好用的库。
基本用法
1、
# Load the CSV file into a DataFrame with read_csv.
import pandas
food_info = pandas.read_csv("food_info.csv")
print(food_info)
print(food_info.dtypes)
# help() writes the documentation itself and returns None, so it is called
# directly; wrapping it in print() would also emit a stray "None".
help(pandas.read_csv)
# head() gives the first rows (5 when no count is passed).
# NOTE: the three bare expressions below only display something in an
# interactive session; in a plain script their results are discarded.
food_info.head(3)  # first three rows
food_info.tail(4)  # last four rows
print(food_info.columns)  # column labels
print(food_info.shape)  # (number of rows, number of columns)
# Fetch data by row label: loc[0] is the row labelled 0, not a column.
print(food_info.loc[0])
# Rows labelled 3 through 6.
food_info.loc[3:6]
# Pick a single column by its name.
ndb_col = food_info["NDB_No"]
print(ndb_col)
# Pick several columns at once by passing a list of names.
columns = ["Zinc_(mg)", "Copper_(mg)"]
zinc_copper = food_info[columns]
print(zinc_copper)
# Gather every column name so the gram-measured columns can be picked out.
col_names = food_info.columns.tolist()
print(col_names)
gram_columns = []
2、
# Keep only the columns whose name ends with the gram unit suffix "(g)".
# (The previous suffix "g()" could never match a column name.)
gram_columns = [c for c in col_names if c.endswith("(g)")]
gram_df = food_info[gram_columns]
print(gram_df.head(3))
import pandas
food_info = pandas.read_csv("food_info.csv")
col_names = food_info.columns.tolist()
print(col_names)
print(food_info.head(3))
# Arithmetic between a column and a scalar is applied to every element.
print(food_info["Iron_(mg)"])
div_1000 = food_info["Iron_(mg)"] / 1000
print(div_1000)
# Multiplying two columns combines them element by element.
water_energy = food_info["Water_(g)"] * food_info["Energ_Kcal"]
iron_grams = food_info["Iron_(mg)"] / 1000
print(food_info.shape)
# Assigning to a new column name adds that column to the table; the value
# must line up with the existing rows. The shape printed before and after
# shows the column count growing by one.
food_info["Iron_(g)"] = iron_grams
print(food_info.shape)
# Combine scaled columns: multiply each by a weight, then add them.
weighted_protein = food_info["Protein_(g)"] * 2
weighted_fat = -0.75 * food_info["Lipid_Tot_(g)"]
initial_rating = weighted_protein + weighted_fat
# .max() gives the largest value of a column; dividing the column by it
# rescales the values relative to that maximum.
max_calories = food_info["Energ_Kcal"].max()
normalized_calories = food_info["Energ_Kcal"] / max_calories
# Protein and fat are normalised the same way as calories. Dividing a
# column by itself (as before) only produced 1.0 / NaN, not a normalisation.
normalized_protein = food_info["Protein_(g)"] / food_info["Protein_(g)"].max()
normalized_fat = food_info["Lipid_Tot_(g)"] / food_info["Lipid_Tot_(g)"].max()
food_info["Normalized_Protein"] = normalized_protein
food_info["Normalized_Fat"] = normalized_fat
# Sort the table in place by sodium content, smallest value first
# (ascending order is the default and is spelled out here).
food_info.sort_values(by="Sodium_(mg)", ascending=True, inplace=True)
print(food_info["Sodium_(mg)"])
# Sort in place again, this time from the largest value to the smallest.
food_info.sort_values(by="Sodium_(mg)", ascending=False, inplace=True)
print(food_info["Sodium_(mg)"])
3、典型案例
import pandas as pd
import numpy as np
# Titanic passenger survival data, used as the worked example.
titanic_survival = pd.read_csv("titanic_train.csv")
titanic_survival.head()
age = titanic_survival["Age"]
print(age.loc[0:10])
# Boolean mask that is True wherever Age is missing.
age_is_null = pd.isnull(age)
print(age_is_null)
# Indexing with the mask keeps only the missing entries.
age_null_true = age[age_is_null]
print(age_null_true)
# Number of missing ages.
age_null_count = len(age_null_true)
print(age_null_count)
# Manual mean: invert the mask to keep only the ages that are present.
good_ages = titanic_survival["Age"][~age_is_null]
corrent_mean_age = sum(good_ages) / len(good_ages)
print(corrent_mean_age)
# The mean() method computes the average directly.
corrent_mean_age = titanic_survival["Age"].mean()
print(corrent_mean_age)
# Average ticket fare for first, second and third class, computed by hand:
# filter the rows of each class, take the Fare column and average it.
passenger_classes = [1, 2, 3]
fares_by_class = {}
for this_class in passenger_classes:
    pclass_rows = titanic_survival[titanic_survival["Pclass"] == this_class]
    pclass_fares = pclass_rows["Fare"]
    fare_for_class = pclass_fares.mean()
    fares_by_class[this_class] = fare_for_class
print(fares_by_class)
# Mean of the Survived column for each passenger class.
# NOTE(review): this is the survival rate only if Survived is coded 0/1 — confirm.
# Aggregations are named by string; passing numpy callables such as np.mean
# as aggfunc is deprecated in recent pandas releases.
passenger_survival = titanic_survival.pivot_table(index="Pclass", values="Survived", aggfunc="mean")
print(passenger_survival)
# Average age per passenger class; aggfunc is left out because the mean is
# what pivot_table computes when none is given.
passenger_age = titanic_survival.pivot_table(index="Pclass", values="Age")
print(passenger_age)
# Totals of Fare and Survived for each embarkation port (two values summarised at once).
port_stats = titanic_survival.pivot_table(index="Embarked", values=["Fare", "Survived"], aggfunc="sum")
print(port_stats)
# dropna() discards entries with missing values: first every column that
# contains one, then every row where Age or Sex is missing.
drop_na_columns = titanic_survival.dropna(axis="columns")
new_titanic_survival = titanic_survival.dropna(axis="index", subset=["Age", "Sex"])
# Look up single values by (row label, column name).
row_index_83_age = titanic_survival.loc[83, "Age"]
# NOTE(review): the variable name says 1000 but the row label read is 766.
row_index_1000_pclass = titanic_survival.loc[766, "Pclass"]
print(row_index_83_age)
print(row_index_1000_pclass)
# Sort by age, oldest first, and show the first ten rows of the result.
new_titanic_survival = titanic_survival.sort_values(by="Age", ascending=False)
print(new_titanic_survival.iloc[0:10])
# Sorting keeps the old row labels; reset_index(drop=True) renumbers the
# rows and discards the old labels.
titanic_reindexed = new_titanic_survival.reset_index(drop=True)
print('- - - - - -')
print(titanic_reindexed.loc[0:10])
# Custom function meant to be passed to DataFrame.apply.
def hundredth_row(column):
    """Return the entry of ``column`` stored under index label 99.

    Args:
        column: An object supporting ``.loc[...]`` label lookup, e.g. one
            column of a DataFrame.

    Returns:
        Whatever ``column.loc[99]`` yields (the 100th entry when the labels
        are 0, 1, 2, ...).
    """
    return column.loc[99]
# Apply the function to every column to get each column's value at label 99.
# The result is stored under its own name so that the hundredth_row function
# is not overwritten and stays callable afterwards.
hundredth_values = titanic_survival.apply(hundredth_row)
print(hundredth_values)
# Count the missing values in a column.
def not_null_count(column):
    """Return how many entries of ``column`` are missing.

    Despite its name, this counts the null entries: it builds a null mask
    with ``pd.isnull``, selects the masked entries and returns their number.

    Args:
        column: A pandas Series (for example one DataFrame column).

    Returns:
        int: Number of null entries in ``column``.
    """
    missing_mask = pd.isnull(column)
    return len(column[missing_mask])
# Run the counter once per column (axis="index" is the default direction).
column_null_count = titanic_survival.apply(not_null_count, axis="index")
print(column_null_count)
# Convert the numeric passenger class into a readable label.
def which_class(row):
    """Map the ``"Pclass"`` entry of ``row`` to a descriptive label.

    Args:
        row: A mapping-like object with a ``"Pclass"`` key, e.g. one row of
            a DataFrame.

    Returns:
        ``"Unknown"`` when the class is null, ``"First Class"``,
        ``"Second Class"`` or ``"Third Class"`` for 1, 2 and 3, and ``None``
        for any other value.
    """
    pclass = row["Pclass"]
    if pd.isnull(pclass):
        return "Unknown"
    labels = ((1, "First Class"), (2, "Second Class"), (3, "Third Class"))
    for number, label in labels:
        if pclass == number:
            return label
    # Values other than 1, 2, 3 get no label.
    return None
# Apply the custom function row by row (axis="columns" passes each row in turn).
classes = titanic_survival.apply(which_class, axis="columns")
print(classes)
# Turn a continuous value into a discrete one: is the passenger under 18?
def is_minor(row):
    """Return True when the ``"Age"`` entry of ``row`` is below 18.

    Args:
        row: A mapping-like object with an ``"Age"`` key.

    Returns:
        bool: ``True`` if ``row["Age"] < 18``, otherwise ``False``. A NaN age
        compares as not-less-than and therefore yields ``False``.
    """
    return bool(row["Age"] < 18)
# Evaluate is_minor for every row; the result holds one boolean per row.
minors = titanic_survival.apply(is_minor, axis="columns")
# print(minors)
def generate_age_label(row):
    """Classify the ``"Age"`` entry of ``row`` into an age group label.

    Args:
        row: A mapping-like object with an ``"Age"`` key.

    Returns:
        str: ``"unknown"`` when the age is null, ``"minor"`` when it is
        below 18, ``"adult"`` otherwise.
    """
    age = row["Age"]
    # The null check comes first because a missing age cannot be compared.
    if pd.isnull(age):
        return "unknown"
    return "minor" if age < 18 else "adult"
# Label every row with its age group.
age_labels = titanic_survival.apply(generate_age_label, axis="columns")
print(age_labels)
# Store the labels as a new column, then average Survived per age group to
# compare the outcome for adults, minors and passengers of unknown age.
titanic_survival["age_labels"] = age_labels
age_group_survival = titanic_survival.pivot_table(values="Survived", index="age_labels")
print(age_group_survival)
4、#DataFrame结构(数据读取出来的矩阵)
# Film rating analysis.
import pandas as pd
from pandas import Series
fandango = pd.read_csv('fandango_score_comparison.csv')  # load the file contents
# Print the type of a single selected column, then its first five entries.
series_film = fandango['FILM']
print(type(series_film))
print(series_film[0:5])
series_rt = fandango['RottenTomatoes']
print(series_rt[0:5])
# .values exposes the underlying values of each column.
film_names = series_film.values
print(type(film_names))
rt_scores = series_rt.values
# Build a Series of scores that is indexed by film name. (It used to be
# built twice with identical arguments; once is enough.)
series_custom = Series(rt_scores, index=film_names)
# Film names now work as index keys.
series_custom[['Minions (2015)', 'Leviathan (2014)']]
# Positional slice: entries 5 up to (not including) 10.
fiveten = series_custom.iloc[5:10]
print(fiveten)
# Sorting, done by hand: take the index labels, sort them, then reindex
# the Series in that order.
original_index = series_custom.index.tolist()
sorted_index = sorted(original_index)
sorted_by_index = series_custom.reindex(index=sorted_index)
print(sorted_by_index)
# The two built-in ways to sort: by index label or by value.
sc2 = series_custom.sort_index()
sc3 = series_custom.sort_values()
print(sc2.iloc[0:10])
print("- - - - - - - ")
print(sc3.iloc[0:10])
# numpy functions accept a Series; np.add sums matching positions.
import numpy as np
print(np.add(series_custom, series_custom))
np.sin(series_custom)
np.max(series_custom)
# Films scoring above 50 and below 75: a comparison yields a boolean
# Series, which can be used as a mask and combined with &.
criteria_one = series_custom > 50
criteria_two = series_custom < 75
series_greater_than_50 = series_custom[criteria_one]
both_criteria = series_custom[criteria_one & criteria_two]
print(both_criteria)
# Average of the two Rotten Tomatoes scores (critics and users) per film.
# Both Series share the film names as index, so the addition pairs them up.
film_titles = fandango['FILM']
rt_critics = Series(fandango['RottenTomatoes'].values, index=film_titles)
rt_users = Series(fandango['RottenTomatoes_User'].values, index=film_titles)
rt_mean = (rt_critics + rt_users) / 2
print(rt_mean)
5、#得到当前电影名
fandango = pd.read_csv('fandango_score_comparison.csv')
print(type(fandango))
# Use the film name as the row index; drop=False keeps FILM as a column too.
fandango_films = fandango.set_index(keys='FILM', drop=False)
print(fandango_films.index)
# Slice rows by film name, much like slicing with numbers; shown both with
# plain brackets and with .loc.
first_film = "Avengers: Age of Ultron (2015)"
last_film = "Hot Tub Time Machine 2 (2015)"
fandango_films[first_film:last_film]
fandango_films.loc[first_film:last_film]
# Look up one specific film.
fandango_films.loc['Kumiko, The Treasure Hunter (2015)']
# Look up a list of films.
movies = [
    'Kumiko, The Treasure Hunter (2015)',
    'Do You Believe? (2015)',
    'Ant-Man (2015)',
]
fandango_films.loc[movies]
# Work with column data types.
import numpy as np
# Named `types` so the builtin `type` is not shadowed; the lines below read
# `types`, which was previously never defined (NameError).
types = fandango_films.dtypes
print(types)
# Filter the dtypes down to float64; .index then gives just the column names.
float_columns = types[types.values == 'float64'].index
# Bracket notation with those names keeps only the float columns.
float_df = fandango_films[float_columns]
# `x` is a Series representing one column.
# NOTE(review): kept as a lambda on purpose — pandas may treat a bare
# np.std argument differently; verify before simplifying.
deviations = float_df.apply(lambda x: np.std(x))
print(deviations)
# Anonymous function applied per row (axis=1) over two user-score columns.
# NOTE(review): 'Metacritic_user_nom' is presumably the spelling used in the
# CSV header — confirm against the file before "correcting" it.
rt_mt_user = float_df[['RT_user_norm', 'Metacritic_user_nom']]
rt_mt_user.apply(lambda x: np.std(x), axis=1)