import matplotlib.pyplot as plt
import pandas as pd
# Select the KaiTi typeface for sans-serif text; presumably so the Chinese
# titles/labels used by the plots in this script render correctly.
plt.rcParams.update({'font.sans-serif': ['KaiTi']})

# Note the path format: backslashes are escaped as \\ inside a normal
# (non-raw) string literal.
train_path = "D:\\Spyder\\exercise\\data\\titanic\\train.csv"
test_path = "D:\\Spyder\\exercise\\data\\titanic\\test.csv"

# Load the training split first, then the test split.
train_data, test_data = map(pd.read_csv, (train_path, test_path))

# Debug helpers: uncomment to dump the raw frames.
#print(train_data)
#print(test_data)

# Print the summary of the training frame (columns, non-null counts, dtypes).
train_data.info()
##特征有很多,但是不确定与最终Survived的关系,可以画出各特征与Survived的关系图
#plt.figure(figsize=(15,8),dpi=80)
#train_data.hist()
#train_data.plot(subplots=True,figsize=(10,10))
##看年龄和获救的关系
#x1 = train_data["Survived"]
#y = train_data["Age"]
#plt.scatter(x1,y)
##感觉年龄与获救并没有太大的相关
##看pclass与Survived的关系
#y = train_data["Pclass"]
#x2 = train_data["Survived"]
#Survived_0 = y[x2==0].value_counts()
#Survived_1 = y[x2==1].value_counts()
#df=pd.DataFrame({u'获救':Survived_1, u'未获救':Survived_0})
#df.plot(kind='bar', stacked=True)
#plt.title(u"各乘客等级的获救情况")
#plt.xlabel(u"乘客等级")
#plt.ylabel(u"人数")
#plt.show()
##3等级的未获救的比例多
##看性别和获救的关系
#y = train_data["Sex"]
#x2 = train_data["Survived"]
#Survived_0 = y[x2==0].value_counts()
#Survived_1 = y[x2==1].value_counts()
#df=pd.DataFrame({u'获救':Survived_1, u'未获救':Survived_0})
#df.plot(kind='bar', stacked=True)
##明显看出女人里面获救的比例高
##SibSp vs. survival; SibSp is the number of siblings and spouses aboard
#y = train_data["SibSp"]
#x2 = train_data["Survived"]
#Survived_0 = y[x2==0].value_counts()
#Survived_1 = y[x2==1].value_counts()
#df=pd.DataFrame({u'获救':Survived_1, u'未获救':Survived_0})
#df.plot(kind='bar', stacked=True)
#plt.show()
##SibSp中0和1的人数比较多,其中0最多,其他等级(2-8)人数少
##认为参考价值不大,0中未获救的人数占大多数,1中一半以上获救,其他未获救的人数占比多
##父母与子女个数
#y = train_data["Parch"]
#x2 = train_data["Survived"]
#Survived_0 = y[x2==0].value_counts()
#Survived_1 = y[x2==1].value_counts()
#df=pd.DataFrame({u'获救':Survived_1, u'未获救':Survived_0})
#df.plot(kind='bar', stacked=True)
#plt.show()
##Parch中同样0和1,2人数多一些,其中0中人数最多,0中获救人数不到一半,1,2中接近一半
#其他未获救的人数占比多,但是人数太少
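##Ticket: ticket information (the dataset dictionary describes this as the ticket number; TODO confirm whether it encodes a seat)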
#print(train_data["Ticket"])#信息比较复杂,有英文数字
##Fare票价比较复杂
##Embarked: port of embarkation (where the passenger boarded)
##print(train_data["Embarked"])
#y = train_data["Embarked"]
#x2 = train_data["Survived"]
#Survived_0 = y[x2==0].value_counts()
#Survived_1 = y[x2==1].value_counts()
#df=pd.DataFrame({u'获救':Survived_1, u'未获救':Survived_0})
#df.plot(kind='bar', stacked=True)
#plt.show()
##Fewest passengers boarded at port Q and most at port S, but port Q shows a high survival ratio
#describe=train_data.describe()
#print(describe)#约0.38的人数能够获救
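##--- Residue from the web page this script was copied from (kept as comments so the file still runs) ---
##Titanic problem: data analysis
##You may also like
##Reposted from blog.csdn.net/ziqingnian/article/details/108351085
##Today's recommendations
##Weekly ranking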