# Kaggle "Titanic" starter: logistic-regression baseline.
#
# Reads ../1/train.csv and ../1/test.csv, encodes the categorical columns,
# imputes missing values, reports 5-fold cross-validation accuracy and writes
# the predictions for the test set to ../1/ruselt2.csv.
import pandas as pd
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression

# Columns fed to the classifier.
FEATURES = ['Pclass', 'Sex', 'Age', 'Embarked', 'SibSp', 'Parch', 'Fare']
# Integer codes for the two categorical columns; unmapped/missing values
# become NaN after Series.map().
SEX_CODES = {'female': 0, 'male': 1}
EMBARKED_CODES = {'C': 0, 'Q': 1, 'S': 2}

# Load the data.
train = pd.read_csv('../1/train.csv')
test = pd.read_csv('../1/test.csv')

# Inspect the raw data. DataFrame.info() prints its report itself and returns
# None, so it is called directly instead of being wrapped in print().
train.info()
test.info()

# Encode the categorical columns identically in both data sets.
for frame in (train, test):
    frame['Sex'] = frame['Sex'].map(SEX_CODES)
    frame['Embarked'] = frame['Embarked'].map(EMBARKED_CODES)

# Training set: only a negligible number of rows lack Embarked, so those rows
# are dropped. Features and labels are selected with the same mask so they
# stay aligned.
train_rows = train['Embarked'].notna()
x_train = train.loc[train_rows, FEATURES].copy()
y_train = train.loc[train_rows, 'Survived']

# Age has too many gaps to drop, so it is filled with the mean. The result is
# assigned back explicitly: `df[col].fillna(..., inplace=True)` operates on a
# temporary and is a silent no-op under pandas Copy-on-Write.
x_train['Age'] = x_train['Age'].fillna(x_train['Age'].mean())

# Test set: the competition requires a prediction for every passenger, so no
# row may be dropped; every gap is imputed instead. Embarked uses the most
# frequent training value, Age and Fare use the test-set mean.
x_test = test[FEATURES].copy()
x_test['Embarked'] = x_test['Embarked'].fillna(x_train['Embarked'].mode().iloc[0])
x_test['Age'] = x_test['Age'].fillna(x_test['Age'].mean())
x_test['Fare'] = x_test['Fare'].fillna(x_test['Fare'].mean())

# Inspect the data again after cleaning.
x_train.info()
x_test.info()

# A very large C means almost no regularisation.
lr = LogisticRegression(C=10000)
score = model_selection.cross_val_score(lr, x_train, y_train, cv=5)
print("分类器交叉验证结果:")
print(score.mean())

# Train on the full training set and predict the test set.
lr.fit(x_train, y_train)
y_predict = lr.predict(x_test)

# Save the submission. x_test keeps every row of `test`, so the predictions
# line up one-to-one with test['PassengerId'].
ruselt = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': y_predict})
ruselt.to_csv(r'../1/ruselt2.csv', index=False)
LR模型 kaggle入门项目Titanic
猜你喜欢
转载自blog.csdn.net/lonely2018/article/details/80285221
今日推荐
周排行