▍前言:阶段定位与目标
当前进入项目核心攻坚期,聚焦模型开发与优化升级两大阶段。此阶段将运用前期准备好的数据和工程环境,构建可用的预测模型,并通过系统化调优提升模型性能至工业级标准。
▍回顾:
- 项目准备阶段
  - 流程认知:建立ML开发标准流程(CRISP-DM方法论)
  - 环境配置:完成Python虚拟环境搭建(含scikit-learn/pandas库)
  - 架构设计:规范项目目录结构,建立版本控制体系
- 数据工程阶段
  - 数据加载:掌握sklearn内置数据集与外部CSV加载方法
  - 探索分析:产出EDA报告(含分布直方图/散点矩阵图)
  - 预处理:完成标准化处理(Z-score标准化)与训练测试集划分(7:3比例)
阶段三:模型开发(核心攻坚)
7.1 基础模型实现
# train.py
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
def initialize_models():
    """Initialize the four baseline classifiers with fixed random seeds.

    Returns:
        dict: mapping of display name -> unfitted scikit-learn classifier,
        ready for cross-validation and training.
    """
    return {
        # NOTE: multi_class='multinomial' was dropped — it has been
        # deprecated since scikit-learn 1.5 and is already the effective
        # behavior of the lbfgs solver on multi-class targets.
        "Logistic Regression": LogisticRegression(
            max_iter=1000,        # raise the iteration cap so lbfgs converges
            solver='lbfgs',
            random_state=42
        ),
        "SVM": SVC(
            kernel='rbf',
            probability=True,     # required later for predict_proba/log_loss
            gamma='scale',
            random_state=42
        ),
        "Random Forest": RandomForestClassifier(
            n_estimators=100,
            max_depth=3,          # shallow trees to limit overfitting
            random_state=42
        ),
        "KNN": KNeighborsClassifier(
            n_neighbors=5,
            weights='distance'    # closer neighbours get larger votes
        )
    }
def cross_validation_eval(models, X, y):
    """Evaluate every model with 5-fold cross-validation on accuracy.

    Args:
        models: mapping of display name -> unfitted classifier.
        X: training feature matrix.
        y: training labels.

    Returns:
        dict: per-model {'mean_score', 'std_score', 'scores'} statistics.
    """
    cv_results = {}
    for name, model in models.items():
        scores = cross_val_score(
            model, X, y,
            cv=5,
            scoring='accuracy',
            n_jobs=-1            # folds are independent; use all cores
        )
        # Hoist the aggregates: np.mean/np.std were previously computed
        # twice each (once for the dict, once for the printout).
        mean_score = np.mean(scores)
        std_score = np.std(scores)
        cv_results[name] = {
            'mean_score': mean_score,
            'std_score': std_score,
            'scores': scores
        }
        # Single-line f-string: the original literal spanned several lines,
        # which only parses on Python 3.12+ (PEP 701). Output is identical.
        print(f"{name:20} | 平均准确率: {mean_score:.3f} ± {std_score:.3f}")
    return cv_results
# Run cross-validation over the baseline models.
models = initialize_models()
# NOTE(review): X_train / y_train are assumed to come from the earlier
# data-engineering stage (7:3 split) — they are not defined in this section.
cv_results = cross_validation_eval(models, X_train, y_train)
输出示例:
| 模型 | 平均准确率 |
|---|---|
| SVM | 0.967 ± 0.024 |
| Random Forest | 0.958 ± 0.032 |
| KNN | 0.967 ± 0.024 |
7.2 模型训练与验证
from sklearn.metrics import accuracy_score, log_loss
def train_and_evaluate(models, X_train, X_test, y_train, y_test,
                       feature_names=None):
    """Fit every model on the training split and score it on the test split.

    Args:
        models: mapping of display name -> unfitted classifier.
        X_train, X_test, y_train, y_test: pre-split features and labels.
        feature_names: optional sequence of feature names for the importance
            report. Defaults to the module-level ``features_processed``,
            preserving the original implicit-global behavior for existing
            callers.

    Returns:
        dict: per-model {'model', 'accuracy', 'log_loss',
        'feature_importance'} entries; 'model' is the fitted estimator.
    """
    if feature_names is None:
        # NOTE(review): the original relied on this implicit global —
        # presumably defined in the preprocessing section; verify.
        feature_names = features_processed
    results = {}
    for name, model in models.items():
        # Fit on the training split.
        model.fit(X_train, y_train)
        # Hard predictions for accuracy, probabilities for log-loss
        # (SVC was constructed with probability=True for this).
        y_pred = model.predict(X_test)
        y_proba = model.predict_proba(X_test)
        results[name] = {
            'model': model,
            'accuracy': accuracy_score(y_test, y_pred),
            'log_loss': log_loss(y_test, y_proba),
            'feature_importance': get_feature_importance(model, feature_names)
        }
    return results
def get_feature_importance(model, feature_names):
    """Return a feature-name -> importance mapping for *model*, or None.

    Tree ensembles expose ``feature_importances_`` directly; linear models
    expose ``coef_``, whose absolute values are averaged across classes.
    Models providing neither attribute yield None.
    """
    if hasattr(model, 'feature_importances_'):
        values = model.feature_importances_
    elif hasattr(model, 'coef_'):
        values = np.abs(model.coef_).mean(axis=0)
    else:
        return None
    return dict(zip(feature_names, values))
# Fit each model and collect test-set metrics (relies on the globals
# X_train/X_test/y_train/y_test produced by the earlier split).
model_results = train_and_evaluate(models, X_train, X_test, y_train, y_test)
7.3 模型性能可视化
def visualize_results(results):
    """Plot a side-by-side comparison of test accuracy and feature importance.

    Left panel: bar chart of per-model test accuracy. Right panel: overlaid
    horizontal bars of feature importance (models without importances are
    skipped). Saves the figure to ``model_comparison.png`` and shows it.
    """
    names = list(results.keys())
    plt.figure(figsize=(15, 6))

    # --- Left panel: test-set accuracy per model ---
    plt.subplot(1, 2, 1)
    positions = range(len(names))
    plt.bar(positions, [results[n]['accuracy'] for n in names],
            color='skyblue')
    plt.xticks(positions, names, rotation=45)
    plt.ylim(0.8, 1.0)  # zoom in on the high-accuracy band
    plt.title('Test Accuracy Comparison')
    plt.ylabel('Accuracy')

    # --- Right panel: feature-importance ranking per model ---
    plt.subplot(1, 2, 2)
    for model_name in names:
        importance = results[model_name]['feature_importance']
        if not importance:
            continue  # e.g. KNN exposes no importances
        pd.Series(importance).sort_values().plot(
            kind='barh',
            alpha=0.6,
            label=model_name
        )
    plt.title('Feature Importance Comparison')
    plt.xlabel('Importance Score')
    plt.legend()

    plt.tight_layout()
    plt.savefig('model_comparison.png', dpi=300)
    plt.show()
visualize_results(model_results)
可视化说明:
- 柱状图:对比各模型在测试集上的准确率
- 横向条形图:展示不同模型的特征重要性排序
7.4 完整训练流程
# training.py
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
import joblib
def train_full_model(best_model_name, models, X_train, y_train, X_test, y_test):
    """Re-fit the best model on all available data inside a full pipeline.

    Args:
        best_model_name: key into *models*, chosen via cross-validation.
        models: mapping of display name -> classifier (possibly fitted).
        X_train, y_train, X_test, y_test: the previously split data.

    Returns:
        Pipeline: the fitted preprocessor+model pipeline, also persisted to
        ``models/<best_model_name>_full.pkl``.
    """
    # Local import: `clone` was referenced here but never imported
    # anywhere in this file.
    from sklearn.base import clone

    # Clone so the cross-validated instance keeps its own fitted state.
    model = clone(models[best_model_name])

    # NOTE(review): the pipeline re-applies the saved preprocessor, yet
    # X_train/X_test appear to be preprocessed upstream already — confirm
    # this does not standardize the data twice.
    full_pipeline = Pipeline([
        ('preprocessor', joblib.load('models/preprocessor.pkl')),
        ('model', model)
    ])

    # Final deployment fit on train + test combined (no holdout kept).
    full_pipeline.fit(
        np.vstack((X_train, X_test)),
        np.concatenate((y_train, y_test))
    )

    # Single-line path literal: the original f-string spanned two lines,
    # which only parses on Python 3.12+ (PEP 701).
    joblib.dump(full_pipeline, f'models/{best_model_name}_full.pkl')
    return full_pipeline
# 选择交叉验证最佳模型
best_model_name =