【代码模板】数据预处理类 Python 代码模板

【删除不需要的列】

# Requires: pandas
# Drop the unwanted column(s) in place. `DataFrame` is presumably a placeholder
# for your own DataFrame instance, and 'column_name' for the column to remove.
DataFrame.drop(labels=['column_name'], axis='columns', inplace=True)

【对某列数据进行Z-score标准化】

# Requires: pandas, scikit-learn
# Z-score standardise one column and write the result back to that column.
# `DataFrame` is presumably a placeholder for your own DataFrame instance.
from sklearn.preprocessing import StandardScaler
DataFrame['column_name'] = StandardScaler().fit_transform(
    # The column values are reshaped to (n_samples, 1) before being scaled.
    X=DataFrame['column_name'].values.reshape(-1, 1),
)

【有监督学习类数据target列的计数与可视化】

# Requires: pandas, matplotlib
# Inspect the label column: print how many labelled samples each class has.
# `DataFrame` is presumably a placeholder for your own DataFrame instance.
# Series.value_counts is used instead of the deprecated top-level
# pd.value_counts, which also removes the need for a `pd` name here.
print(DataFrame['target_column'].value_counts(sort=True))
# Show the same counts as a bar chart.
from matplotlib import pyplot as plt
DataFrame['target_column'].value_counts().plot(kind='bar')
plt.title("class histogram")
plt.xlabel("Class")
plt.ylabel("Frequency")
# show must actually be called; a bare `plt.show` only references the function
# and never displays the figure.
plt.show()

【下采样函数】

# Requires: pandas, numpy (imported as `np`)
def under_sample_function(data=data, column='xxx'):
    """Randomly under-sample the class labelled 0 down to the size of class 1.

    Every row where ``data[column] == 1`` (the minority class) is kept. From
    the rows where ``data[column] == 0`` (the majority class) an equally sized
    subset is drawn at random without replacement. The class proportions and
    the total size of the result are printed.

    Note: the ``data`` default is evaluated when the function is defined, so
    a module-level ``data`` must already exist at that point.

    Args:
        data: DataFrame holding the samples. Its index labels are assumed to
            be unique, since rows are selected by label.
        column: Name of the label column.

    Returns:
        DataFrame with the selected rows: all class-1 rows followed by the
        sampled class-0 rows.

    Raises:
        ValueError: From ``np.random.choice`` when there are fewer class-0
            rows than class-1 rows (sampling is without replacement).
    """
    # Index labels of all minority-class rows, and how many there are.
    minority_labels = np.array(data[data[column] == 1].index)
    minority_count = len(minority_labels)

    # Index labels of all majority-class rows.
    majority_labels = data[data[column] == 0].index

    # Draw as many majority labels as there are minority rows.
    sampled_majority_labels = np.random.choice(
        majority_labels, minority_count, replace=False
    )

    # Minority labels first, then the sampled majority labels.
    under_sample_indices = np.concatenate(
        [minority_labels, sampled_majority_labels]
    )

    # These are index *labels*, so select with .loc. The previous .iloc
    # treated them as positions, which is only right for a default 0..n-1
    # index and otherwise picks wrong rows or raises IndexError.
    under_sample_data = data.loc[under_sample_indices, :]

    # Report the class balance of the under-sampled set.
    total_rows = len(under_sample_data)
    majority_rows = len(under_sample_data[under_sample_data[column] == 0])
    minority_rows = len(under_sample_data[under_sample_data[column] == 1])
    print("大样本所占整体比例: ", majority_rows/total_rows)
    print("小样本所占整体比例: ", minority_rows/total_rows)
    print("下采样总体样本数量: ", total_rows)

    return under_sample_data

【过采样函数】

# Requires: pandas (imported as `pd`), imbalanced-learn
from imblearn.over_sampling import SMOTE  # library used for over-sampling
oversampler = SMOTE(random_state=0)
def over_sample_function(X_data=x_train, Y_data=y_train):
    """Over-sample the training data with SMOTE and report the class counts.

    Note: the defaults are evaluated when the function is defined, so
    module-level ``x_train`` and ``y_train`` must already exist at that point.

    Args:
        X_data: Feature data passed to the module-level ``oversampler``.
        Y_data: Labels passed to the module-level ``oversampler``.

    Returns:
        Tuple ``(os_x, os_y)`` of DataFrames holding the over-sampled
        features and labels.
    """
    # `fit_resample` is the current method name; `fit_sample` is the legacy
    # name and is only used as a fallback for old imbalanced-learn installs.
    resample = getattr(oversampler, "fit_resample", None) or oversampler.fit_sample
    os_x, os_y = resample(X_data, Y_data)
    os_x = pd.DataFrame(os_x)
    os_y = pd.DataFrame(os_y)

    # Count matching labels on the raw values. Masking a DataFrame with a
    # frame-shaped boolean mask keeps every row (non-matches become NaN), so
    # the previous len(os_y[os_y == 1]) always printed the total row count.
    label_values = os_y.to_numpy()
    print("过采样后1类数据的数据量：", int((label_values == 1).sum()))
    print("过采样后0类数据的数据量：", int((label_values == 0).sum()))
    return os_x, os_y  # over-sampled training features and labels

将来有新的代码模板再继续补充。

不停下脚步的乌龟

发布了22 篇原创文章 · 获赞 0 · 访问量 949

私信关注

【代码模板】数据预处理类 Python 代码模板

【删除不需要的列】

【对某列数据进行Z-score标准化】

【有监督学习类数据target列的计数与可视化】

【下采样函数】

【过采样函数】

猜你喜欢