楼主最近在参加一个比赛,比赛链接如下:
https://www.datafountain.cn/competitions/466
现在分享自己的代码,该方案目前排名为 65/2865。
运行环境:Python 3.7.9,PyCharm
Name:Model_LGB
一:数据预处理+特征工程
import numpy as np
import pandas as pd
import os
import datetime
from tqdm import tqdm
from collections import Counter
import warnings
warnings.filterwarnings("ignore")
def get_info(x):
    """Return the payload of every space-separated token in ``x``.

    Each token is cut at its last ":" and only the text after it is kept;
    a token without ":" is returned unchanged.
    """
    payloads = []
    for token in x.split(" "):
        # rpartition keeps the text after the LAST colon, like split(":")[-1].
        payloads.append(token.rpartition(":")[2])
    return payloads
def get_speed(x):
    """Return field 0 of each comma-separated record in ``x`` as a float16 array."""
    first_fields = []
    for record in x:
        # Text before the first comma is the same as split(",")[0].
        first_fields.append(record.partition(",")[0])
    return np.array(first_fields, dtype="float16")
def get_eta(x):
    """Return field 1 of each comma-separated record in ``x`` as a float16 array."""
    second_fields = list(map(lambda record: record.split(",", 2)[1], x))
    return np.array(second_fields, dtype="float16")
def get_state(x):
    """Return field 2 of each comma-separated record in ``x`` as an array.

    No dtype is forced, so the values stay as strings.
    """
    states = []
    for record in x:
        fields = record.split(",")
        states.append(fields[2])
    return np.array(states)
def get_cnt(x):
    """Return field 3 of each comma-separated record in ``x`` as an int16 array."""
    fourth_fields = [record.split(",", 4)[3] for record in x]
    counts = np.array(fourth_fields, dtype="int16")
    return counts
def get_feature(input_file_path_his, input_file_path_attr, input_file_path_topo, mode):
    """Build the feature table for one traffic file and write it to ``./data``.

    The traffic file is read as ";"-separated with no header. Column 0 holds
    four space-separated integers (link, label, current_slice_id,
    future_slice_id); columns 1-5 each hold space-separated tokens whose text
    after the last ":" is ``speed,eta,state,count``.

    Args:
        input_file_path_his: Path of the traffic file to featurize.
        input_file_path_attr: Path of the tab-separated road attribute file
            (link, length, direction, path_class, speed_class, LaneNum,
            speed_limit, level, width).
        input_file_path_topo: Accepted for interface compatibility; this
            function does not read it.
        mode: ``"is_train"`` keeps the label (values >= 3 collapsed to 3, then
            shifted to start at 0) and prefixes the output file name with the
            mode; any other value drops the label column.

    Returns:
        None. The result is written to ``./data/<name>.csv``, where ``<name>``
        is the input file name up to its first ".".
    """
    # his
    df = pd.read_csv(input_file_path_his, sep=";", header=None)

    # Split the header column once instead of once per derived column.
    header = df[0].str.split(" ", expand=True)
    df["link"] = header[0].astype(int)
    df["label"] = header[1].astype(int)
    df["current_slice_id"] = header[2].astype(int)
    df["future_slice_id"] = header[3].astype(int)
    df["time_diff"] = df["future_slice_id"] - df["current_slice_id"]
    df = df.drop([0], axis=1)

    is_train = mode == "is_train"
    if is_train:
        # Collapse every label >= 3 into 3, then make the classes 0-based.
        df["label"] = df["label"].clip(upper=3) - 1
    else:
        df = df.drop(["label"], axis=1)

    # Last token of column 1, fields in order:
    # road-condition speed, ETA speed, road-condition state,
    # number of vehicles used in the road-condition computation.
    latest = df[1].apply(lambda x: x.split(" ")[-1].split(":")[-1])
    latest_fields = latest.str.split(",", expand=True)
    df["current_speed"] = latest_fields[0]
    df["current_eat_speed"] = latest_fields[1]  # column name kept as-is for downstream files
    df["current_state"] = latest_fields[2]
    df["current_count"] = latest_fields[3]

    for i in tqdm(range(1, 6, 1)):
        flag = f"his_{(6 - i) * 7}"
        info = df[i].apply(get_info)
        # speed
        df[f"{flag}_speed_mean"] = info.apply(lambda x: get_speed(x).mean())
        # eta
        df[f"{flag}_eta_mean"] = info.apply(lambda x: get_eta(x).mean())
        # state: rank the states by frequency once, then take both ends
        state_ranking = info.apply(lambda x: Counter(get_state(x)).most_common())
        df[f"{flag}_state_max"] = state_ranking.apply(lambda ranked: ranked[0][0])
        df[f"{flag}_state_min"] = state_ranking.apply(lambda ranked: ranked[-1][0])
        # cnt
        df[f"{flag}_cnt_mean"] = info.apply(lambda x: get_cnt(x).mean())
        # The raw text column is no longer needed once its aggregates exist.
        df = df.drop([i], axis=1)

    df2 = pd.read_csv(
        input_file_path_attr,
        sep="\t",
        names=["link", "length", "direction", "path_class", "speed_class",
               "LaneNum", "speed_limit", "level", "width"],
        header=None,
    )
    df = df.merge(df2, on="link", how="left")

    # basename also copes with backslash-separated paths on Windows.
    stem = os.path.basename(input_file_path_his).split(".")[0]
    prefix = f"{mode}_" if is_train else ""
    # to_csv does not create missing directories.
    os.makedirs("./data", exist_ok=True)
    output_file_path = f"./data/{prefix}{stem}" + ".csv"
    df.to_csv(output_file_path, index=False, mode="w", header=True)
if __name__ == "__main__":
    # (traffic file, mode) pairs, processed in order: training set, then test set.
    jobs = [
        ("D:/traffic-fix/input_data/traffic/20190701.txt", "is_train"),
        ("D:/traffic-fix/test_data/20190801_testdata.txt", "is_test"),
    ]

    print(datetime.datetime.now())
    for his_path, run_mode in jobs:
        get_feature(
            input_file_path_his=his_path,
            input_file_path_attr="D:/traffic-fix/input_data/road_attribute/attr.txt",
            input_file_path_topo="D:/traffic-fix/input_data/road_topo/topo.txt",
            mode=run_mode,
        )
    print(datetime.datetime.now())
二:建立模型+模型评估
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score
import warnings
from sklearn.model_selection import KFold, train_test_split
import lightgbm as lgb
import datetime
warnings.filterwarnings("ignore")
def f1_score_eval(preds, valid_df):
    """Custom LightGBM eval metric: class-weighted F1 for the 3-class task.

    Args:
        preds: Raw multiclass predictions handed over by LightGBM. Either a
            flat 1-D array grouped class by class (older LightGBM releases)
            or a 2-D array of shape (n_samples, n_classes) (newer releases).
        valid_df: Dataset being evaluated; only ``get_label()`` is used.

    Returns:
        Tuple ``("f1_score", score, True)`` where ``score`` is
        ``0.2 * F1(class 0) + 0.2 * F1(class 1) + 0.6 * F1(class 2)`` and
        ``True`` tells LightGBM that higher is better.
    """
    labels = valid_df.get_label()
    preds = np.asarray(preds)
    if preds.ndim == 2:
        # Already (n_samples, n_classes): reshaping would scramble the rows.
        pred_labels = np.argmax(preds, axis=1)
    else:
        # Flat class-major layout: one row per class after the reshape.
        pred_labels = np.argmax(preds.reshape(3, -1), axis=0)
    # Pin the label set so three per-class scores always come back, even when
    # a fold contains no sample (true or predicted) of some class.
    scores = f1_score(y_true=labels, y_pred=pred_labels, labels=[0, 1, 2], average=None)
    weighted = scores[0] * 0.2 + scores[1] * 0.2 + scores[2] * 0.6
    return 'f1_score', weighted, True
def lgb_model(train=None, label=None, test=None, use_features=None, categorical_feats=None, n_class=None):
    """Train a 5-fold LightGBM multiclass model and predict labels for ``test``.

    Args:
        train: Training DataFrame containing ``use_features`` and ``label``.
        label: Name of the target column in ``train`` (0-based class ids).
        test: Test DataFrame containing ``use_features`` plus ``link``,
            ``current_slice_id`` and ``future_slice_id``. It is not modified.
        use_features: Feature column names, used for both frames.
        categorical_feats: Forwarded to ``lgb.Dataset(categorical_feature=...)``.
        n_class: Number of classes; must be given.

    Returns:
        DataFrame with columns ``link``, ``current_slice_id``,
        ``future_slice_id`` and ``label``; ``label`` is the arg-max of the
        fold-averaged class probabilities, shifted back to 1-based.
    """
    X_train = train[use_features].values
    Y_train = train[label].values
    X_test = test[use_features].values
    params = {
        'learning_rate': 0.05,
        'boosting_type': 'gbdt',
        'objective': 'multiclass',
        'metric': 'None',  # only the custom feval below is reported
        'num_leaves': 31,
        'num_class': n_class,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'seed': 1,
        'bagging_seed': 1,
        'feature_fraction_seed': 7,
        'min_data_in_leaf': 20,
        'nthread': -1,
        'verbose': -1
    }
    # 5-fold cross-validation with contiguous, unshuffled folds. random_state
    # is omitted because scikit-learn rejects it when shuffle=False (it never
    # had any effect on the folds).
    folds = KFold(n_splits=5, shuffle=False)
    predictions = np.zeros([X_test.shape[0], n_class])
    num_round = 1000
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, Y_train)):
        print("fold n°{}".format(fold_ + 1))
        trn_data = lgb.Dataset(X_train[trn_idx], Y_train[trn_idx], categorical_feature=categorical_feats)
        val_data = lgb.Dataset(X_train[val_idx], Y_train[val_idx], categorical_feature=categorical_feats)
        if hasattr(lgb, "log_evaluation"):
            # Newer LightGBM: logging and early stopping are configured via
            # callbacks (the keyword arguments were removed from lgb.train).
            control = {"callbacks": [lgb.early_stopping(100), lgb.log_evaluation(100)]}
        else:
            # Older LightGBM without the log_evaluation callback.
            control = {"verbose_eval": 100, "early_stopping_rounds": 100}
        clf = lgb.train(params,
                        trn_data,
                        num_round,
                        valid_sets=[trn_data, val_data],
                        feval=f1_score_eval,
                        **control)
        # Average the per-fold test probabilities.
        predictions += clf.predict(X_test, num_iteration=clf.best_iteration) / folds.n_splits
    # Build the result on a copy so the caller's frame is left untouched.
    result = test[["link", 'current_slice_id', 'future_slice_id']].copy()
    result["label"] = np.argmax(predictions, axis=1) + 1
    return result
if __name__ == "__main__":
    # Feature tables written by the preprocessing step.
    train = pd.read_csv('./data/is_train_20190701.csv')
    test = pd.read_csv("./data/20190801_testdata.csv")
    # Everything except the link id and the target is used as a feature.
    del_feature = ['link', 'label']
    use_features = [i for i in train.columns if i not in del_feature]
    # Candidate categorical columns, spelled as they appear in the feature
    # table. Not passed to the model: lgb_model is called with
    # categorical_feats=None below.
    category = ["direction", "path_class", "speed_class", "LaneNum", "level"]
    print(datetime.datetime.now())
    submit = lgb_model(train=train, label="label", test=test, use_features=use_features,
                       categorical_feats=None, n_class=3)
    submit.to_csv('submit.csv', index=False, encoding='utf8')
    print(datetime.datetime.now())