lightgbm用于排序

一.

　　LTR(learning to rank)经常用于搜索排序中，开源工具中比较有名的是微软的ranklib，但是这个好像是单机版的，也有好长时间没有更新了。所以打算利用lightgbm进行排序，但网上关于lightgbm用于排序的代码很少，关于回归和分类的倒是一堆。这里我将贴上python版的lightgbm用于排序的代码，里面包括训练、获取叶结点、ndcg评估、预测以及特征重要度等处理代码，有需要的朋友可以参考一下或进行修改。

　　其实在使用时，本人也对比了ranklib中的lambdamart和lightgbm，令人印象最深刻的是lightgbm的训练速度非常快，快得起飞。可能lambdamart训练需要几个小时，而lightgbm只需要几分钟，但是后面的ndcg测试结果都差不多，不像论文中所说的lightgbm精度高一点。lightgbm的训练速度快，我想最主要的原因可能是：a.节点分裂用到了直方图，而不是预排序方法；b.基于梯度的单边采样，即行采样；c.互斥特征绑定，即列采样；d.基于leaf-wise的决策树生长策略；e.类别特征的支持

二.代码

第一部分代码块是主代码，后面三个代码块是主代码用到的数据加载和ndcg计算模块。运行主代码时通过命令行参数选择功能，例如训练模型使用：python lgb.py -train

  1 import os
  2 import lightgbm as lgb
  3 from sklearn import datasets as ds
  4 import pandas as pd
  5 
  6 import numpy as np
  7 from datetime import datetime
  8 import sys
  9 from sklearn.preprocessing import OneHotEncoder
 10 
 11 def split_data_from_keyword(data_read, data_group, data_feats):
 12     '''
 13     利用pandas
 14     转为lightgbm需要的格式进行保存
 15     :param data_read:
 16     :param data_save:
 17     :return:
 18     '''
 19     with open(data_group, 'w', encoding='utf-8') as group_path:
 20         with open(data_feats, 'w', encoding='utf-8') as feats_path:
 21             dataframe = pd.read_csv(data_read,
 22                                     sep=' ',
 23                                     header=None,
 24                                     encoding="utf-8",
 25                                     engine='python')
 26             current_keyword = ''
 27             current_data = []
 28             group_size = 0
 29             for _, row in dataframe.iterrows():
 30                 feats_line = [str(row[0])]
 31                 for i in range(2, len(dataframe.columns) - 1):
 32                     feats_line.append(str(row[i]))
 33                 if current_keyword == '':
 34                     current_keyword = row[1]
 35                 if row[1] == current_keyword:
 36                     current_data.append(feats_line)
 37                     group_size += 1
 38                 else:
 39                     for line in current_data:
 40                         feats_path.write(' '.join(line))
 41                         feats_path.write('\n')
 42                     group_path.write(str(group_size) + '\n')
 43 
 44                     group_size = 1
 45                     current_data = []
 46                     current_keyword = row[1]
 47                     current_data.append(feats_line)
 48 
 49             for line in current_data:
 50                 feats_path.write(' '.join(line))
 51                 feats_path.write('\n')
 52             group_path.write(str(group_size) + '\n')
 53 
 54 def save_data(group_data, output_feature, output_group):
 55     '''
 56     group与features分别进行保存
 57     :param group_data:
 58     :param output_feature:
 59     :param output_group:
 60     :return:
 61     '''
 62     if len(group_data) == 0:
 63         return
 64     output_group.write(str(len(group_data)) + '\n')
 65     for data in group_data:
 66         # 只包含非零特征
 67         # feats = [p for p in data[2:] if float(p.split(":")[1]) != 0.0]
 68         feats = [p for p in data[2:]]
 69         output_feature.write(data[0] + ' ' + ' '.join(feats) + '\n') # data[0] => level ; data[2:] => feats
 70 
 71 def process_data_format(test_path, test_feats, test_group):
 72     '''
 73      转为lightgbm需要的格式进行保存
 74      '''
 75     with open(test_path, 'r', encoding='utf-8') as fi:
 76         with open(test_feats, 'w', encoding='utf-8') as output_feature:
 77             with open(test_group, 'w', encoding='utf-8') as output_group:
 78                 group_data = []
 79                 group = ''
 80                 for line in fi:
 81                     if not line:
 82                         break
 83                     if '#' in line:
 84                         line = line[:line.index('#')]
 85                     splits = line.strip().split()
 86                     if splits[1] != group: # qid => splits[1]
 87                         save_data(group_data, output_feature, output_group)
 88                         group_data = []
 89                     group = splits[1]
 90                     group_data.append(splits)
 91                 save_data(group_data, output_feature, output_group)
 92 
 93 def load_data(feats, group):
 94     '''
 95     加载数据
 96     分别加载feature,label,query
 97     '''
 98     x_train, y_train = ds.load_svmlight_file(feats)
 99     q_train = np.loadtxt(group)
100     return x_train, y_train, q_train
101 
def load_data_from_raw(raw_data):
    """Read a LETOR-format file through ``letor.read_dataset``.

    :param raw_data: path of the LETOR-format file
    :return: tuple ``(test_X, test_y, test_qids, comments)``
    """
    # `letor` is referenced but never imported at module level, which raises
    # NameError. Assumes the LETOR helper listing is saved as letor.py next to
    # this script -- TODO confirm the module name.
    import letor

    with open(raw_data, 'r', encoding='utf-8') as testfile:
        test_X, test_y, test_qids, comments = letor.read_dataset(testfile)
    return test_X, test_y, test_qids, comments
106 
def train(x_train, y_train, q_train, model_save_path):
    """Train a LambdaRank model with LightGBM and save it.

    :param x_train: features passed to ``lgb.Dataset``
    :param y_train: labels passed as ``label``
    :param q_train: group sizes passed as ``group``
    :param model_save_path: path handed to ``save_model``
    :return: None
    """
    params = dict(
        task='train',  # kind of task to run
        boosting_type='gbrt',  # base learner
        objective='lambdarank',  # ranking objective function
        metric='ndcg',  # evaluation metric
        max_position=10,  # NDCG position to optimise for
        metric_freq=1,  # how often the metric is reported
        train_metric=True,  # report the metric during training
        ndcg_at=[10],
        max_bin=255,  # maximum number of bins (255 is the default)
        num_iterations=500,  # number of boosting iterations
        learning_rate=0.01,  # learning rate
        num_leaves=31,  # number of leaves
        tree_learner='serial',  # 'serial': single-machine tree learner
        min_data_in_leaf=30,  # minimum number of samples in a leaf
        verbose=2,  # print training information
    )
    ranking_set = lgb.Dataset(x_train, label=y_train, group=q_train)
    # The training set doubles as the validation set.
    booster = lgb.train(params, ranking_set, valid_sets=[ranking_set])
    booster.save_model(model_save_path)
132 
def predict(x_test, comments, model_input_path):
    """Score the samples and return their comments ordered by score.

    :param x_test: features passed to ``Booster.predict``
    :param comments: array indexed with the sorted positions
    :param model_input_path: path of the saved model file
    :return: ``comments`` reordered from highest to lowest predicted score
    """
    booster = lgb.Booster(model_file=model_input_path)  # load the model
    scores = booster.predict(x_test)
    # argsort is ascending; reversing it gives indexes from high to low score.
    descending = np.argsort(scores)[::-1]
    return comments[descending]
146 
def test_data_ndcg(model_path, test_path, k=60):
    """Print the average NDCG@k of a saved model on a LETOR-format test file.

    :param model_path: path of the saved LightGBM model
    :param test_path: path of the LETOR-format test file
    :param k: NDCG cutoff passed to ``ndcg.validate``; defaults to 60
    :return: None
    """
    # `letor` and `ndcg` are referenced but never imported at module level,
    # which raises NameError. Assumes the helper listings are saved as
    # letor.py and ndcg.py next to this script -- TODO confirm the names.
    import letor
    import ndcg

    with open(test_path, 'r', encoding='utf-8') as testfile:
        test_X, test_y, test_qids, comments = letor.read_dataset(testfile)

    gbm = lgb.Booster(model_file=model_path)
    test_predict = gbm.predict(test_X)

    # Average NDCG over all qids.
    average_ndcg, _ = ndcg.validate(test_qids, test_y, test_predict, k)
    print("all qid average ndcg: ", average_ndcg)
    print("job done!")
161 
def plot_print_feature_importance(model_path):
    """Print the split-count importance of every feature the model uses.

    One line is printed per feature with non-zero importance:
    ``<1-based position> : <display name> : <importance> : <share of total>``.
    If the model file does not exist, a message is printed and the process
    exits.

    :param model_path: path of the saved LightGBM model
    :return: None
    """
    # The model names its features Column_<n>; this maps them to real names.
    # Features without an entry are printed under their model name.
    feats_dict = {
        'Column_0': '特征0名称',
        'Column_1': '特征1名称',
        'Column_2': '特征2名称',
        'Column_3': '特征3名称',
        'Column_4': '特征4名称',
        'Column_5': '特征5名称',
        'Column_6': '特征6名称',
        'Column_7': '特征7名称',
        'Column_8': '特征8名称',
        'Column_9': '特征9名称',
        'Column_10': '特征10名称',
    }
    if not os.path.exists(model_path):
        print("file no exists! {}".format(model_path))
        sys.exit(0)

    gbm = lgb.Booster(model_file=model_path)

    importances = gbm.feature_importance(importance_type='split')
    feature_names = gbm.feature_name()

    # Float start value keeps the ratio below a true division result.
    total = sum(importances, 0.)

    # The 1-based position equals <n> + 1 for Column_<n> names and still works
    # when the model carries custom feature names.
    for feat_id, (feature_name, importance) in enumerate(zip(feature_names, importances), start=1):
        if importance != 0:
            display_name = feats_dict.get(feature_name, feature_name)
            print('{} : {} : {} : {}'.format(feat_id, display_name, importance, importance / total))
198 
def get_leaf_index(data, model_path):
    """Predict leaf indexes for ``data`` and one-hot encode them.

    The encoding of the first sample is printed as a preview.

    :param data: samples passed to ``Booster.predict`` with ``pred_leaf=True``
    :param model_path: path of the saved LightGBM model
    :return: the one-hot encoded leaf indexes produced by ``OneHotEncoder``
    """
    gbm = lgb.Booster(model_file=model_path)
    leaf_indexes = gbm.predict(data, pred_leaf=True)

    one_hot_encoder = OneHotEncoder()
    x_one_hot = one_hot_encoder.fit_transform(leaf_indexes)
    # Densify only the first row instead of the whole matrix for the preview.
    print(x_one_hot[0].toarray()[0])
    return x_one_hot
209 
if __name__ == '__main__':
    # Command-line entry point: exactly one flag selects the action to run.
    model_path = "保存模型的路径"
    usage = "Usage: python main.py [-process | -train | -predict | -ndcg | -feature | -leaf]"

    if len(sys.argv) != 2:
        print(usage)
        sys.exit(0)

    mode = sys.argv[1]

    if mode == '-process':
        # The raw training samples use the same format as ranklib, but they
        # must be converted to what LightGBM ranking expects: features and
        # group sizes are stored in two separate text files, e.g.
        #
        #   feats:
        #     1 1:0.2 2:0.4 ...
        #     2 1:0.2 2:0.4 ...
        #     1 1:0.2 2:0.4 ...
        #     3 1:0.2 2:0.4 ...
        #   group:
        #     2
        #     2
        #
        # Each group line is the number of consecutive feature rows that
        # belong to one qid: here the first two rows, then the last two.
        raw_data_path = '训练样本集路径'
        data_feats = '特征保存路径'
        data_group = 'group保存路径'
        process_data_format(raw_data_path, data_feats, data_group)

    elif mode == '-train':
        train_start = datetime.now()
        data_feats = '特征保存路径'
        data_group = 'group保存路径'
        x_train, y_train, q_train = load_data(data_feats, data_group)
        train(x_train, y_train, q_train, model_path)
        train_end = datetime.now()
        # total_seconds() includes whole days; .seconds would drop them.
        consume_time = int((train_end - train_start).total_seconds())
        print("consume time : {}".format(consume_time))

    elif mode == '-predict':
        train_start = datetime.now()
        raw_data_path = '需要预测的数据路径'  # same format as ranklib data
        test_X, test_y, test_qids, comments = load_data_from_raw(raw_data_path)
        t_results = predict(test_X, comments, model_path)
        train_end = datetime.now()
        consume_time = int((train_end - train_start).total_seconds())
        print("consume time : {}".format(consume_time))

    elif mode == '-ndcg':
        test_path = '测试的数据路径'  # data used to evaluate the average ndcg
        test_data_ndcg(model_path, test_path)

    elif mode == '-feature':
        plot_print_feature_importance(model_path)

    elif mode == '-leaf':
        # Use the model to get the one-hot leaf representation of the samples.
        # `letor` is never imported at module level; assumes the LETOR helper
        # listing is saved as letor.py next to this script -- TODO confirm.
        import letor

        raw_data = '测试数据路径'
        with open(raw_data, 'r', encoding='utf-8') as testfile:
            test_X, test_y, test_qids, comments = letor.read_dataset(testfile)
        get_leaf_index(test_X, model_path)

    else:
        # An unrecognised flag used to do nothing at all; show the usage.
        print(usage)

  1 """
  2 
  3 Various utilities for converting data from/to Microsoft's LETOR format.
  4 
  5 """
  6 
  7 import numpy as np
  8 import re
  9 import sklearn.externals.six
 10 from sklearn.externals.six import moves
 11 range = moves.range
 12 
 13 
 14 def iter_lines(lines, has_targets=True, one_indexed=True, missing=0.0):
 15     """Transforms an iterator of lines to an iterator of LETOR rows.
 16 
 17     Each row is represented by a (x, y, qid, comment) tuple.
 18 
 19     Parameters
 20     ----------
 21     lines : iterable of lines
 22         Lines to parse.
 23     has_targets : bool, optional
 24         Whether the file contains targets. If True, will expect the first token
 25         of every line to be a real representing the sample's target (i.e.
 26         score). If False, will use -1 as a placeholder for all targets.
 27     one_indexed : bool, optional 特征id从1开始的转为从0开始
 28         Whether feature ids are one-indexed. If True, will subtract 1 from each
 29         feature id.
 30     missing : float, optional
 31         Placeholder to use if a feature value is not provided for a sample.
 32 
 33     Yields
 34     ------
 35     x : array of floats
 36         Feature vector of the sample.
 37     y : float
 38         Target value (score) of the sample, or -1 if no target was parsed.
 39     qid : object
 40         Query id of the sample. This is currently guaranteed to be a string.
 41     comment : str
 42         Comment accompanying the sample.
 43 
 44     """
 45     for line in lines:
 46         data, _, comment = line.rstrip().partition('#')
 47         toks = data.strip().split()
 48         #toks = line.rstrip()
 49         #toks = re.split('\s+', toks.strip())
 50         #print("toks: ", toks)
 51         #comment = "no comment"
 52         num_features = 0 # 统计特征个数
 53         x = np.repeat(missing, 8)
 54         y = -1.0
 55         if has_targets:
 56             y = float(toks[0].strip()) # 相关度label
 57             toks = toks[1:]
 58         # qid:1 => 1
 59         qid = _parse_qid_tok(toks[0].strip())
 60         
 61         # feature(id:value)
 62         for tok in toks[1:]:
 63             #fid, _, val = tok.strip().partition(':') # fid,_,val => featureID,:,featureValue
 64             fid, val = tok.split(":") # featureID:featureValue
 65             fid = int(fid)
 66             val = float(val)
 67             if one_indexed:
 68                 fid -= 1
 69             assert fid >= 0
 70             while len(x) <= fid:
 71                 orig = len(x)
 72                 #x=np.resize(x,(len(x) * 2))
 73                 x.resize(len(x) * 2)
 74                 x[orig:orig * 2] = missing
 75             x[fid] = val
 76             num_features = max(fid + 1, num_features)
 77 
 78         assert num_features > 0
 79         x.resize(num_features)
 80 
 81         yield (x, y, qid, comment)
 82 
 83 
 84 def read_dataset(source, has_targets=True, one_indexed=True, missing=0.0):
 85     """Parses a LETOR dataset from `source`.
 86 
 87     Parameters
 88     ----------
 89     source : string or iterable of lines
 90         String, file, or other file-like object to parse.
 91     has_targets : bool, optional
 92         See `iter_lines`.
 93     one_indexed : bool, optional
 94         See `iter_lines`.
 95     missing : float, optional
 96         See `iter_lines`.
 97 
 98     Returns
 99     -------
100     X : array of arrays of floats
101       Feature matrix (see `iter_lines`).
102     y : array of floats
103         Target vector (see `iter_lines`).
104     qids : array of objects
105         Query id vector (see `iter_lines`).
106     comments : array of strs
107         Comment vector (see `iter_lines`).
108     """
109     if isinstance(source, sklearn.externals.six.string_types):
110         source = source.splitlines(True)
111 
112     max_width = 0 # 某行最多特征个数
113     xs, ys, qids, comments = [], [], [], []
114     iter_content = iter_lines(source, has_targets=has_targets,
115                     one_indexed=one_indexed, missing=missing)
116     # x:特征向量; y:float 相关度值[0-4]; qid:string query id; comment: #后面内容
117     for x, y, qid, comment in iter_content:
118         xs.append(x)
119         ys.append(y)
120         qids.append(qid)
121         comments.append(comment)
122         max_width = max(max_width, len(x))
123 
124     assert max_width > 0
125     # X.shape = [len(xs), max_width]
126     X = np.ndarray((len(xs), max_width), dtype=np.float64)
127     X.fill(missing)
128     for i, x in enumerate(xs):
129         X[i, :len(x)] = x
130     ys = np.array(ys) if has_targets else None
131     qids = np.array(qids)
132     comments = np.array(comments)
133 
134     return (X, ys, qids, comments)
135 
136 
137 def _parse_qid_tok(tok):
138     assert tok.startswith('qid:')
139     return tok[4:]

 1 import numpy as np
 2 import sklearn.externals.six
 3 from sklearn.externals.six import moves
 4 range = moves.range
 5 
 6 
 7 def iter_lines(lines):
 8     for line in lines:
 9         toks = line.split()
10         qid = toks[0]
11         target = float(toks[4])
12         pred = float(toks[5])
13         yield (qid, target, pred)
14 
def read_dataset(source):
    """Parse query ids, targets and predictions from `source`.

    Parameters
    ----------
    source : string or iterable of lines
        A string is split into lines first; anything else is iterated as is.

    Returns
    -------
    (qids, targets, preds) : tuple of numpy arrays
        One entry per line yielded by `iter_lines`.
    """
    # Plain `str` replaces sklearn.externals.six.string_types, which no longer
    # exists in current scikit-learn releases.
    if isinstance(source, str):
        source = source.splitlines(True)

    rows = list(iter_lines(source))
    qids = np.array([row[0] for row in rows])
    targets = np.array([row[1] for row in rows])
    preds = np.array([row[2] for row in rows])

    return (qids, targets, preds)

  1 import numpy as np
  2 import collections
  3 
  4 def validate(qids, targets, preds, k):
  5     """
  6     Predicts the scores for the test dataset and calculates the NDCG value.
  7     Parameters
  8     ----------
  9     data : Numpy array of documents
 10         Numpy array of documents with each document's format is [relevance score, query index, feature vector]
 11     k : int
 12         this is used to compute the NDCG@k
 13 
 14     Returns
 15     -------
 16     average_ndcg : float
 17         This is the average NDCG value of all the queries
 18     predicted_scores : Numpy array of scores
 19         This contains an array or the predicted scores for the documents.
 20     """
 21     query_groups = get_groups(qids)  # (qid,from,to),一个元组,表示这个qid的样本从哪到哪
 22     all_ndcg = []
 23     every_qid_ndcg = collections.OrderedDict()
 24 
 25     for qid, a, b in query_groups:
 26         predicted_sorted_indexes = np.argsort(preds[a:b])[::-1] # 从大到小的索引
 27         t_results = targets[a:b] # 目标数据的相关度
 28         t_results = t_results[predicted_sorted_indexes] #是predicted_sorted_indexes排好序的在test_data中的相关度
 29 
 30         dcg_val = dcg_k(t_results, k)
 31         idcg_val = ideal_dcg_k(t_results, k)
 32         ndcg_val = (dcg_val / idcg_val)
 33         all_ndcg.append(ndcg_val)
 34         every_qid_ndcg.setdefault(qid, ndcg_val)
 35 
 36     average_ndcg = np.nanmean(all_ndcg)
 37     return average_ndcg, every_qid_ndcg
 38 
 39 
 40     '''
 41     for query in query_indexes:
 42         results = np.zeros(len(query_indexes[query]))
 43 
 44         for tree in self.trees:
 45             results += self.learning_rate * tree.predict(data[query_indexes[query], 2:])
 46         predicted_sorted_indexes = np.argsort(results)[::-1]
 47         t_results = data[query_indexes[query], 0] # 第0列的相关度
 48         t_results = t_results[predicted_sorted_indexes]
 49 
 50         dcg_val = dcg_k(t_results, k)
 51         idcg_val = ideal_dcg_k(t_results, k)
 52         ndcg_val = (dcg_val / idcg_val)
 53         average_ndcg.append(ndcg_val)
 54     average_ndcg = np.nanmean(average_ndcg)
 55     return average_ndcg
 56 '''
 57 
 58 def get_groups(qids):
 59     """Makes an iterator of query groups on the provided list of query ids.
 60 
 61     Parameters
 62     ----------
 63     qids : array_like of shape = [n_samples]
 64         List of query ids.
 65 
 66     Yields
 67     ------
 68     row : (qid, int, int)
 69         Tuple of query id, from, to.
 70         ``[i for i, q in enumerate(qids) if q == qid] == range(from, to)``
 71 
 72     """
 73     prev_qid = None
 74     prev_limit = 0
 75     total = 0
 76 
 77     for i, qid in enumerate(qids):
 78         total += 1
 79         if qid != prev_qid:
 80             if i != prev_limit:
 81                 yield (prev_qid, prev_limit, i)
 82             prev_qid = qid
 83             prev_limit = i
 84 
 85     if prev_limit != total:
 86         yield (prev_qid, prev_limit, total)
 87 
 88 def group_queries(training_data, qid_index):
 89     """
 90         Returns a dictionary that groups the documents by their query ids.
 91         Parameters
 92         ----------
 93         training_data : Numpy array of lists
 94             Contains a list of document information. Each document's format is [relevance score, query index, feature vector]
 95         qid_index : int
 96             This is the index where the qid is located in the training data
 97 
 98         Returns
 99         -------
100         query_indexes : dictionary
101             The keys were the different query ids and teh values were the indexes in the training data that are associated of those keys.
102     """
103     query_indexes = {}  # 每个qid对应的样本索引范围,比如qid=1020,那么此qid在training data中的训练样本从0到100的范围, { key=str,value=[] }
104     index = 0
105     for record in training_data:
106         query_indexes.setdefault(record[qid_index], [])
107         query_indexes[record[qid_index]].append(index)
108         index += 1
109     return query_indexes
110 
111 
def dcg_k(scores, k):
    """
        Returns the DCG of the first k entries of ``scores``.
        Parameters
        ----------
        scores : list
            Contains labels in a certain ranked order
        k : int
            Number of leading entries that contribute to the DCG

        Returns
        -------
        DCG_val: float
            Sum of ``(2 ** label - 1) / log2(rank + 2)`` over the first k
            entries, with rank starting at 0
    """
    gains = [
        (np.power(2, label) - 1) / np.log2(rank + 2)
        for rank, label in enumerate(scores[:k])
    ]
    return np.sum(gains)
131 
132 
def ideal_dcg_k(scores, k):
    """
        Returns the ideal DCG of ``scores`` truncated to k values.
        Parameters
        ----------
        scores : list
            Contains labels in any order
        k : int
            Number of leading entries that contribute to the DCG

        Returns
        -------
        Ideal_DCG_val: float
            DCG of the labels after sorting them from highest to lowest
    """
    # Best possible ranking: labels in descending order.
    best_order = sorted(scores, reverse=True)
    return dcg_k(best_order, k)

猜你喜欢