1: 对整体数据的处理
# Section 1 preprocessing: cast bedrooms to int, keep only the modelling
# columns, then drop out-of-range rows one filter at a time.
selected_columns = ['longitude', 'latitude', 'price', 'buildingTypeId', 'bedrooms', 'daysOnMarket']
kept_building_type_ids = [3, 1, 6, 19, 12, 17, 13, 7, 16, 14]
kept_bedroom_counts = [0, 1, 2, 3, 4, 5, 6, 7]

train_data['bedrooms'] = train_data['bedrooms'].astype(int)
test_data['bedrooms'] = test_data['bedrooms'].astype(int)
train_data = train_data[selected_columns]
test_data = test_data[selected_columns]

# Column dtypes first, then shapes, before any filtering.
for frame in (train_data, test_data):
    print(frame.dtypes)
for frame in (train_data, test_data):
    print(frame.shape)

# The plots show severe longitude outliers; only rows with longitude < -3 are kept.
train_data = train_data.loc[train_data['longitude'] < -3]
test_data = test_data.loc[test_data['longitude'] < -3]
for frame in (train_data, test_data):
    print(frame.shape)

# Range filters read off the plots: latitude > 40, latitude < 58,
# price < 900,000, bedrooms <= 8.
# NOTE(review): the training set uses latitude > 42 while the test set uses
# latitude > 40 -- confirm which bound is intended.
train_data = train_data.loc[train_data['latitude'] > 42]
train_data = train_data.loc[train_data['latitude'] < 58]
train_data = train_data.loc[train_data['price'] < 900000]
train_data = train_data.loc[train_data['bedrooms'] <= 8]
train_data = train_data.loc[train_data['daysOnMarket'] < 40]
# Drop the row(s) holding the smallest longitude that survived the filters above.
train_data = train_data.loc[train_data['longitude'] != train_data['longitude'].min()]
train_data = train_data.loc[train_data['buildingTypeId'].isin(kept_building_type_ids)]
train_data = train_data.loc[train_data['bedrooms'].isin(kept_bedroom_counts)]

test_data = test_data.loc[test_data['latitude'] > 40]
test_data = test_data.loc[test_data['latitude'] < 58]
test_data = test_data.loc[test_data['price'] < 900000]
test_data = test_data.loc[test_data['bedrooms'] <= 8]
test_data = test_data.loc[test_data['daysOnMarket'] < 40]
test_data = test_data.loc[test_data['buildingTypeId'].isin(kept_building_type_ids)]
test_data = test_data.loc[test_data['bedrooms'].isin(kept_bedroom_counts)]

for frame in (train_data, test_data):
    print(frame.shape)
# 利用盒图去除离群点,只在price,longitude,latitude,daysonmarket中考虑;
# 观测效果并不是特别理想;去掉daysonmarkt之后要好一点;
def remove_filers_with_boxplot(data):
    """Drop rows whose longitude, latitude or price is a box-plot flier.

    A flier is a value outside ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]``, the
    default whisker rule of a box plot. The fences are computed directly
    from the column quantiles instead of being read back from drawn plot
    artists, so each column is matched by name (not by its position in the
    plot) and no figure is produced.

    Args:
        data: DataFrame containing numeric 'longitude', 'latitude' and
            'price' columns.

    Returns:
        A DataFrame with the flier rows removed. Rows whose value is NaN in
        one of the three columns are kept.
    """
    # Compute every fence on the unfiltered data first, so that removing
    # fliers of one column does not shift the quartiles of the next one.
    fences = {}
    for column in ('longitude', 'latitude', 'price'):
        q1 = data[column].quantile(0.25)
        q3 = data[column].quantile(0.75)
        whisker = 1.5 * (q3 - q1)
        fences[column] = (q1 - whisker, q3 + whisker)

    for column, (lower, upper) in fences.items():
        values = data[column]
        is_flier = (values < lower) | (values > upper)
        data = data[~is_flier]
    return data
# train_data = remove_filers_with_boxplot(train_data)
# print(train_data.shape)
# 根据分类来去除离群点;
def use_pivot_box_to_remove_fliers(data, pivot_columns_list, pivot_value_list):
    """Remove per-category box-plot fliers from ``data``.

    For every (category column, value column) pair the rows are grouped by
    the category column, and a row is dropped when its value lies outside
    ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]`` of its own group. A value is only
    removed from the category in which it is a flier; equal values in other
    categories are kept. Pairs are processed one after another, each on the
    rows that survived the previous pairs.

    Args:
        data: DataFrame to filter.
        pivot_columns_list: names of the categorical columns to group by.
        pivot_value_list: names of the numeric columns to check for fliers.

    Returns:
        The filtered DataFrame. Rows whose category or value is NaN get no
        usable fence and are kept.
    """
    for column in pivot_columns_list:
        for value in pivot_value_list:
            grouped = data.groupby(column)[value]
            q1 = grouped.quantile(0.25)
            q3 = grouped.quantile(0.75)
            whisker = 1.5 * (q3 - q1)
            # Broadcast each group's fences back onto the rows of that group.
            lower = data[column].map(q1 - whisker)
            upper = data[column].map(q3 + whisker)
            is_flier = (data[value] < lower) | (data[value] > upper)
            data = data[~is_flier]
    return data
# Cast the categorical columns of the training set to str before the
# per-category outlier removal, printing the category frequencies of both sets.
train_data['buildingTypeId'] = train_data['buildingTypeId'].astype(str)
print(train_data.dtypes)
print(train_data['buildingTypeId'].value_counts())
print(test_data['buildingTypeId'].value_counts())
train_data['bedrooms'] = train_data['bedrooms'].astype(str)
print(train_data['bedrooms'].value_counts())
print(test_data['bedrooms'].value_counts())
# Rebind train_data itself so the filtered frame is the one that is printed
# and used from here on.
train_data = use_pivot_box_to_remove_fliers(train_data,['buildingTypeId','bedrooms'],['longitude','latitude','price','daysOnMarket'])
print(train_data.shape)
# train_data.to_csv('./month_6_train_1.csv',index=False)
# test_data.to_csv('./test_data_1.csv',index=False)
train_data = train_data.dropna()
# train_data['longitude'] = abs(train_data['longitude'])
# train_data['longitude'] = np.log1p(np.log1p(train_data['longitude']))
2:对bedrooms和bathroomtotal的处理:
for bedrooms in data["bedrooms"]: # print(bedrooms) if isinstance(bedrooms,float): bedrooms_list.append(int(bedrooms)) else: bedrooms_list.append(int(eval(bedrooms))) data["bedrooms"] = bedrooms_list bathroom_total_list = [] for bathroom_total in data["bathroomTotal"]: bathroom_total_list.append(int(bathroom_total)) data["bathroomTotal"] = bathroom_total_list return data
3:把类别型的数据按类别拆分开来分别进行预测:(这种方式合并之后的结果与不拆分时一样,也就是说,一些算法内部本来就是把不同类别
分开进行预测的,预测的具体数值是通过数值型数据计算出来的)但也有可能其他一些算法(如深度学习)内部并不是这样处理的;
另外一种做法是减少类别特征取值的个数:
# -*- coding:utf-8 _*-
"""
@author:Administrator
@file: devided_category_to_predict.py
@time: 2018/9/25
"""
# -*- coding:utf-8 _*-
"""
@author:Administrator
@file: data_analysis.py
@time: 2018/9/25
"""
import pandas as pd

train_data = pd.read_csv("./input/month_567_data.csv")
test_data = pd.read_csv("./input/hose_info_201808_predict_2.csv")


def _parse_room_count(raw_value):
    """Convert one raw room-count cell to an int.

    Numeric cells are truncated with ``int()``. Text cells may hold a plain
    number or a sum such as "3 + 1"; the terms are added and the total is
    truncated.

    NOTE: this replaces ``int(eval(cell))``. ``eval`` executes arbitrary
    expressions read from the CSV file; only '+'-separated numbers are
    accepted here, anything else raises ValueError.
    """
    if isinstance(raw_value, str):
        return int(sum(float(term) for term in raw_value.split('+')))
    return int(raw_value)


# Preprocess the data
def preprocess_data(data):
    """Select the modelling columns, drop incomplete rows and cast room counts.

    Args:
        data: raw DataFrame containing at least the columns selected below.

    Returns:
        A new DataFrame with rows containing NaN removed and 'bedrooms' /
        'bathroomTotal' converted to int. The input frame is not modified.
    """
    data = data[[
        "longitude",
        "latitude",
        "city",
        "province",
        "price",
        "tradeTypeId",
        "listingDate",
        "buildingTypeId",
        "bedrooms",
        "bathroomTotal",
        'postalCode',
        'daysOnMarket',
        'ownerShipType'
    ]]
    # data = data[data.tradeTypeId == 1]
    # data = data.drop(columns=['tradeTypeId'])
    print('data shape=%s before dropna' % (str(data.shape)))
    data = data.dropna(axis=0)
    # assign() builds a new frame, so the slice selected above is never
    # written to in place.
    return data.assign(
        bedrooms=[_parse_room_count(bedrooms) for bedrooms in data["bedrooms"]],
        bathroomTotal=[int(bathroom_total) for bathroom_total in data["bathroomTotal"]],
    )


def _split_listing_date(date):
    """Split a 'Y/M/D' or 'Y-M-D' string into its parts ('/' takes precedence)."""
    for separator in ('/', '-'):
        if separator in date:
            return date.split(separator)
    raise ValueError('unrecognized listingDate format: %r' % (date,))


def date_processing(data):
    """Replace 'listingDate' with 'year' (int) and 'month' (str) columns.

    Args:
        data: DataFrame with a string 'listingDate' column whose parts are
            separated by '/' or '-'.

    Returns:
        A new DataFrame with 'year' and 'month' appended and 'listingDate'
        removed. The input frame is not modified.

    Raises:
        ValueError: if a date contains neither '/' nor '-'.
    """
    date_parts = [_split_listing_date(date) for date in data['listingDate']]
    data = data.assign(
        year=[int(parts[0]) for parts in date_parts],
        month=[parts[1] for parts in date_parts],
        # day=[parts[2] for parts in date_parts],
    )
    return data.drop(columns='listingDate')


def show_value_counts(data, columns):
    """Print the value counts of each listed column, then the frame's shape."""
    for column in columns:
        print(data[column].value_counts())
    print(data.shape)


# For now, data is selected so that the largest and the smallest category
# differ in size by at most about a factor of 10:
# city:


def _keep_frequent_categories(data, column, threshold_value, label):
    """Keep only rows whose ``column`` value occurs more than ``threshold_value`` times.

    Prints the number of distinct values before and after filtering, prefixed
    with ``label``. Frequencies are counted once with value_counts() instead
    of rescanning the frame for every distinct value.
    """
    print('%s nums before process:' % label, len(set(data[column])))
    counts = data[column].value_counts()
    frequent_values = counts[counts > threshold_value].index
    print('%s nums after process:' % label, len(frequent_values))
    # Keep only the rows that satisfy the frequency condition.
    return data[data[column].isin(frequent_values)]


# Process city
def process_city(train_data, threshold_value):
    """Keep rows whose city has more than ``threshold_value`` rows."""
    return _keep_frequent_categories(train_data, 'city', threshold_value, 'city')


# Process postalCode
def get_category_class_bigger_than_threshold_value_postalcode(data, column, threshold_value):
    """Keep rows whose ``column`` value has more than ``threshold_value`` rows."""
    return _keep_frequent_categories(data, column, threshold_value, 'postalCode')


# Process province
def get_category_class_bigger_than_threshold_value_province(data, column, threshold_value):
    """Keep rows whose ``column`` value has more than ``threshold_value`` rows."""
    return _keep_frequent_categories(data, column, threshold_value, 'province')


# Process buildingTypeId
def get_category_class_bigger_than_threshold_value_buildingTypeId(data, column, threshold_value):
    """Keep rows whose ``column`` value has more than ``threshold_value`` rows."""
    return _keep_frequent_categories(data, column, threshold_value, 'buildingTypeId')


# Process ownerShipType
def get_category_class_bigger_than_threshold_value_ownerShipType(data, column, threshold_value):
    """Keep rows whose ``column`` value has more than ``threshold_value`` rows."""
    return _keep_frequent_categories(data, column, threshold_value, 'ownerShipType')


# Process bedrooms
def get_category_class_bigger_than_threshold_value_bedrooms(data, column, threshold_value):
    """Keep rows whose ``column`` value has more than ``threshold_value`` rows."""
    return _keep_frequent_categories(data, column, threshold_value, 'bedrooms')


# Process bathroomTotal
def get_category_class_bigger_than_threshold_value_bathroomTotal(data, column, threshold_value):
    """Keep rows whose ``column`` value has more than ``threshold_value`` rows."""
    return _keep_frequent_categories(data, column, threshold_value, 'bathroomTotal')


if __name__ == '__main__':
    # Preprocess the data
    train_data = preprocess_data(train_data)
    test_data = preprocess_data(test_data)

    train_data = date_processing(train_data)
    test_data = date_processing(test_data)

    category_variable = ['province', 'city', 'tradeTypeId', 'buildingTypeId',
                         'bedrooms', 'bathroomTotal', 'postalCode', 'ownerShipType',
                         'year', 'month',
                         # 'daysOnMarket'
                         ]

    # Process city
    train_data = process_city(train_data, 100)

    # Process postalCode
    train_data = get_category_class_bigger_than_threshold_value_postalcode(train_data, 'postalCode', 10)

    # Process province
    train_data = get_category_class_bigger_than_threshold_value_province(train_data, 'province', 100)

    train_data = get_category_class_bigger_than_threshold_value_buildingTypeId(train_data, 'buildingTypeId', 100)

    # Process ownerShipType
    train_data = get_category_class_bigger_than_threshold_value_ownerShipType(train_data, 'ownerShipType', 100)

    # Process bedrooms
    train_data = get_category_class_bigger_than_threshold_value_bedrooms(train_data, 'bedrooms', 100)

    # Process bathroomTotal
    train_data = get_category_class_bigger_than_threshold_value_bathroomTotal(train_data, 'bathroomTotal', 100)

    show_value_counts(train_data, category_variable)

# 3: With TensorFlow's DNN algorithm, the latitude and longitude features can
# be converted into grid data; TensorFlow ships a built-in feature-engineering
# tool, feature_column. For details see:
# https://tensorflow.google.cn/api_docs/python/tf/feature_column
4:获取坐标的异常值;
def get_outlier(x, y, init_point_count, distance, least_point_count):
    """Return the coordinates of points that have too few close neighbours.

    For every point the number of points within ``distance`` (Euclidean) is
    counted, starting from ``init_point_count``. Each point is compared with
    every point including itself, so with a non-negative ``distance`` it
    always counts itself once. A point is reported, and printed, when its
    count is below ``least_point_count + 1``.

    The count restarts from ``init_point_count`` for every point, so the
    parameter applies uniformly rather than only to the first point.

    Args:
        x: sequence of x coordinates.
        y: sequence of y coordinates, paired element-wise with ``x``.
        init_point_count: starting value of each point's neighbour count.
        distance: maximum distance at which another point counts as a neighbour.
        least_point_count: minimum number of neighbours, on top of the
            starting count and the point itself, for a point to be kept.

    Returns:
        Two lists ``(x_outliers_list, y_outliers_list)`` with the coordinates
        of the reported points, in input order.
    """
    points = list(zip(x, y))
    x_outliers_list = []
    y_outliers_list = []
    for x_i, y_i in points:
        neighbour_count = init_point_count
        for x_j, y_j in points:
            d = np.sqrt(np.square(x_i - x_j) + np.square(y_i - y_j))
            if d <= distance:
                neighbour_count += 1
        if neighbour_count < least_point_count + 1:
            x_outliers_list.append(x_i)
            y_outliers_list.append(y_i)
            print(x_i, y_i)
    return x_outliers_list, y_outliers_list


x, y = get_outlier(x, y, 0, 10, 1)
print(x, y)
-----------------特征之间的关联热度图:(只能计算数值型的)
# corrmat = data.corr()
# cols = corrmat.nlargest(k, 'daysOnMarket')['daysOnMarket'].index
# cm = np.corrcoef(data[cols].values.T)
# sns.set(font_scale=1.25)
# hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10}, yticklabels=cols.values, xticklabels=cols.values)
# plt.show()
# Label encoding
import pandas as pd


def label_encode(data):
    """Replace every text column of ``data`` with string-typed integer labels.

    Each object-dtype or string-dtype column is factorized with its distinct
    values sorted, the resulting codes are shifted by one so labels start at
    1, and the labels are stored as strings. pandas' factorize reports
    missing values as code -1, so they end up as the label '0'.

    Checking for string dtypes as well as ``object`` keeps the function
    working on frames whose text columns use a dedicated pandas string dtype
    instead of ``object``.

    Args:
        data: DataFrame to encode; it is modified in place.

    Returns:
        The same DataFrame object, for call chaining.
    """
    for column in data.columns:
        series = data[column]
        if pd.api.types.is_object_dtype(series) or pd.api.types.is_string_dtype(series):
            codes, _ = pd.factorize(series, sort=True)
            data[column] = pd.Series(codes + 1, index=data.index).astype('str')
    return data
还有一种方法是直接使用 sklearn.preprocessing 下的 LabelEncoder