业务相关的数据处理方式

1: 对整体数据的处理
# Whole-dataset cleaning script: cast bedrooms to int, keep six columns, then
# apply a sequence of range/category filters to the train and test frames.
# `train_data` and `test_data` are expected to exist before this point.
train_data['bedrooms'] = train_data['bedrooms'].astype(int)
test_data['bedrooms'] = test_data['bedrooms'].astype(int)

# Keep only the columns used below.
train_data = train_data[['longitude', 'latitude', 'price', 'buildingTypeId', 'bedrooms','daysOnMarket']]
test_data = test_data[['longitude', 'latitude', 'price', 'buildingTypeId', 'bedrooms','daysOnMarket']]

# Print the column dtypes
print(train_data.dtypes)
print(test_data.dtypes)

# Print the shapes
print(train_data.shape)
print(test_data.shape)

# The plots show severe longitude outliers; keep only rows with longitude < -3
train_data = train_data[train_data.longitude < -3]
test_data = test_data[test_data.longitude <-3]

print(train_data.shape)
print(test_data.shape)

# Based on the plots: latitude > 40, latitude < 58, price < 900k, bedrooms <= 8
# NOTE(review): the train filter uses latitude > 42 while this comment and the
# test filter use 40 - confirm which bound is intended.
train_data = train_data[train_data.latitude>42]
train_data = train_data[train_data.latitude<58]
train_data = train_data[train_data.price<900000]
train_data = train_data[train_data.bedrooms<=8]
train_data = train_data[train_data.daysOnMarket<40]
# Drop the row(s) holding the smallest longitude that remains after the filters
# above (train only; the test frame has no equivalent step).
train_data = train_data[train_data.longitude != train_data['longitude'].min()]
train_data = train_data[train_data.buildingTypeId.isin([3,1,6,19,12,17,13,7,16,14])]
# Together with the `<= 8` filter above this leaves bedrooms 0..7 only.
train_data = train_data[train_data.bedrooms.isin([0,1,2,3,4,5,6,7])]

test_data = test_data[test_data.latitude>40]
test_data = test_data[test_data.latitude<58]
test_data = test_data[test_data.price<900000]
test_data = test_data[test_data.bedrooms<=8]
test_data = test_data[test_data.daysOnMarket<40]
test_data = test_data[test_data.buildingTypeId.isin([3,1,6,19,12,17,13,7,16,14])]
test_data = test_data[test_data.bedrooms.isin([0,1,2,3,4,5,6,7])]

print(train_data.shape)
print(test_data.shape)

# Remove box-plot outliers, considering only longitude, latitude and price.
# (Results were not great; leaving daysOnMarket out worked a little better.)
def remove_filers_with_boxplot(data):
    """Drop rows that a box plot would mark as fliers.

    A value is a flier when it lies below ``Q1 - 1.5 * IQR`` or above
    ``Q3 + 1.5 * IQR`` of its column. The fences are computed per column
    from the frame as passed in, for ``longitude``, ``latitude`` and
    ``price``. Columns are looked up by name, so the result does not
    depend on column order or on which other columns are present.
    No plot is drawn.

    :param data: DataFrame with numeric ``longitude``, ``latitude`` and
        ``price`` columns.
    :return: a new DataFrame without the flier rows; ``data`` itself is
        left untouched. Rows whose value is NaN are kept.
    """
    keep_mask = None
    for column in ('longitude', 'latitude', 'price'):
        values = data[column]
        lower_quartile = values.quantile(0.25)
        upper_quartile = values.quantile(0.75)
        whisker = 1.5 * (upper_quartile - lower_quartile)
        # Comparisons with NaN are False, so NaN rows are never flagged.
        is_flier = (values < lower_quartile - whisker) | (values > upper_quartile + whisker)
        keep_mask = ~is_flier if keep_mask is None else keep_mask & ~is_flier
    return data[keep_mask]

# train_data = remove_filers_with_boxplot(train_data)
# print(train_data.shape)

# Remove outliers per category.
def use_pivot_box_to_remove_fliers(data,pivot_columns_list,pivot_value_list):
    """Remove box-plot fliers computed separately for each category.

    For every (category column, value column) pair the frame is pivoted so
    that each category becomes its own column, a box plot is drawn, and the
    flier values of every box are removed from ``data``.

    :param data: DataFrame to filter; it is rebound, not modified in place.
    :param pivot_columns_list: names of the categorical columns to group by.
    :param pivot_value_list: names of the numeric columns to test.
    :return: the filtered DataFrame.

    NOTE(review): a flier found in one category removes every row with that
    same value, whatever its category, because the filter below compares the
    value column only.
    """
    for column in pivot_columns_list:
        for value in pivot_value_list:
            # Build the per-category frame (one column per category).
            # `data` has already been filtered by earlier iterations, so the
            # processing order of the two lists affects the result.
            new_data = data.pivot(columns=column,values=value)
            p = new_data.boxplot(return_type='dict')
            # Assumes p['fliers'] is ordered like new_data.columns - TODO confirm.
            for index,value_new in enumerate(new_data.columns):
                # Flier values of this category's box
                fliers_value_list = p['fliers'][index].get_ydata()
                # Drop every row whose value equals one of the fliers
                for flier in fliers_value_list:
                    data = data[data.loc[:, value] != flier]
    return data

# Treat buildingTypeId as a categorical (string) column in the training frame.
train_data['buildingTypeId'] = train_data['buildingTypeId'].astype(str)

print(train_data.dtypes)
print(train_data['buildingTypeId'].value_counts())
print(test_data['buildingTypeId'].value_counts())
# Same for bedrooms.
train_data['bedrooms'] = train_data['bedrooms'].astype(str)
print(train_data['bedrooms'].value_counts())
print(test_data['bedrooms'].value_counts())

# Keep the filtered frame in train_data so that the shape printed below and
# every later step actually see the per-category outlier removal.
train_data = use_pivot_box_to_remove_fliers(train_data,['buildingTypeId','bedrooms'],['longitude','latitude','price','daysOnMarket'])
print(train_data.shape)

# train_data.to_csv('./month_6_train_1.csv',index=False)
# test_data.to_csv('./test_data_1.csv',index=False)
train_data = train_data.dropna()

# train_data['longitude'] = abs(train_data['longitude'])
# train_data['longitude'] = np.log1p(np.log1p(train_data['longitude']))

2:对bedrooms和bathroomtotal的处理:

# Excerpt of a function body (its `def` line and the initialisation of
# `bedrooms_list` are not part of this snippet; the same statements appear
# inside preprocess_data further down). Converts the bedrooms and
# bathroomTotal columns to integers.
for bedrooms in data["bedrooms"]:
    # print(bedrooms)
    if isinstance(bedrooms,float):
        # Floats are truncated to int.
        bedrooms_list.append(int(bedrooms))
    else:
        # NOTE(review): eval() executes whatever expression the cell holds and
        # only accepts strings - unsafe on untrusted data; presumably used for
        # values written as sums, verify against the data.
        bedrooms_list.append(int(eval(bedrooms)))
data["bedrooms"] = bedrooms_list
bathroom_total_list = []
for bathroom_total in data["bathroomTotal"]:
    bathroom_total_list.append(int(bathroom_total))
data["bathroomTotal"] = bathroom_total_list
return data

3:把类别型的数据拆分开来进行预测:(这种方式的结果是合并之后结果一样,也就是说,一些算法内部就是把不同类别

分开了进行预测的,预测的具体数值是通过数值型数据进行计算的)。但也有一些算法(例如深度学习)内部并不是这样处理的;

还有就是减少类别特征值的个数:

# -*- coding:utf-8 _*-  
""" 
@author:Administrator
@file: devided_category_to_predict.py
@time: 2018/9/25
"""
# -*- coding:utf-8 _*-
""" 
@author:Administrator
@file: data_analysis.py
@time: 2018/9/25
"""
import pandas as pd

# Module-level loads: both CSV files are read as soon as this module runs.
# The paths are relative to the current working directory.
train_data = pd.read_csv("./input/month_567_data.csv")
test_data = pd.read_csv("./input/hose_info_201808_predict_2.csv")


def _parse_room_count(raw):
    """Convert one raw room count to ``int`` without evaluating code.

    Numbers (Python or NumPy, int or float) are truncated with ``int()``.
    Strings are split on ``+``, each part is parsed as a number and the sum
    is truncated, so ``"3"`` gives 3 and ``"3 + 1"`` gives 4.
    NOTE(review): the sum form is presumably why the data needed more than
    ``int()`` - confirm against the real column values.

    :raises ValueError: if a string part is not a number.
    """
    if isinstance(raw, str):
        return int(sum(float(part) for part in raw.split('+')))
    return int(raw)


# Preprocess the raw listing data
def preprocess_data(data):
    """Select the modelling columns, drop incomplete rows, integerise counts.

    Prints the frame shape before rows with missing values are dropped.

    :param data: raw listings DataFrame containing at least the columns
        selected below.
    :return: a new DataFrame with only those columns, no rows containing
        NaN, and ``bedrooms`` / ``bathroomTotal`` converted to integers.
        The input frame is not modified.
    :raises KeyError: if one of the selected columns is missing.
    :raises ValueError: if a bedrooms string cannot be parsed.
    """
    data = data[[
        "longitude",
        "latitude",
        "city",
        "province",
        "price",
        "tradeTypeId",
        "listingDate",
        "buildingTypeId",
        "bedrooms",
        "bathroomTotal",
        'postalCode',
        'daysOnMarket',
        'ownerShipType'
    ]]
    # data = data[data.tradeTypeId == 1]
    # data = data.drop(columns=['tradeTypeId'])
    print('data shape=%s before dropna' % (str(data.shape)))
    # Explicit copy: the column assignments below must not write through to
    # (or warn about) the caller's frame.
    data = data.dropna(axis=0).copy()
    data["bedrooms"] = [_parse_room_count(raw) for raw in data["bedrooms"]]
    data["bathroomTotal"] = [int(raw) for raw in data["bathroomTotal"]]
    return data


def date_processing(data):
    """Replace ``listingDate`` with ``year`` and ``month`` columns.

    Each date string is split on ``/`` when it contains one, otherwise on
    ``-``; the first part becomes the year (as ``int``) and the second part
    the month (kept as the original string, e.g. ``"5"`` or ``"05"``).

    :param data: DataFrame with a string ``listingDate`` column.
    :return: a new DataFrame without ``listingDate`` and with ``year`` and
        ``month`` added; the input frame is not modified.
    :raises ValueError: if a date contains neither ``/`` nor ``-``, or its
        year part is not an integer.
    """
    years = []
    months = []
    for date in data['listingDate']:
        if '/' in date:
            parts = date.split('/')
        elif '-' in date:
            parts = date.split('-')
        else:
            # Fail on the offending value instead of on a later length mismatch.
            raise ValueError('unrecognised listingDate format: %r' % (date,))
        years.append(int(parts[0]))
        months.append(parts[1])
    return data.drop(columns='listingDate').assign(year=years, month=months)


def show_value_counts(data, columns):
    """Print the value counts of each listed column, then the frame shape.

    :param data: DataFrame to inspect.
    :param columns: iterable of column names, reported in the given order.
    """
    # Lazy generator: each column is counted right before it is printed.
    per_column_counts = (data[name].value_counts() for name in columns)
    for counts in per_column_counts:
        print(counts)
    print(data.shape)


'''
就目前来讲,数据量最大的和最小的相差最多10倍情况下来取数据:
city:

'''


# Filter cities by sample count
def process_city(train_data, threshold_value):
    """Keep only rows whose city occurs more than ``threshold_value`` times.

    Prints the number of distinct cities before and after filtering.
    Occurrences are counted in a single pass instead of scanning the whole
    frame once per city.

    :param train_data: DataFrame with a ``city`` column.
    :param threshold_value: a city is kept when its row count is strictly
        greater than this number.
    :return: the filtered DataFrame (the input frame is not modified).
    """
    print('city nums before process:', len(set(train_data['city'])))
    city_counts = train_data['city'].value_counts()
    frequent_cities = city_counts[city_counts > threshold_value].index
    print('city nums after process:', len(frequent_cities))
    # Keep only the rows that satisfy the condition
    return train_data[train_data['city'].isin(frequent_cities)]


def _keep_frequent_categories(data, column, threshold_value, label):
    """Keep rows whose ``column`` value occurs more than ``threshold_value`` times.

    Prints the number of distinct values before and after filtering,
    prefixed with ``label``. Occurrences are counted in a single pass.

    :return: the filtered DataFrame (``data`` is not modified).
    """
    print(label + ' nums before process:', len(set(data[column])))
    value_counts = data[column].value_counts()
    frequent_values = value_counts[value_counts > threshold_value].index
    print(label + ' nums after process:', len(frequent_values))
    return data[data[column].isin(frequent_values)]


# Filter postalCode
def get_category_class_bigger_than_threshold_value_postalcode(data, column, threshold_value):
    """Drop rows whose ``column`` value has ``threshold_value`` or fewer rows.

    ``column`` is used for both counting and filtering; the printed label
    is always ``postalCode``.
    """
    return _keep_frequent_categories(data, column, threshold_value, 'postalCode')


# Filter province
def get_category_class_bigger_than_threshold_value_province(data, column, threshold_value):
    """Drop rows whose ``column`` value has ``threshold_value`` or fewer rows.

    ``column`` is used for both counting and filtering; the printed label
    is always ``province``.
    """
    return _keep_frequent_categories(data, column, threshold_value, 'province')


# Filter buildingTypeId
def get_category_class_bigger_than_threshold_value_buildingTypeId(data, column, threshold_value):
    """Drop rows whose ``column`` value has ``threshold_value`` or fewer rows.

    ``column`` is used for both counting and filtering; the printed label
    is always ``buildingTypeId``.
    """
    return _keep_frequent_categories(data, column, threshold_value, 'buildingTypeId')


# Filter ownerShipType
def get_category_class_bigger_than_threshold_value_ownerShipType(data, column, threshold_value):
    """Drop rows whose ``column`` value has ``threshold_value`` or fewer rows.

    ``column`` is used for both counting and filtering; the printed label
    is always ``ownerShipType``.
    """
    return _keep_frequent_categories(data, column, threshold_value, 'ownerShipType')


# Filter bedrooms
def get_category_class_bigger_than_threshold_value_bedrooms(data, column, threshold_value):
    """Drop rows whose ``column`` value has ``threshold_value`` or fewer rows.

    ``column`` is used for both counting and filtering; the printed label
    is always ``bedrooms``.
    """
    return _keep_frequent_categories(data, column, threshold_value, 'bedrooms')


# Filter bathroomTotal
def get_category_class_bigger_than_threshold_value_bathroomTotal(data, column, threshold_value):
    """Drop rows whose ``column`` value has ``threshold_value`` or fewer rows.

    ``column`` is used for both counting and filtering; the printed label
    is always ``bathroomTotal``.
    """
    return _keep_frequent_categories(data, column, threshold_value, 'bathroomTotal')


if __name__ == '__main__':
    # Clean both frames first, then expand listingDate into year/month.
    train_data, test_data = map(preprocess_data, (train_data, test_data))
    train_data, test_data = map(date_processing, (train_data, test_data))

    # Columns treated as categorical when reporting value counts.
    category_variable = ['province', 'city',
                         'tradeTypeId', 'buildingTypeId',
                         'bedrooms', 'bathroomTotal',
                         'postalCode',
                         'ownerShipType',
                         'year', 'month',
                         # 'daysOnMarket'
                         ]

    # (filter function, column, minimum row count), applied in this order
    # to the training frame after the city filter.
    rare_value_filters = (
        (get_category_class_bigger_than_threshold_value_postalcode, 'postalCode', 10),
        (get_category_class_bigger_than_threshold_value_province, 'province', 100),
        (get_category_class_bigger_than_threshold_value_buildingTypeId, 'buildingTypeId', 100),
        (get_category_class_bigger_than_threshold_value_ownerShipType, 'ownerShipType', 100),
        (get_category_class_bigger_than_threshold_value_bedrooms, 'bedrooms', 100),
        (get_category_class_bigger_than_threshold_value_bathroomTotal, 'bathroomTotal', 100),
    )

    # Drop rare cities, then rare values of every other categorical column.
    train_data = process_city(train_data, 100)
    for drop_rare_values, column_name, min_rows in rare_value_filters:
        train_data = drop_rare_values(train_data, column_name, min_rows)

    show_value_counts(train_data, category_variable)




3:在tensorflow的dnn算法中,将latitude和longitude特征转化成网格数据,tensorflow中自带有一个feature_column(特征工程的工具):具体方法看:https://tensorflow.google.cn/api_docs/python/tf/feature_column

4:获取坐标的异常值;

def get_outlier(x, y, init_point_count, distance, least_point_count):
    """Return the coordinates of points that have too few close neighbours.

    For every point the number of points within ``distance`` (Euclidean)
    is counted, starting from ``init_point_count``. The point itself is
    included in the comparison. A point is reported, and printed, when its
    count is below ``least_point_count + 1``.

    The starting count is applied to every point alike.

    :param x: indexable sequence of x coordinates.
    :param y: indexable sequence of y coordinates, same length as ``x``.
    :param init_point_count: starting value of each point's neighbour count.
    :param distance: radius within which another point counts as a neighbour.
    :param least_point_count: minimum number of neighbours, beyond the
        point itself, for a point not to be reported.
    :return: ``(x_outliers, y_outliers)`` as two parallel lists.
    """
    x_outliers_list = []
    y_outliers_list = []
    point_total = len(x)
    # Index-based access is kept so x and y are read exactly as callers pass them.
    for i in range(point_total):
        neighbour_count = init_point_count
        for j in range(point_total):
            d = np.sqrt(np.square(x[i] - x[j]) + np.square(y[i] - y[j]))
            if d <= distance:
                neighbour_count += 1
        # The j == i comparison has distance 0, hence the "+ 1" on the threshold.
        if neighbour_count < least_point_count + 1:
            x_outliers_list.append(x[i])
            y_outliers_list.append(y[i])
            print(x[i], y[i])
    return x_outliers_list, y_outliers_list


# Example call: start each count at 0, use a radius of 10 and require at least
# one neighbour. x and y are rebound to the outlier coordinates, so the
# original coordinate sequences are no longer reachable through these names.
x, y = get_outlier(x,y,0,10,1)
print(x,y)

-----------------特征之间的关联热度图:(只能计算数值型的)

# corrmat = data.corr()

扫描二维码关注公众号,回复: 4501918 查看本文章
# cols = corrmat.nlargest(k, 'daysOnMarket')['daysOnMarket'].index
# cm = np.corrcoef(data[cols].values.T)
# sns.set(font_scale=1.25)
# hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10}, yticklabels=cols.values, xticklabels=cols.values)
# plt.show()

# 标签编码

import pandas as pd
def label_encode(data):
    """Label-encode every object-dtype column of ``data`` in place.

    Each such column is replaced by the 1-based rank of its value among the
    column's sorted unique values, stored as a string. Columns of any other
    dtype are left as they are.

    :param data: DataFrame to encode; it is modified in place.
    :return: the same DataFrame object.
    """
    for name in data.columns:
        is_object_column = data[name].dtypes == 'object'
        if not is_object_column:
            continue
        codes, _uniques = pd.factorize(data[name].values, sort=True)
        # factorize() codes are 0-based; shift them so the labels start at 1.
        data[name] = codes + 1
        data[name] = data[name].astype('str')
    return data

还有一种方式是直接使用 sklearn.preprocessing 下的 LabelEncoder

 

猜你喜欢

转载自blog.csdn.net/weixin_38859557/article/details/84957679