3delta法处理异常值

class deals_abnormal_by_3delta():
    """Detect and treat outliers column by column using quartile-based fences.

    NOTE: despite the "3delta" name, the bounds are not mean +/- 3 * std.
    For every column they are computed from the quartiles::

        val_low  = Q1 - scale * (Q3 - Q1)
        val_high = Q3 + scale * (Q3 - Q1)

    A value is abnormal when it is strictly below ``val_low`` or strictly
    above ``val_high``.

    Attributes:
        data: Copy of the input made with ``DataFrame.copy()``; the caller's
            frame is never modified.
        scale: Multiplier applied to the inter-quartile range (default 3).
        val_low: Per-column lower bound (indexed by column name).
        val_high: Per-column upper bound (indexed by column name).
        operate_columns: List of the column names of ``data``.
        high_index: Dict mapping column name -> index labels of rows above
            the upper bound. Columns without such rows are absent.
        low_index: Dict mapping column name -> index labels of rows below
            the lower bound. Columns without such rows are absent.
        all_abnormal_index: De-duplicated list of every index label flagged
            in any column, in first-seen order.

    Methods:
        remove_all_abnormal: Drop every row flagged in any column.
        remove_specific_column: Drop the rows flagged in one given column.
        replace_all_abnormal_with_boundary: Replace flagged values with the
            bound they violated.
        replace_all_abnormal_with_nan: Replace flagged values with NaN.
    """

    def __init__(self, data, scale=3):
        """Compute the bounds and record which rows violate them.

        Args:
            data: DataFrame to inspect. ``DataFrame.quantile`` is applied to
                the whole frame, so it should hold only the columns to be
                checked.
            scale: Multiplier of the inter-quartile range used for the
                bounds. Defaults to 3.
        """
        self.data = data.copy()
        self.scale = scale

        lower_quartile = self.data.quantile(0.25)
        upper_quartile = self.data.quantile(0.75)
        margin = (upper_quartile - lower_quartile) * self.scale
        self.val_low = lower_quartile - margin
        self.val_high = upper_quartile + margin
        self.operate_columns = list(self.data.columns)

        self.high_index = {}
        self.low_index = {}
        for col in self.operate_columns:
            values = self.data[col]
            above = values[values > self.val_high[col]].index
            below = values[values < self.val_low[col]].index
            # Only columns that actually have violations get an entry.
            if len(above) > 0:
                self.high_index[col] = above
            if len(below) > 0:
                self.low_index[col] = below

        # dict keys de-duplicate while keeping a deterministic order.
        flagged = {}
        for per_column in (self.high_index, self.low_index):
            for labels in per_column.values():
                flagged.update(dict.fromkeys(labels))
        self.all_abnormal_index = list(flagged)

    def _replace_abnormal(self, high_value, low_value):
        """Return a copy of ``data`` with flagged cells replaced.

        ``high_value`` / ``low_value`` are callables taking a column name and
        returning the replacement for cells above / below the bounds.
        """
        result = self.data.copy()
        sides = ((self.high_index, high_value), (self.low_index, low_value))
        for col in self.operate_columns:
            for per_column, replacement in sides:
                labels = per_column.get(col)
                if labels is None:
                    continue
                # Whole-column mask instead of chained ``result[col][label] = v``:
                # chained assignment warns on older pandas and does not write
                # through under Copy-on-Write.
                hit = result.index.isin(labels)
                result[col] = result[col].mask(hit, replacement(col))
        return result

    def remove_all_abnormal(self):
        """Return a new DataFrame without any row flagged in any column."""
        return self.data.drop(self.all_abnormal_index, inplace=False)

    def remove_specific_column(self, col):
        """Return a new DataFrame without the rows flagged in column ``col``.

        Args:
            col: Name of one of the inspected columns.

        Returns:
            A copy of ``data`` with the rows whose ``col`` value is out of
            bounds removed. If ``col`` has no flagged rows the copy is
            complete.

        Raises:
            KeyError: If ``col`` is not one of ``operate_columns``.
        """
        if col not in self.operate_columns:
            raise KeyError(col)
        # A column may be missing from either dict (no violation on that
        # side), so look both up defensively and drop the union in one go.
        labels = dict.fromkeys(self.high_index.get(col, ()))
        labels.update(dict.fromkeys(self.low_index.get(col, ())))
        if not labels:
            return self.data.copy()
        return self.data.drop(list(labels), inplace=False)

    def replace_all_abnormal_with_boundary(self):
        """Return a copy where each flagged value is set to the violated bound.

        Values above ``val_high`` become ``val_high``; values below
        ``val_low`` become ``val_low``. A column's dtype may be upcast when
        the bound cannot be represented in the original dtype.
        """
        return self._replace_abnormal(
            lambda col: self.val_high[col],
            lambda col: self.val_low[col],
        )

    def replace_all_abnormal_with_nan(self):
        """Return a copy where each flagged value is replaced by NaN."""
        nan = float("nan")
        return self._replace_abnormal(lambda col: nan, lambda col: nan)

猜你喜欢

转载自blog.csdn.net/weixin_44414593/article/details/107298501
今日推荐