Python数据预处理：缺失值统计个数、判断是否、过滤与删除、还有Nan的计算、填充、插入、重复数据、异常值

关注微信号：小程在线

关注CSDN博客：程志伟的博客

import numpy as np
import pandas as pd
pd.set_option("display.max_rows",10)
pd.set_option("display.max_columns",10)

#### 1.1统计数据缺失值个数 ####
eu2012 = pd.read_csv("H:/0date/Eueo2012_na.csv")
eu2012
Out[1]:
Team Goals Shots on target Shots off target
0 Croatia 4 13 12
1 Czech Republic 4 -999 18
2 Denmark 4 10 10
3 England 5 11 18
4 France 3 22 24
.. ... ... ... ...
11 Republic of Ireland 1 7 12
12 Russia 5 9 31
13 Spain 12 42 33
14 xyz 5 17 19
15 Ukraine 2 7 26

[16 rows x 35 columns]

#统计每行缺失值个数
eu2012.isnull().sum()
Out[2]:
Team 0
Goals 0
Shots on target 0
Shots off target 0
Shooting Accuracy 0
..
Yellow Cards 0
Red Cards 0
Subs on 0
Subs off 0
Players Used 0
Length: 35, dtype: int64

#统计所有缺失值个数
eu2012.isnull().sum().sum()
Out[3]: 2

#定义xyz也为缺失值
eu2013 = pd.read_csv("H:/0date/Eueo2012_na.csv", na_values=['xyz'])
eu2013.isnull().sum().sum()
Out[4]: 3

eu2012['Team']
Out[5]:
0 Croatia
1 Czech Republic
2 Denmark
3 England
4 France

11 Republic of Ireland
12 Russia
13 Spain
14 xyz
15 Ukraine
Name: Team, Length: 16, dtype: object

#定义多个缺失值
eu2012 = pd.read_csv("H:/0date/Eueo2012_na.csv", na_values=['NA','xyz','-999'])
eu2012.isnull().sum().sum()
Out[6]: 4

#### 1.2构造一个带NaN的 data frame ####
df = pd.DataFrame(np.random.randint(0,100,15).reshape(5, 3),
index=['a', 'b', 'c', 'd', 'e'],
columns=['c1', 'c2', 'c3'])
df
Out[7]:
c1 c2 c3
a 50 76 69
b 23 64 61
c 2 61 82
d 31 77 42
e 32 87 42

df['c4']=np.nan
df.loc['f']=np.arange(10,14)
df.loc['g']=np.nan
df['c5']=np.nan
df['c4']['a']=18
df
Out[8]:
c1 c2 c3 c4 c5
a 50.0 76.0 69.0 18.0 NaN
b 23.0 64.0 61.0 NaN NaN
c 2.0 61.0 82.0 NaN NaN
d 31.0 77.0 42.0 NaN NaN
e 32.0 87.0 42.0 NaN NaN
f 10.0 11.0 12.0 13.0 NaN
g NaN NaN NaN NaN NaN

#判断是否为缺失值
df.isnull()
Out[9]:
c1 c2 c3 c4 c5
a False False False False True
b False False False True True
c False False False True True
d False False False True True
e False False False True True
f False False False False True
g True True True True True

#每列缺失值的个数
df.isnull().sum()
Out[10]:
c1 1
c2 1
c3 1
c4 5
c5 7
dtype: int64

#每行缺失值的个数
df.isnull().sum(axis=1)
Out[11]:
a 1
b 2
c 2
d 2
e 2
f 1
g 5
dtype: int64

#总缺失值的个数
df.isnull().sum().sum()
Out[12]: 15

#统计not null
df.count()
Out[13]:
c1 6
c2 6
c3 6
c4 2
c5 0
dtype: int64

#判断是否不是缺失值
df.notnull()
Out[14]:
c1 c2 c3 c4 c5
a True True True True False
b True True True False False
c True True True False False
d True True True False False
e True True True False False
f True True True True False
g False False False False False

#### 1.3 过滤或者去掉缺失值 ####
#筛选c4列不为缺失的数据
df.c4[df.c4.notnull()]
Out[15]:
a 18.0
f 13.0
Name: c4, dtype: float64

df.c4.dropna() #inplace=False
Out[16]:
a 18.0
f 13.0
Name: c4, dtype: float64

#删除有缺失值的列
df.dropna()
Out[17]:
Empty DataFrame
Columns: [c1, c2, c3, c4, c5]
Index: []

#删除全部缺失值的行
df.dropna(how='all')
Out[18]:
c1 c2 c3 c4 c5
a 50.0 76.0 69.0 18.0 NaN
b 23.0 64.0 61.0 NaN NaN
c 2.0 61.0 82.0 NaN NaN
d 31.0 77.0 42.0 NaN NaN
e 32.0 87.0 42.0 NaN NaN
f 10.0 11.0 12.0 13.0 NaN

#删除全部缺失值的列
df.dropna(how='all',axis=1)
Out[19]:
c1 c2 c3 c4
a 50.0 76.0 69.0 18.0
b 23.0 64.0 61.0 NaN
c 2.0 61.0 82.0 NaN
d 31.0 77.0 42.0 NaN
e 32.0 87.0 42.0 NaN
f 10.0 11.0 12.0 13.0
g NaN NaN NaN NaN

#构建新的数据
df2=df.copy()
df2.loc['g'].c1=0
df2.loc['g'].c3=0
df2
Out[20]:
c1 c2 c3 c4 c5
a 50.0 76.0 69.0 18.0 NaN
b 23.0 64.0 61.0 NaN NaN
c 2.0 61.0 82.0 NaN NaN
d 31.0 77.0 42.0 NaN NaN
e 32.0 87.0 42.0 NaN NaN
f 10.0 11.0 12.0 13.0 NaN
g 0.0 NaN 0.0 NaN NaN

#删除包含缺失值的列
df2.dropna(how='any',axis=1)
Out[21]:
c1 c3
a 50.0 69.0
b 23.0 61.0
c 2.0 82.0
d 31.0 42.0
e 32.0 42.0
f 10.0 12.0
g 0.0 0.0

#删除包含缺失值大于等于5的列
df2.dropna(thresh=5,axis=1)
Out[22]:
c1 c2 c3
a 50.0 76.0 69.0
b 23.0 64.0 61.0
c 2.0 61.0 82.0
d 31.0 77.0 42.0
e 32.0 87.0 42.0
f 10.0 11.0 12.0
g 0.0 NaN 0.0

#### 1.4 计算中的NaN处理 ####
a=np.array([np.nan,1,2,np.nan,3])
a
Out[23]: array([nan, 1., 2., nan, 3.])

s=pd.Series(a)
s
Out[24]:
0 NaN
1 1.0
2 2.0
3 NaN
4 3.0
dtype: float64

#有NaN的均值为NaN
a.mean(),s.mean()
Out[25]: (nan, 2.0)

#加法计算
df2.c4+1
Out[26]:
a 19.0
b NaN
c NaN
d NaN
e NaN
f 14.0
g NaN
Name: c4, dtype: float64

#累加计算
df2.c4.cumsum()
Out[27]:
a 18.0
b NaN
c NaN
d NaN
e NaN
f 31.0
g NaN
Name: c4, dtype: float64

#### 1.5 填充缺失值 ####
fill_0=df.fillna(0)
fill_0
Out[28]:
c1 c2 c3 c4 c5
a 50.0 76.0 69.0 18.0 0.0
b 23.0 64.0 61.0 0.0 0.0
c 2.0 61.0 82.0 0.0 0.0
d 31.0 77.0 42.0 0.0 0.0
e 32.0 87.0 42.0 0.0 0.0
f 10.0 11.0 12.0 13.0 0.0
g 0.0 0.0 0.0 0.0 0.0

#只填充3个缺失值
df.fillna(0,limit=3)
Out[29]:
c1 c2 c3 c4 c5
a 50.0 76.0 69.0 18.0 0.0
b 23.0 64.0 61.0 0.0 0.0
c 2.0 61.0 82.0 0.0 0.0
d 31.0 77.0 42.0 0.0 NaN
e 32.0 87.0 42.0 NaN NaN
f 10.0 11.0 12.0 13.0 NaN
g 0.0 0.0 0.0 NaN NaN

#fillna前一个值填充，bfill一个值填充
df.c4
Out[30]:
a 18.0
b NaN
c NaN
d NaN
e NaN
f 13.0
g NaN
Name: c4, dtype: float64

df.c4.fillna(method='ffill')
Out[31]:
a 18.0
b 18.0
c 18.0
d 18.0
e 18.0
f 13.0
g 13.0
Name: c4, dtype: float64

df.c4.fillna(method='bfill')
Out[32]:
a 18.0
b 13.0
c 13.0
d 13.0
e 13.0
f 13.0
g NaN
Name: c4, dtype: float64

#根据索引填充
fill_values=pd.Series([1,2],index=['b','c'])
df.c4.fillna(fill_values)
Out[33]:
a 18.0
b 1.0
c 2.0
d NaN
e NaN
f 13.0
g NaN
Name: c4, dtype: float64

#根据列均值填充
df.fillna(df.mean())
Out[34]:
c1 c2 c3 c4 c5
a 50.000000 76.000000 69.000000 18.0 NaN
b 23.000000 64.000000 61.000000 15.5 NaN
c 2.000000 61.000000 82.000000 15.5 NaN
d 31.000000 77.000000 42.000000 15.5 NaN
e 32.000000 87.000000 42.000000 15.5 NaN
f 10.000000 11.000000 12.000000 13.0 NaN
g 24.666667 62.666667 51.333333 15.5 NaN

#### 1.6 插值 ####
s=pd.Series([1,2,np.nan,5,np.nan,9])
s
Out[35]:
0 1.0
1 2.0
2 NaN
3 5.0
4 NaN
5 9.0
dtype: float64

#基于数字的均值
s.interpolate()
Out[36]:
0 1.0
1 2.0
2 3.5
3 5.0
4 7.0
5 9.0
dtype: float64

#基于时间的插入
import datetime
ts = pd.Series([1, np.nan, 2],
index=[datetime.datetime(2016, 1, 1),
datetime.datetime(2016, 2, 1),
datetime.datetime(2016, 4, 1)])
ts
Out[37]:
2016-01-01 1.0
2016-02-01 NaN
2016-04-01 2.0
dtype: float64

ts.interpolate()
Out[38]:
2016-01-01 1.0
2016-02-01 1.5
2016-04-01 2.0
dtype: float64

ts.interpolate(method='time')
Out[39]:
2016-01-01 1.000000
2016-02-01 1.340659
2016-04-01 2.000000
dtype: float64

#基于索引的插入
s = pd.Series([0, np.nan, 20], index=[0, 1, 10])
s
Out[40]:
0 0.0
1 NaN
10 20.0
dtype: float64

s.interpolate()
Out[41]:
0 0.0
1 10.0
10 20.0
dtype: float64

s.interpolate(method='values')
Out[42]:
0 0.0
1 2.0
10 20.0
dtype: float64

#### 1.7重复值 ####
data = pd.DataFrame({'a': ['x'] * 3 + ['y'] * 4,
'b': [1, 1, 2, 3, 3, 4, 4]})
data
Out[43]:
a b
0 x 1
1 x 1
2 x 2
3 y 3
4 y 3
5 y 4
6 y 4

#判断是否出现重复值
data.duplicated()
Out[44]:
0 False
1 True
2 False
3 False
4 True
5 False
6 True
dtype: bool

#删除后面重复的
data.drop_duplicates()
Out[45]:
a b
0 x 1
2 x 2
3 y 3
5 y 4

#删除前面重复的
data.drop_duplicates(keep='last')
Out[46]:
a b
1 x 1
2 x 2
4 y 3
6 y 4

data['c']=np.arange(7)
data.duplicated()
Out[47]:
0 False
1 False
2 False
3 False
4 False
5 False
6 False
dtype: bool

#删除ab两列重复的值
data.drop_duplicates(['a','b'])
Out[48]:
a b c
0 x 1 0
2 x 2 2
3 y 3 3
5 y 4 5

#### 1.8 transform ####
#map函数映射
x = pd.Series({"one": 1, "two": 2, "three": 3})
y = pd.Series({1: "a", 2: "b", 3: "c"})

x.map(y)
Out[49]:
one a
two b
three c
dtype: object

df=pd.DataFrame({'c1':['a']*3+['b']*4,
'c2':[1,1,2,3,3,4,4]})
df
Out[50]:
c1 c2
0 a 1
1 a 1
2 a 2
3 b 3
4 b 3
5 b 4
6 b 4

#将1替换成5
df.replace(1,5)
Out[51]:
c1 c2
0 a 5
1 a 5
2 a 2
3 b 3
4 b 3
5 b 4
6 b 4

#将a替换成x
df.replace('a','x')
Out[52]:
c1 c2
0 x 1
1 x 1
2 x 2
3 b 3
4 b 3
5 b 4
6 b 4

#将c2列的3替换成100
df.replace({'c2': 3}, 100)
Out[53]:
c1 c2
0 a 1
1 a 1
2 a 2
3 b 100
4 b 100
5 b 4
6 b 4

#### outlier异常值处理 ####
df=pd.DataFrame({'Data':np.random.normal(size=200)})
df.head()
Out[54]:
Data
0 0.458493
1 -0.444134
2 0.422875
3 -0.397458
4 -1.040618

#筛选数值
df[np.abs(df.Data-df.Data.mean())<=(2*df.Data.std())]
Out[55]:
Data
0 0.458493
1 -0.444134
2 0.422875
3 -0.397458
4 -1.040618
.. ...
195 0.825967
196 -0.764515
197 1.170473
198 1.220785
199 -0.175777

[189 rows x 1 columns]

mask=np.abs(df.Data-df.Data.mean())>=(2*df.Data.std())

df[mask]
Out[57]:
Data
13 -2.434275
57 -2.099274
91 -2.286433
98 1.844199
127 2.070731
.. ...
149 2.347566
151 1.983026
156 -2.051421
157 -1.962457
184 -2.110960

[11 rows x 1 columns]

#将异常值替换成均值
df[mask]=df.Data.mean()
df[mask]
Out[58]:
Data
13 -0.047164
57 -0.047164
91 -0.047164
98 -0.047164
127 -0.047164
.. ...
149 -0.047164
151 -0.047164
156 -0.047164
157 -0.047164
184 -0.047164

[11 rows x 1 columns]

程志伟

发布了89 篇原创文章 · 获赞 109 · 访问量 37万+

私信关注

Python数据预处理：缺失值统计个数、判断是否、过滤与删除、还有Nan的计算、填充、插入、重复数据、异常值

猜你喜欢