Pandas与sklearn结合实例

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Build a 20-row demo frame of uniform random samples; the second and third
# columns are scaled by 0.9 and 1.1 respectively.
n_samples = 20
cond_1 = np.random.rand(n_samples)
cond_2 = np.random.rand(n_samples) * 0.9
cond_3 = np.random.rand(n_samples) * 1.1
df = pd.DataFrame({
    'Condition 1': cond_1,
    'Condition 2': cond_2,
    'Condition 3': cond_3,
})
df  # bare expression: displays the frame when run as a notebook cell
  Condition 1 Condition 2 Condition 3
0 0.150388 0.319698 0.307660
1 0.969830 0.613011 0.695216
2 0.660890 0.552131 0.229432
3 0.574232 0.679883 0.738781
4 0.527174 0.578460 0.981132
5 0.952754 0.388025 0.935823
6 0.077330 0.331501 0.663525
7 0.288425 0.755113 0.829731
8 0.398153 0.668251 0.674626
9 0.687752 0.540433 0.971847
10 0.470583 0.352360 0.249517
11 0.643588 0.240827 0.640346
12 0.278763 0.012188 0.506313
13 0.486791 0.538330 0.005713
14 0.661333 0.101712 0.868087
15 0.420160 0.640365 0.388247
16 0.932169 0.580433 0.594378
17 0.956558 0.878580 0.458417
18 0.637018 0.058973 0.338527
19 0.950942 0.647577 0.687604
# Grouped bar chart of the three condition columns on a 10x8 inch figure.
figure_size = (10, 8)
fig, ax = plt.subplots(figsize=figure_size)
# stacked=False draws the columns side by side instead of stacking them.
df.plot(kind='bar', ax=ax, stacked=False)
<matplotlib.axes._subplots.AxesSubplot at 0xa263898>

from matplotlib.ticker import FuncFormatter

# Divide every row by its own sum so each row adds up to 1, then draw the
# shares as stacked bars with the y axis labelled in percent.
row_totals = df.sum(axis=1)
df_ratio = df.div(row_totals, axis=0)
fig, ax = plt.subplots()
df_ratio.plot(kind='bar', ax=ax, stacked=True)


def _as_percent(value, _pos):
    """Format a tick value given as a fraction as a whole-number percentage."""
    return '{:.0%}'.format(value)


ax.yaxis.set_major_formatter(FuncFormatter(_as_percent))

# Download the cervical-cancer risk-factor CSV from the UCI archive.
# Cells containing "?" are parsed as NaN.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00383/risk_factors_cervical_cancer.csv'
df = pd.read_csv(url, na_values=['?'])
df.head(n=5)
  Age Number of sexual partners First sexual intercourse Num of pregnancies Smokes Smokes (years) Smokes (packs/year) Hormonal Contraceptives Hormonal Contraceptives (years) IUD ... STDs: Time since first diagnosis STDs: Time since last diagnosis Dx:Cancer Dx:CIN Dx:HPV Dx Hinselmann Schiller Citology Biopsy
0 18 4.0 15.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 ... NaN NaN 0 0 0 0 0 0 0 0
1 15 1.0 14.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 ... NaN NaN 0 0 0 0 0 0 0 0
2 34 1.0 NaN 1.0 0.0 0.0 0.0 0.0 0.0 0.0 ... NaN NaN 0 0 0 0 0 0 0 0
3 52 5.0 16.0 4.0 1.0 37.0 37.0 1.0 3.0 0.0 ... NaN NaN 1 0 1 0 0 0 0 0
4 46 3.0 21.0 4.0 0.0 0.0 0.0 1.0 15.0 0.0 ... NaN NaN 0 0 0 0 0 0 0 0

5 rows × 36 columns

from sklearn.impute import SimpleImputer

# Fill missing values (NaN) with the mean of their column.
# sklearn.preprocessing.Imputer was deprecated in scikit-learn 0.20 and removed
# in 0.22; SimpleImputer with strategy='mean' is its replacement.
imputer = SimpleImputer(strategy='mean')
# fit_transform returns a bare NumPy array, so restore the original column
# labels and row index when wrapping it back into a DataFrame.
impute = pd.DataFrame(imputer.fit_transform(df),
                      columns=df.columns,
                      index=df.index)

impute.head()
  Age Number of sexual partners First sexual intercourse Num of pregnancies Smokes Smokes (years) Smokes (packs/year) Hormonal Contraceptives Hormonal Contraceptives (years) IUD ... STDs: Time since first diagnosis STDs: Time since last diagnosis Dx:Cancer Dx:CIN Dx:HPV Dx Hinselmann Schiller Citology Biopsy
0 18.0 4.0 15.0000 1.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 6.140845 5.816901 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 15.0 1.0 14.0000 1.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 6.140845 5.816901 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 34.0 1.0 16.9953 1.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 6.140845 5.816901 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 52.0 5.0 16.0000 4.0 1.0 37.0 37.0 1.0 3.0 0.0 ... 6.140845 5.816901 1.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0
4 46.0 3.0 21.0000 4.0 0.0 0.0 0.0 1.0 15.0 0.0 ... 6.140845 5.816901 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 36 columns

%matplotlib notebook
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D
# Separate the 'Dx:Cancer' column (used to colour the points) from the
# remaining columns, which are the PCA input.
features = impute.drop('Dx:Cancer', axis=1)
y = impute["Dx:Cancer"]

# Project the features onto their first three principal components.
pca = PCA(n_components=3)
X_r = pca.fit_transform(features)

# n_components=3, so explained_variance_ratio_ holds exactly three values.
print("Explained variance:\nPC1 {:.2%}\nPC2 {:.2%}\nPC3 {:.2%}"
      .format(*pca.explained_variance_ratio_))

fig = plt.figure()
# Create the 3D axes through the figure. Calling Axes3D(fig) directly no
# longer attaches the axes to the figure on current Matplotlib releases,
# which leaves the plot empty.
ax = fig.add_subplot(111, projection='3d')

# One point per row, coloured by the 'Dx:Cancer' value.
ax.scatter(X_r[:, 0], X_r[:, 1], X_r[:, 2], c=y, cmap=plt.cm.coolwarm)

# Label the axes
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
ax.set_zlabel('PC3')

猜你喜欢

转载自blog.csdn.net/xzy53719/article/details/82835115
今日推荐