sklearn Data Preprocessing

scikit-learn is a simple, easy-to-use third-party machine learning library; it needs to be installed before use:

pip install scikit-learn  # install ahead of time; the package is fairly large

Import the required packages:

#encoding=utf-8
from sklearn.cluster import KMeans
from sklearn.datasets import load_digits,fetch_20newsgroups
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA
import jieba
import numpy as np
# Feature extraction
# Import the vectorizer (already imported above):
# from sklearn.feature_extraction.text import CountVectorizer
# Instantiate CountVectorizer()
vector = CountVectorizer()
# Call fit_transform to fit the vocabulary and transform the data
# res = vector.fit_transform(['life_is_short,i_like_python','life_is'])
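
A runnable sketch of the commented-out call above (the sample sentences are assumptions, with the underscores replaced by spaces so the tokenizer can split the words):

res = vector.fit_transform(['life is short, i like python', 'life is long'])
print(vector.get_feature_names_out())  # the learned vocabulary
print(res.toarray())                   # word counts per document, one row per sentence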

Dictionary vectorization

def dictvec():
    # Dictionary vectorization (one-hot encoding):
    # as many binary columns as there are distinct states;
    # 1 means that label is present in the sample, 0 means it is not
    dv = DictVectorizer(sparse=False)
    data = dv.fit_transform([
        {'city': '北京', 'pos': '北方', 'temperature': 100},
        {'city': '上海', 'pos': '东方', 'temperature': 60},
        {'city': '深圳', 'pos': '西方', 'temperature': 30},
        {'city': '重庆', 'pos': '南方', 'temperature': 70},
    ])
    # First print the feature names to see which label each column encodes
    names = dv.get_feature_names_out()
    print(names)
    print(data)
    print(names[0])
    print(names[1])
    print(names[2])
    print(names[3])
    # Then map the encoded rows back to dictionaries
    print(dv.inverse_transform(data)[0])

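Note that DictVectorizer returns a scipy sparse matrix by default (sparse=True); a minimal sketch, with the data abridged from the example above:

dv = DictVectorizer()              # sparse=True is the default
X = dv.fit_transform([{'city': '北京', 'temperature': 100},
                      {'city': '上海', 'temperature': 60}])
print(dv.get_feature_names_out())  # ['city=上海' 'city=北京' 'temperature']
print(X.toarray())                 # dense view of the sparse one-hot matrix
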
jieba word segmentation and the built-in functions

def countvec():
    # Record how many times each word occurs in each document,
    # at the column assigned to that word in the vocabulary
    cv = CountVectorizer()
    data = cv.fit_transform(['this is a test test test', "we have have a test"])
    print(cv.get_feature_names_out())
    print(data.toarray())
    return None
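
Note that 'a' never shows up in the vocabulary above: CountVectorizer's default token_pattern, r'(?u)\b\w\w+\b', only keeps tokens of two or more characters. A sketch that keeps single-character tokens as well:

cv = CountVectorizer(token_pattern=r'(?u)\b\w+\b')  # keep 1-character tokens too
data = cv.fit_transform(['this is a test test test', 'we have have a test'])
print(cv.get_feature_names_out())  # now includes 'a'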




def cutword():
    con1 = jieba.cut("床前明月光,我要学python.")
    con2 = jieba.cut("床前明月光,疑是地上霜.")
    con3 = jieba.cut("生存或死亡,这是一个问题")
    # jieba.cut returns a generator; materialize it as a list
    content1 = list(con1)
    content2 = list(con2)
    content3 = list(con3)
    print(content1)
    # Join the tokens back into one space-separated string per document
    c1 = ' '.join(content1)
    c2 = ' '.join(content2)
    c3 = ' '.join(content3)
    print(c1)
    return c1, c2, c3
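
jieba also provides lcut, which returns a list directly, so the generator-to-list conversion above can be skipped; an equivalent one-liner:

c1 = ' '.join(jieba.lcut("床前明月光,我要学python."))  # lcut returns a list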

def hanzivec():
    '''
    Feature extraction for Chinese text
    :return: None
    '''
    c1, c2, c3 = cutword()
    cv = CountVectorizer()
    print(c1, c2, c3)
    data = cv.fit_transform([c1, c2, c3])
    for f_name in cv.get_feature_names_out():
        print(f_name)

    print(data.toarray())
    return None
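
TfidfVectorizer is imported at the top but never used above. A minimal sketch of a TF-IDF variant of hanzivec (an assumption, not part of the original post): TF-IDF replaces raw counts with weights that discount words appearing in many documents:

def tfidfvec():
    # TF-IDF: term frequency, down-weighted by document frequency
    c1, c2, c3 = cutword()
    tf = TfidfVectorizer()
    data = tf.fit_transform([c1, c2, c3])
    print(tf.get_feature_names_out())
    print(data.toarray())  # weights instead of raw counts
    return None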

Min-max normalization

def mm():
    """
    Min-max normalization
    :return: None

    Maps values into a target interval [x, y].
    For a list lt with range lt.max - lt.min, each value maps to
        m = x + (y - x) * (lt[i] - lt.min) / (lt.max - lt.min)
    Example: mapping [90, 60, 75] into (3, 5):
        data range    a = 90 - 60 = 30
        target width  b = 5 - 3 = 2
        90 -> 3 + 2 * (90 - 60) / a = 5
        60 -> 3
        75 -> 3 + 2 * (75 - 60) / a = 4
    """
    # Scale each column into the range (3, 5)
    mm = MinMaxScaler(feature_range=(3, 5))
    data = mm.fit_transform([[90, 2, 10, 40], [60, 4, 15, 45], [75, 3, 13, 46]])
    print(data)
    return None

mm()
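
The docstring's formula can be verified by hand with numpy (np is imported at the top); this sketch should reproduce mm()'s output:

X = np.array([[90, 2, 10, 40], [60, 4, 15, 45], [75, 3, 13, 46]], dtype=float)
x_min, x_max = X.min(axis=0), X.max(axis=0)
print(3 + (5 - 3) * (X - x_min) / (x_max - x_min))  # first column: 5.0, 3.0, 4.0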

def stand():
    # Standardization: rescale each column to zero mean and unit variance
    std = StandardScaler()
    data = std.fit_transform([[1, -1, 3], [2, 4, 2], [4, 6, -1]])
    print(data)

stand()
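
StandardScaler computes (x - mean) / std for each column, using the population standard deviation (ddof=0, which is also numpy's default); a quick hand check:

X = np.array([[1, -1, 3], [2, 4, 2], [4, 6, -1]], dtype=float)
print((X - X.mean(axis=0)) / X.std(axis=0))  # matches stand(): each column has mean 0, variance 1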


Reposted from blog.csdn.net/qq_43004728/article/details/83995743