【机器学习】特征提取代码汇总

特征提取代码汇总

import jieba
from sklearn.datasets import load_iris
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split


def datasets_demo():
    """
    Demonstrate loading and splitting a scikit-learn dataset.

    Loads the iris dataset, prints the dataset object, its description,
    its feature names and its feature matrix (with shape), then splits the
    data and prints the training features.

    Dataset splitting:
        A machine-learning dataset is generally divided into two parts
            training data: used for training, i.e. building the model
            test data: used during validation to evaluate whether the
                       model is usable
    :return: None
    """
    # Load the dataset
    dataset = load_iris()
    features = dataset.data

    print("鸢尾花数据集:\n", dataset)
    print("查询数据集描述:\n", dataset["DESCR"])
    print("查询特征值的名字:\n", dataset.feature_names)
    print("查看特征值:\n", features, features.shape)
    print("查看特征值:\n", features.shape)

    # Split the dataset: 20% is held out as test data, and the fixed
    # random_state keeps the split reproducible between runs.
    train_features, _test_features, _train_labels, _test_labels = train_test_split(
        features,
        dataset.target,
        test_size=0.2,
        random_state=22,
    )
    print("训练集的特征值:\n", train_features, train_features.shape)


def dict_demo():
    """
    Dictionary feature extraction with ``DictVectorizer``.

    Typical use cases: the dataset has many categorical features (convert
    each sample's features to a dict first, then vectorize), or the samples
    already arrive as dicts.

    Prints the transformed data and the generated feature names.
    :return: None
    """
    data = [
        {'city': '北京', 'temperature': 100},
        {'city': '上海', 'temperature': 60},
        {'city': '深圳', 'temperature': 30},
    ]

    # 1. Instantiate the transformer. By default it returns a sparse matrix,
    #    which records only the non-zero values by position to save memory;
    #    sparse=False is passed here so the printed result is a dense array.
    transfer = DictVectorizer(sparse=False)

    # 2. Call fit_transform()
    data_new = transfer.fit_transform(data)
    print("data_new:\n", data_new)

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer get_feature_names_out() and fall back only on old versions.
    if hasattr(transfer, "get_feature_names_out"):
        feature_names = list(transfer.get_feature_names_out())
    else:
        feature_names = transfer.get_feature_names()
    print("特征名字:\n", feature_names)


def count_demo():
    """
    Text feature extraction (word counts) on English sentences.

    Prints the document-term count matrix and the vocabulary.
    :return: None
    """
    data = ["life is short, i like python", "life is too long i dislike python"]

    # 1. Instantiate the transformer.
    #    To demonstrate stop words, use instead:
    #    transfer = CountVectorizer(stop_words=["is", "too"])
    transfer = CountVectorizer()

    # 2. Call fit_transform(); the result is sparse, so densify for printing
    data_new = transfer.fit_transform(data)
    print("data_new:\n", data_new.toarray())

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer get_feature_names_out() and fall back only on old versions.
    if hasattr(transfer, "get_feature_names_out"):
        feature_names = list(transfer.get_feature_names_out())
    else:
        feature_names = transfer.get_feature_names()
    print("特征名字:\n", feature_names)


def count_chinese_demo():
    """
    Text feature extraction (word counts) on manually segmented Chinese text.

    Prints the document-term count matrix and the vocabulary.
    :return: None
    """
    # The sentences are pre-segmented with spaces. The unsegmented form would
    # be ["我爱北京天安门", "天安门上太阳升"]; it is not used here.
    # NOTE(review): with the default tokenizer each unsegmented sentence is
    # presumably treated as a single token - confirm against the
    # CountVectorizer token_pattern documentation.
    data = ["我 爱 北京 天安门", "天安门 上 太阳 升"]

    # 1. Instantiate the transformer
    transfer = CountVectorizer()

    # 2. Call fit_transform(); the result is sparse, so densify for printing
    data_new = transfer.fit_transform(data)
    print("data_new:\n", data_new.toarray())

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer get_feature_names_out() and fall back only on old versions.
    if hasattr(transfer, "get_feature_names_out"):
        feature_names = list(transfer.get_feature_names_out())
    else:
        feature_names = transfer.get_feature_names()
    print("特征名字:\n", feature_names)


def count_word(text):
    """
    Segment Chinese text with jieba and join the tokens with single spaces,
    e.g. 我爱北京天安门 -> 我 爱 北京 天安门.

    The segmented string is also printed.
    :param text: the text to segment
    :return: the space-separated segmented string
    """
    # str.join consumes the token iterable from jieba.cut directly
    segmented = " ".join(jieba.cut(text))
    print(segmented)
    return segmented


def count_chinese_demo2():
    """
    Text feature extraction (word counts) on Chinese text with automatic
    word segmentation via ``count_word``.

    Prints the document-term count matrix and the vocabulary.
    :return: None
    """
    data = ["在过去两个月里,我和60多位小伙伴进行了1对1的一小时沟通;",
            "TA绝大多数是想要尝试副业变现的朋友。",
            "从一线城市到三线城市,从宝妈到职场人,从职场到体制内。"]

    # 1. Instantiate the transformer.
    # NOTE(review): a stop word only has an effect if it equals a token that
    # the segmenter actually produces - verify that "从宝妈" is emitted as a
    # single token by jieba.
    transfer = CountVectorizer(stop_words=["从宝妈"])

    # 2. Call fit_transform() on the segmented sentences
    data_new = transfer.fit_transform(count_word(item) for item in data)
    print("data_new:\n", data_new.toarray())

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer get_feature_names_out() and fall back only on old versions.
    if hasattr(transfer, "get_feature_names_out"):
        feature_names = list(transfer.get_feature_names_out())
    else:
        feature_names = transfer.get_feature_names()
    print("特征名字:\n", feature_names)


def tfidf_demo():
    """
    Text feature extraction using the TF-IDF method on Chinese text that is
    segmented automatically via ``count_word``.

    Prints the TF-IDF matrix and the vocabulary.
    :return: None
    """
    data = ["在过去两个月里,我和60多位小伙伴进行了1对1的一小时沟通;",
            "TA绝大多数是想要尝试副业变现的朋友。",
            "从一线城市到三线城市,从宝妈到职场人,从职场到体制内。"]

    # 1. Instantiate the transformer
    transfer = TfidfVectorizer(stop_words=["从宝妈"])

    # 2. Call fit_transform() on the segmented sentences
    data_new = transfer.fit_transform(count_word(item) for item in data)
    print("data_new:\n", data_new.toarray())

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer get_feature_names_out() and fall back only on old versions.
    if hasattr(transfer, "get_feature_names_out"):
        feature_names = list(transfer.get_feature_names_out())
    else:
        feature_names = transfer.get_feature_names()
    print("特征名字:\n", feature_names)


if __name__ == "__main__":
    # Uncomment the demo you want to run.

    # Demo 1: dataset loading and splitting
    # datasets_demo()

    # Demo 2: dictionary feature extraction
    # dict_demo()

    # Demo 3: English text word counts
    # count_demo()

    # Demo 4: Chinese text word counts (manually segmented)
    # count_chinese_demo()

    # Demo 5: Chinese text word counts (automatic segmentation)
    # count_chinese_demo2()

    # Segmentation helper on its own
    # count_word("我爱后端码匠微信公众号")

    # Demo 6: TF-IDF text feature extraction
    tfidf_demo()

猜你喜欢

转载自blog.csdn.net/weixin_43874301/article/details/119488938