Feature Extraction Code Summary
import jieba
from sklearn.datasets import load_iris
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
def datasets_demo():
    """
    Demonstrate sklearn dataset usage.
    Dataset splitting:
    A machine learning dataset is usually split into two parts:
    - training data: used to fit (build) the model
    - test data: used during validation to evaluate whether the model is usable
    :return:
    """
    iris = load_iris()
    print("Iris dataset:\n", iris)
    print("Dataset description:\n", iris["DESCR"])
    print("Feature names:\n", iris.feature_names)
    print("Feature values:\n", iris.data, iris.data.shape)
    print("Feature matrix shape:\n", iris.data.shape)
    # Split into training and test sets (20% held out for testing)
    x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=22)
    print("Training set features:\n", x_train, x_train.shape)
    return None
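# For the 150-sample iris dataset, test_size=0.2 keeps 120 samples for training
# and holds out 30 for testing; random_state=22 makes the split reproducible.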
def dict_demo():
    """
    Dictionary feature extraction (one-hot encoding of categorical fields).
    :return:
    """
    data = [{'city': '北京', 'temperature': 100},
            {'city': '上海', 'temperature': 60},
            {'city': '深圳', 'temperature': 30}]
    # sparse=False returns a dense ndarray instead of a scipy sparse matrix
    transfer = DictVectorizer(sparse=False)
    data_new = transfer.fit_transform(data)
    print("data_new:\n", data_new)
    # get_feature_names() was removed in newer scikit-learn; use get_feature_names_out()
    print("Feature names:\n", transfer.get_feature_names_out())
    return None
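# With this input the result is roughly the following (column order follows the
# sorted feature names 'city=上海', 'city=北京', 'city=深圳', 'temperature'):
#   [[  0.   1.   0. 100.]
#    [  1.   0.   0.  60.]
#    [  0.   0.   1.  30.]]
# Each 'city' value becomes its own one-hot column, while the numeric
# 'temperature' column is passed through unchanged.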
def count_demo():
    """
    Text feature extraction with word counts.
    :return:
    """
    data = ["life is short, i like python", "life is too long i dislike python"]
    transfer = CountVectorizer()
    data_new = transfer.fit_transform(data)
    print("data_new:\n", data_new.toarray())
    print("Feature names:\n", transfer.get_feature_names_out())
    return None
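# CountVectorizer builds a sorted vocabulary and counts how often each word
# appears in every document. Its default token pattern only keeps words of two
# or more characters, so the single letter "i" is dropped from the vocabulary.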
def count_chinese_demo():
    """
    Chinese text feature extraction (text pre-segmented by hand).
    :return:
    """
    data = ["我 爱 北京 天安门", "天安门 上 太阳 升"]
    # Without spaces, CountVectorizer cannot split Chinese words, so each sentence
    # in data2 would be treated as a single token; data2 is not used here.
    data2 = ["我爱北京天安门", "天安门上太阳升"]
    transfer = CountVectorizer()
    data_new = transfer.fit_transform(data)
    print("data_new:\n", data_new.toarray())
    print("Feature names:\n", transfer.get_feature_names_out())
    return None
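# With the pre-segmented input, the expected vocabulary is roughly
# ['北京', '天安门', '太阳'] (single-character tokens such as '我', '爱', '上', '升'
# are dropped by the default token pattern), giving counts [[1, 1, 0], [0, 1, 1]].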
def count_word(text):
    """
    Segment Chinese text with jieba, e.g. "我爱北京天安门" -> "我 爱 北京 天安门".
    :param text:
    :return:
    """
    a = " ".join(jieba.cut(text))  # jieba.cut returns a generator; join consumes it directly
    print(a)
    return a
def count_chinese_demo2():
    """
    Chinese text feature extraction with automatic jieba segmentation.
    :return:
    """
    data = ["在过去两个月里,我和60多位小伙伴进行了1对1的一小时沟通;",
            "TA绝大多数是想要尝试副业变现的朋友。",
            "从一线城市到三线城市,从宝妈到职场人,从职场到体制内。"]
    # Note: stop words must match the tokens produced by jieba; a phrase such as
    # "从宝妈" is only filtered out if jieba actually emits it as a single token.
    transfer = CountVectorizer(stop_words=["从宝妈"])
    data_new = transfer.fit_transform(count_word(item) for item in data)
    print("data_new:\n", data_new.toarray())
    print("Feature names:\n", transfer.get_feature_names_out())
    return None
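# A longer stop-word list is usually kept in a file rather than hard-coded.
# A minimal sketch, assuming a UTF-8 file named "stopwords.txt" (hypothetical,
# one word per line) sits next to this script:
#
#     with open("stopwords.txt", encoding="utf-8") as f:
#         stop_words = [line.strip() for line in f if line.strip()]
#     transfer = CountVectorizer(stop_words=stop_words)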
def tfidf_demo():
    """
    Text feature extraction with TF-IDF weighting.
    :return:
    """
    data = ["在过去两个月里,我和60多位小伙伴进行了1对1的一小时沟通;",
            "TA绝大多数是想要尝试副业变现的朋友。",
            "从一线城市到三线城市,从宝妈到职场人,从职场到体制内。"]
    transfer = TfidfVectorizer(stop_words=["从宝妈"])
    data_new = transfer.fit_transform(count_word(item) for item in data)
    print("data_new:\n", data_new.toarray())
    print("Feature names:\n", transfer.get_feature_names_out())
    return None
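# TfidfVectorizer weights each term by term frequency times inverse document
# frequency. With scikit-learn's defaults (smooth_idf=True) the idf term is
# idf(t) = ln((1 + n) / (1 + df(t))) + 1, where n is the number of documents
# and df(t) is the number of documents containing t; each row of the resulting
# matrix is then L2-normalised, so rare, discriminative words get higher weights.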
if __name__ == '__main__':
    tfidf_demo()
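    # The other demos can be run by uncommenting them:
    # datasets_demo()
    # dict_demo()
    # count_demo()
    # count_chinese_demo()
    # count_chinese_demo2()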