News_predict - 机器学习分类（0/1）

复习代码如下：

# -*- coding:utf8 -*-
# @TIME : 2018/5/18 上午11:10
# @Author : Allen
# @File : 5.17_news_predict.py

import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from datetime import date
import os

# Step 1: load the combined news / DJIA table from the local project folder.
print("【1】正在读入数据...")
os.chdir('/Users/a1/Desktop/learning/5.17_News_predict/')
data = pd.read_csv('Combined_News_DJIA.csv')
print(data.shape)
print("********")

# Step 1.1: split by date -- rows before 2015-01-01 train, the rest test.
# The comparison is done on the "Date" column exactly as read from the CSV.
print("【1.1】正在划分训练/测试集合...")
is_train_row = data["Date"] < '2015-01-01'
is_test_row = data["Date"] > '2014-12-31'
train = data[is_train_row]
test = data[is_test_row]
print(train.shape)
print(test.shape)
print("********")

# Every column from the third one onwards is used as headline text.
headline_columns = data.columns[2:]
X_train = train[headline_columns]

# Step 1.2: the corpus is every individual training headline, one string each.
print("【1.2】正在获取语料库flatten,corpus...")
corpus = X_train.values.astype(str).flatten()
print(corpus[1])

# Step 1.3: join all headline columns of a row into one space-separated string.
print("【1.3】正在把25条新闻合并中train/test...")
X_train = np.array([' '.join(row) for row in X_train.values.astype(str)])
X_test = np.array([' '.join(row) for row in test[headline_columns].values.astype(str)])

y_train = train['Label'].values
y_test = test['Label'].values

print(corpus[:3])
print(X_train[:1])

# Step 1.4: tokenise the corpus and both merged feature sets with NLTK.
print("【1.4】正在tokenize合并后对语料库...")
from nltk.tokenize import word_tokenize
corpus = list(map(word_tokenize, corpus))
X_train = list(map(word_tokenize, X_train))
X_test = list(map(word_tokenize, X_test))

print("#################")
print(X_train[:2])
print(corpus[:2])
print("#################")



# Before the sentences are processed further, stop words, digits and symbols
# are removed and the remaining words are lemmatised.
from nltk.corpus import stopwords
# Kept as a set: check() tests membership once per token, and a set lookup is
# O(1) whereas scanning the original list is O(len(stop)).
stop = set(stopwords.words('english'))

#数字 & 特殊字符
import re
def hasNumber(inputString):
    """Return True when ``inputString`` contains at least one digit."""
    digit_match = re.search(r'\d', inputString)
    return digit_match is not None
def isSymbol(inputString):
    """Return True when ``inputString`` starts with a non-word character.

    Only the first character is inspected, because ``re.match`` anchors the
    pattern at the beginning of the string. An empty string yields False.
    """
    # Raw string: the previous '[^\w]' literal relied on an invalid string
    # escape sequence, which newer Python versions warn about.
    return bool(re.match(r'[^\w]', inputString))

from nltk.stem import WordNetLemmatizer
# Single shared lemmatizer instance; preprocessing() calls its lemmatize()
# method on every token that survives filtering.
wordnet_lemmatizer = WordNetLemmatizer()

def check(word):
    """Decide whether a token should be kept.

    The token is lower-cased first. It is rejected (False) when it is found
    in the module-level ``stop`` collection, when it contains a digit, or when
    it begins with a non-word character; otherwise it is accepted (True).
    """
    lowered = word.lower()
    rejected = lowered in stop or hasNumber(lowered) or isSymbol(lowered)
    return not rejected

def preprocessing(sen):
    """Clean, filter and lemmatise the tokens of one sentence.

    Each token is lower-cased and stripped of the ``b'`` / ``b"`` prefixes and
    quote characters left over from the bytes-literal formatting of the raw
    headlines. The cleaned token is then filtered with ``check`` and, if kept,
    lemmatised.

    Args:
        sen: iterable of string tokens.

    Returns:
        List of lemmatised tokens, in their original order.
    """
    res = []
    for word in sen:
        # Clean BEFORE filtering. Filtering the raw token let stop words
        # hidden behind an artefact (e.g. "b'the") through, and discarded
        # real words that merely started with a quote (e.g. "'downs").
        word = word.lower().replace("b'", '').replace('b"', '').replace('"', '').replace("'", '')
        # A token made only of artefacts is empty after cleaning; skip it
        # instead of appending an empty string to the result.
        if word and check(word):
            res.append(wordnet_lemmatizer.lemmatize(word))
    return res
print('****************')

# Step 1.5: run the cleaning step over the corpus and both feature sets.
print("【1.5】正在预处理数据_停用词/数字/字符/提取词干...")
corpus = list(map(preprocessing, corpus))
X_train = list(map(preprocessing, X_train))
X_test = list(map(preprocessing, X_test))

print('****************')
print('****************')

# Step 2: train a Word2Vec model on the cleaned training corpus.
print("【2】正在将预处理结束的词语——Word2Vec_平均每一个单词的向量...")
from gensim.models.word2vec import Word2Vec
w2v_options = dict(size=128, window=5, min_count=5, workers=4)
model = Word2Vec(corpus, **w2v_options)
print(model['ok'])

# Vocabulary of the trained model, used to skip unknown words when the
# sentences are turned into vectors.
vocab = model.wv.vocab

def get_vector(word_list):
    """Return the mean word vector of the in-vocabulary words of ``word_list``.

    Args:
        word_list: iterable of tokens. Tokens that are not in the module-level
            ``vocab`` are ignored.

    Returns:
        NumPy array of length 128. When none of the tokens is in the
        vocabulary (or ``word_list`` is empty) the all-zero vector is
        returned.
    """
    res = np.zeros([128])
    count = 0
    for word in word_list:
        if word in vocab:
            # Look the vector up through model.wv, the same object the
            # vocabulary comes from.
            res += model.wv[word]
            count += 1
    if count == 0:
        # Dividing by zero here would yield an all-NaN vector, which the
        # downstream estimator cannot consume.
        return res
    return res / count

# Quick call of the averaging helper on a sample sentence; the result is discarded.
get_vector(['hello', 'from', 'the', 'other', 'side'])
print("*******我是分割线1*********")

# Keep references to the token lists before they are replaced by vectors.
wordlist_train, wordlist_test = X_train, X_test

# Step 2.1: represent every sample by the mean vector of its words.
print("【2.1】正在处理X_train/X_test中的每一条向量...")
X_train = list(map(get_vector, X_train))
X_test = list(map(get_vector, X_test))
print(X_train[10])
print("*******我是分割线2*********")


# Step 3: 3-fold cross-validated ROC AUC of an SVR for a range of gamma values.
print("【3】正在调用SVM_SVR模型...")
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score

params = [0.1, 0.5, 1, 5, 7, 9, 12, 15, 20, 25, 30, 40, 50]
test_scores = []
for param in params:
    clf = SVR(gamma=param)
    test_score = cross_val_score(clf, X_train, y_train, cv=3, scoring='roc_auc')
    test_scores.append(test_score.mean())

# Step 4: plot the mean score against gamma.
print("【4】打印Roc_auc精度 - Params...")
import matplotlib.pyplot as plt
plt.plot(params, test_scores)
plt.title("Roc_auc - SVR ")
# NOTE(review): this prints the LOWEST mean score over all gamma values --
# confirm whether the best score (max) was intended.
print(min(test_scores))
plt.show()

处理结果如下：

【1】正在读入数据...
(1989, 27)
********
【1.1】正在划分训练/测试集合...
(1611, 27)
(378, 27)
********
【1.2】正在获取语料库flatten,corpus...
b'BREAKING: Musharraf to be impeached.'
【1.3】正在把25条新闻合并中train/test...
['b"Georgia \'downs two Russian warplanes\' as countries move to brink of war"'
 "b'BREAKING: Musharraf to be impeached.'"
 "b'Russia Today: Columns of troops roll into South Ossetia; footage from fighting (YouTube)'"]
['b"Georgia \'downs two Russian warplanes\' as countries move to brink of war" b\'BREAKING: Musharraf to be impeached.\' b\'Russia Today: Columns of troops roll into South Ossetia; footage from fighting (YouTube)\' b\'Russian tanks are moving towards the capital of South Ossetia, which has reportedly been completely destroyed by Georgian artillery fire\' b"Afghan children raped with \'impunity,\' U.N. official says - this is sick, a three year old was raped and they do nothing" b\'150 Russian tanks have entered South Ossetia whilst Georgia shoots down two Russian jets.\' b"Breaking: Georgia invades South Ossetia, Russia warned it would intervene on SO\'s side" b"The \'enemy combatent\' trials are nothing but a sham: Salim Haman has been sentenced to 5 1/2 years, but will be kept longer anyway just because they feel like it." b\'Georgian troops retreat from S. Osettain capital, presumably leaving several hundred people killed. [VIDEO]\' b\'Did the U.S. Prep Georgia for War with Russia?\' b\'Rice Gives Green Light for Israel to Attack Iran: Says U.S. has no veto over Israeli military ops\' b\'Announcing:Class Action Lawsuit on Behalf of American Public Against the FBI\' b"So---Russia and Georgia are at war and the NYT\'s top story is opening ceremonies of the Olympics?  What a fucking disgrace and yet further proof of the decline of journalism." b"China tells Bush to stay out of other countries\' affairs" b\'Did World War III start today?\' b\'Georgia Invades South Ossetia - if Russia gets involved, will NATO absorb Georgia and unleash a full scale war?\' b\'Al-Qaeda Faces Islamist Backlash\' b\'Condoleezza Rice: "The US would not act to prevent an Israeli strike on Iran." 
Israeli Defense Minister Ehud Barak: "Israel is prepared for uncompromising victory in the case of military hostilities."\' b\'This is a busy day:  The European Union has approved new sanctions against Iran in protest at its nuclear programme.\' b"Georgia will withdraw 1,000 soldiers from Iraq to help fight off Russian forces in Georgia\'s breakaway region of South Ossetia" b\'Why the Pentagon Thinks Attacking Iran is a Bad Idea - US News & World Report\' b\'Caucasus in crisis: Georgia invades South Ossetia\' b\'Indian shoe manufactory  - And again in a series of "you do not like your work?"\' b\'Visitors Suffering from Mental Illnesses Banned from Olympics\' b"No Help for Mexico\'s Kidnapping Surge"']
【1.4】正在tokenize合并后对语料库...
#################
[['b', "''", 'Georgia', "'downs", 'two', 'Russian', 'warplanes', "'", 'as', 'countries', 'move', 'to', 'brink', 'of', 'war', "''", "b'BREAKING", ':', 'Musharraf', 'to', 'be', 'impeached', '.', "'", "b'Russia", 'Today', ':', 'Columns', 'of', 'troops', 'roll', 'into', 'South', 'Ossetia', ';', 'footage', 'from', 'fighting', '(', 'YouTube', ')', "'", "b'Russian", 'tanks', 'are', 'moving', 'towards', 'the', 'capital', 'of', 'South', 'Ossetia', ',', 'which', 'has', 'reportedly', 'been', 'completely', 'destroyed', 'by', 'Georgian', 'artillery', 'fire', "'", 'b', "''", 'Afghan', 'children', 'raped', 'with', "'impunity", ',', "'", 'U.N.', 'official', 'says', '-', 'this', 'is', 'sick', ',', 'a', 'three', 'year', 'old', 'was', 'raped', 'and', 'they', 'do', 'nothing', "''", "b'150", 'Russian', 'tanks', 'have', 'entered', 'South', 'Ossetia', 'whilst', 'Georgia', 'shoots', 'down', 'two', 'Russian', 'jets', '.', "'", 'b', "''", 'Breaking', ':', 'Georgia', 'invades', 'South', 'Ossetia', ',', 'Russia', 'warned', 'it', 'would', 'intervene', 'on', 'SO', "'s", 'side', "''", 'b', "''", 'The', "'enemy", 'combatent', "'", 'trials', 'are', 'nothing', 'but', 'a', 'sham', ':', 'Salim', 'Haman', 'has', 'been', 'sentenced', 'to', '5', '1/2', 'years', ',', 'but', 'will', 'be', 'kept', 'longer', 'anyway', 'just', 'because', 'they', 'feel', 'like', 'it', '.', "''", "b'Georgian", 'troops', 'retreat', 'from', 'S.', 'Osettain', 'capital', ',', 'presumably', 'leaving', 'several', 'hundred', 'people', 'killed', '.', '[', 'VIDEO', ']', "'", "b'Did", 'the', 'U.S.', 'Prep', 'Georgia', 'for', 'War', 'with', 'Russia', '?', "'", "b'Rice", 'Gives', 'Green', 'Light', 'for', 'Israel', 'to', 'Attack', 'Iran', ':', 'Says', 'U.S.', 'has', 'no', 'veto', 'over', 'Israeli', 'military', 'ops', "'", "b'Announcing", ':', 'Class', 'Action', 'Lawsuit', 'on', 'Behalf', 'of', 'American', 'Public', 'Against', 'the', 'FBI', "'", 'b', "''", 'So', '--', '-Russia', 'and', 'Georgia', 'are', 'at', 'war', 'and', 'the', 'NYT', 
"'s", 'top', 'story', 'is', 'opening', 'ceremonies', 'of', 'the', 'Olympics', '?', 'What', 'a', 'fucking', 'disgrace', 'and', 'yet', 'further', 'proof', 'of', 'the', 'decline', 'of', 'journalism', '.', "''", 'b', "''", 'China', 'tells', 'Bush', 'to', 'stay', 'out', 'of', 'other', 'countries', "'", 'affairs', "''", "b'Did", 'World', 'War', 'III', 'start', 'today', '?', "'", "b'Georgia", 'Invades', 'South', 'Ossetia', '-', 'if', 'Russia', 'gets', 'involved', ',', 'will', 'NATO', 'absorb', 'Georgia', 'and', 'unleash', 'a', 'full', 'scale', 'war', '?', "'", "b'Al-Qaeda", 'Faces', 'Islamist', 'Backlash', "'", "b'Condoleezza", 'Rice', ':', '``', 'The', 'US', 'would', 'not', 'act', 'to', 'prevent', 'an', 'Israeli', 'strike', 'on', 'Iran', '.', "''", 'Israeli', 'Defense', 'Minister', 'Ehud', 'Barak', ':', '``', 'Israel', 'is', 'prepared', 'for', 'uncompromising', 'victory', 'in', 'the', 'case', 'of', 'military', 'hostilities', '.', "''", "'", "b'This", 'is', 'a', 'busy', 'day', ':', 'The', 'European', 'Union', 'has', 'approved', 'new', 'sanctions', 'against', 'Iran', 'in', 'protest', 'at', 'its', 'nuclear', 'programme', '.', "'", 'b', "''", 'Georgia', 'will', 'withdraw', '1,000', 'soldiers', 'from', 'Iraq', 'to', 'help', 'fight', 'off', 'Russian', 'forces', 'in', 'Georgia', "'s", 'breakaway', 'region', 'of', 'South', 'Ossetia', "''", "b'Why", 'the', 'Pentagon', 'Thinks', 'Attacking', 'Iran', 'is', 'a', 'Bad', 'Idea', '-', 'US', 'News', '&', 'amp', ';', 'World', 'Report', "'", "b'Caucasus", 'in', 'crisis', ':', 'Georgia', 'invades', 'South', 'Ossetia', "'", "b'Indian", 'shoe', 'manufactory', '-', 'And', 'again', 'in', 'a', 'series', 'of', '``', 'you', 'do', 'not', 'like', 'your', 'work', '?', "''", "'", "b'Visitors", 'Suffering', 'from', 'Mental', 'Illnesses', 'Banned', 'from', 'Olympics', "'", 'b', "''", 'No', 'Help', 'for', 'Mexico', "'s", 'Kidnapping', 'Surge', "''"], ["b'Why", 'wont', 'America', 'and', 'Nato', 'help', 'us', '?', 'If', 'they', 'wont', 'help', 'us', 
'now', ',', 'why', 'did', 'we', 'help', 'them', 'in', 'Iraq', '?', "'", "b'Bush", 'puts', 'foot', 'down', 'on', 'Georgian', 'conflict', "'", 'b', "''", 'Jewish', 'Georgian', 'minister', ':', 'Thanks', 'to', 'Israeli', 'training', ',', 'we', "'re", 'fending', 'off', 'Russia', '``', "b'Georgian", 'army', 'flees', 'in', 'disarray', 'as', 'Russians', 'advance', '-', 'Gori', 'abandoned', 'to', 'Russia', 'without', 'a', 'shot', 'fired', "'", 'b', "''", 'Olympic', 'opening', 'ceremony', 'fireworks', "'faked", "'", "''", "b'What", 'were', 'the', 'Mossad', 'with', 'fraudulent', 'New', 'Zealand', 'Passports', 'doing', 'in', 'Iraq', '?', "'", "b'Russia", 'angered', 'by', 'Israeli', 'military', 'sale', 'to', 'Georgia', "'", "b'An", 'American', 'citizen', 'living', 'in', 'S.Ossetia', 'blames', 'U.S.', 'and', 'Georgian', 'leaders', 'for', 'the', 'genocide', 'of', 'innocent', 'people', "'", "b'Welcome", 'To', 'World', 'War', 'IV', '!', 'Now', 'In', 'High', 'Definition', '!', "'", 'b', "''", 'Georgia', "'s", 'move', ',', 'a', 'mistake', 'of', 'monumental', 'proportions', '``', "b'Russia", 'presses', 'deeper', 'into', 'Georgia', ';', 'U.S.', 'says', 'regime', 'change', 'is', 'goal', "'", "b'Abhinav", 'Bindra', 'wins', 'first', 'ever', 'Individual', 'Olympic', 'Gold', 'Medal', 'for', 'India', "'", 'b', "'", 'U.S.', 'ship', 'heads', 'for', 'Arctic', 'to', 'define', 'territory', "'", "b'Drivers", 'in', 'a', 'Jerusalem', 'taxi', 'station', 'threaten', 'to', 'quit', 'rather', 'than', 'work', 'for', 'their', 'new', 'boss', '-', 'an', 'Arab', "'", "b'The", 'French', 'Team', 'is', 'Stunned', 'by', 'Phelps', 'and', 'the', '4x100m', 'Relay', 'Team', "'", "b'Israel", 'and', 'the', 'US', 'behind', 'the', 'Georgian', 'aggression', '?', "'", 'b', "'", "''", 'Do', 'not', 'believe', 'TV', ',', 'neither', 'Russian', 'nor', 'Georgian', '.', 'There', 'are', 'much', 'more', 'victims', "''", "'", "b'Riots", 'are', 'still', 'going', 'on', 'in', 'Montreal', '(', 'Canada', ')', 'because', 'police', 
'murdered', 'a', 'boy', 'on', 'Saturday', '.', "'", "b'China", 'to', 'overtake', 'US', 'as', 'largest', 'manufacturer', "'", "b'War", 'in', 'South', 'Ossetia', '[', 'PICS', ']', "'", "b'Israeli", 'Physicians', 'Group', 'Condemns', 'State', 'Torture', "'", 'b', "'", 'Russia', 'has', 'just', 'beaten', 'the', 'United', 'States', 'over', 'the', 'head', 'with', 'Peak', 'Oil', "'", "b'Perhaps", '*the*', 'question', 'about', 'the', 'Georgia', '-', 'Russia', 'conflict', "'", "b'Russia", 'is', 'so', 'much', 'better', 'at', 'war', "'", 'b', "''", 'So', 'this', 'is', 'what', 'it', "'s", 'come', 'to', ':', 'trading', 'sex', 'for', 'food', '.', "''"]]
[['b', "''", 'Georgia', "'downs", 'two', 'Russian', 'warplanes', "'", 'as', 'countries', 'move', 'to', 'brink', 'of', 'war', "''"], ["b'BREAKING", ':', 'Musharraf', 'to', 'be', 'impeached', '.', "'"]]
#################
****************
【1.5】正在预处理数据_停用词/数字/字符/提取词干...
****************
****************
【2】正在将预处理结束的词语——Word2Vec_平均每一个单词的向量...
[-0.12198066 -0.19349506  0.2272697   0.17855231 -0.13176537 -0.05874182
 -0.0038822   0.26390707 -0.11713045  0.03686697  0.21172981  0.18202488
  0.0625032  -0.20089018  0.05533357 -0.11601048  0.06156243  0.05652046
  0.03306048  0.18263416 -0.03129722 -0.19075146 -0.14293405  0.02830653
  0.08682565  0.09871766  0.11757042  0.30550867  0.1495696   0.0840437
  0.19152424  0.03698351  0.0860781   0.03099143 -0.06468096 -0.02715972
 -0.12213221  0.00517559  0.13000089  0.10167067  0.09931231 -0.20446911
 -0.1533122  -0.2045926   0.08525646 -0.0807068   0.27022147 -0.11640876
  0.0013384  -0.06654033  0.09167262  0.08706438  0.05540042  0.03862222
  0.10412232  0.04944295  0.08350987  0.15736297  0.15726984 -0.06071978
  0.0052347  -0.03026398 -0.1476018  -0.02563275 -0.01206466 -0.01588351
 -0.22076711 -0.04309006 -0.00491295 -0.12354583 -0.04404394 -0.03992388
 -0.05955072 -0.03095401 -0.00665365 -0.0677621   0.31858638 -0.3258347
  0.10484565  0.07364411  0.06579694  0.04243176  0.06389505  0.0664937
 -0.05259128 -0.34212872  0.08969858 -0.05974254 -0.2542102   0.22155738
  0.05351034 -0.0572664  -0.24335395 -0.07126738 -0.06069791  0.18856542
 -0.05407845  0.03955258  0.02313455 -0.06236781 -0.10119005 -0.13466756
  0.06583793 -0.16692838  0.15498406  0.10939295  0.03090773  0.02114977
  0.01098908  0.08838415  0.05798338  0.3442737   0.04969002  0.05149802
 -0.23125067  0.0098875  -0.2644284   0.18171756 -0.08449835 -0.13592513
  0.13917942 -0.0350637   0.3025364  -0.06173839  0.06829002  0.08331735
  0.08410589  0.06224122]
*******我是分割线1*********
【2.1】正在处理X_train/X_test中的每一条向量...
[-0.19389711 -0.36415527  0.42890032  0.30812147 -0.23086774 -0.1314086
  0.05643824  0.37439691 -0.24945417  0.10694078  0.32720501  0.24867821
  0.07946815 -0.28634701  0.05045462 -0.25169352  0.10614993  0.12144271
  0.00345918  0.299194   -0.01539751 -0.29544419 -0.2754242   0.0190655
  0.11954962  0.2013825   0.21401726  0.46924831  0.19991203  0.13479685
  0.32444439  0.04833731  0.11932954  0.07103907 -0.14060805 -0.04024484
 -0.18111716 -0.04061282  0.13660831  0.16181362  0.17162352 -0.30906741
 -0.29716192 -0.26674847  0.16652763 -0.15136299  0.38823764 -0.23045407
 -0.07513595 -0.04705774  0.14249443  0.20484248  0.05601557  0.09372386
  0.18372274  0.05481556  0.12576672  0.26380944  0.3540449  -0.07330822
 -0.06383919 -0.04666297 -0.31466208 -0.01653562  0.00803791 -0.04995444
 -0.33330808 -0.04854833 -0.02735453 -0.19239997 -0.12130485 -0.06797619
 -0.11062339  0.02960971 -0.0389864  -0.11901791  0.54829714 -0.47069883
  0.18953402  0.1166861   0.06852083  0.05546854  0.07791001  0.04070867
 -0.08374217 -0.51137908  0.16073435 -0.08125622 -0.39010982  0.3542238
  0.01146496 -0.02442945 -0.42576402 -0.13023017 -0.13134052  0.34049785
 -0.07820753  0.02882772  0.09540374 -0.10279276 -0.1906894  -0.20165037
  0.07815217 -0.33511828  0.25848117  0.17182536  0.04049695  0.01956215
 -0.00173748  0.13132811  0.12115839  0.58085842  0.06486564  0.08636677
 -0.35884734  0.07561271 -0.40698867  0.26746094 -0.16769977 -0.17340923
  0.25833386 -0.0914841   0.52499937 -0.09275098  0.16764335  0.17813361
  0.08292444  0.07964778]
*******我是分割线2*********
【3】正在调用SVM_SVR模型...
【4】打印Roc_auc精度 - Params...
0.4747014709597402

结论：Roc_auc 非常的低，效果不好！

可能的原因：语料库太小——一共只有约 2000 个样本（每个交易日一条，每条由 25 条新闻标题合并而成），总词数也只有几万个单词，不足以训练出高质量的词向量。这一块需要改进，后续可以试一试深度学习的方法。

News_predict - 机器学习分类（0/1）

猜你喜欢