数据分析之学术前沿---任务4

import seaborn as sns #⽤于画图
from bs4 import BeautifulSoup #⽤于爬取arxiv的数据
import re #⽤于正则表达式，匹配字符串的模式
import requests #⽤于⽹络连接，发送⽹络请求，使⽤域名获取对应信息
import json #读取数据，我们的数据为json格式的
import pandas as pd #数据处理，数据分析
import matplotlib.pyplot as plt #画图⼯具

data = []  # accumulated records, one dict per paper
# A with-statement closes the file handle automatically, even when reading
# raises. The encoding is given explicitly so the result does not depend on
# the platform's default locale codec.
with open("arxiv-metadata-oai-2019.json", 'r', encoding='utf-8') as f:
    for idx, line in enumerate(f):
        record = json.loads(line)  # one JSON object per line
        # Keep only the fields used by the later cells.
        data.append({
            'title': record['title'],
            'categories': record['categories'],
            'abstract': record['abstract'],
        })

        # Use only part of the data: the check runs after the append, so
        # reading stops once the record with index 200001 has been stored.
        if idx > 200000:
            break
data = pd.DataFrame(data)  # list of dicts -> DataFrame for pandas analysis

data

	title	categories	abstract
0	Remnant evolution after a carbon-oxygen white ...	astro-ph	We systematically explore the evolution of t...
1	Cofibrations in the Category of Frolicher Spac...	math.AT	Cofibrations are defined in the category of ...
2	Torsional oscillations of longitudinally inhom...	astro-ph	We explore the effect of an inhomogeneous ma...
3	On the Energy-Momentum Problem in Static Einst...	gr-qc	This paper has been removed by arXiv adminis...
4	The Formation of Globular Cluster Systems in M...	astro-ph	The most massive elliptical galaxies show a ...
...	...	...	...
170613	Enhancement of Magneto-Optic Effects via Large...	quant-ph	We utilize the generation of large atomic co...
170614	Explicit and Exact Solutions to a Kolmogorov-P...	solv-int nlin.SI	Some explicit traveling wave solutions to a ...
170615	Linear r-Matrix Algebra for a Hierarchy of One...	solv-int nlin.SI	We consider a hierarchy of many-particle sys...
170616	Pfaff tau-functions	solv-int adap-org hep-th nlin.AO nlin.SI	Consider the evolution $$ \frac{\pl m_\iy}{\...
170617	The General Solution of the Complex Monge-Amp\...	solv-int nlin.SI	A general solution to the Complex Monge-Amp\...

170618 rows × 3 columns

# Build one text field per paper: the title directly followed by the
# abstract (no separator is inserted between the two).
data['text'] = data['title'].add(data['abstract'])
data['text']

0         Remnant evolution after a carbon-oxygen white ...
1         Cofibrations in the Category of Frolicher Spac...
2         Torsional oscillations of longitudinally inhom...
3         On the Energy-Momentum Problem in Static Einst...
4         The Formation of Globular Cluster Systems in M...
                                ...                        
170613    Enhancement of Magneto-Optic Effects via Large...
170614    Explicit and Exact Solutions to a Kolmogorov-P...
170615    Linear r-Matrix Algebra for a Hierarchy of One...
170616    Pfaff tau-functions  Consider the evolution $$...
170617    The General Solution of the Complex Monge-Amp\...
Name: text, Length: 170618, dtype: object

# Replace every newline with a single space so each text sits on one line.
data['text'] = data['text'].map(lambda text: ' '.join(text.split('\n')))
data['text']

0         Remnant evolution after a carbon-oxygen white ...
1         Cofibrations in the Category of Frolicher Spac...
2         Torsional oscillations of longitudinally inhom...
3         On the Energy-Momentum Problem in Static Einst...
4         The Formation of Globular Cluster Systems in M...
                                ...                        
170613    Enhancement of Magneto-Optic Effects via Large...
170614    Explicit and Exact Solutions to a Kolmogorov-P...
170615    Linear r-Matrix Algebra for a Hierarchy of One...
170616    Pfaff tau-functions  Consider the evolution $$...
170617    The General Solution of the Complex Monge-Amp\...
Name: text, Length: 170618, dtype: object

# Lower-case every text so that matching is case-insensitive.
data['text'] = data['text'].map(str.lower)
data['text']

0         remnant evolution after a carbon-oxygen white ...
1         cofibrations in the category of frolicher spac...
2         torsional oscillations of longitudinally inhom...
3         on the energy-momentum problem in static einst...
4         the formation of globular cluster systems in m...
                                ...                        
170613    enhancement of magneto-optic effects via large...
170614    explicit and exact solutions to a kolmogorov-p...
170615    linear r-matrix algebra for a hierarchy of one...
170616    pfaff tau-functions  consider the evolution $$...
170617    the general solution of the complex monge-amp\...
Name: text, Length: 170618, dtype: object

# The title and abstract now live in the combined 'text' column.
data = data.drop(columns=['abstract', 'title'])
data

	categories	text
0	astro-ph	remnant evolution after a carbon-oxygen white ...
1	math.AT	cofibrations in the category of frolicher spac...
2	astro-ph	torsional oscillations of longitudinally inhom...
3	gr-qc	on the energy-momentum problem in static einst...
4	astro-ph	the formation of globular cluster systems in m...
...	...	...
170613	quant-ph	enhancement of magneto-optic effects via large...
170614	solv-int nlin.SI	explicit and exact solutions to a kolmogorov-p...
170615	solv-int nlin.SI	linear r-matrix algebra for a hierarchy of one...
170616	solv-int adap-org hep-th nlin.AO nlin.SI	pfaff tau-functions consider the evolution $$...
170617	solv-int nlin.SI	the general solution of the complex monge-amp\...

170618 rows × 2 columns

# Turn the space-separated category string into a list of category codes.
category_strings = data['categories']
data['categories'] = category_strings.map(lambda raw: raw.split(' '))
data['categories']

0                                             [astro-ph]
1                                              [math.AT]
2                                             [astro-ph]
3                                                [gr-qc]
4                                             [astro-ph]
                               ...                      
170613                                        [quant-ph]
170614                               [solv-int, nlin.SI]
170615                               [solv-int, nlin.SI]
170616    [solv-int, adap-org, hep-th, nlin.AO, nlin.SI]
170617                               [solv-int, nlin.SI]
Name: categories, Length: 170618, dtype: object

# Top-level category only, without the sub-category: keep the part before
# the first '.' (e.g. 'math.AT' -> 'math'). Repeated entries are kept as-is.
data['categories_big'] = data['categories'].map(
    lambda cats: [cat.partition('.')[0] for cat in cats]
)
data['categories_big']

0                                       [astro-ph]
1                                           [math]
2                                       [astro-ph]
3                                          [gr-qc]
4                                       [astro-ph]
                            ...                   
170613                                  [quant-ph]
170614                            [solv-int, nlin]
170615                            [solv-int, nlin]
170616    [solv-int, adap-org, hep-th, nlin, nlin]
170617                            [solv-int, nlin]
Name: categories_big, Length: 170618, dtype: object

from sklearn.preprocessing import MultiLabelBinarizer

# Encode the per-paper category lists as a 0/1 indicator matrix with one
# column per distinct top-level category.
mlb = MultiLabelBinarizer()
big_categories = data['categories_big']
data_label = mlb.fit_transform(big_categories)
data_label

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 1, 0, 0],
       [0, 1, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 1, 0, 0]])

from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features over the combined text, limited to 4000 vocabulary terms.
vectorizer = TfidfVectorizer(max_features=4000)
corpus = data['text']
data_tfidf = vectorizer.fit_transform(corpus)

data_tfidf

<170618x4000 sparse matrix of type '<class 'numpy.float64'>'
	with 13410660 stored elements in Compressed Sparse Row format>

# Split into a training set and a validation set (80% / 20%).
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    data_tfidf,
    data_label,
    test_size=0.2,
    random_state=1,
)

# Build the multi-label classifier: MultiOutputClassifier wrapping a
# MultinomialNB base estimator.
from sklearn.multioutput import MultiOutputClassifier
from sklearn.naive_bayes import MultinomialNB

clf = MultiOutputClassifier(estimator=MultinomialNB())
clf = clf.fit(x_train, y_train)  # fit() returns the fitted estimator itself

from sklearn.metrics import classification_report

# Per-category precision / recall / F1 on the held-out split.
# - target_names labels each row with its category name (taken from the
#   fitted MultiLabelBinarizer) instead of a bare column index.
# - zero_division=0 keeps the 0.0 score for categories that have no true or
#   no predicted samples, without emitting UndefinedMetricWarning.
y_pred = clf.predict(x_test)
print(classification_report(
    y_test,
    y_pred,
    target_names=list(mlb.classes_),
    zero_division=0,
))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.00      0.00      0.00         1
           2       0.00      0.00      0.00         0
           3       0.91      0.85      0.88      3625
           4       0.00      0.00      0.00         4
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         1
           7       0.00      0.00      0.00         0
           8       0.77      0.76      0.77      3801
           9       0.84      0.89      0.86     10715
          10       0.00      0.00      0.00         0
          11       0.00      0.00      0.00       186
          12       0.44      0.41      0.42      1621
          13       0.00      0.00      0.00         1
          14       0.75      0.59      0.66      1096
          15       0.61      0.80      0.69      1078
          16       0.90      0.19      0.32       242
          17       0.53      0.67      0.59      1451
          18       0.71      0.54      0.62      1400
          19       0.88      0.84      0.86     10243
          20       0.40      0.09      0.15       934
          21       0.00      0.00      0.00         1
          22       0.87      0.03      0.06       414
          23       0.48      0.65      0.55       517
          24       0.37      0.33      0.35       539
          25       0.00      0.00      0.00         1
          26       0.60      0.42      0.49      3891
          27       0.00      0.00      0.00         0
          28       0.82      0.08      0.15       676
          29       0.86      0.12      0.21       297
          30       0.80      0.40      0.53      1714
          31       0.00      0.00      0.00         4
          32       0.56      0.65      0.60      3398
          33       0.00      0.00      0.00         0

   micro avg       0.76      0.70      0.72     47851
   macro avg       0.39      0.27      0.29     47851
weighted avg       0.75      0.70      0.71     47851
 samples avg       0.74      0.76      0.72     47851



E:\ProgramData\Anaconda3\lib\site-packages\sklearn\metrics\_classification.py:1272: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
E:\ProgramData\Anaconda3\lib\site-packages\sklearn\metrics\_classification.py:1272: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
E:\ProgramData\Anaconda3\lib\site-packages\sklearn\metrics\_classification.py:1272: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in samples with no predicted labels. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))

from sklearn.model_selection import train_test_split

# Split again, this time with the raw text (not the TF-IDF matrix) as the
# features, so the texts can be tokenized for the neural model.
x_train, x_test, y_train, y_test = train_test_split(
    data['text'],
    data_label,
    test_size=0.2,
    random_state=1,
)

x_train, x_test, y_train, y_test

(74478     spin-wave mediated interactions for majority c...
 13509     identification of a monoclinic metallic state ...
 108596    effects of polarization and high harmonics of ...
 145385    distributed sgd generalizes well under asynchr...
 159998    interaction mechanism and response of tidal ef...
                                 ...                        
 73349     solving differential equations with neural net...
 109259    a trainable multiplication layer for auto-corr...
 50057     a phase variable approach for improved rhythmi...
 5192      on improving roth's theorem in the primes  let...
 128037    merger rate of stellar black hole binaries abo...
 Name: text, Length: 136494, dtype: object,
 57465     logistic type attraction-repulsion chemotaxis ...
 2978      sur le produit de vari\'et\'es localement fact...
 156935    hmtnet:3d hand pose estimation from single dep...
 64243     using structurally well-defined norbornyl-brid...
 94939     rotational symmetry of ancient solutions to th...
                                 ...                        
 38827     exactly solvable deterministic lattice model o...
 169521    a poincare-covariant parton cascade model for ...
 67719     interleaving loidreau's rank-metric cryptosyst...
 132236    effects of string cloud on gauss-bonnet hologr...
 20244     real hypersurfaces with miao-tam critical metr...
 Name: text, Length: 34124, dtype: object,
 array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]]),
 array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]]))

# Hyperparameters for the tokenizer and the embedding layer.
max_features= 500  # vocabulary cap: passed as Tokenizer num_words and as the Embedding input size
max_len= 150  # every token sequence is padded/truncated to this length
embed_size=100  # size of each embedding vector

# Define the model and run the training:
import tensorflow as tf  # NOTE(review): `tf` is not referenced in the visible code - confirm before removing
batch_size = 128  # passed to model.fit
epochs = 5  # passed to model.fit

from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence

E:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\framework\dtypes.py:516: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
E:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\framework\dtypes.py:517: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
E:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\framework\dtypes.py:518: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
E:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\framework\dtypes.py:519: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
E:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\framework\dtypes.py:520: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
E:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\framework\dtypes.py:525: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  np_resource = np.dtype([("resource", np.ubyte, 1)])
E:\ProgramData\Anaconda3\lib\site-packages\tensorboard\compat\tensorflow_stub\dtypes.py:541: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
E:\ProgramData\Anaconda3\lib\site-packages\tensorboard\compat\tensorflow_stub\dtypes.py:542: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
E:\ProgramData\Anaconda3\lib\site-packages\tensorboard\compat\tensorflow_stub\dtypes.py:543: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
E:\ProgramData\Anaconda3\lib\site-packages\tensorboard\compat\tensorflow_stub\dtypes.py:544: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
E:\ProgramData\Anaconda3\lib\site-packages\tensorboard\compat\tensorflow_stub\dtypes.py:545: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
E:\ProgramData\Anaconda3\lib\site-packages\tensorboard\compat\tensorflow_stub\dtypes.py:550: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Using TensorFlow backend.

# Build the vocabulary from the training texts only, so that nothing from
# the held-out texts leaks into preprocessing. Words that are not in the
# fitted vocabulary are simply left out of the generated sequences.
tokens = Tokenizer(num_words=max_features)
tokens.fit_on_texts(list(x_train))

# Map each text to a sequence of word indices, then pad/truncate every
# sequence to exactly max_len entries.
x_sub_train = tokens.texts_to_sequences(x_train)
x_sub_test = tokens.texts_to_sequences(x_test)
x_sub_train = sequence.pad_sequences(x_sub_train, maxlen=max_len)
x_sub_test = sequence.pad_sequences(x_sub_test, maxlen=max_len)

# Bidirectional GRU + Conv1D model for multi-label classification
# Keras Layers:
from keras.layers import Dense,Input,LSTM,Bidirectional,Activation,Conv1D,GRU
from keras.layers import Dropout,Embedding,GlobalMaxPooling1D, MaxPooling1D,Add, Flatten
from keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D,concatenate, SpatialDropout1D
# Keras Callback Functions:
from keras.callbacks import Callback
from keras.callbacks import EarlyStopping,ModelCheckpoint
from keras import initializers, regularizers, constraints, optimizers, layers,callbacks
from keras.models import Model
from keras.optimizers import Adam

# One sigmoid output per label column of the binarized targets, instead of
# a hard-coded width that silently breaks when the category set changes.
num_labels = y_train.shape[1]

sequence_input = Input(shape=(max_len, ))
# No pretrained vectors are loaded into this layer, so it has to stay
# trainable; freezing it would leave the model with random, fixed embeddings.
x = Embedding(max_features, embed_size)(sequence_input)
x = SpatialDropout1D(0.2)(x)
x = Bidirectional(
    GRU(128, return_sequences=True, dropout=0.1, recurrent_dropout=0.1)
)(x)
x = Conv1D(64, kernel_size=3, padding="valid", kernel_initializer="glorot_uniform")(x)
# Summarize the convolved sequence with both average and max pooling.
avg_pool = GlobalAveragePooling1D()(x)
max_pool = GlobalMaxPooling1D()(x)
x = concatenate([avg_pool, max_pool])
# Independent sigmoid per label + binary cross-entropy: a paper may belong
# to several categories at once.
preds = Dense(num_labels, activation="sigmoid")(x)
model = Model(sequence_input, preds)
model.compile(loss='binary_crossentropy',optimizer=Adam(lr=1e-3),metrics=['accuracy'])
model.fit(x_sub_train, y_train, batch_size=batch_size, epochs=epochs)

WARNING:tensorflow:From E:\ProgramData\Anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py:422: The name tf.global_variables is deprecated. Please use tf.compat.v1.global_variables instead.

Epoch 1/5
130688/136494 [===========================>..] - ETA: 57s - loss: 0.1288 - accuracy: 0.9578

数据分析之学术前沿---任务4

猜你喜欢