莎士比亚文本数据集
实战
步骤
1.产生词表
2.建立字符与id的对应
3.将词表数据都转成id
4.由文本构造输入/输出对: abcde -> 输入 abcd, 目标 bcde
导包
# Notebook setup: plotting + scientific stack, then report every version
# so results are reproducible against the environment printed below.
import matplotlib as mpl
import matplotlib.pyplot as plt
# %matplotlib inline  # notebook-only magic; a SyntaxError in a plain .py file,
#                     # so it is kept here as a comment for reference
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf
from tensorflow import keras

print(tf.__version__)
print(sys.version_info)
for module in mpl, np, pd, sklearn, tf, keras:
    print(module.__name__, module.__version__)
2.1.0
sys.version_info(major=3, minor=7, micro=4, releaselevel='final', serial=0)
matplotlib 3.1.1
numpy 1.16.5
pandas 0.25.1
sklearn 0.21.3
tensorflow 2.1.0
tensorflow_core.python.keras.api._v2.keras 2.2.4-tf
from urllib import request

# Local cache location for the ~1.1 MB Shakespeare corpus.
file_path = 'E:/zym_test/test/RNN'
txt_path = 'E:/zym_test/test/RNN/shakespeare.txt'
if not os.path.exists(txt_path):
    # First run only: download the corpus next to file_path.
    txt_path = os.path.join(file_path, 'shakespeare.txt')
    url = 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt'
    request.urlretrieve(url, txt_path)

# Use a context manager so the handle is always closed, and pin the
# encoding explicitly instead of relying on the platform default
# (on Windows the default codec could mis-decode the file).
with open(txt_path, 'r', encoding='utf-8') as f:
    text = f.read()
print(len(text))
print(text[0:100])
1115394
First Citizen:
Before we proceed any further, hear me speak.
All:
Speak, speak.
First Citizen:
You
# Vocabulary: every distinct character in the corpus, sorted for a stable order.
vocab = sorted ( set ( text) )
print ( len ( vocab) )
print ( vocab)
65
['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
# char -> integer id (its position in the sorted vocabulary).
char2idx = { char: idx for idx, char in enumerate ( vocab) }
print ( char2idx)
{'\n': 0, ' ': 1, '!': 2, '$': 3, '&': 4, "'": 5, ',': 6, '-': 7, '.': 8, '3': 9, ':': 10, ';': 11, '?': 12, 'A': 13, 'B': 14, 'C': 15, 'D': 16, 'E': 17, 'F': 18, 'G': 19, 'H': 20, 'I': 21, 'J': 22, 'K': 23, 'L': 24, 'M': 25, 'N': 26, 'O': 27, 'P': 28, 'Q': 29, 'R': 30, 'S': 31, 'T': 32, 'U': 33, 'V': 34, 'W': 35, 'X': 36, 'Y': 37, 'Z': 38, 'a': 39, 'b': 40, 'c': 41, 'd': 42, 'e': 43, 'f': 44, 'g': 45, 'h': 46, 'i': 47, 'j': 48, 'k': 49, 'l': 50, 'm': 51, 'n': 52, 'o': 53, 'p': 54, 'q': 55, 'r': 56, 's': 57, 't': 58, 'u': 59, 'v': 60, 'w': 61, 'x': 62, 'y': 63, 'z': 64}
print ( type ( vocab) )
<class 'list'>
# Reverse lookup id -> char; an ndarray allows fancy indexing with whole
# integer arrays later (idx2char[ids] decodes a full sequence at once).
idx2char = np. array( vocab)
print ( type ( idx2char) )
<class 'numpy.ndarray'>
# Encode the entire corpus as an integer array of character ids.
text_as_int = np. array( [ char2idx[ c] for c in text] )
print ( text[ 0 : 10 ] )
print ( text_as_int[ 0 : 10 ] )
First Citi
[18 47 56 57 58 1 15 47 58 47]
def split_input_target(id_text):
    """Split one encoded window into an (input, target) pair.

    The target is the input shifted left by one step:
    abcde -> abcd, bcde
    """
    return id_text[:-1], id_text[1:]
# Stream the encoded corpus as a tf.data pipeline of single character ids.
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# Peek at the first few ids and the characters they decode to.
for ch_id in char_dataset.take(3):
    print(ch_id, idx2char[ch_id.numpy()])

# Chunk the stream into windows of seq_length + 1 ids: one extra
# character so the later input/target split yields seq_length each.
# Any trailing partial window is discarded.
seq_length = 100
seq_dataset = char_dataset.batch(seq_length + 1, drop_remainder=True)

# Inspect two raw windows, as id tensors and as decoded text.
for seq in seq_dataset.take(2):
    print(seq)
    decoded = ''.join(idx2char[seq.numpy()])
    print(repr(decoded))
tf.Tensor(18, shape=(), dtype=int32) F
tf.Tensor(47, shape=(), dtype=int32) i
tf.Tensor(56, shape=(), dtype=int32) r
tf.Tensor(
[18 47 56 57 58 1 15 47 58 47 64 43 52 10 0 14 43 44 53 56 43 1 61 43
1 54 56 53 41 43 43 42 1 39 52 63 1 44 59 56 58 46 43 56 6 1 46 43
39 56 1 51 43 1 57 54 43 39 49 8 0 0 13 50 50 10 0 31 54 43 39 49
6 1 57 54 43 39 49 8 0 0 18 47 56 57 58 1 15 47 58 47 64 43 52 10
0 37 53 59 1], shape=(101,), dtype=int32)
'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '
tf.Tensor(
[39 56 43 1 39 50 50 1 56 43 57 53 50 60 43 42 1 56 39 58 46 43 56 1
58 53 1 42 47 43 1 58 46 39 52 1 58 53 1 44 39 51 47 57 46 12 0 0
13 50 50 10 0 30 43 57 53 50 60 43 42 8 1 56 43 57 53 50 60 43 42 8
0 0 18 47 56 57 58 1 15 47 58 47 64 43 52 10 0 18 47 56 57 58 6 1
63 53 59 1 49], shape=(101,), dtype=int32)
'are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you k'
# Convert every (seq_length + 1)-char window into an (input, target) pair.
seq_dataset = seq_dataset.map(split_input_target)

# Sanity check: the target should be the input shifted left by one id.
for item_input, item_output in seq_dataset.take(2):
    for tensor in (item_input, item_output):
        print(tensor.numpy())
[18 47 56 57 58 1 15 47 58 47 64 43 52 10 0 14 43 44 53 56 43 1 61 43
1 54 56 53 41 43 43 42 1 39 52 63 1 44 59 56 58 46 43 56 6 1 46 43
39 56 1 51 43 1 57 54 43 39 49 8 0 0 13 50 50 10 0 31 54 43 39 49
6 1 57 54 43 39 49 8 0 0 18 47 56 57 58 1 15 47 58 47 64 43 52 10
0 37 53 59]
[47 56 57 58 1 15 47 58 47 64 43 52 10 0 14 43 44 53 56 43 1 61 43 1
54 56 53 41 43 43 42 1 39 52 63 1 44 59 56 58 46 43 56 6 1 46 43 39
56 1 51 43 1 57 54 43 39 49 8 0 0 13 50 50 10 0 31 54 43 39 49 6
1 57 54 43 39 49 8 0 0 18 47 56 57 58 1 15 47 58 47 64 43 52 10 0
37 53 59 1]
[39 56 43 1 39 50 50 1 56 43 57 53 50 60 43 42 1 56 39 58 46 43 56 1
58 53 1 42 47 43 1 58 46 39 52 1 58 53 1 44 39 51 47 57 46 12 0 0
13 50 50 10 0 30 43 57 53 50 60 43 42 8 1 56 43 57 53 50 60 43 42 8
0 0 18 47 56 57 58 1 15 47 58 47 64 43 52 10 0 18 47 56 57 58 6 1
63 53 59 1]
[56 43 1 39 50 50 1 56 43 57 53 50 60 43 42 1 56 39 58 46 43 56 1 58
53 1 42 47 43 1 58 46 39 52 1 58 53 1 44 39 51 47 57 46 12 0 0 13
50 50 10 0 30 43 57 53 50 60 43 42 8 1 56 43 57 53 50 60 43 42 8 0
0 18 47 56 57 58 1 15 47 58 47 64 43 52 10 0 18 47 56 57 58 6 1 63
53 59 1 49]
# Shuffle with a 10k-example buffer, then collect the pairs into
# fixed-size training batches; an incomplete final batch is dropped so
# every batch has exactly batch_size examples.
batch_size = 64
buffer_size = 10000
seq_dataset = (
    seq_dataset
    .shuffle(buffer_size)
    .batch(batch_size, drop_remainder=True)
)