TensorFlow 2.0 学习(21):文本生成之数据处理

莎士比亚文本数据集

实战

  • 步骤
    • 1.产生词表
    • 2.建立字符与id的对应
    • 3.将文本数据全部转成id
    • 4.由输入文本构造右移一位的目标输出:abcd->bcd*
  • 导包
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf
from tensorflow import keras
# Report the interpreter version and the version of every library in use.
print(tf.__version__)
print(sys.version_info)
for lib in (mpl, np, pd, sklearn, tf, keras):
    print(f"{lib.__name__} {lib.__version__}")
2.1.0
sys.version_info(major=3, minor=7, micro=4, releaselevel='final', serial=0)
matplotlib 3.1.1
numpy 1.16.5
pandas 0.25.1
sklearn 0.21.3
tensorflow 2.1.0
tensorflow_core.python.keras.api._v2.keras 2.2.4-tf
  • 下载数据集
from urllib import request

# Local cache location of the Shakespeare corpus.
file_path = 'E:/zym_test/test/RNN'
txt_path = 'E:/zym_test/test/RNN/shakespeare.txt'
if not os.path.exists(txt_path):
    # urlretrieve writes straight to the target path and does not create
    # missing parent directories, so make sure the directory exists first.
    os.makedirs(file_path, exist_ok=True)
    txt_path = os.path.join(file_path, 'shakespeare.txt')
    url = 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt'
    request.urlretrieve(url, txt_path)

# Read inside a context manager so the file handle is closed as soon as the
# text is loaded; the explicit encoding avoids depending on the platform's
# locale default.
with open(txt_path, 'r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

# Sanity check: total number of characters and the first 100 of them.
print(len(text))
print(text[0:100])
1115394
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You
  • 生成词表
# Pipeline:
#   1. build the vocabulary
#   2. map every character to an id
#   3. convert the text data to ids
#   4. build the shifted target for each input: abcd -> bcd*

# Step 1: build the vocabulary.
# set() reduces the text to its distinct characters (unordered);
# sorting the resulting list puts them in a deterministic order.
vocab = list(set(text))
vocab.sort()
print(len(vocab), vocab, sep="\n")
65
['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
  • 建立字符与其id的对应
# Step 2: map every character to its id.
# Pairing each vocabulary entry with its position in the list yields the
# lookup table character -> index.
char2idx = dict(zip(vocab, range(len(vocab))))
print(char2idx)
{'\n': 0, ' ': 1, '!': 2, '$': 3, '&': 4, "'": 5, ',': 6, '-': 7, '.': 8, '3': 9, ':': 10, ';': 11, '?': 12, 'A': 13, 'B': 14, 'C': 15, 'D': 16, 'E': 17, 'F': 18, 'G': 19, 'H': 20, 'I': 21, 'J': 22, 'K': 23, 'L': 24, 'M': 25, 'N': 26, 'O': 27, 'P': 28, 'Q': 29, 'R': 30, 'S': 31, 'T': 32, 'U': 33, 'V': 34, 'W': 35, 'X': 36, 'Y': 37, 'Z': 38, 'a': 39, 'b': 40, 'c': 41, 'd': 42, 'e': 43, 'f': 44, 'g': 45, 'h': 46, 'i': 47, 'j': 48, 'k': 49, 'l': 50, 'm': 51, 'n': 52, 'o': 53, 'p': 54, 'q': 55, 'r': 56, 's': 57, 't': 58, 'u': 59, 'v': 60, 'w': 61, 'x': 62, 'y': 63, 'z': 64}
# Show the Python type of the vocabulary container.
print(type(vocab))
<class 'list'>
  • 建立id与其字符的对应
# Reverse lookup (id -> character): an array built from the vocabulary,
# so indexing it with an id returns the matching character.
idx2char = np.asarray(vocab)
print(type(idx2char))
<class 'numpy.ndarray'>
  • 将整个文本转换成id文本
# Step 3: convert the text data to ids.
# Look up every character of the text in char2idx and collect the ids
# in a NumPy array.
text_as_int = np.array(list(map(char2idx.__getitem__, text)))
# Compare the first ten characters with their ids.
print(text[:10])
print(text_as_int[:10])
First Citi
[18 47 56 57 58  1 15 47 58 47]
  • 将文本转换为dataset格式,并得到输入输出
# Step 4: derive the target from the input text: abcd -> bcd*
def split_input_target(id_text):
    """Split a sequence into an (input, target) pair shifted by one position.

    The input is everything except the last element and the target is
    everything except the first, e.g. ``abcde -> (abcd, bcde)``.

    Args:
        id_text: Any object that supports slicing.

    Returns:
        A tuple ``(id_text[0:-1], id_text[1:])``.
    """
    input_part = id_text[:-1]
    target_part = id_text[1:]
    return input_part, target_part


# Turn the id array into a dataset with one character id per element.
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)
# Use a real name for the loop variable: `_` conventionally marks a value
# that is never read, and IPython also uses `_` for the last result.
for char_id in char_dataset.take(3):
    # Show each id tensor next to the character it decodes to.
    print(char_id, idx2char[char_id.numpy()])
    
# Group the character stream into fixed-length sequences.
seq_length = 100
seq_dataset = char_dataset.batch(
    # One element more than seq_length per sequence.
    seq_length + 1,
    # Drop the last group if it is shorter than the others.
    drop_remainder=True)
# Use a real name for the loop variable: `_` conventionally marks a value
# that is never read, and IPython also uses `_` for the last result.
for seq in seq_dataset.take(2):
    print(seq)
    # repr() keeps special characters such as newlines visible.
    print(repr(''.join(idx2char[seq.numpy()])))
tf.Tensor(18, shape=(), dtype=int32) F
tf.Tensor(47, shape=(), dtype=int32) i
tf.Tensor(56, shape=(), dtype=int32) r
tf.Tensor(
[18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 14 43 44 53 56 43  1 61 43
  1 54 56 53 41 43 43 42  1 39 52 63  1 44 59 56 58 46 43 56  6  1 46 43
 39 56  1 51 43  1 57 54 43 39 49  8  0  0 13 50 50 10  0 31 54 43 39 49
  6  1 57 54 43 39 49  8  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10
  0 37 53 59  1], shape=(101,), dtype=int32)
'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '
tf.Tensor(
[39 56 43  1 39 50 50  1 56 43 57 53 50 60 43 42  1 56 39 58 46 43 56  1
 58 53  1 42 47 43  1 58 46 39 52  1 58 53  1 44 39 51 47 57 46 12  0  0
 13 50 50 10  0 30 43 57 53 50 60 43 42  8  1 56 43 57 53 50 60 43 42  8
  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 18 47 56 57 58  6  1
 63 53 59  1 49], shape=(101,), dtype=int32)
'are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you k'
# Apply split_input_target to every element with Dataset.map, so each
# sequence becomes an (input, target) pair.
seq_dataset = seq_dataset.map(split_input_target)

# Print the first two pairs, input first and target second.
for item_input, item_output in seq_dataset.take(2):
    print(item_input.numpy(), item_output.numpy(), sep="\n")
    
[18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 14 43 44 53 56 43  1 61 43
  1 54 56 53 41 43 43 42  1 39 52 63  1 44 59 56 58 46 43 56  6  1 46 43
 39 56  1 51 43  1 57 54 43 39 49  8  0  0 13 50 50 10  0 31 54 43 39 49
  6  1 57 54 43 39 49  8  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10
  0 37 53 59]
[47 56 57 58  1 15 47 58 47 64 43 52 10  0 14 43 44 53 56 43  1 61 43  1
 54 56 53 41 43 43 42  1 39 52 63  1 44 59 56 58 46 43 56  6  1 46 43 39
 56  1 51 43  1 57 54 43 39 49  8  0  0 13 50 50 10  0 31 54 43 39 49  6
  1 57 54 43 39 49  8  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10  0
 37 53 59  1]
[39 56 43  1 39 50 50  1 56 43 57 53 50 60 43 42  1 56 39 58 46 43 56  1
 58 53  1 42 47 43  1 58 46 39 52  1 58 53  1 44 39 51 47 57 46 12  0  0
 13 50 50 10  0 30 43 57 53 50 60 43 42  8  1 56 43 57 53 50 60 43 42  8
  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 18 47 56 57 58  6  1
 63 53 59  1]
[56 43  1 39 50 50  1 56 43 57 53 50 60 43 42  1 56 39 58 46 43 56  1 58
 53  1 42 47 43  1 58 46 39 52  1 58 53  1 44 39 51 47 57 46 12  0  0 13
 50 50 10  0 30 43 57 53 50 60 43 42  8  1 56 43 57 53 50 60 43 42  8  0
  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 18 47 56 57 58  6  1 63
 53 59  1 49]
# Shuffle the (input, target) pairs, then group them into batches.
batch_size = 64
buffer_size = 10000

seq_dataset = seq_dataset.shuffle(buffer_size)
# Drop the final batch if it holds fewer than batch_size elements.
seq_dataset = seq_dataset.batch(batch_size, drop_remainder=True)
发布了35 篇原创文章 · 获赞 3 · 访问量 2491

猜你喜欢

转载自blog.csdn.net/Smile_mingm/article/details/104638395