[example/nlp] Replacing the tokenizer with a pretrained tokenizer from keras_nlp raises an error
Issue Type
Bug
Source
source
Keras Version
Keras 2.13.1
Custom Code
Yes
OS Platform and Distribution
Linux Ubuntu 20.04.5 LTS
Python version
3.8.18
GPU model and memory
CPU/378GB
Current Behavior?
After running the example GPT text generation from scratch with KerasNLP on my machine, I wanted to replace the tokenizer in the script with a pretrained tokenizer from keras_nlp to realize my own design. However, I got an error. I also checked the shapes in the train dataset using:
features = next(iter(train_ds))
features[0].shape
Both elements showed TensorShape([64, 128]) when batch_size was set to 64.
What I want to do is replace the tokenizer in the script with the pretrained tokenizer. I could not solve this problem by myself, so I am opening this issue for help.
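I think the dataset spec can also be inspected directly, instead of looking at a single batch (a small sketch using the standard tf.data element_spec attribute):
print(train_ds.element_spec)
# If the labels entry shows up as a RaggedTensorSpec rather than a TensorSpec,
# the labels are still ragged, which would match the nested_row_splits error
# in the log below.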
Standalone code to reproduce the issue or tutorial link
import os
import keras_nlp
import keras
import tensorflow.data as tf_data
import tensorflow.strings as tf_strings
# Data
BATCH_SIZE = 64
MIN_STRING_LEN = 512 # Strings shorter than this will be discarded
SEQ_LEN = 128 # Length of training sequences, in tokens
# Model
EMBED_DIM = 256
FEED_FORWARD_DIM = 128
NUM_HEADS = 3
NUM_LAYERS = 2
# VOCAB_SIZE = 5000 # Limits parameters in model.
# Training
EPOCHS = 5
# Inference
NUM_TOKENS_TO_GENERATE = 80
keras.utils.get_file(
    origin="https://dldata-public.s3.us-east-2.amazonaws.com/simplebooks.zip",
    extract=True,
)
dir = os.path.expanduser("~/.keras/datasets/simplebooks/")
# Load simplebooks-92 train set and filter out short lines.
raw_train_ds = (
    tf_data.TextLineDataset(dir + "simplebooks-92-raw/train.txt")
    .filter(lambda x: tf_strings.length(x) > MIN_STRING_LEN)
    .batch(BATCH_SIZE)
    .shuffle(buffer_size=256)
)
# Load simplebooks-92 validation set and filter out short lines.
raw_val_ds = (
    tf_data.TextLineDataset(dir + "simplebooks-92-raw/valid.txt")
    .filter(lambda x: tf_strings.length(x) > MIN_STRING_LEN)
    .batch(BATCH_SIZE)
)
tokenizer = keras_nlp.models.GPT2Tokenizer.from_preset("gpt2_base_en")
vocab = tokenizer.get_vocabulary()
VOCAB_SIZE = len(vocab)
gpt2_preprocessor = keras_nlp.models.GPT2Preprocessor(
    tokenizer,
    sequence_length=SEQ_LEN,
    add_start_token=False,
    add_end_token=False,
)
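# Note: gpt2_preprocessor is created above but not used in preprocess() below.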
start_packer = keras_nlp.layers.StartEndPacker(
    sequence_length=SEQ_LEN,
    start_value=tokenizer.token_to_id("<|endoftext|>"),
)
def preprocess(inputs):
    outputs = tokenizer(inputs)
    features = start_packer(outputs)
    labels = outputs
    return features, labels
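# (I believe tokenizer(inputs) above returns a tf.RaggedTensor, since no
# sequence_length was set on the tokenizer, so `labels` stays ragged while
# `features` is packed to a dense [batch, SEQ_LEN] tensor.)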
# Tokenize and split into train and label sequences.
train_ds = raw_train_ds.map(preprocess, num_parallel_calls=tf_data.AUTOTUNE).prefetch(
    tf_data.AUTOTUNE
)
val_ds = raw_val_ds.map(preprocess, num_parallel_calls=tf_data.AUTOTUNE).prefetch(
    tf_data.AUTOTUNE
)
inputs = keras.layers.Input(shape=(None,), dtype="int32")
# Embedding.
embedding_layer = keras_nlp.layers.TokenAndPositionEmbedding(
    vocabulary_size=VOCAB_SIZE,
    sequence_length=SEQ_LEN,
    embedding_dim=EMBED_DIM,
    mask_zero=True,
)
x = embedding_layer(inputs)
# Transformer decoders.
for _ in range(NUM_LAYERS):
    decoder_layer = keras_nlp.layers.TransformerDecoder(
        num_heads=NUM_HEADS,
        intermediate_dim=FEED_FORWARD_DIM,
    )
    x = decoder_layer(x)  # Giving one argument only skips cross-attention.
# Output.
outputs = keras.layers.Dense(VOCAB_SIZE)(x)
model = keras.Model(inputs=inputs, outputs=outputs)
loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
perplexity = keras_nlp.metrics.Perplexity(
    from_logits=True, mask_token_id=tokenizer.token_to_id("!")
)
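# (If I understand correctly, "!" maps to id 0 in the GPT-2 vocabulary, which
# is the pad id the packer uses.)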
model.compile(optimizer="adam", loss=loss_fn, metrics=[perplexity])
model.fit(train_ds, validation_data=val_ds, epochs=EPOCHS)
Relevant log output
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
/home/limingbo/projects/GPT2/text_genration_gpt_pretrain_tokenizer.ipynb cell 16 line 9
95 perplexity = keras_nlp.metrics.Perplexity(from_logits=True, mask_token_id=tokenizer.token_to_id("!"))
96 model.compile(optimizer="adam", loss=loss_fn, metrics=[perplexity])
---> 98 model.fit(train_ds, validation_data=val_ds, epochs=EPOCHS)
File ~/anaconda3/envs/limingbo_sf/lib/python3.8/site-packages/keras/src/utils/traceback_utils.py:70, in filter_traceback.<locals>.error_handler(*args, **kwargs)
67 filtered_tb = _process_traceback_frames(e.__traceback__)
68 # To get the full stack trace, call:
69 # `tf.debugging.disable_traceback_filtering()`
---> 70 raise e.with_traceback(filtered_tb) from None
71 finally:
72 del filtered_tb
File /tmp/__autograph_generated_fileguxsxbh3.py:15, in outer_factory.<locals>.inner_factory.<locals>.tf__train_function(iterator)
13 try:
14 do_return = True
---> 15 retval_ = ag__.converted_call(ag__.ld(step_function), (ag__.ld(self), ag__.ld(iterator)), None, fscope)
16 except:
17 do_return = False
AttributeError: in user code:
File "/home/limingbo/anaconda3/envs/limingbo_sf/lib/python3.8/site-packages/keras/src/engine/training.py", line 1338, in train_function *
return step_function(self, iterator)
File "/home/limingbo/anaconda3/envs/limingbo_sf/lib/python3.8/site-packages/keras/src/engine/training.py", line 1322, in step_function **
outputs = model.distribute_strategy.run(run_step, args=(data,))
File "/home/limingbo/anaconda3/envs/limingbo_sf/lib/python3.8/site-packages/keras/src/engine/training.py", line 1303, in run_step **
outputs = model.train_step(data)
File "/home/limingbo/anaconda3/envs/limingbo_sf/lib/python3.8/site-packages/keras/src/engine/training.py", line 1081, in train_step
loss = self.compute_loss(x, y, y_pred, sample_weight)
File "/home/limingbo/anaconda3/envs/limingbo_sf/lib/python3.8/site-packages/keras/src/engine/training.py", line 1139, in compute_loss
return self.compiled_loss(
File "/home/limingbo/anaconda3/envs/limingbo_sf/lib/python3.8/site-packages/keras/src/engine/compile_utils.py", line 265, in __call__
loss_value = loss_obj(y_t, y_p, sample_weight=sw)
File "/home/limingbo/anaconda3/envs/limingbo_sf/lib/python3.8/site-packages/keras/src/losses.py", line 142, in __call__
losses = call_fn(y_true, y_pred)
File "/home/limingbo/anaconda3/envs/limingbo_sf/lib/python3.8/site-packages/keras/src/losses.py", line 268, in call **
return ag_fn(y_true, y_pred, **self._fn_kwargs)
File "/home/limingbo/anaconda3/envs/limingbo_sf/lib/python3.8/site-packages/keras/src/losses.py", line 2385, in _ragged_tensor_sparse_categorical_crossentropy
return _ragged_tensor_apply_loss(fn, y_true, y_pred, y_pred_extra_dim=True)
File "/home/limingbo/anaconda3/envs/limingbo_sf/lib/python3.8/site-packages/keras/src/losses.py", line 1686, in _ragged_tensor_apply_loss
nested_splits_list = [rt.nested_row_splits for rt in (y_true, y_pred)]
File "/home/limingbo/anaconda3/envs/limingbo_sf/lib/python3.8/site-packages/keras/src/losses.py", line 1686, in <listcomp>
nested_splits_list = [rt.nested_row_splits for rt in (y_true, y_pred)]
AttributeError: 'Tensor' object has no attribute 'nested_row_splits'
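For reference, a possible workaround I am considering (an untested sketch; it assumes the failure comes from the labels being a tf.RaggedTensor while the model output is a dense tensor): densify the labels in preprocess so features and labels have the same static shape, using the standard tf.RaggedTensor.to_tensor API:
def preprocess(inputs):
    outputs = tokenizer(inputs)  # tf.RaggedTensor of shape [batch, None]
    features = start_packer(outputs)  # dense [batch, SEQ_LEN]
    # Pad/truncate the ragged labels to a dense [batch, SEQ_LEN] tensor.
    labels = outputs.to_tensor(shape=[None, SEQ_LEN])
    return features, labels
I have not verified that this makes the training run succeed, so any guidance is appreciated.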