bert-for-tf2
bert-for-tf2 copied to clipboard
may be there is some problem work with tf hub
hi, I am using this script to generate albert saved model which is capativble with tf serving
since i genrated model , the input is { "instances":[ {"inputs": ["你好么"]} ] } output result seem not right, actually i want the albert out embedding vector.
{ "predictions": [ [ 101, 872, 1962, 720, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] ] }
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import bert, os
from tensorflow.python.framework.ops import disable_eager_execution
disable_eager_execution()
model_name = "albert_base"
model_dir = bert.fetch_brightmart_albert_model(model_name, ".models")
model_ckpt = os.path.join(model_dir, "albert_model.ckpt")
bert_params = bert.params_from_pretrained_ckpt(model_dir)
l_bert = bert.BertModelLayer.from_params(bert_params, name="bert")
class BertTokenizerLayer(tf.keras.layers.Layer):
def __init__(self, vocab_file_path, sequence_length=128, lower_case=True):
super(BertTokenizerLayer, self).__init__()
self.CLS_ID = tf.constant(101, dtype=tf.int64)
self.SEP_ID = tf.constant(102, dtype=tf.int64)
self.PAD_ID = tf.constant(0, dtype=tf.int64)
self.sequence_length = tf.constant(sequence_length)
vocab = self.load_vocab(vocab_file_path)
# These two lines are basically what makes it work
# assigning the vocab to a tf.Module and then later assigning the
# intantiated Module to e.g. a Keras Model
self.create_vocab_table(vocab)
self.bert_tokenizer = text.BertTokenizer(
vocab_lookup_table=self.vocab_table,
token_out_type=tf.int64,
lower_case=lower_case,
)
def load_vocab(self, vocab_file):
"""Loads a vocabulary file into a list."""
vocab = []
with tf.io.gfile.GFile(vocab_file, "r") as reader:
while True:
token = reader.readline()
if not token:
break
token = token.strip()
vocab.append(token)
return vocab
def create_vocab_table(self, vocab, num_oov=1):
vocab_values = tf.range(tf.size(vocab, out_type=tf.int64), dtype=tf.int64)
self.init = tf.lookup.KeyValueTensorInitializer(
keys=vocab, values=vocab_values, key_dtype=tf.string, value_dtype=tf.int64
)
self.vocab_table = tf.lookup.StaticVocabularyTable(
self.init, num_oov, lookup_key_dtype=tf.string
)
@tf.function
def call(self, inputs: tf.Tensor) -> tf.Tensor:
"""
Perform the BERT preprocessing from text -> input token id
"""
# Convert text into token ids
tokens = self.bert_tokenizer.tokenize(inputs)
# Flatten the ragged tensors
tokens = tokens.merge_dims(1, 2)
# Add start and end token ids to the id sequence
start_tokens = tf.fill([tf.shape(inputs)[0], 1], self.CLS_ID)
end_tokens = tf.fill([tf.shape(inputs)[0], 1], self.SEP_ID)
tokens = tf.concat([start_tokens, tokens, end_tokens], axis=1)
# Truncate to sequence length
tokens = tokens[:, : self.sequence_length]
# Convert ragged tensor to tensor and pad with PAD_ID
tokens = tokens.to_tensor(default_value=self.PAD_ID)
# Pad to sequence length
pad = self.sequence_length - tf.shape(tokens)[1]
tokens = tf.pad(tokens, [[0, 0], [0, pad]], constant_values=self.PAD_ID)
return tf.reshape(tokens, [-1, self.sequence_length])
# text_input = tf.keras.layers.Input(shape=(1,), dtype=tf.string)
# tokenizerd = BertTokenizerLayer(vocab_file_path=os.path.join(model_dir, "vocab.txt"))
# input_tokens = tokenizerd(text_input)
# embed_output = l_bert(input_tokens) # output: [batch_size, max_seq_len, hidden_size]
# model = tf.keras.Model(inputs=text_input, outputs=embed_output)
# model.save("./models/albert-zh/1", signatures=tokenizerd.call.get_concrete_function(tf.TensorSpec([], tf.string)))
model = tf.keras.Sequential([
tf.keras.Input(shape=(1,), dtype=tf.string),
l_bert
])
model.tokenizer = BertTokenizerLayer(vocab_file_path=os.path.join(model_dir, "vocab.txt"))
model.save("./models/albert-zh/1", signatures=model.tokenizer.call.get_concrete_function(tf.TensorSpec(None, tf.string)))