
ALBERT classification error (Failed copying input tensor from GPU in order to run Identity: GPU sync failed [Op:Identity])

Open DrinkingMilktea opened this issue 3 years ago • 0 comments

# tokenization define
from bert.tokenization.albert_tokenization import FullTokenizer

def createTokenizer():
    return FullTokenizer("../albert_base/assets/30k-clean.vocab",
                         spm_model_file="../albert_base/assets/30k-clean.model",
                         do_lower_case=True)
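As a sanity check on the tokenizer side, something like the sketch below is what I would expect (the sample sentence and the printed tokens are only an illustration):

```python
# Sketch: quick check that the SentencePiece tokenizer loads and produces tokens/ids.
# The sample sentence and the exact token strings are only an illustration.
tok = createTokenizer()
tokens = tok.tokenize("this is a test")
print(tokens)                              # e.g. ['▁this', '▁is', '▁a', '▁test']
print(tok.convert_tokens_to_ids(tokens))   # corresponding vocabulary ids
```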

def get_masks(tokens, max_seq_length):
    """Mask for padding"""
    if len(tokens) > max_seq_length:
        # Cutting down the excess length
        tokens = tokens[0:max_seq_length]
        return [1] * len(tokens)
    else:
        return [1] * len(tokens) + [0] * (max_seq_length - len(tokens))

def get_segments(tokens, max_seq_length):
    if len(tokens) > max_seq_length:
        # Cutting down the excess length
        tokens = tokens[:max_seq_length]
        segments = []
        current_segment_id = 0
        for token in tokens:
            segments.append(current_segment_id)
            if token == "[SEP]":
                current_segment_id = 1
        return segments
    else:
        segments = []
        current_segment_id = 0
        for token in tokens:
            segments.append(current_segment_id)
            if token == "[SEP]":
                current_segment_id = 1
        return segments + [0] * (max_seq_length - len(tokens))

def get_ids(tokens, tokenizer, max_seq_length):
    if len(tokens) > max_seq_length:
        tokens = tokens[:max_seq_length]
        token_ids = tokenizer.convert_tokens_to_ids(tokens)
        return token_ids
    else:
        token_ids = tokenizer.convert_tokens_to_ids(tokens)
        input_ids = token_ids + [0] * (max_seq_length - len(token_ids))
        return input_ids

tokenizer = createTokenizer()
max_seq_length = 64  # This number will determine the number of tokens

def prep(s, get='id'):
    stokens = tokenizer.tokenize(s)
    stokens = ["[CLS]"] + stokens + ["[SEP]"]
    if get == 'id':
        input_ids = get_ids(stokens, tokenizer, max_seq_length)
        return input_ids
    elif get == 'mask':
        input_masks = get_masks(stokens, max_seq_length)
        return input_masks
    else:
        input_segments = get_segments(stokens, max_seq_length)
        return input_segments
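For reference, this is roughly how I use prep() for a single sentence (a sketch; the sample text is arbitrary):

```python
# Sketch: prep() should give three parallel 64-long lists for one sentence.
sample = "i am so happy today"
ids = prep(sample, get='id')
masks = prep(sample, get='mask')
segments = prep(sample, get='segments')
print(len(ids), len(masks), len(segments))   # 64 64 64
print(ids[:8], masks[:8], segments[:8])      # token ids, 1/0 padding mask, all-zero segments
```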

# train and test data load
import pandas as pd

train_set = pd.read_csv("../goemotion/train_set.csv")
test_set = pd.read_csv("../goemotion/test_set.csv")
train_X = [prep(sentence) for sentence in train_set["text"]]
train_Y = list(map(int, train_set["emotion"].tolist()))
test_X = [prep(sentence) for sentence in test_set["text"]]
test_Y = list(map(int, test_set["emotion"].tolist()))
print("data preprocess finished")
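One thing I am not sure about is that fit() later receives plain Python lists. A minimal sketch of converting everything to numpy arrays first, in case that is part of the problem (just an assumption on my side):

```python
# Sketch (assumption): hand Keras int32 numpy arrays instead of nested Python lists.
import numpy as np

train_X = np.array(train_X, dtype="int32")
train_Y = np.array(train_Y, dtype="int32")
test_X = np.array(test_X, dtype="int32")
test_Y = np.array(test_Y, dtype="int32")
print(train_X.shape, train_Y.shape, test_X.shape, test_Y.shape)
```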

# albert model calling
import os
import bert
import tensorflow as tf

# GPU config
tf.config.experimental.set_memory_growth(tf.config.experimental.list_physical_devices("GPU")[0], True)
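For completeness, a slightly more defensive version of this GPU setup would look like the following sketch (it only uses standard tf.config calls):

```python
# Sketch: list the visible GPUs and enable memory growth on each one,
# instead of assuming that index [0] exists.
gpus = tf.config.experimental.list_physical_devices("GPU")
print("GPUs visible to TensorFlow:", gpus)
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)
```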

# parameters
model_name = "albert_base_v2"
model_ckpt = os.path.join("../albert_base", "model.ckpt-best")
model_params = bert.albert_params("../albert_base/")

# call and define model layers
albert_layer = bert.BertModelLayer.from_params(model_params, name="albert")
model_layer = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(max_seq_length,), dtype="int32", name="input_ids"),
    albert_layer,
    tf.keras.layers.Dense(112, activation=tf.nn.relu),
    tf.keras.layers.Dense(27, activation=tf.nn.softmax),  # 0~27
    tf.keras.layers.Dense(1, activation=tf.nn.softmax)
])
model_layer.build(input_shape=(None, max_seq_length))
bert.load_albert_weights(albert_layer, model_ckpt)

# compile
model_layer.compile(loss="sparse_categorical_crossentropy",
                    optimizer=tf.optimizers.Adam(lr=0.00001),
                    metrics=["sparse_categorical_accuracy"])
print(model_layer.summary())

# train start
checkpointName = os.path.join("../albert_base/models/", "albert_faq.ckpt")
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpointName,
                                                 save_weights_only=True,
                                                 verbose=1)

# train_start
history = model_layer.fit(
    test_X, test_Y,
    epochs=300,
    validation_data=(train_X, train_Y),
    verbose=1,
    callbacks=[cp_callback],
    batch_size=2)

Above is my code, and this is nvidia-smi:

+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.23.04    Driver Version: 455.23.04    CUDA Version: 11.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|===============================+======================+======================|
|   0  GeForce RTX 3090    On   | 00000000:09:00.0  On |                  N/A |
| 33%   53C    P2   111W / 350W |   1016MiB / 24265MiB |      1%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+

I use tensorflow-gpu 2.2 with CUDA toolkit 10.1 and cuDNN 7.6. My machine has a 3900X CPU, 128 GB of RAM, an RTX 3090, and a 500 GB SSD.
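In case it is useful, here is a minimal sketch for checking what this TensorFlow build reports about CUDA and the GPU (the expected outputs in the comments are my assumptions):

```python
# Sketch: confirm the installed TF version, that it was built with CUDA,
# and whether it can actually see the RTX 3090.
import tensorflow as tf

print(tf.__version__)                                        # expecting 2.2.x
print(tf.test.is_built_with_cuda())                          # True for tensorflow-gpu builds
print(tf.config.experimental.list_physical_devices("GPU"))   # should list the 3090
print(tf.test.gpu_device_name())                             # e.g. '/device:GPU:0' if usable
```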

When I run the code above, the error message is the following:

File "/home/sentiment/anaconda3/envs/mybert/lib/python3.7/site-packages/tensorflow_core/python/framework/ops.py", line 6606, in raise_from_not_ok_status six.raise_from(core._status_to_exception(e.code, message), None) File "", line 3, in raise_from tensorflow.python.framework.errors_impl.InternalError: Failed copying input tensor from /job:localhost/replica:0/task:0/device:GPU:0 to /job:localhost/replica:0/task:0/device:CPU:0 in order to run Identity: GPU sync failed [Op:Identity]

I want to fine-tune ALBERT. With the CPU build of TensorFlow the code works fine, but training takes about 6 hours per epoch, so I would like to use the GPU.
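If it helps with narrowing the problem down, here is a tiny GPU-only computation I could try outside of model.fit() (a sketch; I assume the same sync failure would show up here if it is not specific to the model):

```python
# Sketch: run a small matmul pinned to the GPU, independent of the ALBERT model,
# to see whether the "GPU sync failed" error also happens for a trivial op.
import tensorflow as tf

with tf.device("/GPU:0"):
    a = tf.random.uniform((1024, 1024))
    b = tf.random.uniform((1024, 1024))
    c = tf.matmul(a, b)
print(float(tf.reduce_mean(c)))
```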

I have tried hard to find a solution, but without success.

Does anyone know how to fix this error?

DrinkingMilktea · Nov 18 '20 12:11