Error training model on Colab TPU with data generator
Subject of the issue
Training my model with a TPU on Colab throws the following error:
```
6 root error(s) found.
(0) INTERNAL: {{function_node __inference_train_function_57310}} failed to connect to all addresses
Additional GRPC error information from remote target /job:localhost/replica:0/task:0/device:CPU:0:
:{"created":"@1642687810.299638456","description":"Failed to pick subchannel","file":"third_party/grpc/src/core/ext/filters/client_channel/client_channel.cc","file_line":3151,"referenced_errors":[{"created":"@1642687810.299637559","description":"failed to connect to all addresses","file":"third_party/grpc/src/core/lib/transport/error_utils.cc","file_line":161,"grpc_status":14}]}
[[{{node MultiDeviceIteratorGetNextFromShard}}]]
Executing non-communication op <MultiDeviceIteratorGetNextFromShard> originally returned UnavailableError, and was replaced by InternalError to avoid invoking TF network error handling logic.
[[RemoteCall]]
[[IteratorGetNextAsOptional]]
[[strided_slice_49/_322]] ...
```
I use a custom data generator, which is probably the cause of this error, because when I store a batch of data in a variable instead, the training loop runs fine.
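For context, the `strategy` used below is the standard Colab TPU setup, roughly like this (the exact initialization cell isn't shown here, so this is an approximation):

```python
import tensorflow as tf

# Standard Colab TPU initialization; assumed, since the original cell isn't shown.
resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='')
tf.config.experimental_connect_to_cluster(resolver)
tf.tpu.experimental.initialize_tpu_system(resolver)
strategy = tf.distribute.TPUStrategy(resolver)
```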
Data generator and model:
```python
import os
import numpy as np
import cv2
from PIL import Image
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.layers import Dense, Dropout, Flatten, GlobalAveragePooling2D

# save_dir, img_shape, strategy, batch_size, x and y are defined elsewhere in the notebook (not shown here).

class CustomDataGen(tf.keras.utils.Sequence):
    def __init__(self, path, batch_size, input_size):
        self.batch_size = batch_size
        self.path = path
        self.input_size = input_size
        self.img_size = 0
        self.files_acc = np.sort(os.listdir(save_dir))
        self.labels = {}
        self.get_labels_from_files()

    def get_labels_from_files(self):
        # Parse one comma-separated line of floats per label file.
        for i in self.files_acc:
            if not i == '.ipynb_checkpoints':
                with open(save_dir + i, 'r') as f:
                    content = f.readlines()
                content = content[0]
                content = [float(i) for i in content[1:-1].split(',')]
                self.labels[i] = content

    def __get_input(self, f_name, target_size):
        label_name = f_name
        f_name = f_name.split('.')[0] + '.jpg'
        f_path = self.path + f_name
        image_arr = np.array(Image.open(f_path).convert('RGB')) / 255.0
        self.img_size = image_arr.shape[0:2]
        image_arr = cv2.resize(image_arr, (target_size[0], target_size[1]))
        # Crop to the bounding box stored in the first four label values.
        bbox = self.labels[label_name][:4]
        bbox = np.array(bbox, dtype='int')
        sliced_image_arr = image_arr[bbox[1]:bbox[3], bbox[0]:bbox[2]]
        sliced_image_arr = cv2.resize(sliced_image_arr, target_size)
        return sliced_image_arr

    def __get_labels(self, f_name):
        f_name = f_name.split('.')[0] + '.txt'
        label = self.labels[f_name][4:]
        return label

    def __get_data(self, batches):
        X_batch = np.array([self.__get_input(x, self.input_size) for x in batches])
        y_batch = np.array([self.__get_labels(y) for y in batches])
        return X_batch.astype('float32'), y_batch.astype('float32')

    def __getitem__(self, index):
        files = self.files_acc[index * self.batch_size:(index + 1) * self.batch_size]
        if index == len(self.files_acc) - self.batch_size - 1:
            np.random.shuffle(self.files_acc)
        return self.__get_data(files)

    def __len__(self):
        return len(self.files_acc) // self.batch_size


def initialize_second_model():
    model2 = Sequential()
    resnet = ResNet50(False, input_shape=(img_shape[0], img_shape[1], 3))
    model2.add(resnet)
    model2.add(GlobalAveragePooling2D())
    model2.add(Flatten())
    model2.add(Dropout(0.2))
    model2.add(Dense(4))
    return model2


with strategy.scope():
    model2 = initialize_second_model()
    model2.compile(tf.keras.optimizers.Adam(learning_rate=0.001), loss='mse')

lr_reducer = tf.keras.callbacks.ReduceLROnPlateau(
    factor=0.1, monitor='loss', mode='min', patience=1, verbose=1)
history = model2.fit(x, y, batch_size=batch_size, epochs=10,
                     callbacks=[lr_reducer], use_multiprocessing=True)
```
Training output:

```
Epoch 1/10
1/1 [==============================] - 41s 41s/step - loss: 1.8858 - lr: 0.0010
Epoch 2/10
1/1 [==============================] - ETA: 0s - loss: 2.5232
Epoch 00002: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
1/1 [==============================] - 0s 99ms/step - loss: 2.5232 - lr: 0.0010
Epoch 3/10
1/1 [==============================] - 0s 93ms/step - loss: 1.6295 - lr: 1.0000e-04
Epoch 4/10
1/1 [==============================] - 0s 93ms/step - loss: 0.8416 - lr: 1.0000e-04
Epoch 5/10
1/1 [==============================] - 0s 86ms/step - loss: 0.4136 - lr: 1.0000e-04
Epoch 6/10
1/1 [==============================] - 0s 88ms/step - loss: 0.3121 - lr: 1.0000e-04
Epoch 7/10
1/1 [==============================] - ETA: 0s - loss: 0.4494
Epoch 00007: ReduceLROnPlateau reducing learning rate to 1.0000000474974514e-05.
1/1 [==============================] - 0s 106ms/step - loss: 0.4494 - lr: 1.0000e-04
Epoch 8/10
1/1 [==============================] - ETA: 0s - loss: 0.3423
Epoch 00008: ReduceLROnPlateau reducing learning rate to 1.0000000656873453e-06.
1/1 [==============================] - 0s 105ms/step - loss: 0.3423 - lr: 1.0000e-05
Epoch 9/10
1/1 [==============================] - ETA: 0s - loss: 0.3282
Epoch 00009: ReduceLROnPlateau reducing learning rate to 1.0000001111620805e-07.
1/1 [==============================] - 0s 98ms/step - loss: 0.3282 - lr: 1.0000e-06
Epoch 10/10
1/1 [==============================] - ETA: 0s - loss: 0.3410
Epoch 00010: ReduceLROnPlateau reducing learning rate to 1.000000082740371e-08.
1/1 [==============================] - 0s 106ms/step - loss: 0.3410 - lr: 1.0000e-07
```
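For clarity, the output above is from the run where I first pull one batch out of the generator into plain arrays; roughly like this (the constructor arguments and names here are illustrative, not my exact values):

```python
# Illustrative wiring only; img_dir and (224, 224) are placeholders.
data_gen = CustomDataGen(path=img_dir, batch_size=batch_size, input_size=(224, 224))

# Feeding the generator directly is what triggers the gRPC error shown at the top:
# history = model2.fit(data_gen, epochs=10, callbacks=[lr_reducer], use_multiprocessing=True)

# Storing one batch in variables first trains without problems:
x, y = data_gen[0]
history = model2.fit(x, y, batch_size=batch_size, epochs=10, callbacks=[lr_reducer])
```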
According to many people online, it turns out I cannot feed a TensorFlow TPU with custom data this way; the data has to be located on Google Cloud Platform (e.g. in a GCS bucket). Can anyone confirm?
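If that is indeed the requirement, I assume the suggested approach is to write the data to TFRecords in a GCS bucket and feed the TPU through `tf.data` instead of a `keras.utils.Sequence`. A minimal sketch of what I think that looks like (the bucket path and feature keys are placeholders, not my actual data):

```python
import tensorflow as tf

# Placeholder path and feature keys, not my real dataset layout.
filenames = tf.io.gfile.glob('gs://my-bucket/train-*.tfrecord')

def parse_example(serialized):
    features = {
        'image': tf.io.FixedLenFeature([], tf.string),    # JPEG bytes
        'label': tf.io.FixedLenFeature([4], tf.float32),  # 4 regression targets
    }
    parsed = tf.io.parse_single_example(serialized, features)
    image = tf.image.resize(tf.io.decode_jpeg(parsed['image'], channels=3), (224, 224)) / 255.0
    return image, parsed['label']

dataset = (tf.data.TFRecordDataset(filenames)
           .map(parse_example, num_parallel_calls=tf.data.AUTOTUNE)
           .shuffle(1024)
           .batch(batch_size, drop_remainder=True)  # TPUs want a fixed batch size
           .prefetch(tf.data.AUTOTUNE))

history = model2.fit(dataset, epochs=10, callbacks=[lr_reducer])
```

If someone can confirm whether this is the expected setup, that would help a lot.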