Apply landmark to MoViNet

Open ghost opened this issue 8 months ago • 0 comments

def build_classifier(batch_size, num_frames, backbone, resolution, num_classes): # --- Video input --- video_input = layers.Input(shape=(num_frames, resolution, resolution, 3), batch_size=batch_size, name='video_input')

# Feature extraction from MoViNet backbone
def extract_video_features(x):
    endpoints = backbone(x, training=False)
    x = endpoints['head'] 
    x = tf.squeeze(x, axis=[2, 3]) 
    x = tf.keras.layers.GlobalAveragePooling1D()(x)
    return x

video_features = layers.Lambda(
    extract_video_features,
    output_shape=(480,),
    name="video_features"
)(video_input)

# --- Landmark input ---
landmark_input = layers.Input(shape=(num_frames, 234), batch_size=batch_size, name='landmark_input')
landmark_features = layers.Bidirectional(layers.LSTM(128, return_sequences=False))(landmark_input)
landmark_features = layers.Dense(128, activation='relu')(landmark_features)

# --- Fusion ---
merged = layers.Concatenate()([video_features, landmark_features])  # shape: (B, 608)
x = layers.Dense(256, activation='relu')(merged)
x = layers.Dropout(0.3)(x)
outputs = layers.Dense(num_classes, activation='softmax')(x)

model = tf.keras.Model(inputs=[video_input, landmark_input], outputs=outputs)
return model

I build this model but when I run model.fit() it shows in extract_video_features(x) 6 def extract_video_features(x): 7 endpoints = backbone(x, training=False) ----> 8 x = endpoints['head'] # shape: (B, T, 1, 1, C) 9 x = tf.squeeze(x, axis=[2, 3]) # shape: (B, T, C) 10 x = tf.keras.layers.GlobalAveragePooling1D()(x) # shape: (B, C)

TypeError: Exception encountered when calling Lambda.call().

tuple indices must be integers or slices, not str

Arguments received by Lambda.call(): • inputs=tf.Tensor(shape=(None, 50, 224, 224, 3), dtype=float32) • mask=None • training=True.

How can I fix this or any other ways to apply landmarks?

May 11 '25 14:05 ghost