
Discrepancy in Model Performance Using HuggingFace Pipeline Utility

Open · penguinwang96825 opened this issue 1 week ago · 5 comments

Hi, I'm attempting to reproduce the reported performance metrics of these models using Hugging Face's pipeline utility, but I'm getting different results. Below is the Python code I used for testing:

import torch
import datasets
import numpy as np
from tqdm.auto import tqdm
from transformers import pipeline
from transformers.pipelines.pt_utils import KeyDataset

pipe = pipeline('audio-classification', model='MIT/ast-finetuned-audioset-16-16-0.442')
dataset = datasets.load_dataset('confit/audioset', 'balanced', split='test')

# Map each AudioSet class name to its index
classes = dataset.features["label"].feature.names
label2id = {row: idx for idx, row in enumerate(classes)}

# Score every example against all 527 AudioSet classes
y_scores = []
for out in tqdm(pipe(KeyDataset(dataset, 'file'), top_k=527)):
    score = torch.zeros(len(classes))  # one slot per class
    for item in out:
        score[label2id[item['label']]] = item['score']
    y_scores.append(score)
y_scores = torch.vstack(y_scores)
print(y_scores)
print(y_scores.shape)

# Build the multi-hot ground-truth matrix (samples x classes);
# 'label' holds the list of class indices for each clip
y_true = []
for example in tqdm(dataset, total=len(dataset)):
    one_hot = np.zeros((1, len(classes)), dtype=float)
    one_hot[0, example['label']] = 1
    y_true.append(torch.from_numpy(one_hot))
y_true = torch.cat(y_true, dim=0)
print(y_true)
print(y_true.shape)

# mean_average_precision and mean_auc_roc are defined below;
# both tensors are already on CPU, so no .to('cpu') is needed
map_score = mean_average_precision(y_true.numpy(), y_scores.numpy())
print("Mean Average Precision:", map_score)

mean_auc = mean_auc_roc(y_true.numpy(), y_scores.numpy())
print("Mean AUC-ROC:", mean_auc)

The helper functions for the metric calculations are implemented as follows:

import numpy as np
from sklearn.metrics import average_precision_score, roc_auc_score

def mean_average_precision(y_true, y_scores):
    """
    Calculate the mean average precision (mAP) for multilabel classification.

    Args:
    y_true (np.array): A binary matrix (samples x labels) of ground truth labels.
    y_scores (np.array): A matrix (samples x labels) of predicted scores.

    Returns:
    float: mean average precision score
    """
    # Number of classes
    n_classes = y_true.shape[1]

    # List to store average precision for each class
    ap_scores = []

    # Calculate average precision for each class
    for i in range(n_classes):
        ap = average_precision_score(y_true[:, i], y_scores[:, i])
        ap_scores.append(ap)

    # Calculate mean of average precision scores
    mAP = np.mean(ap_scores)
    return mAP


def mean_auc_roc(y_true, y_scores):
    """
    Calculate the mean AUC-ROC for multilabel classification.

    Args:
    y_true (np.array): A binary matrix (samples x labels) of ground truth labels.
    y_scores (np.array): A matrix (samples x labels) of predicted scores.

    Returns:
    float: mean AUC-ROC score
    """
    # Number of classes
    n_classes = y_true.shape[1]

    # List to store AUC-ROC for each class
    auc_scores = []

    # Calculate AUC-ROC for each class
    for i in range(n_classes):
        # Ensure there is more than one class to avoid sklearn ValueError
        if len(np.unique(y_true[:, i])) > 1:
            auc = roc_auc_score(y_true[:, i], y_scores[:, i])
            auc_scores.append(auc)
        else:
            # AUC-ROC is undefined when y_true for this class contains only
            # one value; assign 0.5 (the random-guessing score) so the class
            # still counts toward the mean. Skipping the class instead is
            # another option.
            auc_scores.append(0.5)

    # Calculate mean of AUC-ROC scores
    mean_auc = np.mean(auc_scores)
    return mean_auc
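
Incidentally, scikit-learn can compute both macro-averaged metrics directly. A minimal equivalent sketch with toy inputs (assuming every class column contains at least one positive and one negative example; otherwise roc_auc_score raises a ValueError rather than falling back to 0.5 like the helper above):

import numpy as np
from sklearn.metrics import average_precision_score, roc_auc_score

# Macro averaging computes the per-class score and then takes the mean,
# which matches the per-class loops in the helpers above.
y_true = np.array([[1, 0, 1], [0, 1, 0], [1, 1, 0]])
y_scores = np.array([[0.9, 0.2, 0.6], [0.1, 0.8, 0.3], [0.7, 0.6, 0.2]])

print(average_precision_score(y_true, y_scores, average='macro'))
print(roc_auc_score(y_true, y_scores, average='macro'))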

The performance metrics I obtained were:

Checkpoint                                  mAP     AUC-ROC
MIT/ast-finetuned-audioset-16-16-0.442      0.4040  0.9671
MIT/ast-finetuned-audioset-10-10-0.4593     0.4256  0.9737

These results fall short of the reported performance (the trailing numbers in the checkpoint names appear to be the published mAP scores, 0.442 and 0.4593). Could you help me identify any potential issues with my approach, or provide guidance on reproducing the expected numbers?

penguinwang96825 · Jun 25 '24 18:06