Discrepancy in Model Performance Using HuggingFace Pipeline Utility
Hi, I'm attempting to reproduce the reported performance metrics of the AST models using Hugging Face's pipeline utility, but I'm getting different results. Below is the Python code I used for testing:
import warnings

import torch
import datasets
import numpy as np
from tqdm.auto import tqdm
from dotenv import load_dotenv
from transformers import pipeline
from transformers.pipelines.pt_utils import KeyDataset

pipe = pipeline('audio-classification', model='MIT/ast-finetuned-audioset-16-16-0.442')

dataset = datasets.load_dataset('confit/audioset', 'balanced', split='test')
classes = dataset.features["label"].feature.names
id2label = {idx: row for idx, row in enumerate(classes)}
label2id = {row: idx for idx, row in enumerate(classes)}

# Collect the predicted score for every class and every example
y_scores = []
for out in tqdm(pipe(KeyDataset(dataset, 'file'), top_k=527)):
    score = torch.zeros(len(classes))  # placeholder
    for item in out:
        label = item['label']
        score[label2id.get(label)] = item['score']
    y_scores.append(score)
y_scores = torch.vstack(y_scores)
print(y_scores)
print(y_scores.shape)

# Build the multi-hot ground-truth matrix
y_true = []
for example in tqdm(dataset, total=len(dataset)):
    num_instances = 1
    num_classes = len(classes)
    one_hot_tensor = np.zeros((num_instances, num_classes), dtype=float)
    one_hot_tensor[0, example['label']] = 1
    y_true.append(torch.from_numpy(one_hot_tensor))
y_true = torch.cat(y_true, dim=0)
print(y_true)
print(y_true.shape)

# mean_average_precision and mean_auc_roc are the helper functions shown below
map_score = mean_average_precision(y_true.to('cpu').numpy(), y_scores.to('cpu').numpy())
print("Mean Average Precision:", map_score)

mean_auc = mean_auc_roc(y_true.to('cpu').numpy(), y_scores.to('cpu').numpy())
print("Mean AUC-ROC:", mean_auc)
The helper functions used for the metric calculations are implemented as follows:
import numpy as np
from sklearn.metrics import average_precision_score, roc_auc_score


def mean_average_precision(y_true, y_scores):
    """
    Calculate the mean average precision (mAP) for multilabel classification.

    Args:
        y_true (np.array): A binary matrix (samples x labels) of ground truth labels.
        y_scores (np.array): A matrix (samples x labels) of predicted scores.

    Returns:
        float: mean average precision score
    """
    # Number of classes
    n_classes = y_true.shape[1]

    # List to store average precision for each class
    ap_scores = []

    # Calculate average precision for each class
    for i in range(n_classes):
        ap = average_precision_score(y_true[:, i], y_scores[:, i])
        ap_scores.append(ap)

    # Calculate mean of average precision scores
    mAP = np.mean(ap_scores)
    return mAP


def mean_auc_roc(y_true, y_scores):
    """
    Calculate the mean AUC-ROC for multilabel classification.

    Args:
        y_true (np.array): A binary matrix (samples x labels) of ground truth labels.
        y_scores (np.array): A matrix (samples x labels) of predicted scores.

    Returns:
        float: mean AUC-ROC score
    """
    # Number of classes
    n_classes = y_true.shape[1]

    # List to store AUC-ROC for each class
    auc_scores = []

    # Calculate AUC-ROC for each class
    for i in range(n_classes):
        # Guard against the sklearn ValueError raised when only one label value is present
        if len(np.unique(y_true[:, i])) > 1:
            auc = roc_auc_score(y_true[:, i], y_scores[:, i])
            auc_scores.append(auc)
        else:
            # If y_true contains only one value for this class, assign an AUC of 0.5
            # (the random-guessing score) rather than excluding the class from the average
            auc_scores.append(0.5)

    # Calculate mean of AUC-ROC scores
    mean_auc = np.mean(auc_scores)
    return mean_auc
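For what it's worth, the helpers behave as I'd expect on a tiny hand-made example (the numbers below are made up purely to exercise the functions, not real model output):

# Synthetic check of the metric helpers (made-up numbers, not real predictions)
y_true_toy = np.array([[1, 0, 0],
                       [0, 1, 0],
                       [1, 0, 1]], dtype=float)
y_scores_toy = np.array([[0.9, 0.2, 0.1],
                         [0.3, 0.8, 0.2],
                         [0.7, 0.1, 0.6]], dtype=float)
print(mean_average_precision(y_true_toy, y_scores_toy))  # positives outrank negatives in every column, so 1.0
print(mean_auc_roc(y_true_toy, y_scores_toy))            # likewise 1.0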
The recorded performance metrics were:
Checkpoint | mAP | AUC-ROC |
---|---|---|
MIT/ast-finetuned-audioset-16-16-0.442 | 0.4040 | 0.9671 |
MIT/ast-finetuned-audioset-10-10-0.4593 | 0.4256 | 0.9737 |
These results fall short of the expected performance (the mAP values in the checkpoint names, 0.442 and 0.4593). Could you help me identify any potential issues with my approach, or suggest how to reach the expected performance levels?