rust-bert icon indicating copy to clipboard operation
rust-bert copied to clipboard

Help finding the cause of different inference results from the original Python script

Open mazksr opened this issue 10 months ago • 4 comments

I'm trying to rewrite an inference script for a fine-tuned BERT model I made in Python into Rust. Here's how I saved my model:

indobert-finetuned
├── config.json
├── model.safetensors
├── special_tokens_map.json
├── tokenizer.json
├── tokenizer_config.json
├── training_args.bin
└── vocab.txt

The python inference script:

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model_path = "indobert-finetuned/"

tokenizer = AutoTokenizer.from_pretrained(model_path) 
model = AutoModelForSequenceClassification.from_pretrained(model_path) 

#Tokenize input
inputs = tokenizer("anjing", return_tensors='pt', padding=True, truncation=False).to(device)

#Move inputs to the same device as the model
inputs = {key: value.to(device) for key, value in inputs.items()}
inputs = inputs

#Perform inference
with torch.inference_mode():
    outputs = model(**inputs)

logits = outputs.logits
print(logits, model)

predicted_label = torch.softmax(logits, dim=-1).squeeze().cpu()

print(predicted_label.tolist())

Prints the following:

[
    0.0022081946954131126,
    0.9808889031410217,
    0.0035459366627037525,
    0.0007706195465289056,
    0.00043291927431710064,
    0.00036624292260967195,
    0.0012361736735329032,
    0.0010357820428907871,
    0.004077346995472908,
    0.004229736048728228,
    0.000952212605625391,
    0.00025596615159884095
  ]

Here's how I load the model for rust-bert:

let model_res = LocalResource{
  local_path: "indobert-finetuned/model.safetensors".into()
};

let sequence_classification_config = ZeroShotClassificationConfig::new(
  Bert, // model_type
  Torch(Box::new(model_res)), // model_resource
  LocalResource {
      local_path: "indobert-finetuned/config.json".into()
  }, // config_resource
  LocalResource {
      local_path: "indobert-finetuned/vocab.txt".into()
  }, // vocab_resource
  None, // merges_resource
  true, // lowercase
  None, // strip_accent
  None // add_prefix_space
);

// loading tokenizer.json & special_tokens_map.json
let tokenizer = TokenizerOption::from_hf_tokenizer_file(
  "indobert-finetuned/tokenizer.json",
  "indobert-finetuned/special_tokens_map.json"
).unwrap();

// zero shot classification pipeline
let sequence_classification_model = ZeroShotClassificationModel::new_with_tokenizer(
  sequence_classification_config,
  tokenizer
);

let input = [
  "anjing",
];

let labels = [...] // string array with 12 elements

// Run model inference
let output = sequence_classification_model.unwrap().predict_multilabel(
  &input,
  &labels,
  None,
  256
);

if let Ok(out) = output {
  let scores: Vec<_> = out[0].iter()
      .filter_map(|label| Some(label.score))
      .collect();

  println!("{:?}", scores);
}

Outputs:

[
0.6780490875244141, 
0.7056891918182373, 
0.6685047745704651, 
0.563891589641571, 
0.6616213917732239, 
0.6930138468742371, 
0.6496127247810364, 
0.6565861701965332, 
0.6566416621208191, 
0.7492241859436035, 
0.68653404712677, 
0.7110175490379333
]

Any help would be appreciated :).

mazksr avatar Dec 20 '24 14:12 mazksr