[Bug]: After exporting UIE to ONNX, the model runs inference normally, but all evaluation metrics are 0
Software environment
- paddlepaddle: 3.0.0
- paddlepaddle-gpu: none
- paddlenlp: 2.6.1
Duplicate check
- [x] I have searched the existing issues
Bug description
After exporting UIE with paddle.onnx.export, the exported model runs inference normally, but all evaluation metrics are 0. Evaluation before the export is normal.
Steps to reproduce & code
Export code
from paddlenlp.transformers import UIE
import paddle
model = UIE.from_pretrained("./checkpoint/model_best/checkpoint-200")
# export to ONNX
save_path = 'uie'  # path to save the exported model
input_ids = paddle.static.InputSpec([None, 512], 'int64', 'input_ids')
attention_mask = paddle.static.InputSpec([None, 512], 'int64', 'attention_mask')
token_type_ids = paddle.static.InputSpec([None, 512], 'int64', 'token_type_ids')
paddle.onnx.export(model, save_path, input_spec=[input_ids, attention_mask, token_type_ids], opset_version=11)
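(For anyone reproducing this: a minimal sketch to double-check what the exported graph actually expects, assuming the export above wrote ./uie.onnx to the working directory. It only lists the input and output tensors ONNX Runtime sees, which helps verify that their names and order match the input_spec passed to paddle.onnx.export.)
import onnxruntime as ort

sess = ort.InferenceSession("uie.onnx")  # assumed output file of the export above
for inp in sess.get_inputs():
    print("input:", inp.name, inp.shape, inp.type)
for out in sess.get_outputs():
    print("output:", out.name, out.shape, out.type)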
Evaluation code
import argparse
from functools import partial
import paddle
import numpy as np
import onnxruntime as ort
from utils import convert_example, create_data_loader, reader
from paddlenlp.data import DataCollatorWithPadding
from paddlenlp.datasets import MapDataset, load_dataset
from paddlenlp.metrics import SpanEvaluator
from paddlenlp.transformers import AutoTokenizer
from paddlenlp.utils.log import logger
from tqdm import tqdm
def evaluate_onnx(session, metric, data_loader):
    """
    Evaluate the dataset with an ONNX Runtime inference session and compute the metric.
    Args:
        session: an onnxruntime.InferenceSession object.
        metric: the evaluator used to compute the metric, e.g. SpanEvaluator.
        data_loader: the evaluation data loader.
    """
    metric.reset()
    for batch in tqdm(data_loader):
        # Build the input dict, converting each paddle.Tensor to a numpy array
        inputs = {}
        inputs["input_ids"] = batch["input_ids"].numpy()
        inputs["token_type_ids"] = batch["token_type_ids"].numpy()
        inputs["attention_mask"] = batch["attention_mask"].numpy()
        # Run inference with ONNX Runtime; outputs keep the order used at export time
        outputs = session.run(None, inputs)
        print(len(outputs))
        start_prob, end_prob = outputs[0], outputs[1]
        # Convert the ground truth to numpy arrays and cast the dtype
        start_ids = batch["start_positions"].numpy().astype("float32")
        end_ids = batch["end_positions"].numpy().astype("float32")
        # Compute the metric, assuming SpanEvaluator.compute accepts numpy arrays
        num_correct, num_infer, num_label = metric.compute(
            start_prob, end_prob, start_ids, end_ids
        )
        metric.update(num_correct, num_infer, num_label)
    precision, recall, f1 = metric.accumulate()
    return precision, recall, f1
def do_eval():
    paddle.set_device(args.device)
    tokenizer = AutoTokenizer.from_pretrained(args.model_path)
    session = ort.InferenceSession(args.onnx_model_path)
    # Load the evaluation dataset
    test_ds = load_dataset(
        reader, data_path=args.test_path, max_seq_len=args.max_seq_len, lazy=False
    )
    class_dict = {}
    class_dict["all_classes"] = test_ds
    # Build the data preprocessing function
    trans_fn = partial(
        convert_example,
        tokenizer=tokenizer,
        max_seq_len=args.max_seq_len,
    )
    test_ds = test_ds.map(trans_fn)
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer, max_length=args.max_seq_len)
    test_data_loader = create_data_loader(
        test_ds, mode="test", batch_size=args.batch_size, trans_fn=data_collator
    )
    metric = SpanEvaluator()
    precision, recall, f1 = evaluate_onnx(session, metric, test_data_loader)
    logger.info("-----------------------------")
    logger.info(
        "Evaluation Precision: %.5f | Recall: %.5f | F1: %.5f"
        % (precision, recall, f1)
    )
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_path", type=str, default=None, help="Local path to load the tokenizer from.")
    parser.add_argument("--onnx_model_path", type=str, default=None, help="Local path of the ONNX model file.")
    parser.add_argument("--test_path", type=str, default=None, help="The path of the test set.")
    parser.add_argument("--batch_size", type=int, default=16, help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument("--device", type=str, default="gpu", choices=["gpu", "cpu", "npu"], help="Device selected for evaluation.")
    parser.add_argument("--max_seq_len", type=int, default=512, help="The maximum total input sequence length after tokenization.")
    parser.add_argument("--schema_lang", choices=["ch", "en"], default="ch", help="Select the language type for schema.")
    args = parser.parse_args()
    do_eval()
Evaluation results after conversion:
[2025-04-11 15:35:53,838] [ INFO] - -----------------------------
[2025-04-11 15:35:53,838] [ INFO] - Evaluation Precision: 0.00000 | Recall: 0.00000 | F1: 0.00000
Evaluation code before the ONNX conversion:
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from functools import partial
import paddle
from utils import convert_example, create_data_loader, reader
from paddlenlp.data import DataCollatorWithPadding
from paddlenlp.datasets import MapDataset, load_dataset
from paddlenlp.metrics import SpanEvaluator
from paddlenlp.transformers import UIE, UIEM, AutoTokenizer
from paddlenlp.utils.ie_utils import get_relation_type_dict, unify_prompt_name
from paddlenlp.utils.log import logger
from tqdm import tqdm
@paddle.no_grad()
def evaluate(model, metric, data_loader, multilingual=False):
    """
    Given a dataset, it evals model and computes the metric.
    Args:
        model(obj:`paddle.nn.Layer`): A model to classify texts.
        metric(obj:`paddle.metric.Metric`): The evaluation metric.
        data_loader(obj:`paddle.io.DataLoader`): The dataset loader which generates batches.
        multilingual(bool): Whether is the multilingual model.
    """
    model.eval()
    metric.reset()
    for batch in tqdm(data_loader):
        if multilingual:
            start_prob, end_prob = model(batch["input_ids"], batch["position_ids"])
        else:
            start_prob, end_prob = model(
                batch["input_ids"],
                batch["token_type_ids"],
                batch["position_ids"],
                batch["attention_mask"],
            )
        start_ids = paddle.cast(batch["start_positions"], "float32")
        end_ids = paddle.cast(batch["end_positions"], "float32")
        num_correct, num_infer, num_label = metric.compute(
            start_prob, end_prob, start_ids, end_ids
        )
        metric.update(num_correct, num_infer, num_label)
    precision, recall, f1 = metric.accumulate()
    model.train()
    return precision, recall, f1
def do_eval():
    paddle.set_device(args.device)
    if args.model_path in ["uie-m-base", "uie-m-large"]:
        args.multilingual = True
    tokenizer = AutoTokenizer.from_pretrained(args.model_path)
    if args.multilingual:
        model = UIEM.from_pretrained(args.model_path)
    else:
        model = UIE.from_pretrained(args.model_path)
    test_ds = load_dataset(
        reader, data_path=args.test_path, max_seq_len=args.max_seq_len, lazy=False
    )
    class_dict = {}
    relation_data = []
    if args.debug:
        for data in test_ds:
            class_name = unify_prompt_name(data["prompt"])
            # Only positive examples are evaluated in debug mode
            if len(data["result_list"]) != 0:
                p = "的" if args.schema_lang == "ch" else " of "
                if p not in data["prompt"]:
                    class_dict.setdefault(class_name, []).append(data)
                else:
                    relation_data.append((data["prompt"], data))
        relation_type_dict = get_relation_type_dict(
            relation_data, schema_lang=args.schema_lang
        )
    else:
        class_dict["all_classes"] = test_ds
    trans_fn = partial(
        convert_example,
        tokenizer=tokenizer,
        max_seq_len=args.max_seq_len,
        multilingual=args.multilingual,
    )
    for key in class_dict.keys():
        if args.debug:
            test_ds = MapDataset(class_dict[key])
        else:
            test_ds = class_dict[key]
        test_ds = test_ds.map(trans_fn)
        data_collator = DataCollatorWithPadding(tokenizer)
        test_data_loader = create_data_loader(
            test_ds, mode="test", batch_size=args.batch_size, trans_fn=data_collator
        )
        metric = SpanEvaluator()
        precision, recall, f1 = evaluate(
            model, metric, test_data_loader, args.multilingual
        )
        logger.info("-----------------------------")
        logger.info("Class Name: %s" % key)
        logger.info(
            "Evaluation Precision: %.5f | Recall: %.5f | F1: %.5f"
            % (precision, recall, f1)
        )
    if args.debug and len(relation_type_dict.keys()) != 0:
        for key in relation_type_dict.keys():
            test_ds = MapDataset(relation_type_dict[key])
            test_ds = test_ds.map(trans_fn)
            test_data_loader = create_data_loader(
                test_ds, mode="test", batch_size=args.batch_size, trans_fn=data_collator
            )
            metric = SpanEvaluator()
            precision, recall, f1 = evaluate(model, metric, test_data_loader)
            logger.info("-----------------------------")
            if args.schema_lang == "ch":
                logger.info("Class Name: X的%s" % key)
            else:
                logger.info("Class Name: %s of X" % key)
            logger.info(
                "Evaluation Precision: %.5f | Recall: %.5f | F1: %.5f"
                % (precision, recall, f1)
            )
if __name__ == "__main__":
    # yapf: disable
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_path", type=str, default=None, help="The path of saved model that you want to load.")
    parser.add_argument("--test_path", type=str, default=None, help="The path of test set.")
    parser.add_argument("--batch_size", type=int, default=16, help="Batch size per GPU/CPU for training.")
    parser.add_argument("--device", type=str, default="gpu", choices=["gpu", "cpu", "npu"], help="Device selected for evaluate.")
    parser.add_argument("--max_seq_len", type=int, default=512, help="The maximum total input sequence length after tokenization.")
    parser.add_argument("--debug", action='store_true', help="Precision, recall and F1 score are calculated for each class separately if this option is enabled.")
    parser.add_argument("--multilingual", action='store_true', help="Whether is the multilingual model.")
    parser.add_argument("--schema_lang", choices=["ch", "en"], default="ch", help="Select the language type for schema.")
    args = parser.parse_args()
    # yapf: enable
    do_eval()
Evaluation results before conversion:
[2025-04-11 15:58:51,075] [ INFO] - -----------------------------
[2025-04-11 15:58:51,076] [ INFO] - Class Name: all_classes
[2025-04-11 15:58:51,076] [ INFO] - Evaluation Precision: 0.90000 | Recall: 0.90000 | F1: 0.90000
Hi! @DrownFish19 Could this be caused by macOS? I haven't tested this code on any other platform yet.
Hello @DrownFish19, we tested this code on a different platform and it still doesn't work. The format of the model's output data is correct, but the evaluation result is still 0. Have you run into a similar problem before?
At the moment paddle2onnx supports converting the models in PaddleX; other models have not been verified yet. You can try paddle2onnx==2.0.0a5, installed with pip install paddle2onnx==2.0.0a5.
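(For anyone following this suggestion, a minimal sketch of the retry: the version pin is taken from the comment above, and the checkpoint path and input_spec simply mirror the export snippet at the top of the issue, so adjust them to your setup.)
# First: pip install paddle2onnx==2.0.0a5   (as suggested above)
from importlib.metadata import version
import paddle
from paddlenlp.transformers import UIE

print(version("paddle2onnx"))  # confirm the pinned paddle2onnx build is the one installed

model = UIE.from_pretrained("./checkpoint/model_best/checkpoint-200")
input_ids = paddle.static.InputSpec([None, 512], "int64", "input_ids")
attention_mask = paddle.static.InputSpec([None, 512], "int64", "attention_mask")
token_type_ids = paddle.static.InputSpec([None, 512], "int64", "token_type_ids")
# paddle.onnx.export delegates the actual conversion to the installed paddle2onnx
paddle.onnx.export(model, "uie", input_spec=[input_ids, attention_mask, token_type_ids], opset_version=11)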
This issue is stale because it has been open for 60 days with no activity.
This issue was closed because it has been inactive for 14 days since being marked as stale.