
[Bug]: After exporting UIE to ONNX, the model runs inference normally, but all evaluation metrics are 0

Open zhangzef opened this issue 8 months ago • 5 comments

Software Environment

- paddlepaddle: 3.0.0
- paddlepaddle-gpu: none 
- paddlenlp: 2.6.1

Duplicate Check

  • [x] I have searched the existing issues

Bug Description

After exporting UIE with paddle.onnx.export, the model runs inference normally, but the evaluation metrics are all 0. Evaluation before the export was normal.

Steps to Reproduce & Code

Export code:

from paddlenlp.transformers import UIE
import paddle

model = UIE.from_pretrained("./checkpoint/model_best/checkpoint-200")
# export to ONNX
save_path = 'uie'  # path to save the exported model
input_ids = paddle.static.InputSpec([None, 512], 'int64', 'input_ids')
attention_mask = paddle.static.InputSpec([None, 512], 'int64', 'attention_mask')
token_type_ids = paddle.static.InputSpec([None, 512], 'int64', 'token_type_ids')
paddle.onnx.export(model, save_path, input_spec=[input_ids, attention_mask, token_type_ids], opset_version=11)
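
A low-cost sanity check (a minimal sketch, assuming the export above succeeded) is to list the input and output names baked into the exported graph, since the evaluation code below feeds the ONNX session by input name:

import onnxruntime as ort

# paddle.onnx.export appends the suffix, so the file above is 'uie.onnx'
sess = ort.InferenceSession("uie.onnx")
for inp in sess.get_inputs():
    print(inp.name, inp.shape, inp.type)
for out in sess.get_outputs():
    print(out.name, out.shape, out.type)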

Evaluation code:

import argparse
from functools import partial

import paddle
import numpy as np
import onnxruntime as ort
from utils import convert_example, create_data_loader, reader

from paddlenlp.data import DataCollatorWithPadding
from paddlenlp.datasets import MapDataset, load_dataset
from paddlenlp.metrics import SpanEvaluator
from paddlenlp.transformers import AutoTokenizer
from paddlenlp.utils.log import logger
from tqdm import tqdm


def evaluate_onnx(session, metric, data_loader):
    """
    Evaluate the dataset with an ONNX inference session and compute the metric.

    Args:
        session: an onnxruntime.InferenceSession object.
        metric: the evaluator used to compute the metric, e.g. SpanEvaluator.
        data_loader: the data loader for evaluation.
    """
    metric.reset()
    for batch in tqdm(data_loader):
        # Build the input dict and convert each paddle.Tensor to a numpy array
        inputs = {}
        inputs["input_ids"] = batch["input_ids"].numpy()
        inputs["token_type_ids"] = batch["token_type_ids"].numpy()
        inputs["attention_mask"] = batch["attention_mask"].numpy()

        # Run inference with ONNX Runtime; the output order matches the exported model
        outputs = session.run(None, inputs)
        print(len(outputs))  # debug: number of output tensors
        start_prob, end_prob = outputs[0], outputs[1]

        # Convert the ground truth to numpy arrays and cast to float32
        start_ids = batch["start_positions"].numpy().astype("float32")
        end_ids = batch["end_positions"].numpy().astype("float32")

        # Compute the metric, assuming SpanEvaluator.compute accepts numpy arrays
        num_correct, num_infer, num_label = metric.compute(
            start_prob, end_prob, start_ids, end_ids
        )
        metric.update(num_correct, num_infer, num_label)
    precision, recall, f1 = metric.accumulate()
    return precision, recall, f1


def do_eval():
    paddle.set_device(args.device)
    tokenizer = AutoTokenizer.from_pretrained(args.model_path)
    session = ort.InferenceSession(args.onnx_model_path)

    # Load the evaluation dataset
    test_ds = load_dataset(
        reader, data_path=args.test_path, max_seq_len=args.max_seq_len, lazy=False
    )
    class_dict = {}
    class_dict["all_classes"] = test_ds

    # Build the data preprocessing function
    trans_fn = partial(
        convert_example,
        tokenizer=tokenizer,
        max_seq_len=args.max_seq_len,
    )

    test_ds = test_ds.map(trans_fn)
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer, max_length=args.max_seq_len)

    test_data_loader = create_data_loader(
        test_ds, mode="test", batch_size=args.batch_size, trans_fn=data_collator
    )

    metric = SpanEvaluator()
    precision, recall, f1 = evaluate_onnx(session, metric, test_data_loader)
    logger.info("-----------------------------")
    logger.info(
        "Evaluation Precision: %.5f | Recall: %.5f | F1: %.5f"
        % (precision, recall, f1)
    )


if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    parser.add_argument("--model_path", type=str, default=None, help="本地加载 tokenizer。")
    parser.add_argument("--onnx_model_path", type=str, default=None, help="本地 ONNX 模型文件路径。")
    parser.add_argument("--test_path", type=str, default=None, help="测试集数据路径。")
    parser.add_argument("--batch_size", type=int, default=16, help="每个 GPU/CPU 上评估时的 batch size。")
    parser.add_argument("--device", type=str, default="gpu", choices=["gpu", "cpu", "npu"], help="评估时使用的设备。")
    parser.add_argument("--max_seq_len", type=int, default=512, help="分词后的最大序列长度。")
    parser.add_argument("--schema_lang", choices=["ch", "en"], default="ch", help="Schema 语言类型。")

    args = parser.parse_args()
    do_eval()
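
Before digging further, a minimal parity check can narrow down whether the exported graph computes the same function as the dygraph model. This is a sketch, assuming the checkpoint path and the uie.onnx file from the export snippet above; the input text is a placeholder:

import numpy as np
import paddle
import onnxruntime as ort
from paddlenlp.transformers import UIE, AutoTokenizer

ckpt = "./checkpoint/model_best/checkpoint-200"  # path from the export snippet
tokenizer = AutoTokenizer.from_pretrained(ckpt)
model = UIE.from_pretrained(ckpt)
model.eval()

# One fixed-length sample; the export used a static sequence length of 512
enc = tokenizer("placeholder prompt", "placeholder text", max_seq_len=512,
                pad_to_max_seq_len=True, return_attention_mask=True)
input_ids = np.array([enc["input_ids"]], dtype="int64")
token_type_ids = np.array([enc["token_type_ids"]], dtype="int64")
attention_mask = np.array([enc["attention_mask"]], dtype="int64")

# Dygraph forward, positional order matching the call in the
# pre-export evaluation script below:
# (input_ids, token_type_ids, position_ids, attention_mask)
with paddle.no_grad():
    p_start, p_end = model(
        paddle.to_tensor(input_ids),
        paddle.to_tensor(token_type_ids),
        None,
        paddle.to_tensor(attention_mask),
    )

# ONNX forward, fed by input name
sess = ort.InferenceSession("uie.onnx")
o_start, o_end = sess.run(None, {
    "input_ids": input_ids,
    "attention_mask": attention_mask,
    "token_type_ids": token_type_ids,
})

# A large gap here means the exported graph is not equivalent to the dygraph model
print("max |start diff|:", np.abs(p_start.numpy() - o_start).max())
print("max |end diff|:", np.abs(p_end.numpy() - o_end).max())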

zhangzef avatar Apr 11 '25 07:04 zhangzef

Evaluation results after the conversion:

[2025-04-11 15:35:53,838] [    INFO] - -----------------------------
[2025-04-11 15:35:53,838] [    INFO] - Evaluation Precision: 0.00000 | Recall: 0.00000 | F1: 0.00000

Evaluation code before the ONNX conversion:

# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
from functools import partial

import paddle
from utils import convert_example, create_data_loader, reader

from paddlenlp.data import DataCollatorWithPadding
from paddlenlp.datasets import MapDataset, load_dataset
from paddlenlp.metrics import SpanEvaluator
from paddlenlp.transformers import UIE, UIEM, AutoTokenizer
from paddlenlp.utils.ie_utils import get_relation_type_dict, unify_prompt_name
from paddlenlp.utils.log import logger
from tqdm import tqdm


@paddle.no_grad()
def evaluate(model, metric, data_loader, multilingual=False):
    """
    Given a dataset, it evals model and computes the metric.
    Args:
        model(obj:`paddle.nn.Layer`): A model to classify texts.
        metric(obj:`paddle.metric.Metric`): The evaluation metric.
        data_loader(obj:`paddle.io.DataLoader`): The dataset loader which generates batches.
        multilingual(bool): Whether the model is multilingual.
    """
    model.eval()
    metric.reset()
    for batch in tqdm(data_loader):
        if multilingual:
            start_prob, end_prob = model(batch["input_ids"], batch["position_ids"])
        else:
            start_prob, end_prob = model(
                batch["input_ids"],
                batch["token_type_ids"],
                batch["position_ids"],
                batch["attention_mask"],
            )

        start_ids = paddle.cast(batch["start_positions"], "float32")
        end_ids = paddle.cast(batch["end_positions"], "float32")
        num_correct, num_infer, num_label = metric.compute(
            start_prob, end_prob, start_ids, end_ids
        )
        metric.update(num_correct, num_infer, num_label)
    precision, recall, f1 = metric.accumulate()
    model.train()
    return precision, recall, f1


def do_eval():
    paddle.set_device(args.device)

    if args.model_path in ["uie-m-base", "uie-m-large"]:
        args.multilingual = True
    tokenizer = AutoTokenizer.from_pretrained(args.model_path)
    if args.multilingual:
        model = UIEM.from_pretrained(args.model_path)
    else:
        model = UIE.from_pretrained(args.model_path)

    test_ds = load_dataset(
        reader, data_path=args.test_path, max_seq_len=args.max_seq_len, lazy=False
    )
    class_dict = {}
    relation_data = []
    if args.debug:
        for data in test_ds:
            class_name = unify_prompt_name(data["prompt"])
            # Only positive examples are evaluated in debug mode
            if len(data["result_list"]) != 0:
                p = "的" if args.schema_lang == "ch" else " of "
                if p not in data["prompt"]:
                    class_dict.setdefault(class_name, []).append(data)
                else:
                    relation_data.append((data["prompt"], data))

        relation_type_dict = get_relation_type_dict(
            relation_data, schema_lang=args.schema_lang
        )
    else:
        class_dict["all_classes"] = test_ds

    trans_fn = partial(
        convert_example,
        tokenizer=tokenizer,
        max_seq_len=args.max_seq_len,
        multilingual=args.multilingual,
    )

    for key in class_dict.keys():
        if args.debug:
            test_ds = MapDataset(class_dict[key])
        else:
            test_ds = class_dict[key]
        test_ds = test_ds.map(trans_fn)

        data_collator = DataCollatorWithPadding(tokenizer)

        test_data_loader = create_data_loader(
            test_ds, mode="test", batch_size=args.batch_size, trans_fn=data_collator
        )

        metric = SpanEvaluator()
        precision, recall, f1 = evaluate(
            model, metric, test_data_loader, args.multilingual
        )
        logger.info("-----------------------------")
        logger.info("Class Name: %s" % key)
        logger.info(
            "Evaluation Precision: %.5f | Recall: %.5f | F1: %.5f"
            % (precision, recall, f1)
        )

    if args.debug and len(relation_type_dict.keys()) != 0:
        for key in relation_type_dict.keys():
            test_ds = MapDataset(relation_type_dict[key])
            test_ds = test_ds.map(trans_fn)
            test_data_loader = create_data_loader(
                test_ds, mode="test", batch_size=args.batch_size, trans_fn=data_collator
            )

            metric = SpanEvaluator()
            precision, recall, f1 = evaluate(model, metric, test_data_loader)
            logger.info("-----------------------------")
            if args.schema_lang == "ch":
                logger.info("Class Name: X的%s" % key)
            else:
                logger.info("Class Name: %s of X" % key)
            logger.info(
                "Evaluation Precision: %.5f | Recall: %.5f | F1: %.5f"
                % (precision, recall, f1)
            )


if __name__ == "__main__":
    # yapf: disable
    parser = argparse.ArgumentParser()

    parser.add_argument("--model_path", type=str, default=None, help="The path of saved model that you want to load.")
    parser.add_argument("--test_path", type=str, default=None, help="The path of test set.")
    parser.add_argument("--batch_size", type=int, default=16, help="Batch size per GPU/CPU for training.")
    parser.add_argument("--device", type=str, default="gpu", choices=["gpu", "cpu", "npu"], help="Device selected for evaluate.")
    parser.add_argument("--max_seq_len", type=int, default=512, help="The maximum total input sequence length after tokenization.")
    parser.add_argument("--debug", action='store_true', help="Precision, recall and F1 score are calculated for each class separately if this option is enabled.")
    parser.add_argument("--multilingual", action='store_true', help="Whether is the multilingual model.")
    parser.add_argument("--schema_lang", choices=["ch", "en"], default="ch", help="Select the language type for schema.")

    args = parser.parse_args()
    # yapf: enable

    do_eval()

Evaluation results before the conversion:

[2025-04-11 15:58:51,075] [    INFO] - -----------------------------
[2025-04-11 15:58:51,076] [    INFO] - Class Name: all_classes
[2025-04-11 15:58:51,076] [    INFO] - Evaluation Precision: 0.90000 | Recall: 0.90000 | F1: 0.90000
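
One thing worth ruling out when comparing the two scripts: paddle.onnx.export binds input_spec to the model's forward arguments positionally, and evaluate() above calls the model as model(input_ids, token_type_ids, position_ids, attention_mask), whereas the export snippet at the top passed input_spec=[input_ids, attention_mask, token_type_ids]. A re-export sketch with the spec order matching that positional call follows; whether this accounts for the zero metrics is unverified here:

import paddle
from paddlenlp.transformers import UIE

model = UIE.from_pretrained("./checkpoint/model_best/checkpoint-200")
# Spec order follows the positional call in evaluate():
# model(input_ids, token_type_ids, position_ids, attention_mask)
input_ids = paddle.static.InputSpec([None, 512], 'int64', 'input_ids')
token_type_ids = paddle.static.InputSpec([None, 512], 'int64', 'token_type_ids')
position_ids = paddle.static.InputSpec([None, 512], 'int64', 'position_ids')
attention_mask = paddle.static.InputSpec([None, 512], 'int64', 'attention_mask')
paddle.onnx.export(
    model, 'uie',
    input_spec=[input_ids, token_type_ids, position_ids, attention_mask],
    opset_version=11,
)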

zhangzef avatar Apr 11 '25 08:04 zhangzef

Hi! @DrownFish19 Could this be caused by macOS? I haven't tested this code on any other platform yet.

zhangzef avatar Apr 14 '25 07:04 zhangzef

Hello @DrownFish19, we tested this code on a different platform and it still doesn't work. The format of the model's output is correct, but the evaluation result is 0 for some reason. Have you run into a similar problem?

zhangzef avatar Apr 15 '25 03:04 zhangzef

Currently paddle2onnx supports converting the models in PaddleX; other models have not been verified yet. You could try paddle2onnx==2.0.0a5, installed with pip install paddle2onnx==2.0.0a5.
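
For reference, a sketch of that route, assuming the model has first been saved as a static graph with paddle.jit.save (the file names below are illustrative):

pip install paddle2onnx==2.0.0a5
paddle2onnx --model_dir ./static_model \
    --model_filename model.pdmodel \
    --params_filename model.pdiparams \
    --save_file uie.onnx \
    --opset_version 11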

0x3878f avatar Apr 15 '25 03:04 0x3878f

This issue is stale because it has been open for 60 days with no activity.

github-actions[bot] avatar Jun 15 '25 00:06 github-actions[bot]

This issue was closed because it has been inactive for 14 days since being marked as stale.

github-actions[bot] avatar Jun 29 '25 00:06 github-actions[bot]