如题，使用 pipeline 的方式部署了 cascade 服务，使用 http 接口进行图像预测，使用多线程方式调接口，分析了多张图像之后出现段错误。

测试环境

CUDA 11.2
显卡：RTX 3090
python 3.7.0
PaddlePaddle 2.1.0.post112
paddle-serving-server-gpu 0.6.0.post11
paddle_serving_app 0.6.0

web_service.py

import sys
import base64
import logging

import cv2
import numpy as np
from paddle_serving_app.reader import *
from paddle_serving_server.web_service import WebService, Op

NAME = "bank"

class CascadeBankOp(Op):
    """Pipeline op that runs the Cascade R-CNN "bank" detector on base64 images.

    ``preprocess`` turns the request payload into a batched feed dict,
    and ``postprocess`` converts the raw fetch result into a filtered list of
    boxes that is returned to the client as a string.
    """

    def init_op(self):
        """Build the image pre-processing chain and the R-CNN post-processor."""
        self.img_preprocess = Sequential([
            BGR2RGB(), Div(255.0),
            Normalize([0.4966, 0.4876, 0.4861], [0.0270, 0.0245, 0.0238], False),
            # NOTE(review): later in this thread the author reports that the
            # Resize arguments here are reversed and that Resize((1280, 720))
            # is the working value; this is the configuration that segfaults.
            Resize((720, 1280)), Transpose((2, 0, 1)), PadStride(32)
        ])
        self.img_postprocess = RCNNPostprocess("label_list.txt", "output")

    def preprocess(self, input_dicts, data_id, log_id):
        """Decode every image in the request and assemble the model feed dict.

        ``input_dicts`` must contain exactly one entry (the unpacking below
        raises otherwise); each value of that entry is treated as a
        base64-encoded image. Returns a 4-tuple whose first element is the
        feed dict with the keys ``image``, ``im_shape`` and ``scale_factor``,
        each stacked along axis 0 (one row per image).
        The remaining three values (``False, None, ""``) are presumably the
        framework's skip flag, error code and error message -- confirm against
        the Paddle Serving ``Op.preprocess`` contract.
        """
        # Single-element unpacking: only the value of the sole upstream entry is used.
        (_, input_dict), = input_dicts.items()
        imgs = []
        #print("keys", input_dict.keys())
        for key in input_dict.keys():
            data = base64.b64decode(input_dict[key].encode('utf8'))
            # NOTE(review): np.fromstring on binary data is deprecated in NumPy;
            # np.frombuffer is the documented replacement.
            data = np.fromstring(data, np.uint8)
            # NOTE(review): cv2.imdecode returns None for undecodable data and
            # the result is not checked before being passed on.
            im = cv2.imdecode(data, cv2.IMREAD_COLOR)
            im = self.img_preprocess(im)
            imgs.append({
              # Add a leading batch axis so the per-image arrays can be concatenated.
              "image": im[np.newaxis,:],
              # im is channel-first after Transpose((2, 0, 1)), so shape[1:] is
              # the two spatial dimensions of the pre-processed image.
              "im_shape": np.array(list(im.shape[1:])).reshape(-1)[np.newaxis,:],
              # Fixed scale factor of 1.0 on both axes, regardless of the resize.
              "scale_factor": np.array([1.0, 1.0]).reshape(-1)[np.newaxis,:],
            })
        feed_dict = {
            "image": np.concatenate([x["image"] for x in imgs], axis=0),
            "im_shape": np.concatenate([x["im_shape"] for x in imgs], axis=0),
            "scale_factor": np.concatenate([x["scale_factor"] for x in imgs], axis=0)
        }
        #for key in feed_dict.keys():
        #    print(key, feed_dict[key].shape)
        return feed_dict, False, None, ""

    def postprocess(self, input_dicts, fetch_dict, log_id):
        """Convert the fetch result into boxes and keep those scoring above 0.5.

        Returns ``(res_dict, None, "")`` where ``res_dict["bbox_result"]`` is
        the ``str()`` of the filtered box list.
        NOTE(review): as reported further down in this thread, when the fetch
        result has no ``<fetch_name>.lod`` entry the post-processor fails and
        the service answers with err_no 8 ("Failed to postprocess").
        """
        # print(fetch_dict)
        bbox_result = self.img_postprocess(fetch_dict, visualize=False)
        # Drop low-confidence detections (score <= 0.5).
        bbox_result = list(filter(lambda x: x.get("score") > 0.5, bbox_result))
        res_dict = {"bbox_result": str(bbox_result)}
        return res_dict, None, ""


class CascadeBankService(WebService):
    """Web service whose pipeline consists of a single ``CascadeBankOp``."""

    def get_pipeline_response(self, read_op):
        """Return the response op, wired to take its input from ``read_op``."""
        return CascadeBankOp(name=NAME, input_ops=[read_op])


# Create the service, load the pipeline settings from config.yml and start it.
cascade_bank_service = CascadeBankService(name=NAME)
cascade_bank_service.prepare_pipeline_config("config.yml")
cascade_bank_service.run_service()

config.yml

# Pipeline configuration loaded by web_service.py via prepare_pipeline_config().
dag:
  # Presumably false = ops run as processes instead of threads -- confirm
  # against the Paddle Serving pipeline documentation.
  is_thread_op: false
  tracer:
    # Tracer interval; unit assumed to be seconds from the `_s` suffix.
    interval_s: 30
http_port: 9292
op:
  # Same name as the op created in web_service.py (NAME = "bank").
  bank:
    concurrency: 4

    local_service_conf:
      client_type: local_predictor
      # Presumably 1 selects GPU inference -- confirm against the
      # Paddle Serving device_type documentation.
      device_type: 1
      # NOTE(review): looks like a GPU card id; '8' would require a host with
      # at least nine cards -- verify against the actual machine.
      devices: '8'
      fetch_list:
      # Output variable fetched from the model; this is the key that shows up
      # in the fetch_map printed later in this thread.
      - save_infer_model/scale_0.tmp_1
      model_config: serving_server/
rpc_port: 9998
worker_num: 32

错误信息

--------------------------------------
C++ Traceback (most recent call last):
--------------------------------------
0   paddle::AnalysisPredictor::ZeroCopyRun()
1   paddle::framework::NaiveExecutor::Run()
2   paddle::framework::OperatorBase::Run(paddle::framework::Scope const&, paddle::platform::Place const&)
3   paddle::framework::OperatorWithKernel::RunImpl(paddle::framework::Scope const&, paddle::platform::Place const&) const
4   paddle::framework::OperatorWithKernel::RunImpl(paddle::framework::Scope const&, paddle::platform::Place const&, paddle::framework::RuntimeContext*) const
5   std::_Function_handler<void (paddle::framework::ExecutionContext const&), paddle::framework::OpKernelRegistrarFunctor<paddle::platform::CPUPlace, false, 2ul, paddle::operators::SliceKernel<paddle::platform::C
PUDeviceContext, int>, paddle::operators::SliceKernel<paddle::platform::CPUDeviceContext, long>, paddle::operators::SliceKernel<paddle::platform::CPUDeviceContext, float>, paddle::operators::SliceKernel<paddle::p
latform::CPUDeviceContext, double>, paddle::operators::SliceKernel<paddle::platform::CPUDeviceContext, paddle::platform::complex64>, paddle::operators::SliceKernel<paddle::platform::CPUDeviceContext, paddle::plat
form::complex128> >::operator()(char const*, char const*, int) const::{lambda(paddle::framework::ExecutionContext const&)#1}>::_M_invoke(std::_Any_data const&, paddle::framework::ExecutionContext const&)
6   void paddle::operators::SliceKernel<paddle::platform::CPUDeviceContext, float>::SliceCompute<2ul>(paddle::framework::ExecutionContext const&) const
7   Eigen::internal::TensorExecutor<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorSlicingOp<Eigen::DSizes<int, 2> const, Eigen::DSizes<int, 2> const
, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const, Eigen::DefaultDevice, true, (Eigen::internal::TiledEvaluation)1>::run(Eigen::TensorAssignOp<Eigen::TensorMap<
Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorSlicingOp<Eigen::DSizes<int, 2> const, Eigen::DSizes<int, 2> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePo
inter> const> const> const&, Eigen::DefaultDevice const&)
8   paddle::framework::SignalHandle(char const*, int)
9   paddle::platform::GetCurrentTraceBackString[abi:cxx11]()

----------------------
Error Message Summary:
----------------------
FatalError: `Segmentation fault` is detected by the operating system.
  [TimeInfo: *** Aborted at 1623228748 (unix time) try "date -d @1623228748" if you are using GNU date ***]
  [SignalInfo: *** SIGSEGV (@0x7fd13d3f1600) received by PID 17800 (TID 0x7fd2b7261740) from PID 1027544576 ***]

Jun 09 '21 09:06 okooo5km

Message that will be displayed on users' first issue

Jun 09 '21 09:06 github-actions[bot]

您好，今晚我复现一下。图片是公共数据集吗？

Jun 09 '21 09:06 TeslaZhao

您好，今晚我复现一下。图片是公共数据集吗？

好的，我训练的模型是自训练的，不知道paddle model zoo 中 cascade 模型是否可以复现，没测试！

Jun 09 '21 09:06 okooo5km

您好，我跑通了Serving目录下cascade示例在Pipeline上部署，你训练的模型是在Paddle 2.1上训练的吗？从报错信息上看是slice op运行报错，你看一下PipelineServingLogs/pipeline.log.wf里是否有更多的错误信息。打印一下image名称，image_shape， scale_factor，看下是否在特定图片报错，image_shape， scale_factor是否正确？

Jun 09 '21 12:06 TeslaZhao

您好，我跑通了Serving目录下cascade示例在Pipeline上部署，你训练的模型是在Paddle 2.1上训练的吗？从报错信息上看是slice op运行报错，你看一下PipelineServingLogs/pipeline.log.wf里是否有更多的错误信息。打印一下image名称，image_shape， scale_factor，看下是否在特定图片报错，image_shape， scale_factor是否正确？

感谢您的回复，在您的提示下，找到了原因，是 web_service.py 中图像预处理的 Resize 参数的问题，写反了：

class CascadeBankOp(Op):
    def init_op(self):
        self.img_preprocess = Sequential([
            BGR2RGB(), Div(255.0),
            Normalize([0.4966, 0.4876, 0.4861], [0.0270, 0.0245, 0.0238], False),
            # Before the fix: the Resize arguments are in the wrong order.
            Resize((720, 1280)), Transpose((2, 0, 1)), PadStride(32)
        ])
...

改正之后：

class CascadeBankOp(Op):
    def init_op(self):
        self.img_preprocess = Sequential([
            BGR2RGB(), Div(255.0),
            Normalize([0.4966, 0.4876, 0.4861], [0.0270, 0.0245, 0.0238], False),
            # After the fix: Resize arguments swapped to (1280, 720).
            Resize((1280, 720)), Transpose((2, 0, 1)), PadStride(32)
        ])

可以正常稳定运行了，再次感谢 🤙🏻🤙🏻🤙🏻

Jun 11 '21 01:06 okooo5km

@TeslaZhao 您好，我这边又进行了批量图像分析测试，发现有一张图分析的时候仍然会导致段错误，看图像没什么异常，不知道为什么只有这一张图像有问题，单独只使用这一张图像调分析接口，返回的错误是：

{
    'err_no': 8, 
    'err_msg': "(data_id=0 log_id=0) [bank|0] Failed to postprocess: 'save_infer_model/scale_0.tmp_1.lod'", 
    'key': [], 
    'value': []
}

打印了下 python3.7/site-packages/paddle_serving_app/reader/image_reader.py 中 (第 344 行) def _get_bbox_result(self, fetch_map, fetch_name, clsid2catid) 方法中 fetch_map：

{'save_infer_model/scale_0.tmp_1': array([[-1.,  0.,  0.,  0.,  0.,  0.]], dtype=float32)}

确实是没有 'save_infer_model/scale_0.tmp_1.lod'，尝试打印能正常分析图像的 fetch_map:

{'save_infer_model/scale_0.tmp_1': array([[0.0000000e+00, 9.8990679e-01, 6.6382672e+02, 4.6241437e+02,
        8.3422443e+02, 7.1902985e+02],
       [1.0000000e+00, 7.3206198e-01, 6.5045532e+02, 6.0466644e+02,
        7.2246985e+02, 7.0166632e+02]], dtype=float32), 'save_infer_model/scale_0.tmp_1.lod': array([0, 2], dtype=int32)}

Jun 11 '21 08:06 okooo5km

您好，从错误信息上看，模型结果中缺少 save_infer_model/scale_0.tmp_1.lod，您可以直接修改一下 python3.7/site-packages/paddle_serving_app/reader/image_reader.py 中 _get_bbox_result 的代码，先判断 lod 是否存在再取值。至于为什么会出现 lod 不存在的情况，我们要和模型的同学反馈一下，再决定如何修改。

Jun 16 '21 02:06 TeslaZhao

您好，从错误信息上看，模型结果中缺少 save_infer_model/scale_0.tmp_1.lod，您可以直接修改一下 python3.7/site-packages/paddle_serving_app/reader/image_reader.py 中 _get_bbox_result 的代码，先判断 lod 是否存在再取值。至于为什么会出现 lod 不存在的情况，我们要和模型的同学反馈一下，再决定如何修改。

您好，这个问题原因找到了吗？

Oct 20 '23 08:10 HuiHuiSun

Serving
Serving copied to clipboard

pipeline 部署 cascade 模型 http 分析了多张图像后报 Segmentation fault

测试环境

web_service.py

config.yaml

错误信息

Serving Serving copied to clipboard

pipeline 部署 cascade 模型 http 分析了多张图像后报 Segmentation fault

测试环境

web_service.py

config.yaml

错误信息

Serving
Serving copied to clipboard