FATE icon indicating copy to clipboard operation
FATE copied to clipboard

训练和预测过程中任务一直运行不结束

Open ShaoYULi12 opened this issue 1 year ago • 0 comments

你好,我现在用的是 FATE 2.1 版本。训练任务使用的 DAG 文件内容如下(从 FATEBoard 中导出): { "schema_version":"2.0.0.alpha", "kind":"fate", "dag":{ "stage":"train", "party_tasks":{ "guest_9999":{ "parties":[ { "role":"guest", "party_id":[ "10000" ] } ], "conf":{}, "tasks":{ "reader_0":{ "conf":null, "parameters":{ "name":"1722474962438998", "namespace":"202408" } } } }, "host_10000":{ "parties":[ { "role":"host", "party_id":[ "9999" ] } ], "conf":{}, "tasks":{ "reader_0":{ "conf":null, "parameters":{ "name":"1722475205848520", "namespace":"202408" } } } } }, "parties":[ { "role":"guest", "party_id":[ "10000" ] }, { "role":"host", "party_id":[ "9999" ] }, { "role":"arbiter", "party_id":[ "10000" ] } ], "conf":{ "model_version":"0", "scheduler_party_id":"10000", "sync_type":"callback", "auto_retries":0, "inheritance":null, "model_warehouse":null, "model_id":"202408011032163817380", "priority":null, "initiator_party_id":"10000", "cores":null, "task":null, "computing_partitions":8, "extra":null }, "tasks":{ "lr_0":{ "outputs":null, "stage":null, "inputs":{ "data":{ "train_data":{ "task_output_artifact":{ "output_artifact_key":"output_data", "producer_task":"psi_0", "parties":[ { "role":"guest", "party_id":[ "10000" ] }, { "role":"host", "party_id":[ "9999" ] } ], "output_artifact_type_alias":null } } }, "model":{} }, "parties":null, "component_ref":"coordinated_lr", "conf":null, "dependent_tasks":[ "psi_0" ], "parameters":{ "output_cv_data":true, "batch_size":128, "early_stop":"diff", "threshold":0.5, "epochs":5 } }, "reader_0":{ "outputs":null, "stage":"default", "inputs":null, "parties":[ { "role":"guest", "party_id":[ "10000" ] }, { "role":"host", "party_id":[ "9999" ] } ], "component_ref":"reader", "conf":null, "dependent_tasks":null, "parameters":null }, "psi_0":{ "outputs":null, "stage":"default", "inputs":{ "data":{ "input_data":{ "task_output_artifact":{ "output_artifact_key":"output_data", "producer_task":"reader_0", "parties":null, "output_artifact_type_alias":null } } }, "model":null }, 
"parties":[ { "role":"guest", "party_id":[ "10000" ] }, { "role":"host", "party_id":[ "9999" ] } ], "component_ref":"psi", "conf":null, "dependent_tasks":[ "reader_0" ], "parameters":{} } } } } 但是训练任务一直卡在 epoch 4,已经运行了超过 4 小时,使用的是给出的 breast_hetero_guest/host 数据。 预测任务也存在同样的问题,使用训练完的 sshe_lr 模型做预测的 DAG 文件(从 FATEBoard 中导出)如下: `schema_version: 2.0.0.alpha kind: fate dag: stage: predict party_tasks: guest_9999: parties: - role: guest party_id: - '10000' conf: {} tasks: reader_01: parameters: name: '1722477697252509' namespace: '202408' host_10000: parties: - role: host party_id: - '9999' conf: {} tasks: reader_01: parameters: name: '1722475205848520' namespace: '202408' parties:

  • role: guest party_id:
    • '10000'
  • role: host party_id:
    • '9999' conf: model_version: '0' scheduler_party_id: '10000' sync_type: callback auto_retries: 0 model_warehouse: model_version: '0' model_id: '202408010927308979450' model_id: '202408011004199813420' initiator_party_id: '10000' computing_partitions: 8 tasks: lr_0: inputs: data: test_data: task_output_artifact: output_artifact_key: output_data producer_task: psi_0 parties: - role: guest party_id: - '10000' - role: host party_id: - '9999' model: input_model: model_warehouse: output_artifact_key: output_model producer_task: lr_0 parties: - role: guest party_id: - '10000' - role: host party_id: - '9999' component_ref: sshe_lr dependent_tasks:
      • psi_0 parameters: output_cv_data: true batch_size: 256 early_stop: diff threshold: 0.5 epochs: 2 psi_0: stage: default inputs: data: input_data: task_output_artifact: output_artifact_key: output_data producer_task: reader_01 parties:
      • role: guest party_id:
        • '10000'
      • role: host party_id:
        • '9999' component_ref: psi dependent_tasks:
      • reader_01 parameters: {} reader_01: stage: default parties:
      • role: guest party_id:
        • '10000'
      • role: host party_id:
        • '9999' component_ref: reader ` 目前已经运行了超过五小时,使用的数据同样是 breast_hetero_guest/host 数据。 上述问题并非每次都会出现,有时也能正常运行完 coordinated_lr 的训练任务以及 sshe_lr 的预测任务。

ShaoYULi12 avatar Aug 01 '24 06:08 ShaoYULi12