FATE
FATE copied to clipboard
训练和预测过程中任务一直运行不结束
你好,我现在使用的是 FATE 2.1 版本。训练任务中使用的 DAG 文件内容如下(从 FATEBoard 中导出):
{ "schema_version":"2.0.0.alpha", "kind":"fate", "dag":{ "stage":"train", "party_tasks":{ "guest_9999":{ "parties":[ { "role":"guest", "party_id":[ "10000" ] } ], "conf":{}, "tasks":{ "reader_0":{ "conf":null, "parameters":{ "name":"1722474962438998", "namespace":"202408" } } } }, "host_10000":{ "parties":[ { "role":"host", "party_id":[ "9999" ] } ], "conf":{}, "tasks":{ "reader_0":{ "conf":null, "parameters":{ "name":"1722475205848520", "namespace":"202408" } } } } }, "parties":[ { "role":"guest", "party_id":[ "10000" ] }, { "role":"host", "party_id":[ "9999" ] }, { "role":"arbiter", "party_id":[ "10000" ] } ], "conf":{ "model_version":"0", "scheduler_party_id":"10000", "sync_type":"callback", "auto_retries":0, "inheritance":null, "model_warehouse":null, "model_id":"202408011032163817380", "priority":null, "initiator_party_id":"10000", "cores":null, "task":null, "computing_partitions":8, "extra":null }, "tasks":{ "lr_0":{ "outputs":null, "stage":null, "inputs":{ "data":{ "train_data":{ "task_output_artifact":{ "output_artifact_key":"output_data", "producer_task":"psi_0", "parties":[ { "role":"guest", "party_id":[ "10000" ] }, { "role":"host", "party_id":[ "9999" ] } ], "output_artifact_type_alias":null } } }, "model":{} }, "parties":null, "component_ref":"coordinated_lr", "conf":null, "dependent_tasks":[ "psi_0" ], "parameters":{ "output_cv_data":true, "batch_size":128, "early_stop":"diff", "threshold":0.5, "epochs":5 } }, "reader_0":{ "outputs":null, "stage":"default", "inputs":null, "parties":[ { "role":"guest", "party_id":[ "10000" ] }, { "role":"host", "party_id":[ "9999" ] } ], "component_ref":"reader", "conf":null, "dependent_tasks":null, "parameters":null }, "psi_0":{ "outputs":null, "stage":"default", "inputs":{ "data":{ "input_data":{ "task_output_artifact":{ "output_artifact_key":"output_data", "producer_task":"reader_0", "parties":null, "output_artifact_type_alias":null } } }, "model":null }, "parties":[ { "role":"guest", "party_id":[ "10000" ] }, { 
"role":"host", "party_id":[ "9999" ] } ], "component_ref":"psi", "conf":null, "dependent_tasks":[ "reader_0" ], "parameters":{} } } } }
但是训练任务一直卡在 epoch 4,已经运行了超过 4 小时。使用的是给出的 breast_hetero_guest/host 数据。
同样,在预测任务中也存在这个问题。使用训练完的 sshe_lr 模型做预测的 DAG 文件(从 FATEBoard 中导出)如下:
`schema_version: 2.0.0.alpha
kind: fate
dag:
stage: predict
party_tasks:
guest_9999:
parties:
- role: guest
party_id:
- '10000'
conf: {}
tasks:
reader_01:
parameters:
name: '1722477697252509'
namespace: '202408'
host_10000:
parties:
- role: host
party_id:
- '9999'
conf: {}
tasks:
reader_01:
parameters:
name: '1722475205848520'
namespace: '202408'
parties:
- role: guest
party_id:
- '10000'
- role: host
party_id:
- '9999'
conf:
model_version: '0'
scheduler_party_id: '10000'
sync_type: callback
auto_retries: 0
model_warehouse:
model_version: '0'
model_id: '202408010927308979450'
model_id: '202408011004199813420'
initiator_party_id: '10000'
computing_partitions: 8
tasks:
lr_0:
inputs:
data:
test_data:
task_output_artifact:
output_artifact_key: output_data
producer_task: psi_0
parties:
- role: guest
party_id:
- '10000'
- role: host
party_id:
- '9999'
model:
input_model:
model_warehouse:
output_artifact_key: output_model
producer_task: lr_0
parties:
- role: guest
party_id:
- '10000'
- role: host
party_id:
- '9999'
component_ref: sshe_lr
dependent_tasks:
- psi_0
parameters:
output_cv_data: true
batch_size: 256
early_stop: diff
threshold: 0.5
epochs: 2
psi_0:
stage: default
inputs:
data:
input_data:
task_output_artifact:
output_artifact_key: output_data
producer_task: reader_01
parties:
- role: guest
party_id:
- '10000'
- role: host
party_id:
- '9999'
component_ref: psi
dependent_tasks:
- reader_01
parameters: {}
reader_01:
stage: default
parties:
- role: guest
party_id:
- '10000'
- role: host
party_id:
- '9999'
component_ref: reader
`
目前已经运行了超过五小时,使用的数据同样是 breast_hetero_guest/host 数据。上述情况并不是每次都会出现,有时 coordinated_lr 的训练任务以及 sshe_lr 的预测任务也能正常运行完成。
- '9999'
conf:
model_version: '0'
scheduler_party_id: '10000'
sync_type: callback
auto_retries: 0
model_warehouse:
model_version: '0'
model_id: '202408010927308979450'
model_id: '202408011004199813420'
initiator_party_id: '10000'
computing_partitions: 8
tasks:
lr_0:
inputs:
data:
test_data:
task_output_artifact:
output_artifact_key: output_data
producer_task: psi_0
parties:
- role: guest
party_id:
- '10000'
- role: host
party_id:
- '9999'
model:
input_model:
model_warehouse:
output_artifact_key: output_model
producer_task: lr_0
parties:
- role: guest
party_id:
- '10000'
- role: host
party_id:
- '9999'
component_ref: sshe_lr
dependent_tasks: