FATE

100W500特征数据, 运行SSHE训练出错

Open · lvying0019 opened this issue 3 years ago • 1 comment
trafficstars

任务配置文件:

{
    "dsl_version": 2,
    "initiator": {
        "role": "guest",
        "party_id": 9999
    },
    "role": {
        "host": [
            10000
        ],
        "guest": [
            9999
        ]
    },
    "component_parameters": {
        "role": {
            "guest": {
                "0": {
                    "data_transform_0": {
                        "with_label": true
                    },
                    "reader_0": {
                        "table": {
                            "name": "test_guest_100w_500_hetero",
                            "namespace": "test"
                        }
                    }
                }
            },
            "host": {
                "0": {
                    "data_transform_0": {
                        "with_label": false
                    },
                    "reader_0": {
                        "table": {
                            "name": "test_host_100w_500_hetero",
                            "namespace": "test"
                        }
                    }
                }
            }
        },
        "job_parameters":{
            "common":{
                "task_cores":16,
                "timeout":604800
            }
        },
        "common": {
            "data_transform_0": {
                "output_format": "dense"
            },
            "hetero_sshe_lr_0": {
                "penalty": "L2",
                "tol": 0.0001,
                "alpha": 0.01,
                "optimizer": "rmsprop",
                "batch_size": -1,
                "learning_rate": 0.1,
                "init_param": {
                    "init_method": "zeros",
                    "fit_intercept": true
                },
                "max_iter": 3,
                "early_stop": "diff",
                "encrypt_param": {
                    "key_length": 1024
                },
                "reveal_strategy": "respectively",
                "reveal_every_iter": true
            },
            "evaluation_0": {
                "eval_type": "binary"
            }
        }
    }
}

报错信息

[WARNING] [2022-08-16 18:17:22,362] [202208161736486735430] [10665:140295538583360] - [mini_batch.get_batch_generator] [line:92]: As batch_size >= data size, all batch strategy will be disabled
[ERROR] [2022-08-16 18:50:29,284] [202208161736486735430] [10665:140295538583360] - [task_executor._run_] [line:243]: ('Failed to call command: CommandURI(_uri=v1/egg-pair/runTask) to endpoint: nodemanager:43175, caused by: ', <_Rendezvous of RPC that terminated with:
 status = StatusCode.UNKNOWN
 details = "Exception calling application: 
==== detail start, at 20220816.185029.205 ====
Traceback (most recent call last):
  File "/data/projects/fate/eggroll/python/eggroll/core/pair_store/lmdb.py", line 244, in put
    ret = self.txn.put(k, v)
lmdb.MapFullError: mdb_put: MDB_MAP_FULL: Environment mapsize limit reached
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
  File "/data/projects/fate/eggroll/python/eggroll/roll_pair/egg_pair.py", line 131, in _run_unary
    func(rb, input_key_serdes, input_value_serdes, wb)
  File "/data/projects/fate/eggroll/python/eggroll/roll_pair/egg_pair.py", line 263, in map_values_wrapper
    output_writebatch.put(k_bytes, value_serdes.serialize(f(v)))
  File "/data/projects/fate/eggroll/python/eggroll/core/pair_store/lmdb.py", line 248, in put
    raise ValueError(f"put key={k}, value={v} raise Exception")
ValueError: put key=b'\x80\x03X \x00\x00\x005a4b25aaed25c2ee1b74de72dc03c14eq\x00.', value=b'\x80\x03cnumpy.core.multiarray\n_reconstruct\nq\x00cnumpy\nndarray\nq\x01K\x00\x85q\x02C\x01bq\x03\x87q\x04Rq\x05(K\x01M\xf5\x01\x85q\x06cnumpy\ndtype\nq\x07X\x02\x00\x00\x00O8q\x08K\x00K\x01\x87q\tRq\n(K\x03X\x01\x00\x00\x00|q\x0bNNNJ\xff\xff\xff\xffJ\xff\xff\xff\xffK?tq\x0cb\x89]q\r(cfederatedml.secureprotol.fixedpoint\nFixedPointNumber\nq\x0e)\x81q\x0f}q\x10(X\x01\x00\x00\x00nq\x11\x8a\x81\x8fo\t\x1e\xb5c_\xa2\xbb\x0f\xb2\xed|Lj?*\x96^\xceGS\x86\xc4\x83g\xd7\x053\x02\x01\x88\xad\xa3\xfb\x9dS\xbeJ\x8bL\xf9\x00V\xba\x93\x8bX\xcc\xe3RX\'zU\x86\xb4F\x19\xa76\xa1\x19\x1b\x19\xee\xca]\x8f\xc1W\x06\xf0\x8bE\xd1(\xc7/G\x12\xc2\xdb\x06\x06\x18\xe5"\x1530\x19\xd1#\n\xd5\xbc\xde\x1f<\xfc[\xf5\t} \x15\xcc=R\xde\xa9\\P\xe9w\xc2\xe6`\x8ah\x1a\r\xedZ\x02\xdf\x96\x00X\x07\x00\x00\x00max_intq\x12\x8a\x80\xc7\xb7\x04\x8f\xda\xb1/\xd1\xdd\x07\xd9v>&\xb5\x1f\x15K/\xe7\xa3)C\xe2\xc1\xb3\xeb\x82\x19\x81\x00\xc4\xd6\xd1\xfd\xce)_\xa5E\xa6|\x00+\xdd\xc9E,\xe6q)\xac\x13\xbd*CZ\xa3\x8cS\x9b\xd0\x8c\x8d\x0cw\xe5\xae\xc7\xe0+\x03\xf8\xc5\xa2h\x94\xe3\x97#\t\xe1m\x03\x03\x8cr\x91\x8a\x

lvying0019 avatar Aug 17 '22 00:08 lvying0019

Increase task_cores or the number of computing partitions to avoid this issue.

mgqa34 avatar Aug 17 '22 03:08 mgqa34

This issue has been marked as stale because it has been open for 365 days with no activity. If this issue is still relevant or if there is new information, please feel free to update or reopen it.

github-actions[bot] avatar Jul 09 '24 06:07 github-actions[bot]