Merlin
Merlin copied to clipboard
[BUG] Errors when importing SOK and Data loader
I am not able to import the Merlin Dataloader and SOK together in TensorFlow. Either import order (SOK first, then Dataloader, or Dataloader first, then SOK) throws an error (see thread).
Importing sok and then data loader
import os
import tensorflow as tf
import sparse_operation_kit as sok
os.environ["TF_GPU_ALLOCATOR"]="cuda_malloc_async" # fraction of free memory
BATCH_SIZE = 64000
os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, range(1)))
sok.Init(global_batch_size=BATCH_SIZE)
import nvtabular as nvt
from nvtabular.loader.tensorflow import KerasSequenceLoader, KerasSequenceValidater
from nvtabular.framework_utils.tensorflow import layers
File /usr/local/lib/python3.8/dist-packages/tensorflow/python/framework/config.py:874, in set_logical_device_configuration(device, logical_devices)
810 """Set the logical device configuration for a `tf.config.PhysicalDevice`.
811
812 A visible `tf.config.PhysicalDevice` will by default have a single
(...)
872 RuntimeError: Runtime is already initialized.
873 """
--> 874 context.context().set_logical_device_configuration(device, logical_devices)
File /usr/local/lib/python3.8/dist-packages/tensorflow/python/eager/context.py:1601, in Context.set_logical_device_configuration(self, dev, virtual_devices)
1600 if self._context_handle is not None:
-> 1601 raise RuntimeError(
1602 "Virtual devices cannot be modified after being initialized")
1604 self._virtual_device_map[dev] = virtual_devices
RuntimeError: Virtual devices cannot be modified after being initialized
During handling of the above exception, another exception occurred:
TypeError Traceback (most recent call last)
Input In [1], in <cell line: 13>()
10 sok.Init(global_batch_size=BATCH_SIZE)
12 import nvtabular as nvt
---> 13 from nvtabular.loader.tensorflow import KerasSequenceLoader, KerasSequenceValidater
14 from nvtabular.framework_utils.tensorflow import layers
File /nvtabular/nvtabular/loader/tensorflow.py:28, in <module>
25 from nvtabular.loader.backend import DataLoader
26 from nvtabular.loader.tf_utils import configure_tensorflow, get_dataset_schema_from_feature_columns
---> 28 from_dlpack = configure_tensorflow()
29 LOG = logging.getLogger("nvtabular")
30 # tf import must happen after config to restrict memory use
File /nvtabular/nvtabular/loader/tf_utils.py:64, in configure_tensorflow(memory_allocation, device)
58 tf.config.experimental.set_virtual_device_configuration(
59 tf_devices[device],
60 [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=memory_allocation)],
61 )
62 except RuntimeError as e:
63 # Virtual devices must be set before GPUs have been initialized
---> 64 warnings.warn(e)
66 # versions using TF earlier than 2.3.0 need to use extension
67 # library for dlpack support to avoid memory leak issue
68 __TF_DLPACK_STABLE_VERSION = "2.3.0"
TypeError: expected string or bytes-like object
Importing Dataloader and then SOK
import os
import tensorflow as tf
import sparse_operation_kit as sok
os.environ["TF_GPU_ALLOCATOR"]="cuda_malloc_async" # fraction of free memory
import nvtabular as nvt
from nvtabular.loader.tensorflow import KerasSequenceLoader, KerasSequenceValidater
from nvtabular.framework_utils.tensorflow import layers
BATCH_SIZE = 64000
os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, range(1)))
sok.Init(global_batch_size=BATCH_SIZE)
---------------------------------------------------------------------------
AbortedError Traceback (most recent call last)
Input In [1], in <cell line: 15>()
13 BATCH_SIZE = 64000
14 os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, range(1)))
---> 15 sok.Init(global_batch_size=BATCH_SIZE)
File /usr/local/lib/python3.8/dist-packages/SparseOperationKit-1.1.2-py3.8-linux-x86_64.egg/sparse_operation_kit/core/initialize.py:237, in Init(**kwargs)
234 return _horovod_init(**kwargs)
235 else:
236 # horovod not imported
--> 237 return _one_device_init(**kwargs)
File /usr/local/lib/python3.8/dist-packages/SparseOperationKit-1.1.2-py3.8-linux-x86_64.egg/sparse_operation_kit/core/initialize.py:198, in Init.<locals>._one_device_init(**kwargs)
196 global_seed = kwargs.get("seed", None) or kit_lib.gen_random_seed()
197 visible_devices = _get_visible_devices()
--> 198 status = kit_lib.plugin_init(local_rank, 1, unique_id, global_seed, visible_devices,
199 global_batch_size=kwargs["global_batch_size"])
200 return status
File <string>:1455, in plugin_init(global_replica_id, num_replicas_in_sync, nccl_unique_id, global_seed, visible_devices, global_batch_size, name)
File /usr/local/lib/python3.8/dist-packages/tensorflow/python/framework/ops.py:7107, in raise_from_not_ok_status(e, name)
7105 def raise_from_not_ok_status(e, name):
7106 e.message += (" name: " + name if name is not None else "")
-> 7107 raise core._status_to_exception(e) from None
AbortedError: /workspace/build-env/sparse_operation_kit/kit_cc/kit_cc_infra/src/resources/cpu_resource.cc:47 Intra-process barrier blocking threads time out. [Op:PluginInit]
The HugeCTR team suggested that it could be related to having multiple GPUs and not using `tf.distribute.MirroredStrategy`. They shared an example:
import os
import tensorflow as tf
import sparse_operation_kit as sok
os.environ["TF_GPU_ALLOCATOR"]="cuda_malloc_async" # fraction of free memory
import nvtabular as nvt
from nvtabular.loader.tensorflow import KerasSequenceLoader, KerasSequenceValidater
from nvtabular.framework_utils.tensorflow import layers
BATCH_SIZE = 64000
os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, range(1)))
strategy = tf.distribute.MirroredStrategy()
with strategy.scope():
    sok.Init(global_batch_size=BATCH_SIZE)
@bschifferer , please triage this bug
I don't know if this bug is still valid - it is from April 6th. I haven't worked on SOK + dataloader since then. But if we want to provide both, then this is important.
@bschifferer , is this a P0 or P1 ?