dpctl dpctl has issues with multiprocessing

import dpctl
import dpctl.tensor as dpt

import multiprocessing
import os

def _exec(di):
    """Child-process body: create a small dpctl array and report it via ``di``.

    Prints the PID of the running process, creates a one-element array of
    ones on the ``"cpu"`` device, copies it into a second array of the same
    shape, and stores the result of ``dpt.asnumpy`` under the key ``"hey"``.

    Args:
        di: Mapping that receives the result. ``main`` passes a
            ``multiprocessing.Manager().dict()`` proxy.
    """
    print(os.getpid())

    x = dpt.ones(1, device="cpu")
    # Progress marker: printed only once the dpt.ones() call above returned.
    print(2)
    y = dpt.empty_like(x)
    y[...] = x
    di["hey"] = dpt.asnumpy(y)


def main():
    """Run ``_exec`` in a child process and wait up to 200 seconds for it.

    A managed dictionary is handed to the child so that it can publish its
    result; the manager is shut down when the ``with`` block exits.
    """
    print("starting")
    with multiprocessing.Manager() as manager:
        shared = manager.dict()
        worker = multiprocessing.Process(target=_exec, args=(shared,))
        worker.start()
        # Bounded wait: give up after 200 seconds if the child has not
        # finished by then.
        worker.join(200)

    print("Done")

if __name__ == "__main__":
    # Uncommenting the next line makes the parent process create a dpctl
    # array before main() starts the child process; the effect of doing so
    # is described in the text following this script.
    # dpt.ones(1)
    main()

Running this produces an output:

(triage_dpbench) opavlyk@opavlyk-mobl:~/tmp$ python run.py
starting
16819
2
Done
(triage_dpbench) opavlyk@opavlyk-mobl:~/tmp$

Uncommenting the line # dpt.ones(1) above the call to main() causes the script to hang (it looks like a deadlock).

If, in addition, x = dpt.ones(1, device="cpu") is replaced with x = dpt.ones(1), the script errors out with:

(triage_dpbench) opavlyk@opavlyk-mobl:~/tmp$ python run.py
starting
16969
Process Process-2:
Traceback (most recent call last):
  File "/home/opavlyk/miniconda3/envs/triage_dpbench/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/home/opavlyk/miniconda3/envs/triage_dpbench/lib/python3.9/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/opavlyk/tmp/run.py", line 10, in _exec
    x = dpt.ones(1)
  File "/home/opavlyk/repos/dpctl/dpctl/tensor/_ctors.py", line 1006, in ones
    hev, _ = ti._full_usm_ndarray(1, res, sycl_queue)
RuntimeError: Native API failed. Native API returns: -997 (The plugin has emitted a backend specific error) -997 (The plugin has emitted a backend specific error)
Done
(triage_dpbench) opavlyk@opavlyk-mobl:~/tmp$

Thank you to @ZzEeKkAa for reporting a related issue which was distilled into this reproducer

Apr 11 '23 21:04 oleksandr-pavlyk

Actually, this reproducer does not need dpctl.tensor at all. The following script:

# run.py
import dpctl
import dpctl.memory as dpm

import multiprocessing
import os

def _exec(di):
    """Child-process body: allocate and fill USM memory without dpctl.tensor.

    Prints the PID of the running process, selects the default device, builds
    a queue from that device and its platform's default context, creates a
    ``MemoryUSMDevice`` object of size 40 on that queue and calls ``memset``
    on it, then stores a fixed list under the key ``"hey"``.

    Args:
        di: Mapping that receives the result. ``main`` passes a
            ``multiprocessing.Manager().dict()`` proxy.
    """
    print(os.getpid())

    d = dpctl.select_default_device()
    c = d.sycl_platform.default_context
    q = dpctl.SyclQueue(c, d)
    x = dpm.MemoryUSMDevice(40, queue=q)
    x.memset(1)
    # Progress marker: printed only once the calls above have returned.
    print(2)
    di["hey"] = [1,2,3]


def main():
    """Run ``_exec`` in a child process and wait up to 200 seconds for it.

    A managed dictionary is handed to the child so that it can publish its
    result; the manager is shut down when the ``with`` block exits.
    """
    print("starting")
    with multiprocessing.Manager() as manager:
        shared = manager.dict()
        worker = multiprocessing.Process(target=_exec, args=(shared,))
        worker.start()
        # Bounded wait: give up after 200 seconds if the child has not
        # finished by then.
        worker.join(200)

    print("Done")

if __name__ == "__main__":
    # Run the same device / context / queue / USM-memory sequence as _exec()
    # in the parent process first, and only then start the child via main().
    d = dpctl.select_default_device()
    c = d.sycl_platform.default_context
    q = dpctl.SyclQueue(c, d)
    x = dpm.MemoryUSMDevice(40, queue=q)
    x.memset(1)
    main()

fails just the same:

(triage_dpbench) opavlyk@opavlyk-mobl:~/tmp/zzeekkaa$ python run.py
starting
22081
die: [L0] getZeQueue: failed to create queue
libc++abi: terminating
Done

My hunch is that DPC++ runtime global states do not like to be forked.

Acting on that hunch, I whipped up a C++ snippet building on a tutorial from geeks-for-geeks:

// forking.cpp
#include <sycl/sycl.hpp>
#include <unistd.h>
#include <iostream>

// Reproducer: use a SYCL queue in the process, call fork(), and then create
// and use a fresh SYCL queue in both the parent and the child.
// Note that the parent does not wait for the child in this version.
int main(void) {
  sycl::queue q0 { sycl::default_selector_v };

  // Allocate, fill and free device memory once before forking.
  int *data_main = sycl::malloc_device<int>(10, q0);
  q0.fill<int>(data_main, int(1), 10).wait();
  sycl::free(data_main, q0);

  std::cout << "PID before forking " << getpid() << std::endl;

  pid_t c_pid = fork();

  if (c_pid == -1) {
    std::cout << "Fork failed" << std::endl;
    std::terminate();
  } else if (c_pid > 0) { // parent process
      std::cout << "Parent PID after forking " << getpid() << std::endl;
      // Same allocate / fill / free sequence, on a queue created after fork().
      sycl::queue q1 { sycl::default_selector_v };

      int *data1 = sycl::malloc_device<int>(10, q1);
      q1.fill<int>(data1, int(1), 10).wait();
      sycl::free(data1, q1);

  } else { // child process
      // NOTE(review): the message below says "Parent" although this branch
      // runs in the child; the PID it prints is the child's.
      std::cout << "Parent PID after forking " << getpid() << std::endl;
      // Same allocate / fill / free sequence, on a queue created in the child.
      sycl::queue q2 {sycl::default_selector_v};

      int *data2 = sycl::malloc_device<int>(10, q2);
      q2.fill<int>(data2, int(1), 10).wait();
      sycl::free(data2, q2);

  }
  return 0;
}

Compiled this code with icpx -fsycl forking.cpp -o forking. Now, lo and behold, the same behavior emerges:

(triage_dpbench) opavlyk@opavlyk-mobl:~/tmp$ ./forking
PID before forking 22478
Parent PID after forking 22478
Parent PID after forking 22479
die: [L0] getZeQueue: failed to create queue
libc++abi: terminating

Apr 12 '23 11:04 oleksandr-pavlyk

Perhaps a better (proper forking) C++ reproducer:

// forking.cpp
#include <sycl/sycl.hpp>
#include <unistd.h>
#include <sys/wait.h>
#include <iostream>

// Reproducer: use a SYCL queue in the process, call fork(), and then use a
// fresh SYCL queue in the child only. The parent just waits for the child and
// prints the status value filled in by waitpid().
int main(void) {
  sycl::queue q0 { sycl::default_selector_v };

  // Allocate, fill and free device memory once before forking.
  int *data_main = sycl::malloc_device<int>(10, q0);
  q0.fill<int>(data_main, int(1), 10).wait();
  sycl::free(data_main, q0);

  std::cout << "PID before forking " << getpid() << std::endl;

  pid_t c_pid = fork();

  if (c_pid < 0) {
    std::cout << "Fork failed" << std::endl;
    std::terminate();
  } else if (c_pid > 0) { // parent process
    int status = 0;
    // just wait for the child process to do its thing
    (void) waitpid(c_pid, &status, 0);
    // The value printed is the raw wait status as written by waitpid(); it is
    // not decoded with WIFEXITED / WEXITSTATUS / WIFSIGNALED here.
    std::cout << "Status retuned by child process: " << status << std::endl;
  } else { // child process
      std::cout << "Child PID after forking " << getpid() << std::endl;
      sycl::queue q2 {sycl::default_selector_v};

      // Exceptions derived from std::exception are reported on stderr and
      // turned into a non-zero return value from the child.
      try {
        int *data2 = sycl::malloc_device<int>(10, q2);
        q2.fill<int>(data2, int(1), 10).wait();
        sycl::free(data2, q2);
      } catch (const std::exception &e) {
        std::cerr << "Caught exception: " << e.what() << std::endl;
        return -1;
      }

      return 0; // finish the child process
  }

  // Reached only by the parent: the child returns from main() above.
  std::cout << "From parent process after joining" << std::endl;
  return 0;
}

Running:

$ icpx -fsycl forking.cpp -o forking && ./forking
PID before forking 19329
Child PID after forking 19330
die: [L0] getZeQueue: failed to create queue
libc++abi: terminating
Status retuned by child process: 6
From parent process after joining

Interestingly, after replacing sycl::default_selector_v with sycl::cpu_selector_v in the child process branch, the executable runs fine.

Using sycl::ext::oneapi::filter_selector to select "opencl:gpu" device runs into problems:

PID before forking 20103
Child PID after forking 20104
Caught exception: The program was built for 1 devices
Build program log for 'Intel(R) Graphics [0x9a49]':
 -6 (PI_ERROR_OUT_OF_HOST_MEMORY)
Status retuned by child process: 65280
From parent process after joining

I suppose this finding hints that presently dpctl, like any other SYCL application, is not compatible with Unix forking, and hence dpctl should be used in processes spawned (rather than forked) by the multiprocessing module.

Apr 12 '23 15:04 oleksandr-pavlyk

Another useful variant, more closely related to the error seen in dpbench:

// $ cat forking.cpp
#include <sycl/sycl.hpp>
#include <unistd.h>
#include <sys/wait.h>
#include <iostream>

// Callable device selector that wraps sycl::ext::oneapi::filter_selector:
// it is constructed from a filter string and forwards every call to the
// wrapped selector.
// NOTE(review): main() in this listing does not reference this class, and
// REJECT_DEVICE is not used inside the class itself.
class my_filter_selector
{
 public:
  static constexpr int REJECT_DEVICE = -1;
  // Builds the wrapped filter_selector from the filter string `fs`.
  my_filter_selector(const std::string &fs) : _impl(fs) {}
  // Returns whatever the wrapped selector returns for device `d`.
  int operator()(const sycl::device &d) const { return _impl(d); };

 private:
  sycl::ext::oneapi::filter_selector _impl;
};


// Reproducer: use a SYCL queue in the process, call fork(), and then attempt
// a USM-host allocation on a fresh queue in the child. The parent just waits
// for the child and prints the status value filled in by waitpid().
int main(void) {
  sycl::queue q0 { sycl::default_selector_v };

  // Allocate, fill and free device memory once before forking.
  int *data_main = sycl::malloc_device<int>(10, q0);
  q0.fill<int>(data_main, int(1), 10).wait();
  sycl::free(data_main, q0);

  std::cout << "PID before forking " << getpid() << std::endl;

  pid_t c_pid = fork();

  if (c_pid < 0) {
    std::cout << "Fork failed" << std::endl;
    std::terminate();
  } else if (c_pid > 0) { // parent process
    int status = 0;
    // just wait for the child process to do its thing
    (void) waitpid(c_pid, &status, 0);
    // The value printed is the raw wait status as written by waitpid(); it is
    // not decoded with WIFEXITED / WEXITSTATUS / WIFSIGNALED here.
    std::cout << "Status retuned by child process: " << status << std::endl;
  } else { // child process
      std::cout << "Child PID after forking " << getpid() << std::endl;
      sycl::queue q2 { sycl::default_selector_v };

      try {
        // Host USM allocation in the child; a null result is converted into
        // an exception so that it is reported by the handler below.
        int *data2 = sycl::malloc_host<int>(10, q2);
        if (data2 == nullptr) {
          throw std::runtime_error("Failed USM-host allocation");
        }
        q2.fill<int>(data2, int(1), 10).wait();
        sycl::free(data2, q2);
      } catch (const std::exception &e) {
        std::cerr << "Caught exception: " << e.what() << std::endl;
        return -1;
      }

      return 0; // finish the child process
  }

  // Reached only by the parent: the child returns from main() above.
  std::cout << "From parent process after joining" << std::endl;
  return 0;
}

(triage_dpbench) opavlyk@opavlyk-mobl:~/tmp/zzeekkaa$ icpx -fsycl forking.cpp -o forking
(triage_dpbench) opavlyk@opavlyk-mobl:~/tmp/zzeekkaa$ ./forking
PID before forking 20198
Child PID after forking 20199
Caught exception: Failed USM-host allocation
Status retuned by child process: 65280
From parent process after joining

Apr 12 '23 15:04 oleksandr-pavlyk

Likely related to this defect reported in dpbench.

Apr 12 '23 16:04 adarshyoga