Async transfers to host for multiple salts
Here's an idea on how we can speed up some OpenCL formats when working with multiple salts:
crypt_all is passed a pointer to the current salt in the database. It can also access salt->next->salt for the likely next call's salt (except in self-test, benchmark, and "single crack" modes). When salt->next is non-NULL, before starting (edit: or waiting for completion of) the transfer of computed hashes from device to host, crypt_all may set the next salt and invoke the kernel for that one (with the set of candidate passwords previously transferred to the device). crypt_all will also skip the kernel invocation for the current salt if it's already been invoked by the previous call and the salt was predicted correctly (e.g., set_salt may reset a flag to indicate occasional misprediction - which should never happen with the usual cracking modes). That way, for most salts (all but the first one in the salts list) transfers from device to host will overlap in time with computation of hashes for the next salt.
In order to allow for such overlapping, we need two (sets of) output buffers (both on device and on host), pointers to which will be swapped before each kernel invocation. The rest of the format's functions such as cmp_* and get_hash_* will read from the buffer that was used for the current salt (not the next salt).
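In rough pseudo-C, the intended crypt_all flow would be something like this (a sketch only, error checking omitted; mem_outs, kernel_invoked and bench_or_test_running are the names used in the PoC below, and local_work_size stands in for the format's LWS):

/* Sketch of the intended flow: double-buffered output plus a speculative
 * kernel for the predicted next salt. */
cl_event read_done;

if (new_keys || !kernel_invoked)            /* no correctly-predicted kernel in flight */
	clEnqueueNDRangeKernel(queue[gpu_id], crypt_kernel, 1, NULL,
	                       &global_work_size, &local_work_size, 0, NULL, NULL);

clEnqueueReadBuffer(queue[gpu_id], mem_out, CL_FALSE, 0, outsize, outbuffer,
                    0, NULL, &read_done);   /* non-blocking transfer of current results */

kernel_invoked = 0;
if (salt->next && !bench_or_test_running) {
	set_salt(salt->next->salt);         /* predict the next call's salt */
	mem_out = (mem_out == mem_outs[0]) ? mem_outs[1] : mem_outs[0];
	clSetKernelArg(crypt_kernel, 1, sizeof(mem_out), &mem_out);
	clEnqueueNDRangeKernel(queue[gpu_id], crypt_kernel, 1, NULL,
	                       &global_work_size, &local_work_size, 0, NULL, NULL);
	kernel_invoked = 1;                 /* this overlaps the read above */
}

clWaitForEvents(1, &read_done);             /* wait for the transfer only */
clReleaseEvent(read_done);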
Quick and dirty PoC (could very well be wrong), which somehow doesn't improve speeds in my testing:
+++ b/src/opencl_md5crypt_fmt_plug.c
@@ -82,7 +82,8 @@ static crypt_md5_salt host_salt; /** salt **/
//OpenCL variables:
static cl_mem mem_in, mem_out, pinned_in, pinned_out, mem_salt;
-static int new_keys;
+static cl_mem mem_outs[2];
+static int new_keys, kernel_invoked;
static struct fmt_main *self;
#define insize (sizeof(crypt_md5_password) * global_work_size)
@@ -201,8 +202,11 @@ static void create_clobj(size_t gws, struct fmt_main *self)
pinned_out = clCreateBuffer(context[gpu_id], CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, out_size, NULL, &ret_code);
HANDLE_CLERROR(ret_code, "Error while allocating pinned memory for hashes");
- mem_out = clCreateBuffer(context[gpu_id], CL_MEM_WRITE_ONLY, out_size, NULL, &ret_code);
- HANDLE_CLERROR(ret_code, "Error while allocating GPU memory for hashes");
+ int i;
+ for (i = 0; i < 2; i++) {
+ mem_out = mem_outs[i] = clCreateBuffer(context[gpu_id], CL_MEM_WRITE_ONLY, out_size, NULL, &ret_code);
+ HANDLE_CLERROR(ret_code, "Error while allocating GPU memory for hashes");
+ }
outbuffer = clEnqueueMapBuffer(queue[gpu_id], pinned_out, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, out_size, 0, NULL, NULL, &ret_code);
HANDLE_CLERROR(ret_code, "Error mapping results buffer");
@@ -225,7 +229,10 @@ static void release_clobj(void)
HANDLE_CLERROR(clReleaseMemObject(mem_in), "Release mem_in");
HANDLE_CLERROR(clReleaseMemObject(mem_salt), "Release mem_salt");
HANDLE_CLERROR(clReleaseMemObject(pinned_out), "Release pinned_out");
- HANDLE_CLERROR(clReleaseMemObject(mem_out), "Release mem_out");
+ int i;
+ for (i = 0; i < 2; i++) {
+ HANDLE_CLERROR(clReleaseMemObject(mem_outs[i]), "Release mem_out");
+ }
mem_out = NULL;
}
@@ -277,11 +284,14 @@ static void set_salt(void *salt)
uint8_t len;
for (len = 0; len < 8 && s[len]; len++);
+ if (kernel_invoked && len == host_salt.saltlen && !memcmp(salt, host_salt.salt, len))
+ return;
host_salt.saltlen = len;
memcpy(host_salt.salt, s, host_salt.saltlen);
host_salt.prefix = s[8];
HANDLE_CLERROR(clEnqueueWriteBuffer(queue[gpu_id], mem_salt, CL_FALSE, 0, saltsize, &host_salt, 0, NULL, NULL), "Copy memsalt");
+ kernel_invoked = 0; /* Even if invoked, it's for wrong salt */
}
static void *get_salt(char *ciphertext)
@@ -365,13 +375,31 @@ static int crypt_all(int *pcount, struct db_salt *salt)
"Copy memin");
///Run kernel
- BENCH_CLERROR(clEnqueueNDRangeKernel(queue[gpu_id], crypt_kernel, 1,
- NULL, &global_work_size, lws, 0, NULL, multi_profilingEvent[1]),
- "Set ND range");
+ if (new_keys || !kernel_invoked) {
+ BENCH_CLERROR(clEnqueueNDRangeKernel(queue[gpu_id], crypt_kernel, 1,
+ NULL, &global_work_size, lws, 0, NULL, multi_profilingEvent[1]),
+ "Set ND range");
+ }
+
BENCH_CLERROR(clEnqueueReadBuffer(queue[gpu_id], mem_out, CL_FALSE,
0, outsize, outbuffer, 0, NULL, multi_profilingEvent[2]),
"Copy data back");
+ kernel_invoked = 0;
+ if (salt->next && !bench_or_test_running) {
+ set_salt(salt->next->salt);
+ if (mem_out == mem_outs[0]) {
+ mem_out = mem_outs[1];
+ } else {
+ mem_out = mem_outs[0];
+ }
+ HANDLE_CLERROR(clSetKernelArg(crypt_kernel, 1, sizeof(mem_out), &mem_out), "Error while setting mem_out kernel argument");
+ BENCH_CLERROR(clEnqueueNDRangeKernel(queue[gpu_id], crypt_kernel, 1,
+ NULL, &global_work_size, lws, 0, NULL, multi_profilingEvent[1]),
+ "Set ND range");
+ kernel_invoked = 1;
+ }
+
///Await completion of all the above
BENCH_CLERROR(clFinish(queue[gpu_id]), "clFinish error");
somehow doesn't improve speeds in my testing
Upon more testing, there's a ~3% speedup with these changes on Vega 64 with LWS=64 GWS=32768 (which autotune doesn't always reach, sometimes staying at the slower GWS=16384), but somehow no obvious performance difference (some runs a little bit slower, some a little bit faster) on NVIDIA.
Skipping the async transfers when !bench_or_test_running means this speedup is reflected neither in "Many salts" benchmarks nor in autotuning, which is wrong. Maybe we need a way to predict the next salt in benchmarks as well - e.g., just have the format assume that two salts are being alternated when in benchmark? Unfortunately, this would complicate the formats. Introduce some shared function predict_next_salt that would be passed the salt pointer from crypt_all, but would also be benchmark-aware?
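Something like this, perhaps (entirely hypothetical - no such function exists yet, and the two-alternating-salts assumption for benchmarks is just that, an assumption):

/* Hypothetical shared helper: return the salt a format may speculatively start
 * computing, or NULL if there is no useful prediction. In benchmark/self-test,
 * assume the two benchmark salts simply alternate. */
static struct db_salt *predict_next_salt(struct db_salt *salt)
{
	if (bench_or_test_running) {
		static struct db_salt *prev;
		struct db_salt *guess = (prev && prev != salt) ? prev : NULL;
		prev = salt;
		return guess;   /* the other of the two alternating salts, if known */
	}
	return salt->next;      /* real cracking: the database order is known */
}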
I've just simplified the PoC patch (edited the comment above): there was no need to use 2 host-side buffers, it was sufficient to have 2 device-side buffers. This change appears to have improved speeds on Vega 64 a little bit further, but there's still no obvious speedup on NVIDIA.
Thinking that maybe invoking clEnqueueReadBuffer takes a while or sometimes blocks, I've also tried the additional change below. However, even with it there's still no obvious speedup on NVIDIA. What's more puzzling is that with this change, along with the !bench_or_test_running check dropped, I am getting self-test failures on Vega 64 (but not on NVIDIA) at varying cmp_all indices. This isn't happening without this additional change (self-tests pass on all GPUs even with the !bench_or_test_running check dropped), yet actual runs on Vega 64 crack all expected passwords either way (I tried cracking ~500 passwords out of 3107 with incremental mode - the results match what's seen prior to all of these patches).
+++ b/src/opencl_md5crypt_fmt_plug.c
@@ -381,9 +381,7 @@
"Set ND range");
}
- BENCH_CLERROR(clEnqueueReadBuffer(queue[gpu_id], mem_out, CL_FALSE,
- 0, outsize, outbuffer, 0, NULL, multi_profilingEvent[2]),
- "Copy data back");
+ cl_mem prev_out = mem_out;
kernel_invoked = 0;
if (salt->next && !bench_or_test_running) {
@@ -400,6 +398,10 @@
kernel_invoked = 1;
}
+ BENCH_CLERROR(clEnqueueReadBuffer(queue[gpu_id], prev_out, CL_FALSE,
+ 0, outsize, outbuffer, 0, NULL, multi_profilingEvent[2]),
+ "Copy data back");
+
///Await completion of all the above
BENCH_CLERROR(clFinish(queue[gpu_id]), "clFinish error");
Instead of clFinish, ideally we'd wait for completion of only the transfer before returning from crypt_all, but for completion of the kernel before starting the next call's transfer. How to do that? Use clWaitForEvents? I guess clFinish waits for both the transfer and the kernel to complete? (I'm still not good at OpenCL APIs.) That way, the cmp_all call or bitmap/hash table lookups in cracker.c can proceed in parallel with the next salt's kernel still running, rather than only after it has completed (which is the case with the patches I posted here).
Added the below. Now this finally improves the reported GPU utilization on GTX 1080 from 98% to 100% with a corresponding speedup. However, this change also results in intermittent self-test failures on Vega 64 if I drop the !bench_or_test_running check.
+++ b/src/opencl_md5crypt_fmt_plug.c
@@ -381,8 +381,12 @@
"Set ND range");
}
+ cl_event event, *eventp = &event;
+ if (multi_profilingEvent[2])
+ eventp = multi_profilingEvent[2];
+
BENCH_CLERROR(clEnqueueReadBuffer(queue[gpu_id], mem_out, CL_FALSE,
- 0, outsize, outbuffer, 0, NULL, multi_profilingEvent[2]),
+ 0, outsize, outbuffer, 0, NULL, eventp),
"Copy data back");
kernel_invoked = 0;
@@ -400,8 +404,7 @@
kernel_invoked = 1;
}
- ///Await completion of all the above
- BENCH_CLERROR(clFinish(queue[gpu_id]), "clFinish error");
+ BENCH_CLERROR(clWaitForEvents(1, eventp), "clWaitForEvents");
new_keys = 0;
return count;
Perhaps use self_test_running and/or bench_running instead of the "any of them" flag? BTW we also have autotune_running IIRC
We probably also need this:
if (eventp == &event)
	clReleaseEvent(event);
magnum, OK I can use just bench_running || autotune_running (assuming your memory is correct), so that the self-test always runs using this salt-predicting code (even if the predictions are wrong) and thus this code is better tested. This would save me the need to edit the source file for such tests. However, it wouldn't solve the mystery of the self-test intermittently failing on Vega 64 with some of these changes. I suspect the erroneous behavior can also be reproduced in actual cracking, even though I haven't yet. It's as if on AMD the wait for completion of the transfer sometimes ends before the transfer is actually complete, or something like that.
I am also thinking of adjusting our benchmark code so that the salt->next pointers are correct (point to each other for the two salts, except maybe every 256th iteration when we'd reset next to NULL so that we'd benchmark for 256 salts rather than an infinite number of salts).
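Roughly like this (a sketch only, not actual bench.c code - bench_done(), salt_a/salt_b, count and the surrounding variables are stand-ins):

/* Sketch: link the two fake benchmark salts to each other so salt->next-based
 * prediction works, but break the chain every 256th call so the run still
 * behaves like "many salts" rather than an infinite salt list. */
struct db_salt *cur = &salt_a, *other = &salt_b;   /* the two benchmark salts */
for (i = 0; !bench_done(); i++) {
	cur->next = (i % 256 == 255) ? NULL : other;
	format->methods.set_salt(cur->salt);
	format->methods.crypt_all(&count, cur);
	/* ... cmp_all(), statistics, etc. ... */
	struct db_salt *tmp = cur; cur = other; other = tmp;
}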
the mystery of the self-test intermittently failing on Vega 64 with some of these changes.
I think I figured it out. When in self-test, next salt is often mispredicted. In that case, my code was starting the same kernel with the same output buffer for the new salt, but without waiting for the previous kernel invocation to complete. Adding a clFinish to the start of crypt_all seems to have fixed that.
Instead of clFinish, ideally we'd wait for completion of only the transfer before returning from crypt_all, but for completion of the kernel before starting the next call's transfer. How to do that? Use clWaitForEvents? I guess clFinish waits for both the transfer and the kernel to complete? (I'm still not good at OpenCL APIs.)
- If you keep clFinish, formats won't be asynchronous: clFinish blocks until all previously queued OpenCL commands [...] are issued to the [...] device and have completed.
At least, you should use blocking read and write (and no clFinish):
- blocking_read: Indicates if the read operations are blocking or non-blocking. If blocking_read is CL_TRUE, i.e. the read command is blocking, clEnqueueReadBuffer does not return until the buffer data has been read and copied into memory pointed to by ptr.
- blocking_write: Indicates if the write operations are blocking or non-blocking. If blocking_write is CL_TRUE, the OpenCL implementation copies the data referred to by ptr and enqueues the write operation in the command-queue. The memory pointed to by ptr can be reused by the application after the clEnqueueWriteBuffer call returns.
A proper blocking read before cmp_* is a possible solution. But, some event handling to get the status of clEnqueueNDRangeKernel seems needed, at least for debug. Well, in fact, we can't use blocking reads/writes everywhere. We need to start running the kernel as soon as possible for "(semi)fast" formats.
So, a general solution has to be 100% asynchronous.
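For reference, the blocking-read variant would be just this (a sketch reusing the names from the patches above; on a single in-order queue the read cannot complete before the kernel that wrote mem_out):

BENCH_CLERROR(clEnqueueNDRangeKernel(queue[gpu_id], crypt_kernel, 1, NULL,
	&global_work_size, &local_work_size, 0, NULL, NULL), "Run kernel");
/* Blocking read replaces the clFinish: it returns only once the data is in
 * outbuffer, which on an in-order queue also implies the kernel has completed. */
BENCH_CLERROR(clEnqueueReadBuffer(queue[gpu_id], mem_out, CL_TRUE, 0, outsize,
	outbuffer, 0, NULL, NULL), "Copy data back");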
If you have a notebook for testing, you can use codeXL (or the NVIDIA tool) to get some numbers as in https://stackoverflow.com/questions/42315537/amd-opencl-asynchronous-execution-efficency. Maybe the post/result is confusing, but it is clear that clFinish is slow.
For example, I always wondered if we should use more queues.
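For what it's worth, with two in-order queues the transfer can be made to depend only on the kernel that produced its buffer, via an event. A rough, untested sketch (queue_compute/queue_copy are made-up names):

/* Sketch: kernels on queue_compute, result copies on queue_copy. The read
 * waits only on kernel_done, so the next salt's kernel enqueued on
 * queue_compute can overlap the copy without any clFinish. */
cl_event kernel_done, read_done;

clEnqueueNDRangeKernel(queue_compute, crypt_kernel, 1, NULL,
                       &global_work_size, &local_work_size, 0, NULL, &kernel_done);
clFlush(queue_compute);                         /* make sure it is actually issued */

clEnqueueReadBuffer(queue_copy, mem_out, CL_FALSE, 0, outsize, outbuffer,
                    1, &kernel_done, &read_done);
clFlush(queue_copy);

/* ... enqueue the next salt's kernel on queue_compute here ... */

clWaitForEvents(1, &read_done);                 /* block only on the transfer */
clReleaseEvent(kernel_done);
clReleaseEvent(read_done);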
Thanks, Claudio.
If you keep clFinish, formats won't be asynchronous
It's not that simple. In the code I posted, clFinish was invoked after the clEnqueueReadBuffer of the previous kernel invocation's results and a new clEnqueueNDRangeKernel, so the transfer and computation could proceed in parallel despite the clFinish. However, further processing on the host, such as cmp_all, proceeded with the device idle. I solved that drawback by starting to use clWaitForEvents.
Here's my current code. There's still room for improvement, and this will hurt "single crack" mode (need to check for it).
@@ -277,10 +284,18 @@ static void set_salt(void *salt)
uint8_t len;
for (len = 0; len < 8 && s[len]; len++);
+ if (kernel_invoked && len == host_salt.saltlen && !memcmp(salt, host_salt.salt, len)) {
+ kernel_invoked = 2; /* yes, for correct salt */
+ return;
+ }
host_salt.saltlen = len;
memcpy(host_salt.salt, s, host_salt.saltlen);
host_salt.prefix = s[8];
+ if (kernel_invoked) { /* was for wrong salt */
+ HANDLE_CLERROR(clFinish(queue[gpu_id]), "clFinish error");
+ kernel_invoked = 0;
+ }
HANDLE_CLERROR(clEnqueueWriteBuffer(queue[gpu_id], mem_salt, CL_FALSE, 0, saltsize, &host_salt, 0, NULL, NULL), "Copy memsalt");
}
@@ -358,6 +373,11 @@ static int crypt_all(int *pcount, struct db_salt *salt)
global_work_size = GET_NEXT_MULTIPLE(count, local_work_size);
+ if (kernel_invoked) /* must finish before we replace the salt */
+ BENCH_CLERROR(clFinish(queue[gpu_id]), "clFinish error");
+ if (new_keys || kernel_invoked != 2) /* was for wrong keys or salt */
+ kernel_invoked = 0;
+
///Copy data to GPU memory
if (new_keys)
BENCH_CLERROR(clEnqueueWriteBuffer(queue[gpu_id], mem_in, CL_FALSE,
@@ -365,15 +385,40 @@ static int crypt_all(int *pcount, struct db_salt *salt)
"Copy memin");
///Run kernel
- BENCH_CLERROR(clEnqueueNDRangeKernel(queue[gpu_id], crypt_kernel, 1,
- NULL, &global_work_size, lws, 0, NULL, multi_profilingEvent[1]),
- "Set ND range");
+ if (!kernel_invoked) {
+ BENCH_CLERROR(clEnqueueNDRangeKernel(queue[gpu_id], crypt_kernel, 1,
+ NULL, &global_work_size, lws, 0, NULL, multi_profilingEvent[1]),
+ "Set ND range");
+ if (salt->next && (!benchmark_running || self_test_running)) /* must finish before we replace the salt */
+ BENCH_CLERROR(clFinish(queue[gpu_id]), "clFinish error");
+ }
+
+ cl_event event, *eventp = &event;
+ if (multi_profilingEvent[2])
+ eventp = multi_profilingEvent[2];
+
BENCH_CLERROR(clEnqueueReadBuffer(queue[gpu_id], mem_out, CL_FALSE,
- 0, outsize, outbuffer, 0, NULL, multi_profilingEvent[2]),
+ 0, outsize, outbuffer, 0, NULL, eventp),
"Copy data back");
- ///Await completion of all the above
- BENCH_CLERROR(clFinish(queue[gpu_id]), "clFinish error");
+ kernel_invoked = 0;
+ if (salt->next && (!benchmark_running || self_test_running)) { /* not for benchmark, but for cracking or self-test */
+ set_salt(salt->next->salt);
+ if (mem_out == mem_outs[0]) {
+ mem_out = mem_outs[1];
+ } else {
+ mem_out = mem_outs[0];
+ }
+ BENCH_CLERROR(clSetKernelArg(crypt_kernel, 1, sizeof(mem_out), &mem_out), "Error while setting mem_out kernel argument");
+ BENCH_CLERROR(clEnqueueNDRangeKernel(queue[gpu_id], crypt_kernel, 1,
+ NULL, &global_work_size, lws, 0, NULL, multi_profilingEvent[1]),
+ "Set ND range");
+ kernel_invoked = 1;
+ }
+
+ BENCH_CLERROR(clWaitForEvents(1, eventp), "clWaitForEvents");
+ if (eventp == &event)
+ BENCH_CLERROR(clReleaseEvent(event), "clReleaseEvent");
new_keys = 0;
return count;
I think we'll want shared code like this, which formats will invoke for their kernels. Ideally we shouldn't duplicate this complexity in each OpenCL format where it's relevant (supporting multiple salts, commonly used on many salts, fast enough that the non-perfect device utilization has significant effect on overall performance prior to this enhancement).
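Maybe something along these lines as a starting point (purely a rough interface sketch - all of the names here are made up, nothing like this exists yet):

/* Hypothetical shared helper in the common OpenCL code: a format describes its
 * kernel and double-buffered output once, and the helper handles the
 * speculative next-salt kernel, buffer swapping and the event-based wait. */
typedef struct {
	cl_kernel kernel;       /* the format's crypt kernel */
	cl_mem out[2];          /* double-buffered device-side output */
	int out_arg;            /* kernel argument index of the output buffer */
	int active;             /* which of out[] holds the current salt's results */
	int speculated;         /* a predicted-salt kernel is already in flight */
} ocl_next_salt_ctx;

/* Run (or reuse) the kernel for the current salt, start the async readback into
 * host_out, speculatively launch for salt->next when sensible, and return only
 * when host_out is valid. set_salt is the format's own set_salt(). */
int opencl_crypt_with_next_salt(ocl_next_salt_ctx *ctx, struct db_salt *salt,
                                void *host_out, size_t out_size,
                                void (*set_salt)(void *));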
It's not that simple. In the code I posted, clFinish was invoked after the clEnqueueReadBuffer of the previous kernel invocation's results and a new clEnqueueNDRangeKernel, so the transfer and computation could proceed in parallel despite the clFinish.
We already have data transfers on set_salt() and set_key() in some formats. So, we already have transfer and computation in parallel (of the same set of data, not for two different sets).
- Once there is available data, the kernel can start to run, per OpenCL design.
- we use clFlush(), not clFinish(), since clFinish() blocks execution.
If it is a good solution for md5crypt, that is nice. But it is not a general solution. I would like to have a way to send a signal to the JtR core saying "generate more keys", "crypt_all() has finished / run the cmp_*()", and so on.
Or, to keep using synchronous calls while avoiding stalls due to blocking, a multi-threaded run.
I never use clFinish. Like @claudioandre-br, I prefer blocking read where needed.
Your code looks fine to me (just reviewing and dry-running it). Perhaps as a minuscule optimization you should change the order of these two?
+ if (kernel_invoked) /* must finish before we replace the salt */
+ BENCH_CLERROR(clFinish(queue[gpu_id]), "clFinish error");
+ if (new_keys || kernel_invoked != 2) /* was for wrong keys or salt */
+ kernel_invoked = 0;
Also, we could obviously add single_mode_running as yet another flag to look for (actually only needed in batch mode).
I think we'll want shared code like this, which formats will invoke for their kernels. Ideally we shouldn't duplicate this complexity in each OpenCL format where it's relevant (supporting multiple salts, commonly used on many salts, fast enough that the non-perfect device utilization has significant effect on overall performance prior to this enhancement).
Aye. What if cracker.c was in charge of it, even? What changes to the formats interface would be needed for that? I haven't tried to picture it yet; it's more or less rhetorical. I'm thinking there might be less guessing and opportunistic kernel-running then.
We already have data transfers on set_salt() and set_key() in some formats. So, we already have transfer and computation in parallel (of the same set of data, not for two different sets).
Transfer and CPU computation in parallel, yes. This is about CPU and GPU processing in parallel. Nevertheless I'm not sure clFinish() is ever needed.
- Once there is available data, the kernel can start to run, per OpenCL design.
- we use clFlush(), not clFinish(), since clFinish() blocks execution.
@claudioandre-br it now hits me I believe we're not currently calling clFlush() from set_salt(). Should we possibly add that?
@claudioandre-br it now hits me I believe we're not currently calling clFlush() from set_salt(). Should we possibly add that?
I do. I see no problem. Why are you worried?
- Since clFlush "guarantees that all queued commands to command_queue get issued to the appropriate device". And that's all.
- Well, I use only one queue
- And, in the end, it is synchronous.
@claudioandre-br it now hits me I believe we're not currently calling clFlush() from set_salt(). Should we possibly add that?
I do. I see no problem. Why are you worried?
I think I haven't used it in any of my formats. I do use non-blocking writes, but without a flush. So until I add a clFlush() the salt possibly isn't really transferred until the kernel actually needs it, which defeats the purpose of the non-blocking write. I'm mostly thinking of huge salts here, non-hashes. I need to investigate this.
Note to self: We should add some new separate issues from the discussion above.
the salt possibly isn't really transferred until the kernel actually needs it,
I guess so. No flush, no issue, send, emit, transmit, ...
Transfer and CPU computation in parallel, yes. This is about CPU and GPU processing in parallel.
You are right. But you are right because I have nothing to do on the CPU in this crypt_all() snip.
load_hash();
clEnqueueNDRangeKernel(prepare_kernel)
clFlush()
#####################
# More stuff is here.
#####################
clEnqueueNDRangeKernel(crypt_kernel)
clFinish()
Basically, I don't pre-process or post-process stuff.
I have nothing to do on the CPU in this crypt_all() snip.
After your clFinish(), the GPU is totally idling while the CPU is working hard checking for positives in cmp_*() and then either loading a new salt, or creating and loading the next set of candidates (more or less loaded by cracking mode, rules, mask and overhead). Ideally we should have the GPU working hard as hell in the meantime. This particular issue doesn't address all that though; it merely tries to mitigate, I think, the cmp_*() side of it.
Perhaps as a miniscule optimization you should change the order of these two?
You mean not waiting for the kernel to complete if it was for wrong salt? In this place, it can happen when the passwords have been replaced but the salt stayed the same (if the salt changed, then set_salt already waited for the kernel to complete) - perhaps only in self-tests. When this does happen, I think it's subtly wrong to proceed to replace the kernel's inputs while it's still running, even if we don't care about its computation results.
For the set_salt instance of the wait for a wrong-salt kernel to complete, I think this can be avoided by having two separate salt buffers and swapping them like the output buffers. Regardless, the wrong-salt scenario is very costly (a 2x performance hit), and shouldn't occur in cases where we care about performance, and when it does then this wait is the least of our worries.
Added label "Potentially 1.9.0-jumbo-2 material" for the md5crypt-opencl changes. I'm not sure whether we have other formats where these changes are as beneficial. (They'd be even more beneficial for faster salted hashes, but for those we need on-device comparison instead.)
Somehow rebasing my md5crypt-opencl changes on the latest tree results in AMD driver lockups/reboots on super when running:
while :; do ./john -test -form=md5crypt-opencl -v=5 -dev=1; sleep 1; done
The last line of output is always "Raw speed figures including buffer transfers:". I can reproduce this problem with my old tree from June 19-20.
I've just tried adding swapping of two salt buffers as well (in addition to two output buffers), and this appeared to help at first, but then I got an eventual lockup on AMD anyway. So no luck.
So it does work for a while but eventually locks up? What if you supply LWS/GWS figures, does that avoid the lockups?
Perhaps we're tearing everything down in format's done() while the "next salt" is still running on the GPU? If so, just adding a clFinish() in done() should avoid it.
So it does work for a while but eventually locks up?
No. Some of the --test runs work fine, and eventually one run locks up at the very start.
I didn't try your other suggestions yet.
In the invocation of kernel for next salt, you should replace multi_profilingEvent[1] with NULL. Not sure if it interferes with anything (might point to NULL anyway) but regardless it's a bit confusing as written above.
Are you using ocl_autotune_running in the current code? I guess you wouldn't want to invoke the next-salt kernel during auto-tune, and I'm not sure we can rely on just the other *_running variables.
BTW please post a current patch, I might want to experiment a bit with it too.
In the invocation of kernel for next salt, you should replace multi_profilingEvent[1] with NULL.
I'll try. I'm not familiar with our use of profiling events, so I was mimicking the existing code. I think my "next salt" code isn't invoked when we're profiling anyway (but it is during self-test).
I don't use ocl_autotune_running, relying on benchmark_running instead. IIRC, omitting the benchmark_running check would halve the speeds during auto-tuning and benchmarking because we'd be invoking kernels for wrongly predicted next salts.
I'll post a current patch or make a PR after some more experiments. Thanks!
magnum, no luck solving the problem using your suggestions. Here's the current patch:
diff --git a/src/opencl_md5crypt_fmt_plug.c b/src/opencl_md5crypt_fmt_plug.c
index 001a57f..567e56a 100644
--- a/src/opencl_md5crypt_fmt_plug.c
+++ b/src/opencl_md5crypt_fmt_plug.c
@@ -82,7 +82,8 @@ static crypt_md5_salt host_salt; /** salt **/
//OpenCL variables:
static cl_mem mem_in, mem_out, pinned_in, pinned_out, mem_salt;
-static int new_keys;
+static cl_mem mem_outs[2], mem_salts[2];
+static int new_keys, kernel_invoked;
static struct fmt_main *self;
#define insize (sizeof(crypt_md5_password) * global_work_size)
@@ -189,9 +190,6 @@ static void create_clobj(size_t gws, struct fmt_main *self)
size_t out_size = (sizeof(crypt_md5_hash) * gws);
///Allocate memory on the GPU
- mem_salt = clCreateBuffer(context[gpu_id], CL_MEM_READ_ONLY, saltsize, NULL, &ret_code);
- HANDLE_CLERROR(ret_code, "Error while allocating memory for salt");
-
pinned_in = clCreateBuffer(context[gpu_id], CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, in_size, NULL, &ret_code);
HANDLE_CLERROR(ret_code, "Error while allocating pinned memory for passwords");
mem_in = clCreateBuffer(context[gpu_id], CL_MEM_READ_ONLY, in_size, NULL, &ret_code);
@@ -201,8 +199,16 @@ static void create_clobj(size_t gws, struct fmt_main *self)
pinned_out = clCreateBuffer(context[gpu_id], CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, out_size, NULL, &ret_code);
HANDLE_CLERROR(ret_code, "Error while allocating pinned memory for hashes");
- mem_out = clCreateBuffer(context[gpu_id], CL_MEM_WRITE_ONLY, out_size, NULL, &ret_code);
- HANDLE_CLERROR(ret_code, "Error while allocating GPU memory for hashes");
+
+ int i;
+ for (i = 0; i < 2; i++) {
+ mem_salt = mem_salts[i] = clCreateBuffer(context[gpu_id], CL_MEM_READ_ONLY, saltsize, NULL, &ret_code);
+ HANDLE_CLERROR(ret_code, "Error while allocating memory for salt");
+
+ mem_out = mem_outs[i] = clCreateBuffer(context[gpu_id], CL_MEM_WRITE_ONLY, out_size, NULL, &ret_code);
+ HANDLE_CLERROR(ret_code, "Error while allocating GPU memory for hashes");
+ }
+
outbuffer = clEnqueueMapBuffer(queue[gpu_id], pinned_out, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, out_size, 0, NULL, NULL, &ret_code);
HANDLE_CLERROR(ret_code, "Error mapping results buffer");
@@ -217,15 +223,19 @@ static void create_clobj(size_t gws, struct fmt_main *self)
static void release_clobj(void)
{
if (mem_out) {
+ HANDLE_CLERROR(clFinish(queue[gpu_id]), "clFinish error");
+
HANDLE_CLERROR(clEnqueueUnmapMemObject(queue[gpu_id], pinned_in, inbuffer, 0, NULL, NULL), "Error Unmapping inbuffer");
HANDLE_CLERROR(clEnqueueUnmapMemObject(queue[gpu_id], pinned_out, outbuffer, 0, NULL, NULL), "Error Unmapping outbuffer");
- HANDLE_CLERROR(clFinish(queue[gpu_id]), "Error releasing memory mappings");
HANDLE_CLERROR(clReleaseMemObject(pinned_in), "Release pinned_in");
HANDLE_CLERROR(clReleaseMemObject(mem_in), "Release mem_in");
- HANDLE_CLERROR(clReleaseMemObject(mem_salt), "Release mem_salt");
HANDLE_CLERROR(clReleaseMemObject(pinned_out), "Release pinned_out");
- HANDLE_CLERROR(clReleaseMemObject(mem_out), "Release mem_out");
+ int i;
+ for (i = 0; i < 2; i++) {
+ HANDLE_CLERROR(clReleaseMemObject(mem_outs[i]), "Release mem_out");
+ HANDLE_CLERROR(clReleaseMemObject(mem_salts[i]), "Release mem_salt");
+ }
mem_out = NULL;
}
@@ -276,11 +286,23 @@ static void set_salt(void *salt)
uint8_t *s = salt;
uint8_t len;
- for (len = 0; len < 8 && s[len]; len++);
+ for (len = 0; len < 8 && s[len]; len++)
+ continue;
+
+ if (kernel_invoked && len == host_salt.saltlen && !memcmp(salt, host_salt.salt, len)) {
+ kernel_invoked = 2; /* yes, for correct salt */
+ return;
+ }
+
host_salt.saltlen = len;
memcpy(host_salt.salt, s, host_salt.saltlen);
host_salt.prefix = s[8];
+ if (kernel_invoked) { /* was for wrong salt */
+ HANDLE_CLERROR(clFinish(queue[gpu_id]), "clFinish error");
+ kernel_invoked = 0;
+ }
+
HANDLE_CLERROR(clEnqueueWriteBuffer(queue[gpu_id], mem_salt, CL_FALSE, 0, saltsize, &host_salt, 0, NULL, NULL), "Salt transfer");
HANDLE_CLERROR(clFlush(queue[gpu_id]), "clFlush failed in set_salt()");
}
@@ -359,6 +381,11 @@ static int crypt_all(int *pcount, struct db_salt *salt)
global_work_size = GET_NEXT_MULTIPLE(count, local_work_size);
+ if (kernel_invoked) /* must finish before we replace the salt */
+ BENCH_CLERROR(clFinish(queue[gpu_id]), "clFinish error");
+ if (new_keys || kernel_invoked != 2) /* was for wrong keys or salt */
+ kernel_invoked = 0;
+
///Copy data to GPU memory
if (new_keys)
BENCH_CLERROR(clEnqueueWriteBuffer(queue[gpu_id], mem_in, CL_FALSE,
@@ -366,15 +393,43 @@ static int crypt_all(int *pcount, struct db_salt *salt)
"Copy memin");
///Run kernel
- BENCH_CLERROR(clEnqueueNDRangeKernel(queue[gpu_id], crypt_kernel, 1,
- NULL, &global_work_size, lws, 0, NULL, multi_profilingEvent[1]),
- "Set ND range");
+ if (!kernel_invoked) {
+ BENCH_CLERROR(clEnqueueNDRangeKernel(queue[gpu_id], crypt_kernel, 1,
+ NULL, &global_work_size, lws, 0, NULL, multi_profilingEvent[1]),
+ "Set ND range");
+ if (salt->next && (!benchmark_running || self_test_running)) /* must finish before we replace the salt */
+ BENCH_CLERROR(clFinish(queue[gpu_id]), "clFinish error");
+ }
+
+ cl_event event, *eventp = &event;
+ if (multi_profilingEvent[2])
+ eventp = multi_profilingEvent[2];
+
BENCH_CLERROR(clEnqueueReadBuffer(queue[gpu_id], mem_out, CL_FALSE,
- 0, outsize, outbuffer, 0, NULL, multi_profilingEvent[2]),
+ 0, outsize, outbuffer, 0, NULL, eventp),
"Copy data back");
- ///Await completion of all the above
- BENCH_CLERROR(clFinish(queue[gpu_id]), "clFinish error");
+ kernel_invoked = 0;
+ if (salt->next && (!benchmark_running || self_test_running)) { /* not for benchmark, but for cracking or self-test */
+ if (mem_out == mem_outs[0]) {
+ mem_out = mem_outs[1];
+ mem_salt = mem_salts[1];
+ } else {
+ mem_out = mem_outs[0];
+ mem_salt = mem_salts[0];
+ }
+ set_salt(salt->next->salt);
+ BENCH_CLERROR(clSetKernelArg(crypt_kernel, 1, sizeof(mem_out), &mem_out), "Error while setting mem_out kernel argument");
+ BENCH_CLERROR(clSetKernelArg(crypt_kernel, 2, sizeof(mem_salt), &mem_salt), "Error while setting mem_salt kernel argument");
+ BENCH_CLERROR(clEnqueueNDRangeKernel(queue[gpu_id], crypt_kernel, 1,
+ NULL, &global_work_size, lws, 0, NULL, NULL),
+ "Set ND range");
+ kernel_invoked = 1;
+ }
+
+ BENCH_CLERROR(clWaitForEvents(1, eventp), "clWaitForEvents");
+ if (eventp == &event)
+ BENCH_CLERROR(clReleaseEvent(event), "clReleaseEvent");
new_keys = 0;
return count;
I think we should probably have this async transfers hack for md5crypt-opencl and maybe phpass-opencl. However, given the system crashes with AMDGPU-Pro that it triggered, I think we should enable this either only on NVIDIA or on anything non-AMD.
@claudioandre-br @magnumripper What is our current preferred way to check for NVIDIA or non-AMD from per-format host code?
Host code:
if (gpu_nvidia(device_info[sequential_id]))
In kernel code:
#if amd_gcn(DEVICE_INFO) && DEV_VER_MAJOR < 2500
So:
if (gpu_amd(device_info[sequential_id]) && major > xxxx)  /* major is available somewhere */
	avoid async transfers
else
	go ahead
#if amd_gcn(DEVICE_INFO) && DEV_VER_MAJOR > xxxx
#elif gpu_nvidia(DEVICE_INFO)
#else /* gpu_amd(DEVICE_INFO) */
A list is available at opencl_device_info.h.
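So for md5crypt-opencl the gating could end up as something like this (a sketch; the non-AMD policy and where the check lives are just placeholders):

/* Sketch: decide once (e.g. from init() or reset()) whether to enable the
 * speculative next-salt path; gpu_amd() and device_info[] are the existing
 * macro and array from opencl_device_info.h / the common OpenCL code. */
static int async_next_salt;

static void select_async_policy(void)
{
	/* Anything non-AMD for now, given the AMDGPU-Pro lockups seen above. */
	async_next_salt = !gpu_amd(device_info[gpu_id]);
}

crypt_all would then test async_next_salt in addition to salt->next and the *_running flags.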
Note: besides md5crypt-opencl, another format that should significantly benefit from async transfers as proposed here is phpass-opencl, although mostly at its lower cost setting used for benchmarks and by phpBB3, rather than the higher cost setting used by the far more popular WordPress.
Maybe try this combined with avoiding busy-waits as in #4944?
Maybe try this combined with avoiding busy-waits as in #4944?
Yes, eventually I intend to, but mostly as a test for whether the macros are sufficiently generic to accommodate uses like this.