Microbenchmarks Add Heaps algorithm which crawls over all permutations

The Heap's algorithm which crawls over all permutations seem a good testcase, cause You can not be "smart" and use language specific optimisations, it's just pure operations on integer arrays (indexing, changing content of);

function allperms(N)
    elts = collect(1:N)
    c = ones(Int, N)
    countfirst = 0
    n = 1
    k = 0
#    doit(elts)
    countfirst += elts[1]
    @inbounds while n <= N
      if c[n] < n
         k = ifelse(isodd(n), 1, c[n])
         elts[k], elts[n] = elts[n], elts[k]
         c[n] += 1
         n = 1
         countfirst += elts[1]
#         doit(elts)
      else
        c[n] = 1
        n += 1
      end
   end
   return countfirst
end

a similar implementation in c:

void swap(int *p, int a, int b){
   int tmp;
   tmp = p[a];
   p[a] = p[b];
   p[b] = tmp;
}

long heaps(int N){
   long count = 0;
   int n;

   int *p = malloc(sizeof(int)*N);
   int *c = calloc(N, sizeof(int));
   for (n=0; n < N; n++){
      p[n] = n+1;
      // c[n] = 0;
   }

   count += p[0];

   for (n=0; n < N;){
      if (c[n] < n){
         swap(p, (n%2 ? c[n] : 0), n);

         count += p[0];

         c[n]++;
         n = 0;
      } else {
         c[n] = 0;
         n++;
      }
   }
   free(p);
   free(c);
   return count;
}

EDIT: @inbounds macro makes a fairer comparison to C (and speeds julias version!)

Jun 26 '18 06:06 kalmarek

That does seem like a good one!

Jun 26 '18 12:06 StefanKarpinski

Here's my Rust attempt. This sure feels hard to optimize.

Jun 26 '18 15:06 Enet4

#include <stdlib.h>
#include <stdio.h>

void swap(size_t *p, size_t a, size_t b) {
  size_t tmp = p[a];
  p[a] = p[b];
  p[b] = tmp;
}

size_t heaps(size_t n) {
  size_t *a = malloc(n * sizeof *a);
  if (!a) {
    exit(EXIT_FAILURE);
  }
  for (size_t i = 0; i < n; i++) {
    a[i] = i; // i + 1 was strange
  }
    
  size_t *c = calloc(n, sizeof *c);
  if (!c) {
    exit(EXIT_FAILURE);
  }

  size_t count = a[0];

  size_t i = 0;
  while (i < n) {
    if (c[i] < i) {
      swap(a, i % 2 ? c[i] : 0, i);

      count += a[0];

      c[i]++;
      i = 0;
    } else {
      c[i] = 0;
      i++;
    }
  }

  free(a);
  free(c);

  return count;
}

Jun 27 '18 01:06 Stargateur

I think there are some issues with these codes

heaps(n), n=1:4	code
1, 3, 12, 60	kalmarek Julia
0, 3, 12, 60	karmarek C
0, 1, 6, 36	Stargateur C
1, 2, 6, 24	n!

Jul 03 '18 16:07 johnfgibson

@johnfgibson Sorry, I should make it more clear that I change the initialisation in favor of the rust:

p[n] = n+1; // n + 1 could overflow n is as good as n + 1 and can't overflow let mut perm: Vec<_> = (0..n).collect(); // rust use n a[i] = i; // so I just used n too

Jul 03 '18 16:07 Stargateur

@johnfgibson, @Stargateur: for comparison with n-factorial: we don't simply count the permutations, but sum the first entry of every permutation (otherwise future compilers may skip the swap part entirely); p[n] = n+1 was because permutations are usually counted from one, not zero. This does matter for the returned value.

Could You also recheck n=1? my C version returns squarely 1 for N=1 ;-)

Jul 03 '18 16:07 kalmarek

@kalmarek aha! thank you. (regarding returning the sum of the first entry).

I ran what's currently posted above and got heaps(1:5) == 1, 3, 12, 60, 360. There were some differences from an earlier version (count += p[0] versus count += p[1]) that probably account for the difference.

@Stargateur I don't follow from your comment what changes should be made.

I can add working codes to the microbenchmark suite but I don't think I can debug and fix codes with errors.

Jul 03 '18 17:07 johnfgibson

@kalmarek @johnfgibson I'm not a mathematician, if you conclude that function should start init with 1, I'm ok with it. But I didn't think that the result was so important, just as you said it's for avoid compiler to optimize it. I'm just a C dev who read this thread.

This is better ?

#include <stdlib.h>
#include <stdio.h>
#include <inttypes.h>
#include <assert.h>

static void swap(uintmax_t *p, size_t a, size_t b) {
  uintmax_t tmp = p[a];
  p[a] = p[b];
  p[b] = tmp;
}

// UB for n == 0
static uintmax_t heaps(size_t n) {
  uintmax_t *a = malloc(n * sizeof *a);
  if (!a) {
    exit(EXIT_FAILURE);
  }
  uintmax_t acu = 1;
  for (size_t i = 0; i < n; i++) {
    a[i] = acu++;
  }

  size_t *c = calloc(n, sizeof *c);
  if (!c) {
    exit(EXIT_FAILURE);
  }

  assert(n != 0); // remove when NDEBUG is defined
  uintmax_t count = a[0];

  size_t i = 0;
  while (i < n) {
    if (c[i] < i) {
      swap(a, i % 2 ? c[i] : 0, i);

      count += a[0];

      c[i]++;
      i = 0;
    } else {
      c[i] = 0;
      i++;
    }
  }

  free(a);
  free(c);

  return count;
}

int main(void) {
  for (size_t i = 1; i < 5; i++) {
    printf("heaps(%zu) = %" PRIuMAX "\n", i, heaps(i));
  }
}

Jul 03 '18 17:07 Stargateur

@Stargateur, sure the exact value does not matter, it was just to explain the differences in the results. If I am not mistaken, You could start from a[0] = 0, but then You need to sum a[1]. Anyway, the important thing is that we swap during each iteration.

BTW How do I call Your code from julia? if I use:

C_code = """
    [Your C code here]
"""
const Clib = tempname()

open(`gcc -fPIC -O3 -xc -shared -o $(Clib * "." * Libdl.dlext) -`, "w") do f
    print(f, C_code) 
end

heaps_c(n::Int) = ccall(("heaps", Clib), Int, (Int,), n)

I get ccall: could not find function heaps in library /tmp/juliaACJ6nY, but this works for my, primitive ;-) C code

Jul 04 '18 08:07 kalmarek

Remove static from static uintmax_t heaps(size_t n), it's make the symbol not global.

C_code = """
#include <stdlib.h>
#include <stdio.h>
#include <inttypes.h>
#include <assert.h>

static void swap(uintmax_t *p, size_t a, size_t b) {
  uintmax_t tmp = p[a];
  p[a] = p[b];
  p[b] = tmp;
}

// UB for n == 0
uintmax_t heaps(size_t n) {
  uintmax_t *a = malloc(n * sizeof *a);
  if (!a) {
    exit(EXIT_FAILURE);
  }
  uintmax_t acu = 1;
  for (size_t i = 0; i < n; i++) {
    a[i] = acu++;
  }

  size_t *c = calloc(n, sizeof *c);
  if (!c) {
    exit(EXIT_FAILURE);
  }

  assert(n != 0); // remove when NDEBUG is defined
  uintmax_t count = a[0];

  size_t i = 0;
  while (i < n) {
    if (c[i] < i) {
      swap(a, i % 2 ? c[i] : 0, i);

      count += a[0];

      c[i]++;
      i = 0;
    } else {
      c[i] = 0;
      i++;
    }
  }

  free(a);
  free(c);

  return count;
}
"""
const Clib = tempname()

open(`gcc -fPIC -O3 -xc -shared -o $(Clib * "." * Libdl.dlext) -`, "w") do f
    print(f, C_code) 
end

ccall(("heaps", Clib), Cuintmax_t , (Csize_t,), n)

Jul 04 '18 09:07 Stargateur

I updated the Rust code above as well, thanks to your feedback. It was initializing p from 0 rather than 1.

Jul 05 '18 17:07 Enet4

a naive python version

def heaps(n):
    elts = [i+1 for i in range(n)]
    c = [0 for i in range(n)]
    n = 0
    countfirst = elts[0]
    while n < len(elts):
        if c[n] < n:
            if n % 2:
                k = c[n]
            else:
                k = 0
            elts[k], elts[n] = elts[n], elts[k]
            
#             print(elts)
            countfirst += elts[0]
            
            c[n] += 1
            n = 0
        else:
            c[n] = 0
            n += 1
    return countfirst

To be honest, I don't know any other language microbenchmark compares julia against, so someone else has to come-up with the code.

Jul 15 '18 15:07 kalmarek

defining heaps_c(n) = Int(ccall(("heaps", Clib), Cuintmax_t , (Csize_t,), n)) I get:

On julia-0.6.4:

julia> @btime allperms(12)
  2.862 s (3 allocations: 384 bytes)
3113510400

julia> @btime heaps_c(12)
  1.470 s (0 allocations: 0 bytes)
3113510400

On julia-0.7.0:

julia> @btime allperms(12)
  1.397 s (2 allocations: 352 bytes)
3113510400

julia> @btime heaps_c(12)
  1.466 s (0 allocations: 0 bytes)
3113510400

I'm impressed!

Aug 08 '18 15:08 kalmarek

Microbenchmarks Microbenchmarks copied to clipboard

Add Heaps algorithm which crawls over all permutations

Microbenchmarks
Microbenchmarks copied to clipboard