jamulus icon indicating copy to clipboard operation
jamulus copied to clipboard

Improve performance/correctness of FlipNBits in sound.cpp

Open ann0see opened this issue 2 years ago • 14 comments

I assume that the code for Flip16,32,64 bits in sound.cpp is not efficient (or doesn't do what it should do): https://github.com/jamulussoftware/jamulus/blob/main/src/sound/asio/sound.cpp#L1168

I made some heuristic tests on what the function Flip16Bits outputs (Disclaimer: I haven't fully understood what the code exactly does) and it seems that all it does is a left shift by one. Thus, at least Flip16Bits can be simplified (and I assume it's the same for the other code). By the function name, it could do a bitwise invert which it doesn't. Also it doesn't seem to flip the whole word around. What confuses me most, is that the if statement doesn't flip anything. So it might well be a bug.

The following code outputs "ok", which I consider as proof that the function Flip16Bits can be implemented as leftshift by 1 only.

#include <cassert>
#include <cstdint>
#include <iostream>
#include <bitset>
using namespace std;

int32_t Flip32Bits(int32_t iIn) {
  
    uint32_t iMask = ( static_cast<uint32_t> ( 1 ) << 31 );

    int32_t  iOut  = 0;


    for ( unsigned int i = 0; i < 32; i++ )

    {

        // copy current bit to correct position

        iOut |= ( iIn & iMask ) ? 1 : 0;


        // shift out value and mask by one bit

        iOut <<= 1;

        iMask >>= 1;

    }


    return iOut;
}

int32_t Flip32BitHist(int32_t iIn) {
  return (iIn<<1);
}


int16_t Flip16Bits ( const int16_t iIn )

{

    uint16_t iMask = ( 1 << 15 );

    int16_t  iOut  = 0;


    for ( unsigned int i = 0; i < 16; i++ )

    {

        // copy current bit to correct position

        iOut |= ( iIn & iMask ) ? 1 : 0;


        // shift out value and mask by one bit

        iOut <<= 1;

        iMask >>= 1;

    }


    return iOut;

}

int16_t Flip16BitsHist(const int16_t iIn) {
  int16_t iOut = (iIn<< 1);
  return iOut;
}

int64_t Flip64Bits ( const int64_t iIn )

{

    uint64_t iMask = ( static_cast<uint64_t> ( 1 ) << 63 );

    int64_t  iOut  = 0;


    for ( unsigned int i = 0; i < 64; i++ )

    {

        // copy current bit to correct position
        iOut |= ( iIn & iMask ) ? 1 : 0;


        // shift out value and mask by one bit

        iOut <<= 1;

        iMask >>= 1;

    }


    return iOut;

}

int64_t Flip64BitHist(int64_t iIn) {
  return (iIn << 1);
}

int main(int argc, char** argv) {
  for (uint16_t check = 0;check<=(uint16_t)-1;check++) {
    cout << bitset<16> (check) << endl;
    //cout << b1 << endl;
    //cout << bitset<16>(Flip16BitsHist(check)) << endl;
    //cout << bitset<16>(Flip16Bits(check)) << endl << endl;
    assert(Flip16BitsHist(check) == Flip16Bits(check));
    //assert(Flip32Bits(check) == Flip32BitHist(check));
    //assert(Flip64Bits(check) == Flip64BitHist(check));
    //cout << check;
    if (check == (uint16_t)-1) {// not so elegant termination!
      break;
    }
  }
  cout << "ok";
}

The other functions can't be tested this way as uint64_t is just too large to get an answer, thus analysis of the code is needed. (I might run the 32 bit over night)

ann0see avatar Jan 15 '23 20:01 ann0see

@corrados as you are probably most familiar with the Windows sound code: what should the FlipNBits functions do?

ann0see avatar Jan 15 '23 21:01 ann0see

The 32 Bit code also seems to be equivalent.

The following program returns ok:

#include <cassert>
#include <cstdint>
#include <iostream>
#include <bitset>
#include <future>

using namespace std;

int32_t Flip32Bits(int32_t iIn) {
  
    uint32_t iMask = ( static_cast<uint32_t> ( 1 ) << 31 );

    int32_t  iOut  = 0;


    for ( unsigned int i = 0; i < 32; i++ )

    {

        // copy current bit to correct position

        iOut |= ( iIn & iMask ) ? 1 : 0;


        // shift out value and mask by one bit

        iOut <<= 1;

        iMask >>= 1;

    }


    return iOut;
}

int32_t Flip32BitHist(int32_t iIn) {
  return (iIn<<1);
}


int16_t Flip16Bits ( const int16_t iIn )

{

    uint16_t iMask = ( 1 << 15 );

    int16_t  iOut  = 0;


    for ( unsigned int i = 0; i < 16; i++ )

    {

        // copy current bit to correct position

        iOut |= ( iIn & iMask ) ? 1 : 0;


        // shift out value and mask by one bit

        iOut <<= 1;

        iMask >>= 1;

    }


    return iOut;

}

int16_t Flip16BitsHist(const int16_t iIn) {
  int16_t iOut = (iIn<< 1);
  return iOut;
}

int64_t Flip64Bits ( const int64_t iIn )

{

    uint64_t iMask = ( static_cast<uint64_t> ( 1 ) << 63 );

    int64_t  iOut  = 0;


    for ( unsigned int i = 0; i < 64; i++ )

    {

        // copy current bit to correct position
        iOut |= ( iIn & iMask ) ? 1 : 0;


        // shift out value and mask by one bit

        iOut <<= 1;

        iMask >>= 1;

    }


    return iOut;

}

int64_t Flip64BitHist(int64_t iIn) {
  return (iIn << 1);
}
bool testValue(uint32_t start, uint32_t end) {
  bool error = false;
  for (uint32_t check = start; check <= end; check++) {
    //cout << bitset<16> (check) << endl;
    error = error || (Flip32Bits(check) != Flip32BitHist(check));
    
  }
  return error;
  //p.set_value(error); // sets the error if present

}
int main(int argc, char** argv) {
  /*for (uint32_t check = 0;check<=(uint32_t)-1;check++) {
    cout << bitset<32> (check) << endl;
    //cout << b1 << endl;
    //cout << bitset<16>(Flip16BitsHist(check)) << endl;
    //cout << bitset<16>(Flip16Bits(check)) << endl << endl;
    //assert(Flip16BitsHist(check) == Flip16Bits(check));
    assert(Flip32Bits(check) == Flip32BitHist(check));
    //assert(Flip64Bits(check) == Flip64BitHist(check));
    //cout << check;
    if (check == (uint32_t)-1) {// not so elegant termination!
      break;
    }
  }*/

  uint32_t stride = ((uint32_t)-1)/4;
  std::future<bool> t1 = std::async(&testValue, 0,stride);
  std::future<bool> t2 = std::async(&testValue, stride+1,stride*2);
  std::future<bool> t3 = std::async(&testValue, stride*2+1,stride*3);
  std::future<bool> t4 = std::async(&testValue, stride*3+1,((uint32_t)-1)-1); // stop one off error
  bool error = t1.get() || t2.get() || t3.get() || t4.get();
  // last iteration.
  error = error || (Flip32Bits((uint32_t)-1) != Flip32BitHist((uint32_t)-1));
  
  if (!error) {
    cout << "ok";
  } else {
    cout << "error";
  }
}

ann0see avatar Jan 16 '23 13:01 ann0see

As you can see in the code: "// NOT YET TESTED" -> There are a lot of switch branches which are not tested. You need to have the correct hardware to test the code. I have written these functions to convert the ASIO formats in the Jamulus sample format but only a subset was actually tested by myself (i.e., the branches which my hardware has used).

By the function name, it could do a bitwise invert which it doesn't.

That's not good :-). These "FlipBits" functions were created to convert MSB <-> LSB format. If that does not work correctly, it should be fixed.

Maybe it makes sense to take a look at the implementation in Portaudio. They must have some similar code and I assume that these conversion functions are all tested.

corrados avatar Jan 16 '23 16:01 corrados

As I probably don't have the hardware, I can't test either.

These "FlipBits" functions were created to convert MSB <-> LSB format.

Ok. Thanks for the information. Does it swap endianness? Or can you give an example?

1100 0100 --> 1000 1000 (currently)

Do you want a reversal like:

1010 0001 -> 1000 0101

?

ann0see avatar Jan 17 '23 11:01 ann0see

When considering changing them for solely performance reasons it might also make sense to validate whether the compiler doesn't already perform such optimizations.

hoffie avatar Jan 17 '23 13:01 hoffie

Do you want a reversal like: 1010 0001 -> 1000 0101

Yes, that was the intention.

corrados avatar Jan 17 '23 15:01 corrados

When considering changing them for solely performance reasons it might also make sense to validate whether the compiler doesn't already perform such optimizations.

Probably it's possible, but I think the code is invalid right now.

Do you want a reversal like: 1010 0001 -> 1000 0101 Yes, that was the intention.

ann0see avatar Jan 17 '23 19:01 ann0see

~~Found https://www.geeksforgeeks.org/c-program-to-rotate-bits-of-a-number/~~

Probably https://www.geeksforgeeks.org/write-an-efficient-c-program-to-reverse-bits-of-a-number/ is what we search for.

ann0see avatar Jan 17 '23 19:01 ann0see

I find it very odd that one must reverse the bits of a number in this case. Parallel to serial shifting for the purpose of serial data transmission ,which this is*, can have the bits sent MSBit first or LSBit first. The serializer and deserializer hardware must be initialized correctly for correct data transmission. * there can be no other reasonable explanation. BTW if I were doing bitswapping I would make inline code without a for-loop. I would also do it in assembler with ROL and ROR and the Carry-Flag (LOL!). This procedure is done on the upper half And lower half of the same register so that it is done in-place.

ghost avatar Jan 17 '23 20:01 ghost

I haven't yet studied the original code in Jamulus for reversing the order of bits, and don't know what impact it has on performance, but if the fastest performance would be beneficial, I would suggest using a lookup table of reversed bytes, with code such as the following:

const uint8_t revbyte[256] = {
  0, 128,  64, 192,  32, 160,  96, 224, 16, 144,  80, 208,  48, 176, 112, 240,
  8, 136,  72, 200,  40, 168, 104, 232, 24, 152,  88, 216,  56, 184, 120, 248,
  4, 132,  68, 196,  36, 164, 100, 228, 20, 148,  84, 212,  52, 180, 116, 244,
 12, 140,  76, 204,  44, 172, 108, 236, 28, 156,  92, 220,  60, 188, 124, 252,
  2, 130,  66, 194,  34, 162,  98, 226, 18, 146,  82, 210,  50, 178, 114, 242,
 10, 138,  74, 202,  42, 170, 106, 234, 26, 154,  90, 218,  58, 186, 122, 250,
  6, 134,  70, 198,  38, 166, 102, 230, 22, 150,  86, 214,  54, 182, 118, 246,
 14, 142,  78, 206,  46, 174, 110, 238, 30, 158,  94, 222,  62, 190, 126, 254,
  1, 129,  65, 193,  33, 161,  97, 225, 17, 145,  81, 209,  49, 177, 113, 241,
  9, 137,  73, 201,  41, 169, 105, 233, 25, 153,  89, 217,  57, 185, 121, 249,
  5, 133,  69, 197,  37, 165, 101, 229, 21, 149,  85, 213,  53, 181, 117, 245,
 13, 141,  77, 205,  45, 173, 109, 237, 29, 157,  93, 221,  61, 189, 125, 253,
  3, 131,  67, 195,  35, 163,  99, 227, 19, 147,  83, 211,  51, 179, 115, 243,
 11, 139,  75, 203,  43, 171, 107, 235, 27, 155,  91, 219,  59, 187, 123, 251,
  7, 135,  71, 199,  39, 167, 103, 231, 23, 151,  87, 215,  55, 183, 119, 247,
 15, 143,  79, 207,  47, 175, 111, 239, 31, 159,  95, 223,  63, 191, 127, 255
};

uint8_t Flip8Bits(uint8_t iIn)
{
        return revbyte[iIn];
}

uint16_t Flip16Bits(uint16_t iIn)
{
        uint16_t iOut;
        uint8_t *pOut = (uint8_t *)&iOut;
        uint8_t *pIn = (uint8_t *)&iIn;

        pOut[0] = revbyte[ pIn[1] ];
        pOut[1] = revbyte[ pIn[0] ];

        return iOut;
}

uint32_t Flip32Bits(uint32_t iIn)
{
        uint32_t iOut;
        uint8_t *pOut = (uint8_t *)&iOut;
        uint8_t *pIn = (uint8_t *)&iIn;

        pOut[0] = revbyte[ pIn[3] ];
        pOut[1] = revbyte[ pIn[2] ];
        pOut[2] = revbyte[ pIn[1] ];
        pOut[3] = revbyte[ pIn[0] ];

        return iOut;
}

uint64_t Flip64Bits(uint64_t iIn)
{
        uint64_t iOut;
        uint8_t *pOut = (uint8_t *)&iOut;
        uint8_t *pIn = (uint8_t *)&iIn;

        pOut[0] = revbyte[ pIn[7] ];
        pOut[1] = revbyte[ pIn[6] ];
        pOut[2] = revbyte[ pIn[5] ];
        pOut[3] = revbyte[ pIn[4] ];
        pOut[4] = revbyte[ pIn[3] ];
        pOut[5] = revbyte[ pIn[2] ];
        pOut[6] = revbyte[ pIn[1] ];
        pOut[7] = revbyte[ pIn[0] ];

        return iOut;
}

Disclaimer: not tested.

softins avatar Jan 17 '23 23:01 softins

Hmm. Agree. The lookup table approach is probably the fastest? We should verify that it actually does what we expect with a device.

I find it very odd that one must reverse the bits of a number in this case.

True. If we find a way around turning the bits around, we could rewrite this part, sure.

ann0see avatar Jan 25 '23 20:01 ann0see

Hmm. Agree. The lookup table approach is probably the fastest? We should verify that it actually does what we expect with a device.

The lookup table is fast and the best choice for multi-platform C C++. X86 CPU is only 1/2 as good as I thought because you must use the lowest 16 bits of the register (8 bits high and 8 bits low), then shift the results (barrel shifter = good) as far as 64 bits. It can be done while preserving the initial carry flag throughout, and uses only 1 CPU register.

I find it very odd that one must reverse the bits of a number in this case.

I said this because (without looking or digging) there must be something wrong. Wrong that the bits are reversed. I also can't believe that it would be for a poorly implemented FFT. Maybe ASIO can be called with different parameters so that the bits come in the right order.

The following link may put some light on this issue: https://www.kvraudio.com/forum/viewtopic.php?t=490654

ghost avatar Jan 25 '23 21:01 ghost

I have just for the first time studied the existing flip bits code, and also run some tests, comparing the results with my lookup table functions above.

I assume that the code for Flip16,32,64 bits in sound.cpp is not efficient (or doesn't do what it should do): https://github.com/jamulussoftware/jamulus/blob/main/src/sound/asio/sound.cpp#L1168

I made some heuristic tests on what the function Flip16Bits outputs (Disclaimer: I haven't fully understood what the code exactly does) and it seems that all it does is a left shift by one.

Yes, that is what it is actually doing, but not what it is intended to do, I'm sure. It appears that the intention is by shifting and masking to reverse the order of the bits. Any ASIO driver that returns MSB data needing to be reversed will not currently work with Jamulus at all. I don't know if there have been any such drivers in practice.

Here is the 16-bit function.

int16_t Flip16Bits ( const int16_t iIn )
{
    uint16_t iMask = ( 1 << 15 );

    int16_t  iOut  = 0;

    for ( unsigned int i = 0; i < 16; i++ )
    {
        // copy current bit to correct position
        iOut |= ( iIn & iMask ) ? 1 : 0;

        // shift out value and mask by one bit
        iOut <<= 1;
        iMask >>= 1;
    }

    return iOut;
}

There are two main problems with this code:

  • The mask starts with the leftmost bit and works down. However, the detected bit is shifted in at the rightmost end and shifted left, so the bits end up moving back towards the leftmost end, and the output bits remain in the same order as the input, instead of being reversed.
  • The shift of iOut is being done after adding in the new bit, but should have been done before adding it. This means that after adding the last bit, the output value is still shifted once more. This is why the output is just the input shifted left by one bit.

It would be straightforward to fix both of those problems to make the function work correctly:

int16_t Flip16Bits ( int16_t iIn )
{
    int16_t  iOut  = 0;

    for ( unsigned int i = 0; i < 16; i++ )
    {
        // shift output value left before adding
        iOut <<= 1;

        // copy rightmost bit of input into output
        iOut |= iIn & 1;

        // shift input value right to obtain the next bit
        iIn >>= 1;
    }

    return iOut;
}

And similarly for the 32-bit and 64-bit functions.

I'm not familiar with ASIO, but the question remains in my mind whether "MSB" data from ASIO actually needs the bits reversed, or just the bytes swapped, like network addresses do.

If indeed it is bit reversal that is needed, the existing functions, when corrected as above, are still very inefficient, with lots of looping and shifting. They should be replaced with the lookup table functions. I could raise a PR to do this soon.

softins avatar Feb 11 '24 23:02 softins

I found a PDF for the ASIO SDK here.

softins avatar Feb 11 '24 23:02 softins