kaitai_struct_csharp_runtime
kaitai_struct_csharp_runtime copied to clipboard
Docstrings + Process XOR/ROL optimisations
Analog implementation to Python PR.
@GreyCat There are 2 methods that remain without docstrings (marked ???). If you would please tell me what to write there, I would include it.
Benchmarks for loops:
using System;
using System.Diagnostics;
using System.Runtime;
// Micro-benchmark: three ways of scanning a byte[] for a non-zero byte —
// a for loop reading data.Length each iteration, a for loop with the length
// cached in a local, and a foreach loop.
namespace demo
{
class MainClass
{
// For loop that re-reads data.Length in the loop condition.
// Returns false at the first non-zero byte, true if all bytes are zero.
static bool ByForArrayLength (byte[] data)
{
for (int i = 0; i < data.Length; i++)
if (data [i] != 0)
return false;
return true;
}
// Same scan, but with the array length hoisted into a local variable.
static bool ByForLocalLength (byte[] data)
{
int len = data.Length;
for (int i = 0; i < len; i++)
if (data [i] != 0)
return false;
return true;
}
// Same scan, expressed as a foreach over the array elements.
static bool ByForeach (byte[] data)
{
foreach (byte b in data)
if (b != 0)
return false;
return true;
}
// Times a single invocation of work and prints the elapsed milliseconds.
// NOTE(review): there is no warm-up invocation here — the first benchmark
// in the suite pays JIT and first-touch page costs; later versions of this
// harness in the thread add a warm-up call before timing.
static void Measure (Action work, string description)
{
var watch = Stopwatch.StartNew ();
work.Invoke ();
Console.WriteLine ("{0,-40}: {1} ms", description, watch.Elapsed.TotalMilliseconds);
}
// Allocates a 256 MiB zero-filled buffer and runs all three scans over it.
public static void Main (string[] args)
{
GCSettings.LatencyMode = GCLatencyMode.LowLatency;
byte[] data = new byte[256 * 1024 * 1024];
Measure (() => ByForArrayLength (data), "For with byte[].Length property");
Measure (() => ByForLocalLength (data), "For with Length in local variable");
Measure (() => ByForeach (data), "Foreach over byte[]");
}
}
}
$ mcs Program.cs -optimize
$ ./Program.exe
For with byte[].Length property : 427,2671 ms
For with Length in local variable : 331,5512 ms
Foreach over byte[] : 272,0288 ms
Benchmarks for rotations:
using System;
using System.Diagnostics;
using System.Runtime;
// Micro-benchmark: byte rotation implemented with shift-and-or versus a
// 256-entry lookup table.
namespace demo
{
class MainClass
{
// Rotates every byte left by 1 bit using shifts and a bitwise or.
static void ByShifts (byte[] data)
{
int len = data.Length;
int amount = 1;
int anti = 7;
for (int i = 0; i < len; i++)
{
byte x = data[i];
data[i] = (byte)((x << amount) | (x >> anti));
}
}
// Replaces every byte through a translation table.
// NOTE(review): translate is never initialized here, so it is all zeros and
// this overwrites data with zeros — only the table-lookup memory-access
// pattern is being timed, not a real rotation. The later version of this
// harness in the thread fills the table before the loop.
static void ByLookup (byte[] data)
{
int len = data.Length;
byte[] translate = new byte[256];
for (int i = 0; i < len; i++)
data[i] = translate[data[i]];
}
// Times a single invocation of work and prints the elapsed milliseconds.
// NOTE(review): no warm-up invocation — see the discussion below about
// first-touch page faults inflating the first measurement.
static void Measure (Action work, string description)
{
var watch = Stopwatch.StartNew ();
work.Invoke ();
Console.WriteLine ("{0,-40}: {1} ms", description, watch.Elapsed.TotalMilliseconds);
}
// Allocates a 256 MiB zero-filled buffer and runs both rotation variants.
public static void Main (string[] args)
{
GCSettings.LatencyMode = GCLatencyMode.LowLatency;
byte[] data = new byte[256 * 1024 * 1024];
Measure (() => ByShifts (data), "Loop with << >> |");
Measure (() => ByLookup (data), "Loop with translate[...]");
}
}
}
$ mcs Program.cs -optimize
$ ./Program.exe
Loop with << >> | : 808,9077 ms
Loop with translate[...] : 513,9755 ms
Requesting review from @koczkatamas @LogicAndTrick @Arlorean @Kolanich .
It seems that @GreyCat was once again right and I was half wrong. You were right that benchmarking C# code is more tricky than I thought, although I would like to point out in my defense that it's for reasons that nobody pointed out explicitly and possibly nobody even understood.
It is my current belief that warmup has very little impact as far as (1) CPU cache is concerned, since the test data is 2GB in size and the CPU L2 cache is ~1MB, and (2) JIT cost is also smallish compared to the overall run time. However, it seems that there is one other reason to warm up the tested method: when you allocate a new byte[...], it seems to work similarly to malloc — it reserves address space but does not allocate actual memory pages. C# guarantees that the elements are defaulted to 0, but that might be implemented by intercepting the page fault on first access instead of memset-ing the memory in advance. That's why the first method run in the suite was slower. I fixed it in the code below:
using System;
using System.Diagnostics;
using System.Threading;
using System.Runtime;
// Micro-benchmark: four ways of XOR-ing a byte[] with a repeating multi-byte
// key — modulo indexing (cached vs. property length) and a manually wrapped
// key index (cached vs. property length).
namespace demo
{
class MainClass
{
// XOR with the key byte selected via i % keylen; lengths cached in locals.
static void ByModulo (byte[] data, byte[] key)
{
int len = data.Length;
int keylen = key.Length;
for (int i = 0; i < len; i++)
data[i] = (byte) (data[i] ^ key[i % keylen]);
}
// XOR with the key byte selected via i % key.Length, reading the
// Length properties on every iteration.
static void ByModulo2 (byte[] data, byte[] key)
{
for (int i = 0; i < data.Length; i++)
data[i] = (byte) (data[i] ^ key[i % key.Length]);
}
// XOR with a manually wrapped key index k; lengths cached in locals.
// FIX: the original had no braces around the loop body, so k++ and the
// wrap check ran only once AFTER the loop — every byte was XOR-ed with
// key[0] instead of cycling through the key. Braces added so the
// increment and wrap happen once per iteration (matching the corrected
// harness later in this thread).
static void ByCheckWrap (byte[] data, byte[] key)
{
int len = data.Length;
int keylen = key.Length;
int k = 0;
for (int i = 0; i < len; i++)
{
data[i] = (byte) (data[i] ^ key[k]);
k++;
if (k == keylen)
k = 0;
}
}
// Same as ByCheckWrap, but reading key.Length on every wrap check.
// FIX: braces added here as well — same missing-brace bug as above.
static void ByCheckWrap2 (byte[] data, byte[] key)
{
int k = 0;
for (int i = 0; i < data.Length; i++)
{
data[i] = (byte) (data[i] ^ key[k]);
k++;
if (k == key.Length)
k = 0;
}
}
// Invokes work once untimed as a warm-up (JIT compilation plus first-touch
// of the freshly reserved pages), then times a second invocation and
// prints the elapsed milliseconds.
static void Measure (Action work, string description)
{
work.Invoke ();
var watch = Stopwatch.StartNew ();
work.Invoke ();
var elapsed = watch.Elapsed.TotalMilliseconds;
Console.WriteLine ("{0,-40}: {1} ms", description, elapsed);
}
// Allocates a 2 GiB buffer and a 10-byte (all-zero) key, then runs all
// four XOR variants over it.
public static void Main (string[] args)
{
byte[] data = new byte[int.MaxValue];
byte[] key = new byte[10];
Measure (() => ByModulo (data, key), "Loop with data[i % keylen]");
Measure (() => ByModulo2 (data, key), "Loop with data[i % key.Length]");
Measure (() => ByCheckWrap (data, key), "Loop with data[k]; if(k==keylen)k=0");
Measure (() => ByCheckWrap2 (data, key), "Loop with data[k]; if(k==key.Length)k=0");
}
}
}
$ mcs --version
Mono C# compiler version 4.6.2.0
$ mcs Program.cs -optimize
$ ./Program.exe
Loop with data[i % keylen] : 20878 ms
Loop with data[i % key.Length] : 20450 ms
Loop with data[k]; if(k==keylen)k=0 : 4329 ms
Loop with data[k]; if(k==key.Length)k=0 : 4190 ms
All commits were amended to reflect the last benchmarks. The previous changeset was 52308c2ca00c50d7dabbca47fac5e94ed4f840b9.
@arekbulski, I would keep insisting that you're missing the point with benchmarks here. The platform is wrong (given that 95% of .NET installations use Microsoft's .NET, it's only natural to benchmark these), the methodology is all wrong both from point of view of general statistics, and from .NET's environment point of view, etc. This alone is a total disaster, for example, which makes all tests effectively useless:
int k = 0;
for (int i = 0; i < data.Length; i++)
data[i] = (byte) (data[i] ^ key[k]);
k++;
if (k == key.Length)
k = 0;
If you don't see it, please take a longer look at this particular piece of code.
I'm by no means a .NET expert, but even what I know is enough to see that it is not the way to do it. I can teach you how to do it, I can do these benchmarks myself, but that would take a considerable chunk of my time that I'd rather spend on some real-life tasks. This is, of course, interesting from educational perspective, both for me, you and anyone who might be interested in microbenchmarking, but there are like 400 other pressing issues in the main repo, and I do believe they're more important than this task that nobody even ever requested.
My experience in real-world .NET applications on Windows is that the amount of memory you turn over, which puts pressure on the Garbage Collector, often outweighs any micro-optimizations at the loop level like this. We use tools like dotTrace and dotMemory to pin down performance problems, but more often than not, it's a change in algorithm, rather than a change in loop structuring that provides the most performance gains.
The Roslyn team have some guidelines which may help. The key one being don't use LINQ as it allocates memory which causes performance issues.
This brings me to an overall concern about the KaitaiStruct C# API. As much as I love it, I don't love the amount of memory that gets allocated when parsing a file. A solution along the lines of FlatBuffers would be preferable since it performs zero memory allocations but still gives easy access to the data structures stored in essentially a byte array.
@Arlorean Unfortunately, it's not that easy. Projects like Cap'n Proto or FlatBuffers use carefully tuned representation of certain data laid out in memory in a manner that would be fast to access on a reasonably modern x86_64 box. If you'll try to apply the same principle to arbitrary binary data, chances are that you'll end up with very slow code due to every parsed.Bar call taking a long time due to data alignment issues.
Also, that's not a single answer for everything anyway. You can't escape creating a string object in C# if you indeed want a string, not a byte array. Actually, in C# you can't escape allocating new byte array and copying slice of the data there every time you want a very basic byte array in the middle of memory.
That said, I think that it's still a very good idea to try implementing such "zero-object creation", even if it would be of limited use and not a good fit for every one. I don't think that ksy format per se has any blockers for implementing that right now.
@GreyCat Okay okay, take it easy. I will get you some proper .NET benchmarks. I looked at the code you quoted for a few days — what is so wrong with it? The methods modify the arrays in-place, but that doesn't make the relative results wrong.
Please note that only 2 items are (should be) in question: XOR(bytearray) and ROL(groupsize=1). Other changes are either easily justifiable (like returning entire array is cheaper than copying it) or add instead of replacing features (like ROL >1 groups).
@Arlorean Note that in this PR there is no LINQ code, and there are no real allocations either (rotation needs some indices, but those are small and short-lived). Albeit not related to this PR, it is a worthy discussion.
@GreyCat You don't need a subarray copy to decode a Unicode string; you can do that on a "slice" using: Encoding.GetString (Byte[], Int32, Int32)
@arekbulski
I looked at the code you quoted for few days, what is so wrong with it?
Ok, the catch is that you've used Python ~formatting~ syntax, missing {}s, and this is C#. Formatted properly, your code does the following:
int k = 0;
for (int i = 0; i < data.Length; i++)
{
data[i] = (byte) (data[i] ^ key[k]);
}
k++;
if (k == key.Length)
{
k = 0;
}
The wrapping increment of k fell outside the loop body.
Benchmarks of the fixed code with fixed methodology, although still run on Mono.
using System;
using System.Diagnostics;
using System.Runtime;
// Micro-benchmark (corrected harness): single-byte XOR, bit rotation by
// shifts, rotation via lookup table, and four repeating-key XOR variants.
// Each variant is warmed up once before being timed.
namespace demo
{
class MainClass
{
// XOR every byte with the constant 1; length cached in a local.
static void ByBitwiseXor (byte[] data)
{
int len = data.Length;
byte x = 1;
for (int i = 0; i < len; i++) {
data[i] = (byte) (data[i] ^ x);
}
}
// Rotate every byte left by 1 bit using shifts and a bitwise or.
static void ByShifts (byte[] data)
{
int len = data.Length;
int amount = 1;
int anti = 7;
for (int i = 0; i < len; i++) {
byte x = data[i];
data[i] = (byte)((x << amount) | (x >> anti));
}
}
// Replace every byte through a precomputed 256-entry translation table
// (here filled with i ^ 1, so the table lookup does real work).
static void ByLookup (byte[] data)
{
int len = data.Length;
byte[] translate = new byte[256];
for (int i = 0; i < 256; i++)
translate[i] = (byte) (i ^ 1);
for (int i = 0; i < len; i++) {
data[i] = translate[data[i]];
}
}
// Repeating-key XOR via i % keylen; lengths cached in locals.
static void ByModulo (byte[] data, byte[] key)
{
int len = data.Length;
int keylen = key.Length;
for (int i = 0; i < len; i++) {
data[i] = (byte) (data[i] ^ key[i % keylen]);
}
}
// Repeating-key XOR via i % key.Length, reading the properties each pass.
static void ByModulo2 (byte[] data, byte[] key)
{
for (int i = 0; i < data.Length; i++) {
data[i] = (byte) (data[i] ^ key[i % key.Length]);
}
}
// Repeating-key XOR with a manually wrapped key index; lengths cached.
// (Braces now correctly keep the increment and wrap inside the loop.)
static void ByCheckWrap (byte[] data, byte[] key)
{
int len = data.Length;
int keylen = key.Length;
int k = 0;
for (int i = 0; i < len; i++) {
data[i] = (byte) (data[i] ^ key[k]);
k++;
if (k == keylen)
k = 0;
}
}
// Same as ByCheckWrap but reading key.Length on every wrap check.
static void ByCheckWrap2 (byte[] data, byte[] key)
{
int k = 0;
for (int i = 0; i < data.Length; i++) {
data[i] = (byte) (data[i] ^ key[k]);
k++;
if (k == key.Length)
k = 0;
}
}
// Invokes work once untimed (warm-up: JIT plus first-touch of pages),
// then times a second invocation and prints whole milliseconds.
static void Measure (Action work, string description)
{
GCSettings.LatencyMode = GCLatencyMode.LowLatency;
work.Invoke ();
var watch = Stopwatch.StartNew ();
work.Invoke ();
var elapsed = watch.Elapsed.TotalMilliseconds;
Console.WriteLine ("{0,-40}: {1:F0} ms", description, elapsed);
}
// Allocates a 2 GiB buffer and a 10-byte (all-zero) key, then runs
// every variant over it.
public static void Main (string[] args)
{
byte[] data = new byte[int.MaxValue];
byte[] key = new byte[10];
Measure (() => ByBitwiseXor (data), "Loop with ^ key");
Measure (() => ByShifts (data), "Loop with << >> |");
Measure (() => ByLookup (data), "Loop with translate[...]");
Measure (() => ByModulo (data, key), "Loop with data[i % keylen]");
Measure (() => ByModulo2 (data, key), "Loop with data[i % key.Length]");
Measure (() => ByCheckWrap (data, key), "Loop with data[k]; if(k==keylen)k=0");
Measure (() => ByCheckWrap2 (data, key), "Loop with data[k]; if(k==key.Length)k=0");
}
}
}
$ mcs --version
Mono C# compiler version 4.6.2.0
$ mcs Program.cs -optimize
$ ./Program.exe
Loop with ^ key : 2980 ms
Loop with << >> | : 5417 ms
Loop with translate[...] : 3910 ms
Loop with data[i % keylen] : 20341 ms
Loop with data[i % key.Length] : 20650 ms
Loop with data[k]; if(k==keylen)k=0 : 4599 ms
Loop with data[k]; if(k==key.Length)k=0 : 5645 ms
The benchmark shows that array lookup is faster than shifts (in ROL) but not faster than simple xoring (in XOR). It also shows that a cached local length and the .Length property have identical performance, but only for loop counters — i is optimized this way but k is not. The PR code is not yet updated according to these results. MERGE ON HOLD.
The code was updated according to the benchmarks. READY TO MERGE (AGAIN! :).