Data.HashFunction
Data.HashFunction copied to clipboard
The result of CityHash64 doesn't match the reference implementation for strings whose length is 64*n (n > 1)
using System.Collections.Generic;
using System.Data.HashFunction.CityHash;
using System;
using System.Text;
namespace HashTest
{
    /// <summary>
    /// Console repro that prints the 64-bit CityHash of single-character strings
    /// whose lengths are multiples of 64, so the values can be compared with
    /// the output of other CityHash implementations.
    /// </summary>
    internal static class Program
    {
        /// <summary>
        /// Hashes each test symbol repeated 128 and 192 times and writes one
        /// "symbol, multiplier: 0xhash" line per input to standard output.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        private static void Main(string[] args)
        {
            var symbols = new List<char> { 'a', 'b', 'x', 'y' };
            var hasher = CityHashFactory.Instance.Create(new CityHashConfig { HashSizeInBits = 64 });

            foreach (var symbol in symbols)
            {
                // Multipliers 2 and 3 give input lengths of 128 and 192 characters.
                for (var length = 2; length < 4; ++length)
                {
                    var s = new string(symbol, length * 64);
                    var bytes = Encoding.ASCII.GetBytes(s);

                    // Read the first eight hash bytes as a UInt64 (BitConverter uses the machine's byte order).
                    var hash = BitConverter.ToUInt64(hasher.ComputeHash(bytes).Hash, 0);
                    Console.WriteLine($"{symbol}, {length}: 0x{hash:x}");
                }
            }

            // Console.ReadKey throws InvalidOperationException when standard input is
            // redirected (piped input, CI runs), so only pause for an interactive console.
            if (!Console.IsInputRedirected)
            {
                Console.ReadKey();
            }
        }
    }
}
vs. Python (uses the reference implementation via the cityhash module)
import cityhash
# Print the CityHash64 of each test symbol repeated 128 and 192 times,
# one "symbol, multiplier: 0xhash" line per input.
for symbol in 'abxy':
    for multiplier in range(2, 4):
        text = symbol * (multiplier * 64)
        digest = cityhash.CityHash64(text)
        print("{}, {}: {:#x}".format(symbol, multiplier, digest))
vs. C++ (using https://github.com/google/cityhash via Conan)
#include <string>
#include <cstdio>
#include "city.h"
// Prints the CityHash64 of each test symbol repeated 128 and 192 times,
// one "symbol, multiplier: 0xhash" line per input.
int main(int, char*[]) {
    for (const auto symbol : {'a', 'b', 'x', 'y'}) {
        for (const auto length : {2, 3}) {
            const auto s = std::string(length*64, symbol);
            const auto hash = CityHash64(s.data(), s.size());
            // The hash is 64 bits wide, but %lx expects an unsigned long, which is
            // only 32 bits on some platforms (e.g. Windows). Cast to unsigned long
            // long and use %llx so the format and argument always agree.
            printf("%c, %d: 0x%llx\n", symbol, length,
                   static_cast<unsigned long long>(hash));
        }
    }
    return 0;
}
C#:
a, 2: 0x17eb9429608efa10
a, 3: 0xd173291f9db2d8d1
b, 2: 0xd7f220816e41070d
b, 3: 0x36074be8fc81c410
x, 2: 0x77f3f0a5f76761d5
x, 3: 0xfe9c5c96274e4df9
y, 2: 0x85b294ba426c41c7
y, 3: 0xd7dcefe6faea4424
Python and C++:
a, 2: 0x8732752111926e2c
a, 3: 0xf7b22b0a38b54ca8
b, 2: 0x9f0c541d796fd1f1
b, 3: 0x453fb3d655153452
x, 2: 0x87e1532e643b0d29
x, 3: 0x128cf8134a32840
y, 2: 0xdf18ce2fbf974758
y, 3: 0x28e79fea5420f5f5
It looks like the two implementations differ in how they process the tail (the final bytes) of the input.