Commit 6454151b authored by Philip Trettner

Merge branch 'update_xxhash' into 'develop'

Update xxHash source files

See merge request !40
parents 8f61a37a af981cf9
/*
* xxHash - Extremely Fast Hash algorithm
* Development source file for `xxh3`
* Copyright (C) 2019-2020 Yann Collet
* Copyright (C) 2019-2021 Yann Collet
*
* BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php)
*
@@ -34,2339 +34,22 @@
*/
/*
* Note: This file is separated for development purposes.
* It will be integrated into `xxhash.h` when development stage is completed.
*
* Credit: most of the work on vectorial and asm variants comes from @easyaspi314
*/
/*
* Note: This file used to host the source code of XXH3_* variants
* during the development period.
* The source code is now properly integrated within xxhash.h.
*
* xxh3.h is no longer useful,
* but it is still provided for compatibility with source code
* which used to include it directly.
*
* Programs are now highly discouraged to include xxh3.h.
* Include `xxhash.h` instead, which is the officially supported interface.
*
* In the future, xxh3.h will start to generate warnings, then errors,
* then it will be removed from source package and from include directory.
*/
#ifndef XXH3_H_1397135465
#define XXH3_H_1397135465
/* === Dependencies === */
#ifndef XXHASH_H_5627135585666179
/* special: when including `xxh3.h` directly, turn on XXH_INLINE_ALL */
#undef XXH_INLINE_ALL /* avoid redefinition */
#define XXH_INLINE_ALL
#endif
#include "xxhash.hh"
/* === Compiler specifics === */
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* >= C99 */
#define XXH_RESTRICT restrict
#else
/* Note: it might be useful to define __restrict or __restrict__ for some C++ compilers */
#define XXH_RESTRICT /* disable */
#endif
#if (defined(__GNUC__) && (__GNUC__ >= 3)) || (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) || defined(__clang__)
#define XXH_likely(x) __builtin_expect(x, 1)
#define XXH_unlikely(x) __builtin_expect(x, 0)
#else
#define XXH_likely(x) (x)
#define XXH_unlikely(x) (x)
#endif
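/*
* Example of intended use (a sketch, not a requirement): mark the hot path,
* e.g.
*
* if (XXH_likely(len <= 16)) { ... handle short input ... }
*
* On compilers without __builtin_expect, these expand to plain (x).
*/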
#if defined(__GNUC__)
#if defined(__AVX2__)
#include <immintrin.h>
#elif defined(__SSE2__)
#include <emmintrin.h>
#elif defined(__ARM_NEON__) || defined(__ARM_NEON)
#define inline __inline__ /* clang bug */
#include <arm_neon.h>
#undef inline
#endif
#elif defined(_MSC_VER)
#include <intrin.h>
#endif
/*
* One goal of XXH3 is to make it fast on both 32-bit and 64-bit, while
* remaining a true 64-bit/128-bit hash function.
*
* This is done by prioritizing a subset of 64-bit operations that can be
* emulated without too many steps on the average 32-bit machine.
*
* For example, these two lines seem similar, and run equally fast on 64-bit:
*
* xxh_u64 x;
* x ^= (x >> 47); // good
* x ^= (x >> 13); // bad
*
* However, to a 32-bit machine, there is a major difference.
*
* x ^= (x >> 47) looks like this:
*
* x.lo ^= (x.hi >> (47 - 32));
*
* while x ^= (x >> 13) looks like this:
*
* // note: funnel shifts are not usually cheap.
* x.lo ^= (x.lo >> 13) | (x.hi << (32 - 13));
* x.hi ^= (x.hi >> 13);
*
* The first one is significantly faster than the second, simply because the
* shift is larger than 32. This means:
* - All the bits we need are in the upper 32 bits, so we can ignore the lower
* 32 bits in the shift.
* - The shift result will always fit in the lower 32 bits, and therefore,
* we can ignore the upper 32 bits in the xor.
*
* Thanks to this optimization, XXH3 only requires these features to be efficient:
*
* - Usable unaligned access
* - A 32-bit or 64-bit ALU
* - If 32-bit, a decent ADC instruction
* - A 32 or 64-bit multiply with a 64-bit result
* - For the 128-bit variant, a decent byteswap helps short inputs.
*
* The first two are already required by XXH32, and almost all 32-bit and 64-bit
* platforms which can run XXH32 can run XXH3 efficiently.
*
* Thumb-1, the classic 16-bit only subset of ARM's instruction set, is one
* notable exception.
*
* First of all, Thumb-1 lacks support for the UMULL instruction which
* performs the important long multiply. This means numerous __aeabi_lmul
* calls.
*
* Second of all, the 8 functional registers are just not enough.
* Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic need
* Lo registers, and this shuffling results in thousands more MOVs than A32.
*
* A32 and T32 don't have this limitation. They can access all 14 registers,
* do a 32->64 multiply with UMULL, and the flexible operand allowing free
* shifts is helpful, too.
*
* Therefore, we do a quick sanity check.
*
* If compiling Thumb-1 for a target which supports ARM instructions, we will
* emit a warning, as it is not a "sane" platform to compile for.
*
* Usually, if this happens, it is because of an accident and you probably need
* to specify -march, as you likely meant to compile for a newer architecture.
*/
#if defined(__thumb__) && !defined(__thumb2__) && defined(__ARM_ARCH_ISA_ARM)
#warning "XXH3 is highly inefficient without ARM or Thumb-2."
#endif
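/*
* A minimal sketch of the 32-bit lowering described above (the struct and
* function below are illustrative assumptions, not xxHash code): with a
* shift >= 32, the emulated 64-bit xorshift only touches one word.
*/
#if 0
typedef struct { xxh_u32 lo, hi; } xxh_u64_pair; /* emulated 64-bit value */
static xxh_u64_pair XXH_xorshift47_emulated(xxh_u64_pair x)
{
/* shift >= 32: only the high word contributes, and the result always
 * fits in the low word, so no funnel shift and no carry are needed */
x.lo ^= (x.hi >> (47 - 32));
return x;
}
#endif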
/* ==========================================
* Vectorization detection
* ========================================== */
#define XXH_SCALAR 0 /* Portable scalar version */
#define XXH_SSE2 1 /* SSE2 for Pentium 4 and all x86_64 */
#define XXH_AVX2 2 /* AVX2 for Haswell and Bulldozer */
#define XXH_NEON 3 /* NEON for most ARMv7-A and all AArch64 */
#define XXH_VSX 4 /* VSX and ZVector for POWER8/z13 */
#define XXH_AVX512 5 /* AVX512 for Skylake and Icelake */
#ifndef XXH_VECTOR /* can be defined on command line */
#if defined(__AVX512F__)
#define XXH_VECTOR XXH_AVX512
#elif defined(__AVX2__)
#define XXH_VECTOR XXH_AVX2
#elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP == 2))
#define XXH_VECTOR XXH_SSE2
#elif defined(__GNUC__) /* msvc support maybe later */ \
&& (defined(__ARM_NEON__) || defined(__ARM_NEON)) \
&& (defined(__LITTLE_ENDIAN__) /* We only support little endian NEON */ \
|| (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__))
#define XXH_VECTOR XXH_NEON
#elif (defined(__PPC64__) && defined(__POWER8_VECTOR__)) || (defined(__s390x__) && defined(__VEC__)) && defined(__GNUC__) /* TODO: IBM XL */
#define XXH_VECTOR XXH_VSX
#else
#define XXH_VECTOR XXH_SCALAR
#endif
#endif
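/*
* Example (illustrative, not a build requirement): the detection above can be
* overridden from the command line, e.g. to force the portable scalar path:
*
* cc -O3 -DXXH_VECTOR=0 file.c          (0 == XXH_SCALAR)
*
* or to force SSE2 explicitly:
*
* cc -O3 -msse2 -DXXH_VECTOR=1 file.c   (1 == XXH_SSE2)
*/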
/*
* Controls the alignment of the accumulator.
* This is for compatibility with aligned vector loads, which are usually faster.
*/
#ifndef XXH_ACC_ALIGN
#if XXH_VECTOR == XXH_SCALAR /* scalar */
#define XXH_ACC_ALIGN 8
#elif XXH_VECTOR == XXH_SSE2 /* sse2 */
#define XXH_ACC_ALIGN 16
#elif XXH_VECTOR == XXH_AVX2 /* avx2 */
#define XXH_ACC_ALIGN 32
#elif XXH_VECTOR == XXH_NEON /* neon */
#define XXH_ACC_ALIGN 16
#elif XXH_VECTOR == XXH_VSX /* vsx */
#define XXH_ACC_ALIGN 16
#elif XXH_VECTOR == XXH_AVX512 /* avx512 */
#define XXH_ACC_ALIGN 64
#endif
#endif
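/*
* For illustration (the array name and length below are assumptions, not
* definitions from this header): the accumulator is expected to be declared
* with this alignment so the SIMD code paths can use aligned loads and stores:
*
* XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[8];
*/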
/*
* UGLY HACK:
* GCC usually generates the best code with -O3 for xxHash.
*
* However, when targeting AVX2, it is overzealous in its unrolling resulting
* in code roughly 3/4 the speed of Clang.
*
* There are other issues, such as GCC splitting _mm256_loadu_si256 into
* _mm_loadu_si128 + _mm256_inserti128_si256. This is an optimization which
* only applies to Sandy and Ivy Bridge... which don't even support AVX2.
*
* That is why when compiling the AVX2 version, it is recommended to use either
* -O2 -mavx2 -march=haswell
* or
* -O2 -mavx2 -mno-avx256-split-unaligned-load
* for decent performance, or to use Clang instead.
*
* Fortunately, we can control the first one with a pragma that forces GCC into
* -O2, but the other one we can't control without "failed to inline always
* inline function due to target mismatch" warnings.
*/
#if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \
&& defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
&& defined(__OPTIMIZE__) && !defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */
#pragma GCC push_options
#pragma GCC optimize("-O2")
#endif
#if XXH_VECTOR == XXH_NEON
/*
* NEON's setup for vmlal_u32 is a little more complicated than it is on
* SSE2, AVX2, and VSX.
*
* While PMULUDQ and VMULEUW both perform a mask, VMLAL.U32 performs an upcast.
*
* To do the same operation, the 128-bit 'Q' register needs to be split into
* two 64-bit 'D' registers, performing this operation:
*
* [ a | b ]
* | '---------. .--------' |
* | x |
* | .---------' '--------. |
* [ a & 0xFFFFFFFF | b & 0xFFFFFFFF ],[ a >> 32 | b >> 32 ]
*
* Due to significant changes in aarch64, the fastest method for aarch64 is
* completely different than the fastest method for ARMv7-A.
*
* ARMv7-A treats D registers as unions overlaying Q registers, so modifying
* D11 will modify the high half of Q5. This is similar to how modifying AH
* will only affect bits 8-15 of AX on x86.
*
* VZIP takes two registers, and puts even lanes in one register and odd lanes
* in the other.
*
* On ARMv7-A, this strangely modifies both parameters in place instead of
* taking the usual 3-operand form.
*
* Therefore, if we want to do this, we can simply use a D-form VZIP.32 on the
* lower and upper halves of the Q register to end up with the high and low
* halves where we want - all in one instruction.
*
* vzip.32 d10, d11 @ d10 = { d10[0], d11[0] }; d11 = { d10[1], d11[1] }
*
* Unfortunately we need inline assembly for this: an instruction that modifies
* two registers at once cannot be expressed in GCC's or Clang's IR, so they
* would have to create a copy.
*
* aarch64 requires a different approach.
*
* In order to make it easier to write a decent compiler for aarch64, many
* quirks were removed, such as conditional execution.
*
* NEON was also affected by this.
*
* aarch64 cannot access the high bits of a Q-form register, and writes to a
* D-form register zero the high bits, similar to how writes to W-form scalar
* registers (or DWORD registers on x86_64) work.
*
* The formerly free vget_high intrinsics now require a vext (with a few
* exceptions)
*
* Additionally, VZIP was replaced by ZIP1 and ZIP2, which are the equivalent
* of PUNPCKL* and PUNPCKH* in SSE, respectively, in order to only modify one
* operand.
*
* The equivalent of the VZIP.32 on the lower and upper halves would be this
* mess:
*
* ext v2.4s, v0.4s, v0.4s, #2 // v2 = { v0[2], v0[3], v0[0], v0[1] }
* zip1 v1.2s, v0.2s, v2.2s // v1 = { v0[0], v2[0] }
* zip2 v0.2s, v0.2s, v1.2s // v0 = { v0[1], v2[1] }
*
* Instead, we use a literal downcast, vmovn_u64 (XTN), and vshrn_n_u64 (SHRN):
*
* shrn v1.2s, v0.2d, #32 // v1 = (uint32x2_t)(v0 >> 32);
* xtn v0.2s, v0.2d // v0 = (uint32x2_t)(v0 & 0xFFFFFFFF);
*
* This is available on ARMv7-A, but is less efficient than a single VZIP.32.
*/
/*
* Function-like macro:
* void XXH_SPLIT_IN_PLACE(uint64x2_t &in, uint32x2_t &outLo, uint32x2_t &outHi)
* {
* outLo = (uint32x2_t)(in & 0xFFFFFFFF);
* outHi = (uint32x2_t)(in >> 32);
* in = UNDEFINED;
* }
*/
#if !defined(XXH_NO_VZIP_HACK) /* define to disable */ \
&& defined(__GNUC__) && !defined(__aarch64__) && !defined(__arm64__)
#define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \
do \
{ \
/* Undocumented GCC/Clang operand modifier: %e0 = lower D half, %f0 = upper D half */ \
/* https://github.com/gcc-mirror/gcc/blob/38cf91e5/gcc/config/arm/arm.c#L22486 */ \
/* https://github.com/llvm-mirror/llvm/blob/2c4ca683/lib/Target/ARM/ARMAsmPrinter.cpp#L399 */ \
__asm__("vzip.32 %e0, %f0" : "+w"(in)); \
(outLo) = vget_low_u32(vreinterpretq_u32_u64(in)); \
(outHi) = vget_high_u32(vreinterpretq_u32_u64(in)); \
} while (0)
#else
#define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \
do \
{ \
(outLo) = vmovn_u64(in); \
(outHi) = vshrn_n_u64((in), 32); \
} while (0)
#endif
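/*
* Usage sketch (a hypothetical helper, not code used by xxHash): split each
* 64-bit lane into its 32-bit halves, then feed them to a widening
* multiply-accumulate, which is the pattern described above.
*/
#if 0
XXH_FORCE_INLINE uint64x2_t XXH_neon_split_demo(uint64x2_t acc, uint64x2_t v)
{
uint32x2_t v_lo, v_hi;
XXH_SPLIT_IN_PLACE(v, v_lo, v_hi); /* v_lo = low halves, v_hi = high halves */
return vmlal_u32(acc, v_lo, v_hi); /* lane-wise acc += (u64)v_lo * v_hi (VMLAL.U32) */
}
#endif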
#endif /* XXH_VECTOR == XXH_NEON */
/*
* VSX and Z Vector helpers.
*
* This is very messy, and any pull requests to clean this up are welcome.
*
* There are a lot of problems with supporting VSX and s390x, due to
* inconsistent intrinsics, spotty coverage, and multiple endiannesses.
*/
#if XXH_VECTOR == XXH_VSX
#if defined(__s390x__)
#include <s390intrin.h>
#else
#include <altivec.h>
#endif
#undef vector /* Undo the pollution */
typedef __vector unsigned long long xxh_u64x2;
typedef __vector unsigned char xxh_u8x16;
typedef __vector unsigned xxh_u32x4;
#ifndef XXH_VSX_BE
#if defined(__BIG_ENDIAN__) || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
#define XXH_VSX_BE 1
#elif defined(__VEC_ELEMENT_REG_ORDER__) && __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__
#warning "-maltivec=be is not recommended. Please use native endianness."
#define XXH_VSX_BE 1
#else
#define XXH_VSX_BE 0
#endif
#endif /* !defined(XXH_VSX_BE) */
#if XXH_VSX_BE
/* A wrapper for POWER9's vec_revb. */
#if defined(__POWER9_VECTOR__) || (defined(__clang__) && defined(__s390x__))
#define XXH_vec_revb vec_revb
#else
XXH_FORCE_INLINE xxh_u64x2 XXH_vec_revb(xxh_u64x2 val)
{
xxh_u8x16 const vByteSwap = {0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08};
return vec_perm(val, val, vByteSwap);
}
#endif
#endif /* XXH_VSX_BE */
/*
* Performs an unaligned load and byte swaps it on big endian.
*/
XXH_FORCE_INLINE xxh_u64x2 XXH_vec_loadu(const void* ptr)
{
xxh_u64x2 ret;
memcpy(&ret, ptr, sizeof(xxh_u64x2));
#if XXH_VSX_BE
ret = XXH_vec_revb(ret);
#endif
return ret;
}
/*
* vec_mulo and vec_mule are very problematic intrinsics on PowerPC.
*
* These intrinsics weren't added until GCC 8, despite existing for a while,
* and they are endian dependent. Also, their meaning swaps depending on the version.
*/
#if defined(__s390x__)
/* s390x is always big endian, no issue on this platform */
#define XXH_vec_mulo vec_mulo
#define XXH_vec_mule vec_mule
#elif defined(__clang__) && __has_builtin(__builtin_altivec_vmuleuw)
/* Clang has a better way to control this, we can just use the builtin which doesn't swap. */
#define XXH_vec_mulo __builtin_altivec_vmulouw
#define XXH_vec_mule __builtin_altivec_vmuleuw
#else
/* gcc needs inline assembly */
/* Adapted from https://github.com/google/highwayhash/blob/master/highwayhash/hh_vsx.h. */
XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mulo(xxh_u32x4 a, xxh_u32x4 b)
{
xxh_u64x2 result;
__asm__("vmulouw %0, %1, %2" : "=v"(result) : "v"(a), "v"(b));
return result;
}
XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mule(xxh_u32x4 a, xxh_u32x4 b)
{
xxh_u64x2 result;
__asm__("vmuleuw %0, %1, %2" : "=v"(result) : "v"(a), "v"(b));
return result;
}
#endif /* XXH_vec_mulo, XXH_vec_mule */
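/*
* Usage sketch (a hypothetical helper, not code used by xxHash): widen 32-bit
* lanes into 64-bit products. Which lanes count as "even" vs "odd" depends on
* endianness and compiler version, which is exactly why the wrappers above exist.
*/
#if 0
XXH_FORCE_INLINE void XXH_vec_mul_demo(xxh_u32x4 a, xxh_u32x4 b, xxh_u64x2* even, xxh_u64x2* odd)
{
*even = XXH_vec_mule(a, b); /* products of one set of alternating 32-bit lanes */
*odd = XXH_vec_mulo(a, b); /* products of the other set */
}
#endif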
#endif /* XXH_VECTOR == XXH_VSX */
/* prefetch
* can be disabled by defining the XXH_NO_PREFETCH build macro */
#if defined(XXH_NO_PREFETCH)
#define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */
#else
#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86)) /* _mm_prefetch() is not defined outside of x86/x64 */
#include <mmintrin.h> /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */
#define XXH_PREFETCH(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0)
#elif defined(__GNUC__) && ((__GNUC__ >= 4) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 1)))
#define XXH_PREFETCH(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */)
#else
#define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */
#endif
#endif /* XXH_NO_PREFETCH */
/* ==========================================
* XXH3 default settings
* ========================================== */
#define XXH_SECRET_DEFAULT_SIZE 192 /* minimum XXH3_SECRET_SIZE_MIN */
#if (XXH_SECRET_DEFAULT_SIZE < XXH3_SECRET_SIZE_MIN)
#error "default keyset is not large enough"
#endif
/* Pseudorandom secret taken directly from FARSH */
XXH_ALIGN(64)
static const xxh_u8 kSecret[XXH_SECRET_DEFAULT_SIZE] = {
0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c, 0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90,
0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f, 0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d,
0xcc, 0xff, 0x72, 0x21, 0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c, 0x3c, 0x28,
0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3, 0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e,
0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8, 0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b,
0x4f, 0x1d, 0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64,
0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb, 0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0,
0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e, 0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1,
0x7a, 0xd0, 0x31, 0xce, 0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e,
};
/*
* Calculates a 32-bit to 64-bit long multiply.
*
* Wraps __emulu on MSVC x86 because it tends to call __allmul when it doesn't
* need to (but it shouldn't need to anyways, it is about 7 instructions to do
* a 64x64 multiply...). Since we know that this will _always_ emit MULL, we
* use that instead of the normal method.
*
* If you are compiling for platforms like Thumb-1 and don't have a better option,
* you may also want to write your own long multiply routine here.
*
* XXH_FORCE_INLINE xxh_u64 XXH_mult32to64(xxh_u64 x, xxh_u64 y)
* {
* return (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF);
* }
*/
#if defined(_MSC_VER) && defined(_M_IX86)
#include <intrin.h>
#define XXH_mult32to64(x, y) __emulu((unsigned)(x), (unsigned)(y))
#else
/*
* Downcast + upcast is usually better than masking on older compilers like
* GCC 4.2 (especially 32-bit ones), all without affecting newer compilers.
*
* The other method, (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF), will AND both operands
* and perform a full 64x64 multiply -- entirely redundant on 32-bit.
*/
#define XXH_mult32to64(x, y) ((xxh_u64)(xxh_u32)(x) * (xxh_u64)(xxh_u32)(y))
#endif
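/*
* Quick arithmetic check (for illustration only, not a test in this file):
*
* XXH_mult32to64(0xFFFFFFFF, 0xFFFFFFFF) == 0xFFFFFFFE00000001ULL
*
* i.e. (2^32 - 1)^2 = 2^64 - 2^33 + 1, which needs the full 64-bit result.
*/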
/*
* Calculates a 64->128-bit long multiply.
*
* Uses __uint128_t and _umul128 if available, otherwise uses a scalar version.
*/
static XXH128_hash_t XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs)
{
/*
* GCC/Clang __uint128_t method.
*
* On most 64-bit targets, GCC and Clang define a __uint128_t type.
* This is usually the best way as it usually uses a native long 64-bit
* multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64.
*
* Usually.
*
* However, on some 32-bit platforms, Clang (and Emscripten) define this type
* despite not having the arithmetic for it. This results in a laggy
* compiler builtin call which calculates a full 128-bit multiply.
* In that case it is best to use the portable one.
* https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677
*/
#if defined(__GNUC__) && !defined(__wasm__) && defined(__SIZEOF_INT128__) || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)
__uint128_t const product = (__uint128_t)lhs * (__uint128_t)rhs;
XXH128_hash_t r128;
r128.low64 = (xxh_u64)(product);
r128.high64 = (xxh_u64)(product >> 64);
return r128;
/*
* MSVC for x64's _umul128 method.
*
* xxh_u64 _umul128(xxh_u64 Multiplier, xxh_u64 Multiplicand, xxh_u64 *HighProduct);
*
* This compiles to single operand MUL on x64.
*/
#elif defined(_M_X64) || defined(_M_IA64)
#ifndef _MSC_VER
#pragma intrinsic(_umul128)
#endif
xxh_u64 product_high;
xxh_u64 const product_low = _umul128(lhs, rhs, &product_high);
XXH128_hash_t r128;
r128.low64 = product_low;
r128.high64 = product_high;
return r128;
#else
/*
* Portable scalar method. Optimized for 32-bit and 64-bit ALUs.
*
* This is a fast and simple grade school multiply, which is shown below
* with base 10 arithmetic instead of base 0x100000000.
*
* 9 3 // D2 lhs = 93
* x 7 5 // D2 rhs = 75
* ----------
* 1 5 // D2 lo_lo = (93 % 10) * (75 % 10) = 15
* 4 5 | // D2 hi_lo = (93 / 10) * (75 % 10) = 45
* 2 1 | // D2 lo_hi = (93 % 10) * (75 / 10) = 21
* + 6 3 | | // D2 hi_hi = (93 / 10) * (75 / 10) = 63
* ---------
* 2 7 | // D2 cross = (15 / 10) + (45 % 10) + 21 = 27
* + 6 7 | | // D2 upper = (27 / 10) + (45 / 10) + 63 = 67
* ---------
* 6 9 7 5 // D4 res = (27 * 10) + (15 % 10) + (67 * 100) = 6975
*
* The reasons for adding the products like this are:
* 1. It avoids manual carry tracking. Just like how
* (9 * 9) + 9 + 9 = 99, the same applies with this for UINT64_MAX.
* This avoids a lot of complexity.
*
* 2. It hints for, and on Clang, compiles to, the powerful UMAAL
* instruction available in ARM's Digital Signal Processing extension
* in 32-bit ARMv6 and later, which is shown below:
*
* void UMAAL(xxh_u32 *RdLo, xxh_u32 *RdHi, xxh_u32 Rn, xxh_u32 Rm)
* {
* xxh_u64 product = (xxh_u64)*RdLo * (xxh_u64)*RdHi + Rn + Rm;
* *RdLo = (xxh_u32)(product & 0xFFFFFFFF);
* *RdHi = (xxh_u32)(product >> 32);
* }
*
* This instruction was designed for efficient long multiplication, and
* allows this to be calculated in only 4 instructions at speeds
* comparable to some 64-bit ALUs.
*
* 3. It isn't terrible on other platforms. Usually this will be a couple
* of 32-bit ADD/ADCs.
*/
/* First calculate all of the cross products. */
xxh_u64 const lo_lo = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs & 0xFFFFFFFF);
xxh_u64 const hi_lo = XXH_mult32to64(lhs >> 32, rhs & 0xFFFFFFFF);
xxh_u64 const lo_hi = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs >> 32);
xxh_u64 const hi_hi = XXH_mult32to64(lhs >> 32, rhs >> 32);
/* Now add the products together. These will never overflow. */
xxh_u64 const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi;
xxh_u64 const upper = (hi_lo >> 32) + (cross >> 32) + hi_hi;
xxh_u64 const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF);
XXH128_hash_t r128;
r128.low64 = lower;
r128.high64 = upper;
return r128;
#endif
}
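/*
* Quick arithmetic check (for illustration only, not a test in this file):
* XXH_mult64to128(0xFFFFFFFFFFFFFFFFULL, 2) yields
* high64 == 1 and low64 == 0xFFFFFFFFFFFFFFFEULL,
* since (2^64 - 1) * 2 = 2^65 - 2.
*/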
/*
* Does a 64-bit to 128-bit multiply, then XOR folds it.
*
* The reason for the separate function is to prevent passing too many structs
* around by value. This will hopefully inline the multiply, but we don't force it.
*/
static xxh_u64 XXH3_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs)
{
XXH128_hash_t product = XXH_mult64to128(lhs, rhs);
return product.low64 ^ product.high64;
}
/* Seems to produce slightly better code on GCC for some reason. */
XXH_FORCE_INLINE xxh_u64 XXH_xorshift64(xxh_u64 v64, int shift)
{
XXH_ASSERT(0 <= shift && shift < 64);
return v64 ^ (v64 >> shift);
}
/*
* We don't need to (or want to) mix as much as XXH64.
*
* Short hashes are more evenly distributed, so it isn't necessary.
*/
static XXH64_hash_t XXH3_avalanche(xxh_u64 h64)
{
h64 = XXH_xorshift64(h64, 37);
h64 *= 0x165667919E3779F9ULL;
h64 = XXH_xorshift64(h64, 32);
return h64;
}