/*
 * xxHash - Extremely Fast Hash algorithm
 * Development source file for `xxh3`
 * Copyright (C) 2019-2020 Yann Collet
 *
 * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php)
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 *
 *    * Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *    * Redistributions in binary form must reproduce the above
 *      copyright notice, this list of conditions and the following disclaimer
 *      in the documentation and/or other materials provided with the
 *      distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * You can contact the author at:
 *   - xxHash homepage: https://www.xxhash.com
 *   - xxHash source repository: https://github.com/Cyan4973/xxHash
 */

/*
 * Note: This file is separated for development purposes.
 * It will be integrated into `xxhash.h` when the development stage is completed.
 *
 * Credit: most of the work on vectorial and asm variants comes from @easyaspi314
 */

#ifndef XXH3_H_1397135465
#define XXH3_H_1397135465

/* === Dependencies === */
#ifndef XXHASH_H_5627135585666179
/* special: when including `xxh3.h` directly, turn on XXH_INLINE_ALL */
#  undef XXH_INLINE_ALL   /* avoid redefinition */
#  define XXH_INLINE_ALL
#endif
#include "xxhash.h"


/* === Compiler specifics === */

#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* >= C99 */
#  define XXH_RESTRICT   restrict
#else
/* Note: it might be useful to define __restrict or __restrict__ for some C++ compilers */
#  define XXH_RESTRICT   /* disable */
#endif

#if (defined(__GNUC__) && (__GNUC__ >= 3)) \
  || (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) \
  || defined(__clang__)
#  define XXH_likely(x) __builtin_expect(x, 1)
#  define XXH_unlikely(x) __builtin_expect(x, 0)
#else
#  define XXH_likely(x) (x)
#  define XXH_unlikely(x) (x)
#endif

#if defined(__GNUC__)
#  if defined(__AVX2__)
#    include <immintrin.h>
#  elif defined(__SSE2__)
#    include <emmintrin.h>
#  elif defined(__ARM_NEON__) || defined(__ARM_NEON)
#    define inline __inline__  /* clang bug */
#    include <arm_neon.h>
#    undef inline
#  endif
#elif defined(_MSC_VER)
#  include <intrin.h>
#endif

/*
 * One goal of XXH3 is to make it fast on both 32-bit and 64-bit, while
 * remaining a true 64-bit/128-bit hash function.
 *
 * This is done by prioritizing a subset of 64-bit operations that can be
 * emulated without too many steps on the average 32-bit machine.
 *
 * For example, these two lines seem similar, and run equally fast on 64-bit:
 *
 *   xxh_u64 x;
 *   x ^= (x >> 47); // good
 *   x ^= (x >> 13); // bad
 *
 * However, to a 32-bit machine, there is a major difference.
 *
 * x ^= (x >> 47) looks like this:
 *
 *   x.lo ^= (x.hi >> (47 - 32));
 *
 * while x ^= (x >> 13) looks like this:
 *
 *   // note: funnel shifts are not usually cheap.
 *   x.lo ^= (x.lo >> 13) | (x.hi << (32 - 13));
 *   x.hi ^= (x.hi >> 13);
 *
 * The first one is significantly faster than the second, simply because the
 * shift is larger than 32. This means:
 *  - All the bits we need are in the upper 32 bits, so we can ignore the lower
 *    32 bits in the shift.
 *  - The shift result will always fit in the lower 32 bits, and therefore,
 *    we can ignore the upper 32 bits in the xor.
 *
 * Thanks to this optimization, XXH3 only requires these features to be efficient:
 *
 *  - Usable unaligned access
 *  - A 32-bit or 64-bit ALU
 *      - If 32-bit, a decent ADC instruction
 *  - A 32 or 64-bit multiply with a 64-bit result
 *  - For the 128-bit variant, a decent byteswap helps short inputs.
 *
 * The first two are already required by XXH32, and almost all 32-bit and 64-bit
 * platforms which can run XXH32 can run XXH3 efficiently.
 *
 * Thumb-1, the classic 16-bit only subset of ARM's instruction set, is one
 * notable exception.
 *
 * First of all, Thumb-1 lacks support for the UMULL instruction which
 * performs the important long multiply.
This means numerous __aeabi_lmul 00136 * calls. 00137 * 00138 * Second of all, the 8 functional registers are just not enough. 00139 * Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic need 00140 * Lo registers, and this shuffling results in thousands more MOVs than A32. 00141 * 00142 * A32 and T32 don't have this limitation. They can access all 14 registers, 00143 * do a 32->64 multiply with UMULL, and the flexible operand allowing free 00144 * shifts is helpful, too. 00145 * 00146 * Therefore, we do a quick sanity check. 00147 * 00148 * If compiling Thumb-1 for a target which supports ARM instructions, we will 00149 * emit a warning, as it is not a "sane" platform to compile for. 00150 * 00151 * Usually, if this happens, it is because of an accident and you probably need 00152 * to specify -march, as you likely meant to compile for a newer architecture. 00153 */ 00154 #if defined(__thumb__) && !defined(__thumb2__) && defined(__ARM_ARCH_ISA_ARM) 00155 # warning "XXH3 is highly inefficient without ARM or Thumb-2." 00156 #endif 00157 00158 /* ========================================== 00159 * Vectorization detection 00160 * ========================================== */ 00161 #define XXH_SCALAR 0 /* Portable scalar version */ 00162 #define XXH_SSE2 1 /* SSE2 for Pentium 4 and all x86_64 */ 00163 #define XXH_AVX2 2 /* AVX2 for Haswell and Bulldozer */ 00164 #define XXH_NEON 3 /* NEON for most ARMv7-A and all AArch64 */ 00165 #define XXH_VSX 4 /* VSX and ZVector for POWER8/z13 */ 00166 #define XXH_AVX512 5 /* AVX512 for Skylake and Icelake */ 00167 00168 #ifndef XXH_VECTOR /* can be defined on command line */ 00169 # if defined(__AVX512F__) 00170 # define XXH_VECTOR XXH_AVX512 00171 # elif defined(__AVX2__) 00172 # define XXH_VECTOR XXH_AVX2 00173 # elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP == 2)) 00174 # define XXH_VECTOR XXH_SSE2 00175 # elif defined(__GNUC__) /* msvc support maybe later */ \ 00176 && (defined(__ARM_NEON__) || defined(__ARM_NEON)) \ 00177 && (defined(__LITTLE_ENDIAN__) /* We only support little endian NEON */ \ 00178 || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) 00179 # define XXH_VECTOR XXH_NEON 00180 # elif (defined(__PPC64__) && defined(__POWER8_VECTOR__)) \ 00181 || (defined(__s390x__) && defined(__VEC__)) \ 00182 && defined(__GNUC__) /* TODO: IBM XL */ 00183 # define XXH_VECTOR XXH_VSX 00184 # else 00185 # define XXH_VECTOR XXH_SCALAR 00186 # endif 00187 #endif 00188 00189 /* 00190 * Controls the alignment of the accumulator. 00191 * This is for compatibility with aligned vector loads, which are usually faster. 00192 */ 00193 #ifndef XXH_ACC_ALIGN 00194 # if XXH_VECTOR == XXH_SCALAR /* scalar */ 00195 # define XXH_ACC_ALIGN 8 00196 # elif XXH_VECTOR == XXH_SSE2 /* sse2 */ 00197 # define XXH_ACC_ALIGN 16 00198 # elif XXH_VECTOR == XXH_AVX2 /* avx2 */ 00199 # define XXH_ACC_ALIGN 32 00200 # elif XXH_VECTOR == XXH_NEON /* neon */ 00201 # define XXH_ACC_ALIGN 16 00202 # elif XXH_VECTOR == XXH_VSX /* vsx */ 00203 # define XXH_ACC_ALIGN 16 00204 # elif XXH_VECTOR == XXH_AVX512 /* avx512 */ 00205 # define XXH_ACC_ALIGN 64 00206 # endif 00207 #endif 00208 00209 /* 00210 * UGLY HACK: 00211 * GCC usually generates the best code with -O3 for xxHash. 00212 * 00213 * However, when targeting AVX2, it is overzealous in its unrolling resulting 00214 * in code roughly 3/4 the speed of Clang. 
00215 * 00216 * There are other issues, such as GCC splitting _mm256_loadu_si256 into 00217 * _mm_loadu_si128 + _mm256_inserti128_si256. This is an optimization which 00218 * only applies to Sandy and Ivy Bridge... which don't even support AVX2. 00219 * 00220 * That is why when compiling the AVX2 version, it is recommended to use either 00221 * -O2 -mavx2 -march=haswell 00222 * or 00223 * -O2 -mavx2 -mno-avx256-split-unaligned-load 00224 * for decent performance, or to use Clang instead. 00225 * 00226 * Fortunately, we can control the first one with a pragma that forces GCC into 00227 * -O2, but the other one we can't control without "failed to inline always 00228 * inline function due to target mismatch" warnings. 00229 */ 00230 #if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \ 00231 && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ 00232 && defined(__OPTIMIZE__) && !defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */ 00233 # pragma GCC push_options 00234 # pragma GCC optimize("-O2") 00235 #endif 00236 00237 00238 #if XXH_VECTOR == XXH_NEON 00239 /* 00240 * NEON's setup for vmlal_u32 is a little more complicated than it is on 00241 * SSE2, AVX2, and VSX. 00242 * 00243 * While PMULUDQ and VMULEUW both perform a mask, VMLAL.U32 performs an upcast. 00244 * 00245 * To do the same operation, the 128-bit 'Q' register needs to be split into 00246 * two 64-bit 'D' registers, performing this operation:: 00247 * 00248 * [ a | b ] 00249 * | '---------. .--------' | 00250 * | x | 00251 * | .---------' '--------. | 00252 * [ a & 0xFFFFFFFF | b & 0xFFFFFFFF ],[ a >> 32 | b >> 32 ] 00253 * 00254 * Due to significant changes in aarch64, the fastest method for aarch64 is 00255 * completely different than the fastest method for ARMv7-A. 00256 * 00257 * ARMv7-A treats D registers as unions overlaying Q registers, so modifying 00258 * D11 will modify the high half of Q5. This is similar to how modifying AH 00259 * will only affect bits 8-15 of AX on x86. 00260 * 00261 * VZIP takes two registers, and puts even lanes in one register and odd lanes 00262 * in the other. 00263 * 00264 * On ARMv7-A, this strangely modifies both parameters in place instead of 00265 * taking the usual 3-operand form. 00266 * 00267 * Therefore, if we want to do this, we can simply use a D-form VZIP.32 on the 00268 * lower and upper halves of the Q register to end up with the high and low 00269 * halves where we want - all in one instruction. 00270 * 00271 * vzip.32 d10, d11 @ d10 = { d10[0], d11[0] }; d11 = { d10[1], d11[1] } 00272 * 00273 * Unfortunately we need inline assembly for this: Instructions modifying two 00274 * registers at once is not possible in GCC or Clang's IR, and they have to 00275 * create a copy. 00276 * 00277 * aarch64 requires a different approach. 00278 * 00279 * In order to make it easier to write a decent compiler for aarch64, many 00280 * quirks were removed, such as conditional execution. 00281 * 00282 * NEON was also affected by this. 00283 * 00284 * aarch64 cannot access the high bits of a Q-form register, and writes to a 00285 * D-form register zero the high bits, similar to how writes to W-form scalar 00286 * registers (or DWORD registers on x86_64) work. 00287 * 00288 * The formerly free vget_high intrinsics now require a vext (with a few 00289 * exceptions) 00290 * 00291 * Additionally, VZIP was replaced by ZIP1 and ZIP2, which are the equivalent 00292 * of PUNPCKL* and PUNPCKH* in SSE, respectively, in order to only modify one 00293 * operand. 
00294 * 00295 * The equivalent of the VZIP.32 on the lower and upper halves would be this 00296 * mess: 00297 * 00298 * ext v2.4s, v0.4s, v0.4s, #2 // v2 = { v0[2], v0[3], v0[0], v0[1] } 00299 * zip1 v1.2s, v0.2s, v2.2s // v1 = { v0[0], v2[0] } 00300 * zip2 v0.2s, v0.2s, v1.2s // v0 = { v0[1], v2[1] } 00301 * 00302 * Instead, we use a literal downcast, vmovn_u64 (XTN), and vshrn_n_u64 (SHRN): 00303 * 00304 * shrn v1.2s, v0.2d, #32 // v1 = (uint32x2_t)(v0 >> 32); 00305 * xtn v0.2s, v0.2d // v0 = (uint32x2_t)(v0 & 0xFFFFFFFF); 00306 * 00307 * This is available on ARMv7-A, but is less efficient than a single VZIP.32. 00308 */ 00309 00310 /* 00311 * Function-like macro: 00312 * void XXH_SPLIT_IN_PLACE(uint64x2_t &in, uint32x2_t &outLo, uint32x2_t &outHi) 00313 * { 00314 * outLo = (uint32x2_t)(in & 0xFFFFFFFF); 00315 * outHi = (uint32x2_t)(in >> 32); 00316 * in = UNDEFINED; 00317 * } 00318 */ 00319 # if !defined(XXH_NO_VZIP_HACK) /* define to disable */ \ 00320 && defined(__GNUC__) \ 00321 && !defined(__aarch64__) && !defined(__arm64__) 00322 # define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \ 00323 do { \ 00324 /* Undocumented GCC/Clang operand modifier: %e0 = lower D half, %f0 = upper D half */ \ 00325 /* https://github.com/gcc-mirror/gcc/blob/38cf91e5/gcc/config/arm/arm.c#L22486 */ \ 00326 /* https://github.com/llvm-mirror/llvm/blob/2c4ca683/lib/Target/ARM/ARMAsmPrinter.cpp#L399 */ \ 00327 __asm__("vzip.32 %e0, %f0" : "+w" (in)); \ 00328 (outLo) = vget_low_u32 (vreinterpretq_u32_u64(in)); \ 00329 (outHi) = vget_high_u32(vreinterpretq_u32_u64(in)); \ 00330 } while (0) 00331 # else 00332 # define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \ 00333 do { \ 00334 (outLo) = vmovn_u64 (in); \ 00335 (outHi) = vshrn_n_u64 ((in), 32); \ 00336 } while (0) 00337 # endif 00338 #endif /* XXH_VECTOR == XXH_NEON */ 00339 00340 /* 00341 * VSX and Z Vector helpers. 00342 * 00343 * This is very messy, and any pull requests to clean this up are welcome. 00344 * 00345 * There are a lot of problems with supporting VSX and s390x, due to 00346 * inconsistent intrinsics, spotty coverage, and multiple endiannesses. 00347 */ 00348 #if XXH_VECTOR == XXH_VSX 00349 # if defined(__s390x__) 00350 # include <s390intrin.h> 00351 # else 00352 # include <altivec.h> 00353 # endif 00354 00355 # undef vector /* Undo the pollution */ 00356 00357 typedef __vector unsigned long long xxh_u64x2; 00358 typedef __vector unsigned char xxh_u8x16; 00359 typedef __vector unsigned xxh_u32x4; 00360 00361 # ifndef XXH_VSX_BE 00362 # if defined(__BIG_ENDIAN__) \ 00363 || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) 00364 # define XXH_VSX_BE 1 00365 # elif defined(__VEC_ELEMENT_REG_ORDER__) && __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__ 00366 # warning "-maltivec=be is not recommended. Please use native endianness." 00367 # define XXH_VSX_BE 1 00368 # else 00369 # define XXH_VSX_BE 0 00370 # endif 00371 # endif /* !defined(XXH_VSX_BE) */ 00372 00373 # if XXH_VSX_BE 00374 /* A wrapper for POWER9's vec_revb. */ 00375 # if defined(__POWER9_VECTOR__) || (defined(__clang__) && defined(__s390x__)) 00376 # define XXH_vec_revb vec_revb 00377 # else 00378 XXH_FORCE_INLINE xxh_u64x2 XXH_vec_revb(xxh_u64x2 val) 00379 { 00380 xxh_u8x16 const vByteSwap = { 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, 00381 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08 }; 00382 return vec_perm(val, val, vByteSwap); 00383 } 00384 # endif 00385 # endif /* XXH_VSX_BE */ 00386 00387 /* 00388 * Performs an unaligned load and byte swaps it on big endian. 
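 *
 * In other words: the memcpy below performs the unaligned load, and on big
 * endian targets XXH_vec_revb() reverses the bytes of each 64-bit lane, so the
 * lane values match what a little endian target would have read.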
00389 */ 00390 XXH_FORCE_INLINE xxh_u64x2 XXH_vec_loadu(const void *ptr) 00391 { 00392 xxh_u64x2 ret; 00393 memcpy(&ret, ptr, sizeof(xxh_u64x2)); 00394 # if XXH_VSX_BE 00395 ret = XXH_vec_revb(ret); 00396 # endif 00397 return ret; 00398 } 00399 00400 /* 00401 * vec_mulo and vec_mule are very problematic intrinsics on PowerPC 00402 * 00403 * These intrinsics weren't added until GCC 8, despite existing for a while, 00404 * and they are endian dependent. Also, their meaning swap depending on version. 00405 * */ 00406 # if defined(__s390x__) 00407 /* s390x is always big endian, no issue on this platform */ 00408 # define XXH_vec_mulo vec_mulo 00409 # define XXH_vec_mule vec_mule 00410 # elif defined(__clang__) && XXH_HAS_BUILTIN(__builtin_altivec_vmuleuw) 00411 /* Clang has a better way to control this, we can just use the builtin which doesn't swap. */ 00412 # define XXH_vec_mulo __builtin_altivec_vmulouw 00413 # define XXH_vec_mule __builtin_altivec_vmuleuw 00414 # else 00415 /* gcc needs inline assembly */ 00416 /* Adapted from https://github.com/google/highwayhash/blob/master/highwayhash/hh_vsx.h. */ 00417 XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mulo(xxh_u32x4 a, xxh_u32x4 b) 00418 { 00419 xxh_u64x2 result; 00420 __asm__("vmulouw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b)); 00421 return result; 00422 } 00423 XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mule(xxh_u32x4 a, xxh_u32x4 b) 00424 { 00425 xxh_u64x2 result; 00426 __asm__("vmuleuw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b)); 00427 return result; 00428 } 00429 # endif /* XXH_vec_mulo, XXH_vec_mule */ 00430 #endif /* XXH_VECTOR == XXH_VSX */ 00431 00432 00433 /* prefetch 00434 * can be disabled, by declaring XXH_NO_PREFETCH build macro */ 00435 #if defined(XXH_NO_PREFETCH) 00436 # define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */ 00437 #else 00438 # if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_I86)) /* _mm_prefetch() is not defined outside of x86/x64 */ 00439 # include <mmintrin.h> /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */ 00440 # define XXH_PREFETCH(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0) 00441 # elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) ) 00442 # define XXH_PREFETCH(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */) 00443 # else 00444 # define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */ 00445 # endif 00446 #endif /* XXH_NO_PREFETCH */ 00447 00448 00449 /* ========================================== 00450 * XXH3 default settings 00451 * ========================================== */ 00452 00453 #define XXH_SECRET_DEFAULT_SIZE 192 /* minimum XXH3_SECRET_SIZE_MIN */ 00454 00455 #if (XXH_SECRET_DEFAULT_SIZE < XXH3_SECRET_SIZE_MIN) 00456 # error "default keyset is not large enough" 00457 #endif 00458 00459 /* Pseudorandom secret taken directly from FARSH */ 00460 XXH_ALIGN(64) static const xxh_u8 XXH3_kSecret[XXH_SECRET_DEFAULT_SIZE] = { 00461 0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c, 00462 0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f, 00463 0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21, 00464 0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c, 00465 0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3, 00466 0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 
0xd8, 00467 0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d, 00468 0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64, 00469 00470 0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb, 00471 0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e, 00472 0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce, 00473 0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e, 00474 }; 00475 00476 #ifdef XXH_OLD_NAMES 00477 # define kSecret XXH3_kSecret 00478 #endif 00479 00480 /* 00481 * Calculates a 32-bit to 64-bit long multiply. 00482 * 00483 * Wraps __emulu on MSVC x86 because it tends to call __allmul when it doesn't 00484 * need to (but it shouldn't need to anyways, it is about 7 instructions to do 00485 * a 64x64 multiply...). Since we know that this will _always_ emit MULL, we 00486 * use that instead of the normal method. 00487 * 00488 * If you are compiling for platforms like Thumb-1 and don't have a better option, 00489 * you may also want to write your own long multiply routine here. 00490 * 00491 * XXH_FORCE_INLINE xxh_u64 XXH_mult32to64(xxh_u64 x, xxh_u64 y) 00492 * { 00493 * return (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF); 00494 * } 00495 */ 00496 #if defined(_MSC_VER) && defined(_M_IX86) 00497 # include <intrin.h> 00498 # define XXH_mult32to64(x, y) __emulu((unsigned)(x), (unsigned)(y)) 00499 #else 00500 /* 00501 * Downcast + upcast is usually better than masking on older compilers like 00502 * GCC 4.2 (especially 32-bit ones), all without affecting newer compilers. 00503 * 00504 * The other method, (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF), will AND both operands 00505 * and perform a full 64x64 multiply -- entirely redundant on 32-bit. 00506 */ 00507 # define XXH_mult32to64(x, y) ((xxh_u64)(xxh_u32)(x) * (xxh_u64)(xxh_u32)(y)) 00508 #endif 00509 00510 /* 00511 * Calculates a 64->128-bit long multiply. 00512 * 00513 * Uses __uint128_t and _umul128 if available, otherwise uses a scalar version. 00514 */ 00515 static XXH128_hash_t 00516 XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs) 00517 { 00518 /* 00519 * GCC/Clang __uint128_t method. 00520 * 00521 * On most 64-bit targets, GCC and Clang define a __uint128_t type. 00522 * This is usually the best way as it usually uses a native long 64-bit 00523 * multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64. 00524 * 00525 * Usually. 00526 * 00527 * Despite being a 32-bit platform, Clang (and emscripten) define this type 00528 * despite not having the arithmetic for it. This results in a laggy 00529 * compiler builtin call which calculates a full 128-bit multiply. 00530 * In that case it is best to use the portable one. 00531 * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677 00532 */ 00533 #if defined(__GNUC__) && !defined(__wasm__) \ 00534 && defined(__SIZEOF_INT128__) \ 00535 || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) 00536 00537 __uint128_t const product = (__uint128_t)lhs * (__uint128_t)rhs; 00538 XXH128_hash_t r128; 00539 r128.low64 = (xxh_u64)(product); 00540 r128.high64 = (xxh_u64)(product >> 64); 00541 return r128; 00542 00543 /* 00544 * MSVC for x64's _umul128 method. 00545 * 00546 * xxh_u64 _umul128(xxh_u64 Multiplier, xxh_u64 Multiplicand, xxh_u64 *HighProduct); 00547 * 00548 * This compiles to single operand MUL on x64. 
00549 */ 00550 #elif defined(_M_X64) || defined(_M_IA64) 00551 00552 #ifndef _MSC_VER 00553 # pragma intrinsic(_umul128) 00554 #endif 00555 xxh_u64 product_high; 00556 xxh_u64 const product_low = _umul128(lhs, rhs, &product_high); 00557 XXH128_hash_t r128; 00558 r128.low64 = product_low; 00559 r128.high64 = product_high; 00560 return r128; 00561 00562 #else 00563 /* 00564 * Portable scalar method. Optimized for 32-bit and 64-bit ALUs. 00565 * 00566 * This is a fast and simple grade school multiply, which is shown below 00567 * with base 10 arithmetic instead of base 0x100000000. 00568 * 00569 * 9 3 // D2 lhs = 93 00570 * x 7 5 // D2 rhs = 75 00571 * ---------- 00572 * 1 5 // D2 lo_lo = (93 % 10) * (75 % 10) = 15 00573 * 4 5 | // D2 hi_lo = (93 / 10) * (75 % 10) = 45 00574 * 2 1 | // D2 lo_hi = (93 % 10) * (75 / 10) = 21 00575 * + 6 3 | | // D2 hi_hi = (93 / 10) * (75 / 10) = 63 00576 * --------- 00577 * 2 7 | // D2 cross = (15 / 10) + (45 % 10) + 21 = 27 00578 * + 6 7 | | // D2 upper = (27 / 10) + (45 / 10) + 63 = 67 00579 * --------- 00580 * 6 9 7 5 // D4 res = (27 * 10) + (15 % 10) + (67 * 100) = 6975 00581 * 00582 * The reasons for adding the products like this are: 00583 * 1. It avoids manual carry tracking. Just like how 00584 * (9 * 9) + 9 + 9 = 99, the same applies with this for UINT64_MAX. 00585 * This avoids a lot of complexity. 00586 * 00587 * 2. It hints for, and on Clang, compiles to, the powerful UMAAL 00588 * instruction available in ARM's Digital Signal Processing extension 00589 * in 32-bit ARMv6 and later, which is shown below: 00590 * 00591 * void UMAAL(xxh_u32 *RdLo, xxh_u32 *RdHi, xxh_u32 Rn, xxh_u32 Rm) 00592 * { 00593 * xxh_u64 product = (xxh_u64)*RdLo * (xxh_u64)*RdHi + Rn + Rm; 00594 * *RdLo = (xxh_u32)(product & 0xFFFFFFFF); 00595 * *RdHi = (xxh_u32)(product >> 32); 00596 * } 00597 * 00598 * This instruction was designed for efficient long multiplication, and 00599 * allows this to be calculated in only 4 instructions at speeds 00600 * comparable to some 64-bit ALUs. 00601 * 00602 * 3. It isn't terrible on other platforms. Usually this will be a couple 00603 * of 32-bit ADD/ADCs. 00604 */ 00605 00606 /* First calculate all of the cross products. */ 00607 xxh_u64 const lo_lo = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs & 0xFFFFFFFF); 00608 xxh_u64 const hi_lo = XXH_mult32to64(lhs >> 32, rhs & 0xFFFFFFFF); 00609 xxh_u64 const lo_hi = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs >> 32); 00610 xxh_u64 const hi_hi = XXH_mult32to64(lhs >> 32, rhs >> 32); 00611 00612 /* Now add the products together. These will never overflow. */ 00613 xxh_u64 const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi; 00614 xxh_u64 const upper = (hi_lo >> 32) + (cross >> 32) + hi_hi; 00615 xxh_u64 const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF); 00616 00617 XXH128_hash_t r128; 00618 r128.low64 = lower; 00619 r128.high64 = upper; 00620 return r128; 00621 #endif 00622 } 00623 00624 /* 00625 * Does a 64-bit to 128-bit multiply, then XOR folds it. 00626 * 00627 * The reason for the separate function is to prevent passing too many structs 00628 * around by value. This will hopefully inline the multiply, but we don't force it. 00629 */ 00630 static xxh_u64 00631 XXH3_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs) 00632 { 00633 XXH128_hash_t product = XXH_mult64to128(lhs, rhs); 00634 return product.low64 ^ product.high64; 00635 } 00636 00637 /* Seems to produce slightly better code on GCC for some reason. 
*/ 00638 XXH_FORCE_INLINE xxh_u64 XXH_xorshift64(xxh_u64 v64, int shift) 00639 { 00640 XXH_ASSERT(0 <= shift && shift < 64); 00641 return v64 ^ (v64 >> shift); 00642 } 00643 00644 /* 00645 * We don't need to (or want to) mix as much as XXH64. 00646 * 00647 * Short hashes are more evenly distributed, so it isn't necessary. 00648 */ 00649 static XXH64_hash_t XXH3_avalanche(xxh_u64 h64) 00650 { 00651 h64 = XXH_xorshift64(h64, 37); 00652 h64 *= 0x165667919E3779F9ULL; 00653 h64 = XXH_xorshift64(h64, 32); 00654 return h64; 00655 } 00656 00657 00658 /* ========================================== 00659 * Short keys 00660 * ========================================== 00661 * One of the shortcomings of XXH32 and XXH64 was that their performance was 00662 * sub-optimal on short lengths. It used an iterative algorithm which strongly 00663 * favored lengths that were a multiple of 4 or 8. 00664 * 00665 * Instead of iterating over individual inputs, we use a set of single shot 00666 * functions which piece together a range of lengths and operate in constant time. 00667 * 00668 * Additionally, the number of multiplies has been significantly reduced. This 00669 * reduces latency, especially when emulating 64-bit multiplies on 32-bit. 00670 * 00671 * Depending on the platform, this may or may not be faster than XXH32, but it 00672 * is almost guaranteed to be faster than XXH64. 00673 */ 00674 00675 /* 00676 * At very short lengths, there isn't enough input to fully hide secrets, or use 00677 * the entire secret. 00678 * 00679 * There is also only a limited amount of mixing we can do before significantly 00680 * impacting performance. 00681 * 00682 * Therefore, we use different sections of the secret and always mix two secret 00683 * samples with an XOR. This should have no effect on performance on the 00684 * seedless or withSeed variants because everything _should_ be constant folded 00685 * by modern compilers. 00686 * 00687 * The XOR mixing hides individual parts of the secret and increases entropy. 00688 * 00689 * This adds an extra layer of strength for custom secrets. 
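 *
 * A sketch of the pattern, as used by XXH3_len_1to3_64b below (the other
 * short-input variants reuse the same idea with different secret offsets,
 * seed handling, and finalizers; the names here are illustrative only):
 *
 *   bitflip = (readLE(secret) ^ readLE(secret + 4)) + seed;  // two secret samples
 *   keyed   = packed_input ^ bitflip;                        // hide the input
 *   return XXH3_avalanche(keyed * XXH_PRIME64_1);            // mix and finalize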
00690 */ 00691 XXH_FORCE_INLINE XXH64_hash_t 00692 XXH3_len_1to3_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) 00693 { 00694 XXH_ASSERT(input != NULL); 00695 XXH_ASSERT(1 <= len && len <= 3); 00696 XXH_ASSERT(secret != NULL); 00697 /* 00698 * len = 1: combined = { input[0], 0x01, input[0], input[0] } 00699 * len = 2: combined = { input[1], 0x02, input[0], input[1] } 00700 * len = 3: combined = { input[2], 0x03, input[0], input[1] } 00701 */ 00702 { xxh_u8 const c1 = input[0]; 00703 xxh_u8 const c2 = input[len >> 1]; 00704 xxh_u8 const c3 = input[len - 1]; 00705 xxh_u32 const combined = ((xxh_u32)c1 << 16) | ((xxh_u32)c2 << 24) 00706 | ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8); 00707 xxh_u64 const bitflip = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed; 00708 xxh_u64 const keyed = (xxh_u64)combined ^ bitflip; 00709 xxh_u64 const mixed = keyed * XXH_PRIME64_1; 00710 return XXH3_avalanche(mixed); 00711 } 00712 } 00713 00714 XXH_FORCE_INLINE XXH64_hash_t 00715 XXH3_len_4to8_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) 00716 { 00717 XXH_ASSERT(input != NULL); 00718 XXH_ASSERT(secret != NULL); 00719 XXH_ASSERT(4 <= len && len < 8); 00720 seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32; 00721 { xxh_u32 const input1 = XXH_readLE32(input); 00722 xxh_u32 const input2 = XXH_readLE32(input + len - 4); 00723 xxh_u64 const bitflip = (XXH_readLE64(secret+8) ^ XXH_readLE64(secret+16)) - seed; 00724 xxh_u64 const input64 = input2 + (((xxh_u64)input1) << 32); 00725 xxh_u64 x = input64 ^ bitflip; 00726 /* this mix is inspired by Pelle Evensen's rrmxmx */ 00727 x ^= XXH_rotl64(x, 49) ^ XXH_rotl64(x, 24); 00728 x *= 0x9FB21C651E98DF25ULL; 00729 x ^= (x >> 35) + len ; 00730 x *= 0x9FB21C651E98DF25ULL; 00731 return XXH_xorshift64(x, 28); 00732 } 00733 } 00734 00735 XXH_FORCE_INLINE XXH64_hash_t 00736 XXH3_len_9to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) 00737 { 00738 XXH_ASSERT(input != NULL); 00739 XXH_ASSERT(secret != NULL); 00740 XXH_ASSERT(8 <= len && len <= 16); 00741 { xxh_u64 const bitflip1 = (XXH_readLE64(secret+24) ^ XXH_readLE64(secret+32)) + seed; 00742 xxh_u64 const bitflip2 = (XXH_readLE64(secret+40) ^ XXH_readLE64(secret+48)) - seed; 00743 xxh_u64 const input_lo = XXH_readLE64(input) ^ bitflip1; 00744 xxh_u64 const input_hi = XXH_readLE64(input + len - 8) ^ bitflip2; 00745 xxh_u64 const acc = len 00746 + XXH_swap64(input_lo) + input_hi 00747 + XXH3_mul128_fold64(input_lo, input_hi); 00748 return XXH3_avalanche(acc); 00749 } 00750 } 00751 00752 XXH_FORCE_INLINE XXH64_hash_t 00753 XXH3_len_0to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) 00754 { 00755 XXH_ASSERT(len <= 16); 00756 { if (XXH_likely(len > 8)) return XXH3_len_9to16_64b(input, len, secret, seed); 00757 if (XXH_likely(len >= 4)) return XXH3_len_4to8_64b(input, len, secret, seed); 00758 if (len) return XXH3_len_1to3_64b(input, len, secret, seed); 00759 return XXH3_avalanche((XXH_PRIME64_1 + seed) ^ (XXH_readLE64(secret+56) ^ XXH_readLE64(secret+64))); 00760 } 00761 } 00762 00763 /* 00764 * DISCLAIMER: There are known *seed-dependent* multicollisions here due to 00765 * multiplication by zero, affecting hashes of lengths 17 to 240. 00766 * 00767 * However, they are very unlikely. 
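 *
 * Concretely (a sketch, using XXH3_mix16B below): the folded 128-bit product
 * collapses to zero whenever one of its operands does, e.g. when
 *
 *   input_lo ^ (XXH_readLE64(secret) + seed64) == 0
 *
 * at which point the companion 8 bytes of that XXH3_mix16B call no longer
 * affect its contribution. This is the "8 consecutive bytes cancelling out"
 * case discussed below.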
00768 * 00769 * Keep this in mind when using the unseeded XXH3_64bits() variant: As with all 00770 * unseeded non-cryptographic hashes, it does not attempt to defend itself 00771 * against specially crafted inputs, only random inputs. 00772 * 00773 * Compared to classic UMAC where a 1 in 2^31 chance of 4 consecutive bytes 00774 * cancelling out the secret is taken an arbitrary number of times (addressed 00775 * in XXH3_accumulate_512), this collision is very unlikely with random inputs 00776 * and/or proper seeding: 00777 * 00778 * This only has a 1 in 2^63 chance of 8 consecutive bytes cancelling out, in a 00779 * function that is only called up to 16 times per hash with up to 240 bytes of 00780 * input. 00781 * 00782 * This is not too bad for a non-cryptographic hash function, especially with 00783 * only 64 bit outputs. 00784 * 00785 * The 128-bit variant (which trades some speed for strength) is NOT affected 00786 * by this, although it is always a good idea to use a proper seed if you care 00787 * about strength. 00788 */ 00789 XXH_FORCE_INLINE xxh_u64 XXH3_mix16B(const xxh_u8* XXH_RESTRICT input, 00790 const xxh_u8* XXH_RESTRICT secret, xxh_u64 seed64) 00791 { 00792 #if defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ 00793 && defined(__i386__) && defined(__SSE2__) /* x86 + SSE2 */ \ 00794 && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable like XXH32 hack */ 00795 /* 00796 * UGLY HACK: 00797 * GCC for x86 tends to autovectorize the 128-bit multiply, resulting in 00798 * slower code. 00799 * 00800 * By forcing seed64 into a register, we disrupt the cost model and 00801 * cause it to scalarize. See `XXH32_round()` 00802 * 00803 * FIXME: Clang's output is still _much_ faster -- On an AMD Ryzen 3600, 00804 * XXH3_64bits @ len=240 runs at 4.6 GB/s with Clang 9, but 3.3 GB/s on 00805 * GCC 9.2, despite both emitting scalar code. 00806 * 00807 * GCC generates much better scalar code than Clang for the rest of XXH3, 00808 * which is why finding a more optimal codepath is an interest. 00809 */ 00810 __asm__ ("" : "+r" (seed64)); 00811 #endif 00812 { xxh_u64 const input_lo = XXH_readLE64(input); 00813 xxh_u64 const input_hi = XXH_readLE64(input+8); 00814 return XXH3_mul128_fold64( 00815 input_lo ^ (XXH_readLE64(secret) + seed64), 00816 input_hi ^ (XXH_readLE64(secret+8) - seed64) 00817 ); 00818 } 00819 } 00820 00821 /* For mid range keys, XXH3 uses a Mum-hash variant. 
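 *
 * A sketch of how XXH3_len_17to128_64b below covers its input: 16-byte chunks
 * are taken from both ends and each is mixed with a fresh 16 bytes of secret.
 * For 96 < len <= 128 the full set of mixes is (shorter inputs drop the inner
 * pairs):
 *
 *   acc  = len * XXH_PRIME64_1;
 *   acc += XXH3_mix16B(input,            secret,       seed);  // head
 *   acc += XXH3_mix16B(input + 16,       secret +  32, seed);
 *   acc += XXH3_mix16B(input + 32,       secret +  64, seed);
 *   acc += XXH3_mix16B(input + 48,       secret +  96, seed);
 *   acc += XXH3_mix16B(input + len - 64, secret + 112, seed);
 *   acc += XXH3_mix16B(input + len - 48, secret +  80, seed);
 *   acc += XXH3_mix16B(input + len - 32, secret +  48, seed);
 *   acc += XXH3_mix16B(input + len - 16, secret +  16, seed);  // tail
 *   return XXH3_avalanche(acc);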
*/ 00822 XXH_FORCE_INLINE XXH64_hash_t 00823 XXH3_len_17to128_64b(const xxh_u8* XXH_RESTRICT input, size_t len, 00824 const xxh_u8* XXH_RESTRICT secret, size_t secretSize, 00825 XXH64_hash_t seed) 00826 { 00827 XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; 00828 XXH_ASSERT(16 < len && len <= 128); 00829 00830 { xxh_u64 acc = len * XXH_PRIME64_1; 00831 if (len > 32) { 00832 if (len > 64) { 00833 if (len > 96) { 00834 acc += XXH3_mix16B(input+48, secret+96, seed); 00835 acc += XXH3_mix16B(input+len-64, secret+112, seed); 00836 } 00837 acc += XXH3_mix16B(input+32, secret+64, seed); 00838 acc += XXH3_mix16B(input+len-48, secret+80, seed); 00839 } 00840 acc += XXH3_mix16B(input+16, secret+32, seed); 00841 acc += XXH3_mix16B(input+len-32, secret+48, seed); 00842 } 00843 acc += XXH3_mix16B(input+0, secret+0, seed); 00844 acc += XXH3_mix16B(input+len-16, secret+16, seed); 00845 00846 return XXH3_avalanche(acc); 00847 } 00848 } 00849 00850 #define XXH3_MIDSIZE_MAX 240 00851 00852 XXH_NO_INLINE XXH64_hash_t 00853 XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len, 00854 const xxh_u8* XXH_RESTRICT secret, size_t secretSize, 00855 XXH64_hash_t seed) 00856 { 00857 XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; 00858 XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX); 00859 00860 #define XXH3_MIDSIZE_STARTOFFSET 3 00861 #define XXH3_MIDSIZE_LASTOFFSET 17 00862 00863 { xxh_u64 acc = len * XXH_PRIME64_1; 00864 int const nbRounds = (int)len / 16; 00865 int i; 00866 for (i=0; i<8; i++) { 00867 acc += XXH3_mix16B(input+(16*i), secret+(16*i), seed); 00868 } 00869 acc = XXH3_avalanche(acc); 00870 XXH_ASSERT(nbRounds >= 8); 00871 #if defined(__clang__) /* Clang */ \ 00872 && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \ 00873 && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */ 00874 /* 00875 * UGLY HACK: 00876 * Clang for ARMv7-A tries to vectorize this loop, similar to GCC x86. 00877 * In everywhere else, it uses scalar code. 00878 * 00879 * For 64->128-bit multiplies, even if the NEON was 100% optimal, it 00880 * would still be slower than UMAAL (see XXH_mult64to128). 00881 * 00882 * Unfortunately, Clang doesn't handle the long multiplies properly and 00883 * converts them to the nonexistent "vmulq_u64" intrinsic, which is then 00884 * scalarized into an ugly mess of VMOV.32 instructions. 00885 * 00886 * This mess is difficult to avoid without turning autovectorization 00887 * off completely, but they are usually relatively minor and/or not 00888 * worth it to fix. 00889 * 00890 * This loop is the easiest to fix, as unlike XXH32, this pragma 00891 * _actually works_ because it is a loop vectorization instead of an 00892 * SLP vectorization. 
00893 */ 00894 #pragma clang loop vectorize(disable) 00895 #endif 00896 for (i=8 ; i < nbRounds; i++) { 00897 acc += XXH3_mix16B(input+(16*i), secret+(16*(i-8)) + XXH3_MIDSIZE_STARTOFFSET, seed); 00898 } 00899 /* last bytes */ 00900 acc += XXH3_mix16B(input + len - 16, secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET, seed); 00901 return XXH3_avalanche(acc); 00902 } 00903 } 00904 00905 00906 /* === Long Keys === */ 00907 00908 #define XXH_STRIPE_LEN 64 00909 #define XXH_SECRET_CONSUME_RATE 8 /* nb of secret bytes consumed at each accumulation */ 00910 #define XXH_ACC_NB (XXH_STRIPE_LEN / sizeof(xxh_u64)) 00911 00912 #ifdef XXH_OLD_NAMES 00913 # define STRIPE_LEN XXH_STRIPE_LEN 00914 # define ACC_NB XXH_ACC_NB 00915 #endif 00916 00917 typedef enum { XXH3_acc_64bits, XXH3_acc_128bits } XXH3_accWidth_e; 00918 00919 /* 00920 * XXH3_accumulate_512 is the tightest loop for long inputs, and it is the most optimized. 00921 * 00922 * It is a hardened version of UMAC, based off of FARSH's implementation. 00923 * 00924 * This was chosen because it adapts quite well to 32-bit, 64-bit, and SIMD 00925 * implementations, and it is ridiculously fast. 00926 * 00927 * We harden it by mixing the original input to the accumulators as well as the product. 00928 * 00929 * This means that in the (relatively likely) case of a multiply by zero, the 00930 * original input is preserved. 00931 * 00932 * On 128-bit inputs, we swap 64-bit pairs when we add the input to improve 00933 * cross-pollination, as otherwise the upper and lower halves would be 00934 * essentially independent. 00935 * 00936 * This doesn't matter on 64-bit hashes since they all get merged together in 00937 * the end, so we skip the extra step. 00938 * 00939 * Both XXH3_64bits and XXH3_128bits use this subroutine. 00940 */ 00941 XXH_FORCE_INLINE void 00942 XXH3_accumulate_512( void* XXH_RESTRICT acc, 00943 const void* XXH_RESTRICT input, 00944 const void* XXH_RESTRICT secret, 00945 XXH3_accWidth_e accWidth) 00946 { 00947 #if (XXH_VECTOR == XXH_AVX512) 00948 00949 XXH_ASSERT((((size_t)acc) & 63) == 0); 00950 XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i)); 00951 { XXH_ALIGN(64) __m512i* const xacc = (__m512i *) acc; 00952 00953 /* data_vec = input[0]; */ 00954 __m512i const data_vec = _mm512_loadu_si512 (input); 00955 /* key_vec = secret[0]; */ 00956 __m512i const key_vec = _mm512_loadu_si512 (secret); 00957 /* data_key = data_vec ^ key_vec; */ 00958 __m512i const data_key = _mm512_xor_si512 (data_vec, key_vec); 00959 /* data_key_lo = data_key >> 32; */ 00960 __m512i const data_key_lo = _mm512_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); 00961 /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ 00962 __m512i const product = _mm512_mul_epu32 (data_key, data_key_lo); 00963 if (accWidth == XXH3_acc_128bits) { 00964 /* xacc[0] += swap(data_vec); */ 00965 __m512i const data_swap = _mm512_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2)); 00966 __m512i const sum = _mm512_add_epi64(*xacc, data_swap); 00967 /* xacc[0] += product; */ 00968 *xacc = _mm512_add_epi64(product, sum); 00969 } else { /* XXH3_acc_64bits */ 00970 /* xacc[0] += data_vec; */ 00971 __m512i const sum = _mm512_add_epi64(*xacc, data_vec); 00972 /* xacc[0] += product; */ 00973 *xacc = _mm512_add_epi64(product, sum); 00974 } 00975 } 00976 00977 #elif (XXH_VECTOR == XXH_AVX2) 00978 00979 XXH_ASSERT((((size_t)acc) & 31) == 0); 00980 { XXH_ALIGN(32) __m256i* const xacc = (__m256i *) acc; 00981 /* Unaligned. 
This is mainly for pointer arithmetic, and because 00982 * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */ 00983 const __m256i* const xinput = (const __m256i *) input; 00984 /* Unaligned. This is mainly for pointer arithmetic, and because 00985 * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */ 00986 const __m256i* const xsecret = (const __m256i *) secret; 00987 00988 size_t i; 00989 for (i=0; i < XXH_STRIPE_LEN/sizeof(__m256i); i++) { 00990 /* data_vec = xinput[i]; */ 00991 __m256i const data_vec = _mm256_loadu_si256 (xinput+i); 00992 /* key_vec = xsecret[i]; */ 00993 __m256i const key_vec = _mm256_loadu_si256 (xsecret+i); 00994 /* data_key = data_vec ^ key_vec; */ 00995 __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec); 00996 /* data_key_lo = data_key >> 32; */ 00997 __m256i const data_key_lo = _mm256_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); 00998 /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ 00999 __m256i const product = _mm256_mul_epu32 (data_key, data_key_lo); 01000 if (accWidth == XXH3_acc_128bits) { 01001 /* xacc[i] += swap(data_vec); */ 01002 __m256i const data_swap = _mm256_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2)); 01003 __m256i const sum = _mm256_add_epi64(xacc[i], data_swap); 01004 /* xacc[i] += product; */ 01005 xacc[i] = _mm256_add_epi64(product, sum); 01006 } else { /* XXH3_acc_64bits */ 01007 /* xacc[i] += data_vec; */ 01008 __m256i const sum = _mm256_add_epi64(xacc[i], data_vec); 01009 /* xacc[i] += product; */ 01010 xacc[i] = _mm256_add_epi64(product, sum); 01011 } 01012 } } 01013 01014 #elif (XXH_VECTOR == XXH_SSE2) 01015 01016 /* SSE2 is just a half-scale version of the AVX2 version. */ 01017 XXH_ASSERT((((size_t)acc) & 15) == 0); 01018 { XXH_ALIGN(16) __m128i* const xacc = (__m128i *) acc; 01019 /* Unaligned. This is mainly for pointer arithmetic, and because 01020 * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */ 01021 const __m128i* const xinput = (const __m128i *) input; 01022 /* Unaligned. This is mainly for pointer arithmetic, and because 01023 * _mm_loadu_si128 requires a const __m128i * pointer for some reason. 
*/ 01024 const __m128i* const xsecret = (const __m128i *) secret; 01025 01026 size_t i; 01027 for (i=0; i < XXH_STRIPE_LEN/sizeof(__m128i); i++) { 01028 /* data_vec = xinput[i]; */ 01029 __m128i const data_vec = _mm_loadu_si128 (xinput+i); 01030 /* key_vec = xsecret[i]; */ 01031 __m128i const key_vec = _mm_loadu_si128 (xsecret+i); 01032 /* data_key = data_vec ^ key_vec; */ 01033 __m128i const data_key = _mm_xor_si128 (data_vec, key_vec); 01034 /* data_key_lo = data_key >> 32; */ 01035 __m128i const data_key_lo = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); 01036 /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ 01037 __m128i const product = _mm_mul_epu32 (data_key, data_key_lo); 01038 if (accWidth == XXH3_acc_128bits) { 01039 /* xacc[i] += swap(data_vec); */ 01040 __m128i const data_swap = _mm_shuffle_epi32(data_vec, _MM_SHUFFLE(1,0,3,2)); 01041 __m128i const sum = _mm_add_epi64(xacc[i], data_swap); 01042 /* xacc[i] += product; */ 01043 xacc[i] = _mm_add_epi64(product, sum); 01044 } else { /* XXH3_acc_64bits */ 01045 /* xacc[i] += data_vec; */ 01046 __m128i const sum = _mm_add_epi64(xacc[i], data_vec); 01047 /* xacc[i] += product; */ 01048 xacc[i] = _mm_add_epi64(product, sum); 01049 } 01050 } } 01051 01052 #elif (XXH_VECTOR == XXH_NEON) 01053 01054 XXH_ASSERT((((size_t)acc) & 15) == 0); 01055 { 01056 XXH_ALIGN(16) uint64x2_t* const xacc = (uint64x2_t *) acc; 01057 /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. */ 01058 uint8_t const* const xinput = (const uint8_t *) input; 01059 uint8_t const* const xsecret = (const uint8_t *) secret; 01060 01061 size_t i; 01062 for (i=0; i < XXH_STRIPE_LEN / sizeof(uint64x2_t); i++) { 01063 /* data_vec = xinput[i]; */ 01064 uint8x16_t data_vec = vld1q_u8(xinput + (i * 16)); 01065 /* key_vec = xsecret[i]; */ 01066 uint8x16_t key_vec = vld1q_u8(xsecret + (i * 16)); 01067 uint64x2_t data_key; 01068 uint32x2_t data_key_lo, data_key_hi; 01069 if (accWidth == XXH3_acc_64bits) { 01070 /* xacc[i] += data_vec; */ 01071 xacc[i] = vaddq_u64 (xacc[i], vreinterpretq_u64_u8(data_vec)); 01072 } else { /* XXH3_acc_128bits */ 01073 /* xacc[i] += swap(data_vec); */ 01074 uint64x2_t const data64 = vreinterpretq_u64_u8(data_vec); 01075 uint64x2_t const swapped = vextq_u64(data64, data64, 1); 01076 xacc[i] = vaddq_u64 (xacc[i], swapped); 01077 } 01078 /* data_key = data_vec ^ key_vec; */ 01079 data_key = vreinterpretq_u64_u8(veorq_u8(data_vec, key_vec)); 01080 /* data_key_lo = (uint32x2_t) (data_key & 0xFFFFFFFF); 01081 * data_key_hi = (uint32x2_t) (data_key >> 32); 01082 * data_key = UNDEFINED; */ 01083 XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi); 01084 /* xacc[i] += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */ 01085 xacc[i] = vmlal_u32 (xacc[i], data_key_lo, data_key_hi); 01086 01087 } 01088 } 01089 01090 #elif (XXH_VECTOR == XXH_VSX) 01091 xxh_u64x2* const xacc = (xxh_u64x2*) acc; /* presumed aligned */ 01092 xxh_u64x2 const* const xinput = (xxh_u64x2 const*) input; /* no alignment restriction */ 01093 xxh_u64x2 const* const xsecret = (xxh_u64x2 const*) secret; /* no alignment restriction */ 01094 xxh_u64x2 const v32 = { 32, 32 }; 01095 size_t i; 01096 for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) { 01097 /* data_vec = xinput[i]; */ 01098 xxh_u64x2 const data_vec = XXH_vec_loadu(xinput + i); 01099 /* key_vec = xsecret[i]; */ 01100 xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + i); 01101 xxh_u64x2 const data_key = data_vec ^ key_vec; 01102 /* shuffled = (data_key << 32) | 
(data_key >> 32); */ 01103 xxh_u32x4 const shuffled = (xxh_u32x4)vec_rl(data_key, v32); 01104 /* product = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)shuffled & 0xFFFFFFFF); */ 01105 xxh_u64x2 const product = XXH_vec_mulo((xxh_u32x4)data_key, shuffled); 01106 xacc[i] += product; 01107 01108 if (accWidth == XXH3_acc_64bits) { 01109 xacc[i] += data_vec; 01110 } else { /* XXH3_acc_128bits */ 01111 /* swap high and low halves */ 01112 #ifdef __s390x__ 01113 xxh_u64x2 const data_swapped = vec_permi(data_vec, data_vec, 2); 01114 #else 01115 xxh_u64x2 const data_swapped = vec_xxpermdi(data_vec, data_vec, 2); 01116 #endif 01117 xacc[i] += data_swapped; 01118 } 01119 } 01120 01121 #else /* scalar variant of Accumulator - universal */ 01122 01123 XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned */ 01124 const xxh_u8* const xinput = (const xxh_u8*) input; /* no alignment restriction */ 01125 const xxh_u8* const xsecret = (const xxh_u8*) secret; /* no alignment restriction */ 01126 size_t i; 01127 XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN-1)) == 0); 01128 for (i=0; i < XXH_ACC_NB; i++) { 01129 xxh_u64 const data_val = XXH_readLE64(xinput + 8*i); 01130 xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + i*8); 01131 01132 if (accWidth == XXH3_acc_64bits) { 01133 xacc[i] += data_val; 01134 } else { 01135 xacc[i ^ 1] += data_val; /* swap adjacent lanes */ 01136 } 01137 xacc[i] += XXH_mult32to64(data_key & 0xFFFFFFFF, data_key >> 32); 01138 } 01139 #endif 01140 } 01141 01142 /* 01143 * XXH3_scrambleAcc: Scrambles the accumulators to improve mixing. 01144 * 01145 * Multiplication isn't perfect, as explained by Google in HighwayHash: 01146 * 01147 * // Multiplication mixes/scrambles bytes 0-7 of the 64-bit result to 01148 * // varying degrees. In descending order of goodness, bytes 01149 * // 3 4 2 5 1 6 0 7 have quality 228 224 164 160 100 96 36 32. 01150 * // As expected, the upper and lower bytes are much worse. 01151 * 01152 * Source: https://github.com/google/highwayhash/blob/0aaf66b/highwayhash/hh_avx2.h#L291 01153 * 01154 * Since our algorithm uses a pseudorandom secret to add some variance into the 01155 * mix, we don't need to (or want to) mix as often or as much as HighwayHash does. 01156 * 01157 * This isn't as tight as XXH3_accumulate, but still written in SIMD to avoid 01158 * extraction. 01159 * 01160 * Both XXH3_64bits and XXH3_128bits use this subroutine. 
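 *
 * Per 64-bit lane, every vectorized path below computes the same thing as the
 * scalar fallback at the end of the function (a sketch, using that fallback's
 * names):
 *
 *   acc64 ^= acc64 >> 47;    // xorshift: fold the high bits downward
 *   acc64 ^= key64;          // mix in a fresh 64 bits of secret
 *   acc64 *= XXH_PRIME32_1;  // multiply by a 32-bit odd prime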
01161 */ 01162 XXH_FORCE_INLINE void 01163 XXH3_scrambleAcc(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) 01164 { 01165 #if (XXH_VECTOR == XXH_AVX512) 01166 01167 XXH_ASSERT((((size_t)acc) & 63) == 0); 01168 XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i)); 01169 { XXH_ALIGN(64) __m512i* const xacc = (__m512i*) acc; 01170 const __m512i prime32 = _mm512_set1_epi32((int)XXH_PRIME32_1); 01171 01172 /* xacc[0] ^= (xacc[0] >> 47) */ 01173 __m512i const acc_vec = *xacc; 01174 __m512i const shifted = _mm512_srli_epi64 (acc_vec, 47); 01175 __m512i const data_vec = _mm512_xor_si512 (acc_vec, shifted); 01176 /* xacc[0] ^= secret; */ 01177 __m512i const key_vec = _mm512_loadu_si512 (secret); 01178 __m512i const data_key = _mm512_xor_si512 (data_vec, key_vec); 01179 01180 /* xacc[0] *= XXH_PRIME32_1; */ 01181 __m512i const data_key_hi = _mm512_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); 01182 __m512i const prod_lo = _mm512_mul_epu32 (data_key, prime32); 01183 __m512i const prod_hi = _mm512_mul_epu32 (data_key_hi, prime32); 01184 *xacc = _mm512_add_epi64(prod_lo, _mm512_slli_epi64(prod_hi, 32)); 01185 } 01186 01187 #elif (XXH_VECTOR == XXH_AVX2) 01188 01189 XXH_ASSERT((((size_t)acc) & 31) == 0); 01190 { XXH_ALIGN(32) __m256i* const xacc = (__m256i*) acc; 01191 /* Unaligned. This is mainly for pointer arithmetic, and because 01192 * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */ 01193 const __m256i* const xsecret = (const __m256i *) secret; 01194 const __m256i prime32 = _mm256_set1_epi32((int)XXH_PRIME32_1); 01195 01196 size_t i; 01197 for (i=0; i < XXH_STRIPE_LEN/sizeof(__m256i); i++) { 01198 /* xacc[i] ^= (xacc[i] >> 47) */ 01199 __m256i const acc_vec = xacc[i]; 01200 __m256i const shifted = _mm256_srli_epi64 (acc_vec, 47); 01201 __m256i const data_vec = _mm256_xor_si256 (acc_vec, shifted); 01202 /* xacc[i] ^= xsecret; */ 01203 __m256i const key_vec = _mm256_loadu_si256 (xsecret+i); 01204 __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec); 01205 01206 /* xacc[i] *= XXH_PRIME32_1; */ 01207 __m256i const data_key_hi = _mm256_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); 01208 __m256i const prod_lo = _mm256_mul_epu32 (data_key, prime32); 01209 __m256i const prod_hi = _mm256_mul_epu32 (data_key_hi, prime32); 01210 xacc[i] = _mm256_add_epi64(prod_lo, _mm256_slli_epi64(prod_hi, 32)); 01211 } 01212 } 01213 01214 #elif (XXH_VECTOR == XXH_SSE2) 01215 01216 XXH_ASSERT((((size_t)acc) & 15) == 0); 01217 { XXH_ALIGN(16) __m128i* const xacc = (__m128i*) acc; 01218 /* Unaligned. This is mainly for pointer arithmetic, and because 01219 * _mm_loadu_si128 requires a const __m128i * pointer for some reason. 
*/ 01220 const __m128i* const xsecret = (const __m128i *) secret; 01221 const __m128i prime32 = _mm_set1_epi32((int)XXH_PRIME32_1); 01222 01223 size_t i; 01224 for (i=0; i < XXH_STRIPE_LEN/sizeof(__m128i); i++) { 01225 /* xacc[i] ^= (xacc[i] >> 47) */ 01226 __m128i const acc_vec = xacc[i]; 01227 __m128i const shifted = _mm_srli_epi64 (acc_vec, 47); 01228 __m128i const data_vec = _mm_xor_si128 (acc_vec, shifted); 01229 /* xacc[i] ^= xsecret[i]; */ 01230 __m128i const key_vec = _mm_loadu_si128 (xsecret+i); 01231 __m128i const data_key = _mm_xor_si128 (data_vec, key_vec); 01232 01233 /* xacc[i] *= XXH_PRIME32_1; */ 01234 __m128i const data_key_hi = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); 01235 __m128i const prod_lo = _mm_mul_epu32 (data_key, prime32); 01236 __m128i const prod_hi = _mm_mul_epu32 (data_key_hi, prime32); 01237 xacc[i] = _mm_add_epi64(prod_lo, _mm_slli_epi64(prod_hi, 32)); 01238 } 01239 } 01240 01241 #elif (XXH_VECTOR == XXH_NEON) 01242 01243 XXH_ASSERT((((size_t)acc) & 15) == 0); 01244 01245 { uint64x2_t* xacc = (uint64x2_t*) acc; 01246 uint8_t const* xsecret = (uint8_t const*) secret; 01247 uint32x2_t prime = vdup_n_u32 (XXH_PRIME32_1); 01248 01249 size_t i; 01250 for (i=0; i < XXH_STRIPE_LEN/sizeof(uint64x2_t); i++) { 01251 /* xacc[i] ^= (xacc[i] >> 47); */ 01252 uint64x2_t acc_vec = xacc[i]; 01253 uint64x2_t shifted = vshrq_n_u64 (acc_vec, 47); 01254 uint64x2_t data_vec = veorq_u64 (acc_vec, shifted); 01255 01256 /* xacc[i] ^= xsecret[i]; */ 01257 uint8x16_t key_vec = vld1q_u8(xsecret + (i * 16)); 01258 uint64x2_t data_key = veorq_u64(data_vec, vreinterpretq_u64_u8(key_vec)); 01259 01260 /* xacc[i] *= XXH_PRIME32_1 */ 01261 uint32x2_t data_key_lo, data_key_hi; 01262 /* data_key_lo = (uint32x2_t) (xacc[i] & 0xFFFFFFFF); 01263 * data_key_hi = (uint32x2_t) (xacc[i] >> 32); 01264 * xacc[i] = UNDEFINED; */ 01265 XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi); 01266 { /* 01267 * prod_hi = (data_key >> 32) * XXH_PRIME32_1; 01268 * 01269 * Avoid vmul_u32 + vshll_n_u32 since Clang 6 and 7 will 01270 * incorrectly "optimize" this: 01271 * tmp = vmul_u32(vmovn_u64(a), vmovn_u64(b)); 01272 * shifted = vshll_n_u32(tmp, 32); 01273 * to this: 01274 * tmp = "vmulq_u64"(a, b); // no such thing! 01275 * shifted = vshlq_n_u64(tmp, 32); 01276 * 01277 * However, unlike SSE, Clang lacks a 64-bit multiply routine 01278 * for NEON, and it scalarizes two 64-bit multiplies instead. 01279 * 01280 * vmull_u32 has the same timing as vmul_u32, and it avoids 01281 * this bug completely. 
01282 * See https://bugs.llvm.org/show_bug.cgi?id=39967 01283 */ 01284 uint64x2_t prod_hi = vmull_u32 (data_key_hi, prime); 01285 /* xacc[i] = prod_hi << 32; */ 01286 xacc[i] = vshlq_n_u64(prod_hi, 32); 01287 /* xacc[i] += (prod_hi & 0xFFFFFFFF) * XXH_PRIME32_1; */ 01288 xacc[i] = vmlal_u32(xacc[i], data_key_lo, prime); 01289 } 01290 } } 01291 01292 #elif (XXH_VECTOR == XXH_VSX) 01293 01294 XXH_ASSERT((((size_t)acc) & 15) == 0); 01295 01296 { xxh_u64x2* const xacc = (xxh_u64x2*) acc; 01297 const xxh_u64x2* const xsecret = (const xxh_u64x2*) secret; 01298 /* constants */ 01299 xxh_u64x2 const v32 = { 32, 32 }; 01300 xxh_u64x2 const v47 = { 47, 47 }; 01301 xxh_u32x4 const prime = { XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1 }; 01302 size_t i; 01303 for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) { 01304 /* xacc[i] ^= (xacc[i] >> 47); */ 01305 xxh_u64x2 const acc_vec = xacc[i]; 01306 xxh_u64x2 const data_vec = acc_vec ^ (acc_vec >> v47); 01307 01308 /* xacc[i] ^= xsecret[i]; */ 01309 xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + i); 01310 xxh_u64x2 const data_key = data_vec ^ key_vec; 01311 01312 /* xacc[i] *= XXH_PRIME32_1 */ 01313 /* prod_lo = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)prime & 0xFFFFFFFF); */ 01314 xxh_u64x2 const prod_even = XXH_vec_mule((xxh_u32x4)data_key, prime); 01315 /* prod_hi = ((xxh_u64x2)data_key >> 32) * ((xxh_u64x2)prime >> 32); */ 01316 xxh_u64x2 const prod_odd = XXH_vec_mulo((xxh_u32x4)data_key, prime); 01317 xacc[i] = prod_odd + (prod_even << v32); 01318 } } 01319 01320 #else /* scalar variant of Scrambler - universal */ 01321 01322 XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned */ 01323 const xxh_u8* const xsecret = (const xxh_u8*) secret; /* no alignment restriction */ 01324 size_t i; 01325 XXH_ASSERT((((size_t)acc) & (XXH_ACC_ALIGN-1)) == 0); 01326 for (i=0; i < XXH_ACC_NB; i++) { 01327 xxh_u64 const key64 = XXH_readLE64(xsecret + 8*i); 01328 xxh_u64 acc64 = xacc[i]; 01329 acc64 = XXH_xorshift64(acc64, 47); 01330 acc64 ^= key64; 01331 acc64 *= XXH_PRIME32_1; 01332 xacc[i] = acc64; 01333 } 01334 01335 #endif 01336 } 01337 01338 #define XXH_PREFETCH_DIST 384 01339 01340 #ifdef __clang__ // for clang 01341 # define XXH_PREFETCH_DIST_AVX512_64 320 01342 # define XXH_PREFETCH_DIST_AVX512_128 320 01343 #else // for gcc 01344 # define XXH_PREFETCH_DIST_AVX512_64 640 01345 # define XXH_PREFETCH_DIST_AVX512_128 512 01346 #endif 01347 01348 /* 01349 * XXH3_accumulate() 01350 * Loops over XXH3_accumulate_512(). 
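 * Each stripe reads XXH_STRIPE_LEN (64) bytes of input but advances the secret
 * by only XXH_SECRET_CONSUME_RATE (8) bytes, so stripe n touches secret bytes
 * [8*n, 8*n + 64).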
01351 * Assumption: nbStripes will not overflow the secret size 01352 */ 01353 XXH_FORCE_INLINE void 01354 XXH3_accumulate( xxh_u64* XXH_RESTRICT acc, 01355 const xxh_u8* XXH_RESTRICT input, 01356 const xxh_u8* XXH_RESTRICT secret, 01357 size_t nbStripes, 01358 XXH3_accWidth_e accWidth) 01359 { 01360 size_t n; 01361 for (n = 0; n < nbStripes; n++ ) { 01362 const xxh_u8* const in = input + n*XXH_STRIPE_LEN; 01363 #if (XXH_VECTOR == XXH_AVX512) 01364 if (accWidth == XXH3_acc_64bits) XXH_PREFETCH(in + XXH_PREFETCH_DIST_AVX512_64); 01365 else XXH_PREFETCH(in + XXH_PREFETCH_DIST_AVX512_128); 01366 #else 01367 XXH_PREFETCH(in + XXH_PREFETCH_DIST); 01368 #endif 01369 XXH3_accumulate_512(acc, 01370 in, 01371 secret + n*XXH_SECRET_CONSUME_RATE, 01372 accWidth); 01373 } 01374 } 01375 01376 XXH_FORCE_INLINE void 01377 XXH3_hashLong_internal_loop( xxh_u64* XXH_RESTRICT acc, 01378 const xxh_u8* XXH_RESTRICT input, size_t len, 01379 const xxh_u8* XXH_RESTRICT secret, size_t secretSize, 01380 XXH3_accWidth_e accWidth) 01381 { 01382 size_t const nb_rounds = (secretSize - XXH_STRIPE_LEN) / XXH_SECRET_CONSUME_RATE; 01383 size_t const block_len = XXH_STRIPE_LEN * nb_rounds; 01384 size_t const nb_blocks = len / block_len; 01385 01386 size_t n; 01387 01388 XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); 01389 01390 for (n = 0; n < nb_blocks; n++) { 01391 XXH3_accumulate(acc, input + n*block_len, secret, nb_rounds, accWidth); 01392 XXH3_scrambleAcc(acc, secret + secretSize - XXH_STRIPE_LEN); 01393 } 01394 01395 /* last partial block */ 01396 XXH_ASSERT(len > XXH_STRIPE_LEN); 01397 { size_t const nbStripes = (len - (block_len * nb_blocks)) / XXH_STRIPE_LEN; 01398 XXH_ASSERT(nbStripes <= (secretSize / XXH_SECRET_CONSUME_RATE)); 01399 XXH3_accumulate(acc, input + nb_blocks*block_len, secret, nbStripes, accWidth); 01400 01401 /* last stripe */ 01402 if (len & (XXH_STRIPE_LEN - 1)) { 01403 const xxh_u8* const p = input + len - XXH_STRIPE_LEN; 01404 /* Do not align on 8, so that the secret is different from the scrambler */ 01405 #define XXH_SECRET_LASTACC_START 7 01406 XXH3_accumulate_512(acc, p, secret + secretSize - XXH_STRIPE_LEN - XXH_SECRET_LASTACC_START, accWidth); 01407 } } 01408 } 01409 01410 XXH_FORCE_INLINE xxh_u64 01411 XXH3_mix2Accs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret) 01412 { 01413 return XXH3_mul128_fold64( 01414 acc[0] ^ XXH_readLE64(secret), 01415 acc[1] ^ XXH_readLE64(secret+8) ); 01416 } 01417 01418 static XXH64_hash_t 01419 XXH3_mergeAccs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret, xxh_u64 start) 01420 { 01421 xxh_u64 result64 = start; 01422 size_t i = 0; 01423 01424 for (i = 0; i < 4; i++) { 01425 result64 += XXH3_mix2Accs(acc+2*i, secret + 16*i); 01426 #if defined(__clang__) /* Clang */ \ 01427 && (defined(__arm__) || defined(__thumb__)) /* ARMv7 */ \ 01428 && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \ 01429 && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */ 01430 /* 01431 * UGLY HACK: 01432 * Prevent autovectorization on Clang ARMv7-a. Exact same problem as 01433 * the one in XXH3_len_129to240_64b. Speeds up shorter keys > 240b. 
01434 * XXH3_64bits, len == 256, Snapdragon 835: 01435 * without hack: 2063.7 MB/s 01436 * with hack: 2560.7 MB/s 01437 */ 01438 __asm__("" : "+r" (result64)); 01439 #endif 01440 } 01441 01442 return XXH3_avalanche(result64); 01443 } 01444 01445 #define XXH3_INIT_ACC { XXH_PRIME32_3, XXH_PRIME64_1, XXH_PRIME64_2, XXH_PRIME64_3, \ 01446 XXH_PRIME64_4, XXH_PRIME32_2, XXH_PRIME64_5, XXH_PRIME32_1 } 01447 01448 XXH_FORCE_INLINE XXH64_hash_t 01449 XXH3_hashLong_64b_internal(const xxh_u8* XXH_RESTRICT input, size_t len, 01450 const xxh_u8* XXH_RESTRICT secret, size_t secretSize) 01451 { 01452 XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC; 01453 01454 XXH3_hashLong_internal_loop(acc, input, len, secret, secretSize, XXH3_acc_64bits); 01455 01456 /* converge into final hash */ 01457 XXH_STATIC_ASSERT(sizeof(acc) == 64); 01458 /* do not align on 8, so that the secret is different from the accumulator */ 01459 #define XXH_SECRET_MERGEACCS_START 11 01460 XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START); 01461 return XXH3_mergeAccs(acc, secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)len * XXH_PRIME64_1); 01462 } 01463 01464 XXH_FORCE_INLINE void XXH_writeLE64(void* dst, xxh_u64 v64) 01465 { 01466 if (!XXH_CPU_LITTLE_ENDIAN) v64 = XXH_swap64(v64); 01467 memcpy(dst, &v64, sizeof(v64)); 01468 } 01469 01470 /* XXH3_initCustomSecret() : 01471 * destination `customSecret` is presumed allocated and same size as `XXH3_kSecret`. 01472 */ 01473 XXH_FORCE_INLINE void XXH3_initCustomSecret(xxh_u8* XXH_RESTRICT customSecret, xxh_u64 seed64) 01474 { 01475 int const nbRounds = XXH_SECRET_DEFAULT_SIZE / 16; 01476 int i; 01477 /* 01478 * We need a separate pointer for the hack below. 01479 * Any decent compiler will optimize this out otherwise. 01480 */ 01481 const xxh_u8 *kSecretPtr = XXH3_kSecret; 01482 01483 XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0); 01484 01485 #if defined(__clang__) && defined(__aarch64__) 01486 /* 01487 * UGLY HACK: 01488 * Clang generates a bunch of MOV/MOVK pairs for aarch64, and they are 01489 * placed sequentially, in order, at the top of the unrolled loop. 01490 * 01491 * While MOVK is great for generating constants (2 cycles for a 64-bit 01492 * constant compared to 4 cycles for LDR), long MOVK chains stall the 01493 * integer pipelines: 01494 * I L S 01495 * MOVK 01496 * MOVK 01497 * MOVK 01498 * MOVK 01499 * ADD 01500 * SUB STR 01501 * STR 01502 * By forcing loads from memory (as the asm line causes Clang to assume 01503 * that XXH3_kSecretPtr has been changed), the pipelines are used more 01504 * efficiently: 01505 * I L S 01506 * LDR 01507 * ADD LDR 01508 * SUB STR 01509 * STR 01510 * XXH3_64bits_withSeed, len == 256, Snapdragon 835 01511 * without hack: 2654.4 MB/s 01512 * with hack: 3202.9 MB/s 01513 */ 01514 __asm__("" : "+r" (kSecretPtr)); 01515 #endif 01516 /* 01517 * Note: in debug mode, this overrides the asm optimization 01518 * and Clang will emit MOVK chains again. 01519 */ 01520 XXH_ASSERT(kSecretPtr == XXH3_kSecret); 01521 01522 for (i=0; i < nbRounds; i++) { 01523 /* 01524 * The asm hack causes Clang to assume that XXH3_kSecretPtr aliases with 01525 * customSecret, and on aarch64, this prevented LDP from merging two 01526 * loads together for free. Putting the loads together before the stores 01527 * properly generates LDP. 
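 *
 * (Added note: concretely, the loop body below issues both loads before
 *  either store,
 *      lo = load; hi = load; store(lo); store(hi);
 *  rather than load/store/load/store, which is what allows the two 64-bit
 *  loads to merge into a single LDP instruction.)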
01528 */ 01529 xxh_u64 lo = XXH_readLE64(kSecretPtr + 16*i) + seed64; 01530 xxh_u64 hi = XXH_readLE64(kSecretPtr + 16*i + 8) - seed64; 01531 XXH_writeLE64(customSecret + 16*i, lo); 01532 XXH_writeLE64(customSecret + 16*i + 8, hi); 01533 } 01534 } 01535 01536 01537 /* 01538 * It's important for performance that XXH3_hashLong is not inlined. Not sure 01539 * why (uop cache maybe?), but the difference is large and easily measurable. 01540 */ 01541 XXH_NO_INLINE XXH64_hash_t 01542 XXH3_hashLong_64b_defaultSecret(const xxh_u8* XXH_RESTRICT input, size_t len) 01543 { 01544 return XXH3_hashLong_64b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret)); 01545 } 01546 01547 /* 01548 * It's important for performance that XXH3_hashLong is not inlined. Not sure 01549 * why (uop cache maybe?), but the difference is large and easily measurable. 01550 */ 01551 XXH_NO_INLINE XXH64_hash_t 01552 XXH3_hashLong_64b_withSecret(const xxh_u8* XXH_RESTRICT input, size_t len, 01553 const xxh_u8* XXH_RESTRICT secret, size_t secretSize) 01554 { 01555 return XXH3_hashLong_64b_internal(input, len, secret, secretSize); 01556 } 01557 01558 /* 01559 * XXH3_hashLong_64b_withSeed(): 01560 * Generate a custom key based on alteration of default XXH3_kSecret with the seed, 01561 * and then use this key for long mode hashing. 01562 * 01563 * This operation is decently fast but nonetheless costs a little bit of time. 01564 * Try to avoid it whenever possible (typically when seed==0). 01565 * 01566 * It's important for performance that XXH3_hashLong is not inlined. Not sure 01567 * why (uop cache maybe?), but the difference is large and easily measurable. 01568 */ 01569 XXH_NO_INLINE XXH64_hash_t 01570 XXH3_hashLong_64b_withSeed(const xxh_u8* input, size_t len, XXH64_hash_t seed) 01571 { 01572 XXH_ALIGN(8) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE]; 01573 if (seed==0) return XXH3_hashLong_64b_defaultSecret(input, len); 01574 XXH3_initCustomSecret(secret, seed); 01575 return XXH3_hashLong_64b_internal(input, len, secret, sizeof(secret)); 01576 } 01577 01578 /* === Public entry point === */ 01579 01580 XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* input, size_t len) 01581 { 01582 if (len <= 16) 01583 return XXH3_len_0to16_64b((const xxh_u8*)input, len, XXH3_kSecret, 0); 01584 if (len <= 128) 01585 return XXH3_len_17to128_64b((const xxh_u8*)input, len, XXH3_kSecret, sizeof(XXH3_kSecret), 0); 01586 if (len <= XXH3_MIDSIZE_MAX) 01587 return XXH3_len_129to240_64b((const xxh_u8*)input, len, XXH3_kSecret, sizeof(XXH3_kSecret), 0); 01588 return XXH3_hashLong_64b_defaultSecret((const xxh_u8*)input, len); 01589 } 01590 01591 XXH_PUBLIC_API XXH64_hash_t 01592 XXH3_64bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize) 01593 { 01594 XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); 01595 /* 01596 * If an action is to be taken if `secret` conditions are not respected, 01597 * it should be done here. 01598 * For now, it's a contract pre-condition. 01599 * Adding a check and a branch here would cost performance at every hash. 
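 *
 * (Added example: a caller that wants the check can perform it on its own
 *  side. This is an illustrative sketch, not part of the library, and the
 *  fallback policy is the caller's choice:
 *
 *      XXH64_hash_t my_hash_checked(const void* in, size_t len,
 *                                   const void* secret, size_t secretSize)
 *      {
 *          if (secret == NULL || secretSize < XXH3_SECRET_SIZE_MIN)
 *              return XXH3_64bits(in, len);   // or signal an error instead
 *          return XXH3_64bits_withSecret(in, len, secret, secretSize);
 *      }
 *  )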
01600 */ 01601 if (len <= 16) 01602 return XXH3_len_0to16_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, 0); 01603 if (len <= 128) 01604 return XXH3_len_17to128_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0); 01605 if (len <= XXH3_MIDSIZE_MAX) 01606 return XXH3_len_129to240_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0); 01607 return XXH3_hashLong_64b_withSecret((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize); 01608 } 01609 01610 XXH_PUBLIC_API XXH64_hash_t 01611 XXH3_64bits_withSeed(const void* input, size_t len, XXH64_hash_t seed) 01612 { 01613 if (len <= 16) 01614 return XXH3_len_0to16_64b((const xxh_u8*)input, len, XXH3_kSecret, seed); 01615 if (len <= 128) 01616 return XXH3_len_17to128_64b((const xxh_u8*)input, len, XXH3_kSecret, sizeof(XXH3_kSecret), seed); 01617 if (len <= XXH3_MIDSIZE_MAX) 01618 return XXH3_len_129to240_64b((const xxh_u8*)input, len, XXH3_kSecret, sizeof(XXH3_kSecret), seed); 01619 return XXH3_hashLong_64b_withSeed((const xxh_u8*)input, len, seed); 01620 } 01621 01622 /* === XXH3 streaming === */ 01623 01624 01625 /* 01626 * Malloc's a pointer that is always aligned to align. 01627 * 01628 * This must be freed with `XXH_alignedFree()`. 01629 * 01630 * malloc typically guarantees 16 byte alignment on 64-bit systems and 8 byte 01631 * alignment on 32-bit. This isn't enough for the 32 byte aligned loads in AVX2 01632 * or on 32-bit, the 16 byte aligned loads in SSE2 and NEON. 01633 * 01634 * This underalignment previously caused a rather obvious crash which went 01635 * completely unnoticed due to XXH3_createState() not actually being tested. 01636 * Credit to RedSpah for noticing this bug. 01637 * 01638 * The alignment is done manually: Functions like posix_memalign or _mm_malloc 01639 * are avoided: To maintain portability, we would have to write a fallback 01640 * like this anyways, and besides, testing for the existence of library 01641 * functions without relying on external build tools is impossible. 01642 * 01643 * The method is simple: Overallocate, manually align, and store the offset 01644 * to the original behind the returned pointer. 01645 * 01646 * Align must be a power of 2 and 8 <= align <= 128. 01647 */ 01648 static void* XXH_alignedMalloc(size_t s, size_t align) 01649 { 01650 XXH_ASSERT(align <= 128 && align >= 8); /* range check */ 01651 XXH_ASSERT((align & (align-1)) == 0); /* power of 2 */ 01652 XXH_ASSERT(s != 0 && s < (s + align)); /* empty/overflow */ 01653 { /* Overallocate to make room for manual realignment and an offset byte */ 01654 xxh_u8* base = (xxh_u8*)XXH_malloc(s + align); 01655 if (base != NULL) { 01656 /* 01657 * Get the offset needed to align this pointer. 01658 * 01659 * Even if the returned pointer is aligned, there will always be 01660 * at least one byte to store the offset to the original pointer. 01661 */ 01662 size_t offset = align - ((size_t)base & (align - 1)); /* base % align */ 01663 /* Add the offset for the now-aligned pointer */ 01664 xxh_u8* ptr = base + offset; 01665 01666 XXH_ASSERT((size_t)ptr % align == 0); 01667 01668 /* Store the offset immediately before the returned pointer. */ 01669 ptr[-1] = (xxh_u8)offset; 01670 return ptr; 01671 } 01672 return NULL; 01673 } 01674 } 01675 /* 01676 * Frees an aligned pointer allocated by XXH_alignedMalloc(). Don't pass 01677 * normal malloc'd pointers, XXH_alignedMalloc has a specific data layout. 
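 *
 * (Added illustration of that layout; `align` is between 8 and 128, as
 *  enforced by XXH_alignedMalloc() above:
 *
 *      base ......................... ptr = base + offset
 *      [ unused (offset-1 bytes) ][ offset byte ][ aligned payload ... ]
 *                                   ^ == ptr[-1]   ^ returned pointer
 *
 *  so the original allocation is recovered as base = ptr - ptr[-1], which is
 *  exactly what the function below does before calling XXH_free().)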
01678 */ 01679 static void XXH_alignedFree(void* p) 01680 { 01681 if (p != NULL) { 01682 xxh_u8* ptr = (xxh_u8*)p; 01683 /* Get the offset byte we added in XXH_malloc. */ 01684 xxh_u8 offset = ptr[-1]; 01685 /* Free the original malloc'd pointer */ 01686 xxh_u8* base = ptr - offset; 01687 XXH_free(base); 01688 } 01689 } 01690 XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void) 01691 { 01692 return (XXH3_state_t*)XXH_alignedMalloc(sizeof(XXH3_state_t), 64); 01693 } 01694 01695 XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr) 01696 { 01697 XXH_alignedFree(statePtr); 01698 return XXH_OK; 01699 } 01700 01701 XXH_PUBLIC_API void 01702 XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state) 01703 { 01704 memcpy(dst_state, src_state, sizeof(*dst_state)); 01705 } 01706 01707 static void 01708 XXH3_64bits_reset_internal(XXH3_state_t* statePtr, 01709 XXH64_hash_t seed, 01710 const xxh_u8* secret, size_t secretSize) 01711 { 01712 XXH_ASSERT(statePtr != NULL); 01713 memset(statePtr, 0, sizeof(*statePtr)); 01714 statePtr->acc[0] = XXH_PRIME32_3; 01715 statePtr->acc[1] = XXH_PRIME64_1; 01716 statePtr->acc[2] = XXH_PRIME64_2; 01717 statePtr->acc[3] = XXH_PRIME64_3; 01718 statePtr->acc[4] = XXH_PRIME64_4; 01719 statePtr->acc[5] = XXH_PRIME32_2; 01720 statePtr->acc[6] = XXH_PRIME64_5; 01721 statePtr->acc[7] = XXH_PRIME32_1; 01722 statePtr->seed = seed; 01723 XXH_ASSERT(secret != NULL); 01724 statePtr->extSecret = secret; 01725 XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); 01726 statePtr->secretLimit = secretSize - XXH_STRIPE_LEN; 01727 statePtr->nbStripesPerBlock = statePtr->secretLimit / XXH_SECRET_CONSUME_RATE; 01728 } 01729 01730 XXH_PUBLIC_API XXH_errorcode 01731 XXH3_64bits_reset(XXH3_state_t* statePtr) 01732 { 01733 if (statePtr == NULL) return XXH_ERROR; 01734 XXH3_64bits_reset_internal(statePtr, 0, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE); 01735 return XXH_OK; 01736 } 01737 01738 XXH_PUBLIC_API XXH_errorcode 01739 XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize) 01740 { 01741 if (statePtr == NULL) return XXH_ERROR; 01742 XXH3_64bits_reset_internal(statePtr, 0, (const xxh_u8*)secret, secretSize); 01743 if (secret == NULL) return XXH_ERROR; 01744 if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR; 01745 return XXH_OK; 01746 } 01747 01748 XXH_PUBLIC_API XXH_errorcode 01749 XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed) 01750 { 01751 if (statePtr == NULL) return XXH_ERROR; 01752 XXH3_64bits_reset_internal(statePtr, seed, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE); 01753 XXH3_initCustomSecret(statePtr->customSecret, seed); 01754 statePtr->extSecret = NULL; 01755 return XXH_OK; 01756 } 01757 01758 XXH_FORCE_INLINE void 01759 XXH3_consumeStripes( xxh_u64* acc, 01760 size_t* nbStripesSoFarPtr, size_t nbStripesPerBlock, 01761 const xxh_u8* input, size_t totalStripes, 01762 const xxh_u8* secret, size_t secretLimit, 01763 XXH3_accWidth_e accWidth) 01764 { 01765 XXH_ASSERT(*nbStripesSoFarPtr < nbStripesPerBlock); 01766 if (nbStripesPerBlock - *nbStripesSoFarPtr <= totalStripes) { 01767 /* need a scrambling operation */ 01768 size_t const nbStripes = nbStripesPerBlock - *nbStripesSoFarPtr; 01769 XXH3_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, nbStripes, accWidth); 01770 XXH3_scrambleAcc(acc, secret + secretLimit); 01771 XXH3_accumulate(acc, input + nbStripes * XXH_STRIPE_LEN, secret, totalStripes - nbStripes, accWidth); 01772 *nbStripesSoFarPtr = (XXH32_hash_t)(totalStripes - 
nbStripes); 01773 } else { 01774 XXH3_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, totalStripes, accWidth); 01775 *nbStripesSoFarPtr += (XXH32_hash_t)totalStripes; 01776 } 01777 } 01778 01779 /* 01780 * Both XXH3_64bits_update and XXH3_128bits_update use this routine. 01781 */ 01782 XXH_FORCE_INLINE XXH_errorcode 01783 XXH3_update(XXH3_state_t* state, const xxh_u8* input, size_t len, XXH3_accWidth_e accWidth) 01784 { 01785 if (input==NULL) 01786 #if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) 01787 return XXH_OK; 01788 #else 01789 return XXH_ERROR; 01790 #endif 01791 01792 { const xxh_u8* const bEnd = input + len; 01793 const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret; 01794 01795 state->totalLen += len; 01796 01797 if (state->bufferedSize + len <= XXH3_INTERNALBUFFER_SIZE) { /* fill in tmp buffer */ 01798 XXH_memcpy(state->buffer + state->bufferedSize, input, len); 01799 state->bufferedSize += (XXH32_hash_t)len; 01800 return XXH_OK; 01801 } 01802 /* input is now > XXH3_INTERNALBUFFER_SIZE */ 01803 01804 #define XXH3_INTERNALBUFFER_STRIPES (XXH3_INTERNALBUFFER_SIZE / XXH_STRIPE_LEN) 01805 XXH_STATIC_ASSERT(XXH3_INTERNALBUFFER_SIZE % XXH_STRIPE_LEN == 0); /* clean multiple */ 01806 01807 /* 01808 * There is some input left inside the internal buffer. 01809 * Fill it, then consume it. 01810 */ 01811 if (state->bufferedSize) { 01812 size_t const loadSize = XXH3_INTERNALBUFFER_SIZE - state->bufferedSize; 01813 XXH_memcpy(state->buffer + state->bufferedSize, input, loadSize); 01814 input += loadSize; 01815 XXH3_consumeStripes(state->acc, 01816 &state->nbStripesSoFar, state->nbStripesPerBlock, 01817 state->buffer, XXH3_INTERNALBUFFER_STRIPES, 01818 secret, state->secretLimit, 01819 accWidth); 01820 state->bufferedSize = 0; 01821 } 01822 01823 /* Consume input by full buffer quantities */ 01824 if (input+XXH3_INTERNALBUFFER_SIZE <= bEnd) { 01825 const xxh_u8* const limit = bEnd - XXH3_INTERNALBUFFER_SIZE; 01826 do { 01827 XXH3_consumeStripes(state->acc, 01828 &state->nbStripesSoFar, state->nbStripesPerBlock, 01829 input, XXH3_INTERNALBUFFER_STRIPES, 01830 secret, state->secretLimit, 01831 accWidth); 01832 input += XXH3_INTERNALBUFFER_SIZE; 01833 } while (input<=limit); 01834 } 01835 01836 if (input < bEnd) { /* Some remaining input: buffer it */ 01837 XXH_memcpy(state->buffer, input, (size_t)(bEnd-input)); 01838 state->bufferedSize = (XXH32_hash_t)(bEnd-input); 01839 } 01840 } 01841 01842 return XXH_OK; 01843 } 01844 01845 XXH_PUBLIC_API XXH_errorcode 01846 XXH3_64bits_update(XXH3_state_t* state, const void* input, size_t len) 01847 { 01848 return XXH3_update(state, (const xxh_u8*)input, len, XXH3_acc_64bits); 01849 } 01850 01851 01852 XXH_FORCE_INLINE void 01853 XXH3_digest_long (XXH64_hash_t* acc, 01854 const XXH3_state_t* state, 01855 const unsigned char* secret, 01856 XXH3_accWidth_e accWidth) 01857 { 01858 /* 01859 * Digest on a local copy. This way, the state remains unaltered, and it can 01860 * continue ingesting more input afterwards. 
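 *
 * (Added usage sketch, illustrative only, error handling omitted; `part1`,
 *  `part2` and their lengths are placeholder names:
 *
 *      XXH3_state_t* st = XXH3_createState();
 *      XXH3_64bits_reset(st);
 *      XXH3_64bits_update(st, part1, len1);
 *      XXH64_hash_t partial = XXH3_64bits_digest(st);   // state not modified
 *      XXH3_64bits_update(st, part2, len2);             // streaming continues
 *      XXH64_hash_t full    = XXH3_64bits_digest(st);
 *      XXH3_freeState(st);
 *  )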
01861 */ 01862 memcpy(acc, state->acc, sizeof(state->acc)); 01863 if (state->bufferedSize >= XXH_STRIPE_LEN) { 01864 size_t const totalNbStripes = state->bufferedSize / XXH_STRIPE_LEN; 01865 size_t nbStripesSoFar = state->nbStripesSoFar; 01866 XXH3_consumeStripes(acc, 01867 &nbStripesSoFar, state->nbStripesPerBlock, 01868 state->buffer, totalNbStripes, 01869 secret, state->secretLimit, 01870 accWidth); 01871 if (state->bufferedSize % XXH_STRIPE_LEN) { /* one last partial stripe */ 01872 XXH3_accumulate_512(acc, 01873 state->buffer + state->bufferedSize - XXH_STRIPE_LEN, 01874 secret + state->secretLimit - XXH_SECRET_LASTACC_START, 01875 accWidth); 01876 } 01877 } else { /* bufferedSize < XXH_STRIPE_LEN */ 01878 if (state->bufferedSize) { /* one last stripe */ 01879 xxh_u8 lastStripe[XXH_STRIPE_LEN]; 01880 size_t const catchupSize = XXH_STRIPE_LEN - state->bufferedSize; 01881 memcpy(lastStripe, state->buffer + sizeof(state->buffer) - catchupSize, catchupSize); 01882 memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize); 01883 XXH3_accumulate_512(acc, 01884 lastStripe, 01885 secret + state->secretLimit - XXH_SECRET_LASTACC_START, 01886 accWidth); 01887 } } 01888 } 01889 01890 XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (const XXH3_state_t* state) 01891 { 01892 const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret; 01893 if (state->totalLen > XXH3_MIDSIZE_MAX) { 01894 XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB]; 01895 XXH3_digest_long(acc, state, secret, XXH3_acc_64bits); 01896 return XXH3_mergeAccs(acc, 01897 secret + XXH_SECRET_MERGEACCS_START, 01898 (xxh_u64)state->totalLen * XXH_PRIME64_1); 01899 } 01900 /* totalLen <= XXH3_MIDSIZE_MAX: digesting a short input */ 01901 if (state->seed) 01902 return XXH3_64bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed); 01903 return XXH3_64bits_withSecret(state->buffer, (size_t)(state->totalLen), 01904 secret, state->secretLimit + XXH_STRIPE_LEN); 01905 } 01906 01907 /* ========================================== 01908 * XXH3 128 bits (a.k.a XXH128) 01909 * ========================================== 01910 * XXH3's 128-bit variant has better mixing and strength than the 64-bit variant, 01911 * even without counting the significantly larger output size. 01912 * 01913 * For example, extra steps are taken to avoid the seed-dependent collisions 01914 * in 17-240 byte inputs (See XXH3_mix16B and XXH128_mix32B). 01915 * 01916 * This strength naturally comes at the cost of some speed, especially on short 01917 * lengths. Note that longer hashes are about as fast as the 64-bit version 01918 * due to it using only a slight modification of the 64-bit loop. 01919 * 01920 * XXH128 is also more oriented towards 64-bit machines. It is still extremely 01921 * fast for a _128-bit_ hash on 32-bit (it usually clears XXH64). 01922 */ 01923 01924 XXH_FORCE_INLINE XXH128_hash_t 01925 XXH3_len_1to3_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) 01926 { 01927 /* A doubled version of 1to3_64b with different constants. 
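 *
 * (Added note: "doubled" here means the same packed 32-bit word feeds both
 *  halves; the high half just sees a permuted view of it,
 *      combinedh = XXH_rotl32(XXH_swap32(combinedl), 13);
 *  so low64 and high64 mix different arrangements of the same input bytes.)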
*/ 01928 XXH_ASSERT(input != NULL); 01929 XXH_ASSERT(1 <= len && len <= 3); 01930 XXH_ASSERT(secret != NULL); 01931 /* 01932 * len = 1: combinedl = { input[0], 0x01, input[0], input[0] } 01933 * len = 2: combinedl = { input[1], 0x02, input[0], input[1] } 01934 * len = 3: combinedl = { input[2], 0x03, input[0], input[1] } 01935 */ 01936 { xxh_u8 const c1 = input[0]; 01937 xxh_u8 const c2 = input[len >> 1]; 01938 xxh_u8 const c3 = input[len - 1]; 01939 xxh_u32 const combinedl = ((xxh_u32)c1 <<16) | ((xxh_u32)c2 << 24) 01940 | ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8); 01941 xxh_u32 const combinedh = XXH_rotl32(XXH_swap32(combinedl), 13); 01942 xxh_u64 const bitflipl = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed; 01943 xxh_u64 const bitfliph = (XXH_readLE32(secret+8) ^ XXH_readLE32(secret+12)) - seed; 01944 xxh_u64 const keyed_lo = (xxh_u64)combinedl ^ bitflipl; 01945 xxh_u64 const keyed_hi = (xxh_u64)combinedh ^ bitfliph; 01946 xxh_u64 const mixedl = keyed_lo * XXH_PRIME64_1; 01947 xxh_u64 const mixedh = keyed_hi * XXH_PRIME64_5; 01948 XXH128_hash_t h128; 01949 h128.low64 = XXH3_avalanche(mixedl); 01950 h128.high64 = XXH3_avalanche(mixedh); 01951 return h128; 01952 } 01953 } 01954 01955 XXH_FORCE_INLINE XXH128_hash_t 01956 XXH3_len_4to8_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) 01957 { 01958 XXH_ASSERT(input != NULL); 01959 XXH_ASSERT(secret != NULL); 01960 XXH_ASSERT(4 <= len && len <= 8); 01961 seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32; 01962 { xxh_u32 const input_lo = XXH_readLE32(input); 01963 xxh_u32 const input_hi = XXH_readLE32(input + len - 4); 01964 xxh_u64 const input_64 = input_lo + ((xxh_u64)input_hi << 32); 01965 xxh_u64 const bitflip = (XXH_readLE64(secret+16) ^ XXH_readLE64(secret+24)) + seed; 01966 xxh_u64 const keyed = input_64 ^ bitflip; 01967 01968 /* Shift len to the left to ensure it is even, this avoids even multiplies. */ 01969 XXH128_hash_t m128 = XXH_mult64to128(keyed, XXH_PRIME64_1 + (len << 2)); 01970 01971 m128.high64 += (m128.low64 << 1); 01972 m128.low64 ^= (m128.high64 >> 3); 01973 01974 m128.low64 = XXH_xorshift64(m128.low64, 35); 01975 m128.low64 *= 0x9FB21C651E98DF25ULL; 01976 m128.low64 = XXH_xorshift64(m128.low64, 28); 01977 m128.high64 = XXH3_avalanche(m128.high64); 01978 return m128; 01979 } 01980 } 01981 01982 XXH_FORCE_INLINE XXH128_hash_t 01983 XXH3_len_9to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) 01984 { 01985 XXH_ASSERT(input != NULL); 01986 XXH_ASSERT(secret != NULL); 01987 XXH_ASSERT(9 <= len && len <= 16); 01988 { xxh_u64 const bitflipl = (XXH_readLE64(secret+32) ^ XXH_readLE64(secret+40)) - seed; 01989 xxh_u64 const bitfliph = (XXH_readLE64(secret+48) ^ XXH_readLE64(secret+56)) + seed; 01990 xxh_u64 const input_lo = XXH_readLE64(input); 01991 xxh_u64 input_hi = XXH_readLE64(input + len - 8); 01992 XXH128_hash_t m128 = XXH_mult64to128(input_lo ^ input_hi ^ bitflipl, XXH_PRIME64_1); 01993 /* 01994 * Put len in the middle of m128 to ensure that the length gets mixed to 01995 * both the low and high bits in the 128x64 multiply below. 01996 */ 01997 m128.low64 += (xxh_u64)(len - 1) << 54; 01998 input_hi ^= bitfliph; 01999 /* 02000 * Add the high 32 bits of input_hi to the high 32 bits of m128, then 02001 * add the long product of the low 32 bits of input_hi and XXH_PRIME32_2 to 02002 * the high 64 bits of m128. 02003 * 02004 * The best approach to this operation is different on 32-bit and 64-bit. 
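 *
 * (Added note: both branches below compute the same value, namely
 *      m128.high64 += (input_hi & 0xFFFFFFFF00000000ULL)
 *                   + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2);
 *  they differ only in how the mask and multiply are arranged for the
 *  target word size.)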
02005 */ 02006 if (sizeof(void *) < sizeof(xxh_u64)) { /* 32-bit */ 02007 /* 02008 * 32-bit optimized version, which is more readable. 02009 * 02010 * On 32-bit, it removes an ADC and delays a dependency between the two 02011 * halves of m128.high64, but it generates an extra mask on 64-bit. 02012 */ 02013 m128.high64 += (input_hi & 0xFFFFFFFF00000000) + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2); 02014 } else { 02015 /* 02016 * 64-bit optimized (albeit more confusing) version. 02017 * 02018 * Uses some properties of addition and multiplication to remove the mask: 02019 * 02020 * Let: 02021 * a = input_hi.lo = (input_hi & 0x00000000FFFFFFFF) 02022 * b = input_hi.hi = (input_hi & 0xFFFFFFFF00000000) 02023 * c = XXH_PRIME32_2 02024 * 02025 * a + (b * c) 02026 * Inverse Property: x + y - x == y 02027 * a + (b * (1 + c - 1)) 02028 * Distributive Property: x * (y + z) == (x * y) + (x * z) 02029 * a + (b * 1) + (b * (c - 1)) 02030 * Identity Property: x * 1 == x 02031 * a + b + (b * (c - 1)) 02032 * 02033 * Substitute a, b, and c: 02034 * input_hi.hi + input_hi.lo + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1)) 02035 * 02036 * Since input_hi.hi + input_hi.lo == input_hi, we get this: 02037 * input_hi + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1)) 02038 */ 02039 m128.high64 += input_hi + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2 - 1); 02040 } 02041 /* m128 ^= XXH_swap64(m128 >> 64); */ 02042 m128.low64 ^= XXH_swap64(m128.high64); 02043 02044 { /* 128x64 multiply: h128 = m128 * XXH_PRIME64_2; */ 02045 XXH128_hash_t h128 = XXH_mult64to128(m128.low64, XXH_PRIME64_2); 02046 h128.high64 += m128.high64 * XXH_PRIME64_2; 02047 02048 h128.low64 = XXH3_avalanche(h128.low64); 02049 h128.high64 = XXH3_avalanche(h128.high64); 02050 return h128; 02051 } } 02052 } 02053 02054 /* 02055 * Assumption: `secret` size is >= XXH3_SECRET_SIZE_MIN 02056 */ 02057 XXH_FORCE_INLINE XXH128_hash_t 02058 XXH3_len_0to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) 02059 { 02060 XXH_ASSERT(len <= 16); 02061 { if (len > 8) return XXH3_len_9to16_128b(input, len, secret, seed); 02062 if (len >= 4) return XXH3_len_4to8_128b(input, len, secret, seed); 02063 if (len) return XXH3_len_1to3_128b(input, len, secret, seed); 02064 { XXH128_hash_t h128; 02065 xxh_u64 const bitflipl = XXH_readLE64(secret+64) ^ XXH_readLE64(secret+72); 02066 xxh_u64 const bitfliph = XXH_readLE64(secret+80) ^ XXH_readLE64(secret+88); 02067 h128.low64 = XXH3_avalanche((XXH_PRIME64_1 + seed) ^ bitflipl); 02068 h128.high64 = XXH3_avalanche((XXH_PRIME64_2 - seed) ^ bitfliph); 02069 return h128; 02070 } } 02071 } 02072 02073 /* 02074 * A bit slower than XXH3_mix16B, but handles multiply by zero better. 
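 *
 * (Added note, reasoning presumed from the code below: the extra xor of the
 *  raw input words, e.g.
 *      acc.low64 ^= XXH_readLE64(input_2) + XXH_readLE64(input_2 + 8);
 *  means that even if a keyed multiply inside XXH3_mix16B evaluates to 0,
 *  those input bytes still reach the accumulator.)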
02075 */ 02076 XXH_FORCE_INLINE XXH128_hash_t 02077 XXH128_mix32B(XXH128_hash_t acc, const xxh_u8* input_1, const xxh_u8* input_2, 02078 const xxh_u8* secret, XXH64_hash_t seed) 02079 { 02080 acc.low64 += XXH3_mix16B (input_1, secret+0, seed); 02081 acc.low64 ^= XXH_readLE64(input_2) + XXH_readLE64(input_2 + 8); 02082 acc.high64 += XXH3_mix16B (input_2, secret+16, seed); 02083 acc.high64 ^= XXH_readLE64(input_1) + XXH_readLE64(input_1 + 8); 02084 return acc; 02085 } 02086 02087 02088 XXH_FORCE_INLINE XXH128_hash_t 02089 XXH3_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len, 02090 const xxh_u8* XXH_RESTRICT secret, size_t secretSize, 02091 XXH64_hash_t seed) 02092 { 02093 XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; 02094 XXH_ASSERT(16 < len && len <= 128); 02095 02096 { XXH128_hash_t acc; 02097 acc.low64 = len * XXH_PRIME64_1; 02098 acc.high64 = 0; 02099 if (len > 32) { 02100 if (len > 64) { 02101 if (len > 96) { 02102 acc = XXH128_mix32B(acc, input+48, input+len-64, secret+96, seed); 02103 } 02104 acc = XXH128_mix32B(acc, input+32, input+len-48, secret+64, seed); 02105 } 02106 acc = XXH128_mix32B(acc, input+16, input+len-32, secret+32, seed); 02107 } 02108 acc = XXH128_mix32B(acc, input, input+len-16, secret, seed); 02109 { XXH128_hash_t h128; 02110 h128.low64 = acc.low64 + acc.high64; 02111 h128.high64 = (acc.low64 * XXH_PRIME64_1) 02112 + (acc.high64 * XXH_PRIME64_4) 02113 + ((len - seed) * XXH_PRIME64_2); 02114 h128.low64 = XXH3_avalanche(h128.low64); 02115 h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64); 02116 return h128; 02117 } 02118 } 02119 } 02120 02121 XXH_NO_INLINE XXH128_hash_t 02122 XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len, 02123 const xxh_u8* XXH_RESTRICT secret, size_t secretSize, 02124 XXH64_hash_t seed) 02125 { 02126 XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; 02127 XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX); 02128 02129 { XXH128_hash_t acc; 02130 int const nbRounds = (int)len / 32; 02131 int i; 02132 acc.low64 = len * XXH_PRIME64_1; 02133 acc.high64 = 0; 02134 for (i=0; i<4; i++) { 02135 acc = XXH128_mix32B(acc, 02136 input + (32 * i), 02137 input + (32 * i) + 16, 02138 secret + (32 * i), 02139 seed); 02140 } 02141 acc.low64 = XXH3_avalanche(acc.low64); 02142 acc.high64 = XXH3_avalanche(acc.high64); 02143 XXH_ASSERT(nbRounds >= 4); 02144 for (i=4 ; i < nbRounds; i++) { 02145 acc = XXH128_mix32B(acc, 02146 input + (32 * i), 02147 input + (32 * i) + 16, 02148 secret + XXH3_MIDSIZE_STARTOFFSET + (32 * (i - 4)), 02149 seed); 02150 } 02151 /* last bytes */ 02152 acc = XXH128_mix32B(acc, 02153 input + len - 16, 02154 input + len - 32, 02155 secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16, 02156 0ULL - seed); 02157 02158 { XXH128_hash_t h128; 02159 h128.low64 = acc.low64 + acc.high64; 02160 h128.high64 = (acc.low64 * XXH_PRIME64_1) 02161 + (acc.high64 * XXH_PRIME64_4) 02162 + ((len - seed) * XXH_PRIME64_2); 02163 h128.low64 = XXH3_avalanche(h128.low64); 02164 h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64); 02165 return h128; 02166 } 02167 } 02168 } 02169 02170 XXH_FORCE_INLINE XXH128_hash_t 02171 XXH3_hashLong_128b_internal(const xxh_u8* XXH_RESTRICT input, size_t len, 02172 const xxh_u8* XXH_RESTRICT secret, size_t secretSize) 02173 { 02174 XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC; 02175 02176 XXH3_hashLong_internal_loop(acc, input, len, secret, secretSize, XXH3_acc_128bits); 02177 02178 /* converge into final hash */ 02179 
XXH_STATIC_ASSERT(sizeof(acc) == 64); 02180 XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START); 02181 { XXH128_hash_t h128; 02182 h128.low64 = XXH3_mergeAccs(acc, 02183 secret + XXH_SECRET_MERGEACCS_START, 02184 (xxh_u64)len * XXH_PRIME64_1); 02185 h128.high64 = XXH3_mergeAccs(acc, 02186 secret + secretSize 02187 - sizeof(acc) - XXH_SECRET_MERGEACCS_START, 02188 ~((xxh_u64)len * XXH_PRIME64_2)); 02189 return h128; 02190 } 02191 } 02192 02193 /* 02194 * It's important for performance that XXH3_hashLong is not inlined. Not sure 02195 * why (uop cache maybe?), but the difference is large and easily measurable. 02196 */ 02197 XXH_NO_INLINE XXH128_hash_t 02198 XXH3_hashLong_128b_defaultSecret(const xxh_u8* input, size_t len) 02199 { 02200 return XXH3_hashLong_128b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret)); 02201 } 02202 02203 /* 02204 * It's important for performance that XXH3_hashLong is not inlined. Not sure 02205 * why (uop cache maybe?), but the difference is large and easily measurable. 02206 */ 02207 XXH_NO_INLINE XXH128_hash_t 02208 XXH3_hashLong_128b_withSecret(const xxh_u8* input, size_t len, 02209 const xxh_u8* secret, size_t secretSize) 02210 { 02211 return XXH3_hashLong_128b_internal(input, len, secret, secretSize); 02212 } 02213 02214 /* 02215 * It's important for performance that XXH3_hashLong is not inlined. Not sure 02216 * why (uop cache maybe?), but the difference is large and easily measurable. 02217 */ 02218 XXH_NO_INLINE XXH128_hash_t 02219 XXH3_hashLong_128b_withSeed(const xxh_u8* input, size_t len, XXH64_hash_t seed) 02220 { 02221 XXH_ALIGN(8) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE]; 02222 if (seed == 0) return XXH3_hashLong_128b_defaultSecret(input, len); 02223 XXH3_initCustomSecret(secret, seed); 02224 return XXH3_hashLong_128b_internal(input, len, secret, sizeof(secret)); 02225 } 02226 02227 02228 XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* input, size_t len) 02229 { 02230 if (len <= 16) 02231 return XXH3_len_0to16_128b((const xxh_u8*)input, len, XXH3_kSecret, 0); 02232 if (len <= 128) 02233 return XXH3_len_17to128_128b((const xxh_u8*)input, len, XXH3_kSecret, sizeof(XXH3_kSecret), 0); 02234 if (len <= XXH3_MIDSIZE_MAX) 02235 return XXH3_len_129to240_128b((const xxh_u8*)input, len, XXH3_kSecret, sizeof(XXH3_kSecret), 0); 02236 return XXH3_hashLong_128b_defaultSecret((const xxh_u8*)input, len); 02237 } 02238 02239 XXH_PUBLIC_API XXH128_hash_t 02240 XXH3_128bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize) 02241 { 02242 XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); 02243 /* 02244 * If an action is to be taken if `secret` conditions are not respected, 02245 * it should be done here. 02246 * For now, it's a contract pre-condition. 02247 * Adding a check and a branch here would cost performance at every hash. 
02248  */
02249     if (len <= 16)
02250         return XXH3_len_0to16_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, 0);
02251     if (len <= 128)
02252         return XXH3_len_17to128_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0);
02253     if (len <= XXH3_MIDSIZE_MAX)
02254         return XXH3_len_129to240_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0);
02255     return XXH3_hashLong_128b_withSecret((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize);
02256 }
02257 
02258 XXH_PUBLIC_API XXH128_hash_t
02259 XXH3_128bits_withSeed(const void* input, size_t len, XXH64_hash_t seed)
02260 {
02261     if (len <= 16)
02262         return XXH3_len_0to16_128b((const xxh_u8*)input, len, XXH3_kSecret, seed);
02263     if (len <= 128)
02264         return XXH3_len_17to128_128b((const xxh_u8*)input, len, XXH3_kSecret, sizeof(XXH3_kSecret), seed);
02265     if (len <= XXH3_MIDSIZE_MAX)
02266         return XXH3_len_129to240_128b((const xxh_u8*)input, len, XXH3_kSecret, sizeof(XXH3_kSecret), seed);
02267     return XXH3_hashLong_128b_withSeed((const xxh_u8*)input, len, seed);
02268 }
02269 
02270 XXH_PUBLIC_API XXH128_hash_t
02271 XXH128(const void* input, size_t len, XXH64_hash_t seed)
02272 {
02273     return XXH3_128bits_withSeed(input, len, seed);
02274 }
02275 
02276 
02277 /* === XXH3 128-bit streaming === */
02278 
02279 /*
02280  * All the functions are actually the same as for the 64-bit streaming variant.
02281  * The only difference is the finalization routine.
02282  */
02283 
02284 static void
02285 XXH3_128bits_reset_internal(XXH3_state_t* statePtr,
02286                             XXH64_hash_t seed,
02287                             const xxh_u8* secret, size_t secretSize)
02288 {
02289     XXH3_64bits_reset_internal(statePtr, seed, secret, secretSize);
02290 }
02291 
02292 XXH_PUBLIC_API XXH_errorcode
02293 XXH3_128bits_reset(XXH3_state_t* statePtr)
02294 {
02295     if (statePtr == NULL) return XXH_ERROR;
02296     XXH3_128bits_reset_internal(statePtr, 0, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE);
02297     return XXH_OK;
02298 }
02299 
02300 XXH_PUBLIC_API XXH_errorcode
02301 XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize)
02302 {
02303     if (statePtr == NULL) return XXH_ERROR;
02304     XXH3_128bits_reset_internal(statePtr, 0, (const xxh_u8*)secret, secretSize);
02305     if (secret == NULL) return XXH_ERROR;
02306     if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
02307     return XXH_OK;
02308 }
02309 
02310 XXH_PUBLIC_API XXH_errorcode
02311 XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed)
02312 {
02313     if (statePtr == NULL) return XXH_ERROR;
02314     XXH3_128bits_reset_internal(statePtr, seed, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE);
02315     XXH3_initCustomSecret(statePtr->customSecret, seed);
02316     statePtr->extSecret = NULL;
02317     return XXH_OK;
02318 }
02319 
02320 XXH_PUBLIC_API XXH_errorcode
02321 XXH3_128bits_update(XXH3_state_t* state, const void* input, size_t len)
02322 {
02323     return XXH3_update(state, (const xxh_u8*)input, len, XXH3_acc_128bits);
02324 }
02325 
02326 XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* state)
02327 {
02328     const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret;
02329     if (state->totalLen > XXH3_MIDSIZE_MAX) {
02330         XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB];
02331         XXH3_digest_long(acc, state, secret, XXH3_acc_128bits);
02332         XXH_ASSERT(state->secretLimit + XXH_STRIPE_LEN >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
02333         {   XXH128_hash_t h128;
02334             h128.low64  = XXH3_mergeAccs(acc,
02335                                          secret + XXH_SECRET_MERGEACCS_START,
02336                                          (xxh_u64)state->totalLen * XXH_PRIME64_1);
02337             h128.high64 = XXH3_mergeAccs(acc,
02338                                          secret + state->secretLimit + XXH_STRIPE_LEN
02339                                                 - sizeof(acc) - XXH_SECRET_MERGEACCS_START,
02340                                          ~((xxh_u64)state->totalLen * XXH_PRIME64_2));
02341             return h128;
02342         }
02343     }
02344     /* totalLen <= XXH3_MIDSIZE_MAX : digesting a short input */
02345     if (state->seed)
02346         return XXH3_128bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed);
02347     return XXH3_128bits_withSecret(state->buffer, (size_t)(state->totalLen),
02348                                    secret, state->secretLimit + XXH_STRIPE_LEN);
02349 }
02350 
02351 /* 128-bit utility functions */
02352 
02353 #include <string.h>   /* memcmp, memcpy */
02354 
02355 /* return : 1 if equal, 0 if different */
02356 XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2)
02357 {
02358     /* note : XXH128_hash_t is compact, it has no padding byte */
02359     return !(memcmp(&h1, &h2, sizeof(h1)));
02360 }
02361 
02362 /* This prototype is compatible with stdlib's qsort().
02363  * return : >0 if *h128_1 > *h128_2
02364  *          <0 if *h128_1 < *h128_2
02365  *          =0 if *h128_1 == *h128_2 */
02366 XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2)
02367 {
02368     XXH128_hash_t const h1 = *(const XXH128_hash_t*)h128_1;
02369     XXH128_hash_t const h2 = *(const XXH128_hash_t*)h128_2;
02370     int const hcmp = (h1.high64 > h2.high64) - (h2.high64 > h1.high64);
02371     /* note : bets that, in most cases, hash values are different */
02372     if (hcmp) return hcmp;
02373     return (h1.low64 > h2.low64) - (h2.low64 > h1.low64);
02374 }
02375 
02376 
02377 /*====== Canonical representation ======*/
02378 XXH_PUBLIC_API void
02379 XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash)
02380 {
02381     XXH_STATIC_ASSERT(sizeof(XXH128_canonical_t) == sizeof(XXH128_hash_t));
02382     if (XXH_CPU_LITTLE_ENDIAN) {
02383         hash.high64 = XXH_swap64(hash.high64);
02384         hash.low64  = XXH_swap64(hash.low64);
02385     }
02386     memcpy(dst, &hash.high64, sizeof(hash.high64));
02387     memcpy((char*)dst + sizeof(hash.high64), &hash.low64, sizeof(hash.low64));
02388 }
02389 
02390 XXH_PUBLIC_API XXH128_hash_t
02391 XXH128_hashFromCanonical(const XXH128_canonical_t* src)
02392 {
02393     XXH128_hash_t h;
02394     h.high64 = XXH_readBE64(src);
02395     h.low64  = XXH_readBE64(src->digest + 8);
02396     return h;
02397 }
02398 
02399 /* Pop our optimization override from above */
02400 #if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \
02401   && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
02402   && defined(__OPTIMIZE__) && !defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */
02403 # pragma GCC pop_options
02404 #endif
02405 
02406 #endif  /* XXH3_H_1397135465 */
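
/*
 * Added usage sketch (illustrative only, not part of xxHash). `data` and
 * `size` stand in for the caller's buffer and its length:
 *
 *     XXH64_hash_t  h64  = XXH3_64bits(data, size);
 *     XXH64_hash_t  h64s = XXH3_64bits_withSeed(data, size, (XXH64_hash_t)0xCAFE);
 *     XXH128_hash_t h128 = XXH128(data, size, 0);
 *
 *     XXH128_canonical_t canon;
 *     XXH128_canonicalFromHash(&canon, h128);               // big-endian serialization
 *     XXH128_hash_t back = XXH128_hashFromCanonical(&canon);
 *     // XXH128_isEqual(h128, back) == 1 : the round-trip is exact
 */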