/*
 * xxHash - Extremely Fast Hash algorithm
 * Development source file for `xxh3`
 * Copyright (C) 2019-2020 Yann Collet
 *
 * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php)
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 *
 *    * Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *    * Redistributions in binary form must reproduce the above
 *      copyright notice, this list of conditions and the following disclaimer
 *      in the documentation and/or other materials provided with the
 *      distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * You can contact the author at:
 *   - xxHash homepage: https://www.xxhash.com
 *   - xxHash source repository: https://github.com/Cyan4973/xxHash
 */

/*
 * Note: This file is separated for development purposes.
 * It will be integrated into `xxhash.h` when the development stage is completed.
 *
 * Credit: most of the work on vectorial and asm variants comes from @easyaspi314
 */

#ifndef XXH3_H_1397135465
#define XXH3_H_1397135465

/* === Dependencies === */
#ifndef XXHASH_H_5627135585666179
/* special: when including `xxh3.h` directly, turn on XXH_INLINE_ALL */
#  undef XXH_INLINE_ALL   /* avoid redefinition */
#  define XXH_INLINE_ALL
#endif
#include "xxhash.h"


/* === Compiler specifics === */

#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* >= C99 */
#  define XXH_RESTRICT   restrict
#else
/* Note: it might be useful to define __restrict or __restrict__ for some C++ compilers */
#  define XXH_RESTRICT   /* disable */
#endif

#if (defined(__GNUC__) && (__GNUC__ >= 3)) \
  || (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) \
  || defined(__clang__)
#  define XXH_likely(x) __builtin_expect(x, 1)
#  define XXH_unlikely(x) __builtin_expect(x, 0)
#else
#  define XXH_likely(x) (x)
#  define XXH_unlikely(x) (x)
#endif

#if defined(__GNUC__)
#  if defined(__AVX2__)
#    include <immintrin.h>
#  elif defined(__SSE2__)
#    include <emmintrin.h>
#  elif defined(__ARM_NEON__) || defined(__ARM_NEON)
#    define inline __inline__  /* clang bug */
#    include <arm_neon.h>
#    undef inline
#  endif
#elif defined(_MSC_VER)
#  include <intrin.h>
#endif

/*
 * One goal of XXH3 is to make it fast on both 32-bit and 64-bit, while
 * remaining a true 64-bit/128-bit hash function.
 *
 * This is done by prioritizing a subset of 64-bit operations that can be
 * emulated without too many steps on the average 32-bit machine.
 *
 * For example, these two lines seem similar, and run equally fast on 64-bit:
 *
 *   xxh_u64 x;
 *   x ^= (x >> 47); // good
 *   x ^= (x >> 13); // bad
 *
 * However, to a 32-bit machine, there is a major difference.
 *
 * x ^= (x >> 47) looks like this:
 *
 *   x.lo ^= (x.hi >> (47 - 32));
 *
 * while x ^= (x >> 13) looks like this:
 *
 *   // note: funnel shifts are not usually cheap.
 *   x.lo ^= (x.lo >> 13) | (x.hi << (32 - 13));
 *   x.hi ^= (x.hi >> 13);
 *
 * The first one is significantly faster than the second, simply because the
 * shift is larger than 32. This means:
 *  - All the bits we need are in the upper 32 bits, so we can ignore the lower
 *    32 bits in the shift.
 *  - The shift result will always fit in the lower 32 bits, and therefore,
 *    we can ignore the upper 32 bits in the xor.
 *
 * Thanks to this optimization, XXH3 only requires these features to be efficient:
 *
 *  - Usable unaligned access
 *  - A 32-bit or 64-bit ALU
 *      - If 32-bit, a decent ADC instruction
 *  - A 32 or 64-bit multiply with a 64-bit result
 *  - For the 128-bit variant, a decent byteswap helps short inputs.
 *
 * The first two are already required by XXH32, and almost all 32-bit and 64-bit
 * platforms which can run XXH32 can run XXH3 efficiently.
 *
 * Thumb-1, the classic 16-bit only subset of ARM's instruction set, is one
 * notable exception.
 *
 * First of all, Thumb-1 lacks support for the UMULL instruction which
 * performs the important long multiply.
This means numerous __aeabi_lmul 00136 * calls. 00137 * 00138 * Second of all, the 8 functional registers are just not enough. 00139 * Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic need 00140 * Lo registers, and this shuffling results in thousands more MOVs than A32. 00141 * 00142 * A32 and T32 don't have this limitation. They can access all 14 registers, 00143 * do a 32->64 multiply with UMULL, and the flexible operand allowing free 00144 * shifts is helpful, too. 00145 * 00146 * Therefore, we do a quick sanity check. 00147 * 00148 * If compiling Thumb-1 for a target which supports ARM instructions, we will 00149 * emit a warning, as it is not a "sane" platform to compile for. 00150 * 00151 * Usually, if this happens, it is because of an accident and you probably need 00152 * to specify -march, as you likely meant to compile for a newer architecture. 00153 */ 00154 #if defined(__thumb__) && !defined(__thumb2__) && defined(__ARM_ARCH_ISA_ARM) 00155 # warning "XXH3 is highly inefficient without ARM or Thumb-2." 00156 #endif 00157 00158 /* ========================================== 00159 * Vectorization detection 00160 * ========================================== */ 00161 #define XXH_SCALAR 0 /* Portable scalar version */ 00162 #define XXH_SSE2 1 /* SSE2 for Pentium 4 and all x86_64 */ 00163 #define XXH_AVX2 2 /* AVX2 for Haswell and Bulldozer */ 00164 #define XXH_NEON 3 /* NEON for most ARMv7-A and all AArch64 */ 00165 #define XXH_VSX 4 /* VSX and ZVector for POWER8/z13 */ 00166 #define XXH_AVX512 5 /* AVX512 for Skylake and Icelake */ 00167 00168 #ifndef XXH_VECTOR /* can be defined on command line */ 00169 # if defined(__AVX512F__) 00170 # define XXH_VECTOR XXH_AVX512 00171 # elif defined(__AVX2__) 00172 # define XXH_VECTOR XXH_AVX2 00173 # elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP == 2)) 00174 # define XXH_VECTOR XXH_SSE2 00175 # elif defined(__GNUC__) /* msvc support maybe later */ \ 00176 && (defined(__ARM_NEON__) || defined(__ARM_NEON)) \ 00177 && (defined(__LITTLE_ENDIAN__) /* We only support little endian NEON */ \ 00178 || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) 00179 # define XXH_VECTOR XXH_NEON 00180 # elif (defined(__PPC64__) && defined(__POWER8_VECTOR__)) \ 00181 || (defined(__s390x__) && defined(__VEC__)) \ 00182 && defined(__GNUC__) /* TODO: IBM XL */ 00183 # define XXH_VECTOR XXH_VSX 00184 # else 00185 # define XXH_VECTOR XXH_SCALAR 00186 # endif 00187 #endif 00188 00189 /* 00190 * Controls the alignment of the accumulator. 00191 * This is for compatibility with aligned vector loads, which are usually faster. 00192 */ 00193 #ifndef XXH_ACC_ALIGN 00194 # if XXH_VECTOR == XXH_SCALAR /* scalar */ 00195 # define XXH_ACC_ALIGN 8 00196 # elif XXH_VECTOR == XXH_SSE2 /* sse2 */ 00197 # define XXH_ACC_ALIGN 16 00198 # elif XXH_VECTOR == XXH_AVX2 /* avx2 */ 00199 # define XXH_ACC_ALIGN 32 00200 # elif XXH_VECTOR == XXH_NEON /* neon */ 00201 # define XXH_ACC_ALIGN 16 00202 # elif XXH_VECTOR == XXH_VSX /* vsx */ 00203 # define XXH_ACC_ALIGN 16 00204 # elif XXH_VECTOR == XXH_AVX512 /* avx512 */ 00205 # define XXH_ACC_ALIGN 64 00206 # endif 00207 #endif 00208 00209 /* 00210 * UGLY HACK: 00211 * GCC usually generates the best code with -O3 for xxHash. 00212 * 00213 * However, when targeting AVX2, it is overzealous in its unrolling resulting 00214 * in code roughly 3/4 the speed of Clang. 
00215 * 00216 * There are other issues, such as GCC splitting _mm256_loadu_si256 into 00217 * _mm_loadu_si128 + _mm256_inserti128_si256. This is an optimization which 00218 * only applies to Sandy and Ivy Bridge... which don't even support AVX2. 00219 * 00220 * That is why when compiling the AVX2 version, it is recommended to use either 00221 * -O2 -mavx2 -march=haswell 00222 * or 00223 * -O2 -mavx2 -mno-avx256-split-unaligned-load 00224 * for decent performance, or to use Clang instead. 00225 * 00226 * Fortunately, we can control the first one with a pragma that forces GCC into 00227 * -O2, but the other one we can't control without "failed to inline always 00228 * inline function due to target mismatch" warnings. 00229 */ 00230 #if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \ 00231 && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ 00232 && defined(__OPTIMIZE__) && !defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */ 00233 # pragma GCC push_options 00234 # pragma GCC optimize("-O2") 00235 #endif 00236 00237 00238 #if XXH_VECTOR == XXH_NEON 00239 /* 00240 * NEON's setup for vmlal_u32 is a little more complicated than it is on 00241 * SSE2, AVX2, and VSX. 00242 * 00243 * While PMULUDQ and VMULEUW both perform a mask, VMLAL.U32 performs an upcast. 00244 * 00245 * To do the same operation, the 128-bit 'Q' register needs to be split into 00246 * two 64-bit 'D' registers, performing this operation:: 00247 * 00248 * [ a | b ] 00249 * | '---------. .--------' | 00250 * | x | 00251 * | .---------' '--------. | 00252 * [ a & 0xFFFFFFFF | b & 0xFFFFFFFF ],[ a >> 32 | b >> 32 ] 00253 * 00254 * Due to significant changes in aarch64, the fastest method for aarch64 is 00255 * completely different than the fastest method for ARMv7-A. 00256 * 00257 * ARMv7-A treats D registers as unions overlaying Q registers, so modifying 00258 * D11 will modify the high half of Q5. This is similar to how modifying AH 00259 * will only affect bits 8-15 of AX on x86. 00260 * 00261 * VZIP takes two registers, and puts even lanes in one register and odd lanes 00262 * in the other. 00263 * 00264 * On ARMv7-A, this strangely modifies both parameters in place instead of 00265 * taking the usual 3-operand form. 00266 * 00267 * Therefore, if we want to do this, we can simply use a D-form VZIP.32 on the 00268 * lower and upper halves of the Q register to end up with the high and low 00269 * halves where we want - all in one instruction. 00270 * 00271 * vzip.32 d10, d11 @ d10 = { d10[0], d11[0] }; d11 = { d10[1], d11[1] } 00272 * 00273 * Unfortunately we need inline assembly for this: Instructions modifying two 00274 * registers at once is not possible in GCC or Clang's IR, and they have to 00275 * create a copy. 00276 * 00277 * aarch64 requires a different approach. 00278 * 00279 * In order to make it easier to write a decent compiler for aarch64, many 00280 * quirks were removed, such as conditional execution. 00281 * 00282 * NEON was also affected by this. 00283 * 00284 * aarch64 cannot access the high bits of a Q-form register, and writes to a 00285 * D-form register zero the high bits, similar to how writes to W-form scalar 00286 * registers (or DWORD registers on x86_64) work. 00287 * 00288 * The formerly free vget_high intrinsics now require a vext (with a few 00289 * exceptions) 00290 * 00291 * Additionally, VZIP was replaced by ZIP1 and ZIP2, which are the equivalent 00292 * of PUNPCKL* and PUNPCKH* in SSE, respectively, in order to only modify one 00293 * operand. 
00294 * 00295 * The equivalent of the VZIP.32 on the lower and upper halves would be this 00296 * mess: 00297 * 00298 * ext v2.4s, v0.4s, v0.4s, #2 // v2 = { v0[2], v0[3], v0[0], v0[1] } 00299 * zip1 v1.2s, v0.2s, v2.2s // v1 = { v0[0], v2[0] } 00300 * zip2 v0.2s, v0.2s, v1.2s // v0 = { v0[1], v2[1] } 00301 * 00302 * Instead, we use a literal downcast, vmovn_u64 (XTN), and vshrn_n_u64 (SHRN): 00303 * 00304 * shrn v1.2s, v0.2d, #32 // v1 = (uint32x2_t)(v0 >> 32); 00305 * xtn v0.2s, v0.2d // v0 = (uint32x2_t)(v0 & 0xFFFFFFFF); 00306 * 00307 * This is available on ARMv7-A, but is less efficient than a single VZIP.32. 00308 */ 00309 00310 /* 00311 * Function-like macro: 00312 * void XXH_SPLIT_IN_PLACE(uint64x2_t &in, uint32x2_t &outLo, uint32x2_t &outHi) 00313 * { 00314 * outLo = (uint32x2_t)(in & 0xFFFFFFFF); 00315 * outHi = (uint32x2_t)(in >> 32); 00316 * in = UNDEFINED; 00317 * } 00318 */ 00319 # if !defined(XXH_NO_VZIP_HACK) /* define to disable */ \ 00320 && defined(__GNUC__) \ 00321 && !defined(__aarch64__) && !defined(__arm64__) 00322 # define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \ 00323 do { \ 00324 /* Undocumented GCC/Clang operand modifier: %e0 = lower D half, %f0 = upper D half */ \ 00325 /* https://github.com/gcc-mirror/gcc/blob/38cf91e5/gcc/config/arm/arm.c#L22486 */ \ 00326 /* https://github.com/llvm-mirror/llvm/blob/2c4ca683/lib/Target/ARM/ARMAsmPrinter.cpp#L399 */ \ 00327 __asm__("vzip.32 %e0, %f0" : "+w" (in)); \ 00328 (outLo) = vget_low_u32 (vreinterpretq_u32_u64(in)); \ 00329 (outHi) = vget_high_u32(vreinterpretq_u32_u64(in)); \ 00330 } while (0) 00331 # else 00332 # define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \ 00333 do { \ 00334 (outLo) = vmovn_u64 (in); \ 00335 (outHi) = vshrn_n_u64 ((in), 32); \ 00336 } while (0) 00337 # endif 00338 #endif /* XXH_VECTOR == XXH_NEON */ 00339 00340 /* 00341 * VSX and Z Vector helpers. 00342 * 00343 * This is very messy, and any pull requests to clean this up are welcome. 00344 * 00345 * There are a lot of problems with supporting VSX and s390x, due to 00346 * inconsistent intrinsics, spotty coverage, and multiple endiannesses. 00347 */ 00348 #if XXH_VECTOR == XXH_VSX 00349 # if defined(__s390x__) 00350 # include <s390intrin.h> 00351 # else 00352 # include <altivec.h> 00353 # endif 00354 00355 # undef vector /* Undo the pollution */ 00356 00357 typedef __vector unsigned long long xxh_u64x2; 00358 typedef __vector unsigned char xxh_u8x16; 00359 typedef __vector unsigned xxh_u32x4; 00360 00361 # ifndef XXH_VSX_BE 00362 # if defined(__BIG_ENDIAN__) \ 00363 || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) 00364 # define XXH_VSX_BE 1 00365 # elif defined(__VEC_ELEMENT_REG_ORDER__) && __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__ 00366 # warning "-maltivec=be is not recommended. Please use native endianness." 00367 # define XXH_VSX_BE 1 00368 # else 00369 # define XXH_VSX_BE 0 00370 # endif 00371 # endif /* !defined(XXH_VSX_BE) */ 00372 00373 # if XXH_VSX_BE 00374 /* A wrapper for POWER9's vec_revb. */ 00375 # if defined(__POWER9_VECTOR__) || (defined(__clang__) && defined(__s390x__)) 00376 # define XXH_vec_revb vec_revb 00377 # else 00378 XXH_FORCE_INLINE xxh_u64x2 XXH_vec_revb(xxh_u64x2 val) 00379 { 00380 xxh_u8x16 const vByteSwap = { 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, 00381 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08 }; 00382 return vec_perm(val, val, vByteSwap); 00383 } 00384 # endif 00385 # endif /* XXH_VSX_BE */ 00386 00387 /* 00388 * Performs an unaligned load and byte swaps it on big endian. 
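 *
 * In other words: the memcpy below performs the unaligned load, and on big
 * endian targets XXH_vec_revb() reverses the bytes of each 64-bit lane, so the
 * lane values match what a little endian target would have read.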
00389 */ 00390 XXH_FORCE_INLINE xxh_u64x2 XXH_vec_loadu(const void *ptr) 00391 { 00392 xxh_u64x2 ret; 00393 memcpy(&ret, ptr, sizeof(xxh_u64x2)); 00394 # if XXH_VSX_BE 00395 ret = XXH_vec_revb(ret); 00396 # endif 00397 return ret; 00398 } 00399 00400 /* 00401 * vec_mulo and vec_mule are very problematic intrinsics on PowerPC 00402 * 00403 * These intrinsics weren't added until GCC 8, despite existing for a while, 00404 * and they are endian dependent. Also, their meaning swap depending on version. 00405 * */ 00406 # if defined(__s390x__) 00407 /* s390x is always big endian, no issue on this platform */ 00408 # define XXH_vec_mulo vec_mulo 00409 # define XXH_vec_mule vec_mule 00410 # elif defined(__clang__) && XXH_HAS_BUILTIN(__builtin_altivec_vmuleuw) 00411 /* Clang has a better way to control this, we can just use the builtin which doesn't swap. */ 00412 # define XXH_vec_mulo __builtin_altivec_vmulouw 00413 # define XXH_vec_mule __builtin_altivec_vmuleuw 00414 # else 00415 /* gcc needs inline assembly */ 00416 /* Adapted from https://github.com/google/highwayhash/blob/master/highwayhash/hh_vsx.h. */ 00417 XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mulo(xxh_u32x4 a, xxh_u32x4 b) 00418 { 00419 xxh_u64x2 result; 00420 __asm__("vmulouw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b)); 00421 return result; 00422 } 00423 XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mule(xxh_u32x4 a, xxh_u32x4 b) 00424 { 00425 xxh_u64x2 result; 00426 __asm__("vmuleuw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b)); 00427 return result; 00428 } 00429 # endif /* XXH_vec_mulo, XXH_vec_mule */ 00430 #endif /* XXH_VECTOR == XXH_VSX */ 00431 00432 00433 /* prefetch 00434 * can be disabled, by declaring XXH_NO_PREFETCH build macro */ 00435 #if defined(XXH_NO_PREFETCH) 00436 # define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */ 00437 #else 00438 # if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_I86)) /* _mm_prefetch() is not defined outside of x86/x64 */ 00439 # include <mmintrin.h> /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */ 00440 # define XXH_PREFETCH(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0) 00441 # elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) ) 00442 # define XXH_PREFETCH(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */) 00443 # else 00444 # define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */ 00445 # endif 00446 #endif /* XXH_NO_PREFETCH */ 00447 00448 00449 /* ========================================== 00450 * XXH3 default settings 00451 * ========================================== */ 00452 00453 #define XXH_SECRET_DEFAULT_SIZE 192 /* minimum XXH3_SECRET_SIZE_MIN */ 00454 00455 #if (XXH_SECRET_DEFAULT_SIZE < XXH3_SECRET_SIZE_MIN) 00456 # error "default keyset is not large enough" 00457 #endif 00458 00459 /* Pseudorandom secret taken directly from FARSH */ 00460 XXH_ALIGN(64) static const xxh_u8 XXH3_kSecret[XXH_SECRET_DEFAULT_SIZE] = { 00461 0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c, 00462 0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f, 00463 0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21, 00464 0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c, 00465 0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3, 00466 0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 
0xd8, 00467 0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d, 00468 0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64, 00469 00470 0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb, 00471 0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e, 00472 0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce, 00473 0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e, 00474 }; 00475 00476 #ifdef XXH_OLD_NAMES 00477 # define kSecret XXH3_kSecret 00478 #endif 00479 00480 /* 00481 * Calculates a 32-bit to 64-bit long multiply. 00482 * 00483 * Wraps __emulu on MSVC x86 because it tends to call __allmul when it doesn't 00484 * need to (but it shouldn't need to anyways, it is about 7 instructions to do 00485 * a 64x64 multiply...). Since we know that this will _always_ emit MULL, we 00486 * use that instead of the normal method. 00487 * 00488 * If you are compiling for platforms like Thumb-1 and don't have a better option, 00489 * you may also want to write your own long multiply routine here. 00490 * 00491 * XXH_FORCE_INLINE xxh_u64 XXH_mult32to64(xxh_u64 x, xxh_u64 y) 00492 * { 00493 * return (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF); 00494 * } 00495 */ 00496 #if defined(_MSC_VER) && defined(_M_IX86) 00497 # include <intrin.h> 00498 # define XXH_mult32to64(x, y) __emulu((unsigned)(x), (unsigned)(y)) 00499 #else 00500 /* 00501 * Downcast + upcast is usually better than masking on older compilers like 00502 * GCC 4.2 (especially 32-bit ones), all without affecting newer compilers. 00503 * 00504 * The other method, (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF), will AND both operands 00505 * and perform a full 64x64 multiply -- entirely redundant on 32-bit. 00506 */ 00507 # define XXH_mult32to64(x, y) ((xxh_u64)(xxh_u32)(x) * (xxh_u64)(xxh_u32)(y)) 00508 #endif 00509 00510 /* 00511 * Calculates a 64->128-bit long multiply. 00512 * 00513 * Uses __uint128_t and _umul128 if available, otherwise uses a scalar version. 00514 */ 00515 static XXH128_hash_t 00516 XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs) 00517 { 00518 /* 00519 * GCC/Clang __uint128_t method. 00520 * 00521 * On most 64-bit targets, GCC and Clang define a __uint128_t type. 00522 * This is usually the best way as it usually uses a native long 64-bit 00523 * multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64. 00524 * 00525 * Usually. 00526 * 00527 * Despite being a 32-bit platform, Clang (and emscripten) define this type 00528 * despite not having the arithmetic for it. This results in a laggy 00529 * compiler builtin call which calculates a full 128-bit multiply. 00530 * In that case it is best to use the portable one. 00531 * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677 00532 */ 00533 #if defined(__GNUC__) && !defined(__wasm__) \ 00534 && defined(__SIZEOF_INT128__) \ 00535 || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) 00536 00537 __uint128_t const product = (__uint128_t)lhs * (__uint128_t)rhs; 00538 XXH128_hash_t r128; 00539 r128.low64 = (xxh_u64)(product); 00540 r128.high64 = (xxh_u64)(product >> 64); 00541 return r128; 00542 00543 /* 00544 * MSVC for x64's _umul128 method. 00545 * 00546 * xxh_u64 _umul128(xxh_u64 Multiplier, xxh_u64 Multiplicand, xxh_u64 *HighProduct); 00547 * 00548 * This compiles to single operand MUL on x64. 
00549 */ 00550 #elif defined(_M_X64) || defined(_M_IA64) 00551 00552 #ifndef _MSC_VER 00553 # pragma intrinsic(_umul128) 00554 #endif 00555 xxh_u64 product_high; 00556 xxh_u64 const product_low = _umul128(lhs, rhs, &product_high); 00557 XXH128_hash_t r128; 00558 r128.low64 = product_low; 00559 r128.high64 = product_high; 00560 return r128; 00561 00562 #else 00563 /* 00564 * Portable scalar method. Optimized for 32-bit and 64-bit ALUs. 00565 * 00566 * This is a fast and simple grade school multiply, which is shown below 00567 * with base 10 arithmetic instead of base 0x100000000. 00568 * 00569 * 9 3 // D2 lhs = 93 00570 * x 7 5 // D2 rhs = 75 00571 * ---------- 00572 * 1 5 // D2 lo_lo = (93 % 10) * (75 % 10) = 15 00573 * 4 5 | // D2 hi_lo = (93 / 10) * (75 % 10) = 45 00574 * 2 1 | // D2 lo_hi = (93 % 10) * (75 / 10) = 21 00575 * + 6 3 | | // D2 hi_hi = (93 / 10) * (75 / 10) = 63 00576 * --------- 00577 * 2 7 | // D2 cross = (15 / 10) + (45 % 10) + 21 = 27 00578 * + 6 7 | | // D2 upper = (27 / 10) + (45 / 10) + 63 = 67 00579 * --------- 00580 * 6 9 7 5 // D4 res = (27 * 10) + (15 % 10) + (67 * 100) = 6975 00581 * 00582 * The reasons for adding the products like this are: 00583 * 1. It avoids manual carry tracking. Just like how 00584 * (9 * 9) + 9 + 9 = 99, the same applies with this for UINT64_MAX. 00585 * This avoids a lot of complexity. 00586 * 00587 * 2. It hints for, and on Clang, compiles to, the powerful UMAAL 00588 * instruction available in ARM's Digital Signal Processing extension 00589 * in 32-bit ARMv6 and later, which is shown below: 00590 * 00591 * void UMAAL(xxh_u32 *RdLo, xxh_u32 *RdHi, xxh_u32 Rn, xxh_u32 Rm) 00592 * { 00593 * xxh_u64 product = (xxh_u64)*RdLo * (xxh_u64)*RdHi + Rn + Rm; 00594 * *RdLo = (xxh_u32)(product & 0xFFFFFFFF); 00595 * *RdHi = (xxh_u32)(product >> 32); 00596 * } 00597 * 00598 * This instruction was designed for efficient long multiplication, and 00599 * allows this to be calculated in only 4 instructions at speeds 00600 * comparable to some 64-bit ALUs. 00601 * 00602 * 3. It isn't terrible on other platforms. Usually this will be a couple 00603 * of 32-bit ADD/ADCs. 00604 */ 00605 00606 /* First calculate all of the cross products. */ 00607 xxh_u64 const lo_lo = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs & 0xFFFFFFFF); 00608 xxh_u64 const hi_lo = XXH_mult32to64(lhs >> 32, rhs & 0xFFFFFFFF); 00609 xxh_u64 const lo_hi = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs >> 32); 00610 xxh_u64 const hi_hi = XXH_mult32to64(lhs >> 32, rhs >> 32); 00611 00612 /* Now add the products together. These will never overflow. */ 00613 xxh_u64 const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi; 00614 xxh_u64 const upper = (hi_lo >> 32) + (cross >> 32) + hi_hi; 00615 xxh_u64 const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF); 00616 00617 XXH128_hash_t r128; 00618 r128.low64 = lower; 00619 r128.high64 = upper; 00620 return r128; 00621 #endif 00622 } 00623 00624 /* 00625 * Does a 64-bit to 128-bit multiply, then XOR folds it. 00626 * 00627 * The reason for the separate function is to prevent passing too many structs 00628 * around by value. This will hopefully inline the multiply, but we don't force it. 00629 */ 00630 static xxh_u64 00631 XXH3_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs) 00632 { 00633 XXH128_hash_t product = XXH_mult64to128(lhs, rhs); 00634 return product.low64 ^ product.high64; 00635 } 00636 00637 /* Seems to produce slightly better code on GCC for some reason. 
*/ 00638 XXH_FORCE_INLINE xxh_u64 XXH_xorshift64(xxh_u64 v64, int shift) 00639 { 00640 XXH_ASSERT(0 <= shift && shift < 64); 00641 return v64 ^ (v64 >> shift); 00642 } 00643 00644 /* 00645 * We don't need to (or want to) mix as much as XXH64. 00646 * 00647 * Short hashes are more evenly distributed, so it isn't necessary. 00648 */ 00649 static XXH64_hash_t XXH3_avalanche(xxh_u64 h64) 00650 { 00651 h64 = XXH_xorshift64(h64, 37); 00652 h64 *= 0x165667919E3779F9ULL; 00653 h64 = XXH_xorshift64(h64, 32); 00654 return h64; 00655 } 00656 00657 00658 /* ========================================== 00659 * Short keys 00660 * ========================================== 00661 * One of the shortcomings of XXH32 and XXH64 was that their performance was 00662 * sub-optimal on short lengths. It used an iterative algorithm which strongly 00663 * favored lengths that were a multiple of 4 or 8. 00664 * 00665 * Instead of iterating over individual inputs, we use a set of single shot 00666 * functions which piece together a range of lengths and operate in constant time. 00667 * 00668 * Additionally, the number of multiplies has been significantly reduced. This 00669 * reduces latency, especially when emulating 64-bit multiplies on 32-bit. 00670 * 00671 * Depending on the platform, this may or may not be faster than XXH32, but it 00672 * is almost guaranteed to be faster than XXH64. 00673 */ 00674 00675 /* 00676 * At very short lengths, there isn't enough input to fully hide secrets, or use 00677 * the entire secret. 00678 * 00679 * There is also only a limited amount of mixing we can do before significantly 00680 * impacting performance. 00681 * 00682 * Therefore, we use different sections of the secret and always mix two secret 00683 * samples with an XOR. This should have no effect on performance on the 00684 * seedless or withSeed variants because everything _should_ be constant folded 00685 * by modern compilers. 00686 * 00687 * The XOR mixing hides individual parts of the secret and increases entropy. 00688 * 00689 * This adds an extra layer of strength for custom secrets. 
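 *
 * A sketch of the pattern, as used by XXH3_len_1to3_64b below (the other
 * short-input variants reuse the same idea with different secret offsets,
 * seed handling, and finalizers; the names here are illustrative only):
 *
 *   bitflip = (readLE(secret) ^ readLE(secret + 4)) + seed;  // two secret samples
 *   keyed   = packed_input ^ bitflip;                        // hide the input
 *   return XXH3_avalanche(keyed * XXH_PRIME64_1);            // mix and finalize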
00690 */ 00691 XXH_FORCE_INLINE XXH64_hash_t 00692 XXH3_len_1to3_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) 00693 { 00694 XXH_ASSERT(input != NULL); 00695 XXH_ASSERT(1 <= len && len <= 3); 00696 XXH_ASSERT(secret != NULL); 00697 /* 00698 * len = 1: combined = { input[0], 0x01, input[0], input[0] } 00699 * len = 2: combined = { input[1], 0x02, input[0], input[1] } 00700 * len = 3: combined = { input[2], 0x03, input[0], input[1] } 00701 */ 00702 { xxh_u8 const c1 = input[0]; 00703 xxh_u8 const c2 = input[len >> 1]; 00704 xxh_u8 const c3 = input[len - 1]; 00705 xxh_u32 const combined = ((xxh_u32)c1 << 16) | ((xxh_u32)c2 << 24) 00706 | ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8); 00707 xxh_u64 const bitflip = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed; 00708 xxh_u64 const keyed = (xxh_u64)combined ^ bitflip; 00709 xxh_u64 const mixed = keyed * XXH_PRIME64_1; 00710 return XXH3_avalanche(mixed); 00711 } 00712 } 00713 00714 XXH_FORCE_INLINE XXH64_hash_t 00715 XXH3_len_4to8_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) 00716 { 00717 XXH_ASSERT(input != NULL); 00718 XXH_ASSERT(secret != NULL); 00719 XXH_ASSERT(4 <= len && len < 8); 00720 seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32; 00721 { xxh_u32 const input1 = XXH_readLE32(input); 00722 xxh_u32 const input2 = XXH_readLE32(input + len - 4); 00723 xxh_u64 const bitflip = (XXH_readLE64(secret+8) ^ XXH_readLE64(secret+16)) - seed; 00724 xxh_u64 const input64 = input2 + (((xxh_u64)input1) << 32); 00725 xxh_u64 x = input64 ^ bitflip; 00726 /* this mix is inspired by Pelle Evensen's rrmxmx */ 00727 x ^= XXH_rotl64(x, 49) ^ XXH_rotl64(x, 24); 00728 x *= 0x9FB21C651E98DF25ULL; 00729 x ^= (x >> 35) + len ; 00730 x *= 0x9FB21C651E98DF25ULL; 00731 return XXH_xorshift64(x, 28); 00732 } 00733 } 00734 00735 XXH_FORCE_INLINE XXH64_hash_t 00736 XXH3_len_9to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) 00737 { 00738 XXH_ASSERT(input != NULL); 00739 XXH_ASSERT(secret != NULL); 00740 XXH_ASSERT(8 <= len && len <= 16); 00741 { xxh_u64 const bitflip1 = (XXH_readLE64(secret+24) ^ XXH_readLE64(secret+32)) + seed; 00742 xxh_u64 const bitflip2 = (XXH_readLE64(secret+40) ^ XXH_readLE64(secret+48)) - seed; 00743 xxh_u64 const input_lo = XXH_readLE64(input) ^ bitflip1; 00744 xxh_u64 const input_hi = XXH_readLE64(input + len - 8) ^ bitflip2; 00745 xxh_u64 const acc = len 00746 + XXH_swap64(input_lo) + input_hi 00747 + XXH3_mul128_fold64(input_lo, input_hi); 00748 return XXH3_avalanche(acc); 00749 } 00750 } 00751 00752 XXH_FORCE_INLINE XXH64_hash_t 00753 XXH3_len_0to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) 00754 { 00755 XXH_ASSERT(len <= 16); 00756 { if (XXH_likely(len > 8)) return XXH3_len_9to16_64b(input, len, secret, seed); 00757 if (XXH_likely(len >= 4)) return XXH3_len_4to8_64b(input, len, secret, seed); 00758 if (len) return XXH3_len_1to3_64b(input, len, secret, seed); 00759 return XXH3_avalanche((XXH_PRIME64_1 + seed) ^ (XXH_readLE64(secret+56) ^ XXH_readLE64(secret+64))); 00760 } 00761 } 00762 00763 /* 00764 * DISCLAIMER: There are known *seed-dependent* multicollisions here due to 00765 * multiplication by zero, affecting hashes of lengths 17 to 240. 00766 * 00767 * However, they are very unlikely. 
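 *
 * Concretely (a sketch, using XXH3_mix16B below): the folded 128-bit product
 * collapses to zero whenever one of its operands does, e.g. when
 *
 *   input_lo ^ (XXH_readLE64(secret) + seed64) == 0
 *
 * at which point the companion 8 bytes of that XXH3_mix16B call no longer
 * affect its contribution. This is the "8 consecutive bytes cancelling out"
 * case discussed below.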
00768 * 00769 * Keep this in mind when using the unseeded XXH3_64bits() variant: As with all 00770 * unseeded non-cryptographic hashes, it does not attempt to defend itself 00771 * against specially crafted inputs, only random inputs. 00772 * 00773 * Compared to classic UMAC where a 1 in 2^31 chance of 4 consecutive bytes 00774 * cancelling out the secret is taken an arbitrary number of times (addressed 00775 * in XXH3_accumulate_512), this collision is very unlikely with random inputs 00776 * and/or proper seeding: 00777 * 00778 * This only has a 1 in 2^63 chance of 8 consecutive bytes cancelling out, in a 00779 * function that is only called up to 16 times per hash with up to 240 bytes of 00780 * input. 00781 * 00782 * This is not too bad for a non-cryptographic hash function, especially with 00783 * only 64 bit outputs. 00784 * 00785 * The 128-bit variant (which trades some speed for strength) is NOT affected 00786 * by this, although it is always a good idea to use a proper seed if you care 00787 * about strength. 00788 */ 00789 XXH_FORCE_INLINE xxh_u64 XXH3_mix16B(const xxh_u8* XXH_RESTRICT input, 00790 const xxh_u8* XXH_RESTRICT secret, xxh_u64 seed64) 00791 { 00792 #if defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ 00793 && defined(__i386__) && defined(__SSE2__) /* x86 + SSE2 */ \ 00794 && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable like XXH32 hack */ 00795 /* 00796 * UGLY HACK: 00797 * GCC for x86 tends to autovectorize the 128-bit multiply, resulting in 00798 * slower code. 00799 * 00800 * By forcing seed64 into a register, we disrupt the cost model and 00801 * cause it to scalarize. See `XXH32_round()` 00802 * 00803 * FIXME: Clang's output is still _much_ faster -- On an AMD Ryzen 3600, 00804 * XXH3_64bits @ len=240 runs at 4.6 GB/s with Clang 9, but 3.3 GB/s on 00805 * GCC 9.2, despite both emitting scalar code. 00806 * 00807 * GCC generates much better scalar code than Clang for the rest of XXH3, 00808 * which is why finding a more optimal codepath is an interest. 00809 */ 00810 __asm__ ("" : "+r" (seed64)); 00811 #endif 00812 { xxh_u64 const input_lo = XXH_readLE64(input); 00813 xxh_u64 const input_hi = XXH_readLE64(input+8); 00814 return XXH3_mul128_fold64( 00815 input_lo ^ (XXH_readLE64(secret) + seed64), 00816 input_hi ^ (XXH_readLE64(secret+8) - seed64) 00817 ); 00818 } 00819 } 00820 00821 /* For mid range keys, XXH3 uses a Mum-hash variant. 
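 *
 * A sketch of how XXH3_len_17to128_64b below covers its input: 16-byte chunks
 * are taken from both ends and each is mixed with a fresh 16 bytes of secret.
 * For 96 < len <= 128 the full set of mixes is (shorter inputs drop the inner
 * pairs):
 *
 *   acc  = len * XXH_PRIME64_1;
 *   acc += XXH3_mix16B(input,            secret,       seed);  // head
 *   acc += XXH3_mix16B(input + 16,       secret +  32, seed);
 *   acc += XXH3_mix16B(input + 32,       secret +  64, seed);
 *   acc += XXH3_mix16B(input + 48,       secret +  96, seed);
 *   acc += XXH3_mix16B(input + len - 64, secret + 112, seed);
 *   acc += XXH3_mix16B(input + len - 48, secret +  80, seed);
 *   acc += XXH3_mix16B(input + len - 32, secret +  48, seed);
 *   acc += XXH3_mix16B(input + len - 16, secret +  16, seed);  // tail
 *   return XXH3_avalanche(acc);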
*/ 00822 XXH_FORCE_INLINE XXH64_hash_t 00823 XXH3_len_17to128_64b(const xxh_u8* XXH_RESTRICT input, size_t len, 00824 const xxh_u8* XXH_RESTRICT secret, size_t secretSize, 00825 XXH64_hash_t seed) 00826 { 00827 XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; 00828 XXH_ASSERT(16 < len && len <= 128); 00829 00830 { xxh_u64 acc = len * XXH_PRIME64_1; 00831 if (len > 32) { 00832 if (len > 64) { 00833 if (len > 96) { 00834 acc += XXH3_mix16B(input+48, secret+96, seed); 00835 acc += XXH3_mix16B(input+len-64, secret+112, seed); 00836 } 00837 acc += XXH3_mix16B(input+32, secret+64, seed); 00838 acc += XXH3_mix16B(input+len-48, secret+80, seed); 00839 } 00840 acc += XXH3_mix16B(input+16, secret+32, seed); 00841 acc += XXH3_mix16B(input+len-32, secret+48, seed); 00842 } 00843 acc += XXH3_mix16B(input+0, secret+0, seed); 00844 acc += XXH3_mix16B(input+len-16, secret+16, seed); 00845 00846 return XXH3_avalanche(acc); 00847 } 00848 } 00849 00850 #define XXH3_MIDSIZE_MAX 240 00851 00852 XXH_NO_INLINE XXH64_hash_t 00853 XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len, 00854 const xxh_u8* XXH_RESTRICT secret, size_t secretSize, 00855 XXH64_hash_t seed) 00856 { 00857 XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; 00858 XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX); 00859 00860 #define XXH3_MIDSIZE_STARTOFFSET 3 00861 #define XXH3_MIDSIZE_LASTOFFSET 17 00862 00863 { xxh_u64 acc = len * XXH_PRIME64_1; 00864 int const nbRounds = (int)len / 16; 00865 int i; 00866 for (i=0; i<8; i++) { 00867 acc += XXH3_mix16B(input+(16*i), secret+(16*i), seed); 00868 } 00869 acc = XXH3_avalanche(acc); 00870 XXH_ASSERT(nbRounds >= 8); 00871 #if defined(__clang__) /* Clang */ \ 00872 && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \ 00873 && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */ 00874 /* 00875 * UGLY HACK: 00876 * Clang for ARMv7-A tries to vectorize this loop, similar to GCC x86. 00877 * In everywhere else, it uses scalar code. 00878 * 00879 * For 64->128-bit multiplies, even if the NEON was 100% optimal, it 00880 * would still be slower than UMAAL (see XXH_mult64to128). 00881 * 00882 * Unfortunately, Clang doesn't handle the long multiplies properly and 00883 * converts them to the nonexistent "vmulq_u64" intrinsic, which is then 00884 * scalarized into an ugly mess of VMOV.32 instructions. 00885 * 00886 * This mess is difficult to avoid without turning autovectorization 00887 * off completely, but they are usually relatively minor and/or not 00888 * worth it to fix. 00889 * 00890 * This loop is the easiest to fix, as unlike XXH32, this pragma 00891 * _actually works_ because it is a loop vectorization instead of an 00892 * SLP vectorization. 
00893 */ 00894 #pragma clang loop vectorize(disable) 00895 #endif 00896 for (i=8 ; i < nbRounds; i++) { 00897 acc += XXH3_mix16B(input+(16*i), secret+(16*(i-8)) + XXH3_MIDSIZE_STARTOFFSET, seed); 00898 } 00899 /* last bytes */ 00900 acc += XXH3_mix16B(input + len - 16, secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET, seed); 00901 return XXH3_avalanche(acc); 00902 } 00903 } 00904 00905 00906 /* === Long Keys === */ 00907 00908 #define XXH_STRIPE_LEN 64 00909 #define XXH_SECRET_CONSUME_RATE 8 /* nb of secret bytes consumed at each accumulation */ 00910 #define XXH_ACC_NB (XXH_STRIPE_LEN / sizeof(xxh_u64)) 00911 00912 #ifdef XXH_OLD_NAMES 00913 # define STRIPE_LEN XXH_STRIPE_LEN 00914 # define ACC_NB XXH_ACC_NB 00915 #endif 00916 00917 typedef enum { XXH3_acc_64bits, XXH3_acc_128bits } XXH3_accWidth_e; 00918 00919 /* 00920 * XXH3_accumulate_512 is the tightest loop for long inputs, and it is the most optimized. 00921 * 00922 * It is a hardened version of UMAC, based off of FARSH's implementation. 00923 * 00924 * This was chosen because it adapts quite well to 32-bit, 64-bit, and SIMD 00925 * implementations, and it is ridiculously fast. 00926 * 00927 * We harden it by mixing the original input to the accumulators as well as the product. 00928 * 00929 * This means that in the (relatively likely) case of a multiply by zero, the 00930 * original input is preserved. 00931 * 00932 * On 128-bit inputs, we swap 64-bit pairs when we add the input to improve 00933 * cross-pollination, as otherwise the upper and lower halves would be 00934 * essentially independent. 00935 * 00936 * This doesn't matter on 64-bit hashes since they all get merged together in 00937 * the end, so we skip the extra step. 00938 * 00939 * Both XXH3_64bits and XXH3_128bits use this subroutine. 00940 */ 00941 XXH_FORCE_INLINE void 00942 XXH3_accumulate_512( void* XXH_RESTRICT acc, 00943 const void* XXH_RESTRICT input, 00944 const void* XXH_RESTRICT secret, 00945 XXH3_accWidth_e accWidth) 00946 { 00947 #if (XXH_VECTOR == XXH_AVX512) 00948 00949 XXH_ASSERT((((size_t)acc) & 63) == 0); 00950 XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i)); 00951 { XXH_ALIGN(64) __m512i* const xacc = (__m512i *) acc; 00952 00953 /* data_vec = input[0]; */ 00954 __m512i const data_vec = _mm512_loadu_si512 (input); 00955 /* key_vec = secret[0]; */ 00956 __m512i const key_vec = _mm512_loadu_si512 (secret); 00957 /* data_key = data_vec ^ key_vec; */ 00958 __m512i const data_key = _mm512_xor_si512 (data_vec, key_vec); 00959 /* data_key_lo = data_key >> 32; */ 00960 __m512i const data_key_lo = _mm512_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); 00961 /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ 00962 __m512i const product = _mm512_mul_epu32 (data_key, data_key_lo); 00963 if (accWidth == XXH3_acc_128bits) { 00964 /* xacc[0] += swap(data_vec); */ 00965 __m512i const data_swap = _mm512_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2)); 00966 __m512i const sum = _mm512_add_epi64(*xacc, data_swap); 00967 /* xacc[0] += product; */ 00968 *xacc = _mm512_add_epi64(product, sum); 00969 } else { /* XXH3_acc_64bits */ 00970 /* xacc[0] += data_vec; */ 00971 __m512i const sum = _mm512_add_epi64(*xacc, data_vec); 00972 /* xacc[0] += product; */ 00973 *xacc = _mm512_add_epi64(product, sum); 00974 } 00975 } 00976 00977 #elif (XXH_VECTOR == XXH_AVX2) 00978 00979 XXH_ASSERT((((size_t)acc) & 31) == 0); 00980 { XXH_ALIGN(32) __m256i* const xacc = (__m256i *) acc; 00981 /* Unaligned. 
This is mainly for pointer arithmetic, and because 00982 * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */ 00983 const __m256i* const xinput = (const __m256i *) input; 00984 /* Unaligned. This is mainly for pointer arithmetic, and because 00985 * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */ 00986 const __m256i* const xsecret = (const __m256i *) secret; 00987 00988 size_t i; 00989 for (i=0; i < XXH_STRIPE_LEN/sizeof(__m256i); i++) { 00990 /* data_vec = xinput[i]; */ 00991 __m256i const data_vec = _mm256_loadu_si256 (xinput+i); 00992 /* key_vec = xsecret[i]; */ 00993 __m256i const key_vec = _mm256_loadu_si256 (xsecret+i); 00994 /* data_key = data_vec ^ key_vec; */ 00995 __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec); 00996 /* data_key_lo = data_key >> 32; */ 00997 __m256i const data_key_lo = _mm256_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); 00998 /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ 00999 __m256i const product = _mm256_mul_epu32 (data_key, data_key_lo); 01000 if (accWidth == XXH3_acc_128bits) { 01001 /* xacc[i] += swap(data_vec); */ 01002 __m256i const data_swap = _mm256_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2)); 01003 __m256i const sum = _mm256_add_epi64(xacc[i], data_swap); 01004 /* xacc[i] += product; */ 01005 xacc[i] = _mm256_add_epi64(product, sum); 01006 } else { /* XXH3_acc_64bits */ 01007 /* xacc[i] += data_vec; */ 01008 __m256i const sum = _mm256_add_epi64(xacc[i], data_vec); 01009 /* xacc[i] += product; */ 01010 xacc[i] = _mm256_add_epi64(product, sum); 01011 } 01012 } } 01013 01014 #elif (XXH_VECTOR == XXH_SSE2) 01015 01016 /* SSE2 is just a half-scale version of the AVX2 version. */ 01017 XXH_ASSERT((((size_t)acc) & 15) == 0); 01018 { XXH_ALIGN(16) __m128i* const xacc = (__m128i *) acc; 01019 /* Unaligned. This is mainly for pointer arithmetic, and because 01020 * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */ 01021 const __m128i* const xinput = (const __m128i *) input; 01022 /* Unaligned. This is mainly for pointer arithmetic, and because 01023 * _mm_loadu_si128 requires a const __m128i * pointer for some reason. 
*/ 01024 const __m128i* const xsecret = (const __m128i *) secret; 01025 01026 size_t i; 01027 for (i=0; i < XXH_STRIPE_LEN/sizeof(__m128i); i++) { 01028 /* data_vec = xinput[i]; */ 01029 __m128i const data_vec = _mm_loadu_si128 (xinput+i); 01030 /* key_vec = xsecret[i]; */ 01031 __m128i const key_vec = _mm_loadu_si128 (xsecret+i); 01032 /* data_key = data_vec ^ key_vec; */ 01033 __m128i const data_key = _mm_xor_si128 (data_vec, key_vec); 01034 /* data_key_lo = data_key >> 32; */ 01035 __m128i const data_key_lo = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); 01036 /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ 01037 __m128i const product = _mm_mul_epu32 (data_key, data_key_lo); 01038 if (accWidth == XXH3_acc_128bits) { 01039 /* xacc[i] += swap(data_vec); */ 01040 __m128i const data_swap = _mm_shuffle_epi32(data_vec, _MM_SHUFFLE(1,0,3,2)); 01041 __m128i const sum = _mm_add_epi64(xacc[i], data_swap); 01042 /* xacc[i] += product; */ 01043 xacc[i] = _mm_add_epi64(product, sum); 01044 } else { /* XXH3_acc_64bits */ 01045 /* xacc[i] += data_vec; */ 01046 __m128i const sum = _mm_add_epi64(xacc[i], data_vec); 01047 /* xacc[i] += product; */ 01048 xacc[i] = _mm_add_epi64(product, sum); 01049 } 01050 } } 01051 01052 #elif (XXH_VECTOR == XXH_NEON) 01053 01054 XXH_ASSERT((((size_t)acc) & 15) == 0); 01055 { 01056 XXH_ALIGN(16) uint64x2_t* const xacc = (uint64x2_t *) acc; 01057 /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. */ 01058 uint8_t const* const xinput = (const uint8_t *) input; 01059 uint8_t const* const xsecret = (const uint8_t *) secret; 01060 01061 size_t i; 01062 for (i=0; i < XXH_STRIPE_LEN / sizeof(uint64x2_t); i++) { 01063 /* data_vec = xinput[i]; */ 01064 uint8x16_t data_vec = vld1q_u8(xinput + (i * 16)); 01065 /* key_vec = xsecret[i]; */ 01066 uint8x16_t key_vec = vld1q_u8(xsecret + (i * 16)); 01067 uint64x2_t data_key; 01068 uint32x2_t data_key_lo, data_key_hi; 01069 if (accWidth == XXH3_acc_64bits) { 01070 /* xacc[i] += data_vec; */ 01071 xacc[i] = vaddq_u64 (xacc[i], vreinterpretq_u64_u8(data_vec)); 01072 } else { /* XXH3_acc_128bits */ 01073 /* xacc[i] += swap(data_vec); */ 01074 uint64x2_t const data64 = vreinterpretq_u64_u8(data_vec); 01075 uint64x2_t const swapped = vextq_u64(data64, data64, 1); 01076 xacc[i] = vaddq_u64 (xacc[i], swapped); 01077 } 01078 /* data_key = data_vec ^ key_vec; */ 01079 data_key = vreinterpretq_u64_u8(veorq_u8(data_vec, key_vec)); 01080 /* data_key_lo = (uint32x2_t) (data_key & 0xFFFFFFFF); 01081 * data_key_hi = (uint32x2_t) (data_key >> 32); 01082 * data_key = UNDEFINED; */ 01083 XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi); 01084 /* xacc[i] += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */ 01085 xacc[i] = vmlal_u32 (xacc[i], data_key_lo, data_key_hi); 01086 01087 } 01088 } 01089 01090 #elif (XXH_VECTOR == XXH_VSX) 01091 xxh_u64x2* const xacc = (xxh_u64x2*) acc; /* presumed aligned */ 01092 xxh_u64x2 const* const xinput = (xxh_u64x2 const*) input; /* no alignment restriction */ 01093 xxh_u64x2 const* const xsecret = (xxh_u64x2 const*) secret; /* no alignment restriction */ 01094 xxh_u64x2 const v32 = { 32, 32 }; 01095 size_t i; 01096 for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) { 01097 /* data_vec = xinput[i]; */ 01098 xxh_u64x2 const data_vec = XXH_vec_loadu(xinput + i); 01099 /* key_vec = xsecret[i]; */ 01100 xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + i); 01101 xxh_u64x2 const data_key = data_vec ^ key_vec; 01102 /* shuffled = (data_key << 32) | 
(data_key >> 32); */ 01103 xxh_u32x4 const shuffled = (xxh_u32x4)vec_rl(data_key, v32); 01104 /* product = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)shuffled & 0xFFFFFFFF); */ 01105 xxh_u64x2 const product = XXH_vec_mulo((xxh_u32x4)data_key, shuffled); 01106 xacc[i] += product; 01107 01108 if (accWidth == XXH3_acc_64bits) { 01109 xacc[i] += data_vec; 01110 } else { /* XXH3_acc_128bits */ 01111 /* swap high and low halves */ 01112 #ifdef __s390x__ 01113 xxh_u64x2 const data_swapped = vec_permi(data_vec, data_vec, 2); 01114 #else 01115 xxh_u64x2 const data_swapped = vec_xxpermdi(data_vec, data_vec, 2); 01116 #endif 01117 xacc[i] += data_swapped; 01118 } 01119 } 01120 01121 #else /* scalar variant of Accumulator - universal */ 01122 01123 XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned */ 01124 const xxh_u8* const xinput = (const xxh_u8*) input; /* no alignment restriction */ 01125 const xxh_u8* const xsecret = (const xxh_u8*) secret; /* no alignment restriction */ 01126 size_t i; 01127 XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN-1)) == 0); 01128 for (i=0; i < XXH_ACC_NB; i++) { 01129 xxh_u64 const data_val = XXH_readLE64(xinput + 8*i); 01130 xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + i*8); 01131 01132 if (accWidth == XXH3_acc_64bits) { 01133 xacc[i] += data_val; 01134 } else { 01135 xacc[i ^ 1] += data_val; /* swap adjacent lanes */ 01136 } 01137 xacc[i] += XXH_mult32to64(data_key & 0xFFFFFFFF, data_key >> 32); 01138 } 01139 #endif 01140 } 01141 01142 /* 01143 * XXH3_scrambleAcc: Scrambles the accumulators to improve mixing. 01144 * 01145 * Multiplication isn't perfect, as explained by Google in HighwayHash: 01146 * 01147 * // Multiplication mixes/scrambles bytes 0-7 of the 64-bit result to 01148 * // varying degrees. In descending order of goodness, bytes 01149 * // 3 4 2 5 1 6 0 7 have quality 228 224 164 160 100 96 36 32. 01150 * // As expected, the upper and lower bytes are much worse. 01151 * 01152 * Source: https://github.com/google/highwayhash/blob/0aaf66b/highwayhash/hh_avx2.h#L291 01153 * 01154 * Since our algorithm uses a pseudorandom secret to add some variance into the 01155 * mix, we don't need to (or want to) mix as often or as much as HighwayHash does. 01156 * 01157 * This isn't as tight as XXH3_accumulate, but still written in SIMD to avoid 01158 * extraction. 01159 * 01160 * Both XXH3_64bits and XXH3_128bits use this subroutine. 
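 *
 * Per 64-bit lane, every vectorized path below computes the same thing as the
 * scalar fallback at the end of the function (a sketch, using that fallback's
 * names):
 *
 *   acc64 ^= acc64 >> 47;    // xorshift: fold the high bits downward
 *   acc64 ^= key64;          // mix in a fresh 64 bits of secret
 *   acc64 *= XXH_PRIME32_1;  // multiply by a 32-bit odd prime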
01161 */ 01162 XXH_FORCE_INLINE void 01163 XXH3_scrambleAcc(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) 01164 { 01165 #if (XXH_VECTOR == XXH_AVX512) 01166 01167 XXH_ASSERT((((size_t)acc) & 63) == 0); 01168 XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i)); 01169 { XXH_ALIGN(64) __m512i* const xacc = (__m512i*) acc; 01170 const __m512i prime32 = _mm512_set1_epi32((int)XXH_PRIME32_1); 01171 01172 /* xacc[0] ^= (xacc[0] >> 47) */ 01173 __m512i const acc_vec = *xacc; 01174 __m512i const shifted = _mm512_srli_epi64 (acc_vec, 47); 01175 __m512i const data_vec = _mm512_xor_si512 (acc_vec, shifted); 01176 /* xacc[0] ^= secret; */ 01177 __m512i const key_vec = _mm512_loadu_si512 (secret); 01178 __m512i const data_key = _mm512_xor_si512 (data_vec, key_vec); 01179 01180 /* xacc[0] *= XXH_PRIME32_1; */ 01181 __m512i const data_key_hi = _mm512_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); 01182 __m512i const prod_lo = _mm512_mul_epu32 (data_key, prime32); 01183 __m512i const prod_hi = _mm512_mul_epu32 (data_key_hi, prime32); 01184 *xacc = _mm512_add_epi64(prod_lo, _mm512_slli_epi64(prod_hi, 32)); 01185 } 01186 01187 #elif (XXH_VECTOR == XXH_AVX2) 01188 01189 XXH_ASSERT((((size_t)acc) & 31) == 0); 01190 { XXH_ALIGN(32) __m256i* const xacc = (__m256i*) acc; 01191 /* Unaligned. This is mainly for pointer arithmetic, and because 01192 * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */ 01193 const __m256i* const xsecret = (const __m256i *) secret; 01194 const __m256i prime32 = _mm256_set1_epi32((int)XXH_PRIME32_1); 01195 01196 size_t i; 01197 for (i=0; i < XXH_STRIPE_LEN/sizeof(__m256i); i++) { 01198 /* xacc[i] ^= (xacc[i] >> 47) */ 01199 __m256i const acc_vec = xacc[i]; 01200 __m256i const shifted = _mm256_srli_epi64 (acc_vec, 47); 01201 __m256i const data_vec = _mm256_xor_si256 (acc_vec, shifted); 01202 /* xacc[i] ^= xsecret; */ 01203 __m256i const key_vec = _mm256_loadu_si256 (xsecret+i); 01204 __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec); 01205 01206 /* xacc[i] *= XXH_PRIME32_1; */ 01207 __m256i const data_key_hi = _mm256_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); 01208 __m256i const prod_lo = _mm256_mul_epu32 (data_key, prime32); 01209 __m256i const prod_hi = _mm256_mul_epu32 (data_key_hi, prime32); 01210 xacc[i] = _mm256_add_epi64(prod_lo, _mm256_slli_epi64(prod_hi, 32)); 01211 } 01212 } 01213 01214 #elif (XXH_VECTOR == XXH_SSE2) 01215 01216 XXH_ASSERT((((size_t)acc) & 15) == 0); 01217 { XXH_ALIGN(16) __m128i* const xacc = (__m128i*) acc; 01218 /* Unaligned. This is mainly for pointer arithmetic, and because 01219 * _mm_loadu_si128 requires a const __m128i * pointer for some reason. 
*/ 01220 const __m128i* const xsecret = (const __m128i *) secret; 01221 const __m128i prime32 = _mm_set1_epi32((int)XXH_PRIME32_1); 01222 01223 size_t i; 01224 for (i=0; i < XXH_STRIPE_LEN/sizeof(__m128i); i++) { 01225 /* xacc[i] ^= (xacc[i] >> 47) */ 01226 __m128i const acc_vec = xacc[i]; 01227 __m128i const shifted = _mm_srli_epi64 (acc_vec, 47); 01228 __m128i const data_vec = _mm_xor_si128 (acc_vec, shifted); 01229 /* xacc[i] ^= xsecret[i]; */ 01230 __m128i const key_vec = _mm_loadu_si128 (xsecret+i); 01231 __m128i const data_key = _mm_xor_si128 (data_vec, key_vec); 01232 01233 /* xacc[i] *= XXH_PRIME32_1; */ 01234 __m128i const data_key_hi = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); 01235 __m128i const prod_lo = _mm_mul_epu32 (data_key, prime32); 01236 __m128i const prod_hi = _mm_mul_epu32 (data_key_hi, prime32); 01237 xacc[i] = _mm_add_epi64(prod_lo, _mm_slli_epi64(prod_hi, 32)); 01238 } 01239 } 01240 01241 #elif (XXH_VECTOR == XXH_NEON) 01242 01243 XXH_ASSERT((((size_t)acc) & 15) == 0); 01244 01245 { uint64x2_t* xacc = (uint64x2_t*) acc; 01246 uint8_t const* xsecret = (uint8_t const*) secret; 01247 uint32x2_t prime = vdup_n_u32 (XXH_PRIME32_1); 01248 01249 size_t i; 01250 for (i=0; i < XXH_STRIPE_LEN/sizeof(uint64x2_t); i++) { 01251 /* xacc[i] ^= (xacc[i] >> 47); */ 01252 uint64x2_t acc_vec = xacc[i]; 01253 uint64x2_t shifted = vshrq_n_u64 (acc_vec, 47); 01254 uint64x2_t data_vec = veorq_u64 (acc_vec, shifted); 01255 01256 /* xacc[i] ^= xsecret[i]; */ 01257 uint8x16_t key_vec = vld1q_u8(xsecret + (i * 16)); 01258 uint64x2_t data_key = veorq_u64(data_vec, vreinterpretq_u64_u8(key_vec)); 01259 01260 /* xacc[i] *= XXH_PRIME32_1 */ 01261 uint32x2_t data_key_lo, data_key_hi; 01262 /* data_key_lo = (uint32x2_t) (xacc[i] & 0xFFFFFFFF); 01263 * data_key_hi = (uint32x2_t) (xacc[i] >> 32); 01264 * xacc[i] = UNDEFINED; */ 01265 XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi); 01266 { /* 01267 * prod_hi = (data_key >> 32) * XXH_PRIME32_1; 01268 * 01269 * Avoid vmul_u32 + vshll_n_u32 since Clang 6 and 7 will 01270 * incorrectly "optimize" this: 01271 * tmp = vmul_u32(vmovn_u64(a), vmovn_u64(b)); 01272 * shifted = vshll_n_u32(tmp, 32); 01273 * to this: 01274 * tmp = "vmulq_u64"(a, b); // no such thing! 01275 * shifted = vshlq_n_u64(tmp, 32); 01276 * 01277 * However, unlike SSE, Clang lacks a 64-bit multiply routine 01278 * for NEON, and it scalarizes two 64-bit multiplies instead. 01279 * 01280 * vmull_u32 has the same timing as vmul_u32, and it avoids 01281 * this bug completely. 
01282 * See https://bugs.llvm.org/show_bug.cgi?id=39967 01283 */ 01284 uint64x2_t prod_hi = vmull_u32 (data_key_hi, prime); 01285 /* xacc[i] = prod_hi << 32; */ 01286 xacc[i] = vshlq_n_u64(prod_hi, 32); 01287 /* xacc[i] += (prod_hi & 0xFFFFFFFF) * XXH_PRIME32_1; */ 01288 xacc[i] = vmlal_u32(xacc[i], data_key_lo, prime); 01289 } 01290 } } 01291 01292 #elif (XXH_VECTOR == XXH_VSX) 01293 01294 XXH_ASSERT((((size_t)acc) & 15) == 0); 01295 01296 { xxh_u64x2* const xacc = (xxh_u64x2*) acc; 01297 const xxh_u64x2* const xsecret = (const xxh_u64x2*) secret; 01298 /* constants */ 01299 xxh_u64x2 const v32 = { 32, 32 }; 01300 xxh_u64x2 const v47 = { 47, 47 }; 01301 xxh_u32x4 const prime = { XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1 }; 01302 size_t i; 01303 for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) { 01304 /* xacc[i] ^= (xacc[i] >> 47); */ 01305 xxh_u64x2 const acc_vec = xacc[i]; 01306 xxh_u64x2 const data_vec = acc_vec ^ (acc_vec >> v47); 01307 01308 /* xacc[i] ^= xsecret[i]; */ 01309 xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + i); 01310 xxh_u64x2 const data_key = data_vec ^ key_vec; 01311 01312 /* xacc[i] *= XXH_PRIME32_1 */ 01313 /* prod_lo = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)prime & 0xFFFFFFFF); */ 01314 xxh_u64x2 const prod_even = XXH_vec_mule((xxh_u32x4)data_key, prime); 01315 /* prod_hi = ((xxh_u64x2)data_key >> 32) * ((xxh_u64x2)prime >> 32); */ 01316 xxh_u64x2 const prod_odd = XXH_vec_mulo((xxh_u32x4)data_key, prime); 01317 xacc[i] = prod_odd + (prod_even << v32); 01318 } } 01319 01320 #else /* scalar variant of Scrambler - universal */ 01321 01322 XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned */ 01323 const xxh_u8* const xsecret = (const xxh_u8*) secret; /* no alignment restriction */ 01324 size_t i; 01325 XXH_ASSERT((((size_t)acc) & (XXH_ACC_ALIGN-1)) == 0); 01326 for (i=0; i < XXH_ACC_NB; i++) { 01327 xxh_u64 const key64 = XXH_readLE64(xsecret + 8*i); 01328 xxh_u64 acc64 = xacc[i]; 01329 acc64 = XXH_xorshift64(acc64, 47); 01330 acc64 ^= key64; 01331 acc64 *= XXH_PRIME32_1; 01332 xacc[i] = acc64; 01333 } 01334 01335 #endif 01336 } 01337 01338 #define XXH_PREFETCH_DIST 384 01339 01340 #ifdef __clang__ // for clang 01341 # define XXH_PREFETCH_DIST_AVX512_64 320 01342 # define XXH_PREFETCH_DIST_AVX512_128 320 01343 #else // for gcc 01344 # define XXH_PREFETCH_DIST_AVX512_64 640 01345 # define XXH_PREFETCH_DIST_AVX512_128 512 01346 #endif 01347 01348 /* 01349 * XXH3_accumulate() 01350 * Loops over XXH3_accumulate_512(). 
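 * Each stripe reads XXH_STRIPE_LEN (64) bytes of input but advances the secret
 * by only XXH_SECRET_CONSUME_RATE (8) bytes, so stripe n touches secret bytes
 * [8*n, 8*n + 64).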
01351 * Assumption: nbStripes will not overflow the secret size 01352 */ 01353 XXH_FORCE_INLINE void 01354 XXH3_accumulate( xxh_u64* XXH_RESTRICT acc, 01355 const xxh_u8* XXH_RESTRICT input, 01356 const xxh_u8* XXH_RESTRICT secret, 01357 size_t nbStripes, 01358 XXH3_accWidth_e accWidth) 01359 { 01360 size_t n; 01361 for (n = 0; n < nbStripes; n++ ) { 01362 const xxh_u8* const in = input + n*XXH_STRIPE_LEN; 01363 #if (XXH_VECTOR == XXH_AVX512) 01364 if (accWidth == XXH3_acc_64bits) XXH_PREFETCH(in + XXH_PREFETCH_DIST_AVX512_64); 01365 else XXH_PREFETCH(in + XXH_PREFETCH_DIST_AVX512_128); 01366 #else 01367 XXH_PREFETCH(in + XXH_PREFETCH_DIST); 01368 #endif 01369 XXH3_accumulate_512(acc, 01370 in, 01371 secret + n*XXH_SECRET_CONSUME_RATE, 01372 accWidth); 01373 } 01374 } 01375 01376 XXH_FORCE_INLINE void 01377 XXH3_hashLong_internal_loop( xxh_u64* XXH_RESTRICT acc, 01378 const xxh_u8* XXH_RESTRICT input, size_t len, 01379 const xxh_u8* XXH_RESTRICT secret, size_t secretSize, 01380 XXH3_accWidth_e accWidth) 01381 { 01382 size_t const nb_rounds = (secretSize - XXH_STRIPE_LEN) / XXH_SECRET_CONSUME_RATE; 01383 size_t const block_len = XXH_STRIPE_LEN * nb_rounds; 01384 size_t const nb_blocks = len / block_len; 01385 01386 size_t n; 01387 01388 XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); 01389 01390 for (n = 0; n < nb_blocks; n++) { 01391 XXH3_accumulate(acc, input + n*block_len, secret, nb_rounds, accWidth); 01392 XXH3_scrambleAcc(acc, secret + secretSize - XXH_STRIPE_LEN); 01393 } 01394 01395 /* last partial block */ 01396 XXH_ASSERT(len > XXH_STRIPE_LEN); 01397 { size_t const nbStripes = (len - (block_len * nb_blocks)) / XXH_STRIPE_LEN; 01398 XXH_ASSERT(nbStripes <= (secretSize / XXH_SECRET_CONSUME_RATE)); 01399 XXH3_accumulate(acc, input + nb_blocks*block_len, secret, nbStripes, accWidth); 01400 01401 /* last stripe */ 01402 if (len & (XXH_STRIPE_LEN - 1)) { 01403 const xxh_u8* const p = input + len - XXH_STRIPE_LEN; 01404 /* Do not align on 8, so that the secret is different from the scrambler */ 01405 #define XXH_SECRET_LASTACC_START 7 01406 XXH3_accumulate_512(acc, p, secret + secretSize - XXH_STRIPE_LEN - XXH_SECRET_LASTACC_START, accWidth); 01407 } } 01408 } 01409 01410 XXH_FORCE_INLINE xxh_u64 01411 XXH3_mix2Accs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret) 01412 { 01413 return XXH3_mul128_fold64( 01414 acc[0] ^ XXH_readLE64(secret), 01415 acc[1] ^ XXH_readLE64(secret+8) ); 01416 } 01417 01418 static XXH64_hash_t 01419 XXH3_mergeAccs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret, xxh_u64 start) 01420 { 01421 xxh_u64 result64 = start; 01422 size_t i = 0; 01423 01424 for (i = 0; i < 4; i++) { 01425 result64 += XXH3_mix2Accs(acc+2*i, secret + 16*i); 01426 #if defined(__clang__) /* Clang */ \ 01427 && (defined(__arm__) || defined(__thumb__)) /* ARMv7 */ \ 01428 && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \ 01429 && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */ 01430 /* 01431 * UGLY HACK: 01432 * Prevent autovectorization on Clang ARMv7-a. Exact same problem as 01433 * the one in XXH3_len_129to240_64b. Speeds up shorter keys > 240b. 
01434 * XXH3_64bits, len == 256, Snapdragon 835: 01435 * without hack: 2063.7 MB/s 01436 * with hack: 2560.7 MB/s 01437 */ 01438 __asm__("" : "+r" (result64)); 01439 #endif 01440 } 01441 01442 return XXH3_avalanche(result64); 01443 } 01444 01445 #define XXH3_INIT_ACC { XXH_PRIME32_3, XXH_PRIME64_1, XXH_PRIME64_2, XXH_PRIME64_3, \ 01446 XXH_PRIME64_4, XXH_PRIME32_2, XXH_PRIME64_5, XXH_PRIME32_1 } 01447 01448 XXH_FORCE_INLINE XXH64_hash_t 01449 XXH3_hashLong_64b_internal(const xxh_u8* XXH_RESTRICT input, size_t len, 01450 const xxh_u8* XXH_RESTRICT secret, size_t secretSize) 01451 { 01452 XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC; 01453 01454 XXH3_hashLong_internal_loop(acc, input, len, secret, secretSize, XXH3_acc_64bits); 01455 01456 /* converge into final hash */ 01457 XXH_STATIC_ASSERT(sizeof(acc) == 64); 01458 /* do not align on 8, so that the secret is different from the accumulator */ 01459 #define XXH_SECRET_MERGEACCS_START 11 01460 XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START); 01461 return XXH3_mergeAccs(acc, secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)len * XXH_PRIME64_1); 01462 } 01463 01464 XXH_FORCE_INLINE void XXH_writeLE64(void* dst, xxh_u64 v64) 01465 { 01466 if (!XXH_CPU_LITTLE_ENDIAN) v64 = XXH_swap64(v64); 01467 memcpy(dst, &v64, sizeof(v64)); 01468 } 01469 01470 /* XXH3_initCustomSecret() : 01471 * destination `customSecret` is presumed allocated and same size as `XXH3_kSecret`. 01472 */ 01473 XXH_FORCE_INLINE void XXH3_initCustomSecret(xxh_u8* XXH_RESTRICT customSecret, xxh_u64 seed64) 01474 { 01475 int const nbRounds = XXH_SECRET_DEFAULT_SIZE / 16; 01476 int i; 01477 /* 01478 * We need a separate pointer for the hack below. 01479 * Any decent compiler will optimize this out otherwise. 01480 */ 01481 const xxh_u8 *kSecretPtr = XXH3_kSecret; 01482 01483 XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0); 01484 01485 #if defined(__clang__) && defined(__aarch64__) 01486 /* 01487 * UGLY HACK: 01488 * Clang generates a bunch of MOV/MOVK pairs for aarch64, and they are 01489 * placed sequentially, in order, at the top of the unrolled loop. 01490 * 01491 * While MOVK is great for generating constants (2 cycles for a 64-bit 01492 * constant compared to 4 cycles for LDR), long MOVK chains stall the 01493 * integer pipelines: 01494 * I L S 01495 * MOVK 01496 * MOVK 01497 * MOVK 01498 * MOVK 01499 * ADD 01500 * SUB STR 01501 * STR 01502 * By forcing loads from memory (as the asm line causes Clang to assume 01503 * that XXH3_kSecretPtr has been changed), the pipelines are used more 01504 * efficiently: 01505 * I L S 01506 * LDR 01507 * ADD LDR 01508 * SUB STR 01509 * STR 01510 * XXH3_64bits_withSeed, len == 256, Snapdragon 835 01511 * without hack: 2654.4 MB/s 01512 * with hack: 3202.9 MB/s 01513 */ 01514 __asm__("" : "+r" (kSecretPtr)); 01515 #endif 01516 /* 01517 * Note: in debug mode, this overrides the asm optimization 01518 * and Clang will emit MOVK chains again. 01519 */ 01520 XXH_ASSERT(kSecretPtr == XXH3_kSecret); 01521 01522 for (i=0; i < nbRounds; i++) { 01523 /* 01524 * The asm hack causes Clang to assume that XXH3_kSecretPtr aliases with 01525 * customSecret, and on aarch64, this prevented LDP from merging two 01526 * loads together for free. Putting the loads together before the stores 01527 * properly generates LDP. 
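 *
 * (Added note: concretely, the loop body below issues both loads before
 *  either store,
 *      lo = load; hi = load; store(lo); store(hi);
 *  rather than load/store/load/store, which is what allows the two 64-bit
 *  loads to merge into a single LDP instruction.)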
01528 */ 01529 xxh_u64 lo = XXH_readLE64(kSecretPtr + 16*i) + seed64; 01530 xxh_u64 hi = XXH_readLE64(kSecretPtr + 16*i + 8) - seed64; 01531 XXH_writeLE64(customSecret + 16*i, lo); 01532 XXH_writeLE64(customSecret + 16*i + 8, hi); 01533 } 01534 } 01535 01536 01537 /* 01538 * It's important for performance that XXH3_hashLong is not inlined. Not sure 01539 * why (uop cache maybe?), but the difference is large and easily measurable. 01540 */ 01541 XXH_NO_INLINE XXH64_hash_t 01542 XXH3_hashLong_64b_defaultSecret(const xxh_u8* XXH_RESTRICT input, size_t len) 01543 { 01544 return XXH3_hashLong_64b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret)); 01545 } 01546 01547 /* 01548 * It's important for performance that XXH3_hashLong is not inlined. Not sure 01549 * why (uop cache maybe?), but the difference is large and easily measurable. 01550 */ 01551 XXH_NO_INLINE XXH64_hash_t 01552 XXH3_hashLong_64b_withSecret(const xxh_u8* XXH_RESTRICT input, size_t len, 01553 const xxh_u8* XXH_RESTRICT secret, size_t secretSize) 01554 { 01555 return XXH3_hashLong_64b_internal(input, len, secret, secretSize); 01556 } 01557 01558 /* 01559 * XXH3_hashLong_64b_withSeed(): 01560 * Generate a custom key based on alteration of default XXH3_kSecret with the seed, 01561 * and then use this key for long mode hashing. 01562 * 01563 * This operation is decently fast but nonetheless costs a little bit of time. 01564 * Try to avoid it whenever possible (typically when seed==0). 01565 * 01566 * It's important for performance that XXH3_hashLong is not inlined. Not sure 01567 * why (uop cache maybe?), but the difference is large and easily measurable. 01568 */ 01569 XXH_NO_INLINE XXH64_hash_t 01570 XXH3_hashLong_64b_withSeed(const xxh_u8* input, size_t len, XXH64_hash_t seed) 01571 { 01572 XXH_ALIGN(8) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE]; 01573 if (seed==0) return XXH3_hashLong_64b_defaultSecret(input, len); 01574 XXH3_initCustomSecret(secret, seed); 01575 return XXH3_hashLong_64b_internal(input, len, secret, sizeof(secret)); 01576 } 01577 01578 /* === Public entry point === */ 01579 01580 XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* input, size_t len) 01581 { 01582 if (len <= 16) 01583 return XXH3_len_0to16_64b((const xxh_u8*)input, len, XXH3_kSecret, 0); 01584 if (len <= 128) 01585 return XXH3_len_17to128_64b((const xxh_u8*)input, len, XXH3_kSecret, sizeof(XXH3_kSecret), 0); 01586 if (len <= XXH3_MIDSIZE_MAX) 01587 return XXH3_len_129to240_64b((const xxh_u8*)input, len, XXH3_kSecret, sizeof(XXH3_kSecret), 0); 01588 return XXH3_hashLong_64b_defaultSecret((const xxh_u8*)input, len); 01589 } 01590 01591 XXH_PUBLIC_API XXH64_hash_t 01592 XXH3_64bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize) 01593 { 01594 XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); 01595 /* 01596 * If an action is to be taken if `secret` conditions are not respected, 01597 * it should be done here. 01598 * For now, it's a contract pre-condition. 01599 * Adding a check and a branch here would cost performance at every hash. 
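 *
 * (Added example: a caller that wants the check can perform it on its own
 *  side. This is an illustrative sketch, not part of the library, and the
 *  fallback policy is the caller's choice:
 *
 *      XXH64_hash_t my_hash_checked(const void* in, size_t len,
 *                                   const void* secret, size_t secretSize)
 *      {
 *          if (secret == NULL || secretSize < XXH3_SECRET_SIZE_MIN)
 *              return XXH3_64bits(in, len);   // or signal an error instead
 *          return XXH3_64bits_withSecret(in, len, secret, secretSize);
 *      }
 *  )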
01600 */ 01601 if (len <= 16) 01602 return XXH3_len_0to16_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, 0); 01603 if (len <= 128) 01604 return XXH3_len_17to128_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0); 01605 if (len <= XXH3_MIDSIZE_MAX) 01606 return XXH3_len_129to240_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0); 01607 return XXH3_hashLong_64b_withSecret((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize); 01608 } 01609 01610 XXH_PUBLIC_API XXH64_hash_t 01611 XXH3_64bits_withSeed(const void* input, size_t len, XXH64_hash_t seed) 01612 { 01613 if (len <= 16) 01614 return XXH3_len_0to16_64b((const xxh_u8*)input, len, XXH3_kSecret, seed); 01615 if (len <= 128) 01616 return XXH3_len_17to128_64b((const xxh_u8*)input, len, XXH3_kSecret, sizeof(XXH3_kSecret), seed); 01617 if (len <= XXH3_MIDSIZE_MAX) 01618 return XXH3_len_129to240_64b((const xxh_u8*)input, len, XXH3_kSecret, sizeof(XXH3_kSecret), seed); 01619 return XXH3_hashLong_64b_withSeed((const xxh_u8*)input, len, seed); 01620 } 01621 01622 /* === XXH3 streaming === */ 01623 01624 01625 /* 01626 * Malloc's a pointer that is always aligned to align. 01627 * 01628 * This must be freed with `XXH_alignedFree()`. 01629 * 01630 * malloc typically guarantees 16 byte alignment on 64-bit systems and 8 byte 01631 * alignment on 32-bit. This isn't enough for the 32 byte aligned loads in AVX2 01632 * or on 32-bit, the 16 byte aligned loads in SSE2 and NEON. 01633 * 01634 * This underalignment previously caused a rather obvious crash which went 01635 * completely unnoticed due to XXH3_createState() not actually being tested. 01636 * Credit to RedSpah for noticing this bug. 01637 * 01638 * The alignment is done manually: Functions like posix_memalign or _mm_malloc 01639 * are avoided: To maintain portability, we would have to write a fallback 01640 * like this anyways, and besides, testing for the existence of library 01641 * functions without relying on external build tools is impossible. 01642 * 01643 * The method is simple: Overallocate, manually align, and store the offset 01644 * to the original behind the returned pointer. 01645 * 01646 * Align must be a power of 2 and 8 <= align <= 128. 01647 */ 01648 static void* XXH_alignedMalloc(size_t s, size_t align) 01649 { 01650 XXH_ASSERT(align <= 128 && align >= 8); /* range check */ 01651 XXH_ASSERT((align & (align-1)) == 0); /* power of 2 */ 01652 XXH_ASSERT(s != 0 && s < (s + align)); /* empty/overflow */ 01653 { /* Overallocate to make room for manual realignment and an offset byte */ 01654 xxh_u8* base = (xxh_u8*)XXH_malloc(s + align); 01655 if (base != NULL) { 01656 /* 01657 * Get the offset needed to align this pointer. 01658 * 01659 * Even if the returned pointer is aligned, there will always be 01660 * at least one byte to store the offset to the original pointer. 01661 */ 01662 size_t offset = align - ((size_t)base & (align - 1)); /* base % align */ 01663 /* Add the offset for the now-aligned pointer */ 01664 xxh_u8* ptr = base + offset; 01665 01666 XXH_ASSERT((size_t)ptr % align == 0); 01667 01668 /* Store the offset immediately before the returned pointer. */ 01669 ptr[-1] = (xxh_u8)offset; 01670 return ptr; 01671 } 01672 return NULL; 01673 } 01674 } 01675 /* 01676 * Frees an aligned pointer allocated by XXH_alignedMalloc(). Don't pass 01677 * normal malloc'd pointers, XXH_alignedMalloc has a specific data layout. 
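 *
 * (Added illustration of that layout; `align` is between 8 and 128, as
 *  enforced by XXH_alignedMalloc() above:
 *
 *      base ......................... ptr = base + offset
 *      [ unused (offset-1 bytes) ][ offset byte ][ aligned payload ... ]
 *                                   ^ == ptr[-1]   ^ returned pointer
 *
 *  so the original allocation is recovered as base = ptr - ptr[-1], which is
 *  exactly what the function below does before calling XXH_free().)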
01678 */ 01679 static void XXH_alignedFree(void* p) 01680 { 01681 if (p != NULL) { 01682 xxh_u8* ptr = (xxh_u8*)p; 01683 /* Get the offset byte we added in XXH_malloc. */ 01684 xxh_u8 offset = ptr[-1]; 01685 /* Free the original malloc'd pointer */ 01686 xxh_u8* base = ptr - offset; 01687 XXH_free(base); 01688 } 01689 } 01690 XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void) 01691 { 01692 return (XXH3_state_t*)XXH_alignedMalloc(sizeof(XXH3_state_t), 64); 01693 } 01694 01695 XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr) 01696 { 01697 XXH_alignedFree(statePtr); 01698 return XXH_OK; 01699 } 01700 01701 XXH_PUBLIC_API void 01702 XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state) 01703 { 01704 memcpy(dst_state, src_state, sizeof(*dst_state)); 01705 } 01706 01707 static void 01708 XXH3_64bits_reset_internal(XXH3_state_t* statePtr, 01709 XXH64_hash_t seed, 01710 const xxh_u8* secret, size_t secretSize) 01711 { 01712 XXH_ASSERT(statePtr != NULL); 01713 memset(statePtr, 0, sizeof(*statePtr)); 01714 statePtr->acc[0] = XXH_PRIME32_3; 01715 statePtr->acc[1] = XXH_PRIME64_1; 01716 statePtr->acc[2] = XXH_PRIME64_2; 01717 statePtr->acc[3] = XXH_PRIME64_3; 01718 statePtr->acc[4] = XXH_PRIME64_4; 01719 statePtr->acc[5] = XXH_PRIME32_2; 01720 statePtr->acc[6] = XXH_PRIME64_5; 01721 statePtr->acc[7] = XXH_PRIME32_1; 01722 statePtr->seed = seed; 01723 XXH_ASSERT(secret != NULL); 01724 statePtr->extSecret = secret; 01725 XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); 01726 statePtr->secretLimit = secretSize - XXH_STRIPE_LEN; 01727 statePtr->nbStripesPerBlock = statePtr->secretLimit / XXH_SECRET_CONSUME_RATE; 01728 } 01729 01730 XXH_PUBLIC_API XXH_errorcode 01731 XXH3_64bits_reset(XXH3_state_t* statePtr) 01732 { 01733 if (statePtr == NULL) return XXH_ERROR; 01734 XXH3_64bits_reset_internal(statePtr, 0, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE); 01735 return XXH_OK; 01736 } 01737 01738 XXH_PUBLIC_API XXH_errorcode 01739 XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize) 01740 { 01741 if (statePtr == NULL) return XXH_ERROR; 01742 XXH3_64bits_reset_internal(statePtr, 0, (const xxh_u8*)secret, secretSize); 01743 if (secret == NULL) return XXH_ERROR; 01744 if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR; 01745 return XXH_OK; 01746 } 01747 01748 XXH_PUBLIC_API XXH_errorcode 01749 XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed) 01750 { 01751 if (statePtr == NULL) return XXH_ERROR; 01752 XXH3_64bits_reset_internal(statePtr, seed, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE); 01753 XXH3_initCustomSecret(statePtr->customSecret, seed); 01754 statePtr->extSecret = NULL; 01755 return XXH_OK; 01756 } 01757 01758 XXH_FORCE_INLINE void 01759 XXH3_consumeStripes( xxh_u64* acc, 01760 size_t* nbStripesSoFarPtr, size_t nbStripesPerBlock, 01761 const xxh_u8* input, size_t totalStripes, 01762 const xxh_u8* secret, size_t secretLimit, 01763 XXH3_accWidth_e accWidth) 01764 { 01765 XXH_ASSERT(*nbStripesSoFarPtr < nbStripesPerBlock); 01766 if (nbStripesPerBlock - *nbStripesSoFarPtr <= totalStripes) { 01767 /* need a scrambling operation */ 01768 size_t const nbStripes = nbStripesPerBlock - *nbStripesSoFarPtr; 01769 XXH3_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, nbStripes, accWidth); 01770 XXH3_scrambleAcc(acc, secret + secretLimit); 01771 XXH3_accumulate(acc, input + nbStripes * XXH_STRIPE_LEN, secret, totalStripes - nbStripes, accWidth); 01772 *nbStripesSoFarPtr = (XXH32_hash_t)(totalStripes - 
nbStripes); 01773 } else { 01774 XXH3_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, totalStripes, accWidth); 01775 *nbStripesSoFarPtr += (XXH32_hash_t)totalStripes; 01776 } 01777 } 01778 01779 /* 01780 * Both XXH3_64bits_update and XXH3_128bits_update use this routine. 01781 */ 01782 XXH_FORCE_INLINE XXH_errorcode 01783 XXH3_update(XXH3_state_t* state, const xxh_u8* input, size_t len, XXH3_accWidth_e accWidth) 01784 { 01785 if (input==NULL) 01786 #if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) 01787 return XXH_OK; 01788 #else 01789 return XXH_ERROR; 01790 #endif 01791 01792 { const xxh_u8* const bEnd = input + len; 01793 const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret; 01794 01795 state->totalLen += len; 01796 01797 if (state->bufferedSize + len <= XXH3_INTERNALBUFFER_SIZE) { /* fill in tmp buffer */ 01798 XXH_memcpy(state->buffer + state->bufferedSize, input, len); 01799 state->bufferedSize += (XXH32_hash_t)len; 01800 return XXH_OK; 01801 } 01802 /* input is now > XXH3_INTERNALBUFFER_SIZE */ 01803 01804 #define XXH3_INTERNALBUFFER_STRIPES (XXH3_INTERNALBUFFER_SIZE / XXH_STRIPE_LEN) 01805 XXH_STATIC_ASSERT(XXH3_INTERNALBUFFER_SIZE % XXH_STRIPE_LEN == 0); /* clean multiple */ 01806 01807 /* 01808 * There is some input left inside the internal buffer. 01809 * Fill it, then consume it. 01810 */ 01811 if (state->bufferedSize) { 01812 size_t const loadSize = XXH3_INTERNALBUFFER_SIZE - state->bufferedSize; 01813 XXH_memcpy(state->buffer + state->bufferedSize, input, loadSize); 01814 input += loadSize; 01815 XXH3_consumeStripes(state->acc, 01816 &state->nbStripesSoFar, state->nbStripesPerBlock, 01817 state->buffer, XXH3_INTERNALBUFFER_STRIPES, 01818 secret, state->secretLimit, 01819 accWidth); 01820 state->bufferedSize = 0; 01821 } 01822 01823 /* Consume input by full buffer quantities */ 01824 if (input+XXH3_INTERNALBUFFER_SIZE <= bEnd) { 01825 const xxh_u8* const limit = bEnd - XXH3_INTERNALBUFFER_SIZE; 01826 do { 01827 XXH3_consumeStripes(state->acc, 01828 &state->nbStripesSoFar, state->nbStripesPerBlock, 01829 input, XXH3_INTERNALBUFFER_STRIPES, 01830 secret, state->secretLimit, 01831 accWidth); 01832 input += XXH3_INTERNALBUFFER_SIZE; 01833 } while (input<=limit); 01834 } 01835 01836 if (input < bEnd) { /* Some remaining input: buffer it */ 01837 XXH_memcpy(state->buffer, input, (size_t)(bEnd-input)); 01838 state->bufferedSize = (XXH32_hash_t)(bEnd-input); 01839 } 01840 } 01841 01842 return XXH_OK; 01843 } 01844 01845 XXH_PUBLIC_API XXH_errorcode 01846 XXH3_64bits_update(XXH3_state_t* state, const void* input, size_t len) 01847 { 01848 return XXH3_update(state, (const xxh_u8*)input, len, XXH3_acc_64bits); 01849 } 01850 01851 01852 XXH_FORCE_INLINE void 01853 XXH3_digest_long (XXH64_hash_t* acc, 01854 const XXH3_state_t* state, 01855 const unsigned char* secret, 01856 XXH3_accWidth_e accWidth) 01857 { 01858 /* 01859 * Digest on a local copy. This way, the state remains unaltered, and it can 01860 * continue ingesting more input afterwards. 
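 *
 * (Added usage sketch, illustrative only, error handling omitted; `part1`,
 *  `part2` and their lengths are placeholder names:
 *
 *      XXH3_state_t* st = XXH3_createState();
 *      XXH3_64bits_reset(st);
 *      XXH3_64bits_update(st, part1, len1);
 *      XXH64_hash_t partial = XXH3_64bits_digest(st);   // state not modified
 *      XXH3_64bits_update(st, part2, len2);             // streaming continues
 *      XXH64_hash_t full    = XXH3_64bits_digest(st);
 *      XXH3_freeState(st);
 *  )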
01861 */ 01862 memcpy(acc, state->acc, sizeof(state->acc)); 01863 if (state->bufferedSize >= XXH_STRIPE_LEN) { 01864 size_t const totalNbStripes = state->bufferedSize / XXH_STRIPE_LEN; 01865 size_t nbStripesSoFar = state->nbStripesSoFar; 01866 XXH3_consumeStripes(acc, 01867 &nbStripesSoFar, state->nbStripesPerBlock, 01868 state->buffer, totalNbStripes, 01869 secret, state->secretLimit, 01870 accWidth); 01871 if (state->bufferedSize % XXH_STRIPE_LEN) { /* one last partial stripe */ 01872 XXH3_accumulate_512(acc, 01873 state->buffer + state->bufferedSize - XXH_STRIPE_LEN, 01874 secret + state->secretLimit - XXH_SECRET_LASTACC_START, 01875 accWidth); 01876 } 01877 } else { /* bufferedSize < XXH_STRIPE_LEN */ 01878 if (state->bufferedSize) { /* one last stripe */ 01879 xxh_u8 lastStripe[XXH_STRIPE_LEN]; 01880 size_t const catchupSize = XXH_STRIPE_LEN - state->bufferedSize; 01881 memcpy(lastStripe, state->buffer + sizeof(state->buffer) - catchupSize, catchupSize); 01882 memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize); 01883 XXH3_accumulate_512(acc, 01884 lastStripe, 01885 secret + state->secretLimit - XXH_SECRET_LASTACC_START, 01886 accWidth); 01887 } } 01888 } 01889 01890 XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (const XXH3_state_t* state) 01891 { 01892 const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret; 01893 if (state->totalLen > XXH3_MIDSIZE_MAX) { 01894 XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB]; 01895 XXH3_digest_long(acc, state, secret, XXH3_acc_64bits); 01896 return XXH3_mergeAccs(acc, 01897 secret + XXH_SECRET_MERGEACCS_START, 01898 (xxh_u64)state->totalLen * XXH_PRIME64_1); 01899 } 01900 /* totalLen <= XXH3_MIDSIZE_MAX: digesting a short input */ 01901 if (state->seed) 01902 return XXH3_64bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed); 01903 return XXH3_64bits_withSecret(state->buffer, (size_t)(state->totalLen), 01904 secret, state->secretLimit + XXH_STRIPE_LEN); 01905 } 01906 01907 /* ========================================== 01908 * XXH3 128 bits (a.k.a XXH128) 01909 * ========================================== 01910 * XXH3's 128-bit variant has better mixing and strength than the 64-bit variant, 01911 * even without counting the significantly larger output size. 01912 * 01913 * For example, extra steps are taken to avoid the seed-dependent collisions 01914 * in 17-240 byte inputs (See XXH3_mix16B and XXH128_mix32B). 01915 * 01916 * This strength naturally comes at the cost of some speed, especially on short 01917 * lengths. Note that longer hashes are about as fast as the 64-bit version 01918 * due to it using only a slight modification of the 64-bit loop. 01919 * 01920 * XXH128 is also more oriented towards 64-bit machines. It is still extremely 01921 * fast for a _128-bit_ hash on 32-bit (it usually clears XXH64). 01922 */ 01923 01924 XXH_FORCE_INLINE XXH128_hash_t 01925 XXH3_len_1to3_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) 01926 { 01927 /* A doubled version of 1to3_64b with different constants. 
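 *
 * (Added note: "doubled" here means the same packed 32-bit word feeds both
 *  halves; the high half just sees a permuted view of it,
 *      combinedh = XXH_rotl32(XXH_swap32(combinedl), 13);
 *  so low64 and high64 mix different arrangements of the same input bytes.)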
*/ 01928 XXH_ASSERT(input != NULL); 01929 XXH_ASSERT(1 <= len && len <= 3); 01930 XXH_ASSERT(secret != NULL); 01931 /* 01932 * len = 1: combinedl = { input[0], 0x01, input[0], input[0] } 01933 * len = 2: combinedl = { input[1], 0x02, input[0], input[1] } 01934 * len = 3: combinedl = { input[2], 0x03, input[0], input[1] } 01935 */ 01936 { xxh_u8 const c1 = input[0]; 01937 xxh_u8 const c2 = input[len >> 1]; 01938 xxh_u8 const c3 = input[len - 1]; 01939 xxh_u32 const combinedl = ((xxh_u32)c1 <<16) | ((xxh_u32)c2 << 24) 01940 | ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8); 01941 xxh_u32 const combinedh = XXH_rotl32(XXH_swap32(combinedl), 13); 01942 xxh_u64 const bitflipl = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed; 01943 xxh_u64 const bitfliph = (XXH_readLE32(secret+8) ^ XXH_readLE32(secret+12)) - seed; 01944 xxh_u64 const keyed_lo = (xxh_u64)combinedl ^ bitflipl; 01945 xxh_u64 const keyed_hi = (xxh_u64)combinedh ^ bitfliph; 01946 xxh_u64 const mixedl = keyed_lo * XXH_PRIME64_1; 01947 xxh_u64 const mixedh = keyed_hi * XXH_PRIME64_5; 01948 XXH128_hash_t h128; 01949 h128.low64 = XXH3_avalanche(mixedl); 01950 h128.high64 = XXH3_avalanche(mixedh); 01951 return h128; 01952 } 01953 } 01954 01955 XXH_FORCE_INLINE XXH128_hash_t 01956 XXH3_len_4to8_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) 01957 { 01958 XXH_ASSERT(input != NULL); 01959 XXH_ASSERT(secret != NULL); 01960 XXH_ASSERT(4 <= len && len <= 8); 01961 seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32; 01962 { xxh_u32 const input_lo = XXH_readLE32(input); 01963 xxh_u32 const input_hi = XXH_readLE32(input + len - 4); 01964 xxh_u64 const input_64 = input_lo + ((xxh_u64)input_hi << 32); 01965 xxh_u64 const bitflip = (XXH_readLE64(secret+16) ^ XXH_readLE64(secret+24)) + seed; 01966 xxh_u64 const keyed = input_64 ^ bitflip; 01967 01968 /* Shift len to the left to ensure it is even, this avoids even multiplies. */ 01969 XXH128_hash_t m128 = XXH_mult64to128(keyed, XXH_PRIME64_1 + (len << 2)); 01970 01971 m128.high64 += (m128.low64 << 1); 01972 m128.low64 ^= (m128.high64 >> 3); 01973 01974 m128.low64 = XXH_xorshift64(m128.low64, 35); 01975 m128.low64 *= 0x9FB21C651E98DF25ULL; 01976 m128.low64 = XXH_xorshift64(m128.low64, 28); 01977 m128.high64 = XXH3_avalanche(m128.high64); 01978 return m128; 01979 } 01980 } 01981 01982 XXH_FORCE_INLINE XXH128_hash_t 01983 XXH3_len_9to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) 01984 { 01985 XXH_ASSERT(input != NULL); 01986 XXH_ASSERT(secret != NULL); 01987 XXH_ASSERT(9 <= len && len <= 16); 01988 { xxh_u64 const bitflipl = (XXH_readLE64(secret+32) ^ XXH_readLE64(secret+40)) - seed; 01989 xxh_u64 const bitfliph = (XXH_readLE64(secret+48) ^ XXH_readLE64(secret+56)) + seed; 01990 xxh_u64 const input_lo = XXH_readLE64(input); 01991 xxh_u64 input_hi = XXH_readLE64(input + len - 8); 01992 XXH128_hash_t m128 = XXH_mult64to128(input_lo ^ input_hi ^ bitflipl, XXH_PRIME64_1); 01993 /* 01994 * Put len in the middle of m128 to ensure that the length gets mixed to 01995 * both the low and high bits in the 128x64 multiply below. 01996 */ 01997 m128.low64 += (xxh_u64)(len - 1) << 54; 01998 input_hi ^= bitfliph; 01999 /* 02000 * Add the high 32 bits of input_hi to the high 32 bits of m128, then 02001 * add the long product of the low 32 bits of input_hi and XXH_PRIME32_2 to 02002 * the high 64 bits of m128. 02003 * 02004 * The best approach to this operation is different on 32-bit and 64-bit. 
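 *
 * (Added note: both branches below compute the same value, namely
 *      m128.high64 += (input_hi & 0xFFFFFFFF00000000ULL)
 *                   + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2);
 *  they differ only in how the mask and multiply are arranged for the
 *  target word size.)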
02005 */ 02006 if (sizeof(void *) < sizeof(xxh_u64)) { /* 32-bit */ 02007 /* 02008 * 32-bit optimized version, which is more readable. 02009 * 02010 * On 32-bit, it removes an ADC and delays a dependency between the two 02011 * halves of m128.high64, but it generates an extra mask on 64-bit. 02012 */ 02013 m128.high64 += (input_hi & 0xFFFFFFFF00000000) + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2); 02014 } else { 02015 /* 02016 * 64-bit optimized (albeit more confusing) version. 02017 * 02018 * Uses some properties of addition and multiplication to remove the mask: 02019 * 02020 * Let: 02021 * a = input_hi.lo = (input_hi & 0x00000000FFFFFFFF) 02022 * b = input_hi.hi = (input_hi & 0xFFFFFFFF00000000) 02023 * c = XXH_PRIME32_2 02024 * 02025 * a + (b * c) 02026 * Inverse Property: x + y - x == y 02027 * a + (b * (1 + c - 1)) 02028 * Distributive Property: x * (y + z) == (x * y) + (x * z) 02029 * a + (b * 1) + (b * (c - 1)) 02030 * Identity Property: x * 1 == x 02031 * a + b + (b * (c - 1)) 02032 * 02033 * Substitute a, b, and c: 02034 * input_hi.hi + input_hi.lo + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1)) 02035 * 02036 * Since input_hi.hi + input_hi.lo == input_hi, we get this: 02037 * input_hi + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1)) 02038 */ 02039 m128.high64 += input_hi + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2 - 1); 02040 } 02041 /* m128 ^= XXH_swap64(m128 >> 64); */ 02042 m128.low64 ^= XXH_swap64(m128.high64); 02043 02044 { /* 128x64 multiply: h128 = m128 * XXH_PRIME64_2; */ 02045 XXH128_hash_t h128 = XXH_mult64to128(m128.low64, XXH_PRIME64_2); 02046 h128.high64 += m128.high64 * XXH_PRIME64_2; 02047 02048 h128.low64 = XXH3_avalanche(h128.low64); 02049 h128.high64 = XXH3_avalanche(h128.high64); 02050 return h128; 02051 } } 02052 } 02053 02054 /* 02055 * Assumption: `secret` size is >= XXH3_SECRET_SIZE_MIN 02056 */ 02057 XXH_FORCE_INLINE XXH128_hash_t 02058 XXH3_len_0to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) 02059 { 02060 XXH_ASSERT(len <= 16); 02061 { if (len > 8) return XXH3_len_9to16_128b(input, len, secret, seed); 02062 if (len >= 4) return XXH3_len_4to8_128b(input, len, secret, seed); 02063 if (len) return XXH3_len_1to3_128b(input, len, secret, seed); 02064 { XXH128_hash_t h128; 02065 xxh_u64 const bitflipl = XXH_readLE64(secret+64) ^ XXH_readLE64(secret+72); 02066 xxh_u64 const bitfliph = XXH_readLE64(secret+80) ^ XXH_readLE64(secret+88); 02067 h128.low64 = XXH3_avalanche((XXH_PRIME64_1 + seed) ^ bitflipl); 02068 h128.high64 = XXH3_avalanche((XXH_PRIME64_2 - seed) ^ bitfliph); 02069 return h128; 02070 } } 02071 } 02072 02073 /* 02074 * A bit slower than XXH3_mix16B, but handles multiply by zero better. 
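 *
 * (Added note, reasoning presumed from the code below: the extra xor of the
 *  raw input words, e.g.
 *      acc.low64 ^= XXH_readLE64(input_2) + XXH_readLE64(input_2 + 8);
 *  means that even if a keyed multiply inside XXH3_mix16B evaluates to 0,
 *  those input bytes still reach the accumulator.)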
02075 */ 02076 XXH_FORCE_INLINE XXH128_hash_t 02077 XXH128_mix32B(XXH128_hash_t acc, const xxh_u8* input_1, const xxh_u8* input_2, 02078 const xxh_u8* secret, XXH64_hash_t seed) 02079 { 02080 acc.low64 += XXH3_mix16B (input_1, secret+0, seed); 02081 acc.low64 ^= XXH_readLE64(input_2) + XXH_readLE64(input_2 + 8); 02082 acc.high64 += XXH3_mix16B (input_2, secret+16, seed); 02083 acc.high64 ^= XXH_readLE64(input_1) + XXH_readLE64(input_1 + 8); 02084 return acc; 02085 } 02086 02087 02088 XXH_FORCE_INLINE XXH128_hash_t 02089 XXH3_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len, 02090 const xxh_u8* XXH_RESTRICT secret, size_t secretSize, 02091 XXH64_hash_t seed) 02092 { 02093 XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; 02094 XXH_ASSERT(16 < len && len <= 128); 02095 02096 { XXH128_hash_t acc; 02097 acc.low64 = len * XXH_PRIME64_1; 02098 acc.high64 = 0; 02099 if (len > 32) { 02100 if (len > 64) { 02101 if (len > 96) { 02102 acc = XXH128_mix32B(acc, input+48, input+len-64, secret+96, seed); 02103 } 02104 acc = XXH128_mix32B(acc, input+32, input+len-48, secret+64, seed); 02105 } 02106 acc = XXH128_mix32B(acc, input+16, input+len-32, secret+32, seed); 02107 } 02108 acc = XXH128_mix32B(acc, input, input+len-16, secret, seed); 02109 { XXH128_hash_t h128; 02110 h128.low64 = acc.low64 + acc.high64; 02111 h128.high64 = (acc.low64 * XXH_PRIME64_1) 02112 + (acc.high64 * XXH_PRIME64_4) 02113 + ((len - seed) * XXH_PRIME64_2); 02114 h128.low64 = XXH3_avalanche(h128.low64); 02115 h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64); 02116 return h128; 02117 } 02118 } 02119 } 02120 02121 XXH_NO_INLINE XXH128_hash_t 02122 XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len, 02123 const xxh_u8* XXH_RESTRICT secret, size_t secretSize, 02124 XXH64_hash_t seed) 02125 { 02126 XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; 02127 XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX); 02128 02129 { XXH128_hash_t acc; 02130 int const nbRounds = (int)len / 32; 02131 int i; 02132 acc.low64 = len * XXH_PRIME64_1; 02133 acc.high64 = 0; 02134 for (i=0; i<4; i++) { 02135 acc = XXH128_mix32B(acc, 02136 input + (32 * i), 02137 input + (32 * i) + 16, 02138 secret + (32 * i), 02139 seed); 02140 } 02141 acc.low64 = XXH3_avalanche(acc.low64); 02142 acc.high64 = XXH3_avalanche(acc.high64); 02143 XXH_ASSERT(nbRounds >= 4); 02144 for (i=4 ; i < nbRounds; i++) { 02145 acc = XXH128_mix32B(acc, 02146 input + (32 * i), 02147 input + (32 * i) + 16, 02148 secret + XXH3_MIDSIZE_STARTOFFSET + (32 * (i - 4)), 02149 seed); 02150 } 02151 /* last bytes */ 02152 acc = XXH128_mix32B(acc, 02153 input + len - 16, 02154 input + len - 32, 02155 secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16, 02156 0ULL - seed); 02157 02158 { XXH128_hash_t h128; 02159 h128.low64 = acc.low64 + acc.high64; 02160 h128.high64 = (acc.low64 * XXH_PRIME64_1) 02161 + (acc.high64 * XXH_PRIME64_4) 02162 + ((len - seed) * XXH_PRIME64_2); 02163 h128.low64 = XXH3_avalanche(h128.low64); 02164 h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64); 02165 return h128; 02166 } 02167 } 02168 } 02169 02170 XXH_FORCE_INLINE XXH128_hash_t 02171 XXH3_hashLong_128b_internal(const xxh_u8* XXH_RESTRICT input, size_t len, 02172 const xxh_u8* XXH_RESTRICT secret, size_t secretSize) 02173 { 02174 XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC; 02175 02176 XXH3_hashLong_internal_loop(acc, input, len, secret, secretSize, XXH3_acc_128bits); 02177 02178 /* converge into final hash */ 02179 
XXH_STATIC_ASSERT(sizeof(acc) == 64); 02180 XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START); 02181 { XXH128_hash_t h128; 02182 h128.low64 = XXH3_mergeAccs(acc, 02183 secret + XXH_SECRET_MERGEACCS_START, 02184 (xxh_u64)len * XXH_PRIME64_1); 02185 h128.high64 = XXH3_mergeAccs(acc, 02186 secret + secretSize 02187 - sizeof(acc) - XXH_SECRET_MERGEACCS_START, 02188 ~((xxh_u64)len * XXH_PRIME64_2)); 02189 return h128; 02190 } 02191 } 02192 02193 /* 02194 * It's important for performance that XXH3_hashLong is not inlined. Not sure 02195 * why (uop cache maybe?), but the difference is large and easily measurable. 02196 */ 02197 XXH_NO_INLINE XXH128_hash_t 02198 XXH3_hashLong_128b_defaultSecret(const xxh_u8* input, size_t len) 02199 { 02200 return XXH3_hashLong_128b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret)); 02201 } 02202 02203 /* 02204 * It's important for performance that XXH3_hashLong is not inlined. Not sure 02205 * why (uop cache maybe?), but the difference is large and easily measurable. 02206 */ 02207 XXH_NO_INLINE XXH128_hash_t 02208 XXH3_hashLong_128b_withSecret(const xxh_u8* input, size_t len, 02209 const xxh_u8* secret, size_t secretSize) 02210 { 02211 return XXH3_hashLong_128b_internal(input, len, secret, secretSize); 02212 } 02213 02214 /* 02215 * It's important for performance that XXH3_hashLong is not inlined. Not sure 02216 * why (uop cache maybe?), but the difference is large and easily measurable. 02217 */ 02218 XXH_NO_INLINE XXH128_hash_t 02219 XXH3_hashLong_128b_withSeed(const xxh_u8* input, size_t len, XXH64_hash_t seed) 02220 { 02221 XXH_ALIGN(8) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE]; 02222 if (seed == 0) return XXH3_hashLong_128b_defaultSecret(input, len); 02223 XXH3_initCustomSecret(secret, seed); 02224 return XXH3_hashLong_128b_internal(input, len, secret, sizeof(secret)); 02225 } 02226 02227 02228 XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* input, size_t len) 02229 { 02230 if (len <= 16) 02231 return XXH3_len_0to16_128b((const xxh_u8*)input, len, XXH3_kSecret, 0); 02232 if (len <= 128) 02233 return XXH3_len_17to128_128b((const xxh_u8*)input, len, XXH3_kSecret, sizeof(XXH3_kSecret), 0); 02234 if (len <= XXH3_MIDSIZE_MAX) 02235 return XXH3_len_129to240_128b((const xxh_u8*)input, len, XXH3_kSecret, sizeof(XXH3_kSecret), 0); 02236 return XXH3_hashLong_128b_defaultSecret((const xxh_u8*)input, len); 02237 } 02238 02239 XXH_PUBLIC_API XXH128_hash_t 02240 XXH3_128bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize) 02241 { 02242 XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); 02243 /* 02244 * If an action is to be taken if `secret` conditions are not respected, 02245 * it should be done here. 02246 * For now, it's a contract pre-condition. 02247 * Adding a check and a branch here would cost performance at every hash. 
02248  */
02249     if (len <= 16)
02250         return XXH3_len_0to16_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, 0);
02251     if (len <= 128)
02252         return XXH3_len_17to128_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0);
02253     if (len <= XXH3_MIDSIZE_MAX)
02254         return XXH3_len_129to240_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0);
02255     return XXH3_hashLong_128b_withSecret((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize);
02256 }
02257 
02258 XXH_PUBLIC_API XXH128_hash_t
02259 XXH3_128bits_withSeed(const void* input, size_t len, XXH64_hash_t seed)
02260 {
02261     if (len <= 16)
02262         return XXH3_len_0to16_128b((const xxh_u8*)input, len, XXH3_kSecret, seed);
02263     if (len <= 128)
02264         return XXH3_len_17to128_128b((const xxh_u8*)input, len, XXH3_kSecret, sizeof(XXH3_kSecret), seed);
02265     if (len <= XXH3_MIDSIZE_MAX)
02266         return XXH3_len_129to240_128b((const xxh_u8*)input, len, XXH3_kSecret, sizeof(XXH3_kSecret), seed);
02267     return XXH3_hashLong_128b_withSeed((const xxh_u8*)input, len, seed);
02268 }
02269 
02270 XXH_PUBLIC_API XXH128_hash_t
02271 XXH128(const void* input, size_t len, XXH64_hash_t seed)
02272 {
02273     return XXH3_128bits_withSeed(input, len, seed);
02274 }
02275 
02276 
02277 /* === XXH3 128-bit streaming === */
02278 
02279 /*
02280  * All the functions are actually the same as for the 64-bit streaming variant.
02281  * The only difference is the finalization routine.
02282  */
02283 
02284 static void
02285 XXH3_128bits_reset_internal(XXH3_state_t* statePtr,
02286                             XXH64_hash_t seed,
02287                             const xxh_u8* secret, size_t secretSize)
02288 {
02289     XXH3_64bits_reset_internal(statePtr, seed, secret, secretSize);
02290 }
02291 
02292 XXH_PUBLIC_API XXH_errorcode
02293 XXH3_128bits_reset(XXH3_state_t* statePtr)
02294 {
02295     if (statePtr == NULL) return XXH_ERROR;
02296     XXH3_128bits_reset_internal(statePtr, 0, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE);
02297     return XXH_OK;
02298 }
02299 
02300 XXH_PUBLIC_API XXH_errorcode
02301 XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize)
02302 {
02303     if (statePtr == NULL) return XXH_ERROR;
02304     XXH3_128bits_reset_internal(statePtr, 0, (const xxh_u8*)secret, secretSize);
02305     if (secret == NULL) return XXH_ERROR;
02306     if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
02307     return XXH_OK;
02308 }
02309 
02310 XXH_PUBLIC_API XXH_errorcode
02311 XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed)
02312 {
02313     if (statePtr == NULL) return XXH_ERROR;
02314     XXH3_128bits_reset_internal(statePtr, seed, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE);
02315     XXH3_initCustomSecret(statePtr->customSecret, seed);
02316     statePtr->extSecret = NULL;
02317     return XXH_OK;
02318 }
02319 
02320 XXH_PUBLIC_API XXH_errorcode
02321 XXH3_128bits_update(XXH3_state_t* state, const void* input, size_t len)
02322 {
02323     return XXH3_update(state, (const xxh_u8*)input, len, XXH3_acc_128bits);
02324 }
02325 
02326 XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* state)
02327 {
02328     const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret;
02329     if (state->totalLen > XXH3_MIDSIZE_MAX) {
02330         XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB];
02331         XXH3_digest_long(acc, state, secret, XXH3_acc_128bits);
02332         XXH_ASSERT(state->secretLimit + XXH_STRIPE_LEN >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
02333         {   XXH128_hash_t h128;
02334             h128.low64  = XXH3_mergeAccs(acc,
02335                                          secret + XXH_SECRET_MERGEACCS_START,
02336                                          (xxh_u64)state->totalLen * XXH_PRIME64_1);
02337             h128.high64 = XXH3_mergeAccs(acc,
02338                                          secret + state->secretLimit + XXH_STRIPE_LEN
02339                                                 - sizeof(acc) - XXH_SECRET_MERGEACCS_START,
02340                                          ~((xxh_u64)state->totalLen * XXH_PRIME64_2));
02341             return h128;
02342         }
02343     }
02344     /* totalLen <= XXH3_MIDSIZE_MAX : digesting a short input */
02345     if (state->seed)
02346         return XXH3_128bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed);
02347     return XXH3_128bits_withSecret(state->buffer, (size_t)(state->totalLen),
02348                                    secret, state->secretLimit + XXH_STRIPE_LEN);
02349 }
02350 
02351 /* 128-bit utility functions */
02352 
02353 #include <string.h>   /* memcmp, memcpy */
02354 
02355 /* return : 1 if equal, 0 if different */
02356 XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2)
02357 {
02358     /* note : XXH128_hash_t is compact, it has no padding byte */
02359     return !(memcmp(&h1, &h2, sizeof(h1)));
02360 }
02361 
02362 /* This prototype is compatible with stdlib's qsort().
02363  * return : >0 if *h128_1 > *h128_2
02364  *          <0 if *h128_1 < *h128_2
02365  *          =0 if *h128_1 == *h128_2 */
02366 XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2)
02367 {
02368     XXH128_hash_t const h1 = *(const XXH128_hash_t*)h128_1;
02369     XXH128_hash_t const h2 = *(const XXH128_hash_t*)h128_2;
02370     int const hcmp = (h1.high64 > h2.high64) - (h2.high64 > h1.high64);
02371     /* note : bets that, in most cases, hash values are different */
02372     if (hcmp) return hcmp;
02373     return (h1.low64 > h2.low64) - (h2.low64 > h1.low64);
02374 }
02375 
02376 
02377 /*====== Canonical representation ======*/
02378 XXH_PUBLIC_API void
02379 XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash)
02380 {
02381     XXH_STATIC_ASSERT(sizeof(XXH128_canonical_t) == sizeof(XXH128_hash_t));
02382     if (XXH_CPU_LITTLE_ENDIAN) {
02383         hash.high64 = XXH_swap64(hash.high64);
02384         hash.low64  = XXH_swap64(hash.low64);
02385     }
02386     memcpy(dst, &hash.high64, sizeof(hash.high64));
02387     memcpy((char*)dst + sizeof(hash.high64), &hash.low64, sizeof(hash.low64));
02388 }
02389 
02390 XXH_PUBLIC_API XXH128_hash_t
02391 XXH128_hashFromCanonical(const XXH128_canonical_t* src)
02392 {
02393     XXH128_hash_t h;
02394     h.high64 = XXH_readBE64(src);
02395     h.low64  = XXH_readBE64(src->digest + 8);
02396     return h;
02397 }
02398 
02399 /* Pop our optimization override from above */
02400 #if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \
02401   && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
02402   && defined(__OPTIMIZE__) && !defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */
02403 # pragma GCC pop_options
02404 #endif
02405 
02406 #endif  /* XXH3_H_1397135465 */
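
/*
 * Added usage sketch (illustrative only, not part of xxHash). `data` and
 * `size` stand in for the caller's buffer and its length:
 *
 *     XXH64_hash_t  h64  = XXH3_64bits(data, size);
 *     XXH64_hash_t  h64s = XXH3_64bits_withSeed(data, size, (XXH64_hash_t)0xCAFE);
 *     XXH128_hash_t h128 = XXH128(data, size, 0);
 *
 *     XXH128_canonical_t canon;
 *     XXH128_canonicalFromHash(&canon, h128);               // big-endian serialization
 *     XXH128_hash_t back = XXH128_hashFromCanonical(&canon);
 *     // XXH128_isEqual(h128, back) == 1 : the round-trip is exact
 */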