| #ifndef BLAKE3_IMPL_H |
| #define BLAKE3_IMPL_H |
| |
| #include <assert.h> |
| #include <stdbool.h> |
| #include <stddef.h> |
| #include <stdint.h> |
| #include <string.h> |
| |
| #include "llvm-c/blake3.h" |
| // For \p LLVM_LIBRARY_VISIBILITY |
| #include "llvm/Support/Compiler.h" |
| |
| #include "llvm_blake3_prefix.h" |
| |
// Internal flags. Every constant occupies its own bit, so any combination can
// be OR-ed together and still fits in the uint8_t `flags` parameters used by
// the function prototypes in this header.
enum blake3_flags {
  CHUNK_START         = 1 << 0,
  CHUNK_END           = 1 << 1,
  PARENT              = 1 << 2,
  ROOT                = 1 << 3,
  KEYED_HASH          = 1 << 4,
  DERIVE_KEY_CONTEXT  = 1 << 5,
  DERIVE_KEY_MATERIAL = 1 << 6,
};

// This C implementation tries to support recent versions of GCC, Clang, and
// MSVC. INLINE gives a helper internal linkage and asks the compiler to
// always inline it, using whichever spelling the toolchain understands.
#if defined(_MSC_VER)
#define INLINE static __forceinline
#else
#define INLINE static inline __attribute__((always_inline))
#endif

// Target architecture detection. IS_X86 is defined for both 32- and 64-bit
// x86; exactly one of IS_X86_64 / IS_X86_32 accompanies it.
#if defined(__x86_64__) || defined(_M_X64)
#define IS_X86
#define IS_X86_64
#endif

#if defined(__i386__) || defined(_M_IX86)
#define IS_X86
#define IS_X86_32
#endif

#if defined(__aarch64__) || defined(_M_ARM64)
#define IS_AARCH64
#endif

// x86 builds pull in the intrinsics headers; MSVC additionally needs
// <intrin.h>.
#if defined(IS_X86)
#if defined(_MSC_VER)
#include <intrin.h>
#endif
#include <immintrin.h>
#endif

#if !defined(BLAKE3_USE_NEON)
// If BLAKE3_USE_NEON is not set manually, autodetect it: enabled (1) on
// AArch64, disabled (0) everywhere else.
#if defined(IS_AARCH64)
#define BLAKE3_USE_NEON 1
#else
#define BLAKE3_USE_NEON 0
#endif
#endif

// MAX_SIMD_DEGREE is 16 on x86, 4 when NEON is enabled, and 1 otherwise.
#if defined(IS_X86)
#define MAX_SIMD_DEGREE 16
#elif BLAKE3_USE_NEON == 1
#define MAX_SIMD_DEGREE 4
#else
#define MAX_SIMD_DEGREE 1
#endif

// There are some places where we want a static size that's equal to the
// MAX_SIMD_DEGREE, but also at least 2.
#define MAX_SIMD_DEGREE_OR_2 (MAX_SIMD_DEGREE > 2 ? MAX_SIMD_DEGREE : 2)

// Initialization vector: eight 32-bit words.
static const uint32_t IV[8] = {0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL,
                               0xA54FF53AUL, 0x510E527FUL, 0x9B05688CUL,
                               0x1F83D9ABUL, 0x5BE0CD19UL};

// Message schedule table: 7 rows, each a permutation of the indices 0..15.
// Row 0 is the identity ordering.
static const uint8_t MSG_SCHEDULE[7][16] = {
    {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
    {2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8},
    {3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1},
    {10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6},
    {12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4},
    {9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7},
    {11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13},
};

/* Find the zero-based index of the highest set bit of x.                  */
/* x is assumed to be nonzero: the builtin/intrinsic paths below give no    */
/* meaningful result for 0.                                                 */
static unsigned int highest_one(uint64_t x) {
#if defined(__GNUC__) || defined(__clang__)
  /* clz counts leading zeros; for a value in 0..63, 63 ^ n equals 63 - n. */
  return (unsigned int)(63 ^ __builtin_clzll(x));
#elif defined(_MSC_VER) && defined(IS_X86_64)
  unsigned long index;
  _BitScanReverse64(&index, x);
  return (unsigned int)index;
#elif defined(_MSC_VER) && defined(IS_X86_32)
  /* No 64-bit scan on 32-bit x86: look at the upper half first. */
  if (x >> 32) {
    unsigned long index;
    _BitScanReverse(&index, (unsigned long)(x >> 32));
    return (unsigned int)(32 + index);
  } else {
    unsigned long index;
    _BitScanReverse(&index, (unsigned long)x);
    return (unsigned int)index;
  }
#else
  /* Portable fallback: binary search for the top bit, halving the window */
  /* at every step.                                                       */
  unsigned int c = 0;
  if (x & 0xffffffff00000000ULL) { x >>= 32; c += 32; }
  if (x & 0x00000000ffff0000ULL) { x >>= 16; c += 16; }
  if (x & 0x000000000000ff00ULL) { x >>= 8; c += 8; }
  if (x & 0x00000000000000f0ULL) { x >>= 4; c += 4; }
  if (x & 0x000000000000000cULL) { x >>= 2; c += 2; }
  if (x & 0x0000000000000002ULL) { c += 1; }
  return c;
#endif
}

// Count the number of 1 bits in x.
INLINE unsigned int popcnt(uint64_t x) {
#if defined(__GNUC__) || defined(__clang__)
  return (unsigned int)__builtin_popcountll(x);
#else
  // Fallback: `x &= x - 1` clears the lowest set bit, so the loop runs once
  // per 1 bit.
  unsigned int count = 0;
  while (x != 0) {
    count += 1;
    x &= x - 1;
  }
  return count;
#endif
}

// Largest power of two less than or equal to x. As a special case, returns 1
// when x is 0.
INLINE uint64_t round_down_to_power_of_2(uint64_t x) {
  // OR-ing in bit 0 keeps the argument nonzero (highest_one's precondition)
  // without moving the top bit of any nonzero x; it is also what maps 0 to 1.
  return 1ULL << highest_one(x | 1);
}

// Low 32 bits of a 64-bit counter.
INLINE uint32_t counter_low(uint64_t counter) { return (uint32_t)counter; }

// High 32 bits of a 64-bit counter.
INLINE uint32_t counter_high(uint64_t counter) {
  return (uint32_t)(counter >> 32);
}

// Read a 32-bit little-endian word from the four bytes at src. The bytes are
// read one at a time, so src needs no particular alignment and the result
// does not depend on host byte order.
INLINE uint32_t load32(const void *src) {
  const uint8_t *p = (const uint8_t *)src;
  return ((uint32_t)(p[0]) << 0) | ((uint32_t)(p[1]) << 8) |
         ((uint32_t)(p[2]) << 16) | ((uint32_t)(p[3]) << 24);
}

// Decode a BLAKE3_KEY_LEN-byte key into eight 32-bit words, reading each
// consecutive 4-byte group with load32().
INLINE void load_key_words(const uint8_t key[BLAKE3_KEY_LEN],
                           uint32_t key_words[8]) {
  for (size_t i = 0; i < 8; i++) {
    key_words[i] = load32(&key[i * 4]);
  }
}

// Write w to the four bytes at dst in little-endian order. The bytes are
// written one at a time, so dst needs no particular alignment.
INLINE void store32(void *dst, uint32_t w) {
  uint8_t *p = (uint8_t *)dst;
  p[0] = (uint8_t)(w >> 0);
  p[1] = (uint8_t)(w >> 8);
  p[2] = (uint8_t)(w >> 16);
  p[3] = (uint8_t)(w >> 24);
}

// Encode eight 32-bit words into 32 bytes, writing each word to its own
// consecutive 4-byte group with store32(). cv_words is only read, hence the
// const qualifier.
INLINE void store_cv_words(uint8_t bytes_out[32], const uint32_t cv_words[8]) {
  for (size_t i = 0; i < 8; i++) {
    store32(&bytes_out[i * 4], cv_words[i]);
  }
}

// Generic entry points. Only the prototypes live in this header.
// NOTE(review): presumably these pick among the implementation-specific
// variants declared further down in this header — confirm against the
// defining source file.
LLVM_LIBRARY_VISIBILITY
void blake3_compress_in_place(uint32_t cv[8],
                              const uint8_t block[BLAKE3_BLOCK_LEN],
                              uint8_t block_len, uint64_t counter,
                              uint8_t flags);

LLVM_LIBRARY_VISIBILITY
void blake3_compress_xof(const uint32_t cv[8],
                         const uint8_t block[BLAKE3_BLOCK_LEN],
                         uint8_t block_len, uint64_t counter, uint8_t flags,
                         uint8_t out[64]);

LLVM_LIBRARY_VISIBILITY
void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
                      size_t blocks, const uint32_t key[8], uint64_t counter,
                      bool increment_counter, uint8_t flags,
                      uint8_t flags_start, uint8_t flags_end, uint8_t *out);

LLVM_LIBRARY_VISIBILITY
size_t blake3_simd_degree(void);


// Declarations for implementation-specific functions.
// The *_portable variants are declared unconditionally; their parameter
// lists are the same as those of the unsuffixed functions.
LLVM_LIBRARY_VISIBILITY
void blake3_compress_in_place_portable(uint32_t cv[8],
                                       const uint8_t block[BLAKE3_BLOCK_LEN],
                                       uint8_t block_len, uint64_t counter,
                                       uint8_t flags);

LLVM_LIBRARY_VISIBILITY
void blake3_compress_xof_portable(const uint32_t cv[8],
                                  const uint8_t block[BLAKE3_BLOCK_LEN],
                                  uint8_t block_len, uint64_t counter,
                                  uint8_t flags, uint8_t out[64]);

LLVM_LIBRARY_VISIBILITY
void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs,
                               size_t blocks, const uint32_t key[8],
                               uint64_t counter, bool increment_counter,
                               uint8_t flags, uint8_t flags_start,
                               uint8_t flags_end, uint8_t *out);

// x86-only variants. Each instruction-set family can be compiled out by
// defining the matching BLAKE3_NO_* macro. SSE2, SSE4.1 and AVX-512 declare
// all three operations; AVX2 declares hash_many only.
#if defined(IS_X86)
#if !defined(BLAKE3_NO_SSE2)
LLVM_LIBRARY_VISIBILITY
void blake3_compress_in_place_sse2(uint32_t cv[8],
                                   const uint8_t block[BLAKE3_BLOCK_LEN],
                                   uint8_t block_len, uint64_t counter,
                                   uint8_t flags);
LLVM_LIBRARY_VISIBILITY
void blake3_compress_xof_sse2(const uint32_t cv[8],
                              const uint8_t block[BLAKE3_BLOCK_LEN],
                              uint8_t block_len, uint64_t counter,
                              uint8_t flags, uint8_t out[64]);
LLVM_LIBRARY_VISIBILITY
void blake3_hash_many_sse2(const uint8_t *const *inputs, size_t num_inputs,
                           size_t blocks, const uint32_t key[8],
                           uint64_t counter, bool increment_counter,
                           uint8_t flags, uint8_t flags_start,
                           uint8_t flags_end, uint8_t *out);
#endif
#if !defined(BLAKE3_NO_SSE41)
LLVM_LIBRARY_VISIBILITY
void blake3_compress_in_place_sse41(uint32_t cv[8],
                                    const uint8_t block[BLAKE3_BLOCK_LEN],
                                    uint8_t block_len, uint64_t counter,
                                    uint8_t flags);
LLVM_LIBRARY_VISIBILITY
void blake3_compress_xof_sse41(const uint32_t cv[8],
                               const uint8_t block[BLAKE3_BLOCK_LEN],
                               uint8_t block_len, uint64_t counter,
                               uint8_t flags, uint8_t out[64]);
LLVM_LIBRARY_VISIBILITY
void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs,
                            size_t blocks, const uint32_t key[8],
                            uint64_t counter, bool increment_counter,
                            uint8_t flags, uint8_t flags_start,
                            uint8_t flags_end, uint8_t *out);
#endif
#if !defined(BLAKE3_NO_AVX2)
LLVM_LIBRARY_VISIBILITY
void blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs,
                           size_t blocks, const uint32_t key[8],
                           uint64_t counter, bool increment_counter,
                           uint8_t flags, uint8_t flags_start,
                           uint8_t flags_end, uint8_t *out);
#endif
#if !defined(BLAKE3_NO_AVX512)
LLVM_LIBRARY_VISIBILITY
void blake3_compress_in_place_avx512(uint32_t cv[8],
                                     const uint8_t block[BLAKE3_BLOCK_LEN],
                                     uint8_t block_len, uint64_t counter,
                                     uint8_t flags);

LLVM_LIBRARY_VISIBILITY
void blake3_compress_xof_avx512(const uint32_t cv[8],
                                const uint8_t block[BLAKE3_BLOCK_LEN],
                                uint8_t block_len, uint64_t counter,
                                uint8_t flags, uint8_t out[64]);

LLVM_LIBRARY_VISIBILITY
void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs,
                             size_t blocks, const uint32_t key[8],
                             uint64_t counter, bool increment_counter,
                             uint8_t flags, uint8_t flags_start,
                             uint8_t flags_end, uint8_t *out);
#endif
#endif

// NEON variant; declared only when BLAKE3_USE_NEON is 1. Only hash_many has
// a NEON-specific prototype in this header.
#if BLAKE3_USE_NEON == 1
LLVM_LIBRARY_VISIBILITY
void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs,
                           size_t blocks, const uint32_t key[8],
                           uint64_t counter, bool increment_counter,
                           uint8_t flags, uint8_t flags_start,
                           uint8_t flags_end, uint8_t *out);
#endif


#endif /* BLAKE3_IMPL_H */