| #include <stdbool.h> |
| #include <stddef.h> |
| #include <stdint.h> |
| |
| #include "blake3_impl.h" |
| |
| #if defined(IS_X86) |
| #if defined(_MSC_VER) |
| #include <intrin.h> |
| #elif defined(__GNUC__) |
| #include <immintrin.h> |
| #else |
| #error "Unimplemented!" |
| #endif |
| #endif |
| |
| #define MAYBE_UNUSED(x) (void)((x)) |
| |
| #if defined(IS_X86) |
| static uint64_t xgetbv(void) { |
| #if defined(_MSC_VER) |
| return _xgetbv(0); |
| #else |
| uint32_t eax = 0, edx = 0; |
| __asm__ __volatile__("xgetbv\n" : "=a"(eax), "=d"(edx) : "c"(0)); |
| return ((uint64_t)edx << 32) | eax; |
| #endif |
| } |
| |
| static void cpuid(uint32_t out[4], uint32_t id) { |
| #if defined(_MSC_VER) |
| __cpuid((int *)out, id); |
| #elif defined(__i386__) || defined(_M_IX86) |
| __asm__ __volatile__("movl %%ebx, %1\n" |
| "cpuid\n" |
| "xchgl %1, %%ebx\n" |
| : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3]) |
| : "a"(id)); |
| #else |
| __asm__ __volatile__("cpuid\n" |
| : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) |
| : "a"(id)); |
| #endif |
| } |
| |
| static void cpuidex(uint32_t out[4], uint32_t id, uint32_t sid) { |
| #if defined(_MSC_VER) |
| __cpuidex((int *)out, id, sid); |
| #elif defined(__i386__) || defined(_M_IX86) |
| __asm__ __volatile__("movl %%ebx, %1\n" |
| "cpuid\n" |
| "xchgl %1, %%ebx\n" |
| : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3]) |
| : "a"(id), "c"(sid)); |
| #else |
| __asm__ __volatile__("cpuid\n" |
| : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) |
| : "a"(id), "c"(sid)); |
| #endif |
| } |
| |
| #endif |
| |
| enum cpu_feature { |
| SSE2 = 1 << 0, |
| SSSE3 = 1 << 1, |
| SSE41 = 1 << 2, |
| AVX = 1 << 3, |
| AVX2 = 1 << 4, |
| AVX512F = 1 << 5, |
| AVX512VL = 1 << 6, |
| /* ... */ |
| UNDEFINED = 1 << 30 |
| }; |
| |
| #if !defined(BLAKE3_TESTING) |
| static /* Allow the variable to be controlled manually for testing */ |
| #endif |
| enum cpu_feature g_cpu_features = UNDEFINED; |
| |
| LLVM_ATTRIBUTE_USED |
| #if !defined(BLAKE3_TESTING) |
| static |
| #endif |
| enum cpu_feature |
| get_cpu_features(void) { |
| |
| if (g_cpu_features != UNDEFINED) { |
| return g_cpu_features; |
| } else { |
| #if defined(IS_X86) |
| uint32_t regs[4] = {0}; |
| uint32_t *eax = ®s[0], *ebx = ®s[1], *ecx = ®s[2], *edx = ®s[3]; |
| (void)edx; |
| enum cpu_feature features = 0; |
| cpuid(regs, 0); |
| const int max_id = *eax; |
| cpuid(regs, 1); |
| #if defined(__amd64__) || defined(_M_X64) |
| features |= SSE2; |
| #else |
| if (*edx & (1UL << 26)) |
| features |= SSE2; |
| #endif |
| if (*ecx & (1UL << 0)) |
| features |= SSSE3; |
| if (*ecx & (1UL << 19)) |
| features |= SSE41; |
| |
| if (*ecx & (1UL << 27)) { // OSXSAVE |
| const uint64_t mask = xgetbv(); |
| if ((mask & 6) == 6) { // SSE and AVX states |
| if (*ecx & (1UL << 28)) |
| features |= AVX; |
| if (max_id >= 7) { |
| cpuidex(regs, 7, 0); |
| if (*ebx & (1UL << 5)) |
| features |= AVX2; |
| if ((mask & 224) == 224) { // Opmask, ZMM_Hi256, Hi16_Zmm |
| if (*ebx & (1UL << 31)) |
| features |= AVX512VL; |
| if (*ebx & (1UL << 16)) |
| features |= AVX512F; |
| } |
| } |
| } |
| } |
| g_cpu_features = features; |
| return features; |
| #else |
| /* How to detect NEON? */ |
| return 0; |
| #endif |
| } |
| } |
| |
| void blake3_compress_in_place(uint32_t cv[8], |
| const uint8_t block[BLAKE3_BLOCK_LEN], |
| uint8_t block_len, uint64_t counter, |
| uint8_t flags) { |
| #if defined(IS_X86) |
| const enum cpu_feature features = get_cpu_features(); |
| MAYBE_UNUSED(features); |
| #if !defined(BLAKE3_NO_AVX512) |
| if (features & AVX512VL) { |
| blake3_compress_in_place_avx512(cv, block, block_len, counter, flags); |
| return; |
| } |
| #endif |
| #if !defined(BLAKE3_NO_SSE41) |
| if (features & SSE41) { |
| blake3_compress_in_place_sse41(cv, block, block_len, counter, flags); |
| return; |
| } |
| #endif |
| #if !defined(BLAKE3_NO_SSE2) |
| if (features & SSE2) { |
| blake3_compress_in_place_sse2(cv, block, block_len, counter, flags); |
| return; |
| } |
| #endif |
| #endif |
| blake3_compress_in_place_portable(cv, block, block_len, counter, flags); |
| } |
| |
| void blake3_compress_xof(const uint32_t cv[8], |
| const uint8_t block[BLAKE3_BLOCK_LEN], |
| uint8_t block_len, uint64_t counter, uint8_t flags, |
| uint8_t out[64]) { |
| #if defined(IS_X86) |
| const enum cpu_feature features = get_cpu_features(); |
| MAYBE_UNUSED(features); |
| #if !defined(BLAKE3_NO_AVX512) |
| if (features & AVX512VL) { |
| blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out); |
| return; |
| } |
| #endif |
| #if !defined(BLAKE3_NO_SSE41) |
| if (features & SSE41) { |
| blake3_compress_xof_sse41(cv, block, block_len, counter, flags, out); |
| return; |
| } |
| #endif |
| #if !defined(BLAKE3_NO_SSE2) |
| if (features & SSE2) { |
| blake3_compress_xof_sse2(cv, block, block_len, counter, flags, out); |
| return; |
| } |
| #endif |
| #endif |
| blake3_compress_xof_portable(cv, block, block_len, counter, flags, out); |
| } |
| |
| void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs, |
| size_t blocks, const uint32_t key[8], uint64_t counter, |
| bool increment_counter, uint8_t flags, |
| uint8_t flags_start, uint8_t flags_end, uint8_t *out) { |
| #if defined(IS_X86) |
| const enum cpu_feature features = get_cpu_features(); |
| MAYBE_UNUSED(features); |
| #if !defined(BLAKE3_NO_AVX512) |
| if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) { |
| blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter, |
| increment_counter, flags, flags_start, flags_end, |
| out); |
| return; |
| } |
| #endif |
| #if !defined(BLAKE3_NO_AVX2) |
| if (features & AVX2) { |
| blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter, |
| increment_counter, flags, flags_start, flags_end, |
| out); |
| return; |
| } |
| #endif |
| #if !defined(BLAKE3_NO_SSE41) |
| if (features & SSE41) { |
| blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter, |
| increment_counter, flags, flags_start, flags_end, |
| out); |
| return; |
| } |
| #endif |
| #if !defined(BLAKE3_NO_SSE2) |
| if (features & SSE2) { |
| blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter, |
| increment_counter, flags, flags_start, flags_end, |
| out); |
| return; |
| } |
| #endif |
| #endif |
| |
| #if BLAKE3_USE_NEON == 1 |
| blake3_hash_many_neon(inputs, num_inputs, blocks, key, counter, |
| increment_counter, flags, flags_start, flags_end, out); |
| return; |
| #endif |
| |
| blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter, |
| increment_counter, flags, flags_start, flags_end, |
| out); |
| } |
| |
| // The dynamically detected SIMD degree of the current platform. |
| size_t blake3_simd_degree(void) { |
| #if defined(IS_X86) |
| const enum cpu_feature features = get_cpu_features(); |
| MAYBE_UNUSED(features); |
| #if !defined(BLAKE3_NO_AVX512) |
| if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) { |
| return 16; |
| } |
| #endif |
| #if !defined(BLAKE3_NO_AVX2) |
| if (features & AVX2) { |
| return 8; |
| } |
| #endif |
| #if !defined(BLAKE3_NO_SSE41) |
| if (features & SSE41) { |
| return 4; |
| } |
| #endif |
| #if !defined(BLAKE3_NO_SSE2) |
| if (features & SSE2) { |
| return 4; |
| } |
| #endif |
| #endif |
| #if BLAKE3_USE_NEON == 1 |
| return 4; |
| #endif |
| return 1; |
| } |