blake3_dispatch.c
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#include "blake3_impl.h"

#if defined(_MSC_VER)
#include <Windows.h>
#endif

#if defined(IS_X86)
#if defined(_MSC_VER)
#include <intrin.h>
#elif defined(__GNUC__)
#include <immintrin.h>
#else
#undef IS_X86 /* Unimplemented! */
#endif
#endif

#if !defined(BLAKE3_ATOMICS)
#if defined(__has_include)
#if __has_include(<stdatomic.h>) && !defined(_MSC_VER)
#define BLAKE3_ATOMICS 1
#else
#define BLAKE3_ATOMICS 0
#endif /* __has_include(<stdatomic.h>) && !defined(_MSC_VER) */
#else
#define BLAKE3_ATOMICS 0
#endif /* defined(__has_include) */
#endif /* BLAKE3_ATOMICS */

#if BLAKE3_ATOMICS
#define ATOMIC_INT _Atomic int
#define ATOMIC_LOAD(x) x
#define ATOMIC_STORE(x, y) x = y
#elif defined(_MSC_VER)
#define ATOMIC_INT LONG
#define ATOMIC_LOAD(x) InterlockedOr(&x, 0)
#define ATOMIC_STORE(x, y) InterlockedExchange(&x, y)
#else
#define ATOMIC_INT int
#define ATOMIC_LOAD(x) x
#define ATOMIC_STORE(x, y) x = y
#endif

#define MAYBE_UNUSED(x) (void)((x))

#if defined(IS_X86)
/* Read XCR0 to learn which register states the OS saves and restores. */
static uint64_t xgetbv(void) {
#if defined(_MSC_VER)
  return _xgetbv(0);
#else
  uint32_t eax = 0, edx = 0;
  __asm__ __volatile__("xgetbv\n" : "=a"(eax), "=d"(edx) : "c"(0));
  return ((uint64_t)edx << 32) | eax;
#endif
}

/* CPUID leaf query. The 32-bit inline-asm variant preserves EBX, which may be
   reserved as the PIC register. */
static void cpuid(uint32_t out[4], uint32_t id) {
#if defined(_MSC_VER)
  __cpuid((int *)out, id);
#elif defined(__i386__) || defined(_M_IX86)
  __asm__ __volatile__("movl %%ebx, %1\n"
                       "cpuid\n"
                       "xchgl %1, %%ebx\n"
                       : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3])
                       : "a"(id));
#else
  __asm__ __volatile__("cpuid\n"
                       : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3])
                       : "a"(id));
#endif
}

/* CPUID leaf/subleaf query (leaf 7, subleaf 0 is used below for the
   AVX2/AVX-512 feature bits). */
static void cpuidex(uint32_t out[4], uint32_t id, uint32_t sid) {
#if defined(_MSC_VER)
  __cpuidex((int *)out, id, sid);
#elif defined(__i386__) || defined(_M_IX86)
  __asm__ __volatile__("movl %%ebx, %1\n"
                       "cpuid\n"
                       "xchgl %1, %%ebx\n"
                       : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3])
                       : "a"(id), "c"(sid));
#else
  __asm__ __volatile__("cpuid\n"
                       : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3])
                       : "a"(id), "c"(sid));
#endif
}

#endif

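/*
 * Illustrative sketch (not part of the original file): how the helpers above
 * are typically combined. Leaf 0 reports the highest supported standard leaf
 * in EAX (out[0]); get_cpu_features() below checks this before querying
 * leaf 7. The helper name here is hypothetical.
 */
#if defined(IS_X86)
static inline uint32_t max_standard_cpuid_leaf(void) {
  uint32_t regs[4] = {0};
  cpuid(regs, 0);
  return regs[0]; /* EAX = highest supported standard CPUID leaf */
}
#endif
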
enum cpu_feature {
  SSE2 = 1 << 0,
  SSSE3 = 1 << 1,
  SSE41 = 1 << 2,
  AVX = 1 << 3,
  AVX2 = 1 << 4,
  AVX512F = 1 << 5,
  AVX512VL = 1 << 6,
  /* ... */
  UNDEFINED = 1 << 30
};

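/*
 * Illustrative sketch (not part of the original file): the enum values are
 * single bits, so "does the CPU have all of X and Y" is a masked compare.
 * The AVX-512 dispatch below uses exactly this pattern with
 * AVX512F | AVX512VL. The helper name is hypothetical.
 */
static inline bool has_all_features(enum cpu_feature features,
                                    enum cpu_feature wanted) {
  return (features & wanted) == wanted;
}
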
#if !defined(BLAKE3_TESTING)
static /* Allow the variable to be controlled manually for testing */
#endif
    ATOMIC_INT g_cpu_features = UNDEFINED;

LLVM_ATTRIBUTE_USED
#if !defined(BLAKE3_TESTING)
static
#endif
    enum cpu_feature
    get_cpu_features(void) {

  /* If TSAN detects a data race here, try compiling with -DBLAKE3_ATOMICS=1 */
  enum cpu_feature features = ATOMIC_LOAD(g_cpu_features);
  if (features != UNDEFINED) {
    return features;
  } else {
#if defined(IS_X86)
    uint32_t regs[4] = {0};
    uint32_t *eax = &regs[0], *ebx = &regs[1], *ecx = &regs[2], *edx = &regs[3];
    (void)edx;
    features = 0;
    cpuid(regs, 0);
    const int max_id = *eax;
    cpuid(regs, 1);
#if defined(__amd64__) || defined(_M_X64)
    features |= SSE2;
#else
    if (*edx & (1UL << 26))
      features |= SSE2;
#endif
    if (*ecx & (1UL << 9))
      features |= SSSE3;
    if (*ecx & (1UL << 19))
      features |= SSE41;

    if (*ecx & (1UL << 27)) { // OSXSAVE
      const uint64_t mask = xgetbv();
      if ((mask & 6) == 6) { // SSE and AVX states
        if (*ecx & (1UL << 28))
          features |= AVX;
        if (max_id >= 7) {
          cpuidex(regs, 7, 0);
          if (*ebx & (1UL << 5))
            features |= AVX2;
          if ((mask & 224) == 224) { // Opmask, ZMM_Hi256, Hi16_Zmm
            if (*ebx & (1UL << 31))
              features |= AVX512VL;
            if (*ebx & (1UL << 16))
              features |= AVX512F;
          }
        }
      }
    }
    ATOMIC_STORE(g_cpu_features, features);
    return features;
#else
    /* How to detect NEON? */
    return 0;
#endif
  }
}
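
/*
 * Illustrative sketch (not part of the original file): when built with
 * -DBLAKE3_TESTING, g_cpu_features above is non-static, so a test harness can
 * pin dispatch to a known code path before any hashing runs. The function
 * name is hypothetical.
 */
#if defined(BLAKE3_TESTING)
void blake3_force_portable_dispatch_for_testing(void) {
  /* A zero mask reads as "no SIMD extensions detected", so the x86 feature
     checks below all fail and dispatch falls through to the *_portable
     implementations. */
  ATOMIC_STORE(g_cpu_features, 0);
}
#endif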

void blake3_compress_in_place(uint32_t cv[8],
                              const uint8_t block[BLAKE3_BLOCK_LEN],
                              uint8_t block_len, uint64_t counter,
                              uint8_t flags) {
#if defined(IS_X86)
  const enum cpu_feature features = get_cpu_features();
  MAYBE_UNUSED(features);
#if !defined(BLAKE3_NO_AVX512)
  if (features & AVX512VL) {
    blake3_compress_in_place_avx512(cv, block, block_len, counter, flags);
    return;
  }
#endif
#if !defined(BLAKE3_NO_SSE41)
  if (features & SSE41) {
    blake3_compress_in_place_sse41(cv, block, block_len, counter, flags);
    return;
  }
#endif
#if !defined(BLAKE3_NO_SSE2)
  if (features & SSE2) {
    blake3_compress_in_place_sse2(cv, block, block_len, counter, flags);
    return;
  }
#endif
#endif
  blake3_compress_in_place_portable(cv, block, block_len, counter, flags);
}

void blake3_compress_xof(const uint32_t cv[8],
                         const uint8_t block[BLAKE3_BLOCK_LEN],
                         uint8_t block_len, uint64_t counter, uint8_t flags,
                         uint8_t out[64]) {
#if defined(IS_X86)
  const enum cpu_feature features = get_cpu_features();
  MAYBE_UNUSED(features);
#if !defined(BLAKE3_NO_AVX512)
  if (features & AVX512VL) {
    blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out);
    return;
  }
#endif
#if !defined(BLAKE3_NO_SSE41)
  if (features & SSE41) {
    blake3_compress_xof_sse41(cv, block, block_len, counter, flags, out);
    return;
  }
#endif
#if !defined(BLAKE3_NO_SSE2)
  if (features & SSE2) {
    blake3_compress_xof_sse2(cv, block, block_len, counter, flags, out);
    return;
  }
#endif
#endif
  blake3_compress_xof_portable(cv, block, block_len, counter, flags, out);
}

void blake3_xof_many(const uint32_t cv[8],
                     const uint8_t block[BLAKE3_BLOCK_LEN],
                     uint8_t block_len, uint64_t counter, uint8_t flags,
                     uint8_t out[64], size_t outblocks) {
  if (outblocks == 0) {
    // The current assembly implementation always outputs at least 1 block.
    return;
  }
#if defined(IS_X86)
  const enum cpu_feature features = get_cpu_features();
  MAYBE_UNUSED(features);
#if !defined(_WIN32) && !defined(__CYGWIN__) && !defined(BLAKE3_NO_AVX512)
  if (features & AVX512VL) {
    blake3_xof_many_avx512(cv, block, block_len, counter, flags, out, outblocks);
    return;
  }
#endif
#endif
  for (size_t i = 0; i < outblocks; ++i) {
    blake3_compress_xof(cv, block, block_len, counter + i, flags, out + 64 * i);
  }
}

void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
                      size_t blocks, const uint32_t key[8], uint64_t counter,
                      bool increment_counter, uint8_t flags,
                      uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
#if defined(IS_X86)
  const enum cpu_feature features = get_cpu_features();
  MAYBE_UNUSED(features);
#if !defined(BLAKE3_NO_AVX512)
  if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) {
    blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter,
                            increment_counter, flags, flags_start, flags_end,
                            out);
    return;
  }
#endif
#if !defined(BLAKE3_NO_AVX2)
  if (features & AVX2) {
    blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter,
                          increment_counter, flags, flags_start, flags_end,
                          out);
    return;
  }
#endif
#if !defined(BLAKE3_NO_SSE41)
  if (features & SSE41) {
    blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter,
                           increment_counter, flags, flags_start, flags_end,
                           out);
    return;
  }
#endif
#if !defined(BLAKE3_NO_SSE2)
  if (features & SSE2) {
    blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter,
                          increment_counter, flags, flags_start, flags_end,
                          out);
    return;
  }
#endif
#endif

#if BLAKE3_USE_NEON == 1
  blake3_hash_many_neon(inputs, num_inputs, blocks, key, counter,
                        increment_counter, flags, flags_start, flags_end, out);
  return;
#endif

  blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter,
                            increment_counter, flags, flags_start, flags_end,
                            out);
}

// The dynamically detected SIMD degree of the current platform.
size_t blake3_simd_degree(void) {
#if defined(IS_X86)
  const enum cpu_feature features = get_cpu_features();
  MAYBE_UNUSED(features);
#if !defined(BLAKE3_NO_AVX512)
  if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) {
    return 16;
  }
#endif
#if !defined(BLAKE3_NO_AVX2)
  if (features & AVX2) {
    return 8;
  }
#endif
#if !defined(BLAKE3_NO_SSE41)
  if (features & SSE41) {
    return 4;
  }
#endif
#if !defined(BLAKE3_NO_SSE2)
  if (features & SSE2) {
    return 4;
  }
#endif
#endif
#if BLAKE3_USE_NEON == 1
  return 4;
#endif
  return 1;
}
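
/*
 * Illustrative sketch (not part of the original file): callers use the SIMD
 * degree to size batches for blake3_hash_many(), since the widest available
 * kernel hashes that many inputs per pass (16 with AVX-512, 8 with AVX2,
 * 4 with SSE or NEON, 1 portable). The helper name is hypothetical.
 */
static inline size_t blake3_example_batch_size(size_t num_chunks) {
  const size_t degree = blake3_simd_degree();
  /* Hash full batches of `degree` chunks; any remainder goes in a smaller
     final blake3_hash_many() call. */
  return num_chunks < degree ? num_chunks : degree;
}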