#ifndef __has_builtin
  #define __has_builtin(x) 0
#endif

#ifndef __has_attribute
  #define __has_attribute(x) 0
#endif

#if defined(__GNUC__)
  #define GNUC_PREREQ(x, y) \
      (__GNUC__ > x || (__GNUC__ == x && __GNUC_MINOR__ >= y))
#else
  #define GNUC_PREREQ(x, y) 0
#endif

#if defined(__clang__)
  #define CLANG_PREREQ(x, y) \
      (__clang_major__ > x || (__clang_major__ == x && __clang_minor__ >= y))
#else
  #define CLANG_PREREQ(x, y) 0
#endif

#if (defined(__i386__) || \
     defined(__x86_64__) || \
     defined(_M_IX86) || \
     defined(_M_X64))
  #define X86_OR_X64
#endif

#if defined(X86_OR_X64) && \
   (defined(__cplusplus) || \
   (GNUC_PREREQ(4, 2) || \
    __has_builtin(__sync_val_compare_and_swap)))
  #define HAVE_CPUID
#endif

#if GNUC_PREREQ(4, 2) || \
    __has_builtin(__builtin_popcount)
  #define HAVE_BUILTIN_POPCOUNT
#endif

#if GNUC_PREREQ(4, 2) || \
    CLANG_PREREQ(3, 0) /* assumed Clang cutoff */
  #define HAVE_ASM_POPCNT
#endif

#if defined(HAVE_CPUID) && \
   (defined(HAVE_ASM_POPCNT) || \
    defined(_MSC_VER))
  #define HAVE_POPCNT
#endif

#if defined(HAVE_CPUID) && \
    GNUC_PREREQ(4, 9) /* assumed GCC cutoff for AVX2 target support */
  #define HAVE_AVX2
#endif

#if defined(HAVE_CPUID) && \
    CLANG_PREREQ(3, 8) && \
    __has_attribute(target) && \
   (!defined(_MSC_VER) || defined(__AVX2__)) && \
   (!defined(__apple_build_version__) || __apple_build_version__ >= 8000000)
  #define HAVE_AVX2
#endif

/*
 * Portable 64-bit popcount: bit-twiddling (SWAR) algorithm,
 * see https://en.wikipedia.org/wiki/Hamming_weight
 */
static inline uint64_t popcount64(uint64_t x)
{
  uint64_t m1 = 0x5555555555555555ll;
  uint64_t m2 = 0x3333333333333333ll;
  uint64_t m4 = 0x0F0F0F0F0F0F0F0Fll;
  uint64_t h01 = 0x0101010101010101ll;

  x -= (x >> 1) & m1;
  x = (x & m2) + ((x >> 2) & m2);
  x = (x + (x >> 4)) & m4;

  return (x * h01) >> 56;
}
#if defined(HAVE_ASM_POPCNT) && \
    defined(__x86_64__)

static inline uint64_t popcnt64(uint64_t x)
{
  __asm__ ("popcnt %1, %0"
           : "=r" (x)
           : "0" (x));
  return x;
}
#elif defined(HAVE_ASM_POPCNT) && \
      defined(__i386__)

static inline uint32_t popcnt32(uint32_t x)
{
  __asm__ ("popcnt %1, %0"
           : "=r" (x)
           : "0" (x));
  return x;
}
static inline uint64_t popcnt64(uint64_t x)
{
  return popcnt32((uint32_t) x) +
         popcnt32((uint32_t)(x >> 32));
}
#elif defined(_MSC_VER) && \
      defined(_M_X64)

#include <nmmintrin.h>

static inline uint64_t popcnt64(uint64_t x)
{
  return _mm_popcnt_u64(x);
}
#elif defined(_MSC_VER) && \
      defined(_M_IX86)

#include <nmmintrin.h>

static inline uint64_t popcnt64(uint64_t x)
{
  return _mm_popcnt_u32((uint32_t) x) +
         _mm_popcnt_u32((uint32_t)(x >> 32));
}
/* non-x86 CPUs with a compiler builtin */
#elif defined(HAVE_BUILTIN_POPCOUNT)

static inline uint64_t popcnt64(uint64_t x)
{
  return __builtin_popcountll(x);
}
/* no hardware POPCNT: fall back to the pure integer algorithm */
#else

static inline uint64_t popcnt64(uint64_t x)
{
  return popcount64(x);
}

#endif
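/*
 * Count the 1 bits in an array of 64-bit words; the loop is unrolled by
 * four to expose instruction-level parallelism, the tail is handled one
 * word at a time.
 */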
static inline uint64_t popcnt64_unrolled(const uint64_t* data, uint64_t size)
{
  uint64_t i = 0;
  uint64_t limit = size - size % 4;
  uint64_t cnt = 0;

  for (; i < limit; i += 4)
  {
    cnt += popcnt64(data[i+0]);
    cnt += popcnt64(data[i+1]);
    cnt += popcnt64(data[i+2]);
    cnt += popcnt64(data[i+3]);
  }
  for (; i < size; i++)
    cnt += popcnt64(data[i]);

  return cnt;
}
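/*
 * Runtime CPU feature detection via CPUID: bit_POPCNT is the POPCNT flag
 * in ECX of CPUID leaf 1, bit_AVX2 the AVX2 flag in EBX of CPUID leaf 7,
 * so get_cpuid() can return both OR-ed into a single bit mask.
 */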
#if defined(HAVE_CPUID)

#if defined(_MSC_VER)
  #include <intrin.h>
  #include <immintrin.h>
#endif

#define bit_POPCNT (1 << 23)
#define bit_AVX2 (1 << 5)

#define XSTATE_SSE (1 << 1)
#define XSTATE_YMM (1 << 2)
static inline void run_cpuid(int eax, int ecx, int* abcd)
{
#if defined(_MSC_VER)
  __cpuidex(abcd, eax, ecx);
#elif defined(__i386__) && \
      defined(__PIC__)
  /* 32-bit PIC: EBX is reserved, preserve it around CPUID */
  __asm__ ("movl %%ebx, %%edi;"
           "cpuid;"
           "xchgl %%ebx, %%edi;"
           : "=D" (abcd[1]),
             "+a" (eax),
             "+c" (ecx),
             "=d" (abcd[3]));
  abcd[0] = eax;
  abcd[2] = ecx;
#else
  __asm__ ("cpuid"
           : "=b" (abcd[1]),
             "+a" (eax),
             "+c" (ecx),
             "=d" (abcd[3]));
  abcd[0] = eax;
  abcd[2] = ecx;
#endif
}

static inline int has_POPCNT()
{
  int abcd[4];

  run_cpuid(1, 0, abcd);
  if ((abcd[2] & bit_POPCNT) != bit_POPCNT)
    return 0;

  return bit_POPCNT;
}
#if defined(HAVE_AVX2)

static inline int check_xcr0_ymm()
{
  int xcr0;
  int mask = XSTATE_SSE | XSTATE_YMM;
#if defined(_MSC_VER)
  xcr0 = (int) _xgetbv(0);
#else
  __asm__ ("xgetbv"
           : "=a" (xcr0)
           : "c" (0)
           : "%edx");
#endif
  return (xcr0 & mask) == mask;
}
static inline int has_AVX2()
{
  int abcd[4];
  int osxsave_mask = (1 << 27);
  /* ensure the OS saves/restores AVX state (OSXSAVE) */
  run_cpuid(1, 0, abcd);
  if ((abcd[2] & osxsave_mask) != osxsave_mask)
    return 0;
  if (!check_xcr0_ymm())
    return 0;
  run_cpuid(7, 0, abcd);
  if ((abcd[1] & bit_AVX2) != bit_AVX2)
    return 0;

  return bit_AVX2;
}

#endif
static inline int get_cpuid()
{
#if defined(HAVE_AVX2)
  return has_POPCNT() | has_AVX2();
#else
  return has_POPCNT();
#endif
}

#endif /* HAVE_CPUID */
#if defined(HAVE_AVX2)

#include <immintrin.h>

__attribute__ ((target ("avx2")))
static inline void CSA256(__m256i* h, __m256i* l, __m256i a, __m256i b, __m256i c)
{
  __m256i u = a ^ b;
  *h = (a & b) | (u & c);
  *l = u ^ c;
}
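/*
 * popcnt256(): 4-bit lookup popcount. lookup1 holds popcount(nibble) + 4
 * and lookup2 holds 4 - popcount(nibble), so the byte-wise absolute
 * difference computed by _mm256_sad_epu8 equals popcount(lo) + popcount(hi),
 * summed into the four 64-bit lanes.
 */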
__attribute__ ((target ("avx2")))
static inline __m256i popcnt256(__m256i v)
{
  __m256i lookup1 = _mm256_setr_epi8(
      4, 5, 5, 6, 5, 6, 6, 7,
      5, 6, 6, 7, 6, 7, 7, 8,
      4, 5, 5, 6, 5, 6, 6, 7,
      5, 6, 6, 7, 6, 7, 7, 8);
  __m256i lookup2 = _mm256_setr_epi8(
      4, 3, 3, 2, 3, 2, 2, 1,
      3, 2, 2, 1, 2, 1, 1, 0,
      4, 3, 3, 2, 3, 2, 2, 1,
      3, 2, 2, 1, 2, 1, 1, 0);
  __m256i low_mask = _mm256_set1_epi8(0x0f);
  __m256i lo = v & low_mask;
  __m256i hi = _mm256_srli_epi16(v, 4) & low_mask;
  __m256i popcnt1 = _mm256_shuffle_epi8(lookup1, lo);
  __m256i popcnt2 = _mm256_shuffle_epi8(lookup2, hi);

  return _mm256_sad_epu8(popcnt1, popcnt2);
}
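/*
 * Harley-Seal popcount: the CSA256 tree compresses 16 input vectors per
 * iteration so popcnt256() runs only once per 16 vectors; after the loop
 * the partial counters (ones, twos, fours, eights) are folded in with
 * their respective weights.
 */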
__attribute__ ((target ("avx2")))
static inline uint64_t popcnt_avx2(const __m256i* data, uint64_t size)
{
  __m256i cnt = _mm256_setzero_si256();
  __m256i ones = _mm256_setzero_si256();
  __m256i twos = _mm256_setzero_si256();
  __m256i fours = _mm256_setzero_si256();
  __m256i eights = _mm256_setzero_si256();
  __m256i sixteens = _mm256_setzero_si256();
  __m256i twosA, twosB, foursA, foursB, eightsA, eightsB;

  uint64_t i = 0;
  uint64_t limit = size - size % 16;
  uint64_t* cnt64;

  for (; i < limit; i += 16)
  {
    CSA256(&twosA, &ones, ones, data[i+0], data[i+1]);
    CSA256(&twosB, &ones, ones, data[i+2], data[i+3]);
    CSA256(&foursA, &twos, twos, twosA, twosB);
    CSA256(&twosA, &ones, ones, data[i+4], data[i+5]);
    CSA256(&twosB, &ones, ones, data[i+6], data[i+7]);
    CSA256(&foursB, &twos, twos, twosA, twosB);
    CSA256(&eightsA, &fours, fours, foursA, foursB);
    CSA256(&twosA, &ones, ones, data[i+8], data[i+9]);
    CSA256(&twosB, &ones, ones, data[i+10], data[i+11]);
    CSA256(&foursA, &twos, twos, twosA, twosB);
    CSA256(&twosA, &ones, ones, data[i+12], data[i+13]);
    CSA256(&twosB, &ones, ones, data[i+14], data[i+15]);
    CSA256(&foursB, &twos, twos, twosA, twosB);
    CSA256(&eightsB, &fours, fours, foursA, foursB);
    CSA256(&sixteens, &eights, eights, eightsA, eightsB);
    cnt = _mm256_add_epi64(cnt, popcnt256(sixteens));
  }
  cnt = _mm256_slli_epi64(cnt, 4);
  cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(popcnt256(eights), 3));
  cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(popcnt256(fours), 2));
  cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(popcnt256(twos), 1));
  cnt = _mm256_add_epi64(cnt, popcnt256(ones));
  for (; i < size; i++)
    cnt = _mm256_add_epi64(cnt, popcnt256(data[i]));
  cnt64 = (uint64_t*) &cnt;

  return cnt64[0] +
         cnt64[1] +
         cnt64[2] +
         cnt64[3];
}
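/*
 * Advance the pointer to a 32-byte boundary so the AVX2 kernel can use
 * aligned 256-bit loads; leading bytes and words are counted with the
 * scalar popcnt64().
 */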
static inline void align_avx2(const uint8_t** p, uint64_t* size, uint64_t* cnt)
{
  for (; (uintptr_t) *p % 8; (*p)++)
  {
    *cnt += popcnt64(**p);
    *size -= 1;
  }
  for (; (uintptr_t) *p % 32; (*p) += 8)
  {
    *cnt += popcnt64(*(const uint64_t*) *p);
    *size -= 8;
  }
}

#endif
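/*
 * popcnt(data, size) counts the 1 bits in an arbitrary byte buffer. It is
 * compiled in one of three variants: x86/x64 with runtime dispatch between
 * AVX2, hardware POPCNT and the portable algorithm; ARM NEON; or a generic
 * fallback for all other CPUs.
 */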
#if defined(X86_OR_X64)

static inline uint64_t popcnt(const void* data, uint64_t size)
{
  const uint8_t* ptr = (const uint8_t*) data;
  uint64_t cnt = 0;
  uint64_t i;
#if defined(HAVE_CPUID)
#if defined(__cplusplus)
  /* C++11: static initialization is thread-safe */
  static const int cpuid = get_cpuid();
#else
  static int cpuid_ = -1;
  int cpuid = cpuid_;

  if (cpuid == -1)
  {
    cpuid = get_cpuid();
    __sync_val_compare_and_swap(&cpuid_, -1, cpuid);
  }
#endif
#endif
#if defined(HAVE_AVX2)

  /* AVX2 pays off only for larger buffers
   * (the 512-byte cutoff below is an assumed value) */
  if ((cpuid & bit_AVX2) &&
      size >= 512)
  {
    align_avx2(&ptr, &size, &cnt);
    cnt += popcnt_avx2((const __m256i*) ptr, size / 32);
    ptr += size - size % 32;
    size = size % 32;
  }

#endif
#if defined(HAVE_POPCNT)

  if (cpuid & bit_POPCNT)
  {
    cnt += popcnt64_unrolled((const uint64_t*) ptr, size / 8);
    ptr += size - size % 8;
    size = size % 8;

    for (i = 0; i < size; i++)
      cnt += popcnt64(ptr[i]);

    return cnt;
  }

#endif
  /* pure integer popcount algorithm */
  for (i = 0; i < size; i++)
    cnt += popcount64(ptr[i]);

  return cnt;
}
#elif defined(__ARM_NEON) || \
      defined(__aarch64__)

#include <arm_neon.h>

/*
 * ARM NEON popcount: vcntq_u8 counts bits per byte, eight 16-byte
 * vectors are summed per 128-byte chunk and widened into a running
 * 64-bit x 2 accumulator.
 */
static inline uint64_t popcnt(const void* data, uint64_t size)
{
  const uint8_t* ptr = (const uint8_t*) data;
  uint64_t cnt = 0;
  uint64_t i = 0;
  uint64_t chunk_size = 128;
  uint64_t n = size / chunk_size;
  uint64_t tmp[2];

  uint8x16x4_t input0, input1;
  uint8x16_t t0;
  uint32x4_t t1;
  uint64x2_t sum = vcombine_u64(vcreate_u64(0), vcreate_u64(0));

  for (i = 0; i < n; i++, ptr += chunk_size)
  {
    input0 = vld4q_u8(ptr);
    input1 = vld4q_u8(ptr + 64);

    t0 = vcntq_u8(input0.val[0]);
    t0 = vaddq_u8(t0, vcntq_u8(input0.val[1]));
    t0 = vaddq_u8(t0, vcntq_u8(input0.val[2]));
    t0 = vaddq_u8(t0, vcntq_u8(input0.val[3]));
    t0 = vaddq_u8(t0, vcntq_u8(input1.val[0]));
    t0 = vaddq_u8(t0, vcntq_u8(input1.val[1]));
    t0 = vaddq_u8(t0, vcntq_u8(input1.val[2]));
    t0 = vaddq_u8(t0, vcntq_u8(input1.val[3]));
    t1 = vpaddlq_u16(vpaddlq_u8(t0));
    sum = vpadalq_u32(sum, t1);
  }
  vst1q_u64(tmp, sum);

  for (i = 0; i < 2; i++)
    cnt += tmp[i];

  size %= chunk_size;
  cnt += popcnt64_unrolled((const uint64_t*) ptr, size / 8);
  ptr += size - size % 8;
  size = size % 8;
  for (i = 0; i < size; i++)
    cnt += popcnt64(ptr[i]);

  return cnt;
}

#else

/* all other CPUs: portable fallback */
static inline void align(const uint8_t** p, uint64_t* size, uint64_t* cnt)
{
  for (; *size > 0 && (uintptr_t) *p % 8; (*p)++)
  {
    *cnt += popcnt64(**p);
    *size -= 1;
  }
}
static inline uint64_t popcnt(const void* data, uint64_t size)
{
  const uint8_t* ptr = (const uint8_t*) data;
  uint64_t cnt = 0;
  uint64_t i;
  align(&ptr, &size, &cnt);
  cnt += popcnt64_unrolled((const uint64_t*) ptr, size / 8);
  ptr += size - size % 8;
  size = size % 8;
  for (i = 0; i < size; i++)
    cnt += popcnt64(ptr[i]);

  return cnt;
}

#endif
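/*
 * Usage sketch (illustrative only; the header name below is an assumption,
 * not taken from this file):
 *
 *   #include "libpopcnt.h"
 *
 *   uint8_t buf[4096];
 *   memset(buf, 0xff, sizeof(buf));
 *   uint64_t bits = popcnt(buf, sizeof(buf));  // 4096 * 8 = 32768
 */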