#ifndef __has_builtin
  #define __has_builtin(x) 0
#endif
#ifndef __has_attribute
  #define __has_attribute(x) 0
#endif
#ifndef __has_include
  #define __has_include(x) 0
#endif
#if defined(__GNUC__)
  #define LIBPOPCNT_GNUC_PREREQ(x, y) \
      (__GNUC__ > x || (__GNUC__ == x && __GNUC_MINOR__ >= y))
#else
  #define LIBPOPCNT_GNUC_PREREQ(x, y) 0
#endif
#if defined(__clang__)
  #define LIBPOPCNT_CLANG_PREREQ(x, y) \
      (__clang_major__ > x || (__clang_major__ == x && __clang_minor__ >= y))
#else
  #define LIBPOPCNT_CLANG_PREREQ(x, y) 0
#endif
#if defined(_MSC_VER) && \
   (_MSC_VER < 1900) && \
    !defined(__cplusplus)
  #define inline __inline
#endif
#if (defined(__i386__) || \
     defined(__x86_64__) || \
     defined(_M_IX86) || \
     defined(_M_X64))
  #define LIBPOPCNT_X86_OR_X64
#endif
#if LIBPOPCNT_GNUC_PREREQ(4, 2) || \
    __has_builtin(__builtin_popcount)
  #define LIBPOPCNT_HAVE_BUILTIN_POPCOUNT
#endif
#if LIBPOPCNT_GNUC_PREREQ(4, 2) || \
    LIBPOPCNT_CLANG_PREREQ(3, 0)
  #define LIBPOPCNT_HAVE_ASM_POPCNT
#endif
#if defined(LIBPOPCNT_X86_OR_X64) && \
   (defined(LIBPOPCNT_HAVE_ASM_POPCNT) || \
    defined(_MSC_VER))
  #define LIBPOPCNT_HAVE_POPCNT
#endif
#if defined(LIBPOPCNT_X86_OR_X64) && \
    LIBPOPCNT_GNUC_PREREQ(5, 0)
  #define LIBPOPCNT_HAVE_AVX2
#endif
#if defined(LIBPOPCNT_X86_OR_X64) && \
    LIBPOPCNT_GNUC_PREREQ(11, 0)
  #define LIBPOPCNT_HAVE_AVX512
#endif
#if defined(LIBPOPCNT_X86_OR_X64) && !defined(_MSC_VER)
  #if LIBPOPCNT_CLANG_PREREQ(3, 8) && \
      __has_attribute(target) && \
      (!defined(__apple_build_version__) || __apple_build_version__ >= 8000000)
    #define LIBPOPCNT_HAVE_AVX2
  #endif
  #if LIBPOPCNT_CLANG_PREREQ(9, 0) && \
      __has_attribute(target) && \
      (!defined(__apple_build_version__) || __apple_build_version__ >= 8000000)
    #define LIBPOPCNT_HAVE_AVX512
  #endif
#endif
/* MSVC compatible compilers (Windows) */
#if defined(LIBPOPCNT_X86_OR_X64) && \
    defined(_MSC_VER)
  /* clang-cl (LLVM Clang for Windows) requires the
   * /arch:AVX2 or /arch:AVX512 compiler flags */
  #if defined(__clang__)
    #if defined(__AVX2__)
      #define LIBPOPCNT_HAVE_AVX2
    #endif
    #if defined(__AVX512__)
      #define LIBPOPCNT_HAVE_AVX2
      #define LIBPOPCNT_HAVE_AVX512
    #endif
  /* MSVC 2017 or later does not require any compiler flags */
  #elif _MSC_VER >= 1910
    #define LIBPOPCNT_HAVE_AVX2
    #define LIBPOPCNT_HAVE_AVX512
  #endif
#endif
#if defined(LIBPOPCNT_X86_OR_X64) && \
   (defined(__cplusplus) || \
    defined(_MSC_VER) || \
   (LIBPOPCNT_GNUC_PREREQ(4, 2) || \
    __has_builtin(__sync_val_compare_and_swap))) && \
  ((defined(LIBPOPCNT_HAVE_AVX512) && !(defined(__AVX512__) || \
                                       (defined(__AVX512F__) && \
                                        defined(__AVX512BW__) && \
                                        defined(__AVX512VPOPCNTDQ__)))) || \
   (defined(LIBPOPCNT_HAVE_AVX2) && !defined(__AVX2__)) || \
   (defined(LIBPOPCNT_HAVE_POPCNT) && !defined(__POPCNT__)))
  #define LIBPOPCNT_HAVE_CPUID
#endif
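/*
 * LIBPOPCNT_HAVE_CPUID enables runtime CPU feature detection. It is
 * defined whenever popcnt() may use an instruction set extension
 * (POPCNT, AVX2, AVX512) that was not guaranteed at compile time,
 * in which case CPUID must be queried once before dispatching.
 */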
/*
 * Portable integer popcount algorithm: uses 12 arithmetic
 * operations, one of which is a multiplication.
 * https://en.wikipedia.org/wiki/Hamming_weight
 */
static inline uint64_t popcnt64_bitwise(uint64_t x)
{
  uint64_t m1 = 0x5555555555555555ull;
  uint64_t m2 = 0x3333333333333333ull;
  uint64_t m4 = 0x0F0F0F0F0F0F0F0Full;
  uint64_t h01 = 0x0101010101010101ull;

  x -= (x >> 1) & m1;
  x = (x & m2) + ((x >> 2) & m2);
  x = (x + (x >> 4)) & m4;

  return (x * h01) >> 56;
}
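/*
 * Example: popcnt64_bitwise(0x00FF00FF00FF00FFull) == 32,
 * since the argument contains four 0xFF bytes of 8 set bits each.
 */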
#if defined(LIBPOPCNT_HAVE_ASM_POPCNT) && \
    defined(__x86_64__)

static inline uint64_t popcnt64(uint64_t x)
{
  __asm__ ("popcnt %1, %0" : "=r" (x) : "0" (x));
  return x;
}
#elif defined(LIBPOPCNT_HAVE_ASM_POPCNT) && \
      defined(__i386__)

static inline uint32_t popcnt32(uint32_t x)
{
  __asm__ ("popcnt %1, %0" : "=r" (x) : "0" (x));
  return x;
}

static inline uint64_t popcnt64(uint64_t x)
{
  return popcnt32((uint32_t) x) +
         popcnt32((uint32_t)(x >> 32));
}
#elif defined(_MSC_VER) && \
      defined(_M_X64)

#include <intrin.h>

static inline uint64_t popcnt64(uint64_t x)
{
  return __popcnt64(x);
}
#elif defined(_MSC_VER) && \
      defined(_M_IX86)

#include <intrin.h>

static inline uint64_t popcnt64(uint64_t x)
{
  return __popcnt((uint32_t) x) +
         __popcnt((uint32_t)(x >> 32));
}
/* non x86 CPUs */
#elif defined(LIBPOPCNT_HAVE_BUILTIN_POPCOUNT)

static inline uint64_t popcnt64(uint64_t x)
{
  return __builtin_popcountll(x);
}

/* no hardware POPCNT, use the pure integer algorithm */
#else

static inline uint64_t popcnt64(uint64_t x)
{
  return popcnt64_bitwise(x);
}

#endif
#if defined(LIBPOPCNT_HAVE_CPUID)

#if defined(_MSC_VER)
  #include <intrin.h>
  #include <immintrin.h>
#endif
/* CPUID leaf 7, %ebx bit flags */
#define LIBPOPCNT_BIT_AVX2     (1 << 5)
#define LIBPOPCNT_BIT_AVX512F  (1 << 16)
#define LIBPOPCNT_BIT_AVX512BW (1 << 30)

/* %ecx bit flags */
#define LIBPOPCNT_BIT_AVX512_VPOPCNTDQ (1 << 14)
#define LIBPOPCNT_BIT_POPCNT           (1 << 23)
/* xgetbv bit flags */
#define LIBPOPCNT_XSTATE_SSE (1 << 1)
#define LIBPOPCNT_XSTATE_YMM (1 << 2)
#define LIBPOPCNT_XSTATE_ZMM (7 << 5)
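/*
 * The XCR0 bits above encode which register states the OS saves on
 * context switches: bit 1 = SSE (XMM), bit 2 = AVX (YMM), and
 * bits 5-7 = AVX512 (opmask, ZMM0-15 upper halves, ZMM16-31).
 */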
static inline void run_cpuid(int eax, int ecx, int* abcd)
{
#if defined(_MSC_VER)
  __cpuidex(abcd, eax, ecx);
#else
  int ebx = 0;
  int edx = 0;
  #if defined(__i386__) && \
      defined(__PIC__)
    /* in case of PIC under 32-bit EBX cannot be clobbered */
    __asm__ __volatile__("movl %%ebx, %%edi;"
                         "cpuid;"
                         "xchgl %%ebx, %%edi;"
                         : "=D" (ebx),
                           "+a" (eax),
                           "+c" (ecx),
                           "=d" (edx));
  #else
    __asm__ __volatile__("cpuid"
                         : "+b" (ebx),
                           "+a" (eax),
                           "+c" (ecx),
                           "=d" (edx));
  #endif

  abcd[0] = eax;
  abcd[1] = ebx;
  abcd[2] = ecx;
  abcd[3] = edx;
#endif
}
#if defined(LIBPOPCNT_HAVE_AVX2) || \
    defined(LIBPOPCNT_HAVE_AVX512)

static inline uint64_t get_xcr0(void)
{
#if defined(_MSC_VER)
  return _xgetbv(0);
#else
  uint32_t eax;
  uint32_t edx;

  __asm__ __volatile__("xgetbv" : "=a"(eax), "=d"(edx) : "c"(0));
  return eax | (((uint64_t) edx) << 32);
#endif
}

#endif
static inline int get_cpuid(void)
{
  int flags = 0;
  int abcd[4];

  run_cpuid(1, 0, abcd);

  if ((abcd[2] & LIBPOPCNT_BIT_POPCNT) == LIBPOPCNT_BIT_POPCNT)
    flags |= LIBPOPCNT_BIT_POPCNT;
#if defined(LIBPOPCNT_HAVE_AVX2) || \
    defined(LIBPOPCNT_HAVE_AVX512)

  int osxsave_mask = (1 << 27);

  /* ensure the OS supports extended processor state management */
  if ((abcd[2] & osxsave_mask) != osxsave_mask)
    return 0;

  uint64_t ymm_mask = LIBPOPCNT_XSTATE_SSE | LIBPOPCNT_XSTATE_YMM;
  uint64_t zmm_mask = LIBPOPCNT_XSTATE_SSE | LIBPOPCNT_XSTATE_YMM | LIBPOPCNT_XSTATE_ZMM;
  uint64_t xcr0 = get_xcr0();
  if ((xcr0 & ymm_mask) == ymm_mask)
  {
    run_cpuid(7, 0, abcd);

    if ((abcd[1] & LIBPOPCNT_BIT_AVX2) == LIBPOPCNT_BIT_AVX2)
      flags |= LIBPOPCNT_BIT_AVX2;

    if ((xcr0 & zmm_mask) == zmm_mask)
    {
      if ((abcd[1] & LIBPOPCNT_BIT_AVX512F) == LIBPOPCNT_BIT_AVX512F &&
          (abcd[1] & LIBPOPCNT_BIT_AVX512BW) == LIBPOPCNT_BIT_AVX512BW &&
          (abcd[2] & LIBPOPCNT_BIT_AVX512_VPOPCNTDQ) == LIBPOPCNT_BIT_AVX512_VPOPCNTDQ)
        flags |= LIBPOPCNT_BIT_AVX512_VPOPCNTDQ;
    }
  }

#endif

  return flags;
}

#endif
#if defined(LIBPOPCNT_HAVE_AVX2) && \
    __has_include(<immintrin.h>)

#include <immintrin.h>

#if __has_attribute(target)
  __attribute__ ((target ("avx2")))
#endif
static inline void CSA256(__m256i* h, __m256i* l, __m256i a, __m256i b, __m256i c)
{
  __m256i u = _mm256_xor_si256(a, b);
  *h = _mm256_or_si256(_mm256_and_si256(a, b), _mm256_and_si256(u, c));
  *l = _mm256_xor_si256(u, c);
}
#if __has_attribute(target)
  __attribute__ ((target ("avx2")))
#endif
static inline __m256i popcnt256(__m256i v)
{
  __m256i lookup1 = _mm256_setr_epi8(
      4, 5, 5, 6, 5, 6, 6, 7,
      5, 6, 6, 7, 6, 7, 7, 8,
      4, 5, 5, 6, 5, 6, 6, 7,
      5, 6, 6, 7, 6, 7, 7, 8);

  __m256i lookup2 = _mm256_setr_epi8(
      4, 3, 3, 2, 3, 2, 2, 1,
      3, 2, 2, 1, 2, 1, 1, 0,
      4, 3, 3, 2, 3, 2, 2, 1,
      3, 2, 2, 1, 2, 1, 1, 0);
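  /*
   * Per-nibble popcount via two byte-shuffle lookups. The tables are
   * biased so that lookup1[lo] = popcnt(lo) + 4 and
   * lookup2[hi] = 4 - popcnt(hi); the absolute difference taken by
   * _mm256_sad_epu8 below is therefore popcnt(lo) + popcnt(hi) per
   * byte, horizontally summed into one count per 64-bit lane.
   */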
  __m256i low_mask = _mm256_set1_epi8(0x0f);
  __m256i lo = _mm256_and_si256(v, low_mask);
  __m256i hi = _mm256_and_si256(_mm256_srli_epi16(v, 4), low_mask);
  __m256i popcnt1 = _mm256_shuffle_epi8(lookup1, lo);
  __m256i popcnt2 = _mm256_shuffle_epi8(lookup2, hi);

  return _mm256_sad_epu8(popcnt1, popcnt2);
}
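/*
 * Harley-Seal popcount: CSA256 acts as a carry-save adder that
 * compresses three vectors into a sum (l) and carry (h) vector.
 * The expensive popcnt256 thus runs only once per 16 input vectors
 * (on "sixteens"); the ones/twos/fours/eights accumulators are
 * weighted back in after the main loop.
 */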
#if __has_attribute(target)
  __attribute__ ((target ("avx2")))
#endif
static inline uint64_t popcnt_avx2(const __m256i* ptr, uint64_t size)
{
  __m256i cnt = _mm256_setzero_si256();
  __m256i ones = _mm256_setzero_si256();
  __m256i twos = _mm256_setzero_si256();
  __m256i fours = _mm256_setzero_si256();
  __m256i eights = _mm256_setzero_si256();
  __m256i sixteens = _mm256_setzero_si256();
  __m256i twosA, twosB, foursA, foursB, eightsA, eightsB;
  uint64_t i = 0;
  uint64_t limit = size - size % 16;
  uint64_t* cnt64;
  for (; i < limit; i += 16)
  {
    CSA256(&twosA, &ones, ones, _mm256_loadu_si256(ptr + i + 0), _mm256_loadu_si256(ptr + i + 1));
    CSA256(&twosB, &ones, ones, _mm256_loadu_si256(ptr + i + 2), _mm256_loadu_si256(ptr + i + 3));
    CSA256(&foursA, &twos, twos, twosA, twosB);
    CSA256(&twosA, &ones, ones, _mm256_loadu_si256(ptr + i + 4), _mm256_loadu_si256(ptr + i + 5));
    CSA256(&twosB, &ones, ones, _mm256_loadu_si256(ptr + i + 6), _mm256_loadu_si256(ptr + i + 7));
    CSA256(&foursB, &twos, twos, twosA, twosB);
    CSA256(&eightsA, &fours, fours, foursA, foursB);
    CSA256(&twosA, &ones, ones, _mm256_loadu_si256(ptr + i + 8), _mm256_loadu_si256(ptr + i + 9));
    CSA256(&twosB, &ones, ones, _mm256_loadu_si256(ptr + i + 10), _mm256_loadu_si256(ptr + i + 11));
    CSA256(&foursA, &twos, twos, twosA, twosB);
    CSA256(&twosA, &ones, ones, _mm256_loadu_si256(ptr + i + 12), _mm256_loadu_si256(ptr + i + 13));
    CSA256(&twosB, &ones, ones, _mm256_loadu_si256(ptr + i + 14), _mm256_loadu_si256(ptr + i + 15));
    CSA256(&foursB, &twos, twos, twosA, twosB);
    CSA256(&eightsB, &fours, fours, foursA, foursB);
    CSA256(&sixteens, &eights, eights, eightsA, eightsB);

    cnt = _mm256_add_epi64(cnt, popcnt256(sixteens));
  }
  cnt = _mm256_slli_epi64(cnt, 4);
  cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(popcnt256(eights), 3));
  cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(popcnt256(fours), 2));
  cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(popcnt256(twos), 1));
  cnt = _mm256_add_epi64(cnt, popcnt256(ones));

  for (; i < size; i++)
    cnt = _mm256_add_epi64(cnt, popcnt256(_mm256_loadu_si256(ptr + i)));

  cnt64 = (uint64_t*) &cnt;

  return cnt64[0] +
         cnt64[1] +
         cnt64[2] +
         cnt64[3];
}

#endif
#if defined(LIBPOPCNT_HAVE_AVX512) && \
    __has_include(<immintrin.h>)

#include <immintrin.h>

#if __has_attribute(target)
  __attribute__ ((target ("avx512f,avx512bw,avx512vpopcntdq")))
#endif
static inline uint64_t popcnt_avx512(const uint8_t* ptr8, uint64_t size)
{
  __m512i cnt = _mm512_setzero_si512();
  const uint64_t* ptr64 = (const uint64_t*) ptr8;
  uint64_t size64 = size / sizeof(uint64_t);
  uint64_t i = 0;

  for (; i + 32 <= size64; i += 32)
  {
    __m512i vec0 = _mm512_loadu_epi64(&ptr64[i + 0]);
    __m512i vec1 = _mm512_loadu_epi64(&ptr64[i + 8]);
    __m512i vec2 = _mm512_loadu_epi64(&ptr64[i + 16]);
    __m512i vec3 = _mm512_loadu_epi64(&ptr64[i + 24]);

    vec0 = _mm512_popcnt_epi64(vec0);
    vec1 = _mm512_popcnt_epi64(vec1);
    vec2 = _mm512_popcnt_epi64(vec2);
    vec3 = _mm512_popcnt_epi64(vec3);

    cnt = _mm512_add_epi64(cnt, vec0);
    cnt = _mm512_add_epi64(cnt, vec1);
    cnt = _mm512_add_epi64(cnt, vec2);
    cnt = _mm512_add_epi64(cnt, vec3);
  }
  for (; i + 8 <= size64; i += 8)
  {
    __m512i vec = _mm512_loadu_epi64(&ptr64[i]);
    vec = _mm512_popcnt_epi64(vec);
    cnt = _mm512_add_epi64(cnt, vec);
  }
  i *= sizeof(uint64_t);
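  /*
   * The remaining 0-63 bytes are counted with a single masked byte
   * load: lanes outside the mask are zeroed and are not read, so no
   * out-of-bounds access occurs and the zero lanes add 0 to cnt.
   */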
  if (i < size)
  {
    __mmask64 mask = (__mmask64) (0xffffffffffffffffull >> (i + 64 - size));
    __m512i vec = _mm512_maskz_loadu_epi8(mask, &ptr8[i]);
    vec = _mm512_popcnt_epi64(vec);
    cnt = _mm512_add_epi64(cnt, vec);
  }

  return _mm512_reduce_add_epi64(cnt);
}

#endif
#if defined(LIBPOPCNT_X86_OR_X64)

/*
 * Count the number of 1 bits in the data array.
 * @data: An array
 * @size: Size of data in bytes
 */
static inline uint64_t popcnt(const void* data, uint64_t size)
{
#if defined(LIBPOPCNT_HAVE_CPUID)
  #if defined(__cplusplus)
    /* C++11 guarantees thread-safe initialization of local statics */
    static const int cpuid = get_cpuid();
  #else
    static int cpuid_ = -1;
    int cpuid = cpuid_;
    if (cpuid == -1)
    {
      cpuid = get_cpuid();

      #if defined(_MSC_VER)
        _InterlockedCompareExchange(&cpuid_, cpuid, -1);
      #else
        __sync_val_compare_and_swap(&cpuid_, -1, cpuid);
      #endif
    }
  #endif
#endif
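  /*
   * Concurrent first calls may each run get_cpuid(), but they all
   * compute the same flags; the compare-and-swap then publishes the
   * result so cpuid_ only ever transitions from -1 to that value.
   */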
  const uint8_t* ptr = (const uint8_t*) data;
  uint64_t cnt = 0;
  uint64_t i = 0;
#if defined(LIBPOPCNT_HAVE_AVX512)
  /* AVX512 is only worthwhile for large arrays (>= 1024 bytes) */
  #if defined(__AVX512__) || \
     (defined(__AVX512F__) && \
      defined(__AVX512BW__) && \
      defined(__AVX512VPOPCNTDQ__))
    if (size >= 1024)
  #else
    if ((cpuid & LIBPOPCNT_BIT_AVX512_VPOPCNTDQ) &&
        size >= 1024)
  #endif
      return popcnt_avx512(ptr, size);
#endif
#if defined(LIBPOPCNT_HAVE_AVX2)
  /* AVX2 is only worthwhile for arrays >= 512 bytes */
  #if defined(__AVX2__)
    if (size >= 512)
  #else
    if ((cpuid & LIBPOPCNT_BIT_AVX2) &&
        size >= 512)
  #endif
    {
      const __m256i* ptr256 = (const __m256i*)(ptr + i);
      cnt += popcnt_avx2(ptr256, (size - i) / 32);
      i = size - size % 32;
    }
#endif
#if defined(LIBPOPCNT_HAVE_POPCNT)
  /* The POPCNT instruction is used only if CPUID has confirmed it */
  #if !defined(__POPCNT__)
    if (cpuid & LIBPOPCNT_BIT_POPCNT)
  #endif
    {
      /* Align &ptr[i] to an 8 byte boundary */
      uintptr_t rem = ((uintptr_t) &ptr[i]) % 8;

      if (rem != 0 && size - i >= 8)
      {
        uint64_t val = 0;
        uint64_t bytes = (uint64_t) (8 - rem % 8);
        bytes = (bytes <= 7) ? bytes : 7;
        for (uint64_t j = 0; j < bytes; j++)
          val |= ((uint64_t) ptr[i + j]) << (j * 8);
        cnt += popcnt64(val);
        i += bytes;
      }
      for (; i + 8 <= size; i += 8)
        cnt += popcnt64(*(const uint64_t*)(ptr + i));
      if (i < size)
      {
        uint64_t val = 0;
        uint64_t bytes = (uint64_t) (size - i);
        bytes = (bytes <= 7) ? bytes : 7;
        for (uint64_t j = 0; j < bytes; j++)
          val |= ((uint64_t) ptr[i + j]) << (j * 8);
        cnt += popcnt64(val);
      }

      return cnt;
    }
#endif
/*
 * Pure integer popcount algorithm, used if the
 * CPU does not support the POPCNT instruction.
 */
#if !defined(LIBPOPCNT_HAVE_POPCNT) || \
    !defined(__POPCNT__)
  {
    /* Align &ptr[i] to an 8 byte boundary */
    uintptr_t rem = ((uintptr_t) &ptr[i]) % 8;

    if (rem != 0 && size - i >= 8)
    {
      uint64_t val = 0;
      uint64_t bytes = (uint64_t) (8 - rem % 8);
      bytes = (bytes <= 7) ? bytes : 7;
      for (uint64_t j = 0; j < bytes; j++)
        val |= ((uint64_t) ptr[i + j]) << (j * 8);
      cnt += popcnt64_bitwise(val);
      i += bytes;
    }
    for (; i + 8 <= size; i += 8)
      cnt += popcnt64_bitwise(*(const uint64_t*)(ptr + i));

    if (i < size)
    {
      uint64_t val = 0;
      uint64_t bytes = (uint64_t) (size - i);
      bytes = (bytes <= 7) ? bytes : 7;
      for (uint64_t j = 0; j < bytes; j++)
        val |= ((uint64_t) ptr[i + j]) << (j * 8);
      cnt += popcnt64_bitwise(val);
    }
  }
#endif

  return cnt;
}
#elif defined(__ARM_FEATURE_SVE) && \
      __has_include(<arm_sve.h>)

#include <arm_sve.h>

/*
 * Count the number of 1 bits in the data array.
 * @data: An array
 * @size: Size of data in bytes
 */
static inline uint64_t popcnt(const void* data, uint64_t size)
{
  uint64_t i = 0;
  const uint64_t* ptr64 = (const uint64_t*) data;
  uint64_t size64 = size / sizeof(uint64_t);
  svuint64_t vcnt = svdup_u64(0);
  for (; i + svcntd() * 4 <= size64; i += svcntd() * 4)
  {
    svuint64_t vec0 = svld1_u64(svptrue_b64(), &ptr64[i + svcntd() * 0]);
    svuint64_t vec1 = svld1_u64(svptrue_b64(), &ptr64[i + svcntd() * 1]);
    svuint64_t vec2 = svld1_u64(svptrue_b64(), &ptr64[i + svcntd() * 2]);
    svuint64_t vec3 = svld1_u64(svptrue_b64(), &ptr64[i + svcntd() * 3]);

    vec0 = svcnt_u64_x(svptrue_b64(), vec0);
    vec1 = svcnt_u64_x(svptrue_b64(), vec1);
    vec2 = svcnt_u64_x(svptrue_b64(), vec2);
    vec3 = svcnt_u64_x(svptrue_b64(), vec3);

    vcnt = svadd_u64_x(svptrue_b64(), vcnt, vec0);
    vcnt = svadd_u64_x(svptrue_b64(), vcnt, vec1);
    vcnt = svadd_u64_x(svptrue_b64(), vcnt, vec2);
    vcnt = svadd_u64_x(svptrue_b64(), vcnt, vec3);
  }
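  /*
   * svwhilelt_b64(i, size64) returns a predicate that is true only
   * for lanes still in range, so the loop below consumes the final
   * partial vectors of 64-bit words without scalar cleanup code.
   */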
  svbool_t pg = svwhilelt_b64(i, size64);

  while (svptest_any(svptrue_b64(), pg))
  {
    svuint64_t vec = svld1_u64(pg, &ptr64[i]);
    vec = svcnt_u64_z(pg, vec);
    vcnt = svadd_u64_x(svptrue_b64(), vcnt, vec);
    i += svcntd();
    pg = svwhilelt_b64(i, size64);
  }

  uint64_t cnt = svaddv_u64(svptrue_b64(), vcnt);
  uint64_t bytes = size % sizeof(uint64_t);
  if (bytes != 0)
  {
    i = size - bytes;
    const uint8_t* ptr8 = (const uint8_t*) data;
    svbool_t pg8 = svwhilelt_b8(i, size);
    svuint8_t vec = svld1_u8(pg8, &ptr8[i]);
    svuint8_t vcnt8 = svcnt_u8_z(pg8, vec);
    cnt += svaddv_u8(pg8, vcnt8);
  }

  return cnt;
}
#elif (defined(__ARM_NEON) || \
       defined(__aarch64__) || \
       defined(_M_ARM64)) && \
      __has_include(<arm_neon.h>)

#include <arm_neon.h>

static inline uint64x2_t vpadalq(uint64x2_t sum, uint8x16_t t)
{
  return vpadalq_u32(sum, vpaddlq_u16(vpaddlq_u8(t)));
}
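/*
 * vcntq_u8 yields per-byte popcounts; vpadalq widens and accumulates
 * them pairwise (u8 -> u16 -> u32 -> u64) into two 64-bit sums. A
 * uint8 lane can hold at most 255, which is why the main loop below
 * flushes the byte accumulators into sum every 31 iterations
 * (31 * 8 = 248 <= 255).
 */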
/*
 * Count the number of 1 bits in the data array.
 * @data: An array
 * @size: Size of data in bytes
 */
static inline uint64_t popcnt(const void* data, uint64_t size)
{
  uint64_t i = 0;
  uint64_t cnt = 0;
  uint64_t chunk_size = 64;
  const uint8_t* ptr = (const uint8_t*) data;

  if (size >= chunk_size)
  {
    uint64_t iters = size / chunk_size;
    uint64x2_t sum = vcombine_u64(vcreate_u64(0), vcreate_u64(0));
    uint8x16_t zero = vcombine_u8(vcreate_u8(0), vcreate_u8(0));
    do
    {
      uint8x16_t t0 = zero;
      uint8x16_t t1 = zero;
      uint8x16_t t2 = zero;
      uint8x16_t t3 = zero;

      /*
       * After every 31 iterations the temporary per-byte sums
       * (t0, t1, t2, t3) must be added to the total sum, since a
       * uint8 lane holds at most 255 and 31 * 8 bits = 248 is OK.
       */
      uint64_t limit = (i + 31 < iters) ? i + 31 : iters;

      /* Each iteration processes 64 bytes */
      for (; i < limit; i++)
      {
        uint8x16x4_t input = vld4q_u8(ptr);
        ptr += chunk_size;

        t0 = vaddq_u8(t0, vcntq_u8(input.val[0]));
        t1 = vaddq_u8(t1, vcntq_u8(input.val[1]));
        t2 = vaddq_u8(t2, vcntq_u8(input.val[2]));
        t3 = vaddq_u8(t3, vcntq_u8(input.val[3]));
      }

      sum = vpadalq(sum, t0);
      sum = vpadalq(sum, t1);
      sum = vpadalq(sum, t2);
      sum = vpadalq(sum, t3);
    }
    while (i < iters);

    i = 0;
    size %= chunk_size;

    uint64_t tmp[2];
    vst1q_u64(tmp, sum);
    cnt += tmp[0];
    cnt += tmp[1];
  }
  /* Align &ptr[i] to an 8 byte boundary */
  uintptr_t rem = ((uintptr_t) &ptr[i]) % 8;

  if (rem != 0 && size - i >= 8)
  {
    uint64_t val = 0;
    uint64_t bytes = (uint64_t) (8 - rem % 8);
    bytes = (bytes <= 7) ? bytes : 7;
    for (uint64_t j = 0; j < bytes; j++)
      val |= ((uint64_t) ptr[i + j]) << (j * 8);
    cnt += popcnt64(val);
    i += bytes;
  }
  for (; i + 8 <= size; i += 8)
    cnt += popcnt64(*(const uint64_t*)(ptr + i));
  if (i < size)
  {
    uint64_t val = 0;
    uint64_t bytes = (uint64_t) (size - i);
    bytes = (bytes <= 7) ? bytes : 7;
    for (uint64_t j = 0; j < bytes; j++)
      val |= ((uint64_t) ptr[i + j]) << (j * 8);
    cnt += popcnt64(val);
  }

  return cnt;
}

/* Portable popcount for all other CPU architectures */
#else
/*
 * Count the number of 1 bits in the data array.
 * @data: An array
 * @size: Size of data in bytes
 */
static inline uint64_t popcnt(const void* data, uint64_t size)
{
  uint64_t i = 0;
  uint64_t cnt = 0;
  const uint8_t* ptr = (const uint8_t*) data;
  /* Align &ptr[i] to an 8 byte boundary */
  uintptr_t rem = ((uintptr_t) &ptr[i]) % 8;

  if (rem != 0 && size - i >= 8)
  {
    uint64_t val = 0;
    uint64_t bytes = (uint64_t) (8 - rem % 8);
    bytes = (bytes <= 7) ? bytes : 7;
    for (uint64_t j = 0; j < bytes; j++)
      val |= ((uint64_t) ptr[i + j]) << (j * 8);
    cnt += popcnt64(val);
    i += bytes;
  }
  for (; i + 8 <= size; i += 8)
    cnt += popcnt64(*(const uint64_t*)(ptr + i));
  if (i < size)
  {
    uint64_t val = 0;
    uint64_t bytes = (uint64_t) (size - i);
    bytes = (bytes <= 7) ? bytes : 7;
    for (uint64_t j = 0; j < bytes; j++)
      val |= ((uint64_t) ptr[i + j]) << (j * 8);
    cnt += popcnt64(val);
  }

  return cnt;
}

#endif
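/*
 * Usage sketch (assumes this header is saved as "libpopcnt.h"; the
 * file and variable names below are illustrative):
 *
 *   #include "libpopcnt.h"
 *   #include <stdio.h>
 *
 *   int main(void)
 *   {
 *     uint8_t data[8] = { 0xff, 0x0f, 1, 2, 3 };
 *     // 8 + 4 + 1 + 1 + 2 = 16 one bits (remaining bytes are 0)
 *     printf("%llu\n", (unsigned long long) popcnt(data, sizeof(data)));
 *     return 0;
 *   }
 */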