#include <stdio.h>
#include <time.h>

#include <mmintrin.h>
#include <xmmintrin.h>
#include <emmintrin.h>

// Short aliases for the compiler's __int64 extension type (unsigned and signed),
// used for every 64-bit quantity in this file.
typedef unsigned __int64 uint64;
typedef __int64 int64;

/*
 * Count the set bits of u by repeatedly clearing the lowest set bit.
 *
 * u & (u - 1) removes exactly one set bit per iteration, so the loop body
 * runs once per 1-bit in the input.
 *
 * Returns the number of 1-bits in u.
 */
int popcnt_a(unsigned int u)
{
	int bits = 0;

	while (u != 0) {
		u &= u - 1;	/* drop the lowest set bit */
		bits++;
	}
	return bits;
}

/*
 * Count the set bits of u with a branch-free parallel reduction.
 *
 * Each step adds neighbouring bit fields of width 1, 2, 4, 8 and 16, so
 * after the last step the whole word holds the total.
 *
 * Returns the number of 1-bits in u.
 */
int popcnt_b(unsigned int u)
{
	unsigned int lo, hi;

	/* 1-bit fields -> 2-bit sums */
	lo = u & 0x55555555;
	hi = (u >> 1) & 0x55555555;
	u = lo + hi;

	/* 2-bit sums -> 4-bit sums */
	lo = u & 0x33333333;
	hi = (u >> 2) & 0x33333333;
	u = lo + hi;

	/* 4-bit sums -> 8-bit sums */
	lo = u & 0x0F0F0F0F;
	hi = (u >> 4) & 0x0F0F0F0F;
	u = lo + hi;

	/* 8-bit sums -> 16-bit sums */
	lo = u & 0x00FF00FF;
	hi = (u >> 8) & 0x00FF00FF;
	u = lo + hi;

	/* 16-bit sums -> final 32-bit total */
	lo = u & 0x0000FFFF;
	hi = (u >> 16) & 0x0000FFFF;
	u = lo + hi;

	return u;
}

/*
 * 64-bit variant of the clear-lowest-set-bit population count.
 *
 * The loop body runs once per 1-bit in u.
 *
 * Returns the number of 1-bits in u.
 */
int popcnt64_a(uint64 u)
{
	int bits = 0;

	while (u != 0) {
		u &= u - 1;	/* drop the lowest set bit */
		bits++;
	}
	return bits;
}

/*
 * 64-bit branch-free population count using a parallel reduction.
 *
 * Neighbouring bit fields of width 1, 2, 4, 8, 16 and 32 are added in
 * turn; the last step leaves the total in the low bits of the word.
 *
 * Returns the number of 1-bits in u.
 */
int popcnt64_b(uint64 u)
{
	uint64 lo, hi;

	/* 1-bit fields -> 2-bit sums */
	lo = u & 0x5555555555555555UI64;
	hi = (u >> 1) & 0x5555555555555555UI64;
	u = lo + hi;

	/* 2-bit sums -> 4-bit sums */
	lo = u & 0x3333333333333333UI64;
	hi = (u >> 2) & 0x3333333333333333UI64;
	u = lo + hi;

	/* 4-bit sums -> 8-bit sums */
	lo = u & 0x0F0F0F0F0F0F0F0FUI64;
	hi = (u >> 4) & 0x0F0F0F0F0F0F0F0FUI64;
	u = lo + hi;

	/* 8-bit sums -> 16-bit sums */
	lo = u & 0x00FF00FF00FF00FFUI64;
	hi = (u >> 8) & 0x00FF00FF00FF00FFUI64;
	u = lo + hi;

	/* 16-bit sums -> 32-bit sums */
	lo = u & 0x0000FFFF0000FFFFUI64;
	hi = (u >> 16) & 0x0000FFFF0000FFFFUI64;
	u = lo + hi;

	/* 32-bit sums -> final 64-bit total */
	lo = u & 0x00000000FFFFFFFFUI64;
	hi = (u >> 32) & 0x00000000FFFFFFFFUI64;
	u = lo + hi;

	return (int)u;
}

// USE MMX
#if !defined(_M_X64)
/*
 * 64-bit population count done entirely in an MMX register.
 *
 * Same parallel reduction as popcnt64_b, expressed with MMX intrinsics.
 * The function leaves the MMX state dirty: it does not execute EMMS, so
 * the caller must do so before using x87 floating point.
 *
 * Returns the number of 1-bits in u (read from the low 32-bit lane).
 */
int popcnt64_c(uint64 u)
{
	static const __m64 fields01 = {0x5555555555555555UI64};
	static const __m64 fields02 = {0x3333333333333333UI64};
	static const __m64 fields04 = {0x0F0F0F0F0F0F0F0FUI64};
	static const __m64 fields08 = {0x00FF00FF00FF00FFUI64};
	static const __m64 fields16 = {0x0000FFFF0000FFFFUI64};
	static const __m64 fields32 = {0x00000000FFFFFFFFUI64};
	__m64 acc = {u};
	__m64 lo, hi;

	// NOTE: the original comment here was mis-encoded; it appears to say that
	// VS2005 offers no PADDQ intrinsic. PADDD (_mm_add_pi32) is sufficient
	// because no partial sum ever carries out of a 32-bit lane.
	lo  = _mm_and_si64(acc, fields01);
	hi  = _mm_and_si64(_mm_srli_si64(acc,  1), fields01);
	acc = _mm_add_pi32(lo, hi);

	lo  = _mm_and_si64(acc, fields02);
	hi  = _mm_and_si64(_mm_srli_si64(acc,  2), fields02);
	acc = _mm_add_pi32(lo, hi);

	lo  = _mm_and_si64(acc, fields04);
	hi  = _mm_and_si64(_mm_srli_si64(acc,  4), fields04);
	acc = _mm_add_pi32(lo, hi);

	lo  = _mm_and_si64(acc, fields08);
	hi  = _mm_and_si64(_mm_srli_si64(acc,  8), fields08);
	acc = _mm_add_pi32(lo, hi);

	lo  = _mm_and_si64(acc, fields16);
	hi  = _mm_and_si64(_mm_srli_si64(acc, 16), fields16);
	acc = _mm_add_pi32(lo, hi);

	// Fold the high lane's count onto the low lane.
	lo  = _mm_and_si64(acc, fields32);
	hi  = _mm_and_si64(_mm_srli_si64(acc, 32), fields32);
	acc = _mm_add_pi32(lo, hi);

	return acc.m64_i32[0];
}

/*
 * 64-bit population count using MMX plus PSADBW for the final reduction.
 *
 * Three reduction steps leave a per-byte bit count (0..8) in each of the
 * eight bytes; _mm_sad_pu8 against zero then sums those bytes in one
 * instruction. The function leaves the MMX state dirty: it does not
 * execute EMMS, so the caller must do so before using x87 floating point.
 *
 * Returns the number of 1-bits in u (read from the low 32-bit lane).
 */
int popcnt64_d(uint64 u)
{
	static const __m64 fields01 = {0x5555555555555555UI64};
	static const __m64 fields02 = {0x3333333333333333UI64};
	static const __m64 fields04 = {0x0F0F0F0F0F0F0F0FUI64};
	static const __m64 all_zero = {0x0000000000000000UI64};
	__m64 acc = {u};
	__m64 lo, hi;

	// NOTE: the original comment here was mis-encoded; it appears to say that
	// VS2005 offers no PADDQ intrinsic. PADDD (_mm_add_pi32) is sufficient
	// because no partial sum ever carries out of a 32-bit lane.
	lo  = _mm_and_si64(acc, fields01);
	hi  = _mm_and_si64(_mm_srli_si64(acc,  1), fields01);
	acc = _mm_add_pi32(lo, hi);

	lo  = _mm_and_si64(acc, fields02);
	hi  = _mm_and_si64(_mm_srli_si64(acc,  2), fields02);
	acc = _mm_add_pi32(lo, hi);

	lo  = _mm_and_si64(acc, fields04);
	hi  = _mm_and_si64(_mm_srli_si64(acc,  4), fields04);
	acc = _mm_add_pi32(lo, hi);

	// |byte - 0| summed over all eight bytes == sum of the per-byte counts.
	acc = _mm_sad_pu8(acc, all_zero);

	return acc.m64_i32[0];
}
#endif

#if 0	// Disabled. The original comment was mis-encoded; it appears to question whether VS2005 provides the needed SSE intrinsics.
// Unfinished SSE2 (__m128i) draft of popcnt64_d, kept for reference only.
// It is not compilable as written: the body refers to `mm0`, which is never
// declared (the local is `xmm0`), mixes the MMX add intrinsic with 128-bit
// operands, and contains a bare mnemonic placeholder instead of the final
// byte-sum step.
int popcnt64_d(uint64 u)
{
	__m128i xmm0;
	static const __m128i zero   = {0x00,};
	static const __m128i mask01 = {0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55,};
	static const __m128i mask02 = {0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33,};
	static const __m128i mask04 = {0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,};
	static const __m128i mask08 = {0xFF,0x00,0xFF,0x00,0xFF,0x00,0xFF,0x00,};
	static const __m128i mask16 = {0xFF,0xFF,0x00,0x00,0xFF,0xFF,0x00,0x00,};
	static const __m128i mask32 = {0xFF,0xFF,0xFF,0xFF,0x00,0x00,0x00,0x00,};
	xmm0 = _mm_set_epi64((__m64){0}, (__m64){u});

	// Original comment mis-encoded; it appears to repeat the doubt about VS2005's SSE intrinsic support.
	xmm0 = _mm_add_pi32(_mm_and_si128(mm0, mask01), _mm_and_si128(_mm_srli_epi64(mm0,  1), mask01));
	xmm0 = _mm_add_pi32(_mm_and_si128(mm0, mask02), _mm_and_si128(_mm_srli_epi64(mm0,  2), mask02));
	xmm0 = _mm_add_pi32(_mm_and_si128(mm0, mask04), _mm_and_si128(_mm_srli_epi64(mm0,  4), mask04));
	// Placeholder token, not code; presumably stands for the PSADBW byte-sum step (spelling differs) - TODO confirm.
	PSABDW

	return _mm_cvtsi128_si32(xmm0);
}
#endif

/*
 * Print u to stdout as exactly 32 binary digits, most significant bit
 * first, with no separator and no trailing newline.
 */
void binprint(unsigned int u)
{
	int i;
	for (i = 31; i >= 0; i--) {
		/* 1u, not 1: shifting a signed 1 into bit 31 is undefined behaviour. */
		printf("%d", (u & (1u << i)) ? 1 : 0);
	}
}

/*
 * Print u64 to stdout as exactly 64 binary digits, most significant bit
 * first, with no separator and no trailing newline.
 */
void binprint64(uint64 u64)
{
	int bit = 64;

	while (bit-- > 0) {
		/* Move the wanted bit down to position 0 and test it. */
		printf("%d", ((u64 >> bit) & 1) ? 1 : 0);
	}
}

#define LOOPS (1000*1000)
/*
 * Time a 32-bit popcount implementation and print one result line.
 *
 * func is called LOOPS * 0x100 times with the same argument u while the
 * elapsed clock() time is measured. The line printed is
 * "<name>:<u in binary>:<count>:<seconds> sec".
 *
 *   func  implementation under test
 *   name  label printed at the start of the line
 *   u     value passed to func on every call
 */
void chk_func(int (*func)(unsigned int), const char *name, unsigned int u)
{
	const clock_t t_begin = clock();
	unsigned int outer = 0;
	while (outer < LOOPS) {
		unsigned int inner = 0;
		while (inner < 0x100) {
			func(u);	/* result intentionally discarded: only the time matters */
			inner++;
		}
		outer++;
	}
	const clock_t t_end = clock();

	printf("%s:", name);
	binprint(u);

	int bits = func(u);
	double seconds = (t_end - t_begin) / (double)CLOCKS_PER_SEC;
	printf(":%2d:%7.4f sec\n", bits, seconds);
}

/*
 * Time a 64-bit popcount implementation and print one result line.
 *
 * func64 is called LOOPS * 0x100 times with the same argument u64 while
 * the elapsed clock() time is measured. The line printed is
 * "<name>:<u64 in binary>:<count>:<seconds> sec".
 *
 * On non-x64 builds EMMS is executed after the last call to func64 and
 * before the floating-point elapsed time is computed and printed, because
 * the MMX implementations do not reset the MMX state themselves.
 *
 *   func64  implementation under test
 *   name    label printed at the start of the line
 *   u64     value passed to func64 on every call
 */
void chk_func64(int (*func64)(uint64), const char *name, uint64 u64)
{
	const clock_t t_begin = clock();
	unsigned int outer = 0;
	while (outer < LOOPS) {
		unsigned int inner = 0;
		while (inner < 0x100) {
			func64(u64);	/* result intentionally discarded: only the time matters */
			inner++;
		}
		outer++;
	}
	const clock_t t_end = clock();
	double seconds;

	printf("%s:", name);
	binprint64(u64);

	int bits = func64(u64);
	printf(":%2d", bits);
#if !defined(_M_X64)
	// Only needed when MMX has been used.
	__asm emms
#endif
	seconds = (t_end - t_begin) / (double)CLOCKS_PER_SEC;
	printf(":%7.4f sec\n", seconds);
}

// Convenience wrappers around chk_func / chk_func64: the function is passed
// once and its identifier is stringized (#f) to serve as the printed label.
#define CHK_FUNC(f,u)	chk_func(f, #f, u)
#define CHK_FUNC64(f,u64)	chk_func64(f, #f, u64)

/*
 * Benchmark driver.
 *
 * Prints the pointer width of the build, then runs every 32-bit popcount
 * implementation over a fixed list of 32-bit patterns and every 64-bit
 * implementation over a fixed list of 64-bit patterns. The MMX variants
 * (popcnt64_c / popcnt64_d) are only built and run on non-x64 targets.
 *
 * Always returns 0.
 */
int main()
{
	/* Patterns range from no bits set to all bits set. */
	static const unsigned int vals32[] = {
		0x00000000,
		0x00000001,
		0x80000000,
		0x00000011,
		0x00001111,
		0x11111111,
		0x55555555,
		0xFFFFFFFF,
	};
	static const uint64 vals64[] = {
		0x0000000000000000UI64,
		0x0000000000000001UI64,
		0x0000000100000001UI64,
		0x0001000100010001UI64,
		0x0101010101010101UI64,
		0x1111111111111111UI64,
		0x5555555555555555UI64,
		0xFFFFFFFFFFFFFFFFUI64,
	};
	unsigned int i;

	/* sizeof yields size_t; cast so the argument really matches %u. */
	printf("%u bit\n", (unsigned int)(sizeof(void*)*8));

	printf("\n32bitZ\n");
	for (i = 0; i < sizeof vals32 / sizeof vals32[0]; i++) {
		CHK_FUNC(popcnt_a, vals32[i]);
		CHK_FUNC(popcnt_b, vals32[i]);
	}

	printf("\n64bit\n");
	for (i = 0; i < sizeof vals64 / sizeof vals64[0]; i++) {
		CHK_FUNC64(popcnt64_a, vals64[i]);
		CHK_FUNC64(popcnt64_b, vals64[i]);
#if !defined(_M_X64)
		CHK_FUNC64(popcnt64_c, vals64[i]);
		CHK_FUNC64(popcnt64_d, vals64[i]);
#endif
	}

	return 0;
}
