1 #ifndef BMSSE2__H__INCLUDED__
2 #define BMSSE2__H__INCLUDED__
34 #pragma GCC diagnostic push
35 #pragma GCC diagnostic ignored "-Wconversion"
// --- sse2_bit_count (body fragment) ---------------------------------------
// NOTE(review): elided listing — the leading integer on each line is the
// original file's line number; gaps in those numbers mean source lines
// (the function signature, the opening `do {`, the declarations of
// mcnt/tmp1/tmp2/tcnt, and the `block` pointer advance) are not visible
// here.  Code is left byte-identical; only comments added.
//
// Parallel (SWAR) population count over a range of 128-bit words.
// Classic popcount ladder masks: bit-pairs, nibbles, bytes, final 6-bit sum.
64 const unsigned mu1 = 0x55555555;
65 const unsigned mu2 = 0x33333333;
66 const unsigned mu3 = 0x0F0F0F0F;
67 const unsigned mu4 = 0x0000003F;
// Broadcast each scalar mask into all four 32-bit lanes.
70 __m128i m1 = _mm_set_epi32 (mu1, mu1, mu1, mu1);
71 __m128i m2 = _mm_set_epi32 (mu2, mu2, mu2, mu2);
72 __m128i m3 = _mm_set_epi32 (mu3, mu3, mu3, mu3);
73 __m128i m4 = _mm_set_epi32 (mu4, mu4, mu4, mu4);
// x ^ x == 0: zero the per-lane accumulator.
75 mcnt = _mm_xor_si128(m1, m1);
// Loop body (opening `do {` elided): load the next aligned 128-bit word.
80 __m128i b = _mm_load_si128(block);
// b = (b & 0x5555...) + ((b >> 1) & 0x5555...): 2-bit partial counts.
84 tmp1 = _mm_srli_epi32(b, 1);
85 tmp1 = _mm_and_si128(tmp1, m1);
86 tmp2 = _mm_and_si128(b, m1);
87 b = _mm_add_epi32(tmp1, tmp2);
// b = (b & 0x3333...) + ((b >> 2) & 0x3333...): 4-bit partial counts.
90 tmp1 = _mm_srli_epi32(b, 2);
91 tmp1 = _mm_and_si128(tmp1, m2);
92 tmp2 = _mm_and_si128(b, m2);
93 b = _mm_add_epi32(tmp1, tmp2);
// b = (b + (b >> 4)) & 0x0F0F...: per-byte counts.
96 tmp1 = _mm_srli_epi32(b, 4);
97 b = _mm_add_epi32(b, tmp1);
98 b = _mm_and_si128(b, m3);
// Fold byte counts upward: add b >> 8 ...
101 tmp1 = _mm_srli_epi32 (b, 8);
102 b = _mm_add_epi32(b, tmp1);
// ... then b >> 16, keeping only the low 6 bits (a 32-bit lane holds <= 32).
105 tmp1 = _mm_srli_epi32 (b, 16);
106 b = _mm_add_epi32(b, tmp1);
107 b = _mm_and_si128(b, m4);
// Accumulate this word's per-lane counts.
109 mcnt = _mm_add_epi32(mcnt, b);
111 }
while (block < block_end);
// Spill the four lane accumulators and return their sum (total popcount).
115 _mm_store_si128((__m128i*)tcnt, mcnt);
117 return tcnt[0] + tcnt[1] + tcnt[2] + tcnt[3];
// --- sse2_bit_count_op (body fragment) ------------------------------------
// NOTE(review): same elided-listing caveat — leading integers are original
// line numbers; the signature, `do {`, and local declarations
// (mcnt/tmp1/tmp2/tcnt) are not visible.  Code left byte-identical.
//
// Same SWAR popcount ladder as sse2_bit_count, except the counted word is
// the result of a binary operation between the block and a mask block.
// `sse2_func` is the operation parameter (and/or/xor/sub, per the
// VECT_BITCOUNT_* macros in this file).
128 const unsigned mu1 = 0x55555555;
129 const unsigned mu2 = 0x33333333;
130 const unsigned mu3 = 0x0F0F0F0F;
131 const unsigned mu4 = 0x0000003F;
// Broadcast the popcount masks into all four lanes.
134 __m128i m1 = _mm_set_epi32 (mu1, mu1, mu1, mu1);
135 __m128i m2 = _mm_set_epi32 (mu2, mu2, mu2, mu2);
136 __m128i m3 = _mm_set_epi32 (mu3, mu3, mu3, mu3);
137 __m128i m4 = _mm_set_epi32 (mu4, mu4, mu4, mu4);
// Zero the per-lane accumulator.
139 mcnt = _mm_xor_si128(m1, m1);
// Load the next word from each stream; both pointers advance in lock-step.
143 __m128i b = _mm_load_si128(block++);
145 tmp1 = _mm_load_si128(mask_block++);
// Apply the logical op; the ladder below counts bits of the combined value.
147 b = sse2_func(b, tmp1);
// 2-bit partial counts.
150 tmp1 = _mm_srli_epi32(b, 1);
151 tmp1 = _mm_and_si128(tmp1, m1);
152 tmp2 = _mm_and_si128(b, m1);
153 b = _mm_add_epi32(tmp1, tmp2);
// 4-bit partial counts.
156 tmp1 = _mm_srli_epi32(b, 2);
157 tmp1 = _mm_and_si128(tmp1, m2);
158 tmp2 = _mm_and_si128(b, m2);
159 b = _mm_add_epi32(tmp1, tmp2);
// Per-byte counts.
162 tmp1 = _mm_srli_epi32(b, 4);
163 b = _mm_add_epi32(b, tmp1);
164 b = _mm_and_si128(b, m3);
// Fold byte counts into a per-32-bit-lane total.
167 tmp1 = _mm_srli_epi32 (b, 8);
168 b = _mm_add_epi32(b, tmp1);
171 tmp1 = _mm_srli_epi32 (b, 16);
172 b = _mm_add_epi32(b, tmp1);
173 b = _mm_and_si128(b, m4);
// Accumulate.
175 mcnt = _mm_add_epi32(mcnt, b);
177 }
while (block < block_end);
// Sum the four lane accumulators for the scalar result.
180 _mm_store_si128((__m128i*)tcnt, mcnt);
182 return tcnt[0] + tcnt[1] + tcnt[2] + tcnt[3];
// --- bit-count + count-change kernel (body fragment) ----------------------
// NOTE(review): elided listing — leading integers are original line numbers;
// the signature (presumably sse2_bit_block_calc_count_change — confirm),
// the declarations of mcnt/tmp1/tmp2/tcnt/w/w0/w_prev/first_word/bit_count,
// and the per-word loads of w0 from tcnt[] between the repeated stanzas are
// not visible.  Code left byte-identical.
//
// Appears to compute two results in a single pass over the block:
//   * `count`      — derived from 0<->1 transitions between adjacent bits,
//   * `*bit_count` — total population count (same SWAR ladder as above).
191 const unsigned mu1 = 0x55555555;
192 const unsigned mu2 = 0x33333333;
193 const unsigned mu3 = 0x0F0F0F0F;
194 const unsigned mu4 = 0x0000003F;
// Popcount masks broadcast to all lanes.
197 __m128i m1 = _mm_set_epi32 (mu1, mu1, mu1, mu1);
198 __m128i m2 = _mm_set_epi32 (mu2, mu2, mu2, mu2);
199 __m128i m3 = _mm_set_epi32 (mu3, mu3, mu3, mu3);
200 __m128i m4 = _mm_set_epi32 (mu4, mu4, mu4, mu4);
// Zero the popcount accumulator.
202 mcnt = _mm_xor_si128(m1, m1);
// Four 32-bit words per 128-bit block element.
207 int count = (int)(block_end - block)*4;
// Index of w's top bit (31 if w is a 32-bit word — w's decl is elided; verify).
210 const int w_shift =
sizeof(w) * 8 - 1;
211 bool first_word =
true;
// Seed the cross-word carry with the first word's top bit (w0 load elided).
219 count -= (w_prev = (w0 >> w_shift));
229 __m128i b = _mm_load_si128(block);
// tmp2 = b ^ (b >> 1): flags lane positions where adjacent bits differ;
// stored to tcnt[] for the scalar per-word pass further down.
232 tmp1 = _mm_srli_epi32(b, 1);
233 tmp2 = _mm_xor_si128(b, tmp1);
234 _mm_store_si128((__m128i*)tcnt, tmp2);
// Popcount ladder on b (tmp1 still holds b >> 1): 2-bit partials ...
242 tmp1 = _mm_and_si128(tmp1, m1);
243 tmp2 = _mm_and_si128(b, m1);
244 b = _mm_add_epi32(tmp1, tmp2);
// ... 4-bit partials ...
247 tmp1 = _mm_srli_epi32(b, 2);
248 tmp1 = _mm_and_si128(tmp1, m2);
249 tmp2 = _mm_and_si128(b, m2);
250 b = _mm_add_epi32(tmp1, tmp2);
// ... per-byte counts ...
253 tmp1 = _mm_srli_epi32(b, 4);
254 b = _mm_add_epi32(b, tmp1);
255 b = _mm_and_si128(b, m3);
// ... folded to a per-lane total (low 6 bits).
258 tmp1 = _mm_srli_epi32 (b, 8);
259 b = _mm_add_epi32(b, tmp1);
262 tmp1 = _mm_srli_epi32 (b, 16);
263 b = _mm_add_epi32(b, tmp1);
264 b = _mm_and_si128(b, m4);
266 mcnt = _mm_add_epi32(mcnt, b);
// Scalar pass over the four stored words (the w0 reloads and the main
// per-word accumulation between these stanzas are elided).  Each stanza:
// correct the boundary between the previous word's top bit and this word's
// bit 0, then carry this word's top bit forward.
287 count -= !(w_prev ^ (w0 & 1));
288 count -= w_prev = (w0 >> w_shift);
// `w_prev ^= w_prev` zeroes w_prev (equivalent to w_prev = 0).
292 count -= !w_prev; w_prev ^= w_prev;
298 count -= !(w_prev ^ (w0 & 1));
299 count -= w_prev = (w0 >> w_shift);
303 count -= !w_prev; w_prev ^= w_prev;
308 count -= !(w_prev ^ (w0 & 1));
309 count -= w_prev = (w0 >> w_shift);
313 count -= !w_prev; w_prev ^= w_prev;
318 count -= !(w_prev ^ (w0 & 1));
319 count -= w_prev = (w0 >> w_shift);
323 count -= !w_prev; w_prev ^= w_prev;
326 }
while (++block < block_end);
// Publish the population count through the out-parameter ...
328 _mm_store_si128((__m128i*)tcnt, mcnt);
329 *bit_count = tcnt[0] + tcnt[1] + tcnt[2] + tcnt[3];
// ... and return the transition-derived count.
331 return unsigned(count);
336 #pragma GCC diagnostic push
337 #pragma GCC diagnostic ignored "-Warray-bounds"
// --- SSE GAP search helper (body fragment) --------------------------------
// NOTE(review): elided listing — leading integers are original line numbers;
// the signature (presumably sse2_gap_find — confirm), the branches selecting
// scalar vs. SIMD paths, the code consuming `mi`, and the definitions of
// `pbuf`, `pbuf2`, `j`, `bsr_i` are not visible.  Code left byte-identical.
//
// Searches a short run of 16-bit GAP values for the first element >= pos.
350 const unsigned unroll_factor = 8;
// Scalar fallback scan (loop body elided).
354 for (j = 0; j < size; ++j)
362 __m128i m1, mz, maskF, maskFL;
364 mz = _mm_setzero_si128();
// Unaligned load of up to 8 16-bit elements.
365 m1 = _mm_loadu_si128((__m128i*)(pbuf));
// cmpeq(x, x) -> all-ones vector.
367 maskF = _mm_cmpeq_epi32(mz, mz);
// Byte-shift left by 8: ones in the upper 64 bits only.
368 maskFL = _mm_slli_si128(maskF, 4 * 2);
// Position the mask so it covers the (unroll_factor - size) unused tail
// slots of the loaded vector.
369 int shiftL = (64 - (unroll_factor - size) * 16);
370 maskFL = _mm_slli_epi64(maskFL, shiftL);
// Force the tail slots to 0xFFFF so they always compare as >= pos.
372 m1 = _mm_andnot_si128(maskFL, m1);
373 m1 = _mm_or_si128(m1, maskFL);
// Broadcast pos; saturating (pos - m1) == 0  <=>  m1 >= pos (unsigned 16-bit).
375 __m128i mp = _mm_set1_epi16(pos);
376 __m128i mge_mask = _mm_cmpeq_epi16(_mm_subs_epu16(mp, m1), mz);
377 int mi = _mm_movemask_epi8(mge_mask);
// Second vector (elided branch): same >=pos test on the next 8 elements.
391 m1 = _mm_loadu_si128((__m128i*)(pbuf2));
392 mge_mask = _mm_cmpeq_epi16(_mm_subs_epu16(mp, m1), mz);
393 mi = _mm_movemask_epi8(mge_mask);
// Map the matched lane index (bsr_i, computed in elided lines) back to a
// logical array index.
397 return size - (unroll_factor - bsr_i);
// --- GAP binary search (body fragment) ------------------------------------
// NOTE(review): elided listing — leading integers are original line numbers;
// the signature (presumably sse2_gap_bfind, per the VECT_GAP_BFIND macro in
// this file — confirm), the bisection loop around `curr`, and the calls into
// the SIMD find helper are not visible.  Code left byte-identical.
//
// Assumed GAP header layout (as this code uses it — verify against bm.h):
// bit 0 of buf[0] is the first run's bit value; buf[0] >> 3 indexes the last
// element, so `end` is one past it.
417 unsigned end = 1 + ((*buf) >> 3);
418 unsigned dsize = end - start;
// Runs alternate set/clear, so run-index parity gives the run's bit value.
423 *is_set = ((*buf) & 1) ^ (start & 1);
425 BM_ASSERT(buf[start] < pos || (start==0));
429 unsigned arr_end = end;
// Bisection midpoint (surrounding loop elided).
432 unsigned curr = (start + end) >> 1;
438 unsigned size = end - start;
// Include the right boundary element when the window was narrowed.
441 size += (end != arr_end);
447 BM_ASSERT(buf[start - 1] < pos || (start == 1));
452 *is_set = ((*buf) & 1) ^ ((start-1) & 1);
470 #pragma GCC diagnostic pop
// ---------------------------------------------------------------------------
// Vector-operation dispatch macros: map the generic VECT_* hooks used by the
// core bitset algorithms onto the sse2_* implementations in this header.
//
// NOTE(review): fixed here — extraction had fused original-file line numbers
// onto every line (ill-formed for the preprocessor); VECT_INVERT_BLOCK
// carried a stray trailing ';' (yielding ';;' at call sites and breaking use
// in expression contexts); macro parameters were inconsistently
// parenthesized.  Names, parameters and target functions are unchanged.
// ---------------------------------------------------------------------------

// dst[i] = src[i] ^ mask over [src, src_end)
#define VECT_XOR_ARR_2_MASK(dst, src, src_end, mask) \
    sse2_xor_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), (bm::word_t)(mask))

// AND-NOT variant of the masked array transform.
#define VECT_ANDNOT_ARR_2_MASK(dst, src, src_end, mask) \
    sse2_andnot_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), (bm::word_t)(mask))

// Population count over [first, last).
#define VECT_BITCOUNT(first, last) \
    sse2_bit_count((__m128i*)(first), (__m128i*)(last))

// Population count of (block OP mask) without materializing the result;
// the trailing functor selects the operation.
#define VECT_BITCOUNT_AND(first, last, mask) \
    sse2_bit_count_op((__m128i*)(first), (__m128i*)(last), (__m128i*)(mask), sse2_and)

#define VECT_BITCOUNT_OR(first, last, mask) \
    sse2_bit_count_op((__m128i*)(first), (__m128i*)(last), (__m128i*)(mask), sse2_or)

#define VECT_BITCOUNT_XOR(first, last, mask) \
    sse2_bit_count_op((__m128i*)(first), (__m128i*)(last), (__m128i*)(mask), sse2_xor)

#define VECT_BITCOUNT_SUB(first, last, mask) \
    sse2_bit_count_op((__m128i*)(first), (__m128i*)(last), (__m128i*)(mask), sse2_sub)

// In-place block inversion (no trailing ';' — call sites supply their own).
#define VECT_INVERT_BLOCK(first) \
    sse2_invert_block((__m128i*)(first))

#define VECT_AND_BLOCK(dst, src) \
    sse2_and_block((__m128i*)(dst), (__m128i*)(src))

#define VECT_OR_BLOCK(dst, src) \
    sse2_or_block((__m128i*)(dst), (__m128i*)(src))

#define VECT_OR_BLOCK_2WAY(dst, src1, src2) \
    sse2_or_block_2way((__m128i*)(dst), (__m128i*)(src1), (__m128i*)(src2))

#define VECT_OR_BLOCK_3WAY(dst, src1, src2) \
    sse2_or_block_3way((__m128i*)(dst), (__m128i*)(src1), (__m128i*)(src2))

#define VECT_OR_BLOCK_5WAY(dst, src1, src2, src3, src4) \
    sse2_or_block_5way((__m128i*)(dst), (__m128i*)(src1), (__m128i*)(src2), (__m128i*)(src3), (__m128i*)(src4))

#define VECT_SUB_BLOCK(dst, src) \
    sse2_sub_block((__m128i*)(dst), (__m128i*)(src))

#define VECT_XOR_BLOCK(dst, src) \
    sse2_xor_block((__m128i*)(dst), (__m128i*)(src))

#define VECT_XOR_BLOCK_2WAY(dst, src1, src2) \
    sse2_xor_block_2way((__m128i*)(dst), (const __m128i*)(src1), (const __m128i*)(src2))

#define VECT_COPY_BLOCK(dst, src) \
    sse2_copy_block((__m128i*)(dst), (__m128i*)(src))

#define VECT_STREAM_BLOCK(dst, src) \
    sse2_stream_block((__m128i*)(dst), (__m128i*)(src))

#define VECT_SET_BLOCK(dst, value) \
    sse2_set_block((__m128i*)(dst), (value))

// Binary search in a GAP block; run's bit value is written to *is_set.
#define VECT_GAP_BFIND(buf, pos, is_set) \
    sse2_gap_bfind((buf), (pos), (is_set))
539 #pragma GCC diagnostic pop