#ifndef BMSSE4__H__INCLUDED__
#define BMSSE4__H__INCLUDED__

#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wconversion"

#pragma warning( push )
#pragma warning( disable : 4146)
        count += unsigned( _mm_popcnt_u64(b[0]) +
                           _mm_popcnt_u64(b[1]));

        const unsigned* b = (unsigned*) block;
        count += _mm_popcnt_u32(b[0]) +
                 _mm_popcnt_u32(b[1]) +
                 _mm_popcnt_u32(b[2]) +
                 _mm_popcnt_u32(b[3]);
    } while (++block < block_end);
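    // The two counting loops above use the hardware POPCNT instruction:
    // the first accumulates two 64-bit lanes per 128-bit word, the second
    // is the 32-bit fallback that counts four lanes per word.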
    unsigned ret = (a ^ b);

unsigned op_or(unsigned a, unsigned b)

        __m128i tmp0 = _mm_load_si128(block);
        __m128i tmp1 = _mm_load_si128(mask_block);
        __m128i b = sse2_func(tmp0, tmp1);

        count += (unsigned)_mm_popcnt_u64(_mm_extract_epi64(b, 0));
        count += (unsigned)_mm_popcnt_u64(_mm_extract_epi64(b, 1));

        ++block; ++mask_block;
    } while (block < block_end);

        __m128i tmp0 = _mm_load_si128(block);
        __m128i tmp1 = _mm_load_si128(mask_block);
        __m128i b = sse2_func(tmp0, tmp1);

        count += _mm_popcnt_u32(_mm_extract_epi32(b, 0));
        count += _mm_popcnt_u32(_mm_extract_epi32(b, 1));
        count += _mm_popcnt_u32(_mm_extract_epi32(b, 2));
        count += _mm_popcnt_u32(_mm_extract_epi32(b, 3));

        ++block; ++mask_block;
    } while (block < block_end);
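    // sse4_bit_count_op: the functor sse2_func (AND/OR/XOR/SUB, see the
    // VECT_BITCOUNT_* macros below) is applied to each pair of 128-bit words
    // and the result is popcounted directly, so no temporary block is written.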
    __m128i maskz = _mm_setzero_si128();

        w = _mm_or_si128(_mm_load_si128(block+0), _mm_load_si128(block+1));
        if (!_mm_test_all_ones(_mm_cmpeq_epi8(w, maskz)))
            return false;
        w = _mm_or_si128(_mm_load_si128(block+2), _mm_load_si128(block+3));
        if (!_mm_test_all_ones(_mm_cmpeq_epi8(w, maskz)))
            return false;
        block += 4;
    } while (block < block_end);

    __m128i wA = _mm_or_si128(_mm_load_si128(block+0), _mm_load_si128(block+1));
    __m128i wB = _mm_or_si128(_mm_load_si128(block+2), _mm_load_si128(block+3));
    wA = _mm_or_si128(wA, wB);
    bool z1 = _mm_test_all_zeros(wA, wA);

    wA = _mm_or_si128(_mm_load_si128(block+4), _mm_load_si128(block+5));
    wB = _mm_or_si128(_mm_load_si128(block+6), _mm_load_si128(block+7));
    wA = _mm_or_si128(wA, wB);
    bool z2 = _mm_test_all_zeros(wA, wA);

    __m128i mV = _mm_set1_epi32(int(value));
    _mm_store_si128(dst, mV);     _mm_store_si128(dst + 1, mV);
    _mm_store_si128(dst + 2, mV); _mm_store_si128(dst + 3, mV);
    _mm_store_si128(dst + 4, mV); _mm_store_si128(dst + 5, mV);
    _mm_store_si128(dst + 6, mV); _mm_store_si128(dst + 7, mV);

    __m128i m1A, m1B, m1C, m1D;
    __m128i accA, accB, accC, accD;

    accA = accB = accC = accD = _mm_setzero_si128();

        m1A = _mm_and_si128(_mm_load_si128(src+0), _mm_load_si128(dst+0));
        m1B = _mm_and_si128(_mm_load_si128(src+1), _mm_load_si128(dst+1));
        m1C = _mm_and_si128(_mm_load_si128(src+2), _mm_load_si128(dst+2));
        m1D = _mm_and_si128(_mm_load_si128(src+3), _mm_load_si128(dst+3));

        _mm_store_si128(dst+0, m1A);
        _mm_store_si128(dst+1, m1B);
        _mm_store_si128(dst+2, m1C);
        _mm_store_si128(dst+3, m1D);

        accA = _mm_or_si128(accA, m1A);
        accB = _mm_or_si128(accB, m1B);
        accC = _mm_or_si128(accC, m1C);
        accD = _mm_or_si128(accD, m1D);

    } while (src < src_end);

    accA = _mm_or_si128(accA, accB);
    accC = _mm_or_si128(accC, accD);
    accA = _mm_or_si128(accA, accC);

    return !_mm_testz_si128(accA, accA);
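    // sse4_and_block above keeps four OR-accumulators over all AND results,
    // so a single final PTEST (_mm_testz_si128) reports whether any bit
    // survived the dst &= src pass.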
    __m128i m1A, m1B, m1C, m1D;

    m1A = _mm_and_si128(_mm_load_si128(src+0), _mm_load_si128(dst+0));
    m1B = _mm_and_si128(_mm_load_si128(src+1), _mm_load_si128(dst+1));
    m1C = _mm_and_si128(_mm_load_si128(src+2), _mm_load_si128(dst+2));
    m1D = _mm_and_si128(_mm_load_si128(src+3), _mm_load_si128(dst+3));

    _mm_store_si128(dst+0, m1A);
    _mm_store_si128(dst+1, m1B);
    _mm_store_si128(dst+2, m1C);
    _mm_store_si128(dst+3, m1D);

    m1A = _mm_or_si128(m1A, m1B);
    m1C = _mm_or_si128(m1C, m1D);
    m1A = _mm_or_si128(m1A, m1C);

    bool z1 = _mm_testz_si128(m1A, m1A);

    m1A = _mm_and_si128(_mm_load_si128(src+4), _mm_load_si128(dst+4));
    m1B = _mm_and_si128(_mm_load_si128(src+5), _mm_load_si128(dst+5));
    m1C = _mm_and_si128(_mm_load_si128(src+6), _mm_load_si128(dst+6));
    m1D = _mm_and_si128(_mm_load_si128(src+7), _mm_load_si128(dst+7));

    _mm_store_si128(dst+4, m1A);
    _mm_store_si128(dst+5, m1B);
    _mm_store_si128(dst+6, m1C);
    _mm_store_si128(dst+7, m1D);

    m1A = _mm_or_si128(m1A, m1B);
    m1C = _mm_or_si128(m1C, m1D);
    m1A = _mm_or_si128(m1A, m1C);

    bool z2 = _mm_testz_si128(m1A, m1A);
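    // Digest pattern used throughout this header: the 64-word stride is
    // processed as two halves, z1/z2 record whether each half became all
    // zero, and together they tell the caller whether the corresponding
    // digest bit can be cleared.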
    __m128i m1A, m1B, m1C, m1D;

    m1A = _mm_and_si128(_mm_load_si128(src1+0), _mm_load_si128(src2+0));
    m1B = _mm_and_si128(_mm_load_si128(src1+1), _mm_load_si128(src2+1));
    m1C = _mm_and_si128(_mm_load_si128(src1+2), _mm_load_si128(src2+2));
    m1D = _mm_and_si128(_mm_load_si128(src1+3), _mm_load_si128(src2+3));

    _mm_store_si128(dst+0, m1A);
    _mm_store_si128(dst+1, m1B);
    _mm_store_si128(dst+2, m1C);
    _mm_store_si128(dst+3, m1D);

    m1A = _mm_or_si128(m1A, m1B);
    m1C = _mm_or_si128(m1C, m1D);
    m1A = _mm_or_si128(m1A, m1C);

    bool z1 = _mm_testz_si128(m1A, m1A);

    m1A = _mm_and_si128(_mm_load_si128(src1+4), _mm_load_si128(src2+4));
    m1B = _mm_and_si128(_mm_load_si128(src1+5), _mm_load_si128(src2+5));
    m1C = _mm_and_si128(_mm_load_si128(src1+6), _mm_load_si128(src2+6));
    m1D = _mm_and_si128(_mm_load_si128(src1+7), _mm_load_si128(src2+7));

    _mm_store_si128(dst+4, m1A);
    _mm_store_si128(dst+5, m1B);
    _mm_store_si128(dst+6, m1C);
    _mm_store_si128(dst+7, m1D);

    m1A = _mm_or_si128(m1A, m1B);
    m1C = _mm_or_si128(m1C, m1D);
    m1A = _mm_or_si128(m1A, m1C);

    bool z2 = _mm_testz_si128(m1A, m1A);

    __m128i m1A, m1B, m1C, m1D;
    __m128i m1E, m1F, m1G, m1H;

    m1A = _mm_and_si128(_mm_load_si128(src1+0), _mm_load_si128(src2+0));
    m1B = _mm_and_si128(_mm_load_si128(src1+1), _mm_load_si128(src2+1));
    m1C = _mm_and_si128(_mm_load_si128(src1+2), _mm_load_si128(src2+2));
    m1D = _mm_and_si128(_mm_load_si128(src1+3), _mm_load_si128(src2+3));

    m1E = _mm_and_si128(_mm_load_si128(src3+0), _mm_load_si128(src4+0));
    m1F = _mm_and_si128(_mm_load_si128(src3+1), _mm_load_si128(src4+1));
    m1G = _mm_and_si128(_mm_load_si128(src3+2), _mm_load_si128(src4+2));
    m1H = _mm_and_si128(_mm_load_si128(src3+3), _mm_load_si128(src4+3));

    m1A = _mm_and_si128(m1A, m1E);
    m1B = _mm_and_si128(m1B, m1F);
    m1C = _mm_and_si128(m1C, m1G);
    m1D = _mm_and_si128(m1D, m1H);

    m1A = _mm_and_si128(m1A, _mm_load_si128(dst+0));
    m1B = _mm_and_si128(m1B, _mm_load_si128(dst+1));
    m1C = _mm_and_si128(m1C, _mm_load_si128(dst+2));
    m1D = _mm_and_si128(m1D, _mm_load_si128(dst+3));

    _mm_store_si128(dst+0, m1A);
    _mm_store_si128(dst+1, m1B);
    _mm_store_si128(dst+2, m1C);
    _mm_store_si128(dst+3, m1D);

    m1A = _mm_or_si128(m1A, m1B);
    m1C = _mm_or_si128(m1C, m1D);
    m1A = _mm_or_si128(m1A, m1C);

    bool z1 = _mm_testz_si128(m1A, m1A);

    m1A = _mm_and_si128(_mm_load_si128(src1+4), _mm_load_si128(src2+4));
    m1B = _mm_and_si128(_mm_load_si128(src1+5), _mm_load_si128(src2+5));
    m1C = _mm_and_si128(_mm_load_si128(src1+6), _mm_load_si128(src2+6));
    m1D = _mm_and_si128(_mm_load_si128(src1+7), _mm_load_si128(src2+7));

    m1E = _mm_and_si128(_mm_load_si128(src3+4), _mm_load_si128(src4+4));
    m1F = _mm_and_si128(_mm_load_si128(src3+5), _mm_load_si128(src4+5));
    m1G = _mm_and_si128(_mm_load_si128(src3+6), _mm_load_si128(src4+6));
    m1H = _mm_and_si128(_mm_load_si128(src3+7), _mm_load_si128(src4+7));

    m1A = _mm_and_si128(m1A, m1E);
    m1B = _mm_and_si128(m1B, m1F);
    m1C = _mm_and_si128(m1C, m1G);
    m1D = _mm_and_si128(m1D, m1H);

    m1A = _mm_and_si128(m1A, _mm_load_si128(dst+4));
    m1B = _mm_and_si128(m1B, _mm_load_si128(dst+5));
    m1C = _mm_and_si128(m1C, _mm_load_si128(dst+6));
    m1D = _mm_and_si128(m1D, _mm_load_si128(dst+7));

    _mm_store_si128(dst+4, m1A);
    _mm_store_si128(dst+5, m1B);
    _mm_store_si128(dst+6, m1C);
    _mm_store_si128(dst+7, m1D);

    m1A = _mm_or_si128(m1A, m1B);
    m1C = _mm_or_si128(m1C, m1D);
    m1A = _mm_or_si128(m1A, m1C);

    bool z2 = _mm_testz_si128(m1A, m1A);

    __m128i m1A, m1B, m1C, m1D;

    m1A = _mm_andnot_si128(_mm_load_si128(src+0), _mm_load_si128(dst+0));
    m1B = _mm_andnot_si128(_mm_load_si128(src+1), _mm_load_si128(dst+1));
    m1C = _mm_andnot_si128(_mm_load_si128(src+2), _mm_load_si128(dst+2));
    m1D = _mm_andnot_si128(_mm_load_si128(src+3), _mm_load_si128(dst+3));

    _mm_store_si128(dst+0, m1A);
    _mm_store_si128(dst+1, m1B);
    _mm_store_si128(dst+2, m1C);
    _mm_store_si128(dst+3, m1D);

    m1A = _mm_or_si128(m1A, m1B);
    m1C = _mm_or_si128(m1C, m1D);
    m1A = _mm_or_si128(m1A, m1C);

    bool z1 = _mm_testz_si128(m1A, m1A);

    m1A = _mm_andnot_si128(_mm_load_si128(src+4), _mm_load_si128(dst+4));
    m1B = _mm_andnot_si128(_mm_load_si128(src+5), _mm_load_si128(dst+5));
    m1C = _mm_andnot_si128(_mm_load_si128(src+6), _mm_load_si128(dst+6));
    m1D = _mm_andnot_si128(_mm_load_si128(src+7), _mm_load_si128(dst+7));

    _mm_store_si128(dst+4, m1A);
    _mm_store_si128(dst+5, m1B);
    _mm_store_si128(dst+6, m1C);
    _mm_store_si128(dst+7, m1D);

    m1A = _mm_or_si128(m1A, m1B);
    m1C = _mm_or_si128(m1C, m1D);
    m1A = _mm_or_si128(m1A, m1C);

    bool z2 = _mm_testz_si128(m1A, m1A);

    __m128i m1A, m1B, m1C, m1D;

    m1A = _mm_andnot_si128(_mm_load_si128(src2+0), _mm_load_si128(src1+0));
    m1B = _mm_andnot_si128(_mm_load_si128(src2+1), _mm_load_si128(src1+1));
    m1C = _mm_andnot_si128(_mm_load_si128(src2+2), _mm_load_si128(src1+2));
    m1D = _mm_andnot_si128(_mm_load_si128(src2+3), _mm_load_si128(src1+3));

    _mm_store_si128(dst+0, m1A);
    _mm_store_si128(dst+1, m1B);
    _mm_store_si128(dst+2, m1C);
    _mm_store_si128(dst+3, m1D);

    m1A = _mm_or_si128(m1A, m1B);
    m1C = _mm_or_si128(m1C, m1D);
    m1A = _mm_or_si128(m1A, m1C);

    bool z1 = _mm_testz_si128(m1A, m1A);

    m1A = _mm_andnot_si128(_mm_load_si128(src2+4), _mm_load_si128(src1+4));
    m1B = _mm_andnot_si128(_mm_load_si128(src2+5), _mm_load_si128(src1+5));
    m1C = _mm_andnot_si128(_mm_load_si128(src2+6), _mm_load_si128(src1+6));
    m1D = _mm_andnot_si128(_mm_load_si128(src2+7), _mm_load_si128(src1+7));

    _mm_store_si128(dst+4, m1A);
    _mm_store_si128(dst+5, m1B);
    _mm_store_si128(dst+6, m1C);
    _mm_store_si128(dst+7, m1D);

    m1A = _mm_or_si128(m1A, m1B);
    m1C = _mm_or_si128(m1C, m1D);
    m1A = _mm_or_si128(m1A, m1C);

    bool z2 = _mm_testz_si128(m1A, m1A);
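    // Note on the SUB variants above: _mm_andnot_si128(a, b) computes (~a) & b,
    // so the (src, dst) and (src2, src1) argument orders implement dst &= ~src
    // and src1 & ~src2 respectively.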
        w = _mm_and_si128(_mm_load_si128(block+0), _mm_load_si128(block+1));
        if (!_mm_test_all_ones(w))
            return false;
        w = _mm_and_si128(_mm_load_si128(block+2), _mm_load_si128(block+3));
        if (!_mm_test_all_ones(w))
            return false;
        block += 4;
    } while (block < block_end);

    return _mm_test_all_ones(_mm_loadu_si128((__m128i*)ptr));

    __m128i w0 = _mm_loadu_si128((__m128i*)ptr);
    return _mm_testz_si128(w0, w0);

    __m128i w0 = _mm_loadu_si128((__m128i*)ptr0);
    __m128i w1 = _mm_loadu_si128((__m128i*)ptr1);
    w0 = _mm_or_si128(w0, w1);
    return _mm_testz_si128(w0, w0);

    __m128i w0 = _mm_loadu_si128((__m128i*)ptr0);
    __m128i w1 = _mm_loadu_si128((__m128i*)ptr1);
    w0 = _mm_xor_si128(w0, w1);
    return _mm_testz_si128(w0, w0);

    const __m128i* block_end =
    __m128i m1COshft, m2COshft;

    unsigned co2, co1 = 0;
    for (;block < block_end; block += 2)
        __m128i m1A = _mm_load_si128(block);
        __m128i m2A = _mm_load_si128(block+1);

        __m128i m1CO = _mm_srli_epi32(m1A, 31);
        __m128i m2CO = _mm_srli_epi32(m2A, 31);

        co2 = _mm_extract_epi32(m1CO, 3);

        __m128i m1As = _mm_slli_epi32(m1A, 1);
        __m128i m2As = _mm_slli_epi32(m2A, 1);

        m1COshft = _mm_slli_si128 (m1CO, 4);
        m1COshft = _mm_insert_epi32 (m1COshft, co1, 0);

        co2 = _mm_extract_epi32(m2CO, 3);

        m2COshft = _mm_slli_si128 (m2CO, 4);
        m2COshft = _mm_insert_epi32 (m2COshft, co1, 0);

        m1As = _mm_or_si128(m1As, m1COshft);
        m2As = _mm_or_si128(m2As, m2COshft);

        m1A = _mm_xor_si128(m1A, m1As);
        m2A = _mm_xor_si128(m2A, m2As);

        count += unsigned(_mm_popcnt_u64(m0) + _mm_popcnt_u64(m1));

        m0 = _mm_extract_epi64(m2A, 0);
        m1 = _mm_extract_epi64(m2A, 1);
        count += unsigned(_mm_popcnt_u64(m0) + _mm_popcnt_u64(m1));

        bm::id_t m0 = _mm_extract_epi32(m1A, 0);
        bm::id_t m1 = _mm_extract_epi32(m1A, 1);
        bm::id_t m2 = _mm_extract_epi32(m1A, 2);
        bm::id_t m3 = _mm_extract_epi32(m1A, 3);
        count += unsigned(_mm_popcnt_u32(m0) + _mm_popcnt_u32(m1) +
                          _mm_popcnt_u32(m2) + _mm_popcnt_u32(m3));

        m0 = _mm_extract_epi32(m2A, 0);
        m1 = _mm_extract_epi32(m2A, 1);
        m2 = _mm_extract_epi32(m2A, 2);
        m3 = _mm_extract_epi32(m2A, 3);
        count += unsigned(_mm_popcnt_u32(m0) + _mm_popcnt_u32(m1) +
                          _mm_popcnt_u32(m2) + _mm_popcnt_u32(m3));
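    // Change (bit transition) counting above: every word is XORed with a copy
    // of itself shifted left by one bit; the bit that falls off each 32-bit
    // lane is re-inserted into the next lane (slli_si128 + insert_epi32) and
    // carried across loop iterations through co1/co2, so the popcount of the
    // XOR equals the number of 0<->1 transitions in the block.
    //
    // Scalar sketch of the same idea (illustrative only, not part of the API):
    //
    //   unsigned count = 0, co = 0;               // co = carry between words
    //   for (unsigned i = 0; i < len; ++i)
    //   {
    //       unsigned shifted = (w[i] << 1) | co;  // shift in previous MSB
    //       co = w[i] >> 31;                      // save this word's MSB
    //       count += unsigned(_mm_popcnt_u32(w[i] ^ shifted));
    //   }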
    const __m128i* block_end =
    __m128i m1COshft, m2COshft;

    unsigned co2, co1 = 0;
    for (;block < block_end; block += 2, xor_block += 2)
        __m128i m1A = _mm_load_si128(block);
        __m128i m2A = _mm_load_si128(block+1);
        __m128i m1B = _mm_load_si128(xor_block);
        __m128i m2B = _mm_load_si128(xor_block+1);

        m1A = _mm_xor_si128(m1A, m1B);
        m2A = _mm_xor_si128(m2A, m2B);

        __m128i m1CO = _mm_srli_epi32(m1A, 31);
        __m128i m2CO = _mm_srli_epi32(m2A, 31);

        co2 = _mm_extract_epi32(m1CO, 3);

        __m128i m1As = _mm_slli_epi32(m1A, 1);
        __m128i m2As = _mm_slli_epi32(m2A, 1);

        m1COshft = _mm_slli_si128 (m1CO, 4);
        m1COshft = _mm_insert_epi32 (m1COshft, co1, 0);

        co2 = _mm_extract_epi32(m2CO, 3);

        m2COshft = _mm_slli_si128 (m2CO, 4);
        m2COshft = _mm_insert_epi32 (m2COshft, co1, 0);

        m1As = _mm_or_si128(m1As, m1COshft);
        m2As = _mm_or_si128(m2As, m2COshft);

        m1A = _mm_xor_si128(m1A, m1As);
        m2A = _mm_xor_si128(m2A, m2As);

        count += unsigned(_mm_popcnt_u64(m0) + _mm_popcnt_u64(m1));

        m0 = _mm_extract_epi64(m2A, 0);
        m1 = _mm_extract_epi64(m2A, 1);
        count += unsigned(_mm_popcnt_u64(m0) + _mm_popcnt_u64(m1));

        bm::id_t m0 = _mm_extract_epi32(m1A, 0);
        bm::id_t m1 = _mm_extract_epi32(m1A, 1);
        bm::id_t m2 = _mm_extract_epi32(m1A, 2);
        bm::id_t m3 = _mm_extract_epi32(m1A, 3);
        count += unsigned(_mm_popcnt_u32(m0) + _mm_popcnt_u32(m1) +
                          _mm_popcnt_u32(m2) + _mm_popcnt_u32(m3));

        m0 = _mm_extract_epi32(m2A, 0);
        m1 = _mm_extract_epi32(m2A, 1);
        m2 = _mm_extract_epi32(m2A, 2);
        m3 = _mm_extract_epi32(m2A, 3);
        count += unsigned(_mm_popcnt_u32(m0) + _mm_popcnt_u32(m1) +
                          _mm_popcnt_u32(m2) + _mm_popcnt_u32(m3));

                                    unsigned* gc, unsigned* bc)
    const __m128i* block_end =
    __m128i m1COshft, m2COshft;

    unsigned bit_count = 0;
    unsigned gap_count = 1;

    unsigned co2, co1 = 0;
    for (;block < block_end; block += 2)
        __m128i m1A = _mm_load_si128(block);
        __m128i m2A = _mm_load_si128(block+1);

        bit_count += unsigned(_mm_popcnt_u64(m0) + _mm_popcnt_u64(m1));
        m0 = _mm_extract_epi64(m2A, 0);
        m1 = _mm_extract_epi64(m2A, 1);
        bit_count += unsigned(_mm_popcnt_u64(m0) + _mm_popcnt_u64(m1));

        __m128i m1CO = _mm_srli_epi32(m1A, 31);
        __m128i m2CO = _mm_srli_epi32(m2A, 31);

        co2 = _mm_extract_epi32(m1CO, 3);

        __m128i m1As = _mm_slli_epi32(m1A, 1);
        __m128i m2As = _mm_slli_epi32(m2A, 1);

        m1COshft = _mm_slli_si128 (m1CO, 4);
        m1COshft = _mm_insert_epi32 (m1COshft, co1, 0);

        co2 = _mm_extract_epi32(m2CO, 3);

        m2COshft = _mm_slli_si128 (m2CO, 4);
        m2COshft = _mm_insert_epi32 (m2COshft, co1, 0);

        m1As = _mm_or_si128(m1As, m1COshft);
        m2As = _mm_or_si128(m2As, m2COshft);

        m1A = _mm_xor_si128(m1A, m1As);
        m2A = _mm_xor_si128(m2A, m2As);

        gap_count += unsigned(_mm_popcnt_u64(m0) + _mm_popcnt_u64(m1));

        gap_count += unsigned(_mm_popcnt_u64(m0) + _mm_popcnt_u64(m1));

    gap_count -= (w0 & 1u);
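    // The _bc variant above produces bit count and gap (change) count in one
    // pass: POPCNT of the raw 64-bit lanes feeds bit_count, the shifted-XOR
    // trick feeds gap_count, and the final "gap_count -= (w0 & 1u)" removes
    // the spurious transition counted when the first bit of the block is set.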
    const __m128i* block1_end =
    __m128i maskZ = _mm_setzero_si128();

    unsigned simd_lane = 0;

        mA = _mm_xor_si128(_mm_load_si128(block1), _mm_load_si128(block2));
        mB = _mm_xor_si128(_mm_load_si128(block1+1), _mm_load_si128(block2+1));
        __m128i mOR = _mm_or_si128(mA, mB);
        if (!_mm_test_all_zeros(mOR, mOR))
            if (!_mm_test_all_zeros(mA, mA))
                unsigned mask = _mm_movemask_epi8(_mm_cmpeq_epi32(mA, maskZ));
                int bsf = bm::bsf_asm32(mask);
                _mm_store_si128 ((__m128i*)simd_buf, mA);
                unsigned widx = bsf >> 2;
                unsigned w = simd_buf[widx];
                bsf = bm::bsf_asm32(w);
                *pos = (simd_lane * 128) + (widx * 32) + bsf;

            unsigned mask = _mm_movemask_epi8(_mm_cmpeq_epi32(mB, maskZ));
            int bsf = bm::bsf_asm32(mask);
            _mm_store_si128 ((__m128i*)simd_buf, mB);
            unsigned widx = bsf >> 2;
            unsigned w = simd_buf[widx];
            bsf = bm::bsf_asm32(w);
            *pos = ((++simd_lane) * 128) + (widx * 32) + bsf;

        block1+=2; block2+=2;
    } while (block1 < block1_end);
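    // First-difference search: the byte mask produced by cmpeq/movemask is used
    // to locate the first non-zero 32-bit lane (4 mask bits per lane, hence
    // bsf >> 2); the wave is spilled to simd_buf and a second BSF inside the
    // selected word gives the bit offset, so *pos combines the 128-bit lane
    // number, the word index and the bit index.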
    const __m128i* block_end =
    __m128i maskZ = _mm_setzero_si128();

    unsigned simd_lane = 0;

        mA = _mm_load_si128(block); mB = _mm_load_si128(block+1);
        __m128i mOR = _mm_or_si128(mA, mB);
        if (!_mm_test_all_zeros(mOR, mOR))
            if (!_mm_test_all_zeros(mA, mA))
                unsigned mask = _mm_movemask_epi8(_mm_cmpeq_epi32(mA, maskZ));
                int bsf = bm::bsf_asm32(mask);
                _mm_store_si128 ((__m128i*)simd_buf, mA);
                unsigned widx = bsf >> 2;
                unsigned w = simd_buf[widx];
                bsf = bm::bsf_asm32(w);
                *pos = (simd_lane * 128) + (widx * 32) + bsf;

            unsigned mask = _mm_movemask_epi8(_mm_cmpeq_epi32(mB, maskZ));
            int bsf = bm::bsf_asm32(mask);
            _mm_store_si128 ((__m128i*)simd_buf, mB);
            unsigned widx = bsf >> 2;
            unsigned w = simd_buf[widx];
            bsf = bm::bsf_asm32(w);
            *pos = ((++simd_lane) * 128) + (widx * 32) + bsf;

    } while (block < block_end);

#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Warray-bounds"

    const unsigned unroll_factor = 8;

    for (j = 0; j < size; ++j)

    __m128i m1, mz, maskF, maskFL;

    mz = _mm_setzero_si128();
    m1 = _mm_loadu_si128((__m128i*)(pbuf));

    maskF = _mm_cmpeq_epi64(mz, mz);
    maskFL = _mm_slli_si128(maskF, 4 * 2);
    int shiftL = (64 - (unroll_factor - size) * 16);
    maskFL = _mm_slli_epi64(maskFL, shiftL);

    m1 = _mm_andnot_si128(maskFL, m1);
    m1 = _mm_or_si128(m1, maskFL);

    __m128i mp = _mm_set1_epi16(pos);
    __m128i mge_mask = _mm_cmpeq_epi16(_mm_subs_epu16(mp, m1), mz);
    __m128i c_mask = _mm_slli_epi16(mge_mask, 15);
    int mi = _mm_movemask_epi8(c_mask);

    unsigned bc = _mm_popcnt_u32(mi);
    return unroll_factor - bc;

    m1 = _mm_loadu_si128((__m128i*)(pbuf2));
    mge_mask = _mm_cmpeq_epi16(_mm_subs_epu16(mp, m1), mz);
    mi = _mm_movemask_epi8(_mm_slli_epi16(mge_mask, 15));
    unsigned bc = _mm_popcnt_u32(mi);
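    // The GAP search above compares pos against eight packed 16-bit GAP values
    // at once: _mm_subs_epu16(mp, m1) saturates to zero exactly where m1[i] >= pos,
    // so cmpeq-to-zero / movemask / popcount yields how many elements are >= pos
    // and, the GAP buffer being sorted, the index of the first such element.
    // The maskFL fill appears to pad the unused tail lanes with 0xFFFF so they
    // always compare as greater-or-equal.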
    unsigned end = 1 + ((*buf) >> 3);
    unsigned dsize = end - start;

        *is_set = ((*buf) & 1) ^ (start & 1);

        BM_ASSERT(buf[start] < pos || (start==0));

    unsigned arr_end = end;
    while (start != end)
        unsigned curr = (start + end) >> 1;
        if (buf[curr] < pos)

    unsigned size = end - start;

    size += (end != arr_end);

    BM_ASSERT(buf[start - 1] < pos || (start == 1));

    *is_set = ((*buf) & 1) ^ ((start-1) & 1);

    __m128i mask0x8 = _mm_set1_epi32(0x80000000);
    __m128i mm_val = _mm_set1_epi32(value);

    __m128i norm_vect4 = _mm_sub_epi32(vect4, mask0x8);
    __m128i norm_val = _mm_sub_epi32(mm_val, mask0x8);

    __m128i cmp_mask_gt = _mm_cmpgt_epi32 (norm_vect4, norm_val);
    __m128i cmp_mask_eq = _mm_cmpeq_epi32 (mm_val, vect4);

    __m128i cmp_mask_ge = _mm_or_si128 (cmp_mask_gt, cmp_mask_eq);
    int mask = _mm_movemask_epi8(cmp_mask_ge);

        int bsf = bm::bsf_asm32(mask);
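    // Unsigned compare idiom: SSE4.1 has no unsigned 32-bit comparison, so both
    // operands are biased by 0x80000000 (subtracting the top bit is the same as
    // flipping it) and compared as signed; OR-ing the GT and EQ masks then gives
    // an unsigned ">=" mask. Scalar equivalent, shown only for illustration:
    //
    //   bool ge_u32(unsigned a, unsigned b)
    //   { return int(a ^ 0x80000000u) >= int(b ^ 0x80000000u); }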
    const unsigned* BMRESTRICT arr_base = &arr[from];

    unsigned unroll_factor = 8;
    unsigned len = to - from + 1;
    unsigned len_unr = len - (len % unroll_factor);

    __m128i mask0x8 = _mm_set1_epi32(0x80000000);
    __m128i vect_target = _mm_set1_epi32(target);
    __m128i norm_target = _mm_sub_epi32(vect_target, mask0x8);

    __m128i vect40, vect41, norm_vect40, norm_vect41, cmp_mask_ge;

    for (; k < len_unr; k+=unroll_factor)
        vect40 = _mm_loadu_si128((__m128i*)(&arr_base[k]));
        norm_vect40 = _mm_sub_epi32(vect40, mask0x8);

        cmp_mask_ge = _mm_or_si128(
                        _mm_cmpgt_epi32 (norm_vect40, norm_target),
                        _mm_cmpeq_epi32 (vect40, vect_target));
        mask = _mm_movemask_epi8(cmp_mask_ge);

            int bsf = bm::bsf_asm32(mask);
            return from + k + (bsf / 4);

        vect41 = _mm_loadu_si128((__m128i*)(&arr_base[k+4]));
        norm_vect41 = _mm_sub_epi32(vect41, mask0x8);

        cmp_mask_ge = _mm_or_si128(
                        _mm_cmpgt_epi32 (norm_vect41, norm_target),
                        _mm_cmpeq_epi32 (vect41, vect_target));
        mask = _mm_movemask_epi8(cmp_mask_ge);

            int bsf = bm::bsf_asm32(mask);
            return 4 + from + k + (bsf / 4);

    for (; k < len; ++k)
        if (arr_base[k] >= target)

                               unsigned nb, unsigned start)
    const unsigned unroll_factor = 8;
    const unsigned len = (size - start);
    const unsigned len_unr = len - (len % unroll_factor);

    __m128i nbM = _mm_set1_epi32(nb);

    for (k = 0; k < len_unr; k+=unroll_factor)
        __m128i idxA = _mm_loadu_si128((__m128i*)(idx+k));
        __m128i idxB = _mm_loadu_si128((__m128i*)(idx+k+4));

        if (!_mm_test_all_ones(_mm_cmpeq_epi32(nbM, nbA)) |
            !_mm_test_all_ones(_mm_cmpeq_epi32 (nbM, nbB)))
            break;

    for (; k < len; ++k)
                          unsigned start, unsigned stop )
    const unsigned unroll_factor = 4;
    const unsigned len = (stop - start);
    const unsigned len_unr = len - (len % unroll_factor);

    for (; k < len_unr; k+=unroll_factor)
        __m128i idxA = _mm_loadu_si128((__m128i*)(idx+k));
        __m128i nbitA = _mm_and_si128 (idxA, sb_mask);

        nbitA = _mm_and_si128 (nbitA, sw_mask);
        _mm_store_si128 ((__m128i*)mshift_v, nbitA);

        __m128i nwordA_0 = _mm_shuffle_epi32(nwordA, 0x0);
        __m128i cmpA = _mm_cmpeq_epi32(nwordA_0, nwordA);
        if (_mm_test_all_ones(cmpA))
            unsigned nword = _mm_extract_epi32(nwordA, 0);
            block[nword] |= (1u << mshift_v[0]) | (1u << mshift_v[1])
                            | (1u << mshift_v[2]) | (1u << mshift_v[3]);

            _mm_store_si128 ((__m128i*)mword_v, nwordA);

            block[mword_v[0]] |= (1u << mshift_v[0]);
            block[mword_v[1]] |= (1u << mshift_v[1]);
            block[mword_v[2]] |= (1u << mshift_v[2]);
            block[mword_v[3]] |= (1u << mshift_v[3]);

    for (; k < len; ++k)
        unsigned n = idx[k];

        block[nword] |= (1u << nbit);
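    // sse42_set_block_bits: each index is split into a word number (nwordA) and
    // an in-word shift (mshift_v). If all four indices of a group fall into the
    // same 32-bit word, the four single-bit masks are OR-ed and written with one
    // store; otherwise the words are updated individually, with a scalar tail
    // loop handling the remainder.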
    const unsigned unroll_factor = 4;
    const unsigned len = (size - start);
    const unsigned len_unr = len - (len % unroll_factor);

    __m128i maskFF = _mm_set1_epi32(~0u);
    __m128i maskZ = _mm_xor_si128(maskFF, maskFF);

    __m128i mask_tmp, mask_0;

    unsigned base = start + k;
    __m128i* idx_ptr = (__m128i*)(idx + base);
    __m128i* target_ptr = (__m128i*)(arr + base);
    for (; k < len_unr; k+=unroll_factor)
        __m128i nbitA = _mm_and_si128 (_mm_loadu_si128(idx_ptr), sb_mask);

        _mm_store_si128 ((__m128i*)mshift_v, _mm_and_si128 (nbitA, sw_mask));
        _mm_store_si128 ((__m128i*)mword_v, nwordA);

        __m128i am_0 = _mm_set_epi32(0, 0, 0, ~0u);
        __m128i mask1 = _mm_srli_epi32 (maskFF, 31);
        mask_0 = _mm_and_si128 (_mm_slli_epi32 (mask1, mshift_v[0]), am_0);
        mask_tmp = _mm_and_si128 (_mm_slli_epi32(mask1, mshift_v[1]), _mm_slli_si128 (am_0, 4));
        mask_0 = _mm_or_si128 (mask_0, mask_tmp);

        __m128i mask_2 = _mm_and_si128 (_mm_slli_epi32 (mask1, mshift_v[2]),
                                        _mm_slli_si128 (am_0, 8));
        mask_tmp = _mm_and_si128 (
                        _mm_slli_epi32(mask1, mshift_v[3]),
                        _mm_slli_si128 (am_0, 12));

        mask_0 = _mm_or_si128 (mask_0,
                               _mm_or_si128 (mask_2, mask_tmp));

        mask_0 = _mm_set_epi32(1 << mshift_v[3], 1 << mshift_v[2], 1 << mshift_v[1], 1 << mshift_v[0]);

        mask_tmp = _mm_and_si128(_mm_set_epi32(blk[mword_v[3]], blk[mword_v[2]],
                                               blk[mword_v[1]], blk[mword_v[0]]),
                                 mask_0);

        mask_tmp = _mm_cmpeq_epi32 (mask_tmp, maskZ);
        mask_tmp = _mm_xor_si128 (mask_tmp, maskFF);
        mask_tmp = _mm_srli_epi32 (mask_tmp, 31);

        mask_tmp = _mm_slli_epi32(mask_tmp, bit_idx);

        _mm_storeu_si128 (target_ptr,
                          _mm_or_si128 (mask_tmp, _mm_loadu_si128(target_ptr)));

        ++idx_ptr; ++target_ptr;
        _mm_prefetch((const char*)target_ptr, _MM_HINT_T0);
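    // Gather/test step above: the four words addressed by mword_v are gathered
    // with _mm_set_epi32, AND-ed with per-lane single-bit masks (mask_0),
    // normalized to 0/1 via cmpeq/xor/srli, shifted to the output bit position
    // (bit_idx) and OR-ed into the destination array, prefetching the next
    // target wave.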
    for (; k < len; ++k)

    __m128i* block_end =
    __m128i mAcc = _mm_set1_epi32(0);
    __m128i mMask1 = _mm_set1_epi32(1);

    for (--block_end; block_end >= block; block_end -= 2)
        __m128i m1A = _mm_load_si128(block_end);
        __m128i m2A = _mm_load_si128(block_end-1);

        __m128i m1CO = _mm_and_si128(m1A, mMask1);
        __m128i m2CO = _mm_and_si128(m2A, mMask1);

        co2 = _mm_extract_epi32(m1CO, 0);

        m1A = _mm_srli_epi32(m1A, 1);
        m2A = _mm_srli_epi32(m2A, 1);

        __m128i m1COshft = _mm_srli_si128 (m1CO, 4);
        __m128i m2COshft = _mm_srli_si128 (m2CO, 4);
        m1COshft = _mm_insert_epi32 (m1COshft, co1, 3);
        m2COshft = _mm_insert_epi32 (m2COshft, co2, 3);
        m1COshft = _mm_slli_epi32(m1COshft, 31);
        m2COshft = _mm_slli_epi32(m2COshft, 31);

        m1A = _mm_or_si128(m1A, m1COshft);
        m2A = _mm_or_si128(m2A, m2COshft);

        co1 = _mm_extract_epi32(m2CO, 0);

        _mm_store_si128(block_end, m1A);
        _mm_store_si128(block_end-1, m2A);

        mAcc = _mm_or_si128(mAcc, m1A);
        mAcc = _mm_or_si128(mAcc, m2A);

    *empty_acc = !_mm_testz_si128(mAcc, mAcc);

    __m128i* block_end =
    __m128i m1COshft, m2COshft;
    __m128i mAcc = _mm_set1_epi32(0);

    for (;block < block_end; block += 2)
        __m128i m1A = _mm_load_si128(block);
        __m128i m2A = _mm_load_si128(block+1);

        __m128i m1CO = _mm_srli_epi32(m1A, 31);
        __m128i m2CO = _mm_srli_epi32(m2A, 31);

        co2 = _mm_extract_epi32(m1CO, 3);

        m1A = _mm_slli_epi32(m1A, 1);
        m2A = _mm_slli_epi32(m2A, 1);

        m1COshft = _mm_slli_si128 (m1CO, 4);
        m2COshft = _mm_slli_si128 (m2CO, 4);
        m1COshft = _mm_insert_epi32 (m1COshft, co1, 0);
        m2COshft = _mm_insert_epi32 (m2COshft, co2, 0);

        m1A = _mm_or_si128(m1A, m1COshft);
        m2A = _mm_or_si128(m2A, m2COshft);

        co1 = _mm_extract_epi32(m2CO, 3);

        _mm_store_si128(block, m1A);
        _mm_store_si128(block+1, m2A);

        mAcc = _mm_or_si128(mAcc, m1A);
        mAcc = _mm_or_si128(mAcc, m2A);

    *empty_acc = !_mm_testz_si128(mAcc, mAcc);
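    // sse42_shift_l1 / sse42_shift_r1 above shift a whole block by one bit:
    // the bit leaving each 32-bit lane is captured (AND with 1, or srli by 31),
    // moved one lane over with a 4-byte si128 shift, and the carry crossing the
    // 128-bit boundary is handed over through co1/co2. mAcc ORs every result so
    // *empty_acc reports whether the block is still non-zero.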
    __m128i m1COshft, m2COshft;
    __m128i mAcc = _mm_set1_epi32(0);

    di = unsigned(_mm_popcnt_u64(t - 1));

    di += unsigned(_mm_popcnt_u32(t32 - 1));

    for (; di < 64 ; ++di)

        block = (__m128i*) &wblock[d_base];
        mask_block = (__m128i*) &mblock[d_base];
        mAcc = _mm_xor_si128(mAcc, mAcc);
        for (unsigned i = 0; i < 4; ++i, block += 2, mask_block += 2)
            __m128i m1A = _mm_load_si128(block);
            __m128i m2A = _mm_load_si128(block+1);

            __m128i m1CO = _mm_srli_epi32(m1A, 31);
            __m128i m2CO = _mm_srli_epi32(m2A, 31);

            co2 = _mm_extract_epi32(m1CO, 3);

            m1A = _mm_slli_epi32(m1A, 1);
            m2A = _mm_slli_epi32(m2A, 1);

            m1COshft = _mm_slli_si128 (m1CO, 4);
            m1COshft = _mm_insert_epi32 (m1COshft, co1, 0);

            co2 = _mm_extract_epi32(m2CO, 3);

            m2COshft = _mm_slli_si128 (m2CO, 4);
            m2COshft = _mm_insert_epi32 (m2COshft, co1, 0);

            m1A = _mm_or_si128(m1A, m1COshft);
            m2A = _mm_or_si128(m2A, m2COshft);

            m1A = _mm_and_si128(m1A, _mm_load_si128(mask_block));
            m2A = _mm_and_si128(m2A, _mm_load_si128(mask_block+1));

            mAcc = _mm_or_si128(mAcc, m1A);
            mAcc = _mm_or_si128(mAcc, m2A);

            _mm_store_si128(block, m1A);
            _mm_store_si128(block+1, m2A);

        if (_mm_testz_si128(mAcc, mAcc))

        bm::id64_t w0 = wblock[d_base] = co1 & mblock[d_base];
        d |= (dmask & (w0 << di));
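    // sse42_shift_r1_and walks only the 64-word stripes selected by the digest
    // (di is recovered from the digest word with POPCNT over the isolated low
    // bits), shifts each stripe by one bit with the same co1/co2 carry chain as
    // above and ANDs it with the mask block; the trailing lines appear to
    // propagate the final carry (co1) through the mask word and fold the result
    // back into the digest d.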
    const __m128i* sub_block = (__m128i*) (block + off);
    __m128i* t_sub_block = (__m128i*)(target_block + off);

        const __m128i* xor_sub_block = (__m128i*) (xor_block + off);
        __m128i mA, mB, mC, mD;
        mA = _mm_xor_si128(_mm_load_si128(sub_block),
                           _mm_load_si128(xor_sub_block));
        mB = _mm_xor_si128(_mm_load_si128(sub_block+1),
                           _mm_load_si128(xor_sub_block+1));
        mC = _mm_xor_si128(_mm_load_si128(sub_block+2),
                           _mm_load_si128(xor_sub_block+2));
        mD = _mm_xor_si128(_mm_load_si128(sub_block+3),
                           _mm_load_si128(xor_sub_block+3));

        _mm_store_si128(t_sub_block, mA);
        _mm_store_si128(t_sub_block+1, mB);
        _mm_store_si128(t_sub_block+2, mC);
        _mm_store_si128(t_sub_block+3, mD);

        mA = _mm_xor_si128(_mm_load_si128(sub_block+4),
                           _mm_load_si128(xor_sub_block+4));
        mB = _mm_xor_si128(_mm_load_si128(sub_block+5),
                           _mm_load_si128(xor_sub_block+5));
        mC = _mm_xor_si128(_mm_load_si128(sub_block+6),
                           _mm_load_si128(xor_sub_block+6));
        mD = _mm_xor_si128(_mm_load_si128(sub_block+7),
                           _mm_load_si128(xor_sub_block+7));

        _mm_store_si128(t_sub_block+4, mA);
        _mm_store_si128(t_sub_block+5, mB);
        _mm_store_si128(t_sub_block+6, mC);
        _mm_store_si128(t_sub_block+7, mD);

        _mm_store_si128(t_sub_block,   _mm_load_si128(sub_block));
        _mm_store_si128(t_sub_block+1, _mm_load_si128(sub_block+1));
        _mm_store_si128(t_sub_block+2, _mm_load_si128(sub_block+2));
        _mm_store_si128(t_sub_block+3, _mm_load_si128(sub_block+3));

        _mm_store_si128(t_sub_block+4, _mm_load_si128(sub_block+4));
        _mm_store_si128(t_sub_block+5, _mm_load_si128(sub_block+5));
        _mm_store_si128(t_sub_block+6, _mm_load_si128(sub_block+6));
        _mm_store_si128(t_sub_block+7, _mm_load_si128(sub_block+7));
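    // sse42_bit_block_xor materializes target = block XOR xor_block one digest
    // stripe (off) at a time; the second group of stores appears to be the
    // pass-through branch that simply copies the source stripe when the XOR
    // partner is not selected for that stripe.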
#define VECT_XOR_ARR_2_MASK(dst, src, src_end, mask)\
    sse2_xor_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), (bm::word_t)mask)

#define VECT_ANDNOT_ARR_2_MASK(dst, src, src_end, mask)\
    sse2_andnot_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), (bm::word_t)mask)

#define VECT_BITCOUNT(first, last) \
    sse4_bit_count((__m128i*) (first), (__m128i*) (last))

#define VECT_BITCOUNT_AND(first, last, mask) \
    sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_and)

#define VECT_BITCOUNT_OR(first, last, mask) \
    sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_or)

#define VECT_BITCOUNT_XOR(first, last, mask) \
    sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_xor)

#define VECT_BITCOUNT_SUB(first, last, mask) \
    sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_sub)

#define VECT_INVERT_BLOCK(first) \
    sse2_invert_block((__m128i*)first);

#define VECT_AND_BLOCK(dst, src) \
    sse4_and_block((__m128i*) dst, (__m128i*) (src))

#define VECT_AND_DIGEST(dst, src) \
    sse4_and_digest((__m128i*) dst, (const __m128i*) (src))

#define VECT_AND_DIGEST_5WAY(dst, src1, src2, src3, src4) \
    sse4_and_digest_5way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2), (const __m128i*) (src3), (const __m128i*) (src4))

#define VECT_AND_DIGEST_2WAY(dst, src1, src2) \
    sse4_and_digest_2way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2))

#define VECT_OR_BLOCK(dst, src) \
    sse2_or_block((__m128i*) dst, (__m128i*) (src))

#define VECT_OR_BLOCK_2WAY(dst, src1, src2) \
    sse2_or_block_2way((__m128i*) (dst), (const __m128i*) (src1), (const __m128i*) (src2))

#define VECT_OR_BLOCK_3WAY(dst, src1, src2) \
    sse2_or_block_3way((__m128i*) (dst), (const __m128i*) (src1), (const __m128i*) (src2))

#define VECT_OR_BLOCK_5WAY(dst, src1, src2, src3, src4) \
    sse2_or_block_5way((__m128i*) (dst), (__m128i*) (src1), (__m128i*) (src2), (__m128i*) (src3), (__m128i*) (src4))

#define VECT_SUB_BLOCK(dst, src) \
    sse2_sub_block((__m128i*) dst, (const __m128i*) (src))

#define VECT_SUB_DIGEST(dst, src) \
    sse4_sub_digest((__m128i*) dst, (const __m128i*) (src))

#define VECT_SUB_DIGEST_2WAY(dst, src1, src2) \
    sse4_sub_digest_2way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2))

#define VECT_XOR_BLOCK(dst, src) \
    sse2_xor_block((__m128i*) dst, (__m128i*) (src))

#define VECT_XOR_BLOCK_2WAY(dst, src1, src2) \
    sse2_xor_block_2way((__m128i*) (dst), (const __m128i*) (src1), (const __m128i*) (src2))

#define VECT_COPY_BLOCK(dst, src) \
    sse2_copy_block((__m128i*) dst, (__m128i*) (src))

#define VECT_STREAM_BLOCK(dst, src) \
    sse2_stream_block((__m128i*) dst, (__m128i*) (src))

#define VECT_SET_BLOCK(dst, value) \
    sse2_set_block((__m128i*) dst, value)

#define VECT_IS_ZERO_BLOCK(dst) \
    sse4_is_all_zero((__m128i*) dst)

#define VECT_IS_ONE_BLOCK(dst) \
    sse4_is_all_one((__m128i*) dst)

#define VECT_IS_DIGEST_ZERO(start) \
    sse4_is_digest_zero((__m128i*)start)

#define VECT_BLOCK_SET_DIGEST(dst, val) \
    sse4_block_set_digest((__m128i*)dst, val)

#define VECT_LOWER_BOUND_SCAN_U32(arr, target, from, to) \
    sse4_lower_bound_scan_u32(arr, target, from, to)

#define VECT_SHIFT_L1(b, acc, co) \
    sse42_shift_l1((__m128i*)b, acc, co)

#define VECT_SHIFT_R1(b, acc, co) \
    sse42_shift_r1((__m128i*)b, acc, co)

#define VECT_SHIFT_R1_AND(b, co, m, digest) \
    sse42_shift_r1_and((__m128i*)b, co, (__m128i*)m, digest)

#define VECT_ARR_BLOCK_LOOKUP(idx, size, nb, start) \
    sse42_idx_arr_block_lookup(idx, size, nb, start)

#define VECT_SET_BLOCK_BITS(block, idx, start, stop) \
    sse42_set_block_bits(block, idx, start, stop)

#define VECT_BLOCK_CHANGE(block, size) \
    sse42_bit_block_calc_change((__m128i*)block, size)

#define VECT_BLOCK_XOR_CHANGE(block, xor_block, size) \
    sse42_bit_block_calc_xor_change((__m128i*)block, (__m128i*)xor_block, size)

#define VECT_BLOCK_CHANGE_BC(block, gc, bc) \
    sse42_bit_block_calc_change_bc((__m128i*)block, gc, bc)

#define VECT_BIT_FIND_FIRST(src, pos) \
    sse42_bit_find_first((__m128i*) src, pos)

#define VECT_BIT_FIND_DIFF(src1, src2, pos) \
    sse42_bit_find_first_diff((__m128i*) src1, (__m128i*) (src2), pos)

#define VECT_BIT_BLOCK_XOR(t, src, src_xor, d) \
    sse42_bit_block_xor(t, src, src_xor, d)

#define VECT_GAP_BFIND(buf, pos, is_set) \
    sse42_gap_bfind(buf, pos, is_set)

#pragma GCC diagnostic pop

#pragma warning( pop )