bmsse2.h
#ifndef BMSSE2__H__INCLUDED__
#define BMSSE2__H__INCLUDED__
/*
Copyright(c) 2002-2017 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

For more information please visit: http://bitmagic.io
*/

/*! \file bmsse2.h
    \brief Compute functions for SSE2 SIMD instruction set (internal)
*/
#include <mmintrin.h>
#include <emmintrin.h>

#include "bmdef.h"
#include "bmsse_util.h"
#include "bmutil.h"


#ifdef __GNUG__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wconversion"
#endif

namespace bm
{

/*!
    SSE2-optimized bit-counting function: implements the parallel
    bit-counting algorithm for the SSE2 instruction set.

    Scalar equivalent of the per-32-bit-word computation:
<pre>
unsigned CalcBitCount32(unsigned b)
{
    b = (b & 0x55555555) + (b >> 1 & 0x55555555);
    b = (b & 0x33333333) + (b >> 2 & 0x33333333);
    b = (b + (b >> 4)) & 0x0F0F0F0F;
    b = b + (b >> 8);
    b = (b + (b >> 16)) & 0x0000003F;
    return b;
}
</pre>

    @ingroup SSE2
*/
inline
bm::id_t sse2_bit_count(const __m128i* block, const __m128i* block_end)
{
    const unsigned mu1 = 0x55555555;
    const unsigned mu2 = 0x33333333;
    const unsigned mu3 = 0x0F0F0F0F;
    const unsigned mu4 = 0x0000003F;

    // Loading masks
    __m128i m1 = _mm_set_epi32 (mu1, mu1, mu1, mu1);
    __m128i m2 = _mm_set_epi32 (mu2, mu2, mu2, mu2);
    __m128i m3 = _mm_set_epi32 (mu3, mu3, mu3, mu3);
    __m128i m4 = _mm_set_epi32 (mu4, mu4, mu4, mu4);
    __m128i mcnt;
    mcnt = _mm_xor_si128(m1, m1); // cnt = 0

    __m128i tmp1, tmp2;
    do
    {
        __m128i b = _mm_load_si128(block);
        ++block;

        // b = (b & 0x55555555) + (b >> 1 & 0x55555555);
        tmp1 = _mm_srli_epi32(b, 1);      // tmp1 = (b >> 1 & 0x55555555)
        tmp1 = _mm_and_si128(tmp1, m1);
        tmp2 = _mm_and_si128(b, m1);      // tmp2 = (b & 0x55555555)
        b    = _mm_add_epi32(tmp1, tmp2); // b = tmp1 + tmp2

        // b = (b & 0x33333333) + (b >> 2 & 0x33333333);
        tmp1 = _mm_srli_epi32(b, 2);      // (b >> 2 & 0x33333333)
        tmp1 = _mm_and_si128(tmp1, m2);
        tmp2 = _mm_and_si128(b, m2);      // (b & 0x33333333)
        b    = _mm_add_epi32(tmp1, tmp2); // b = tmp1 + tmp2

        // b = (b + (b >> 4)) & 0x0F0F0F0F;
        tmp1 = _mm_srli_epi32(b, 4);      // tmp1 = b >> 4
        b    = _mm_add_epi32(b, tmp1);    // b = b + (b >> 4)
        b    = _mm_and_si128(b, m3);      // & 0x0F0F0F0F

        // b = b + (b >> 8);
        tmp1 = _mm_srli_epi32 (b, 8);     // tmp1 = b >> 8
        b    = _mm_add_epi32(b, tmp1);    // b = b + (b >> 8)

        // b = (b + (b >> 16)) & 0x0000003F;
        tmp1 = _mm_srli_epi32 (b, 16);    // b >> 16
        b    = _mm_add_epi32(b, tmp1);    // b + (b >> 16)
        b    = _mm_and_si128(b, m4);      // (b + (b >> 16)) & 0x0000003F

        mcnt = _mm_add_epi32(mcnt, b);    // mcnt += b

    } while (block < block_end);

    bm::id_t BM_ALIGN16 tcnt[4] BM_ALIGN16ATTR;
    _mm_store_si128((__m128i*)tcnt, mcnt);

    return tcnt[0] + tcnt[1] + tcnt[2] + tcnt[3];
}
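
/*
    Usage sketch (editorial illustration, not part of the original header):
    count bits in a 16-byte aligned array of words. BM_ALIGN16 and
    BM_ALIGN16ATTR come from bmdef.h; the array length (32 words = eight
    128-bit lanes) is an arbitrary example choice.

        bm::word_t BM_ALIGN16 arr[32] BM_ALIGN16ATTR = {0};
        arr[0]  = 0xFFu; // 8 bits set
        arr[31] = 0x1u;  // 1 bit set
        bm::id_t cnt = bm::sse2_bit_count((const __m128i*)arr,
                                          (const __m128i*)(arr + 32));
        // cnt == 9
*/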


template<class Func>
bm::id_t sse2_bit_count_op(const __m128i* BMRESTRICT block,
                           const __m128i* BMRESTRICT block_end,
                           const __m128i* BMRESTRICT mask_block,
                           Func sse2_func)
{
    const unsigned mu1 = 0x55555555;
    const unsigned mu2 = 0x33333333;
    const unsigned mu3 = 0x0F0F0F0F;
    const unsigned mu4 = 0x0000003F;

    // Loading masks
    __m128i m1 = _mm_set_epi32 (mu1, mu1, mu1, mu1);
    __m128i m2 = _mm_set_epi32 (mu2, mu2, mu2, mu2);
    __m128i m3 = _mm_set_epi32 (mu3, mu3, mu3, mu3);
    __m128i m4 = _mm_set_epi32 (mu4, mu4, mu4, mu4);
    __m128i mcnt;
    mcnt = _mm_xor_si128(m1, m1); // cnt = 0
    do
    {
        __m128i tmp1, tmp2;
        __m128i b = _mm_load_si128(block++);

        tmp1 = _mm_load_si128(mask_block++);

        b = sse2_func(b, tmp1);

        // b = (b & 0x55555555) + (b >> 1 & 0x55555555);
        tmp1 = _mm_srli_epi32(b, 1);      // tmp1 = (b >> 1 & 0x55555555)
        tmp1 = _mm_and_si128(tmp1, m1);
        tmp2 = _mm_and_si128(b, m1);      // tmp2 = (b & 0x55555555)
        b    = _mm_add_epi32(tmp1, tmp2); // b = tmp1 + tmp2

        // b = (b & 0x33333333) + (b >> 2 & 0x33333333);
        tmp1 = _mm_srli_epi32(b, 2);      // (b >> 2 & 0x33333333)
        tmp1 = _mm_and_si128(tmp1, m2);
        tmp2 = _mm_and_si128(b, m2);      // (b & 0x33333333)
        b    = _mm_add_epi32(tmp1, tmp2); // b = tmp1 + tmp2

        // b = (b + (b >> 4)) & 0x0F0F0F0F;
        tmp1 = _mm_srli_epi32(b, 4);      // tmp1 = b >> 4
        b    = _mm_add_epi32(b, tmp1);    // b = b + (b >> 4)
        b    = _mm_and_si128(b, m3);      // & 0x0F0F0F0F

        // b = b + (b >> 8);
        tmp1 = _mm_srli_epi32 (b, 8);     // tmp1 = b >> 8
        b    = _mm_add_epi32(b, tmp1);    // b = b + (b >> 8)

        // b = (b + (b >> 16)) & 0x0000003F;
        tmp1 = _mm_srli_epi32 (b, 16);    // b >> 16
        b    = _mm_add_epi32(b, tmp1);    // b + (b >> 16)
        b    = _mm_and_si128(b, m4);      // (b + (b >> 16)) & 0x0000003F

        mcnt = _mm_add_epi32(mcnt, b);    // mcnt += b

    } while (block < block_end);

    bm::id_t BM_ALIGN16 tcnt[4] BM_ALIGN16ATTR;
    _mm_store_si128((__m128i*)tcnt, mcnt);

    return tcnt[0] + tcnt[1] + tcnt[2] + tcnt[3];
}
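
/*
    Usage sketch (editorial illustration): population count of the AND of
    two blocks, passing the sse2_and functor from bmsse_util.h, exactly as
    the VECT_BITCOUNT_AND macro below does. Sizes are arbitrary example
    values.

        bm::word_t BM_ALIGN16 a[32] BM_ALIGN16ATTR = {0};
        bm::word_t BM_ALIGN16 b[32] BM_ALIGN16ATTR = {0};
        a[0] = 0x0Fu; b[0] = 0x03u; // a & b == 0x03 (2 bits)
        bm::id_t cnt = bm::sse2_bit_count_op((__m128i*)a, (__m128i*)(a + 32),
                                             (__m128i*)b, bm::sse2_and);
        // cnt == 2
*/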

inline
bm::id_t sse2_bit_block_calc_count_change(const __m128i* BMRESTRICT block,
                                          const __m128i* BMRESTRICT block_end,
                                          unsigned* BMRESTRICT bit_count)
{
    const unsigned mu1 = 0x55555555;
    const unsigned mu2 = 0x33333333;
    const unsigned mu3 = 0x0F0F0F0F;
    const unsigned mu4 = 0x0000003F;

    // Loading masks
    __m128i m1 = _mm_set_epi32 (mu1, mu1, mu1, mu1);
    __m128i m2 = _mm_set_epi32 (mu2, mu2, mu2, mu2);
    __m128i m3 = _mm_set_epi32 (mu3, mu3, mu3, mu3);
    __m128i m4 = _mm_set_epi32 (mu4, mu4, mu4, mu4);
    __m128i mcnt;//, ccnt;
    mcnt = _mm_xor_si128(m1, m1); // bit_cnt = 0
    //ccnt = _mm_xor_si128(m1, m1); // change_cnt = 0

    __m128i tmp1, tmp2;

    int count = (int)(block_end - block) * 4;

    bm::word_t  w, w0, w_prev;//, w_l;
    const int w_shift = sizeof(w) * 8 - 1;
    bool first_word = true;

    // first word
    {
        const bm::word_t* blk = (const bm::word_t*) block;
        w = w0 = blk[0];
        w ^= (w >> 1);
        BM_INCWORD_BITCOUNT(count, w);
        count -= (w_prev = (w0 >> w_shift)); // negative value correction
    }

    bm::id_t BM_ALIGN16 tcnt[4] BM_ALIGN16ATTR;

    do
    {
        // compute bit-count
        // ---------------------------------------------------------------------
        {
            __m128i b = _mm_load_si128(block);

            // w ^ (w >> 1)
            tmp1 = _mm_srli_epi32(b, 1);   // tmp1 = b >> 1
            tmp2 = _mm_xor_si128(b, tmp1); // tmp2 = tmp1 ^ b;
            _mm_store_si128((__m128i*)tcnt, tmp2);

            // compare with zero
            // SSE4: _mm_test_all_zero()
            {
                // b = (b & 0x55555555) + (b >> 1 & 0x55555555);
                //tmp1 = _mm_srli_epi32(b, 1); // tmp1 = (b >> 1 & 0x55555555)
                tmp1 = _mm_and_si128(tmp1, m1);
                tmp2 = _mm_and_si128(b, m1);      // tmp2 = (b & 0x55555555)
                b    = _mm_add_epi32(tmp1, tmp2); // b = tmp1 + tmp2

                // b = (b & 0x33333333) + (b >> 2 & 0x33333333);
                tmp1 = _mm_srli_epi32(b, 2);      // (b >> 2 & 0x33333333)
                tmp1 = _mm_and_si128(tmp1, m2);
                tmp2 = _mm_and_si128(b, m2);      // (b & 0x33333333)
                b    = _mm_add_epi32(tmp1, tmp2); // b = tmp1 + tmp2

                // b = (b + (b >> 4)) & 0x0F0F0F0F;
                tmp1 = _mm_srli_epi32(b, 4);      // tmp1 = b >> 4
                b    = _mm_add_epi32(b, tmp1);    // b = b + (b >> 4)
                b    = _mm_and_si128(b, m3);      // & 0x0F0F0F0F

                // b = b + (b >> 8);
                tmp1 = _mm_srli_epi32 (b, 8);     // tmp1 = b >> 8
                b    = _mm_add_epi32(b, tmp1);    // b = b + (b >> 8)

                // b = (b + (b >> 16)) & 0x0000003F;
                tmp1 = _mm_srli_epi32 (b, 16);    // b >> 16
                b    = _mm_add_epi32(b, tmp1);    // b + (b >> 16)
                b    = _mm_and_si128(b, m4);      // (b + (b >> 16)) & 0x0000003F

                mcnt = _mm_add_epi32(mcnt, b);    // mcnt += b
            }
        }
        // ---------------------------------------------------------------------
        {
            //__m128i b = _mm_load_si128(block);
            // TODO: SSE4...
            //w = _mm_extract_epi32(b, i);

            const bm::word_t* BMRESTRICT blk = (const bm::word_t*) block;

            if (first_word)
            {
                first_word = false;
            }
            else
            {
                if (0 != (w0 = blk[0]))
                {
                    BM_INCWORD_BITCOUNT(count, tcnt[0]);
                    count -= !(w_prev ^ (w0 & 1));
                    count -= w_prev = (w0 >> w_shift);
                }
                else
                {
                    count -= !w_prev; w_prev ^= w_prev;
                }
            }
            if (0 != (w0 = blk[1]))
            {
                BM_INCWORD_BITCOUNT(count, tcnt[1]);
                count -= !(w_prev ^ (w0 & 1));
                count -= w_prev = (w0 >> w_shift);
            }
            else
            {
                count -= !w_prev; w_prev ^= w_prev;
            }
            if (0 != (w0 = blk[2]))
            {
                BM_INCWORD_BITCOUNT(count, tcnt[2]);
                count -= !(w_prev ^ (w0 & 1));
                count -= w_prev = (w0 >> w_shift);
            }
            else
            {
                count -= !w_prev; w_prev ^= w_prev;
            }
            if (0 != (w0 = blk[3]))
            {
                BM_INCWORD_BITCOUNT(count, tcnt[3]);
                count -= !(w_prev ^ (w0 & 1));
                count -= w_prev = (w0 >> w_shift);
            }
            else
            {
                count -= !w_prev; w_prev ^= w_prev;
            }
        }
    } while (++block < block_end);

    _mm_store_si128((__m128i*)tcnt, mcnt);
    *bit_count = tcnt[0] + tcnt[1] + tcnt[2] + tcnt[3];

    return unsigned(count);
}
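
/*
    Usage sketch (editorial illustration): as traced from the code above,
    the function returns the number of contiguous same-value bit runs in
    the block (the quantity a GAP conversion would need) and writes the
    plain population count into *bit_count.

        bm::word_t BM_ALIGN16 blk[32] BM_ALIGN16ATTR = {0};
        blk[0] = 0xFFu; // a single run of 8 set bits, then zeros
        unsigned bc;
        bm::id_t runs =
            bm::sse2_bit_block_calc_count_change((__m128i*)blk,
                                                 (__m128i*)(blk + 32), &bc);
        // bc == 8, runs == 2 (the run of ones, then the trailing run of zeros)
*/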

#ifdef __GNUG__
// necessary measure to silence a false GCC warning about negative pointer arithmetic
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Warray-bounds"
#endif

/*!
    SSE2 scan of one to two (variable length) 128-bit SSE lines for GAP search results (8 elements each)
    \internal
*/
inline
unsigned sse2_gap_find(const bm::gap_word_t* BMRESTRICT pbuf, const bm::gap_word_t pos, const unsigned size)
{
    BM_ASSERT(size <= 16);
    BM_ASSERT(size);

    const unsigned unroll_factor = 8;
    if (size < 4) // for a very short vector use a conventional scan
    {
        unsigned j;
        for (j = 0; j < size; ++j)
        {
            if (pbuf[j] >= pos)
                break;
        }
        return j;
    }

    __m128i m1, mz, maskF, maskFL;

    mz = _mm_setzero_si128();
    m1 = _mm_loadu_si128((__m128i*)(pbuf)); // load first 8 elements

    maskF = _mm_cmpeq_epi32(mz, mz);       // set all FF
    maskFL = _mm_slli_si128(maskF, 4 * 2); // byte shift to make [0000 FFFF]
    int shiftL = (64 - (unroll_factor - size) * 16);
    maskFL = _mm_slli_epi64(maskFL, shiftL); // additional bit shift to [0000 00FF]

    m1 = _mm_andnot_si128(maskFL, m1); // m1 = (~mask) & m1
    m1 = _mm_or_si128(m1, maskFL);

    __m128i mp = _mm_set1_epi16(pos); // broadcast pos into all elements of a SIMD vector
    __m128i mge_mask = _mm_cmpeq_epi16(_mm_subs_epu16(mp, m1), mz); // unsigned m1 >= mp
    int mi = _mm_movemask_epi8(mge_mask); // collect flag bits
    if (mi)
    {
        int bsr_i = bm::bit_scan_fwd(mi) >> 1;
        return bsr_i; // index of the first matching element (target)
    }
    if (size == 8)
        return size;

    // inspect the next lane with a possible step back (to avoid over-reading the block boundary)
    // GCC gives a false warning for "- unroll_factor" here
    const bm::gap_word_t* BMRESTRICT pbuf2 = pbuf + size - unroll_factor;
    BM_ASSERT(pbuf2 > pbuf); // assert in place to make sure the GCC warning is indeed false

    m1 = _mm_loadu_si128((__m128i*)(pbuf2)); // load the next elements (with possible overlap)
    mge_mask = _mm_cmpeq_epi16(_mm_subs_epu16(mp, m1), mz); // m1 >= mp
    mi = _mm_movemask_epi8(mge_mask);
    if (mi)
    {
        int bsr_i = bm::bit_scan_fwd(mi) >> 1;
        return size - (unroll_factor - bsr_i);
    }
    return size;
}
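
/*
    Usage sketch (editorial illustration): find the index of the first
    element greater than or equal to pos in a short, sorted gap_word_t
    array (values are arbitrary).

        const bm::gap_word_t arr[8] = { 10, 20, 30, 40, 50, 60, 70, 80 };
        unsigned idx = bm::sse2_gap_find(arr, (bm::gap_word_t)35, 8);
        // idx == 3 (arr[3] == 40 is the first element >= 35)
*/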

/**
    Hybrid binary search: starts as binary, then switches to linear scan.

    \param buf - GAP buffer pointer.
    \param pos - bit position to search for.
    \param is_set - output. GAP value (0 or 1).
    \return GAP index.

    @ingroup SSE2
*/
inline
unsigned sse2_gap_bfind(const unsigned short* BMRESTRICT buf,
                        unsigned pos, unsigned* BMRESTRICT is_set)
{
    unsigned start = 1;
    unsigned end = 1 + ((*buf) >> 3);
    unsigned dsize = end - start;

    if (dsize < 17)
    {
        start = bm::sse2_gap_find(buf+1, (bm::gap_word_t)pos, dsize);
        *is_set = ((*buf) & 1) ^ (start & 1);
        BM_ASSERT(buf[start+1] >= pos);
        BM_ASSERT(buf[start] < pos || (start == 0));

        return start+1;
    }
    unsigned arr_end = end;
    while (start != end)
    {
        unsigned curr = (start + end) >> 1;
        if (buf[curr] < pos)
            start = curr + 1;
        else
            end = curr;

        unsigned size = end - start;
        if (size < 16)
        {
            size += (end != arr_end);
            unsigned idx =
                bm::sse2_gap_find(buf + start, (bm::gap_word_t)pos, size);
            start += idx;

            BM_ASSERT(buf[start] >= pos);
            BM_ASSERT(buf[start - 1] < pos || (start == 1));
            break;
        }
    }

    *is_set = ((*buf) & 1) ^ ((start-1) & 1);
    return start;
}
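
/*
    Usage sketch (editorial illustration): query a tiny GAP block. As the
    code above shows, the header word keeps the value of the first run in
    bit 0 and the index of the last element in bits 3 and up; the data
    words are ascending run-end bit positions.

        // runs: bits [0..20] set, bits [21..65535] clear
        const unsigned short gap_blk[] = { (2 << 3) | 1, 20, 65535 };
        unsigned is_set;
        unsigned idx = bm::sse2_gap_bfind(gap_blk, 10, &is_set);
        // is_set == 1 (bit 10 falls inside the first, set run), idx == 1
*/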

/**
    Hybrid binary search: starts as binary, then switches to scan.
    @ingroup SSE2
*/
inline
unsigned sse2_gap_test(const unsigned short* BMRESTRICT buf, unsigned pos)
{
    unsigned is_set;
    bm::sse2_gap_bfind(buf, pos, &is_set);
    return is_set;
}
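
/*
    Usage sketch (editorial illustration): sse2_gap_test is a thin wrapper
    returning only the GAP value at the given position.

        // same example block: bits [0..20] set, the rest clear
        const unsigned short gap_blk[] = { (2 << 3) | 1, 20, 65535 };
        unsigned v = bm::sse2_gap_test(gap_blk, 100);
        // v == 0 (bit 100 lies in the clear run)
*/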


#ifdef __GNUG__
#pragma GCC diagnostic pop
#endif


#define VECT_XOR_ARR_2_MASK(dst, src, src_end, mask)\
    sse2_xor_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), (bm::word_t)mask)

#define VECT_ANDNOT_ARR_2_MASK(dst, src, src_end, mask)\
    sse2_andnot_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), (bm::word_t)mask)

#define VECT_BITCOUNT(first, last) \
    sse2_bit_count((__m128i*) (first), (__m128i*) (last))

#define VECT_BITCOUNT_AND(first, last, mask) \
    sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_and)

#define VECT_BITCOUNT_OR(first, last, mask) \
    sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_or)

#define VECT_BITCOUNT_XOR(first, last, mask) \
    sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_xor)

#define VECT_BITCOUNT_SUB(first, last, mask) \
    sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_sub)

#define VECT_INVERT_BLOCK(first) \
    sse2_invert_block((__m128i*)first);

#define VECT_AND_BLOCK(dst, src) \
    sse2_and_block((__m128i*) dst, (__m128i*) (src))

#define VECT_OR_BLOCK(dst, src) \
    sse2_or_block((__m128i*) dst, (__m128i*) (src))

#define VECT_OR_BLOCK_2WAY(dst, src1, src2) \
    sse2_or_block_2way((__m128i*) (dst), (__m128i*) (src1), (__m128i*) (src2))

#define VECT_OR_BLOCK_3WAY(dst, src1, src2) \
    sse2_or_block_3way((__m128i*) (dst), (__m128i*) (src1), (__m128i*) (src2))

#define VECT_OR_BLOCK_5WAY(dst, src1, src2, src3, src4) \
    sse2_or_block_5way((__m128i*) (dst), (__m128i*) (src1), (__m128i*) (src2), (__m128i*) (src3), (__m128i*) (src4))

#define VECT_SUB_BLOCK(dst, src) \
    sse2_sub_block((__m128i*) dst, (__m128i*) (src))

#define VECT_XOR_BLOCK(dst, src) \
    sse2_xor_block((__m128i*) dst, (__m128i*) (src))

#define VECT_XOR_BLOCK_2WAY(dst, src1, src2) \
    sse2_xor_block_2way((__m128i*) (dst), (const __m128i*) (src1), (const __m128i*) (src2))

#define VECT_COPY_BLOCK(dst, src) \
    sse2_copy_block((__m128i*) dst, (__m128i*) (src))

#define VECT_STREAM_BLOCK(dst, src) \
    sse2_stream_block((__m128i*) dst, (__m128i*) (src))

#define VECT_SET_BLOCK(dst, value) \
    sse2_set_block((__m128i*) dst, value)

#define VECT_GAP_BFIND(buf, pos, is_set) \
    sse2_gap_bfind(buf, pos, is_set)

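/*
    Editorial note: the VECT_* macros above are the dispatch points the
    rest of the library compiles against; when the SSE2 build option is
    selected (the BMSSE2OPT define, per the BitMagic build documentation),
    core bit-block operations resolve to the sse2_* functions in this file.
*/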

} // namespace bm


#ifdef __GNUG__
#pragma GCC diagnostic pop
#endif


#endif // BMSSE2__H__INCLUDED__