#ifndef BMSSE_UTIL__H__INCLUDED__
#define BMSSE_UTIL__H__INCLUDED__
/*
Copyright(c) 2002-2017 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

For more information please visit: http://bitmagic.io
*/

/*! \file bmsse_util.h
    \brief Compute functions for SSE SIMD instruction set (internal)
*/

namespace bm
{

/** @defgroup SSE2 SSE2 functions
    Processor specific optimizations for SSE2 instructions (internals)
    @internal
    @ingroup bvector
*/

#ifdef __GNUG__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wconversion"
#endif

/*!
    @brief SSE2 reinitialization guard class

    SSE2 requires a call to _mm_empty() when intermixing MMX integer
    instructions with floating-point arithmetic. This class guards
    critical code fragments where SSE2 integer instructions are used.

    As of 2015 _mm_empty() is considered deprecated, and is not even
    recognized by some compilers (like MSVC) in 64-bit mode.
    As MMX instructions age out of use, the calls to _mm_empty() here
    are deprecated and commented out.

    @ingroup SSE2
*/
class sse_empty_guard
{
public:
    BMFORCEINLINE sse_empty_guard()
    {
        //_mm_empty();
    }

    BMFORCEINLINE ~sse_empty_guard()
    {
        //_mm_empty();
    }
};
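
// Usage sketch (editorial illustration, not from the original header):
// the guard is placed on the stack around an SSE2 integer fragment so that,
// historically, _mm_empty() would run on scope exit. Today both calls are
// no-ops (see above), but the scoping pattern looks like:
//
//     {
//         bm::sse_empty_guard guard; // would reset MMX/FPU state on exit
//         // ... SSE2 integer code ...
//     } // guard destructor runs here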


/*!
    @brief XOR array elements to specified mask
    *dst = *src ^ mask

    @ingroup SSE2
*/
inline
void sse2_xor_arr_2_mask(__m128i* BMRESTRICT dst,
                         const __m128i* BMRESTRICT src,
                         const __m128i* BMRESTRICT src_end,
                         bm::word_t mask)
{
    __m128i xM = _mm_set1_epi32((int)mask);
    do
    {
        _mm_store_si128(dst+0, _mm_xor_si128(_mm_load_si128(src+0), xM));
        _mm_store_si128(dst+1, _mm_xor_si128(_mm_load_si128(src+1), xM));
        _mm_store_si128(dst+2, _mm_xor_si128(_mm_load_si128(src+2), xM));
        _mm_store_si128(dst+3, _mm_xor_si128(_mm_load_si128(src+3), xM));
        dst += 4; src += 4;
    } while (src < src_end);
}
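
// Usage sketch (editorial illustration; assumes 16-byte aligned buffers
// sized in whole bm::set_block_size words, as this header expects):
//
//     bm::word_t BM_ALIGN16 src[bm::set_block_size] BM_ALIGN16ATTR;
//     bm::word_t BM_ALIGN16 dst[bm::set_block_size] BM_ALIGN16ATTR;
//     // ... fill src ...
//     bm::sse2_xor_arr_2_mask(
//         (__m128i*)dst, (const __m128i*)src,
//         (const __m128i*)(src + bm::set_block_size),
//         ~0u); // dst[i] = src[i] ^ 0xFFFFFFFF == ~src[i]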

/*!
    @brief Invert array elements and AND them with the specified mask
    *dst = ~*src & mask

    @ingroup SSE2
*/
inline
void sse2_andnot_arr_2_mask(__m128i* BMRESTRICT dst,
                            const __m128i* BMRESTRICT src,
                            const __m128i* BMRESTRICT src_end,
                            bm::word_t mask)
{
    __m128i xM = _mm_set1_epi32((int)mask);
    do
    {
        _mm_store_si128(dst+0, _mm_andnot_si128(_mm_load_si128(src+0), xM)); // (~src) & xM
        _mm_store_si128(dst+1, _mm_andnot_si128(_mm_load_si128(src+1), xM));
        _mm_store_si128(dst+2, _mm_andnot_si128(_mm_load_si128(src+2), xM));
        _mm_store_si128(dst+3, _mm_andnot_si128(_mm_load_si128(src+3), xM));
        dst += 4; src += 4;
    } while (src < src_end);
}

/*!
    @brief AND two blocks
    *dst &= *src
    @return 0 if no bits were set
    @ingroup SSE2
*/
inline
unsigned sse2_and_block(__m128i* BMRESTRICT dst,
                        const __m128i* BMRESTRICT src)
{
    __m128i m1A, m1B, m1C, m1D;
    __m128i accA, accB, accC, accD;

    const __m128i* BMRESTRICT src_end =
        (const __m128i*)((bm::word_t*)(src) + bm::set_block_size);

    accA = accB = accC = accD = _mm_setzero_si128();

    do
    {
        m1A = _mm_and_si128(_mm_load_si128(src+0), _mm_load_si128(dst+0));
        m1B = _mm_and_si128(_mm_load_si128(src+1), _mm_load_si128(dst+1));
        m1C = _mm_and_si128(_mm_load_si128(src+2), _mm_load_si128(dst+2));
        m1D = _mm_and_si128(_mm_load_si128(src+3), _mm_load_si128(dst+3));

        _mm_store_si128(dst+0, m1A);
        _mm_store_si128(dst+1, m1B);
        _mm_store_si128(dst+2, m1C);
        _mm_store_si128(dst+3, m1D);

        accA = _mm_or_si128(accA, m1A);
        accB = _mm_or_si128(accB, m1B);
        accC = _mm_or_si128(accC, m1C);
        accD = _mm_or_si128(accD, m1D);

        src += 4; dst += 4;
    } while (src < src_end);

    accA = _mm_or_si128(accA, accB); // A = A | B
    accC = _mm_or_si128(accC, accD); // C = C | D
    accA = _mm_or_si128(accA, accC); // A = A | C

    bm::word_t BM_ALIGN16 macc[4] BM_ALIGN16ATTR;
    _mm_store_si128((__m128i*)macc, accA);
    return macc[0] | macc[1] | macc[2] | macc[3];
}
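
// Usage sketch (editorial illustration): the OR-accumulated return value is
// an "any bits survived?" flag, letting a caller detect that AND produced an
// all-zero block:
//
//     unsigned any = bm::sse2_and_block((__m128i*)dst_blk,
//                                       (const __m128i*)src_blk);
//     if (!any)
//     {
//         // dst_blk is now all zero - candidate for block deallocation
//     }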

/*!
    @brief AND array elements against another array (unaligned)
    *dst &= *src

    @return 0 if no bits were set

    @ingroup SSE2
*/
inline
unsigned sse2_and_arr_unal(__m128i* BMRESTRICT dst,
                           const __m128i* BMRESTRICT src,
                           const __m128i* BMRESTRICT src_end)
{
    __m128i m1A, m2A, m1B, m2B, m1C, m2C, m1D, m2D;
    __m128i accA, accB, accC, accD;

    accA = _mm_setzero_si128();
    accB = _mm_setzero_si128();
    accC = _mm_setzero_si128();
    accD = _mm_setzero_si128();

    do
    {
        m1A = _mm_loadu_si128(src+0);
        m2A = _mm_load_si128(dst+0);
        m1A = _mm_and_si128(m1A, m2A);
        _mm_store_si128(dst+0, m1A);
        accA = _mm_or_si128(accA, m1A);

        m1B = _mm_loadu_si128(src+1);
        m2B = _mm_load_si128(dst+1);
        m1B = _mm_and_si128(m1B, m2B);
        _mm_store_si128(dst+1, m1B);
        accB = _mm_or_si128(accB, m1B);

        m1C = _mm_loadu_si128(src+2);
        m2C = _mm_load_si128(dst+2);
        m1C = _mm_and_si128(m1C, m2C);
        _mm_store_si128(dst+2, m1C);
        accC = _mm_or_si128(accC, m1C);

        m1D = _mm_loadu_si128(src+3);
        m2D = _mm_load_si128(dst+3);
        m1D = _mm_and_si128(m1D, m2D);
        _mm_store_si128(dst+3, m1D);
        accD = _mm_or_si128(accD, m1D);

        src += 4; dst += 4;
    } while (src < src_end);

    accA = _mm_or_si128(accA, accB); // A = A | B
    accC = _mm_or_si128(accC, accD); // C = C | D
    accA = _mm_or_si128(accA, accC); // A = A | C

    bm::word_t BM_ALIGN16 macc[4] BM_ALIGN16ATTR;
    _mm_store_si128((__m128i*)macc, accA);
    return macc[0] | macc[1] | macc[2] | macc[3];
}
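
// Note (editorial): this variant reads src with _mm_loadu_si128, so only dst
// must stay 16-byte aligned. A hypothetical word-offset sub-span call
// (len a multiple of 16 words, one loop iteration per 16):
//
//     const __m128i* s     = (const __m128i*)(wrds + off); // maybe unaligned
//     const __m128i* s_end = (const __m128i*)(wrds + off + len);
//     unsigned any = bm::sse2_and_arr_unal((__m128i*)dst_blk, s, s_end);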


/*!
    @brief AND array elements against another array (aligned)
    *dst &= *src
    @return 0 if no bits were set
    @ingroup SSE2
*/
inline
unsigned sse2_and_block(__m128i* BMRESTRICT dst,
                        const __m128i* BMRESTRICT src,
                        const __m128i* BMRESTRICT src_end)
{
    __m128i m1A, m2A, m1B, m2B, m1C, m2C, m1D, m2D;
    __m128i accA, accB, accC, accD;

    accA = _mm_setzero_si128();
    accB = _mm_setzero_si128();
    accC = _mm_setzero_si128();
    accD = _mm_setzero_si128();

    do
    {
        m1A = _mm_load_si128(src + 0);
        m2A = _mm_load_si128(dst + 0);
        m1A = _mm_and_si128(m1A, m2A);
        _mm_store_si128(dst + 0, m1A);
        accA = _mm_or_si128(accA, m1A);

        m1B = _mm_load_si128(src + 1);
        m2B = _mm_load_si128(dst + 1);
        m1B = _mm_and_si128(m1B, m2B);
        _mm_store_si128(dst + 1, m1B);
        accB = _mm_or_si128(accB, m1B);

        m1C = _mm_load_si128(src + 2);
        m2C = _mm_load_si128(dst + 2);
        m1C = _mm_and_si128(m1C, m2C);
        _mm_store_si128(dst + 2, m1C);
        accC = _mm_or_si128(accC, m1C);

        m1D = _mm_load_si128(src + 3);
        m2D = _mm_load_si128(dst + 3);
        m1D = _mm_and_si128(m1D, m2D);
        _mm_store_si128(dst + 3, m1D);
        accD = _mm_or_si128(accD, m1D);

        src += 4; dst += 4;
    } while (src < src_end);

    accA = _mm_or_si128(accA, accB); // A = A | B
    accC = _mm_or_si128(accC, accD); // C = C | D
    accA = _mm_or_si128(accA, accC); // A = A | C

    bm::word_t BM_ALIGN16 macc[4] BM_ALIGN16ATTR;
    _mm_store_si128((__m128i*)macc, accA);
    return macc[0] | macc[1] | macc[2] | macc[3];
}


/*!
    @brief OR array elements against another array
    *dst |= *src
    @return true if all bits are 1
    @ingroup SSE2
*/
inline
bool sse2_or_block(__m128i* BMRESTRICT dst,
                   const __m128i* BMRESTRICT src)
{
    __m128i m1A, m2A, m1B, m2B, m1C, m2C, m1D, m2D;
    __m128i mAccF0 = _mm_set1_epi32(~0u); // broadcast all-ones
    __m128i mAccF1 = _mm_set1_epi32(~0u); // broadcast all-ones
    const __m128i* BMRESTRICT src_end =
        (const __m128i*)((bm::word_t*)(src) + bm::set_block_size);

    do
    {
        m1A = _mm_load_si128(src + 0);
        m2A = _mm_load_si128(dst + 0);
        m1A = _mm_or_si128(m1A, m2A);
        _mm_store_si128(dst + 0, m1A);

        m1B = _mm_load_si128(src + 1);
        m2B = _mm_load_si128(dst + 1);
        m1B = _mm_or_si128(m1B, m2B);
        _mm_store_si128(dst + 1, m1B);

        m1C = _mm_load_si128(src + 2);
        m2C = _mm_load_si128(dst + 2);
        m1C = _mm_or_si128(m1C, m2C);
        _mm_store_si128(dst + 2, m1C);

        m1D = _mm_load_si128(src + 3);
        m2D = _mm_load_si128(dst + 3);
        m1D = _mm_or_si128(m1D, m2D);
        _mm_store_si128(dst + 3, m1D);

        mAccF1 = _mm_and_si128(mAccF1, m1C);
        mAccF1 = _mm_and_si128(mAccF1, m1D);
        mAccF0 = _mm_and_si128(mAccF0, m1A);
        mAccF0 = _mm_and_si128(mAccF0, m1B);

        src += 4; dst += 4;
    } while (src < src_end);

    __m128i maskF = _mm_set1_epi32(~0u);
    mAccF0 = _mm_and_si128(mAccF0, mAccF1);
    __m128i wcmpA = _mm_cmpeq_epi8(mAccF0, maskF);
    unsigned maskA = unsigned(_mm_movemask_epi8(wcmpA));

    return (maskA == 0xFFFFu);
}
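
// Usage sketch (editorial illustration): the boolean result reports that
// every word became all-ones, which a caller can use to swap the block for a
// canonical FULL block:
//
//     bool all_one = bm::sse2_or_block((__m128i*)dst_blk,
//                                      (const __m128i*)src_blk);
//     if (all_one)
//     {
//         // dst_blk is saturated (every bit set)
//     }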

/*!
    @brief OR array elements against another array (unaligned)
    *dst |= *src
    @return true if all bits are 1
    @ingroup SSE2
*/
inline
bool sse2_or_arr_unal(__m128i* BMRESTRICT dst,
                      const __m128i* BMRESTRICT src,
                      const __m128i* BMRESTRICT src_end)
{
    __m128i m1A, m2A, m1B, m2B, m1C, m2C, m1D, m2D;
    __m128i mAccF0 = _mm_set1_epi32(~0u); // broadcast all-ones
    __m128i mAccF1 = _mm_set1_epi32(~0u); // broadcast all-ones
    do
    {
        m1A = _mm_loadu_si128(src + 0);
        m2A = _mm_load_si128(dst + 0);
        m1A = _mm_or_si128(m1A, m2A);
        _mm_store_si128(dst + 0, m1A);

        m1B = _mm_loadu_si128(src + 1);
        m2B = _mm_load_si128(dst + 1);
        m1B = _mm_or_si128(m1B, m2B);
        _mm_store_si128(dst + 1, m1B);

        m1C = _mm_loadu_si128(src + 2);
        m2C = _mm_load_si128(dst + 2);
        m1C = _mm_or_si128(m1C, m2C);
        _mm_store_si128(dst + 2, m1C);

        m1D = _mm_loadu_si128(src + 3);
        m2D = _mm_load_si128(dst + 3);
        m1D = _mm_or_si128(m1D, m2D);
        _mm_store_si128(dst + 3, m1D);

        mAccF1 = _mm_and_si128(mAccF1, m1C);
        mAccF1 = _mm_and_si128(mAccF1, m1D);
        mAccF0 = _mm_and_si128(mAccF0, m1A);
        mAccF0 = _mm_and_si128(mAccF0, m1B);

        src += 4; dst += 4;
    } while (src < src_end);

    __m128i maskF = _mm_set1_epi32(~0u);
    mAccF0 = _mm_and_si128(mAccF0, mAccF1);
    __m128i wcmpA = _mm_cmpeq_epi8(mAccF0, maskF);
    unsigned maskA = unsigned(_mm_movemask_epi8(wcmpA));
    return (maskA == 0xFFFFu);
}

/*!
    @brief OR 2 blocks and copy result to the destination
    *dst = *src1 | *src2
    @return true if all bits are 1

    @ingroup SSE2
*/
inline
bool sse2_or_block_2way(__m128i* BMRESTRICT dst,
                        const __m128i* BMRESTRICT src1,
                        const __m128i* BMRESTRICT src2)
{
    __m128i m1A, m1B, m1C, m1D;
    __m128i mAccF0 = _mm_set1_epi32(~0u); // broadcast all-ones
    __m128i mAccF1 = _mm_set1_epi32(~0u); // broadcast all-ones
    const __m128i* BMRESTRICT src_end1 =
        (const __m128i*)((bm::word_t*)(src1) + bm::set_block_size);

    do
    {
        m1A = _mm_or_si128(_mm_load_si128(src1 + 0), _mm_load_si128(src2 + 0));
        m1B = _mm_or_si128(_mm_load_si128(src1 + 1), _mm_load_si128(src2 + 1));
        m1C = _mm_or_si128(_mm_load_si128(src1 + 2), _mm_load_si128(src2 + 2));
        m1D = _mm_or_si128(_mm_load_si128(src1 + 3), _mm_load_si128(src2 + 3));

        _mm_store_si128(dst + 0, m1A);
        _mm_store_si128(dst + 1, m1B);
        _mm_store_si128(dst + 2, m1C);
        _mm_store_si128(dst + 3, m1D);

        mAccF1 = _mm_and_si128(mAccF1, m1C);
        mAccF1 = _mm_and_si128(mAccF1, m1D);
        mAccF0 = _mm_and_si128(mAccF0, m1A);
        mAccF0 = _mm_and_si128(mAccF0, m1B);

        src1 += 4; src2 += 4; dst += 4;

    } while (src1 < src_end1);

    __m128i maskF = _mm_set1_epi32(~0u);
    mAccF0 = _mm_and_si128(mAccF0, mAccF1);
    __m128i wcmpA = _mm_cmpeq_epi8(mAccF0, maskF);
    unsigned maskA = unsigned(_mm_movemask_epi8(wcmpA));
    return (maskA == 0xFFFFu);
}
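
// Usage sketch (editorial illustration): the 2-way form leaves both source
// blocks untouched and writes their union into a third, pre-allocated block:
//
//     bool all_one = bm::sse2_or_block_2way((__m128i*)dst_blk,
//                                           (const __m128i*)blk_a,
//                                           (const __m128i*)blk_b);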

/*!
    @brief OR array elements against 2 other arrays
    *dst |= *src1 | *src2
    @return true if all bits are 1

    @ingroup SSE2
*/
inline
bool sse2_or_block_3way(__m128i* BMRESTRICT dst,
                        const __m128i* BMRESTRICT src1,
                        const __m128i* BMRESTRICT src2)
{
    __m128i m1A, m1B, m1C, m1D;
    __m128i mAccF0 = _mm_set1_epi32(~0u); // broadcast all-ones
    __m128i mAccF1 = _mm_set1_epi32(~0u); // broadcast all-ones
    const __m128i* BMRESTRICT src_end1 =
        (const __m128i*)((bm::word_t*)(src1) + bm::set_block_size);

    do
    {
        m1A = _mm_or_si128(_mm_load_si128(src1 + 0), _mm_load_si128(dst + 0));
        m1B = _mm_or_si128(_mm_load_si128(src1 + 1), _mm_load_si128(dst + 1));
        m1C = _mm_or_si128(_mm_load_si128(src1 + 2), _mm_load_si128(dst + 2));
        m1D = _mm_or_si128(_mm_load_si128(src1 + 3), _mm_load_si128(dst + 3));

        m1A = _mm_or_si128(m1A, _mm_load_si128(src2 + 0));
        m1B = _mm_or_si128(m1B, _mm_load_si128(src2 + 1));
        m1C = _mm_or_si128(m1C, _mm_load_si128(src2 + 2));
        m1D = _mm_or_si128(m1D, _mm_load_si128(src2 + 3));

        _mm_store_si128(dst + 0, m1A);
        _mm_store_si128(dst + 1, m1B);
        _mm_store_si128(dst + 2, m1C);
        _mm_store_si128(dst + 3, m1D);

        mAccF1 = _mm_and_si128(mAccF1, m1C);
        mAccF1 = _mm_and_si128(mAccF1, m1D);
        mAccF0 = _mm_and_si128(mAccF0, m1A);
        mAccF0 = _mm_and_si128(mAccF0, m1B);

        src1 += 4; src2 += 4; dst += 4;

    } while (src1 < src_end1);

    __m128i maskF = _mm_set1_epi32(~0u);
    mAccF0 = _mm_and_si128(mAccF0, mAccF1);
    __m128i wcmpA = _mm_cmpeq_epi8(mAccF0, maskF);
    unsigned maskA = unsigned(_mm_movemask_epi8(wcmpA));
    return (maskA == 0xFFFFu);
}

/*!
    @brief OR array elements against 4 other arrays
    *dst |= *src1 | *src2 | *src3 | *src4
    @return true if all bits are 1

    @ingroup SSE2
*/
inline
bool sse2_or_block_5way(__m128i* BMRESTRICT dst,
                        const __m128i* BMRESTRICT src1,
                        const __m128i* BMRESTRICT src2,
                        const __m128i* BMRESTRICT src3,
                        const __m128i* BMRESTRICT src4)
{
    __m128i m1A, m1B, m1C, m1D;
    __m128i mAccF0 = _mm_set1_epi32(~0u); // broadcast all-ones
    __m128i mAccF1 = _mm_set1_epi32(~0u); // broadcast all-ones

    const __m128i* BMRESTRICT src_end1 =
        (const __m128i*)((bm::word_t*)(src1) + bm::set_block_size);

    do
    {
        m1A = _mm_or_si128(_mm_load_si128(src1 + 0), _mm_load_si128(dst + 0));
        m1B = _mm_or_si128(_mm_load_si128(src1 + 1), _mm_load_si128(dst + 1));
        m1C = _mm_or_si128(_mm_load_si128(src1 + 2), _mm_load_si128(dst + 2));
        m1D = _mm_or_si128(_mm_load_si128(src1 + 3), _mm_load_si128(dst + 3));

        m1A = _mm_or_si128(m1A, _mm_load_si128(src2 + 0));
        m1B = _mm_or_si128(m1B, _mm_load_si128(src2 + 1));
        m1C = _mm_or_si128(m1C, _mm_load_si128(src2 + 2));
        m1D = _mm_or_si128(m1D, _mm_load_si128(src2 + 3));

        m1A = _mm_or_si128(m1A, _mm_load_si128(src3 + 0));
        m1B = _mm_or_si128(m1B, _mm_load_si128(src3 + 1));
        m1C = _mm_or_si128(m1C, _mm_load_si128(src3 + 2));
        m1D = _mm_or_si128(m1D, _mm_load_si128(src3 + 3));

        m1A = _mm_or_si128(m1A, _mm_load_si128(src4 + 0));
        m1B = _mm_or_si128(m1B, _mm_load_si128(src4 + 1));
        m1C = _mm_or_si128(m1C, _mm_load_si128(src4 + 2));
        m1D = _mm_or_si128(m1D, _mm_load_si128(src4 + 3));

        _mm_stream_si128(dst + 0, m1A);
        _mm_stream_si128(dst + 1, m1B);
        _mm_stream_si128(dst + 2, m1C);
        _mm_stream_si128(dst + 3, m1D);

        mAccF1 = _mm_and_si128(mAccF1, m1C);
        mAccF1 = _mm_and_si128(mAccF1, m1D);
        mAccF0 = _mm_and_si128(mAccF0, m1A);
        mAccF0 = _mm_and_si128(mAccF0, m1B);

        src1 += 4; src2 += 4;
        src3 += 4; src4 += 4;

        _mm_prefetch((const char*)src3, _MM_HINT_T0);
        _mm_prefetch((const char*)src4, _MM_HINT_T0);

        dst += 4;

    } while (src1 < src_end1);

    __m128i maskF = _mm_set1_epi32(~0u);
    mAccF0 = _mm_and_si128(mAccF0, mAccF1);
    __m128i wcmpA = _mm_cmpeq_epi8(mAccF0, maskF);
    unsigned maskA = unsigned(_mm_movemask_epi8(wcmpA));
    return (maskA == 0xFFFFu);
}
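
// Note (editorial): unlike the 2- and 3-way variants, the 5-way kernel stores
// with _mm_stream_si128 (non-temporal, cache-bypassing) and prefetches the
// later sources. Streaming stores are weakly ordered, so a caller publishing
// dst to another thread would normally issue _mm_sfence() afterwards.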



/*!
    @brief XOR block against another
    *dst ^= *src
    @return 0 if no bits were set
    @ingroup SSE2
*/
inline
unsigned sse2_xor_block(__m128i* BMRESTRICT dst,
                        const __m128i* BMRESTRICT src)
{
    __m128i m1A, m1B, m1C, m1D;
    __m128i accA, accB, accC, accD;

    const __m128i* BMRESTRICT src_end =
        (const __m128i*)((bm::word_t*)(src) + bm::set_block_size);

    accA = accB = accC = accD = _mm_setzero_si128();

    do
    {
        m1A = _mm_xor_si128(_mm_load_si128(src+0), _mm_load_si128(dst+0));
        m1B = _mm_xor_si128(_mm_load_si128(src+1), _mm_load_si128(dst+1));
        m1C = _mm_xor_si128(_mm_load_si128(src+2), _mm_load_si128(dst+2));
        m1D = _mm_xor_si128(_mm_load_si128(src+3), _mm_load_si128(dst+3));

        _mm_store_si128(dst+0, m1A);
        _mm_store_si128(dst+1, m1B);
        _mm_store_si128(dst+2, m1C);
        _mm_store_si128(dst+3, m1D);

        accA = _mm_or_si128(accA, m1A);
        accB = _mm_or_si128(accB, m1B);
        accC = _mm_or_si128(accC, m1C);
        accD = _mm_or_si128(accD, m1D);

        src += 4; dst += 4;
    } while (src < src_end);

    accA = _mm_or_si128(accA, accB); // A = A | B
    accC = _mm_or_si128(accC, accD); // C = C | D
    accA = _mm_or_si128(accA, accC); // A = A | C

    bm::word_t BM_ALIGN16 macc[4] BM_ALIGN16ATTR;
    _mm_store_si128((__m128i*)macc, accA);
    return macc[0] | macc[1] | macc[2] | macc[3];
}

/*!
    @brief 3-operand XOR
    *dst = *src1 ^ *src2
    @return 0 if no bits were set
    @ingroup SSE2
*/
inline
unsigned sse2_xor_block_2way(__m128i* BMRESTRICT dst,
                             const __m128i* BMRESTRICT src1,
                             const __m128i* BMRESTRICT src2)
{
    __m128i m1A, m1B, m1C, m1D;
    __m128i accA, accB, accC, accD;

    const __m128i* BMRESTRICT src1_end =
        (const __m128i*)((bm::word_t*)(src1) + bm::set_block_size);

    accA = accB = accC = accD = _mm_setzero_si128();

    do
    {
        m1A = _mm_xor_si128(_mm_load_si128(src1 + 0), _mm_load_si128(src2 + 0));
        m1B = _mm_xor_si128(_mm_load_si128(src1 + 1), _mm_load_si128(src2 + 1));
        m1C = _mm_xor_si128(_mm_load_si128(src1 + 2), _mm_load_si128(src2 + 2));
        m1D = _mm_xor_si128(_mm_load_si128(src1 + 3), _mm_load_si128(src2 + 3));

        _mm_store_si128(dst + 0, m1A);
        _mm_store_si128(dst + 1, m1B);
        _mm_store_si128(dst + 2, m1C);
        _mm_store_si128(dst + 3, m1D);

        accA = _mm_or_si128(accA, m1A);
        accB = _mm_or_si128(accB, m1B);
        accC = _mm_or_si128(accC, m1C);
        accD = _mm_or_si128(accD, m1D);

        src1 += 4; src2 += 4; dst += 4;
    } while (src1 < src1_end);

    accA = _mm_or_si128(accA, accB); // A = A | B
    accC = _mm_or_si128(accC, accD); // C = C | D
    accA = _mm_or_si128(accA, accC); // A = A | C

    bm::word_t BM_ALIGN16 macc[4] BM_ALIGN16ATTR;
    _mm_store_si128((__m128i*)macc, accA);
    return macc[0] | macc[1] | macc[2] | macc[3];
}


/*!
    @brief AND-NOT (SUB) array elements against another array
    *dst &= ~*src

    @return 0 if no bits were set

    @ingroup SSE2
*/
inline
unsigned sse2_sub_block(__m128i* BMRESTRICT dst,
                        const __m128i* BMRESTRICT src)
{
    __m128i m1A, m1B, m1C, m1D;
    __m128i accA, accB, accC, accD;

    accA = accB = accC = accD = _mm_setzero_si128();

    const __m128i* BMRESTRICT src_end =
        (const __m128i*)((bm::word_t*)(src) + bm::set_block_size);

    do
    {
        m1A = _mm_andnot_si128(_mm_load_si128(src+0), _mm_load_si128(dst+0));
        m1B = _mm_andnot_si128(_mm_load_si128(src+1), _mm_load_si128(dst+1));
        m1C = _mm_andnot_si128(_mm_load_si128(src+2), _mm_load_si128(dst+2));
        m1D = _mm_andnot_si128(_mm_load_si128(src+3), _mm_load_si128(dst+3));

        _mm_store_si128(dst+0, m1A);
        _mm_store_si128(dst+1, m1B);
        _mm_store_si128(dst+2, m1C);
        _mm_store_si128(dst+3, m1D);

        accA = _mm_or_si128(accA, m1A);
        accB = _mm_or_si128(accB, m1B);
        accC = _mm_or_si128(accC, m1C);
        accD = _mm_or_si128(accD, m1D);

        src += 4; dst += 4;
    } while (src < src_end);

    accA = _mm_or_si128(accA, accB); // A = A | B
    accC = _mm_or_si128(accC, accD); // C = C | D
    accA = _mm_or_si128(accA, accC); // A = A | C

    bm::word_t BM_ALIGN16 macc[4] BM_ALIGN16ATTR;
    _mm_store_si128((__m128i*)macc, accA);
    return macc[0] | macc[1] | macc[2] | macc[3];
}


/*!
    @brief SSE2 block memset
    *dst = value

    @ingroup SSE2
*/
BMFORCEINLINE
void sse2_set_block(__m128i* BMRESTRICT dst, bm::word_t value)
{
    __m128i* BMRESTRICT dst_end =
        (__m128i*)((bm::word_t*)(dst) + bm::set_block_size);

    __m128i xmm0 = _mm_set1_epi32((int)value);
    do
    {
        _mm_store_si128(dst, xmm0);
        _mm_store_si128(dst+1, xmm0);
        _mm_store_si128(dst+2, xmm0);
        _mm_store_si128(dst+3, xmm0);

        _mm_store_si128(dst+4, xmm0);
        _mm_store_si128(dst+5, xmm0);
        _mm_store_si128(dst+6, xmm0);
        _mm_store_si128(dst+7, xmm0);

        dst += 8;
    } while (dst < dst_end);
}
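
// Usage sketch (editorial illustration): clearing or filling a whole block.
//
//     bm::word_t BM_ALIGN16 blk[bm::set_block_size] BM_ALIGN16ATTR;
//     bm::sse2_set_block((__m128i*)blk, 0u);  // all bits 0
//     bm::sse2_set_block((__m128i*)blk, ~0u); // all bits 1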

/*!
    @brief SSE2 block copy
    *dst = *src

    @ingroup SSE2
*/
BMFORCEINLINE
void sse2_copy_block(__m128i* BMRESTRICT dst,
                     const __m128i* BMRESTRICT src)
{
    __m128i xmm0, xmm1, xmm2, xmm3;
    const __m128i* BMRESTRICT src_end =
        (const __m128i*)((bm::word_t*)(src) + bm::set_block_size);

    do
    {
        xmm0 = _mm_load_si128(src+0);
        xmm1 = _mm_load_si128(src+1);
        xmm2 = _mm_load_si128(src+2);
        xmm3 = _mm_load_si128(src+3);

        _mm_store_si128(dst+0, xmm0);
        _mm_store_si128(dst+1, xmm1);
        _mm_store_si128(dst+2, xmm2);
        _mm_store_si128(dst+3, xmm3);

        xmm0 = _mm_load_si128(src+4);
        xmm1 = _mm_load_si128(src+5);
        xmm2 = _mm_load_si128(src+6);
        xmm3 = _mm_load_si128(src+7);

        _mm_store_si128(dst+4, xmm0);
        _mm_store_si128(dst+5, xmm1);
        _mm_store_si128(dst+6, xmm2);
        _mm_store_si128(dst+7, xmm3);

        src += 8; dst += 8;

    } while (src < src_end);
}

/*!
    @brief SSE2 block copy (non-temporal stores)
    *dst = *src

    @ingroup SSE2
*/
BMFORCEINLINE
void sse2_stream_block(__m128i* BMRESTRICT dst,
                       const __m128i* BMRESTRICT src)
{
    __m128i xmm0, xmm1, xmm2, xmm3;
    const __m128i* BMRESTRICT src_end =
        (const __m128i*)((bm::word_t*)(src) + bm::set_block_size);

    do
    {
        xmm0 = _mm_load_si128(src+0);
        xmm1 = _mm_load_si128(src+1);
        xmm2 = _mm_load_si128(src+2);
        xmm3 = _mm_load_si128(src+3);

        _mm_stream_si128(dst+0, xmm0);
        _mm_stream_si128(dst+1, xmm1);
        _mm_stream_si128(dst+2, xmm2);
        _mm_stream_si128(dst+3, xmm3);

        xmm0 = _mm_load_si128(src+4);
        xmm1 = _mm_load_si128(src+5);
        xmm2 = _mm_load_si128(src+6);
        xmm3 = _mm_load_si128(src+7);

        _mm_stream_si128(dst+4, xmm0);
        _mm_stream_si128(dst+5, xmm1);
        _mm_stream_si128(dst+6, xmm2);
        _mm_stream_si128(dst+7, xmm3);

        src += 8; dst += 8;

    } while (src < src_end);
}
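
// Note (editorial): sse2_stream_block differs from sse2_copy_block only in
// using non-temporal stores, useful when dst will not be read again soon.
// Because such stores are weakly ordered, a hypothetical caller handing the
// copy to another thread would fence first:
//
//     bm::sse2_stream_block((__m128i*)dst_blk, (const __m128i*)src_blk);
//     _mm_sfence(); // order the streamed data before it is published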


/*!
    @brief Invert bit block
    *dst = ~*dst

    @ingroup SSE2
*/
inline
void sse2_invert_block(__m128i* dst)
{
    __m128i maskF = _mm_set1_epi32(~0u);
    __m128i* BMRESTRICT dst_end =
        (__m128i*)((bm::word_t*)(dst) + bm::set_block_size);

    __m128i mA, mB, mC, mD;
    do
    {
        mA = _mm_load_si128(dst + 0);
        mA = _mm_xor_si128(mA, maskF);
        _mm_store_si128(dst + 0, mA);

        mB = _mm_load_si128(dst + 1);
        mB = _mm_xor_si128(mB, maskF);
        _mm_store_si128(dst + 1, mB);

        mC = _mm_load_si128(dst + 2);
        mC = _mm_xor_si128(mC, maskF);
        _mm_store_si128(dst + 2, mC);

        mD = _mm_load_si128(dst + 3);
        mD = _mm_xor_si128(mD, maskF);
        _mm_store_si128(dst + 3, mD);

        dst += 4;

    } while (dst < dst_end);
}

/*! @brief 128-bit AND: a & b  @ingroup SSE2 */
BMFORCEINLINE
__m128i sse2_and(__m128i a, __m128i b)
{
    return _mm_and_si128(a, b);
}

/*! @brief 128-bit OR: a | b  @ingroup SSE2 */
BMFORCEINLINE
__m128i sse2_or(__m128i a, __m128i b)
{
    return _mm_or_si128(a, b);
}

/*! @brief 128-bit XOR: a ^ b  @ingroup SSE2 */
BMFORCEINLINE
__m128i sse2_xor(__m128i a, __m128i b)
{
    return _mm_xor_si128(a, b);
}

/*! @brief 128-bit AND-NOT (SUB): a & ~b  @ingroup SSE2 */
BMFORCEINLINE
__m128i sse2_sub(__m128i a, __m128i b)
{
    return _mm_andnot_si128(b, a);
}
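
// Note (editorial): _mm_andnot_si128(b, a) computes (~b) & a, so sse2_sub
// swaps its operands on purpose to get set-difference semantics a & ~b:
//
//     __m128i a = _mm_set1_epi32(0x0000FFFF);
//     __m128i b = _mm_set1_epi32(0x000000FF);
//     __m128i d = bm::sse2_sub(a, b); // each 32-bit lane == 0x0000FF00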


/*!
    @brief Gap block population count (array sum) utility
    @param pbuf - unrolled, aligned to 1-start GAP buffer
    @param sse_vect_waves - number of SSE vector lines to process
    @param sum - result accumulator
    @return tail pointer

    @internal
    @ingroup SSE2
*/
inline
const bm::gap_word_t* sse2_gap_sum_arr(
    const bm::gap_word_t* BMRESTRICT pbuf,
    unsigned sse_vect_waves,
    unsigned* sum)
{
    __m128i xcnt = _mm_setzero_si128();

    for (unsigned i = 0; i < sse_vect_waves; ++i)
    {
        __m128i mm0 = _mm_loadu_si128((__m128i*)(pbuf - 1));
        __m128i mm1 = _mm_loadu_si128((__m128i*)(pbuf + 8 - 1));
        __m128i mm_s2 = _mm_add_epi16(mm1, mm0);
        xcnt = _mm_add_epi16(xcnt, mm_s2);
        pbuf += 16;
    }
    xcnt = _mm_sub_epi16(_mm_srli_epi32(xcnt, 16), xcnt);

    unsigned short* cnt8 = (unsigned short*)&xcnt;
    *sum += (cnt8[0]) + (cnt8[2]) + (cnt8[4]) + (cnt8[6]);
    return pbuf;
}
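
// Usage sketch (editorial illustration; assumes pbuf points at the "1-start"
// position described above with at least sse_vect_waves * 16 gap_word_t
// elements remaining):
//
//     unsigned sum = 0;
//     const bm::gap_word_t* tail = bm::sse2_gap_sum_arr(pbuf, waves, &sum);
//     // 'sum' accumulates the vectorized portion; elements from 'tail'
//     // onward are finished with a scalar loop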

#ifdef __GNUG__
#pragma GCC diagnostic pop
#endif


} // namespace bm



#endif