Vector Optimized Library of Kernels 2.5.2
Architecture-tuned implementations of math kernels
volk_32f_x2_dot_prod_32f.h
Go to the documentation of this file.
/* -*- c++ -*- */
/*
 * Copyright 2012, 2014 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: GPL-3.0-or-later
 */
9
58#ifndef INCLUDED_volk_32f_x2_dot_prod_32f_u_H
59#define INCLUDED_volk_32f_x2_dot_prod_32f_u_H
60
61#include <stdio.h>
62#include <volk/volk_common.h>
63
64
65#ifdef LV_HAVE_GENERIC
66
67
/*!
 * \brief Scalar dot product of two float vectors.
 *
 * Accumulates input[i] * taps[i] in single precision, walking both arrays in
 * index order, and writes the sum through \p result.
 *
 * \param result     Receives the single float sum; always written.
 * \param input      First operand; num_points floats are read.
 * \param taps       Second operand; num_points floats are read.
 * \param num_points Number of element pairs. With 0 the result is 0.
 */
static inline void volk_32f_x2_dot_prod_32f_generic(float* result,
                                                    const float* input,
                                                    const float* taps,
                                                    unsigned int num_points)
{
    float dotProduct = 0;
    const float* aPtr = input;
    const float* bPtr = taps;
    unsigned int number;

    for (number = 0; number < num_points; number++) {
        dotProduct += ((*aPtr++) * (*bPtr++));
    }

    *result = dotProduct;
}
85
86#endif /*LV_HAVE_GENERIC*/
87
88
89#ifdef LV_HAVE_SSE
90
91
92static inline void volk_32f_x2_dot_prod_32f_u_sse(float* result,
93 const float* input,
94 const float* taps,
95 unsigned int num_points)
96{
97
98 unsigned int number = 0;
99 const unsigned int sixteenthPoints = num_points / 16;
100
101 float dotProduct = 0;
102 const float* aPtr = input;
103 const float* bPtr = taps;
104
105 __m128 a0Val, a1Val, a2Val, a3Val;
106 __m128 b0Val, b1Val, b2Val, b3Val;
107 __m128 c0Val, c1Val, c2Val, c3Val;
108
109 __m128 dotProdVal0 = _mm_setzero_ps();
110 __m128 dotProdVal1 = _mm_setzero_ps();
111 __m128 dotProdVal2 = _mm_setzero_ps();
112 __m128 dotProdVal3 = _mm_setzero_ps();
113
114 for (; number < sixteenthPoints; number++) {
115
116 a0Val = _mm_loadu_ps(aPtr);
117 a1Val = _mm_loadu_ps(aPtr + 4);
118 a2Val = _mm_loadu_ps(aPtr + 8);
119 a3Val = _mm_loadu_ps(aPtr + 12);
120 b0Val = _mm_loadu_ps(bPtr);
121 b1Val = _mm_loadu_ps(bPtr + 4);
122 b2Val = _mm_loadu_ps(bPtr + 8);
123 b3Val = _mm_loadu_ps(bPtr + 12);
124
125 c0Val = _mm_mul_ps(a0Val, b0Val);
126 c1Val = _mm_mul_ps(a1Val, b1Val);
127 c2Val = _mm_mul_ps(a2Val, b2Val);
128 c3Val = _mm_mul_ps(a3Val, b3Val);
129
130 dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
131 dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
132 dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
133 dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
134
135 aPtr += 16;
136 bPtr += 16;
137 }
138
139 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
140 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
141 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
142
143 __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
144
145 _mm_store_ps(dotProductVector,
146 dotProdVal0); // Store the results back into the dot product vector
147
148 dotProduct = dotProductVector[0];
149 dotProduct += dotProductVector[1];
150 dotProduct += dotProductVector[2];
151 dotProduct += dotProductVector[3];
152
153 number = sixteenthPoints * 16;
154 for (; number < num_points; number++) {
155 dotProduct += ((*aPtr++) * (*bPtr++));
156 }
157
158 *result = dotProduct;
159}
160
161#endif /*LV_HAVE_SSE*/
162
163#ifdef LV_HAVE_SSE3
164
165#include <pmmintrin.h>
166
167static inline void volk_32f_x2_dot_prod_32f_u_sse3(float* result,
168 const float* input,
169 const float* taps,
170 unsigned int num_points)
171{
172 unsigned int number = 0;
173 const unsigned int sixteenthPoints = num_points / 16;
174
175 float dotProduct = 0;
176 const float* aPtr = input;
177 const float* bPtr = taps;
178
179 __m128 a0Val, a1Val, a2Val, a3Val;
180 __m128 b0Val, b1Val, b2Val, b3Val;
181 __m128 c0Val, c1Val, c2Val, c3Val;
182
183 __m128 dotProdVal0 = _mm_setzero_ps();
184 __m128 dotProdVal1 = _mm_setzero_ps();
185 __m128 dotProdVal2 = _mm_setzero_ps();
186 __m128 dotProdVal3 = _mm_setzero_ps();
187
188 for (; number < sixteenthPoints; number++) {
189
190 a0Val = _mm_loadu_ps(aPtr);
191 a1Val = _mm_loadu_ps(aPtr + 4);
192 a2Val = _mm_loadu_ps(aPtr + 8);
193 a3Val = _mm_loadu_ps(aPtr + 12);
194 b0Val = _mm_loadu_ps(bPtr);
195 b1Val = _mm_loadu_ps(bPtr + 4);
196 b2Val = _mm_loadu_ps(bPtr + 8);
197 b3Val = _mm_loadu_ps(bPtr + 12);
198
199 c0Val = _mm_mul_ps(a0Val, b0Val);
200 c1Val = _mm_mul_ps(a1Val, b1Val);
201 c2Val = _mm_mul_ps(a2Val, b2Val);
202 c3Val = _mm_mul_ps(a3Val, b3Val);
203
204 dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val);
205 dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val);
206 dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val);
207 dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val);
208
209 aPtr += 16;
210 bPtr += 16;
211 }
212
213 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
214 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
215 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
216
217 __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
218 _mm_store_ps(dotProductVector,
219 dotProdVal0); // Store the results back into the dot product vector
220
221 dotProduct = dotProductVector[0];
222 dotProduct += dotProductVector[1];
223 dotProduct += dotProductVector[2];
224 dotProduct += dotProductVector[3];
225
226 number = sixteenthPoints * 16;
227 for (; number < num_points; number++) {
228 dotProduct += ((*aPtr++) * (*bPtr++));
229 }
230
231 *result = dotProduct;
232}
233
234#endif /*LV_HAVE_SSE3*/
235
236#ifdef LV_HAVE_SSE4_1
237
238#include <smmintrin.h>
239
240static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float* result,
241 const float* input,
242 const float* taps,
243 unsigned int num_points)
244{
245 unsigned int number = 0;
246 const unsigned int sixteenthPoints = num_points / 16;
247
248 float dotProduct = 0;
249 const float* aPtr = input;
250 const float* bPtr = taps;
251
252 __m128 aVal1, bVal1, cVal1;
253 __m128 aVal2, bVal2, cVal2;
254 __m128 aVal3, bVal3, cVal3;
255 __m128 aVal4, bVal4, cVal4;
256
257 __m128 dotProdVal = _mm_setzero_ps();
258
259 for (; number < sixteenthPoints; number++) {
260
261 aVal1 = _mm_loadu_ps(aPtr);
262 aPtr += 4;
263 aVal2 = _mm_loadu_ps(aPtr);
264 aPtr += 4;
265 aVal3 = _mm_loadu_ps(aPtr);
266 aPtr += 4;
267 aVal4 = _mm_loadu_ps(aPtr);
268 aPtr += 4;
269
270 bVal1 = _mm_loadu_ps(bPtr);
271 bPtr += 4;
272 bVal2 = _mm_loadu_ps(bPtr);
273 bPtr += 4;
274 bVal3 = _mm_loadu_ps(bPtr);
275 bPtr += 4;
276 bVal4 = _mm_loadu_ps(bPtr);
277 bPtr += 4;
278
279 cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
280 cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
281 cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
282 cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
283
284 cVal1 = _mm_or_ps(cVal1, cVal2);
285 cVal3 = _mm_or_ps(cVal3, cVal4);
286 cVal1 = _mm_or_ps(cVal1, cVal3);
287
288 dotProdVal = _mm_add_ps(dotProdVal, cVal1);
289 }
290
291 __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
292 _mm_store_ps(dotProductVector,
293 dotProdVal); // Store the results back into the dot product vector
294
295 dotProduct = dotProductVector[0];
296 dotProduct += dotProductVector[1];
297 dotProduct += dotProductVector[2];
298 dotProduct += dotProductVector[3];
299
300 number = sixteenthPoints * 16;
301 for (; number < num_points; number++) {
302 dotProduct += ((*aPtr++) * (*bPtr++));
303 }
304
305 *result = dotProduct;
306}
307
308#endif /*LV_HAVE_SSE4_1*/
309
310#ifdef LV_HAVE_AVX
311
312#include <immintrin.h>
313
314static inline void volk_32f_x2_dot_prod_32f_u_avx(float* result,
315 const float* input,
316 const float* taps,
317 unsigned int num_points)
318{
319
320 unsigned int number = 0;
321 const unsigned int sixteenthPoints = num_points / 16;
322
323 float dotProduct = 0;
324 const float* aPtr = input;
325 const float* bPtr = taps;
326
327 __m256 a0Val, a1Val;
328 __m256 b0Val, b1Val;
329 __m256 c0Val, c1Val;
330
331 __m256 dotProdVal0 = _mm256_setzero_ps();
332 __m256 dotProdVal1 = _mm256_setzero_ps();
333
334 for (; number < sixteenthPoints; number++) {
335
336 a0Val = _mm256_loadu_ps(aPtr);
337 a1Val = _mm256_loadu_ps(aPtr + 8);
338 b0Val = _mm256_loadu_ps(bPtr);
339 b1Val = _mm256_loadu_ps(bPtr + 8);
340
341 c0Val = _mm256_mul_ps(a0Val, b0Val);
342 c1Val = _mm256_mul_ps(a1Val, b1Val);
343
344 dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
345 dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
346
347 aPtr += 16;
348 bPtr += 16;
349 }
350
351 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
352
353 __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
354
355 _mm256_storeu_ps(dotProductVector,
356 dotProdVal0); // Store the results back into the dot product vector
357
358 dotProduct = dotProductVector[0];
359 dotProduct += dotProductVector[1];
360 dotProduct += dotProductVector[2];
361 dotProduct += dotProductVector[3];
362 dotProduct += dotProductVector[4];
363 dotProduct += dotProductVector[5];
364 dotProduct += dotProductVector[6];
365 dotProduct += dotProductVector[7];
366
367 number = sixteenthPoints * 16;
368 for (; number < num_points; number++) {
369 dotProduct += ((*aPtr++) * (*bPtr++));
370 }
371
372 *result = dotProduct;
373}
374
375#endif /*LV_HAVE_AVX*/
376
377#if LV_HAVE_AVX2 && LV_HAVE_FMA
378#include <immintrin.h>
379static inline void volk_32f_x2_dot_prod_32f_u_avx2_fma(float* result,
380 const float* input,
381 const float* taps,
382 unsigned int num_points)
383{
384 unsigned int number;
385 const unsigned int eighthPoints = num_points / 8;
386
387 const float* aPtr = input;
388 const float* bPtr = taps;
389
390 __m256 dotProdVal = _mm256_setzero_ps();
391 __m256 aVal1, bVal1;
392
393 for (number = 0; number < eighthPoints; number++) {
394
395 aVal1 = _mm256_loadu_ps(aPtr);
396 bVal1 = _mm256_loadu_ps(bPtr);
397 aPtr += 8;
398 bPtr += 8;
399
400 dotProdVal = _mm256_fmadd_ps(aVal1, bVal1, dotProdVal);
401 }
402
403 __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
404 _mm256_storeu_ps(dotProductVector,
405 dotProdVal); // Store the results back into the dot product vector
406
407 float dotProduct = dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
408 dotProductVector[3] + dotProductVector[4] + dotProductVector[5] +
409 dotProductVector[6] + dotProductVector[7];
410
411 for (number = eighthPoints * 8; number < num_points; number++) {
412 dotProduct += ((*aPtr++) * (*bPtr++));
413 }
414
415 *result = dotProduct;
416}
417#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
418
419#if LV_HAVE_AVX512F
420#include <immintrin.h>
421static inline void volk_32f_x2_dot_prod_32f_u_avx512f(float* result,
422 const float* input,
423 const float* taps,
424 unsigned int num_points)
425{
426 unsigned int number;
427 const unsigned int sixteenthPoints = num_points / 16;
428
429 const float* aPtr = input;
430 const float* bPtr = taps;
431
432 __m512 dotProdVal = _mm512_setzero_ps();
433 __m512 aVal1, bVal1;
434
435 for (number = 0; number < sixteenthPoints; number++) {
436
437 aVal1 = _mm512_loadu_ps(aPtr);
438 bVal1 = _mm512_loadu_ps(bPtr);
439 aPtr += 16;
440 bPtr += 16;
441
442 dotProdVal = _mm512_fmadd_ps(aVal1, bVal1, dotProdVal);
443 }
444
445 __VOLK_ATTR_ALIGNED(64) float dotProductVector[16];
446 _mm512_storeu_ps(dotProductVector,
447 dotProdVal); // Store the results back into the dot product vector
448
449 float dotProduct = dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
450 dotProductVector[3] + dotProductVector[4] + dotProductVector[5] +
451 dotProductVector[6] + dotProductVector[7] + dotProductVector[8] +
452 dotProductVector[9] + dotProductVector[10] + dotProductVector[11] +
453 dotProductVector[12] + dotProductVector[13] +
454 dotProductVector[14] + dotProductVector[15];
455
456 for (number = sixteenthPoints * 16; number < num_points; number++) {
457 dotProduct += ((*aPtr++) * (*bPtr++));
458 }
459
460 *result = dotProduct;
461}
462#endif /* LV_HAVE_AVX512F */
463
464#endif /*INCLUDED_volk_32f_x2_dot_prod_32f_u_H*/
465
466#ifndef INCLUDED_volk_32f_x2_dot_prod_32f_a_H
467#define INCLUDED_volk_32f_x2_dot_prod_32f_a_H
468
469#include <stdio.h>
470#include <volk/volk_common.h>
471
472
473#ifdef LV_HAVE_GENERIC
474
475
/*!
 * \brief Scalar dot product of two float vectors ("_a" protokernel name).
 *
 * Accumulates input[i] * taps[i] in single precision, walking both arrays in
 * index order, and writes the sum through \p result. The body uses only
 * ordinary scalar loads, so it places no extra alignment demand on its inputs.
 *
 * \param result     Receives the single float sum; always written.
 * \param input      First operand; num_points floats are read.
 * \param taps       Second operand; num_points floats are read.
 * \param num_points Number of element pairs. With 0 the result is 0.
 */
static inline void volk_32f_x2_dot_prod_32f_a_generic(float* result,
                                                      const float* input,
                                                      const float* taps,
                                                      unsigned int num_points)
{
    float dotProduct = 0;
    const float* aPtr = input;
    const float* bPtr = taps;
    unsigned int number;

    for (number = 0; number < num_points; number++) {
        dotProduct += ((*aPtr++) * (*bPtr++));
    }

    *result = dotProduct;
}
493
494#endif /*LV_HAVE_GENERIC*/
495
496
497#ifdef LV_HAVE_SSE
498
499
500static inline void volk_32f_x2_dot_prod_32f_a_sse(float* result,
501 const float* input,
502 const float* taps,
503 unsigned int num_points)
504{
505
506 unsigned int number = 0;
507 const unsigned int sixteenthPoints = num_points / 16;
508
509 float dotProduct = 0;
510 const float* aPtr = input;
511 const float* bPtr = taps;
512
513 __m128 a0Val, a1Val, a2Val, a3Val;
514 __m128 b0Val, b1Val, b2Val, b3Val;
515 __m128 c0Val, c1Val, c2Val, c3Val;
516
517 __m128 dotProdVal0 = _mm_setzero_ps();
518 __m128 dotProdVal1 = _mm_setzero_ps();
519 __m128 dotProdVal2 = _mm_setzero_ps();
520 __m128 dotProdVal3 = _mm_setzero_ps();
521
522 for (; number < sixteenthPoints; number++) {
523
524 a0Val = _mm_load_ps(aPtr);
525 a1Val = _mm_load_ps(aPtr + 4);
526 a2Val = _mm_load_ps(aPtr + 8);
527 a3Val = _mm_load_ps(aPtr + 12);
528 b0Val = _mm_load_ps(bPtr);
529 b1Val = _mm_load_ps(bPtr + 4);
530 b2Val = _mm_load_ps(bPtr + 8);
531 b3Val = _mm_load_ps(bPtr + 12);
532
533 c0Val = _mm_mul_ps(a0Val, b0Val);
534 c1Val = _mm_mul_ps(a1Val, b1Val);
535 c2Val = _mm_mul_ps(a2Val, b2Val);
536 c3Val = _mm_mul_ps(a3Val, b3Val);
537
538 dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
539 dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
540 dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
541 dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
542
543 aPtr += 16;
544 bPtr += 16;
545 }
546
547 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
548 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
549 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
550
551 __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
552
553 _mm_store_ps(dotProductVector,
554 dotProdVal0); // Store the results back into the dot product vector
555
556 dotProduct = dotProductVector[0];
557 dotProduct += dotProductVector[1];
558 dotProduct += dotProductVector[2];
559 dotProduct += dotProductVector[3];
560
561 number = sixteenthPoints * 16;
562 for (; number < num_points; number++) {
563 dotProduct += ((*aPtr++) * (*bPtr++));
564 }
565
566 *result = dotProduct;
567}
568
569#endif /*LV_HAVE_SSE*/
570
571#ifdef LV_HAVE_SSE3
572
573#include <pmmintrin.h>
574
575static inline void volk_32f_x2_dot_prod_32f_a_sse3(float* result,
576 const float* input,
577 const float* taps,
578 unsigned int num_points)
579{
580 unsigned int number = 0;
581 const unsigned int sixteenthPoints = num_points / 16;
582
583 float dotProduct = 0;
584 const float* aPtr = input;
585 const float* bPtr = taps;
586
587 __m128 a0Val, a1Val, a2Val, a3Val;
588 __m128 b0Val, b1Val, b2Val, b3Val;
589 __m128 c0Val, c1Val, c2Val, c3Val;
590
591 __m128 dotProdVal0 = _mm_setzero_ps();
592 __m128 dotProdVal1 = _mm_setzero_ps();
593 __m128 dotProdVal2 = _mm_setzero_ps();
594 __m128 dotProdVal3 = _mm_setzero_ps();
595
596 for (; number < sixteenthPoints; number++) {
597
598 a0Val = _mm_load_ps(aPtr);
599 a1Val = _mm_load_ps(aPtr + 4);
600 a2Val = _mm_load_ps(aPtr + 8);
601 a3Val = _mm_load_ps(aPtr + 12);
602 b0Val = _mm_load_ps(bPtr);
603 b1Val = _mm_load_ps(bPtr + 4);
604 b2Val = _mm_load_ps(bPtr + 8);
605 b3Val = _mm_load_ps(bPtr + 12);
606
607 c0Val = _mm_mul_ps(a0Val, b0Val);
608 c1Val = _mm_mul_ps(a1Val, b1Val);
609 c2Val = _mm_mul_ps(a2Val, b2Val);
610 c3Val = _mm_mul_ps(a3Val, b3Val);
611
612 dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val);
613 dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val);
614 dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val);
615 dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val);
616
617 aPtr += 16;
618 bPtr += 16;
619 }
620
621 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
622 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
623 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
624
625 __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
626 _mm_store_ps(dotProductVector,
627 dotProdVal0); // Store the results back into the dot product vector
628
629 dotProduct = dotProductVector[0];
630 dotProduct += dotProductVector[1];
631 dotProduct += dotProductVector[2];
632 dotProduct += dotProductVector[3];
633
634 number = sixteenthPoints * 16;
635 for (; number < num_points; number++) {
636 dotProduct += ((*aPtr++) * (*bPtr++));
637 }
638
639 *result = dotProduct;
640}
641
642#endif /*LV_HAVE_SSE3*/
643
644#ifdef LV_HAVE_SSE4_1
645
646#include <smmintrin.h>
647
648static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float* result,
649 const float* input,
650 const float* taps,
651 unsigned int num_points)
652{
653 unsigned int number = 0;
654 const unsigned int sixteenthPoints = num_points / 16;
655
656 float dotProduct = 0;
657 const float* aPtr = input;
658 const float* bPtr = taps;
659
660 __m128 aVal1, bVal1, cVal1;
661 __m128 aVal2, bVal2, cVal2;
662 __m128 aVal3, bVal3, cVal3;
663 __m128 aVal4, bVal4, cVal4;
664
665 __m128 dotProdVal = _mm_setzero_ps();
666
667 for (; number < sixteenthPoints; number++) {
668
669 aVal1 = _mm_load_ps(aPtr);
670 aPtr += 4;
671 aVal2 = _mm_load_ps(aPtr);
672 aPtr += 4;
673 aVal3 = _mm_load_ps(aPtr);
674 aPtr += 4;
675 aVal4 = _mm_load_ps(aPtr);
676 aPtr += 4;
677
678 bVal1 = _mm_load_ps(bPtr);
679 bPtr += 4;
680 bVal2 = _mm_load_ps(bPtr);
681 bPtr += 4;
682 bVal3 = _mm_load_ps(bPtr);
683 bPtr += 4;
684 bVal4 = _mm_load_ps(bPtr);
685 bPtr += 4;
686
687 cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
688 cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
689 cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
690 cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
691
692 cVal1 = _mm_or_ps(cVal1, cVal2);
693 cVal3 = _mm_or_ps(cVal3, cVal4);
694 cVal1 = _mm_or_ps(cVal1, cVal3);
695
696 dotProdVal = _mm_add_ps(dotProdVal, cVal1);
697 }
698
699 __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
700 _mm_store_ps(dotProductVector,
701 dotProdVal); // Store the results back into the dot product vector
702
703 dotProduct = dotProductVector[0];
704 dotProduct += dotProductVector[1];
705 dotProduct += dotProductVector[2];
706 dotProduct += dotProductVector[3];
707
708 number = sixteenthPoints * 16;
709 for (; number < num_points; number++) {
710 dotProduct += ((*aPtr++) * (*bPtr++));
711 }
712
713 *result = dotProduct;
714}
715
716#endif /*LV_HAVE_SSE4_1*/
717
718#ifdef LV_HAVE_AVX
719
720#include <immintrin.h>
721
722static inline void volk_32f_x2_dot_prod_32f_a_avx(float* result,
723 const float* input,
724 const float* taps,
725 unsigned int num_points)
726{
727
728 unsigned int number = 0;
729 const unsigned int sixteenthPoints = num_points / 16;
730
731 float dotProduct = 0;
732 const float* aPtr = input;
733 const float* bPtr = taps;
734
735 __m256 a0Val, a1Val;
736 __m256 b0Val, b1Val;
737 __m256 c0Val, c1Val;
738
739 __m256 dotProdVal0 = _mm256_setzero_ps();
740 __m256 dotProdVal1 = _mm256_setzero_ps();
741
742 for (; number < sixteenthPoints; number++) {
743
744 a0Val = _mm256_load_ps(aPtr);
745 a1Val = _mm256_load_ps(aPtr + 8);
746 b0Val = _mm256_load_ps(bPtr);
747 b1Val = _mm256_load_ps(bPtr + 8);
748
749 c0Val = _mm256_mul_ps(a0Val, b0Val);
750 c1Val = _mm256_mul_ps(a1Val, b1Val);
751
752 dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
753 dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
754
755 aPtr += 16;
756 bPtr += 16;
757 }
758
759 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
760
761 __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
762
763 _mm256_store_ps(dotProductVector,
764 dotProdVal0); // Store the results back into the dot product vector
765
766 dotProduct = dotProductVector[0];
767 dotProduct += dotProductVector[1];
768 dotProduct += dotProductVector[2];
769 dotProduct += dotProductVector[3];
770 dotProduct += dotProductVector[4];
771 dotProduct += dotProductVector[5];
772 dotProduct += dotProductVector[6];
773 dotProduct += dotProductVector[7];
774
775 number = sixteenthPoints * 16;
776 for (; number < num_points; number++) {
777 dotProduct += ((*aPtr++) * (*bPtr++));
778 }
779
780 *result = dotProduct;
781}
782#endif /*LV_HAVE_AVX*/
783
784
785#if LV_HAVE_AVX2 && LV_HAVE_FMA
786#include <immintrin.h>
787static inline void volk_32f_x2_dot_prod_32f_a_avx2_fma(float* result,
788 const float* input,
789 const float* taps,
790 unsigned int num_points)
791{
792 unsigned int number;
793 const unsigned int eighthPoints = num_points / 8;
794
795 const float* aPtr = input;
796 const float* bPtr = taps;
797
798 __m256 dotProdVal = _mm256_setzero_ps();
799 __m256 aVal1, bVal1;
800
801 for (number = 0; number < eighthPoints; number++) {
802
803 aVal1 = _mm256_load_ps(aPtr);
804 bVal1 = _mm256_load_ps(bPtr);
805 aPtr += 8;
806 bPtr += 8;
807
808 dotProdVal = _mm256_fmadd_ps(aVal1, bVal1, dotProdVal);
809 }
810
811 __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
812 _mm256_store_ps(dotProductVector,
813 dotProdVal); // Store the results back into the dot product vector
814
815 float dotProduct = dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
816 dotProductVector[3] + dotProductVector[4] + dotProductVector[5] +
817 dotProductVector[6] + dotProductVector[7];
818
819 for (number = eighthPoints * 8; number < num_points; number++) {
820 dotProduct += ((*aPtr++) * (*bPtr++));
821 }
822
823 *result = dotProduct;
824}
825#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
826
827#if LV_HAVE_AVX512F
828#include <immintrin.h>
829static inline void volk_32f_x2_dot_prod_32f_a_avx512f(float* result,
830 const float* input,
831 const float* taps,
832 unsigned int num_points)
833{
834 unsigned int number;
835 const unsigned int sixteenthPoints = num_points / 16;
836
837 const float* aPtr = input;
838 const float* bPtr = taps;
839
840 __m512 dotProdVal = _mm512_setzero_ps();
841 __m512 aVal1, bVal1;
842
843 for (number = 0; number < sixteenthPoints; number++) {
844
845 aVal1 = _mm512_load_ps(aPtr);
846 bVal1 = _mm512_load_ps(bPtr);
847 aPtr += 16;
848 bPtr += 16;
849
850 dotProdVal = _mm512_fmadd_ps(aVal1, bVal1, dotProdVal);
851 }
852
853 __VOLK_ATTR_ALIGNED(64) float dotProductVector[16];
854 _mm512_store_ps(dotProductVector,
855 dotProdVal); // Store the results back into the dot product vector
856
857 float dotProduct = dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
858 dotProductVector[3] + dotProductVector[4] + dotProductVector[5] +
859 dotProductVector[6] + dotProductVector[7] + dotProductVector[8] +
860 dotProductVector[9] + dotProductVector[10] + dotProductVector[11] +
861 dotProductVector[12] + dotProductVector[13] +
862 dotProductVector[14] + dotProductVector[15];
863
864 for (number = sixteenthPoints * 16; number < num_points; number++) {
865 dotProduct += ((*aPtr++) * (*bPtr++));
866 }
867
868 *result = dotProduct;
869}
870#endif /* LV_HAVE_AVX512F */
871
872#ifdef LV_HAVE_NEON
873#include <arm_neon.h>
874
875static inline void volk_32f_x2_dot_prod_32f_neonopts(float* result,
876 const float* input,
877 const float* taps,
878 unsigned int num_points)
879{
880
881 unsigned int quarter_points = num_points / 16;
882 float dotProduct = 0;
883 const float* aPtr = input;
884 const float* bPtr = taps;
885 unsigned int number = 0;
886
887 float32x4x4_t a_val, b_val, accumulator0;
888 accumulator0.val[0] = vdupq_n_f32(0);
889 accumulator0.val[1] = vdupq_n_f32(0);
890 accumulator0.val[2] = vdupq_n_f32(0);
891 accumulator0.val[3] = vdupq_n_f32(0);
892 // factor of 4 loop unroll with independent accumulators
893 // uses 12 out of 16 neon q registers
894 for (number = 0; number < quarter_points; ++number) {
895 a_val = vld4q_f32(aPtr);
896 b_val = vld4q_f32(bPtr);
897 accumulator0.val[0] = vmlaq_f32(accumulator0.val[0], a_val.val[0], b_val.val[0]);
898 accumulator0.val[1] = vmlaq_f32(accumulator0.val[1], a_val.val[1], b_val.val[1]);
899 accumulator0.val[2] = vmlaq_f32(accumulator0.val[2], a_val.val[2], b_val.val[2]);
900 accumulator0.val[3] = vmlaq_f32(accumulator0.val[3], a_val.val[3], b_val.val[3]);
901 aPtr += 16;
902 bPtr += 16;
903 }
904 accumulator0.val[0] = vaddq_f32(accumulator0.val[0], accumulator0.val[1]);
905 accumulator0.val[2] = vaddq_f32(accumulator0.val[2], accumulator0.val[3]);
906 accumulator0.val[0] = vaddq_f32(accumulator0.val[2], accumulator0.val[0]);
907 __VOLK_ATTR_ALIGNED(32) float accumulator[4];
908 vst1q_f32(accumulator, accumulator0.val[0]);
909 dotProduct = accumulator[0] + accumulator[1] + accumulator[2] + accumulator[3];
910
911 for (number = quarter_points * 16; number < num_points; number++) {
912 dotProduct += ((*aPtr++) * (*bPtr++));
913 }
914
915 *result = dotProduct;
916}
917
918#endif
919
920
921#ifdef LV_HAVE_NEON
922static inline void volk_32f_x2_dot_prod_32f_neon(float* result,
923 const float* input,
924 const float* taps,
925 unsigned int num_points)
926{
927
928 unsigned int quarter_points = num_points / 8;
929 float dotProduct = 0;
930 const float* aPtr = input;
931 const float* bPtr = taps;
932 unsigned int number = 0;
933
934 float32x4x2_t a_val, b_val, accumulator_val;
935 accumulator_val.val[0] = vdupq_n_f32(0);
936 accumulator_val.val[1] = vdupq_n_f32(0);
937 // factor of 2 loop unroll with independent accumulators
938 for (number = 0; number < quarter_points; ++number) {
939 a_val = vld2q_f32(aPtr);
940 b_val = vld2q_f32(bPtr);
941 accumulator_val.val[0] =
942 vmlaq_f32(accumulator_val.val[0], a_val.val[0], b_val.val[0]);
943 accumulator_val.val[1] =
944 vmlaq_f32(accumulator_val.val[1], a_val.val[1], b_val.val[1]);
945 aPtr += 8;
946 bPtr += 8;
947 }
948 accumulator_val.val[0] = vaddq_f32(accumulator_val.val[0], accumulator_val.val[1]);
949 __VOLK_ATTR_ALIGNED(32) float accumulator[4];
950 vst1q_f32(accumulator, accumulator_val.val[0]);
951 dotProduct = accumulator[0] + accumulator[1] + accumulator[2] + accumulator[3];
952
953 for (number = quarter_points * 8; number < num_points; number++) {
954 dotProduct += ((*aPtr++) * (*bPtr++));
955 }
956
957 *result = dotProduct;
958}
959
960#endif /* LV_HAVE_NEON */
961
962#ifdef LV_HAVE_NEONV7
/*!
 * \brief Declaration only; the definition lives outside this header.
 *
 * NOTE(review): the name and guard suggest an ARMv7 NEON assembly version of
 * the dot product, with cVector as the output and aVector/bVector as the two
 * operands of num_points floats each. Confirm against the definition.
 */
extern void volk_32f_x2_dot_prod_32f_a_neonasm(float* cVector,
                                               const float* aVector,
                                               const float* bVector,
                                               unsigned int num_points);
967#endif /* LV_HAVE_NEONV7 */
968
969#ifdef LV_HAVE_NEONV7
/*!
 * \brief Declaration only; the definition lives outside this header.
 *
 * NOTE(review): the name and guard suggest a further-tuned ARMv7 NEON assembly
 * version of the dot product, with cVector as the output and aVector/bVector
 * as the two operands of num_points floats each. Confirm against the
 * definition.
 */
extern void volk_32f_x2_dot_prod_32f_a_neonasm_opts(float* cVector,
                                                    const float* aVector,
                                                    const float* bVector,
                                                    unsigned int num_points);
974#endif /* LV_HAVE_NEONV7 */
975
976#endif /*INCLUDED_volk_32f_x2_dot_prod_32f_a_H*/