#ifndef INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_
#define INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_
#include <immintrin.h>
static inline __m256 _mm256_complexmul_ps(__m256 x, __m256 y)
{
    __m256 yl, yh, tmp1, tmp2;
    yl = _mm256_moveldup_ps(y);        // duplicate the real parts: cr,cr,dr,dr ...
    yh = _mm256_movehdup_ps(y);        // duplicate the imaginary parts: ci,ci,di,di ...
    tmp1 = _mm256_mul_ps(x, yl);       // ar*cr, ai*cr, br*dr, bi*dr ...
    x = _mm256_shuffle_ps(x, x, 0xB1); // swap real/imag pairs: ai,ar,bi,br ...
    tmp2 = _mm256_mul_ps(x, yh);       // ai*ci, ar*ci, bi*di, br*di ...
    return _mm256_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, ...
}
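
/* Usage sketch (not part of the original header): multiply two buffers of
 * interleaved complex floats, four complex values per __m256. The buffer and
 * function names here are illustrative assumptions, not VOLK identifiers; a
 * real kernel would also handle the remainder of num_points with a scalar
 * tail loop. */
static inline void
example_complexmul_buffers(float* out, const float* a, const float* b, unsigned int num_points)
{
    const unsigned int quarter_points = num_points / 4; // 4 complex = 8 floats per vector
    for (unsigned int number = 0; number < quarter_points; number++) {
        const __m256 x = _mm256_loadu_ps(a + 8 * number);
        const __m256 y = _mm256_loadu_ps(b + 8 * number);
        _mm256_storeu_ps(out + 8 * number, _mm256_complexmul_ps(x, y));
    }
}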
static inline __m256 _mm256_conjugate_ps(__m256 x)
{
    const __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
    return _mm256_xor_ps(x, conjugator); // flip the sign bit of every imaginary part
}
static inline __m256 _mm256_complexconjugatemul_ps(const __m256 x, const __m256 y)
{
    const __m256 nswap = _mm256_permute_ps(x, 0xb1); // swap real/imag pairs of x: ai,ar,bi,br ...
    const __m256 dreal = _mm256_moveldup_ps(y);      // cr,cr,dr,dr ...
    const __m256 dimag = _mm256_movehdup_ps(y);      // ci,ci,di,di ...

    const __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
    const __m256 dimagconj = _mm256_xor_ps(dimag, conjugator); // ci,-ci,di,-di ...
    const __m256 multreal = _mm256_mul_ps(x, dreal);           // ar*cr, ai*cr ...
    const __m256 multimag = _mm256_mul_ps(nswap, dimagconj);   // ai*ci, -ar*ci ...
    return _mm256_add_ps(multreal, multimag); // x * conj(y)
}
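
/* Scalar reference (illustrative, not part of VOLK): _mm256_complexconjugatemul_ps
 * computes x * conj(y) element-wise; for a single complex pair the result is
 * (xr*yr + xi*yi) + i*(xi*yr - xr*yi). */
static inline void
example_complexconjugatemul_scalar(float xr, float xi, float yr, float yi, float* outr, float* outi)
{
    *outr = xr * yr + xi * yi;
    *outi = xi * yr - xr * yi;
}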
static inline __m256 _mm256_normalize_ps(__m256 val)
{
    __m256 tmp1 = _mm256_mul_ps(val, val); // square the real and imaginary parts
    tmp1 = _mm256_hadd_ps(tmp1, tmp1);     // re*re + im*im per complex value
    tmp1 = _mm256_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(3, 1, 2, 0)); // spread each squared magnitude to its re/im slots
    tmp1 = _mm256_sqrt_ps(tmp1);
    return _mm256_div_ps(val, tmp1); // divide each complex value by its magnitude
}
static inline __m256 _mm256_magnitudesquared_ps(__m256 cplxValue1, __m256 cplxValue2)
{
    __m256 complex1, complex2;
    cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // square the real and imaginary parts
    cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2);
    complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20); // low 128-bit lanes of both inputs
    complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31); // high 128-bit lanes of both inputs
    return _mm256_hadd_ps(complex1, complex2); // re*re + im*im, in input order
}
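
/* Usage sketch (illustrative, not VOLK code): compute |z|^2 for eight complex
 * floats at a time. cplxValue1 holds the first four complex values and
 * cplxValue2 the next four; the result comes back in input order. */
static inline __m256 example_magnitudesquared_load(const float* complex_in)
{
    const __m256 cplxValue1 = _mm256_loadu_ps(complex_in);     // re0,im0 ... re3,im3
    const __m256 cplxValue2 = _mm256_loadu_ps(complex_in + 8); // re4,im4 ... re7,im7
    return _mm256_magnitudesquared_ps(cplxValue1, cplxValue2); // |z0|^2 ... |z7|^2
}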
static inline __m256 _mm256_scaled_norm_dist_ps(const __m256 symbols0,
                                                const __m256 symbols1,
                                                const __m256 points0,
                                                const __m256 points1,
                                                const __m256 scalar)
{
    // |symbols - points|^2 * scalar, treating the inputs as interleaved complex floats
    const __m256 diff0 = _mm256_sub_ps(symbols0, points0);
    const __m256 diff1 = _mm256_sub_ps(symbols1, points1);
    const __m256 norms = _mm256_magnitudesquared_ps(diff0, diff1);
    return _mm256_mul_ps(norms, scalar);
}
    __m256 sign_mask_dummy = _mm256_setzero_ps(); // part of the helper that expands the frozen-bit decisions into a per-LLR sign mask
static inline void
_mm256_polar_deinterleave(__m256* llr0, __m256* llr1, __m256 src0, __m256 src1)
{
    // split the interleaved LLR pairs in src0/src1 into two separate vectors
    __m256 part0 = _mm256_permute2f128_ps(src0, src1, 0x20);
    __m256 part1 = _mm256_permute2f128_ps(src0, src1, 0x31);
    *llr0 = _mm256_shuffle_ps(part0, part1, 0x88); // even-indexed elements
    *llr1 = _mm256_shuffle_ps(part0, part1, 0xdd); // odd-indexed elements
}
static inline __m256 _mm256_polar_minsum_llrs(__m256 src0, __m256 src1)
{
    const __m256 sign_mask = _mm256_set1_ps(-0.0f);
    const __m256 abs_mask =
        _mm256_andnot_ps(sign_mask, _mm256_castsi256_ps(_mm256_set1_epi8(0xff)));
    __m256 llr0, llr1;
    _mm256_polar_deinterleave(&llr0, &llr1, src0, src1);
    // calculate result: sign(llr0) * sign(llr1) * min(|llr0|, |llr1|)
    __m256 sign =
        _mm256_xor_ps(_mm256_and_ps(llr0, sign_mask), _mm256_and_ps(llr1, sign_mask));
    __m256 dst =
        _mm256_min_ps(_mm256_and_ps(llr0, abs_mask), _mm256_and_ps(llr1, abs_mask));
    return _mm256_or_ps(dst, sign);
}
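
/* Scalar reference (illustrative, not VOLK code): the min-sum approximation of
 * the polar-code f-function that _mm256_polar_minsum_llrs applies to eight LLR
 * pairs at once: sign(a) * sign(b) * min(|a|, |b|). Written without math.h so
 * it stays self-contained; signed zeros are handled slightly differently than
 * the bitwise vector version, which is irrelevant since the magnitude is 0 then. */
static inline float example_minsum_llr_scalar(float a, float b)
{
    const float abs_a = a < 0.0f ? -a : a;
    const float abs_b = b < 0.0f ? -b : b;
    const float mag = abs_a < abs_b ? abs_a : abs_b;
    return ((a < 0.0f) != (b < 0.0f)) ? -mag : mag;
}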
static inline __m256 _mm256_polar_fsign_add_llrs(__m256 src0, __m256 src1, __m128i fbits)
{
    // prepare the sign mask derived from the frozen-bit decisions
    __m256 sign_mask = _mm256_polar_sign_mask(fbits);
    __m256 llr0, llr1;
    _mm256_polar_deinterleave(&llr0, &llr1, src0, src1);
    // calculate result: flip llr0 signs where indicated, then add
    llr0 = _mm256_xor_ps(llr0, sign_mask);
    __m256 dst = _mm256_add_ps(llr0, llr1);
    return dst;
}
static inline __m256 _mm256_accumulate_square_sum_ps(
    __m256 sq_acc, __m256 acc, __m256 val, __m256 rec, __m256 aux)
{
    // accumulate rec * (aux*val - acc)^2 onto sq_acc, one step of a scaled running sum of squared deviations
    aux = _mm256_mul_ps(aux, val);
    aux = _mm256_sub_ps(aux, acc);
    aux = _mm256_mul_ps(aux, aux);
    aux = _mm256_mul_ps(aux, rec);
    return _mm256_add_ps(sq_acc, aux);
}

#endif /* INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_ */