#ifndef INCLUDE_VOLK_VOLK_AVX2_INTRINSICS_H_
#define INCLUDE_VOLK_VOLK_AVX2_INTRINSICS_H_

#include <immintrin.h> /* AVX/AVX2 intrinsics used throughout this header */
    /* Shuffle control used below to place the sign bytes of 'fbits' into the
       most significant byte of each 32-bit lane; 0xff entries zero a byte. */
    const __m256i shuffle_mask = _mm256_setr_epi8(0xff,

    __m256i sign_bits = _mm256_setzero_si256();
    /* Broadcast the 128-bit 'fbits' into both halves of the 256-bit register,
       then shuffle each sign byte into the top byte of one float lane. */
    sign_bits = _mm256_insertf128_si256(sign_bits, fbits, 0);
    sign_bits = _mm256_insertf128_si256(sign_bits, fbits, 1);
    sign_bits = _mm256_shuffle_epi8(sign_bits, shuffle_mask);
    /* Reinterpret as float so the mask can be applied with _mm256_xor_ps(). */
    return _mm256_castsi256_ps(sign_bits);
    /* Flip the sign of llr0 wherever sign_mask has its sign bit set, then add. */
    llr0 = _mm256_xor_ps(llr0, sign_mask);
    __m256 dst = _mm256_add_ps(llr0, llr1);
    return dst;
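
/*
 * Illustrative scalar equivalent of the sign-flip-and-add step above (a sketch
 * for reference only, not part of this header's API; all names below are
 * hypothetical). XOR-ing a float with 0x80000000 flips its sign, so lanes
 * whose mask element has the sign bit set contribute -llr0[i] instead of
 * +llr0[i]:
 *
 *   static inline void fsign_add_llrs_scalar(float* dst,
 *                                            const float* llr0,
 *                                            const float* llr1,
 *                                            const unsigned char* sign_flags,
 *                                            unsigned n)
 *   {
 *       for (unsigned i = 0; i < n; ++i) {
 *           const float flipped = sign_flags[i] ? -llr0[i] : llr0[i];
 *           dst[i] = flipped + llr1[i];
 *       }
 *   }
 */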
    const __m256 cplxValue1)
    /* Permutation that undoes the per-lane interleaving introduced by hadd below. */
    const __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
    /* Square real and imaginary parts of 8 interleaved complex floats. */
    const __m256 squared0 = _mm256_mul_ps(cplxValue0, cplxValue0);
    const __m256 squared1 = _mm256_mul_ps(cplxValue1, cplxValue1);
    /* Pairwise sums give re^2 + im^2; hadd works within 128-bit lanes, so the
       results come out lane-interleaved (z0, z1, z4, z5, z2, z3, z6, z7). */
    const __m256 complex_result = _mm256_hadd_ps(squared0, squared1);
    /* Restore natural order: the result holds |z0|^2 ... |z7|^2. */
    return _mm256_permutevar8x32_ps(complex_result, idx);
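
/*
 * Usage sketch for the squared-magnitude helper above. Names are assumptions
 * not shown in this fragment: the enclosing function is assumed here to be
 * _mm256_magnitudesquared_ps_avx2, and 'in' is a hypothetical pointer to 8
 * interleaved complex floats:
 *
 *   float out[8];
 *   __m256 c0 = _mm256_loadu_ps(in);     // re0 im0 re1 im1 re2 im2 re3 im3
 *   __m256 c1 = _mm256_loadu_ps(in + 8); // re4 im4 ... re7 im7
 *   _mm256_storeu_ps(out, _mm256_magnitudesquared_ps_avx2(c0, c1));
 *   // out[k] == re_k * re_k + im_k * im_k, in natural order thanks to the
 *   // permute above.
 */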
    const __m256 symbols1,
    /* Differences (y - x) for 8 complex symbol/point pairs. */
    const __m256 diff0 = _mm256_sub_ps(symbols0, points0);
    const __m256 diff1 = _mm256_sub_ps(symbols1, points1);
    /* Squared magnitudes |y - x|^2; assumes the squared-magnitude helper above. */
    const __m256 norms = _mm256_magnitudesquared_ps_avx2(diff0, diff1);
    return _mm256_mul_ps(norms, scalar);
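
/*
 * Usage sketch (hypothetical names; the enclosing helper is assumed here to be
 * _mm256_scaled_norm_dist_ps_avx2 with the argument order symbols0, symbols1,
 * points0, points1, scalar): scaled squared distances between 8 received
 * symbols and 8 constellation points, e.g. for soft-decision metrics.
 *
 *   __m256 y0 = _mm256_loadu_ps((const float*)symbols);
 *   __m256 y1 = _mm256_loadu_ps((const float*)symbols + 8);
 *   __m256 x0 = _mm256_loadu_ps((const float*)points);
 *   __m256 x1 = _mm256_loadu_ps((const float*)points + 8);
 *   __m256 scale = _mm256_set1_ps(snr_linear);
 *   __m256 dist = _mm256_scaled_norm_dist_ps_avx2(y0, y1, x0, x1, scale);
 */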
    __m256i* max_indices,
    __m256i* current_indices,
    __m256i indices_increment)
    /* Square real and imaginary parts of the 8 complex inputs. */
    in0 = _mm256_mul_ps(in0, in0);
    in1 = _mm256_mul_ps(in1, in1);
    /* Pairwise sums give |z|^2; hadd leaves the results lane-interleaved. */
    __m256 abs_squared = _mm256_hadd_ps(in0, in1);
    /* 0xFFFFFFFF in lanes where the new value exceeds the running maximum. */
    __m256 compare_mask = _mm256_cmp_ps(abs_squared, *max_values, _CMP_GT_OS);
    /* Keep the larger value per lane. */
    *max_values = _mm256_blendv_ps(*max_values, abs_squared, compare_mask);
    /* Where the compare succeeded, record the current element's index. */
    *max_indices =
        _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(*max_indices),
                                             _mm256_castsi256_ps(*current_indices),
                                             compare_mask));
    /* Advance the per-lane indices for the next batch. */
    *current_indices = _mm256_add_epi32(*current_indices, indices_increment);
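
/*
 * A sketch of a driver loop for the step function above (all names are
 * hypothetical; 'step' stands for the function ending here). The running
 * maxima start at zero, and the initial per-lane indices follow the same
 * interleaved order that _mm256_hadd_ps produces, so each abs_squared lane is
 * paired with the index of the complex number it came from.
 *
 *   const __m256i indices_increment = _mm256_set1_epi32(8);
 *   __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
 *   __m256 max_values = _mm256_setzero_ps();
 *   __m256i max_indices = _mm256_setzero_si256();
 *
 *   for (unsigned i = 0; i < num_points / 8; ++i) {
 *       __m256 in0 = _mm256_loadu_ps((const float*)src);     // 4 complex
 *       __m256 in1 = _mm256_loadu_ps((const float*)src + 8); // next 4 complex
 *       step(in0, in1, &max_values, &max_indices, &current_indices,
 *            indices_increment);
 *       src += 8;
 *   }
 */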
    __m256i* max_indices,
    __m256i* current_indices,
    __m256i indices_increment)
    /* Square real and imaginary parts of the 8 complex inputs. */
    in0 = _mm256_mul_ps(in0, in0);
    in1 = _mm256_mul_ps(in1, in1);
    __m256 abs_squared = _mm256_hadd_ps(in0, in1);
    __m256 compare_mask = _mm256_cmp_ps(abs_squared, *max_values, _CMP_GT_OS);
    /* Same update as the blendv in the variant above, expressed as a max;
       compilers may emit different instruction sequences for the two forms. */
    *max_values = _mm256_max_ps(abs_squared, *max_values);
    /* Where the compare succeeded, record the current element's index. */
    *max_indices =
        _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(*max_indices),
                                             _mm256_castsi256_ps(*current_indices),
                                             compare_mask));
    /* Advance the per-lane indices for the next batch. */
    *current_indices = _mm256_add_epi32(*current_indices, indices_increment);
    __m256i* min_indices,
    __m256i* current_indices,
    __m256i indices_increment)
    /* Square real and imaginary parts of the 8 complex inputs. */
    in0 = _mm256_mul_ps(in0, in0);
    in1 = _mm256_mul_ps(in1, in1);
    /* Pairwise sums give |z|^2; hadd leaves the results lane-interleaved. */
    __m256 abs_squared = _mm256_hadd_ps(in0, in1);
    /* 0xFFFFFFFF in lanes where the new value is below the running minimum. */
    __m256 compare_mask = _mm256_cmp_ps(abs_squared, *min_values, _CMP_LT_OS);
    /* Keep the smaller value per lane. */
    *min_values = _mm256_blendv_ps(*min_values, abs_squared, compare_mask);
    /* Where the compare succeeded, record the current element's index. */
    *min_indices =
        _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(*min_indices),
                                             _mm256_castsi256_ps(*current_indices),
                                             compare_mask));
    /* Advance the per-lane indices for the next batch. */
    *current_indices = _mm256_add_epi32(*current_indices, indices_increment);
    __m256i* min_indices,
    __m256i* current_indices,
    __m256i indices_increment)
    /* Square real and imaginary parts of the 8 complex inputs. */
    in0 = _mm256_mul_ps(in0, in0);
    in1 = _mm256_mul_ps(in1, in1);
    __m256 abs_squared = _mm256_hadd_ps(in0, in1);
    __m256 compare_mask = _mm256_cmp_ps(abs_squared, *min_values, _CMP_LT_OS);
    /* Same update as the blendv in the variant above, expressed as a min;
       compilers may emit different instruction sequences for the two forms. */
    *min_values = _mm256_min_ps(abs_squared, *min_values);
    /* Where the compare succeeded, record the current element's index. */
    *min_indices =
        _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(*min_indices),
                                             _mm256_castsi256_ps(*current_indices),
                                             compare_mask));
    /* Advance the per-lane indices for the next batch. */
    *current_indices = _mm256_add_epi32(*current_indices, indices_increment);
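
/*
 * After the vectorized loop, the 8 running lanes still have to be reduced to a
 * single winner. A sketch of that final step (hypothetical variable names),
 * shown for the minimum search; the maximum search is symmetric:
 *
 *   float values[8];
 *   uint32_t indices[8];
 *   _mm256_storeu_ps(values, min_values);
 *   _mm256_storeu_si256((__m256i*)indices, min_indices);
 *
 *   float min = values[0];
 *   uint32_t index = indices[0];
 *   for (unsigned i = 1; i < 8; ++i) {
 *       if (values[i] < min) {
 *           min = values[i];
 *           index = indices[i];
 *       }
 *   }
 */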