#include "NE10_types.h"
#include "NE10_macros.h"
#include "NE10_fft.neonintrinsic.h"
#include "NE10_fft_generic_float32.h"
typedef float32x4x2_t CPLX;
typedef float32x4_t REAL;

#define NE10_REAL_DUP_NEON_F32 vdupq_n_f32
#define NE10_CPLX_LOAD(PTR) vld2q_f32 ((ne10_float32_t*) (PTR))
#define NE10_CPLX_STORE(PTR,OUT) \
    do { \
        vst2q_f32 ((ne10_float32_t*) (PTR), OUT); \
    } while (0)
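/*
 * CPLX packs four complex values in deinterleaved (planar) form:
 * val[0] holds the four real parts and val[1] the four imaginary
 * parts. vld2q_f32/vst2q_f32 perform the (de)interleaving at the
 * memory boundary, so every helper below operates on four complex
 * samples per call.
 */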
static inline void NE10_LOAD_TW_AND_MUL (CPLX &scratch_in,
        const ne10_fft_cpx_float32_t *ptr_in)
{
    CPLX scratch_tw;
    float32x2_t d2_tmp = vld1_f32 ((ne10_float32_t *) ptr_in);

    scratch_tw.val[0] = NE10_REAL_DUP_NEON_F32 (d2_tmp[0]);
    scratch_tw.val[1] = NE10_REAL_DUP_NEON_F32 (d2_tmp[1]);

    NE10_CPX_MUL_NEON_F32 (scratch_in, scratch_in, scratch_tw);
}
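/*
 * NE10_LOAD_TW_AND_MUL loads one (real, imag) twiddle pair and
 * broadcasts each half across the four lanes, so all four complex
 * samples held in scratch_in are rotated by the same factor.
 */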
static inline REAL NE10_S_MUL_NEON_F32 (const REAL vec,
        const ne10_float32_t scalar)
{
    REAL scalar_neon = NE10_REAL_DUP_NEON_F32 (scalar);
    REAL result = scalar_neon * vec;
    return result;
}

static inline REAL NE10_S_MLA_NEON_F32 (const REAL dst,
        const REAL src,
        const ne10_float32_t scalar)
{
    REAL scalar_neon = NE10_REAL_DUP_NEON_F32 (scalar);
    return vmlaq_f32 (dst, src, scalar_neon);
}

static inline REAL NE10_S_MLS_NEON_F32 (const REAL dst,
        const REAL src,
        const ne10_float32_t scalar)
{
    REAL scalar_neon = NE10_REAL_DUP_NEON_F32 (scalar);
    return vmlsq_f32 (dst, src, scalar_neon);
}
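/*
 * Vector-by-scalar helpers: NE10_S_MUL computes scalar * vec,
 * NE10_S_MLA computes dst + src * scalar, and NE10_S_MLS computes
 * dst - src * scalar, with the scalar broadcast to all four lanes
 * first.
 */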
static inline void NE10_FFT2_MUL_TW_NEON (CPLX scratch_out[2],
        const CPLX scratch_in[2],
        const CPLX scratch_tw[1])
{
    scratch_out[0] = scratch_in[0];
    NE10_CPX_MUL_NEON_F32 (scratch_out[1], scratch_in[1], scratch_tw[0]);
}

static inline void NE10_FFT3_MUL_TW_NEON (CPLX scratch_out[3],
        const CPLX scratch_in[3],
        const CPLX scratch_tw[2])
{
    NE10_FFT2_MUL_TW_NEON (scratch_out, scratch_in, scratch_tw);
    NE10_CPX_MUL_NEON_F32 (scratch_out[2], scratch_in[2], scratch_tw[1]);
}

static inline void NE10_FFT4_MUL_TW_NEON (CPLX scratch_out[4],
        const CPLX scratch_in[4],
        const CPLX scratch_tw[3])
{
    NE10_FFT3_MUL_TW_NEON (scratch_out, scratch_in, scratch_tw);
    NE10_CPX_MUL_NEON_F32 (scratch_out[3], scratch_in[3], scratch_tw[2]);
}

static inline void NE10_FFT5_MUL_TW_NEON (CPLX scratch_out[5],
        const CPLX scratch_in[5],
        const CPLX scratch_tw[4])
{
    NE10_FFT4_MUL_TW_NEON (scratch_out, scratch_in, scratch_tw);
    NE10_CPX_MUL_NEON_F32 (scratch_out[4], scratch_in[4], scratch_tw[3]);
}
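/*
 * Each NE10_FFTn_MUL_TW_NEON reuses the radix-(n-1) variant and
 * multiplies only the one extra element; element 0 always passes
 * through untouched because its twiddle is 1 + 0j.
 */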
static inline void NE10_FFT2_CONJ (CPLX scratch_out[2])
{
    scratch_out[0].val[1] = -scratch_out[0].val[1];
    scratch_out[1].val[1] = -scratch_out[1].val[1];
}

static inline void NE10_FFT3_CONJ (CPLX scratch_out[3])
{
    NE10_FFT2_CONJ (scratch_out);
    scratch_out[2].val[1] = -scratch_out[2].val[1];
}

static inline void NE10_FFT4_CONJ (CPLX scratch_out[4])
{
    NE10_FFT3_CONJ (scratch_out);
    scratch_out[3].val[1] = -scratch_out[3].val[1];
}

static inline void NE10_FFT5_CONJ (CPLX scratch_out[5])
{
    NE10_FFT4_CONJ (scratch_out);
    scratch_out[4].val[1] = -scratch_out[4].val[1];
}

static inline void NE10_FFT8_CONJ (CPLX scratch_out[8])
{
    NE10_FFT5_CONJ (scratch_out);
    scratch_out[5].val[1] = -scratch_out[5].val[1];
    scratch_out[6].val[1] = -scratch_out[6].val[1];
    scratch_out[7].val[1] = -scratch_out[7].val[1];
}
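/*
 * The conjugation helpers carry the inverse transform through the
 * identity IDFT(x) = conj(DFT(conj(x))) (up to 1/N scaling): the
 * butterflies below conjugate their inputs on entry and their outputs
 * on exit when is_inverse == 1.
 */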
static inline void NE10_FFT2_SCALING (CPLX scratch_out[2],
        const REAL one_by_fft_neon)
{
#ifdef NE10_DSP_CFFT_SCALING
    scratch_out[0].val[0] *= one_by_fft_neon;
    scratch_out[0].val[1] *= one_by_fft_neon;
    scratch_out[1].val[0] *= one_by_fft_neon;
    scratch_out[1].val[1] *= one_by_fft_neon;
#endif
}

static inline void NE10_FFT3_SCALING (CPLX scratch_out[3],
        const REAL one_by_fft_neon)
{
#ifdef NE10_DSP_CFFT_SCALING
    NE10_FFT2_SCALING (scratch_out, one_by_fft_neon);
    scratch_out[2].val[0] *= one_by_fft_neon;
    scratch_out[2].val[1] *= one_by_fft_neon;
#endif
}

static inline void NE10_FFT4_SCALING (CPLX scratch_out[4],
        const REAL one_by_fft_neon)
{
#ifdef NE10_DSP_CFFT_SCALING
    NE10_FFT3_SCALING (scratch_out, one_by_fft_neon);
    scratch_out[3].val[0] *= one_by_fft_neon;
    scratch_out[3].val[1] *= one_by_fft_neon;
#endif
}

static inline void NE10_FFT5_SCALING (CPLX scratch_out[5],
        const REAL one_by_fft_neon)
{
#ifdef NE10_DSP_CFFT_SCALING
    NE10_FFT4_SCALING (scratch_out, one_by_fft_neon);
    scratch_out[4].val[0] *= one_by_fft_neon;
    scratch_out[4].val[1] *= one_by_fft_neon;
#endif
}

static inline void NE10_FFT8_SCALING (CPLX scratch_out[8],
        const REAL one_by_fft_neon)
{
#ifdef NE10_DSP_CFFT_SCALING
    NE10_FFT5_SCALING (scratch_out, one_by_fft_neon);
    scratch_out[5].val[0] *= one_by_fft_neon;
    scratch_out[5].val[1] *= one_by_fft_neon;
    scratch_out[6].val[0] *= one_by_fft_neon;
    scratch_out[6].val[1] *= one_by_fft_neon;
    scratch_out[7].val[0] *= one_by_fft_neon;
    scratch_out[7].val[1] *= one_by_fft_neon;
#endif
}
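/*
 * Scaling is compiled in only when NE10_DSP_CFFT_SCALING is defined;
 * the is_scaled template flag then selects it per call. The
 * butterflies pass one_by_fft_neon = 0.25 / nfft, which appears to
 * fold in the radix-4 last stage that runs unscaled: 0.25 / nfft is
 * 1 / (4 * nfft), i.e. 1/N of the full transform.
 */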
static inline void NE10_FFT2_FUC_NEON_F32 (CPLX scratch_out[2],
        const CPLX scratch_in[2])
{
    NE10_CPX_ADD_NEON_F32 (scratch_out[0], scratch_in[0], scratch_in[1]);
    NE10_CPX_SUB_NEON_F32 (scratch_out[1], scratch_in[0], scratch_in[1]);
}

static inline void NE10_FFT3_FUC_NEON_F32 (CPLX Fout[3],
        const CPLX Fin[3])
{
    const float32x4_t TW_3IN_NEON_F32 = vdupq_n_f32 (TW_3IN_F32);
    const float32x4_t HALF_NEON_F32 = vdupq_n_f32 (0.5f);

    NE10_CPX_ADD_NEON_F32 (Fout[2], Fin[1], Fin[2]);
    NE10_CPX_SUB_NEON_F32 (Fout[0], Fin[1], Fin[2]);

    Fout[1].val[0] = Fin[0].val[0] - Fout[2].val[0] * HALF_NEON_F32;
    Fout[1].val[1] = Fin[0].val[1] - Fout[2].val[1] * HALF_NEON_F32;

    Fout[0].val[0] = Fout[0].val[0] * TW_3IN_NEON_F32;
    Fout[0].val[1] = Fout[0].val[1] * TW_3IN_NEON_F32;
}
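/*
 * The radix-3 kernel exploits cos(2*pi/3) = -1/2 (the HALF constant)
 * and sin(2*pi/3) = sqrt(3)/2 ~= 0.8660; TW_3IN_F32, declared in
 * NE10_fft_generic_float32.h, carries that sine term (up to sign
 * convention).
 */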
static inline void NE10_FFT4_FUC_NEON_F32 (CPLX scratch_out[4],
        const CPLX scratch_in[4])
{
    CPLX scratch[4];

    NE10_CPX_ADD_NEON_F32 (scratch[0], scratch_in[0], scratch_in[2]);
    NE10_CPX_SUB_NEON_F32 (scratch[1], scratch_in[0], scratch_in[2]);
    NE10_CPX_ADD_NEON_F32 (scratch[2], scratch_in[1], scratch_in[3]);
    NE10_CPX_SUB_NEON_F32 (scratch[3], scratch_in[1], scratch_in[3]);

    NE10_CPX_SUB_NEON_F32 (scratch_out[2], scratch[0], scratch[2]);
    NE10_CPX_ADD_NEON_F32 (scratch_out[0], scratch[0], scratch[2]);

    scratch_out[1].val[0] = scratch[1].val[0] + scratch[3].val[1];
    scratch_out[1].val[1] = scratch[1].val[1] - scratch[3].val[0];
    scratch_out[3].val[0] = scratch[1].val[0] - scratch[3].val[1];
    scratch_out[3].val[1] = scratch[1].val[1] + scratch[3].val[0];
}

static inline void NE10_FFT4_FUC_INPLACE_NEON_F32 (CPLX scratch_out[4])
{
    CPLX scratch[4];

    NE10_CPX_ADD_NEON_F32 (scratch[0], scratch_out[0], scratch_out[2]);
    NE10_CPX_SUB_NEON_F32 (scratch[1], scratch_out[0], scratch_out[2]);
    NE10_CPX_ADD_NEON_F32 (scratch[2], scratch_out[1], scratch_out[3]);
    NE10_CPX_SUB_NEON_F32 (scratch[3], scratch_out[1], scratch_out[3]);

    NE10_CPX_SUB_NEON_F32 (scratch_out[2], scratch[0], scratch[2]);
    NE10_CPX_ADD_NEON_F32 (scratch_out[0], scratch[0], scratch[2]);

    scratch_out[1].val[0] = scratch[1].val[0] + scratch[3].val[1];
    scratch_out[1].val[1] = scratch[1].val[1] - scratch[3].val[0];
    scratch_out[3].val[0] = scratch[1].val[0] - scratch[3].val[1];
    scratch_out[3].val[1] = scratch[1].val[1] + scratch[3].val[0];
}
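/*
 * In the radix-4 kernels the odd outputs absorb a multiplication by
 * -j: (a + bj) * (-j) = b - aj, which is why scratch[3] contributes
 * with swapped real/imaginary parts and flipped signs to outputs 1
 * and 3.
 */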
static inline void NE10_FFT5_FUC_INPLACE_NEON_F32 (CPLX Fout[5])
{
    CPLX s[6];

    NE10_CPX_ADD_NEON_F32 (s[1], Fout[1], Fout[4]);
    NE10_CPX_ADD_NEON_F32 (s[2], Fout[2], Fout[3]);

    s[0] = Fout[0];
    s[5] = Fout[0];

    Fout[0].val[0] = Fout[0].val[0] + s[1].val[0] + s[2].val[0];
    Fout[0].val[1] = Fout[0].val[1] + s[1].val[1] + s[2].val[1];

    s[0].val[0] = NE10_S_MLA_NEON_F32 (s[0].val[0], s[1].val[0], TW_5A_F32.r);
    s[0].val[1] = NE10_S_MLA_NEON_F32 (s[0].val[1], s[1].val[1], TW_5A_F32.r);
    s[5].val[0] = NE10_S_MLA_NEON_F32 (s[5].val[0], s[1].val[0], TW_5B_F32.r);
    s[5].val[1] = NE10_S_MLA_NEON_F32 (s[5].val[1], s[1].val[1], TW_5B_F32.r);

    s[0].val[0] = NE10_S_MLA_NEON_F32 (s[0].val[0], s[2].val[0], TW_5B_F32.r);
    s[0].val[1] = NE10_S_MLA_NEON_F32 (s[0].val[1], s[2].val[1], TW_5B_F32.r);
    s[5].val[0] = NE10_S_MLA_NEON_F32 (s[5].val[0], s[2].val[0], TW_5A_F32.r);
    s[5].val[1] = NE10_S_MLA_NEON_F32 (s[5].val[1], s[2].val[1], TW_5A_F32.r);

    NE10_CPX_SUB_NEON_F32 (s[4], Fout[1], Fout[4]);
    NE10_CPX_SUB_NEON_F32 (s[3], Fout[2], Fout[3]);

    s[1].val[0] = NE10_S_MUL_NEON_F32 (s[4].val[1], TW_5A_F32.i);
    s[1].val[1] = -NE10_S_MUL_NEON_F32 (s[4].val[0], TW_5A_F32.i);
    s[2].val[0] = -NE10_S_MUL_NEON_F32 (s[4].val[1], TW_5B_F32.i);
    s[2].val[1] = NE10_S_MUL_NEON_F32 (s[4].val[0], TW_5B_F32.i);

    s[1].val[0] = NE10_S_MLA_NEON_F32 (s[1].val[0], s[3].val[1], TW_5B_F32.i);
    s[1].val[1] = NE10_S_MLS_NEON_F32 (s[1].val[1], s[3].val[0], TW_5B_F32.i);
    s[2].val[0] = NE10_S_MLA_NEON_F32 (s[2].val[0], s[3].val[1], TW_5A_F32.i);
    s[2].val[1] = NE10_S_MLS_NEON_F32 (s[2].val[1], s[3].val[0], TW_5A_F32.i);

    NE10_CPX_SUB_NEON_F32 (Fout[1], s[0], s[1]);
    NE10_CPX_ADD_NEON_F32 (Fout[4], s[0], s[1]);
    NE10_CPX_ADD_NEON_F32 (Fout[2], s[5], s[2]);
    NE10_CPX_SUB_NEON_F32 (Fout[3], s[5], s[2]);
}
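/*
 * TW_5A_F32 and TW_5B_F32 (declared in NE10_fft_generic_float32.h)
 * are the radix-5 rotations, roughly exp(-2*pi*j/5) and
 * exp(-4*pi*j/5). Output pairs 1/4 and 2/3 are then sums and
 * differences around the real (.r) and imaginary (.i) projections,
 * which keeps the kernel to adds plus fused multiply-accumulates.
 */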
#define NE10_BUTTERFLY_INDEX_NEON_F32(OUT,IN,OUT_I,OUT_J,IN_I,IN_J) \
    do { \
        NE10_CPX_ADD_NEON_F32 (OUT[OUT_I], IN[IN_I], IN[IN_J]); \
        NE10_CPX_SUB_NEON_F32 (OUT[OUT_J], IN[IN_I], IN[IN_J]); \
    } while (0)
static inline void NE10_FFT8_FUC_NEON_F32 (CPLX out[8],
        const CPLX in[8])
{
    CPLX s[8];
    static const ne10_fft_cpx_float32_t TW_8[4] =
    {
        {  1.00000,  0.00000 },
        {  0.70711, -0.70711 },
        {  0.00000, -1.00000 },
        { -0.70711, -0.70711 },
    };

    // STAGE - 1: in -> s
    NE10_BUTTERFLY_INDEX_NEON_F32 (s, in, 0, 4, 0, 4);
    NE10_BUTTERFLY_INDEX_NEON_F32 (s, in, 1, 5, 1, 5);
    NE10_BUTTERFLY_INDEX_NEON_F32 (s, in, 2, 6, 2, 6);
    NE10_BUTTERFLY_INDEX_NEON_F32 (s, in, 3, 7, 3, 7);

    // STAGE - 2: s -> out
#define NE10_CPX_MUL_TW8_NEON_F32(OUT,TW_8_TABLE,OUT_I,TW_J) \
    do { \
        ne10_fft_cpx_float32_t TW_TMP = TW_8_TABLE[TW_J]; \
        CPLX TW_TMP_NEON; \
        TW_TMP_NEON.val[0] = NE10_REAL_DUP_NEON_F32 (TW_TMP.r); \
        TW_TMP_NEON.val[1] = NE10_REAL_DUP_NEON_F32 (TW_TMP.i); \
        NE10_CPX_MUL_NEON_F32 (OUT[OUT_I], OUT[OUT_I], TW_TMP_NEON); \
    } while (0)

    NE10_CPX_MUL_TW8_NEON_F32 (s, TW_8, 4, 0);
    NE10_CPX_MUL_TW8_NEON_F32 (s, TW_8, 5, 1);
    NE10_CPX_MUL_TW8_NEON_F32 (s, TW_8, 6, 2);
    NE10_CPX_MUL_TW8_NEON_F32 (s, TW_8, 7, 3);

    NE10_BUTTERFLY_INDEX_NEON_F32 (out, s, 0, 2, 0, 2);
    NE10_BUTTERFLY_INDEX_NEON_F32 (out, s, 1, 3, 1, 3);
    NE10_BUTTERFLY_INDEX_NEON_F32 (out, s, 4, 6, 4, 6);
    NE10_BUTTERFLY_INDEX_NEON_F32 (out, s, 5, 7, 5, 7);

    // STAGE - 3: out -> s
    NE10_CPX_MUL_TW8_NEON_F32 (out, TW_8, 2, 0);
    NE10_CPX_MUL_TW8_NEON_F32 (out, TW_8, 3, 2);
    NE10_CPX_MUL_TW8_NEON_F32 (out, TW_8, 6, 0);
    NE10_CPX_MUL_TW8_NEON_F32 (out, TW_8, 7, 2);
#undef NE10_CPX_MUL_TW8_NEON_F32

    NE10_BUTTERFLY_INDEX_NEON_F32 (s, out, 0, 4, 0, 1);
    NE10_BUTTERFLY_INDEX_NEON_F32 (s, out, 2, 6, 2, 3);
    NE10_BUTTERFLY_INDEX_NEON_F32 (s, out, 1, 5, 4, 5);
    NE10_BUTTERFLY_INDEX_NEON_F32 (s, out, 3, 7, 6, 7);

    out[0] = s[0];
    out[1] = s[1];
    out[2] = s[2];
    out[3] = s[3];
    out[4] = s[4];
    out[5] = s[5];
    out[6] = s[6];
    out[7] = s[7];
}
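/*
 * The radix-8 kernel above is three hard-wired radix-2 stages; TW_8
 * holds the eighth roots of unity it needs (0.70711 approximates
 * sqrt(2)/2), and the multiplies by TW_8[0] = 1 + 0j are kept for
 * regularity rather than special-cased.
 */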
template<ne10_int32_t is_first_stage, ne10_int32_t is_inverse, bool is_scaled>
static void ne10_radix_2_butterfly_float32_neon (CPLX *Fout,
        const CPLX *Fin,
        const ne10_fft_cpx_float32_t *twiddles,
        const ne10_int32_t fstride,
        const ne10_int32_t out_step,
        const ne10_int32_t nfft)
{
    CPLX in[2];
    CPLX out[2];

    const ne10_int32_t in_step = nfft / 2;
    ne10_int32_t f_count;
    ne10_int32_t m_count;

    const REAL one_by_fft_neon = NE10_REAL_DUP_NEON_F32 (0.25 / nfft);

    for (f_count = fstride; f_count > 0; f_count--)
    {
        for (m_count = out_step; m_count > 0; m_count--)
        {
#ifndef NE10_INLINE_ASM_OPT
            in[0] = NE10_CPLX_LOAD (Fin + 0 * in_step);
            in[1] = NE10_CPLX_LOAD (Fin + 1 * in_step);

            if (is_inverse == 1)
            {
                NE10_FFT2_CONJ (in);
            }

            if (is_first_stage == 0)
            {
                NE10_LOAD_TW_AND_MUL (in[1], twiddles);
            }

            NE10_FFT2_FUC_NEON_F32 (out, in);

            if (is_inverse == 1)
            {
                NE10_FFT2_CONJ (out);
            }

            if (is_scaled)
            {
                NE10_FFT2_SCALING (out, one_by_fft_neon);
            }

            NE10_CPLX_STORE (Fout + 0 * out_step, out[0]);
            NE10_CPLX_STORE (Fout + 1 * out_step, out[1]);
#else // NE10_INLINE_ASM_OPT
#ifndef __aarch64__
#error Currently, inline assembly optimizations are only available on AArch64.
#endif // __aarch64__
            asm volatile (
                "ld2 {v0.4s, v1.4s}, [%[pin0]] \n\t"
                "ld2 {v2.4s, v3.4s}, [%[pin1]] \n\t"
                :
                : [pin0] "r"(Fin),
                  [pin1] "r"(Fin + in_step)
                : "memory", "v0", "v1", "v2", "v3");

            if (is_inverse == 1)
            {
                asm volatile (
                    "fneg v1.4s, v1.4s \n\t"
                    "fneg v3.4s, v3.4s \n\t"
                    :
                    :
                    : "v0", "v1", "v2", "v3");
            }

            if (is_first_stage == 0)
            {
                asm volatile (
                    "ld1 {v4.1d}, [%[ptw]] \n\t"

                    "fmul v14.4s, v2.4s, v4.s[1] \n\t"
                    "fmul v2.4s, v2.4s, v4.s[0] \n\t"
                    "fmls v2.4s, v3.4s, v4.s[1] \n\t"
                    "fmul v15.4s, v3.4s, v4.s[0] \n\t"
                    "fadd v3.4s, v14.4s, v15.4s \n\t"
                    :
                    : [ptw] "r"(twiddles)
                    : "memory", "v0", "v1", "v2", "v3", "v4", "v14", "v15");
            }

            asm volatile (
                "fadd v4.4s, v0.4s, v2.4s \n\t"
                "fadd v5.4s, v1.4s, v3.4s \n\t"
                "fsub v6.4s, v0.4s, v2.4s \n\t"
                "fsub v7.4s, v1.4s, v3.4s \n\t"
                :
                :
                : "v0", "v1", "v2", "v3",
                  "v4", "v5", "v6", "v7");

            if (is_inverse == 1)
            {
                asm volatile (
                    "fneg v5.4s, v5.4s \n\t"
                    "fneg v7.4s, v7.4s \n\t"
                    :
                    :
                    : "v4", "v5", "v6", "v7");
            }

            if (is_scaled)
            {
                asm volatile (
                    "fmul v4.4s, v4.4s, %[one_by_nfft].4s \n\t"
                    "fmul v5.4s, v5.4s, %[one_by_nfft].4s \n\t"
                    "fmul v6.4s, v6.4s, %[one_by_nfft].4s \n\t"
                    "fmul v7.4s, v7.4s, %[one_by_nfft].4s \n\t"
                    :
                    : [one_by_nfft] "w"(one_by_fft_neon)
                    : "v4", "v5", "v6", "v7");
            }

            asm volatile (
                "st2 {v4.4s, v5.4s}, [%[pout0]] \n\t"
                "st2 {v6.4s, v7.4s}, [%[pout1]] \n\t"
                :
                : [pout0] "r"(Fout),
                  [pout1] "r"(Fout + out_step)
                : "memory", "v4", "v5", "v6", "v7");
#endif // NE10_INLINE_ASM_OPT
            Fin++;
            Fout++;
            if (is_first_stage == 0)
            {
                twiddles++;
            }
        }

        if (is_first_stage == 0)
        {
            twiddles -= out_step;
            Fout += (2 - 1) * out_step;
        }
    }
}
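/*
 * The NE10_INLINE_ASM_OPT path above mirrors the intrinsic path step
 * for step: v0/v1 and v2/v3 carry the real/imaginary planes of the
 * two inputs, the twiddle multiply expands to fmul/fmls/fadd, and st2
 * re-interleaves on store. Consecutive asm blocks hand values to each
 * other through these fixed registers, which is why every block lists
 * them as clobbers.
 */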
template<ne10_int32_t is_first_stage, ne10_int32_t is_inverse, bool is_scaled>
static void ne10_radix_4_butterfly_float32_neon (CPLX *Fout,
        const CPLX *Fin,
        const ne10_fft_cpx_float32_t *twiddles,
        const ne10_int32_t fstride,
        const ne10_int32_t out_step,
        const ne10_int32_t nfft)
{
    CPLX in[4];
#ifdef NE10_INLINE_ASM_OPT
    CPLX s[4];
#endif
    const ne10_int32_t in_step = nfft / 4;
    ne10_int32_t f_count;
    ne10_int32_t m_count;

    const REAL one_by_fft_neon = NE10_REAL_DUP_NEON_F32 (0.25 / nfft);

    for (f_count = fstride; f_count > 0; f_count--)
    {
        for (m_count = out_step; m_count > 0; m_count--)
        {
#ifndef NE10_INLINE_ASM_OPT
            in[0] = NE10_CPLX_LOAD (Fin + 0 * in_step);
            in[1] = NE10_CPLX_LOAD (Fin + 1 * in_step);
            in[2] = NE10_CPLX_LOAD (Fin + 2 * in_step);
            in[3] = NE10_CPLX_LOAD (Fin + 3 * in_step);

            if (is_inverse == 1)
            {
                NE10_FFT4_CONJ (in);
            }

            if (is_first_stage == 0)
            {
                NE10_LOAD_TW_AND_MUL (in[1], twiddles);
                NE10_LOAD_TW_AND_MUL (in[2], twiddles + out_step);
                NE10_LOAD_TW_AND_MUL (in[3], twiddles + out_step * 2);
            }

            NE10_FFT4_FUC_INPLACE_NEON_F32 (in);

            if (is_inverse == 1)
            {
                NE10_FFT4_CONJ (in);
            }

            if (is_scaled)
            {
                NE10_FFT4_SCALING (in, one_by_fft_neon);
            }

            NE10_CPLX_STORE (Fout + 0 * out_step, in[0]);
            NE10_CPLX_STORE (Fout + 1 * out_step, in[1]);
            NE10_CPLX_STORE (Fout + 2 * out_step, in[2]);
            NE10_CPLX_STORE (Fout + 3 * out_step, in[3]);
#else // NE10_INLINE_ASM_OPT
#ifndef __aarch64__
#error Currently, inline assembly optimizations are only available on AArch64.
#endif // __aarch64__

#define NEON_REGISTERS_C2C_R4 \
    "v0", "v1", "v2", "v3", \
    "v4", "v5", "v6", "v7"

#define NEON_REGISTERS_C2C_TW_R4 \
    "v8", "v9", "v10"

            asm volatile (
                "ld2 {v0.4s, v1.4s}, [%[pin0]] \n\t"
                "ld2 {v2.4s, v3.4s}, [%[pin1]] \n\t"
                "ld2 {v4.4s, v5.4s}, [%[pin2]] \n\t"
                "ld2 {v6.4s, v7.4s}, [%[pin3]] \n\t"
                :
                : [pin0] "r"(Fin),
                  [pin1] "r"(Fin + in_step),
                  [pin2] "r"(Fin + in_step * 2),
                  [pin3] "r"(Fin + in_step * 3)
                : "memory", NEON_REGISTERS_C2C_R4);

            if (is_inverse == 1)
            {
                asm volatile (
                    "fneg v1.4s, v1.4s \n\t"
                    "fneg v3.4s, v3.4s \n\t"
                    "fneg v5.4s, v5.4s \n\t"
                    "fneg v7.4s, v7.4s \n\t"
                    :
                    :
                    : NEON_REGISTERS_C2C_R4);
            }

            if (is_first_stage == 0)
            {
                asm volatile (
                    "ld1 { v8.1d}, [%[ptw0]] \n\t"
                    "ld1 { v9.1d}, [%[ptw1]] \n\t"
                    "ld1 {v10.1d}, [%[ptw2]] \n\t"

                    "fmul v14.4s, v2.4s, v8.s[1] \n\t"
                    "fmul v2.4s, v2.4s, v8.s[0] \n\t"
                    "fmls v2.4s, v3.4s, v8.s[1] \n\t"
                    "fmul v15.4s, v3.4s, v8.s[0] \n\t"
                    "fadd v3.4s, v14.4s, v15.4s \n\t"

                    "fmul v14.4s, v4.4s, v9.s[1] \n\t"
                    "fmul v4.4s, v4.4s, v9.s[0] \n\t"
                    "fmls v4.4s, v5.4s, v9.s[1] \n\t"
                    "fmul v15.4s, v5.4s, v9.s[0] \n\t"
                    "fadd v5.4s, v14.4s, v15.4s \n\t"

                    "fmul v14.4s, v6.4s, v10.s[1] \n\t"
                    "fmul v6.4s, v6.4s, v10.s[0] \n\t"
                    "fmls v6.4s, v7.4s, v10.s[1] \n\t"
                    "fmul v15.4s, v7.4s, v10.s[0] \n\t"
                    "fadd v7.4s, v14.4s, v15.4s \n\t"
                    :
                    : [ptw0] "r"(twiddles),
                      [ptw1] "r"(twiddles + out_step),
                      [ptw2] "r"(twiddles + out_step * 2)
                    : "memory", NEON_REGISTERS_C2C_R4,
                      NEON_REGISTERS_C2C_TW_R4, "v14", "v15");
            }

            asm volatile (
                "fadd %[s0r].4s, v0.4s, v4.4s \n\t"
                "fadd %[s0i].4s, v1.4s, v5.4s \n\t"

                "fsub %[s1r].4s, v0.4s, v4.4s \n\t"
                "fsub %[s1i].4s, v1.4s, v5.4s \n\t"

                "fadd %[s2r].4s, v2.4s, v6.4s \n\t"
                "fadd %[s2i].4s, v3.4s, v7.4s \n\t"

                "fsub %[s3r].4s, v2.4s, v6.4s \n\t"
                "fsub %[s3i].4s, v3.4s, v7.4s \n\t"
                : [s0r] "+w"(s[0].val[0]),
                  [s0i] "+w"(s[0].val[1]),
                  [s1r] "+w"(s[1].val[0]),
                  [s1i] "+w"(s[1].val[1]),
                  [s2r] "+w"(s[2].val[0]),
                  [s2i] "+w"(s[2].val[1]),
                  [s3r] "+w"(s[3].val[0]),
                  [s3i] "+w"(s[3].val[1])
                :
                : NEON_REGISTERS_C2C_R4);

            asm volatile (
                "fadd v0.4s, %[s0r].4s, %[s2r].4s \n\t"
                "fadd v1.4s, %[s0i].4s, %[s2i].4s \n\t"
                "fsub v4.4s, %[s0r].4s, %[s2r].4s \n\t"
                "fsub v5.4s, %[s0i].4s, %[s2i].4s \n\t"

                "fadd v2.4s, %[s1r].4s, %[s3i].4s \n\t"
                "fsub v3.4s, %[s1i].4s, %[s3r].4s \n\t"
                "fsub v6.4s, %[s1r].4s, %[s3i].4s \n\t"
                "fadd v7.4s, %[s1i].4s, %[s3r].4s \n\t"
                :
                : [s0r] "w"(s[0].val[0]),
                  [s0i] "w"(s[0].val[1]),
                  [s1r] "w"(s[1].val[0]),
                  [s1i] "w"(s[1].val[1]),
                  [s2r] "w"(s[2].val[0]),
                  [s2i] "w"(s[2].val[1]),
                  [s3r] "w"(s[3].val[0]),
                  [s3i] "w"(s[3].val[1])
                : NEON_REGISTERS_C2C_R4);

            if (is_inverse == 1)
            {
                asm volatile (
                    "fneg v1.4s, v1.4s \n\t"
                    "fneg v3.4s, v3.4s \n\t"
                    "fneg v5.4s, v5.4s \n\t"
                    "fneg v7.4s, v7.4s \n\t"
                    :
                    :
                    : NEON_REGISTERS_C2C_R4);
            }

            if (is_scaled)
            {
                asm volatile (
                    "fmul v0.4s, v0.4s, %[one_by_nfft].4s \n\t"
                    "fmul v1.4s, v1.4s, %[one_by_nfft].4s \n\t"
                    "fmul v2.4s, v2.4s, %[one_by_nfft].4s \n\t"
                    "fmul v3.4s, v3.4s, %[one_by_nfft].4s \n\t"
                    "fmul v4.4s, v4.4s, %[one_by_nfft].4s \n\t"
                    "fmul v5.4s, v5.4s, %[one_by_nfft].4s \n\t"
                    "fmul v6.4s, v6.4s, %[one_by_nfft].4s \n\t"
                    "fmul v7.4s, v7.4s, %[one_by_nfft].4s \n\t"
                    :
                    : [one_by_nfft] "w"(one_by_fft_neon)
                    : NEON_REGISTERS_C2C_R4);
            }

            asm volatile (
                "st2 {v0.4s, v1.4s}, [%[pout0]] \n\t"
                "st2 {v2.4s, v3.4s}, [%[pout1]] \n\t"
                "st2 {v4.4s, v5.4s}, [%[pout2]] \n\t"
                "st2 {v6.4s, v7.4s}, [%[pout3]] \n\t"
                :
                : [pout0] "r"(Fout),
                  [pout1] "r"(Fout + out_step),
                  [pout2] "r"(Fout + out_step * 2),
                  [pout3] "r"(Fout + out_step * 3)
                : "memory", NEON_REGISTERS_C2C_R4);

#undef NEON_REGISTERS_C2C_R4
#undef NEON_REGISTERS_C2C_TW_R4
#endif // NE10_INLINE_ASM_OPT
            Fin++;
            Fout++;
            if (is_first_stage == 0)
            {
                twiddles++;
            }
        }

        if (is_first_stage == 0)
        {
            twiddles -= out_step;
            Fout += (4 - 1) * out_step;
        }
    }
}
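/*
 * Pointer bookkeeping shared by the radix-2/3/4/5 stages: Fin, Fout
 * and twiddles advance by one column per inner iteration; after a
 * block of out_step columns, twiddles rewinds by out_step and Fout
 * skips the (radix - 1) * out_step outputs that were written
 * interleaved, so the next fstride block starts on a fresh tile.
 */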
template<ne10_int32_t is_first_stage, ne10_int32_t is_inverse, bool is_scaled>
static void ne10_radix_3_butterfly_float32_neon (CPLX *Fout,
        const CPLX *Fin,
        const ne10_fft_cpx_float32_t *twiddles,
        const ne10_int32_t fstride,
        const ne10_int32_t out_step,
        const ne10_int32_t nfft)
{
    CPLX in[3];
    CPLX out[3];
    CPLX s[4];

    const ne10_int32_t in_step = nfft / 3;
    ne10_int32_t f_count;
    ne10_int32_t m_count;

    const REAL one_by_fft_neon = NE10_REAL_DUP_NEON_F32 (0.25 / nfft);
    const float32x4_t TW_3IN_NEON_F32 = vdupq_n_f32 (TW_3IN_F32);
    const float32x4_t HALF_NEON_F32 = vdupq_n_f32 (0.5f);

    for (f_count = fstride; f_count > 0; f_count--)
    {
        for (m_count = out_step; m_count > 0; m_count--)
        {
#ifndef NE10_INLINE_ASM_OPT
            in[0] = NE10_CPLX_LOAD (Fin + 0 * in_step);
            in[1] = NE10_CPLX_LOAD (Fin + 1 * in_step);
            in[2] = NE10_CPLX_LOAD (Fin + 2 * in_step);

            if (is_inverse == 1)
            {
                NE10_FFT3_CONJ (in);
            }

            if (is_first_stage == 0)
            {
                NE10_LOAD_TW_AND_MUL (in[1], twiddles);
                NE10_LOAD_TW_AND_MUL (in[2], twiddles + out_step);
            }

            NE10_CPX_ADD_NEON_F32 (s[2], in[1], in[2]);
            NE10_CPX_SUB_NEON_F32 (s[0], in[1], in[2]);

            s[3] = in[0];

            s[1].val[0] = - s[2].val[0] * HALF_NEON_F32;
            s[1].val[1] = - s[2].val[1] * HALF_NEON_F32;

            s[1].val[0] += s[3].val[0];
            s[1].val[1] += s[3].val[1];
            s[0].val[0] *= TW_3IN_NEON_F32;
            s[0].val[1] *= TW_3IN_NEON_F32;

            out[0].val[0] = s[3].val[0] + s[2].val[0];
            out[0].val[1] = s[3].val[1] + s[2].val[1];
            out[1].val[0] = s[1].val[0] - s[0].val[1];
            out[1].val[1] = s[1].val[1] + s[0].val[0];
            out[2].val[0] = s[1].val[0] + s[0].val[1];
            out[2].val[1] = s[1].val[1] - s[0].val[0];

            if (is_inverse == 1)
            {
                NE10_FFT3_CONJ (out);
            }

            if (is_scaled)
            {
                NE10_FFT3_SCALING (out, one_by_fft_neon);
            }

            NE10_CPLX_STORE (Fout + 0 * out_step, out[0]);
            NE10_CPLX_STORE (Fout + 1 * out_step, out[1]);
            NE10_CPLX_STORE (Fout + 2 * out_step, out[2]);
#else // NE10_INLINE_ASM_OPT
#ifndef __aarch64__
#error Currently, inline assembly optimizations are only available on AArch64.
#endif // __aarch64__
            asm volatile (
                "ld2 {v0.4s, v1.4s}, [%[pin0]] \n\t"
                "ld2 {v2.4s, v3.4s}, [%[pin1]] \n\t"
                "ld2 {v4.4s, v5.4s}, [%[pin2]] \n\t"
                :
                : [pin0] "r"(Fin),
                  [pin1] "r"(Fin + in_step),
                  [pin2] "r"(Fin + in_step * 2)
                : "memory", "v0", "v1", "v2", "v3", "v4", "v5");

            if (is_inverse == 1)
            {
                asm volatile (
                    "fneg v1.4s, v1.4s \n\t"
                    "fneg v3.4s, v3.4s \n\t"
                    "fneg v5.4s, v5.4s \n\t"
                    :
                    :
                    : "v0", "v1", "v2", "v3", "v4", "v5");
            }

            if (is_first_stage == 0)
            {
                asm volatile (
                    "ld1 {v6.1d}, [%[ptw0]] \n\t"
                    "ld1 {v7.1d}, [%[ptw1]] \n\t"

                    "fmul v10.4s, v2.4s, v6.s[1] \n\t"
                    "fmul v2.4s, v2.4s, v6.s[0] \n\t"
                    "fmls v2.4s, v3.4s, v6.s[1] \n\t"
                    "fmul v11.4s, v3.4s, v6.s[0] \n\t"
                    "fadd v3.4s, v10.4s, v11.4s \n\t"

                    "fmul v10.4s, v4.4s, v7.s[1] \n\t"
                    "fmul v4.4s, v4.4s, v7.s[0] \n\t"
                    "fmls v4.4s, v5.4s, v7.s[1] \n\t"
                    "fmul v11.4s, v5.4s, v7.s[0] \n\t"
                    "fadd v5.4s, v10.4s, v11.4s \n\t"
                    :
                    : [ptw0] "r"(twiddles),
                      [ptw1] "r"(twiddles + out_step)
                    : "memory", "v0", "v1", "v2", "v3", "v4", "v5",
                      "v6", "v7", "v8", "v9", "v10", "v11");
            }

            asm volatile (
                "fadd %[s2r].4s, v2.4s, v4.4s \n\t"
                "fadd %[s2i].4s, v3.4s, v5.4s \n\t"

                "fsub %[s0r].4s, v2.4s, v4.4s \n\t"
                "fsub %[s0i].4s, v3.4s, v5.4s \n\t"

                "mov %[s3r].16b, v0.16b \n\t"
                "mov %[s3i].16b, v1.16b \n\t"
                : [s0r] "+w"(s[0].val[0]),
                  [s0i] "+w"(s[0].val[1]),
                  [s2r] "+w"(s[2].val[0]),
                  [s2i] "+w"(s[2].val[1]),
                  [s3r] "+w"(s[3].val[0]),
                  [s3i] "+w"(s[3].val[1])
                :
                : "v0", "v1", "v2", "v3", "v4", "v5");

            s[1].val[0] = - s[2].val[0] * HALF_NEON_F32;
            s[1].val[1] = - s[2].val[1] * HALF_NEON_F32;

            s[1].val[0] += s[3].val[0];
            s[1].val[1] += s[3].val[1];
            s[0].val[0] *= TW_3IN_NEON_F32;
            s[0].val[1] *= TW_3IN_NEON_F32;

            asm volatile (
                "fadd v0.4s, %[s3r].4s, %[s2r].4s \n\t"
                "fadd v1.4s, %[s3i].4s, %[s2i].4s \n\t"
                "fsub v2.4s, %[s1r].4s, %[s0i].4s \n\t"
                "fadd v3.4s, %[s1i].4s, %[s0r].4s \n\t"
                "fadd v4.4s, %[s1r].4s, %[s0i].4s \n\t"
                "fsub v5.4s, %[s1i].4s, %[s0r].4s \n\t"
                :
                : [s0r] "w"(s[0].val[0]),
                  [s0i] "w"(s[0].val[1]),
                  [s1r] "w"(s[1].val[0]),
                  [s1i] "w"(s[1].val[1]),
                  [s2r] "w"(s[2].val[0]),
                  [s2i] "w"(s[2].val[1]),
                  [s3r] "w"(s[3].val[0]),
                  [s3i] "w"(s[3].val[1])
                : "v0", "v1", "v2", "v3", "v4", "v5");

            if (is_inverse == 1)
            {
                asm volatile (
                    "fneg v1.4s, v1.4s \n\t"
                    "fneg v3.4s, v3.4s \n\t"
                    "fneg v5.4s, v5.4s \n\t"
                    :
                    :
                    : "v0", "v1", "v2", "v3", "v4", "v5");
            }

            if (is_scaled)
            {
                asm volatile (
                    "fmul v0.4s, v0.4s, %[one_by_nfft].4s \n\t"
                    "fmul v1.4s, v1.4s, %[one_by_nfft].4s \n\t"
                    "fmul v2.4s, v2.4s, %[one_by_nfft].4s \n\t"
                    "fmul v3.4s, v3.4s, %[one_by_nfft].4s \n\t"
                    "fmul v4.4s, v4.4s, %[one_by_nfft].4s \n\t"
                    "fmul v5.4s, v5.4s, %[one_by_nfft].4s \n\t"
                    :
                    : [one_by_nfft] "w"(one_by_fft_neon)
                    : "v0", "v1", "v2", "v3", "v4", "v5");
            }

            asm volatile (
                "st2 {v0.4s, v1.4s}, [%[pout0]] \n\t"
                "st2 {v2.4s, v3.4s}, [%[pout1]] \n\t"
                "st2 {v4.4s, v5.4s}, [%[pout2]] \n\t"
                :
                : [pout0] "r"(Fout),
                  [pout1] "r"(Fout + out_step),
                  [pout2] "r"(Fout + 2 * out_step)
                : "memory", "v0", "v1", "v2", "v3", "v4", "v5");
#endif // NE10_INLINE_ASM_OPT
            Fin++;
            Fout++;
            if (is_first_stage == 0)
            {
                twiddles++;
            }
        }

        if (is_first_stage == 0)
        {
            twiddles -= out_step;
            Fout += (3 - 1) * out_step;
        }
    }
}
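/*
 * The radix-3 assembly path interleaves C and asm: the s[1]/s[0]
 * updates above sit between two asm blocks, so those intermediates
 * travel through compiler-allocated NEON registers rather than the
 * fixed v0-v5 set used for loads and stores.
 */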
template<ne10_int32_t is_first_stage, ne10_int32_t is_inverse, bool is_scaled>
static void ne10_radix_5_butterfly_float32_neon (CPLX *Fout,
        const CPLX *Fin,
        const ne10_fft_cpx_float32_t *twiddles,
        const ne10_int32_t fstride,
        const ne10_int32_t out_step,
        const ne10_int32_t nfft)
{
    CPLX in[5];
    CPLX s[6];

    const ne10_int32_t in_step = nfft / 5;
    ne10_int32_t f_count;
    ne10_int32_t m_count;

    const REAL one_by_fft_neon = NE10_REAL_DUP_NEON_F32 (0.25 / nfft);

    for (f_count = fstride; f_count > 0; f_count--)
    {
        for (m_count = out_step; m_count > 0; m_count--)
        {
            in[0] = NE10_CPLX_LOAD (Fin + 0 * in_step);
            in[1] = NE10_CPLX_LOAD (Fin + 1 * in_step);
            in[2] = NE10_CPLX_LOAD (Fin + 2 * in_step);
            in[3] = NE10_CPLX_LOAD (Fin + 3 * in_step);
            in[4] = NE10_CPLX_LOAD (Fin + 4 * in_step);

            if (is_inverse == 1)
            {
                NE10_FFT5_CONJ (in);
            }

            if (is_first_stage == 0)
            {
                NE10_LOAD_TW_AND_MUL (in[1], twiddles);
                NE10_LOAD_TW_AND_MUL (in[2], twiddles + out_step);
                NE10_LOAD_TW_AND_MUL (in[3], twiddles + out_step * 2);
                NE10_LOAD_TW_AND_MUL (in[4], twiddles + out_step * 3);
            }

            NE10_CPX_ADD_NEON_F32 (s[1], in[1], in[4]);
            NE10_CPX_ADD_NEON_F32 (s[2], in[2], in[3]);

            s[0] = in[0];
            s[5] = in[0];

            in[0].val[0] = in[0].val[0] + s[1].val[0] + s[2].val[0];
            in[0].val[1] = in[0].val[1] + s[1].val[1] + s[2].val[1];

            s[0].val[0] = NE10_S_MLA_NEON_F32 (s[0].val[0], s[1].val[0], TW_5A_F32.r);
            s[0].val[1] = NE10_S_MLA_NEON_F32 (s[0].val[1], s[1].val[1], TW_5A_F32.r);
            s[5].val[0] = NE10_S_MLA_NEON_F32 (s[5].val[0], s[1].val[0], TW_5B_F32.r);
            s[5].val[1] = NE10_S_MLA_NEON_F32 (s[5].val[1], s[1].val[1], TW_5B_F32.r);

            s[0].val[0] = NE10_S_MLA_NEON_F32 (s[0].val[0], s[2].val[0], TW_5B_F32.r);
            s[0].val[1] = NE10_S_MLA_NEON_F32 (s[0].val[1], s[2].val[1], TW_5B_F32.r);
            s[5].val[0] = NE10_S_MLA_NEON_F32 (s[5].val[0], s[2].val[0], TW_5A_F32.r);
            s[5].val[1] = NE10_S_MLA_NEON_F32 (s[5].val[1], s[2].val[1], TW_5A_F32.r);

            NE10_CPX_SUB_NEON_F32 (s[4], in[1], in[4]);
            NE10_CPX_SUB_NEON_F32 (s[3], in[2], in[3]);

            s[1].val[0] = NE10_S_MUL_NEON_F32 (s[4].val[1], TW_5A_F32.i);
            s[1].val[1] = -NE10_S_MUL_NEON_F32 (s[4].val[0], TW_5A_F32.i);
            s[2].val[0] = -NE10_S_MUL_NEON_F32 (s[4].val[1], TW_5B_F32.i);
            s[2].val[1] = NE10_S_MUL_NEON_F32 (s[4].val[0], TW_5B_F32.i);

            s[1].val[0] = NE10_S_MLA_NEON_F32 (s[1].val[0], s[3].val[1], TW_5B_F32.i);
            s[1].val[1] = NE10_S_MLS_NEON_F32 (s[1].val[1], s[3].val[0], TW_5B_F32.i);
            s[2].val[0] = NE10_S_MLA_NEON_F32 (s[2].val[0], s[3].val[1], TW_5A_F32.i);
            s[2].val[1] = NE10_S_MLS_NEON_F32 (s[2].val[1], s[3].val[0], TW_5A_F32.i);

            NE10_CPX_SUB_NEON_F32 (in[1], s[0], s[1]);
            NE10_CPX_ADD_NEON_F32 (in[4], s[0], s[1]);
            NE10_CPX_ADD_NEON_F32 (in[2], s[5], s[2]);
            NE10_CPX_SUB_NEON_F32 (in[3], s[5], s[2]);

            if (is_inverse == 1)
            {
                NE10_FFT5_CONJ (in);
            }

            if (is_scaled)
            {
                NE10_FFT5_SCALING (in, one_by_fft_neon);
            }

            NE10_CPLX_STORE (Fout + 0 * out_step, in[0]);
            NE10_CPLX_STORE (Fout + 1 * out_step, in[1]);
            NE10_CPLX_STORE (Fout + 2 * out_step, in[2]);
            NE10_CPLX_STORE (Fout + 3 * out_step, in[3]);
            NE10_CPLX_STORE (Fout + 4 * out_step, in[4]);

            Fin++;
            Fout++;

            if (is_first_stage == 0)
            {
                twiddles++;
            }
        }

        if (is_first_stage == 0)
        {
            twiddles -= out_step;
            Fout += (5 - 1) * out_step;
        }
    }
}
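/*
 * Unlike radices 2-4, the radix-5 stage has no NE10_INLINE_ASM_OPT
 * variant; it spells the NE10_FFT5_FUC_INPLACE_NEON_F32 arithmetic
 * out directly on the in[] registers.
 */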
template<ne10_int32_t is_first_stage, ne10_int32_t is_inverse, bool is_scaled>
static void ne10_radix_8_butterfly_float32_neon (CPLX *Fout,
        const CPLX *Fin,
        const ne10_fft_cpx_float32_t *twiddles,
        const ne10_int32_t fstride,
        const ne10_int32_t out_step,
        const ne10_int32_t nfft)
{
    CPLX in[8];
    CPLX out[8];

    const ne10_int32_t in_step = nfft / 8;
    ne10_int32_t f_count;
    ne10_int32_t m_count;

    const REAL one_by_fft_neon = NE10_REAL_DUP_NEON_F32 (0.25 / nfft);

    for (f_count = fstride; f_count > 0; f_count--)
    {
        for (m_count = out_step; m_count > 0; m_count--)
        {
            in[0] = NE10_CPLX_LOAD (Fin + 0 * in_step);
            in[1] = NE10_CPLX_LOAD (Fin + 1 * in_step);
            in[2] = NE10_CPLX_LOAD (Fin + 2 * in_step);
            in[3] = NE10_CPLX_LOAD (Fin + 3 * in_step);
            in[4] = NE10_CPLX_LOAD (Fin + 4 * in_step);
            in[5] = NE10_CPLX_LOAD (Fin + 5 * in_step);
            in[6] = NE10_CPLX_LOAD (Fin + 6 * in_step);
            in[7] = NE10_CPLX_LOAD (Fin + 7 * in_step);

            if (is_inverse == 1)
            {
                NE10_FFT8_CONJ (in);
            }

            NE10_FFT8_FUC_NEON_F32 (out, in);

            if (is_inverse == 1)
            {
                NE10_FFT8_CONJ (out);
            }

            if (is_scaled)
            {
                NE10_FFT8_SCALING (out, one_by_fft_neon);
            }

            NE10_CPLX_STORE (Fout + 0 * out_step, out[0]);
            NE10_CPLX_STORE (Fout + 1 * out_step, out[1]);
            NE10_CPLX_STORE (Fout + 2 * out_step, out[2]);
            NE10_CPLX_STORE (Fout + 3 * out_step, out[3]);
            NE10_CPLX_STORE (Fout + 4 * out_step, out[4]);
            NE10_CPLX_STORE (Fout + 5 * out_step, out[5]);
            NE10_CPLX_STORE (Fout + 6 * out_step, out[6]);
            NE10_CPLX_STORE (Fout + 7 * out_step, out[7]);

            Fin++;
            Fout++;
        }
    }
}
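/*
 * The radix-8 butterfly is only ever dispatched as a first stage (see
 * the <1, ...> instantiation below, which passes NULL twiddles), so
 * it needs no twiddle handling beyond the per-column Fin/Fout
 * advance.
 */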
template<ne10_int32_t is_inverse, bool is_scaled>
static void ne10_mixed_radix_generic_butterfly_float32_neon_impl (CPLX *Fout,
        const CPLX *Fin,
        const ne10_int32_t *factors,
        const ne10_fft_cpx_float32_t *twiddles,
        CPLX *buffer)
{
    ne10_int32_t fstride, mstride, radix;
    ne10_int32_t stage_count;
    ne10_int32_t nfft;

    // Initialize strides from the factor table.
    stage_count = factors[0];
    fstride = factors[1];
    mstride = 1;
    radix = factors[stage_count << 1]; // radix of the first stage
    nfft = fstride * radix;

    // Swap buffers when stage_count is even so the final stage lands
    // in Fout.
    if (stage_count % 2 == 0)
    {
        ne10_swap_ptr (buffer, Fout);
    }

    // First stage: no twiddles.
    switch (radix)
    {
    case 2:
        ne10_radix_2_butterfly_float32_neon<1, is_inverse, is_scaled> (Fout, Fin, NULL,
                fstride, 1, nfft);
        break;
    case 4:
        ne10_radix_4_butterfly_float32_neon<1, is_inverse, is_scaled> (Fout, Fin, NULL,
                fstride, 1, nfft);
        break;
    case 3:
        ne10_radix_3_butterfly_float32_neon<1, is_inverse, is_scaled> (Fout, Fin, NULL,
                fstride, 1, nfft);
        break;
    case 5:
        ne10_radix_5_butterfly_float32_neon<1, is_inverse, is_scaled> (Fout, Fin, NULL,
                fstride, 1, nfft);
        break;
    case 8:
        ne10_radix_8_butterfly_float32_neon<1, is_inverse, is_scaled> (Fout, Fin, NULL,
                fstride, 1, nfft);
        break;
    }

    stage_count--;
    if (stage_count == 0)
    {
        return;
    }

    mstride *= radix;
    radix = factors[stage_count << 1];

    // Remaining stages.
    while (stage_count > 0)
    {
        // Stages after the first use a radix in {2, 3, 4, 5}.
        assert ((radix > 1) && (radix < 6));

        ne10_swap_ptr (buffer, Fout);

        fstride /= radix;
        switch (radix)
        {
        case 2:
            ne10_radix_2_butterfly_float32_neon<0, is_inverse, false> (Fout, buffer,
                    twiddles, fstride, mstride, nfft);
            break;
        case 3:
            ne10_radix_3_butterfly_float32_neon<0, is_inverse, false> (Fout, buffer,
                    twiddles, fstride, mstride, nfft);
            break;
        case 4:
            ne10_radix_4_butterfly_float32_neon<0, is_inverse, false> (Fout, buffer,
                    twiddles, fstride, mstride, nfft);
            break;
        case 5:
            ne10_radix_5_butterfly_float32_neon<0, is_inverse, false> (Fout, buffer,
                    twiddles, fstride, mstride, nfft);
            break;
        }

        twiddles += mstride * (radix - 1);
        mstride *= radix;

        stage_count--;
        radix = factors[stage_count << 1];
    }
}
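/*
 * factors[0] holds the stage count and factors[1] the initial
 * fstride; each stage then reads its radix from
 * factors[stage_count << 1] as stage_count ticks down, i.e. the
 * factor table is consumed pairwise from the back.
 */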
template<ne10_int32_t is_inverse>
static void ne10_c2c_1d_last_stage_neon (CPLX *Fout,
        const CPLX *Fin,
        const ne10_fft_cpx_float32_t *twiddles,
        const ne10_int32_t fstride,
        const ne10_int32_t out_step,
        const ne10_int32_t nfft)
{
    ne10_int32_t f_count;
    ne10_int32_t m_count;

    for (f_count = fstride; f_count > 0; f_count--)
    {
        CPLX scratch_in[4];
        CPLX scratch_out[4];
        CPLX scratch[4];

        for (m_count = out_step / NE10_FFT_PARA_LEVEL; m_count > 0; m_count--)
        {
#ifndef NE10_INLINE_ASM_OPT
            scratch_in[0] = NE10_CPLX_LOAD (Fin + 0);
            scratch_in[1] = NE10_CPLX_LOAD (Fin + 1);
            scratch_in[2] = NE10_CPLX_LOAD (Fin + 2);
            scratch_in[3] = NE10_CPLX_LOAD (Fin + 3);

            // Transpose the loaded 4x4 block of complex values.
            {
                CPLX scratch0, scratch_in0;
                CPLX scratch1, scratch_in1;
                CPLX scratch2, scratch_in2;
                CPLX scratch3, scratch_in3;

                scratch_in0 = scratch_in[0];
                scratch_in1 = scratch_in[1];
                scratch_in2 = scratch_in[2];
                scratch_in3 = scratch_in[3];

                NE10_RADIX4X4C_TRANSPOSE_NEON (scratch, scratch_in);

                scratch_in[0] = scratch0;
                scratch_in[1] = scratch1;
                scratch_in[2] = scratch2;
                scratch_in[3] = scratch3;
            }
#else // NE10_INLINE_ASM_OPT
#ifndef __aarch64__
#error Currently, inline assembly optimizations are only available on AArch64.
#endif // __aarch64__
            const float *pin = (const float *) Fin;
            asm volatile (
                "ld2 {v0.4s, v1.4s}, [%[pin]], %[offset] \n\t"
                "ld2 {v2.4s, v3.4s}, [%[pin]], %[offset] \n\t"
                "ld2 {v4.4s, v5.4s}, [%[pin]], %[offset] \n\t"
                "ld2 {v6.4s, v7.4s}, [%[pin]] \n\t"

                // 4x4 transpose of the loaded block.
                "trn1 v8.4s, v0.4s, v2.4s \n\t"
                "trn2 v9.4s, v0.4s, v2.4s \n\t"
                "trn1 v10.4s, v4.4s, v6.4s \n\t"
                "trn2 v11.4s, v4.4s, v6.4s \n\t"

                "trn1 %[in0r].2d, v8.2d, v10.2d \n\t"
                "trn1 %[in1r].2d, v9.2d, v11.2d \n\t"
                "trn2 %[in2r].2d, v8.2d, v10.2d \n\t"
                "trn2 %[in3r].2d, v9.2d, v11.2d \n\t"

                "trn1 v8.4s, v1.4s, v3.4s \n\t"
                "trn2 v9.4s, v1.4s, v3.4s \n\t"
                "trn1 v10.4s, v5.4s, v7.4s \n\t"
                "trn2 v11.4s, v5.4s, v7.4s \n\t"

                "trn1 %[in0i].2d, v8.2d, v10.2d \n\t"
                "trn1 %[in1i].2d, v9.2d, v11.2d \n\t"
                "trn2 %[in2i].2d, v8.2d, v10.2d \n\t"
                "trn2 %[in3i].2d, v9.2d, v11.2d \n\t"
                : [in0r] "+w"(scratch_in[0].val[0]),
                  [in0i] "+w"(scratch_in[0].val[1]),
                  [in1r] "+w"(scratch_in[1].val[0]),
                  [in1i] "+w"(scratch_in[1].val[1]),
                  [in2r] "+w"(scratch_in[2].val[0]),
                  [in2i] "+w"(scratch_in[2].val[1]),
                  [in3r] "+w"(scratch_in[3].val[0]),
                  [in3i] "+w"(scratch_in[3].val[1]),
                  [pin] "+r"(pin)
                : [offset] "r"(32) // assumed: consecutive CPLX loads, 32 bytes apart
                : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
                  "v8", "v9", "v10", "v11");
#endif // NE10_INLINE_ASM_OPT

            if (is_inverse == 1)
            {
                NE10_FFT4_CONJ (scratch_in);
            }
            // Not a first stage: multiply by the per-column twiddles.
#ifndef NE10_INLINE_ASM_OPT
            CPLX scratch_tw[3];

            scratch_tw[0] = NE10_CPLX_LOAD (twiddles + 0 * out_step);
            scratch_tw[1] = NE10_CPLX_LOAD (twiddles + 1 * out_step);
            scratch_tw[2] = NE10_CPLX_LOAD (twiddles + 2 * out_step);

            NE10_FFT4_MUL_TW_NEON (scratch_in, scratch_in, scratch_tw);
#else // NE10_INLINE_ASM_OPT
            const float *tw = (const float *) twiddles;
            asm volatile (
                "ld2 {v0.4s, v1.4s}, [%[tw]], %[offset] \n\t"
                "ld2 {v2.4s, v3.4s}, [%[tw]], %[offset] \n\t"
                "ld2 {v4.4s, v5.4s}, [%[tw]] \n\t"

                "fmul v6.4s, %[in1r].4s, v1.4s \n\t"
                "fmul %[in1r].4s, %[in1r].4s, v0.4s \n\t"
                "fmls %[in1r].4s, %[in1i].4s, v1.4s \n\t"
                "fmul v7.4s, %[in1i].4s, v0.4s \n\t"
                "fadd %[in1i].4s, v6.4s, v7.4s \n\t"

                "fmul v6.4s, %[in2r].4s, v3.4s \n\t"
                "fmul %[in2r].4s, %[in2r].4s, v2.4s \n\t"
                "fmls %[in2r].4s, %[in2i].4s, v3.4s \n\t"
                "fmul v7.4s, %[in2i].4s, v2.4s \n\t"
                "fadd %[in2i].4s, v6.4s, v7.4s \n\t"

                "fmul v6.4s, %[in3r].4s, v5.4s \n\t"
                "fmul %[in3r].4s, %[in3r].4s, v4.4s \n\t"
                "fmls %[in3r].4s, %[in3i].4s, v5.4s \n\t"
                "fmul v7.4s, %[in3i].4s, v4.4s \n\t"
                "fadd %[in3i].4s, v6.4s, v7.4s \n\t"
                : [tw] "+r"(tw),
                  [in1r] "+w"(scratch_in[1].val[0]),
                  [in1i] "+w"(scratch_in[1].val[1]),
                  [in2r] "+w"(scratch_in[2].val[0]),
                  [in2i] "+w"(scratch_in[2].val[1]),
                  [in3r] "+w"(scratch_in[3].val[0]),
                  [in3i] "+w"(scratch_in[3].val[1])
                : [offset] "r"(out_step * 8)
                : "memory", "v0", "v1", "v2", "v3", "v4", "v5",
                  "v6", "v7");
#endif // NE10_INLINE_ASM_OPT
            NE10_CPX_ADD_NEON_F32 (scratch[0], scratch_in[0], scratch_in[2]);
            NE10_CPX_SUB_NEON_F32 (scratch[1], scratch_in[0], scratch_in[2]);
            NE10_CPX_ADD_NEON_F32 (scratch[2], scratch_in[1], scratch_in[3]);
            NE10_CPX_SUB_NEON_F32 (scratch[3], scratch_in[1], scratch_in[3]);

#ifndef NE10_INLINE_ASM_OPT
            NE10_CPX_ADD_NEON_F32 (scratch_out[0], scratch[0], scratch[2]);
            NE10_CPX_SUB_NEON_F32 (scratch_out[2], scratch[0], scratch[2]);

            scratch_out[1].val[0] = scratch[1].val[0] + scratch[3].val[1];
            scratch_out[1].val[1] = scratch[1].val[1] - scratch[3].val[0];
            scratch_out[3].val[0] = scratch[1].val[0] - scratch[3].val[1];
            scratch_out[3].val[1] = scratch[1].val[1] + scratch[3].val[0];

            if (is_inverse == 1)
            {
                NE10_FFT4_CONJ (scratch_out);
            }

            // Store.
            {
                ne10_fft_cpx_float32_t *Fout_cpx;
                Fout_cpx = (ne10_fft_cpx_float32_t *) Fout;

                NE10_CPLX_STORE (Fout_cpx + 0 * out_step, scratch_out[0]);
                NE10_CPLX_STORE (Fout_cpx + 1 * out_step, scratch_out[1]);
                NE10_CPLX_STORE (Fout_cpx + 2 * out_step, scratch_out[2]);
                NE10_CPLX_STORE (Fout_cpx + 3 * out_step, scratch_out[3]);
            }
#else // NE10_INLINE_ASM_OPT
#ifndef __aarch64__
#error Currently, inline assembly optimizations are only available on AArch64.
#endif // __aarch64__
            asm volatile (
                "fadd v0.4s, %[s0r].4s, %[s2r].4s \n\t"
                "fadd v1.4s, %[s0i].4s, %[s2i].4s \n\t"
                "fsub v4.4s, %[s0r].4s, %[s2r].4s \n\t"
                "fsub v5.4s, %[s0i].4s, %[s2i].4s \n\t"
                "fadd v2.4s, %[s1r].4s, %[s3i].4s \n\t"
                "fsub v3.4s, %[s1i].4s, %[s3r].4s \n\t"
                "fsub v6.4s, %[s1r].4s, %[s3i].4s \n\t"
                "fadd v7.4s, %[s1i].4s, %[s3r].4s \n\t"
                :
                : [s0r] "w"(scratch[0].val[0]),
                  [s0i] "w"(scratch[0].val[1]),
                  [s1r] "w"(scratch[1].val[0]),
                  [s1i] "w"(scratch[1].val[1]),
                  [s2r] "w"(scratch[2].val[0]),
                  [s2i] "w"(scratch[2].val[1]),
                  [s3r] "w"(scratch[3].val[0]),
                  [s3i] "w"(scratch[3].val[1])
                : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");

            if (is_inverse == 1)
            {
                asm volatile (
                    "fneg v1.4s, v1.4s \n\t"
                    "fneg v3.4s, v3.4s \n\t"
                    "fneg v5.4s, v5.4s \n\t"
                    "fneg v7.4s, v7.4s \n\t"
                    :
                    :
                    : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
            }

            float *pout = (float *) Fout;
            asm volatile (
                "st2 {v0.4s, v1.4s}, [%[pout]], %[offset] \n\t"
                "st2 {v2.4s, v3.4s}, [%[pout]], %[offset] \n\t"
                "st2 {v4.4s, v5.4s}, [%[pout]], %[offset] \n\t"
                "st2 {v6.4s, v7.4s}, [%[pout]] \n\t"
                : [pout] "+r"(pout)
                : [offset] "r"(out_step * 8)
                : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
#endif // NE10_INLINE_ASM_OPT
            Fin += 4;
            Fout += 1;
            twiddles += 4;
        }
    }
    // Handle the leftover columns (out_step not a multiple of
    // NE10_FFT_PARA_LEVEL) with plain scalar code.
    ne10_int32_t left_over = out_step % 4;
    if (left_over == 0)
    {
        return;
    }

    const ne10_fft_cpx_float32_t *Fin_s = (const ne10_fft_cpx_float32_t *) Fin;
    ne10_fft_cpx_float32_t *Fout_s = (ne10_fft_cpx_float32_t *) Fout;
    for (m_count = out_step % 4; m_count > 0; m_count--)
    {
        ne10_fft_cpx_float32_t scratch_in[4];
        ne10_fft_cpx_float32_t scratch_tw[3];

        scratch_in[0] = Fin_s[0];
        scratch_in[1] = Fin_s[1];
        scratch_in[2] = Fin_s[2];
        scratch_in[3] = Fin_s[3];

        if (is_inverse)
        {
            scratch_in[0].i = -scratch_in[0].i;
            scratch_in[1].i = -scratch_in[1].i;
            scratch_in[2].i = -scratch_in[2].i;
            scratch_in[3].i = -scratch_in[3].i;
        }

        scratch_tw[0] = twiddles[0 * out_step];
        scratch_tw[1] = twiddles[1 * out_step];
        scratch_tw[2] = twiddles[2 * out_step];

        FFT4_MUL_TW (scratch_in, scratch_in, scratch_tw);

        FFT4_FCU_INPLACE (scratch_in);

        if (is_inverse)
        {
            scratch_in[0].i = -scratch_in[0].i;
            scratch_in[1].i = -scratch_in[1].i;
            scratch_in[2].i = -scratch_in[2].i;
            scratch_in[3].i = -scratch_in[3].i;
        }

        Fout_s[0 * out_step] = scratch_in[0];
        Fout_s[1 * out_step] = scratch_in[1];
        Fout_s[2 * out_step] = scratch_in[2];
        Fout_s[3 * out_step] = scratch_in[3];

        Fin_s += 4;
        Fout_s += 1;
        twiddles++;
    }
}
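/*
 * The vectorized loop above works through out_step in groups of
 * NE10_FFT_PARA_LEVEL (4) columns; the scalar tail reuses the plain C
 * FFT4_MUL_TW / FFT4_FCU_INPLACE kernels from
 * NE10_fft_generic_float32.h for the out_step % 4 leftover columns.
 */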
typedef void (*NE10_MIXED_RADIX_FUNC) (CPLX *,
        const CPLX *,
        const ne10_int32_t *,
        const ne10_fft_cpx_float32_t *,
        CPLX *);

void ne10_mixed_radix_generic_butterfly_float32_neon (
        ne10_fft_cpx_float32_t *Fout,
        const ne10_fft_cpx_float32_t *Fin,
        const ne10_int32_t *factors,
        const ne10_fft_cpx_float32_t *twiddles,
        ne10_fft_cpx_float32_t *buffer,
        const ne10_int32_t is_scaled)
{
    ne10_int32_t stage_count = factors[0];
    ne10_int32_t fstride = factors[1];
    ne10_int32_t radix = factors[stage_count << 1]; // radix of the first stage

    NE10_MIXED_RADIX_FUNC ne10_mixed_radix_impl = NULL;

    // Size handled by the mixed-radix stages; the remaining radix-4
    // stage is performed by ne10_c2c_1d_last_stage_neon.
    ne10_int32_t nfft = fstride * radix;

    if (is_scaled)
    {
        ne10_mixed_radix_impl = ne10_mixed_radix_generic_butterfly_float32_neon_impl<0, true>;
    }
    else
    {
        ne10_mixed_radix_impl = ne10_mixed_radix_generic_butterfly_float32_neon_impl<0, false>;
    }
    ne10_mixed_radix_impl ((CPLX *) buffer,
            (const CPLX *) Fin,
            factors,
            twiddles,
            (CPLX *) Fout);

    ne10_c2c_1d_last_stage_neon<0> ((CPLX *) Fout,
            (const CPLX *) buffer,
            twiddles + nfft,
            1,
            nfft,
            nfft * 4);
}
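/*
 * The inverse entry point below is identical apart from instantiating
 * the is_inverse == 1 variants of the butterfly implementation and of
 * the last stage.
 */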
void ne10_mixed_radix_generic_butterfly_inverse_float32_neon (
        ne10_fft_cpx_float32_t *Fout,
        const ne10_fft_cpx_float32_t *Fin,
        const ne10_int32_t *factors,
        const ne10_fft_cpx_float32_t *twiddles,
        ne10_fft_cpx_float32_t *buffer,
        const ne10_int32_t is_scaled)
{
    ne10_int32_t stage_count = factors[0];
    ne10_int32_t fstride = factors[1];
    ne10_int32_t radix = factors[stage_count << 1]; // radix of the first stage

    NE10_MIXED_RADIX_FUNC ne10_mixed_radix_impl = NULL;

    ne10_int32_t nfft = fstride * radix;

    if (is_scaled)
    {
        ne10_mixed_radix_impl = ne10_mixed_radix_generic_butterfly_float32_neon_impl<1, true>;
    }
    else
    {
        ne10_mixed_radix_impl = ne10_mixed_radix_generic_butterfly_float32_neon_impl<1, false>;
    }
    ne10_mixed_radix_impl ((CPLX *) buffer,
            (const CPLX *) Fin,
            factors,
            twiddles,
            (CPLX *) Fout);

    ne10_c2c_1d_last_stage_neon<1> ((CPLX *) Fout,
            (const CPLX *) buffer,
            twiddles + nfft,
            1,
            nfft,
            nfft * 4);
}