Project Ne10
An Open Optimized Software Library Project for the ARM Architecture
NE10_fft_generic_int32.neonintrinsic.h
/*
 *  Copyright 2015 ARM Limited
 *  All rights reserved.
 *
 *  Redistribution and use in source and binary forms, with or without
 *  modification, are permitted provided that the following conditions are met:
 *    * Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *    * Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in the
 *      documentation and/or other materials provided with the distribution.
 *    * Neither the name of ARM Limited nor the
 *      names of its contributors may be used to endorse or promote products
 *      derived from this software without specific prior written permission.
 *
 *  THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND
 *  ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 *  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 *  DISCLAIMED. IN NO EVENT SHALL ARM LIMITED BE LIABLE FOR ANY
 *  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 *  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 *  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 *  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/* license of Kiss FFT */
/*
Copyright (c) 2003-2010, Mark Borgerding

All rights reserved.

Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
    * Neither the author nor the names of any contributors may be used to endorse or promote products derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/*
 * NE10 Library : dsp/NE10_fft_generic_int32.neonintrinsic.h
 *
 * This file must be compiled with a C++ toolchain, because some functions
 * are written as templates to make it easier for the compiler to eliminate
 * branches.
 */

#ifndef NE10_FFT_GENERIC_INT32_NEONINTRINSIC_H
#define NE10_FFT_GENERIC_INT32_NEONINTRINSIC_H

#include "NE10_types.h"
#include "NE10_macros.h"
#include "NE10_fft.neonintrinsic.h"
#include "NE10_fft_generic_int32.h"

typedef int32x4x2_t CPLX;
typedef int32x4_t   REAL;
#define NE10_REAL_DUP_NEON_S32 vdupq_n_s32

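// A CPLX value holds four complex numbers in planar (de-interleaved) form:
// val[0] carries the four real parts and val[1] the four imaginary parts.
// vld2q_s32 and vst2q_s32 de-interleave on load and re-interleave on store.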
#ifndef NE10_INLINE_ASM_OPT
#define NE10_CPLX_LOAD(PTR) vld2q_s32 ((ne10_int32_t*) (PTR))
#define NE10_CPLX_STORE(PTR,OUT) \
    do { \
        vst2q_s32 ((ne10_int32_t*) (PTR), OUT); \
    } while (0)
#else // NE10_INLINE_ASM_OPT
#ifndef __aarch64__
#error Currently, inline assembly optimizations are only available on AArch64.
#else // __aarch64__
template<class T>
inline static int32x4x2_t NE10_CPLX_LOAD(T *ptr)
{
    int32x4x2_t result;
    asm volatile (
        "ld2 {v0.4s, v1.4s}, [%[pin]] \n\t"
        "mov %[r].16b, v0.16b \n\t"
        "mov %[i].16b, v1.16b \n\t"
        : [r]"+w"(result.val[0]),
          [i]"+w"(result.val[1])
        : [pin]"r"(ptr)
        : "memory", "v0", "v1");
    return result;
}

template<class T>
inline static void NE10_CPLX_STORE(T *ptr, int32x4x2_t out)
{
    asm volatile (
        "mov v0.16b, %[r].16b \n\t"
        "mov v1.16b, %[i].16b \n\t"
        "st2 {v0.4s, v1.4s}, [%[pout]] \n\t"
        : [r]"+w"(out.val[0]),
          [i]"+w"(out.val[1])
        : [pout]"r"(ptr)
        : "memory", "v0", "v1");
}

#endif // __aarch64__
#endif // NE10_INLINE_ASM_OPT

template<>
inline CPLX NE10_CPX_LOAD_S<CPLX> (const CPLX *ptr)
{
    return NE10_CPLX_LOAD (ptr);
}

template<>
inline void NE10_CPX_STORE_S<CPLX> (CPLX *ptr, const CPLX out)
{
    NE10_CPLX_STORE (ptr, out);
}

template<>
inline void NE10_LOAD_BY_STEP<1, CPLX> (CPLX out[1],
        const CPLX *Fin,
        const ne10_int32_t)
{
    out[0] = NE10_CPX_LOAD_S (Fin);
}

template<>
inline void NE10_STORE_BY_STEP<1, CPLX> (CPLX *Fout,
        const CPLX out[1],
        const ne10_int32_t)
{
    NE10_CPX_STORE_S (Fout, out[0]);
}

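// vqrdmulhq_s32 is a saturating, rounding, doubling multiply returning the
// high half: each lane computes (2 * a * b + 2^31) >> 32, saturated. For
// Q31 fixed-point operands this is a Q31 x Q31 -> Q31 multiply with rounding.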
static inline REAL NE10_S_MUL_NEON_S32 (const REAL vec,
        const ne10_int32_t scalar)
{
    REAL scalar_neon = NE10_REAL_DUP_NEON_S32 (scalar);
    REAL result = vqrdmulhq_s32 (scalar_neon, vec);
    return result;
}

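// Complex multiply in Q31:
// (Ar + jAi) * (Br + jBi) = (Ar*Br - Ai*Bi) + j (Ar*Bi + Ai*Br).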
static inline void NE10_CPX_MUL_NEON_S32 (CPLX &result, const CPLX A, const CPLX B)
{
    REAL ARBR = vqrdmulhq_s32 (A.val[0], B.val[0]);
    REAL ARBI = vqrdmulhq_s32 (A.val[0], B.val[1]);
    REAL AIBR = vqrdmulhq_s32 (A.val[1], B.val[0]);
    REAL AIBI = vqrdmulhq_s32 (A.val[1], B.val[1]);
    result.val[0] = ARBR - AIBI;
    result.val[1] = ARBI + AIBR;
}

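// Recursive template: load the twiddle for butterfly leg (RADIX - 1),
// multiply it in, then recurse on RADIX - 1. The <1> specialization below
// terminates the recursion (leg 0 carries no twiddle), so the whole loop
// is unrolled at compile time.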
template<int RADIX>
inline void NE10_LOAD_TW_AND_MUL (CPLX scratch_in[RADIX],
        const ne10_fft_cpx_int32_t *ptr_in,
        const ne10_int32_t step)
{
    CPLX scratch_tw;
    int32x2_t d2_tmp = vld1_s32 ((ne10_int32_t *) (ptr_in + (RADIX - 2) * step));

    scratch_tw.val[0] = NE10_REAL_DUP_NEON_S32 (d2_tmp[0]);
    scratch_tw.val[1] = NE10_REAL_DUP_NEON_S32 (d2_tmp[1]);
    NE10_CPX_MUL_NEON_S32 (scratch_in[RADIX - 1], scratch_in[RADIX - 1], scratch_tw);

    NE10_LOAD_TW_AND_MUL<RADIX - 1> (scratch_in, ptr_in, step);
}

template<>
inline void NE10_LOAD_TW_AND_MUL<1> (CPLX [1],
        const ne10_fft_cpx_int32_t *,
        const ne10_int32_t)
{
}

// Conjugate in place.
template<>
inline void NE10_CONJ_S<CPLX> (CPLX &cplx)
{
    cplx.val[1] = -cplx.val[1];
}

template<>
inline void NE10_CONJ<1, CPLX> (CPLX in[1])
{
    NE10_CONJ_S<CPLX> (in[0]);
}

// Scaling.
// If the macro NE10_DSP_CFFT_SCALING is not defined, these functions do nothing.
template<int RADIX, int SIZE = RADIX>
struct NE10_FFT_SCALING {
    inline void operator() (CPLX scratch_out[RADIX])
    {
#ifdef NE10_DSP_CFFT_SCALING
        const int32x4_t one_by_RADIX =
        {
            (ne10_int32_t) floor (1.0 / RADIX * NE10_F2I32_MAX + 0.5f),
            (ne10_int32_t) floor (1.0 / RADIX * NE10_F2I32_MAX + 0.5f),
            (ne10_int32_t) floor (1.0 / RADIX * NE10_F2I32_MAX + 0.5f),
            (ne10_int32_t) floor (1.0 / RADIX * NE10_F2I32_MAX + 0.5f)
        };
        scratch_out[SIZE - 1].val[0] = vqrdmulhq_s32 (scratch_out[SIZE - 1].val[0], one_by_RADIX);
        scratch_out[SIZE - 1].val[1] = vqrdmulhq_s32 (scratch_out[SIZE - 1].val[1], one_by_RADIX);
        NE10_FFT_SCALING<RADIX, SIZE - 1> () (scratch_out);
#endif
    }
};

template<int RADIX>
struct NE10_FFT_SCALING<RADIX, 1> {
    inline void operator () (CPLX scratch_out[1])
    {
#ifdef NE10_DSP_CFFT_SCALING
        const int32x4_t one_by_RADIX =
        {
            (ne10_int32_t) floor (1.0 / RADIX * NE10_F2I32_MAX + 0.5f),
            (ne10_int32_t) floor (1.0 / RADIX * NE10_F2I32_MAX + 0.5f),
            (ne10_int32_t) floor (1.0 / RADIX * NE10_F2I32_MAX + 0.5f),
            (ne10_int32_t) floor (1.0 / RADIX * NE10_F2I32_MAX + 0.5f)
        };
        scratch_out[0].val[0] = vqrdmulhq_s32 (scratch_out[0].val[0], one_by_RADIX);
        scratch_out[0].val[1] = vqrdmulhq_s32 (scratch_out[0].val[1], one_by_RADIX);
#endif
    }
};

inline void NE10_CPX_ADD_NEON_S32 (CPLX &result, const CPLX a, const CPLX b)
{
    result.val[0] = vaddq_s32 (a.val[0], b.val[0]);
    result.val[1] = vaddq_s32 (a.val[1], b.val[1]);
}

inline void NE10_CPX_SUB_NEON_S32 (CPLX &result, const CPLX a, const CPLX b)
{
    result.val[0] = vsubq_s32 (a.val[0], b.val[0]);
    result.val[1] = vsubq_s32 (a.val[1], b.val[1]);
}

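// Arithmetic halving: vshlq_s32 with a shift count of -1 shifts every lane
// right by one bit, i.e. divides each element by two.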
inline REAL NE10_HALF (REAL src)
{
    const int32x4_t CONST_HALF_NEON = { -1, -1, -1, -1 };
    src = vshlq_s32 (src, CONST_HALF_NEON);
    return src;
}

// FFT Kernel
// F: Forward
// C: Complex
// U: Unscaled
template<int RADIX>
inline void NE10_FFT_FCU_NEON_S32 (CPLX [RADIX], const CPLX [RADIX]);

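// Radix-2 butterfly: out[0] = in[0] + in[1], out[1] = in[0] - in[1].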
template<>
inline void NE10_FFT_FCU_NEON_S32<2> (CPLX scratch_out[2],
        const CPLX scratch_in[2])
{
    NE10_CPX_ADD_NEON_S32 (scratch_out[0], scratch_in[0], scratch_in[1]);
    NE10_CPX_SUB_NEON_S32 (scratch_out[1], scratch_in[0], scratch_in[1]);
}

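// Radix-3 butterfly, in the style of Kiss FFT: the real part (-1/2) of the
// radix-3 twiddle is applied via NE10_HALF, and TW_3IN_S32 supplies its
// imaginary part in Q31.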
template<>
inline void NE10_FFT_FCU_NEON_S32<3> (CPLX Fout[3],
        const CPLX Fin[3])
{
    CPLX scratch[4];

    Fout[0] = Fin[0];
    Fout[1] = Fin[1];
    Fout[2] = Fin[2];

    scratch[1] = Fout[1];
    scratch[2] = Fout[2];

    NE10_CPX_ADD_NEON_S32 (scratch[3], scratch[1], scratch[2]);
    NE10_CPX_SUB_NEON_S32 (scratch[0], scratch[1], scratch[2]);

    Fout[1].val[0] = Fout[0].val[0] - NE10_HALF (scratch[3].val[0]);
    Fout[1].val[1] = Fout[0].val[1] - NE10_HALF (scratch[3].val[1]);

    scratch[0].val[0] = NE10_S_MUL_NEON_S32 (scratch[0].val[0], TW_3IN_S32);
    scratch[0].val[1] = NE10_S_MUL_NEON_S32 (scratch[0].val[1], TW_3IN_S32);

    Fout[0].val[0] += scratch[3].val[0];
    Fout[0].val[1] += scratch[3].val[1];

    Fout[2].val[0] = Fout[1].val[0] + scratch[0].val[1];
    Fout[2].val[1] = Fout[1].val[1] - scratch[0].val[0];

    Fout[1].val[0] -= scratch[0].val[1];
    Fout[1].val[1] += scratch[0].val[0];
}

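// Radix-4 butterfly: two layers of radix-2 butterflies; the multiplication
// by -j on the odd leg is done by swapping real/imaginary parts with a
// sign change.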
template<>
inline void NE10_FFT_FCU_NEON_S32<4> (CPLX scratch_out[4],
        const CPLX scratch_in[4])
{
    CPLX scratch[4];

    NE10_CPX_ADD_NEON_S32 (scratch[0], scratch_in[0], scratch_in[2]);
    NE10_CPX_SUB_NEON_S32 (scratch[1], scratch_in[0], scratch_in[2]);
    NE10_CPX_ADD_NEON_S32 (scratch[2], scratch_in[1], scratch_in[3]);
    NE10_CPX_SUB_NEON_S32 (scratch[3], scratch_in[1], scratch_in[3]);

    NE10_CPX_SUB_NEON_S32 (scratch_out[2], scratch[0], scratch[2]);
    NE10_CPX_ADD_NEON_S32 (scratch_out[0], scratch[0], scratch[2]);

    scratch_out[1].val[0] = scratch[1].val[0] + scratch[3].val[1];
    scratch_out[1].val[1] = scratch[1].val[1] - scratch[3].val[0];
    scratch_out[3].val[0] = scratch[1].val[0] - scratch[3].val[1];
    scratch_out[3].val[1] = scratch[1].val[1] + scratch[3].val[0];
}

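// Radix-5 butterfly, in the style of Kiss FFT: TW_5A_S32 and TW_5B_S32 hold
// the two nontrivial radix-5 twiddle factors, exp(-2*pi*j/5) and
// exp(-4*pi*j/5), in Q31.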
template<>
inline void NE10_FFT_FCU_NEON_S32<5> (CPLX Fout[5],
        const CPLX Fin[5])
{
    CPLX scratch[13], scratch_in[5];

    scratch_in[0] = Fin[0];
    scratch_in[1] = Fin[1];
    scratch_in[2] = Fin[2];
    scratch_in[3] = Fin[3];
    scratch_in[4] = Fin[4];

    scratch[0] = scratch_in[0];
    scratch[1] = scratch_in[1];
    scratch[2] = scratch_in[2];
    scratch[3] = scratch_in[3];
    scratch[4] = scratch_in[4];

    NE10_CPX_ADD_NEON_S32 (scratch[ 7], scratch[1], scratch[4]);
    NE10_CPX_SUB_NEON_S32 (scratch[10], scratch[1], scratch[4]);
    NE10_CPX_ADD_NEON_S32 (scratch[ 8], scratch[2], scratch[3]);
    NE10_CPX_SUB_NEON_S32 (scratch[ 9], scratch[2], scratch[3]);

    scratch_in[0].val[0] += scratch[7].val[0] + scratch[8].val[0];
    scratch_in[0].val[1] += scratch[7].val[1] + scratch[8].val[1];

    scratch[5].val[0] = scratch[0].val[0]
        + NE10_S_MUL_NEON_S32 (scratch[7].val[0], TW_5A_S32.r)
        + NE10_S_MUL_NEON_S32 (scratch[8].val[0], TW_5B_S32.r);
    scratch[5].val[1] = scratch[0].val[1]
        + NE10_S_MUL_NEON_S32 (scratch[7].val[1], TW_5A_S32.r)
        + NE10_S_MUL_NEON_S32 (scratch[8].val[1], TW_5B_S32.r);

    scratch[6].val[0] = NE10_S_MUL_NEON_S32 (scratch[10].val[1], TW_5A_S32.i)
        + NE10_S_MUL_NEON_S32 (scratch[9].val[1], TW_5B_S32.i);
    scratch[6].val[1] = -NE10_S_MUL_NEON_S32 (scratch[10].val[0], TW_5A_S32.i)
        - NE10_S_MUL_NEON_S32 (scratch[9].val[0], TW_5B_S32.i);

    NE10_CPX_SUB_NEON_S32 (scratch_in[1], scratch[5], scratch[6]);
    NE10_CPX_ADD_NEON_S32 (scratch_in[4], scratch[5], scratch[6]);

    scratch[11].val[0] = scratch[0].val[0]
        + NE10_S_MUL_NEON_S32 (scratch[7].val[0], TW_5B_S32.r)
        + NE10_S_MUL_NEON_S32 (scratch[8].val[0], TW_5A_S32.r);
    scratch[11].val[1] = scratch[0].val[1]
        + NE10_S_MUL_NEON_S32 (scratch[7].val[1], TW_5B_S32.r)
        + NE10_S_MUL_NEON_S32 (scratch[8].val[1], TW_5A_S32.r);

    scratch[12].val[0] = -NE10_S_MUL_NEON_S32 (scratch[10].val[1], TW_5B_S32.i)
        + NE10_S_MUL_NEON_S32 (scratch[9].val[1], TW_5A_S32.i);
    scratch[12].val[1] = NE10_S_MUL_NEON_S32 (scratch[10].val[0], TW_5B_S32.i)
        - NE10_S_MUL_NEON_S32 (scratch[9].val[0], TW_5A_S32.i);

    NE10_CPX_ADD_NEON_S32 (scratch_in[2], scratch[11], scratch[12]);
    NE10_CPX_SUB_NEON_S32 (scratch_in[3], scratch[11], scratch[12]);

    Fout[0] = scratch_in[0];
    Fout[1] = scratch_in[1];
    Fout[2] = scratch_in[2];
    Fout[3] = scratch_in[3];
    Fout[4] = scratch_in[4];
}

// The following are the stage butterfly functions.
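// One pass of a mixed-radix stage: fstride sub-FFTs, each containing
// out_step butterflies; in_step = nfft / RADIX is the stride between the
// RADIX inputs of one butterfly. For every stage after the first, the
// butterfly legs (other than leg 0) are multiplied by their twiddle
// factors before the kernel runs.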
template<ne10_int32_t RADIX, bool is_first_stage, bool is_inverse, bool is_scaled>
static __attribute__ ((noinline)) void ne10_radix_butterfly_int32_neon (
        CPLX *Fout,
        const CPLX *Fin,
        const ne10_fft_cpx_int32_t *twiddles,
        const ne10_int32_t fstride,
        const ne10_int32_t out_step,
        const ne10_int32_t nfft)
{
    PRINT_HIT;
    const ne10_int32_t in_step = nfft / RADIX;
    ne10_int32_t f_count;
    ne10_int32_t m_count;

    for (f_count = fstride; f_count > 0; f_count--)
    {
        for (m_count = out_step; m_count > 0; m_count--)
        {
            CPLX in[RADIX];
            CPLX out[RADIX];

            NE10_LOAD_BY_STEP<RADIX, CPLX> (in, Fin, in_step);

            if (is_inverse)
            {
                NE10_CONJ<RADIX> (in);
            }

            if (is_scaled)
            {
                NE10_FFT_SCALING<RADIX> () (in);
            }

            if (!is_first_stage)
            {
                NE10_LOAD_TW_AND_MUL<RADIX> (in, twiddles, out_step);
            }

            NE10_FFT_FCU_NEON_S32<RADIX> (out, in);

            if (is_inverse)
            {
                NE10_CONJ<RADIX> (out);
            }

            NE10_STORE_BY_STEP<RADIX, CPLX> (Fout, out, out_step);

            Fin++;

            if (!is_first_stage)
            {
                Fout++;
                twiddles++;
            }
            else
            {
                Fout += RADIX;
            }
        }
        if (!is_first_stage)
        {
            twiddles -= out_step;
            Fout += (RADIX - 1) * out_step;
        }
    }
}

template<bool is_inverse, bool is_scaled>
static void ne10_mixed_radix_generic_butterfly_int32_neon_impl (CPLX *Fout,
        const CPLX *Fin,
        const ne10_int32_t *factors,
        const ne10_fft_cpx_int32_t *twiddles,
        CPLX *buffer)
{
    ne10_int32_t fstride, mstride, radix;
    ne10_int32_t stage_count;
    ne10_int32_t nfft;

    // init fstride, mstride, radix, nfft
    stage_count = factors[0];
    fstride = factors[1];
    mstride = 1;
    radix = factors[stage_count << 1]; // radix of first stage
    nfft = fstride * radix;

    // Each stage ping-pongs between buffer and Fout. When the stage count
    // is even, start with the pointers swapped so that the final stage
    // writes its result into Fout.
    if (stage_count % 2 == 0)
    {
        ne10_swap_ptr (buffer, Fout);
    }

    // first stage
    switch (radix)
    {
    case 2:
        ne10_radix_butterfly_int32_neon<2, true, is_inverse, is_scaled> (Fout, Fin,
                NULL,
                fstride, 1, nfft);
        break;
    case 4:
        ne10_radix_butterfly_int32_neon<4, true, is_inverse, is_scaled> (Fout, Fin,
                NULL,
                fstride, 1, nfft);
        break;
    case 3:
        ne10_radix_butterfly_int32_neon<3, true, is_inverse, is_scaled> (Fout, Fin,
                NULL,
                fstride, 1, nfft);
        break;
    case 5:
        ne10_radix_butterfly_int32_neon<5, true, is_inverse, is_scaled> (Fout, Fin,
                NULL,
                fstride, 1, nfft);
        break;
    }

    stage_count--;
    if (!stage_count) // finish
    {
        return;
    }

    mstride *= radix;

    // update radix
    if (radix % 2)
    {
        twiddles += radix;
    }
    radix = factors[stage_count << 1];

    // other stages
    while (stage_count > 0)
    {
        // Radix of the current stage; should be one of {2, 3, 4, 5}.
        assert ((radix > 1) && (radix < 6));

        ne10_swap_ptr (buffer, Fout);

        fstride /= radix;
        switch (radix)
        {
        case 2:
            ne10_radix_butterfly_int32_neon<2, false, is_inverse, is_scaled> (Fout, buffer,
                    twiddles,
                    fstride, mstride, nfft);
            break;
        case 3:
            ne10_radix_butterfly_int32_neon<3, false, is_inverse, is_scaled> (Fout, buffer,
                    twiddles,
                    fstride, mstride, nfft);
            break;
        case 4:
            ne10_radix_butterfly_int32_neon<4, false, is_inverse, is_scaled> (Fout, buffer,
                    twiddles,
                    fstride, mstride, nfft);
            break;
        case 5:
            ne10_radix_butterfly_int32_neon<5, false, is_inverse, is_scaled> (Fout, buffer,
                    twiddles, fstride, mstride, nfft);
            break;
        } // switch (radix)

        twiddles += mstride * (radix - 1);
        mstride *= radix;

        stage_count--;
        radix = factors[stage_count << 1];
    } // while (stage_count)
}

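// Last stage of the c2c pipeline: always a radix-4 pass vectorized across
// NEON lanes. Four consecutive complex inputs are loaded, transposed into
// 4x4 lane order, twiddled and passed through the radix-4 kernel, and the
// outputs are stored out_step apart. A scalar tail handles the
// out_step % 4 leftover columns.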
template<bool is_inverse, bool is_scaled>
static void ne10_c2c_1d_last_stage_neon (CPLX *Fout,
        const CPLX *Fin,
        const ne10_fft_cpx_int32_t *twiddles,
        const ne10_int32_t fstride,
        const ne10_int32_t out_step,
        const ne10_int32_t)
{
    ne10_int32_t f_count;
    ne10_int32_t m_count;

    for (f_count = fstride; f_count > 0; f_count--)
    {
        CPLX scratch_in[4];
        CPLX scratch_out[4];

        for (m_count = out_step / NE10_FFT_PARA_LEVEL; m_count > 0; m_count--)
        {
            scratch_in[0] = NE10_CPLX_LOAD (Fin + 0);
            scratch_in[1] = NE10_CPLX_LOAD (Fin + 1);
            scratch_in[2] = NE10_CPLX_LOAD (Fin + 2);
            scratch_in[3] = NE10_CPLX_LOAD (Fin + 3);

            if (is_scaled)
            {
                NE10_FFT_SCALING<4> () (scratch_in);
            }

            // Transpose
            {
                float32x4x2_t scratch0, scratch_in0;
                float32x4x2_t scratch1, scratch_in1;
                float32x4x2_t scratch2, scratch_in2;
                float32x4x2_t scratch3, scratch_in3;

                scratch_in0.val[0] = vreinterpretq_f32_s32 (scratch_in[0].val[0]);
                scratch_in1.val[0] = vreinterpretq_f32_s32 (scratch_in[1].val[0]);
                scratch_in2.val[0] = vreinterpretq_f32_s32 (scratch_in[2].val[0]);
                scratch_in3.val[0] = vreinterpretq_f32_s32 (scratch_in[3].val[0]);
                scratch_in0.val[1] = vreinterpretq_f32_s32 (scratch_in[0].val[1]);
                scratch_in1.val[1] = vreinterpretq_f32_s32 (scratch_in[1].val[1]);
                scratch_in2.val[1] = vreinterpretq_f32_s32 (scratch_in[2].val[1]);
                scratch_in3.val[1] = vreinterpretq_f32_s32 (scratch_in[3].val[1]);

                NE10_RADIX4X4C_TRANSPOSE_NEON (scratch, scratch_in);

                scratch_in[0].val[0] = vreinterpretq_s32_f32 (scratch0.val[0]);
                scratch_in[1].val[0] = vreinterpretq_s32_f32 (scratch1.val[0]);
                scratch_in[2].val[0] = vreinterpretq_s32_f32 (scratch2.val[0]);
                scratch_in[3].val[0] = vreinterpretq_s32_f32 (scratch3.val[0]);
                scratch_in[0].val[1] = vreinterpretq_s32_f32 (scratch0.val[1]);
                scratch_in[1].val[1] = vreinterpretq_s32_f32 (scratch1.val[1]);
                scratch_in[2].val[1] = vreinterpretq_s32_f32 (scratch2.val[1]);
                scratch_in[3].val[1] = vreinterpretq_s32_f32 (scratch3.val[1]);
            }

            if (is_inverse)
            {
                NE10_CONJ<4, CPLX> (scratch_in);
            }

            // Not the first stage: apply the twiddle factors.
            {
                CPLX scratch_tw[3];

                scratch_tw[0] = NE10_CPLX_LOAD (twiddles + 0 * out_step);
                scratch_tw[1] = NE10_CPLX_LOAD (twiddles + 1 * out_step);
                scratch_tw[2] = NE10_CPLX_LOAD (twiddles + 2 * out_step);

                NE10_CPX_MUL_NEON_S32 (scratch_in[1], scratch_in[1], scratch_tw[0]);
                NE10_CPX_MUL_NEON_S32 (scratch_in[2], scratch_in[2], scratch_tw[1]);
                NE10_CPX_MUL_NEON_S32 (scratch_in[3], scratch_in[3], scratch_tw[2]);
            }

            NE10_FFT_FCU_NEON_S32<4> (scratch_out, scratch_in);

            if (is_inverse)
            {
                NE10_CONJ<4, CPLX> (scratch_out);
            }

            // Store.
            {
                ne10_fft_cpx_int32_t *Fout_cpx;
                Fout_cpx = (ne10_fft_cpx_int32_t *) Fout;

                NE10_CPLX_STORE (Fout_cpx + 0 * out_step, scratch_out[0]);
                NE10_CPLX_STORE (Fout_cpx + 1 * out_step, scratch_out[1]);
                NE10_CPLX_STORE (Fout_cpx + 2 * out_step, scratch_out[2]);
                NE10_CPLX_STORE (Fout_cpx + 3 * out_step, scratch_out[3]);
            }

            Fin += 4;
            Fout += 1;
            twiddles += 4;
        }
    }

    ne10_int32_t left_over = out_step % 4;
    if (left_over == 0)
    {
        return;
    }

    // Left over.
    const ne10_fft_cpx_int32_t *Fin_s = (ne10_fft_cpx_int32_t *) Fin;
    ne10_fft_cpx_int32_t *Fout_s = (ne10_fft_cpx_int32_t *) Fout;
    for (m_count = out_step % 4; m_count > 0; m_count--)
    {
        ne10_fft_cpx_int32_t scratch_in[4];
        ne10_fft_cpx_int32_t scratch_tw[4];

        scratch_in[0] = Fin_s[0];
        scratch_in[1] = Fin_s[1];
        scratch_in[2] = Fin_s[2];
        scratch_in[3] = Fin_s[3];

        if (is_scaled)
        {
            scratch_in[0].r = scratch_in[0].r >> 2;
            scratch_in[1].r = scratch_in[1].r >> 2;
            scratch_in[2].r = scratch_in[2].r >> 2;
            scratch_in[3].r = scratch_in[3].r >> 2;

            scratch_in[0].i = scratch_in[0].i >> 2;
            scratch_in[1].i = scratch_in[1].i >> 2;
            scratch_in[2].i = scratch_in[2].i >> 2;
            scratch_in[3].i = scratch_in[3].i >> 2;
        }

        if (is_inverse)
        {
            scratch_in[0].i = -scratch_in[0].i;
            scratch_in[1].i = -scratch_in[1].i;
            scratch_in[2].i = -scratch_in[2].i;
            scratch_in[3].i = -scratch_in[3].i;
        }

        scratch_tw[0] = twiddles[0 * out_step];
        scratch_tw[1] = twiddles[1 * out_step];
        scratch_tw[2] = twiddles[2 * out_step];

        NE10_CPX_MUL_S32 (scratch_in[1], scratch_in[1], scratch_tw[0]);
        NE10_CPX_MUL_S32 (scratch_in[2], scratch_in[2], scratch_tw[1]);
        NE10_CPX_MUL_S32 (scratch_in[3], scratch_in[3], scratch_tw[2]);

        FFT_FCU<4> (scratch_in, scratch_in);

        if (is_inverse)
        {
            scratch_in[0].i = -scratch_in[0].i;
            scratch_in[1].i = -scratch_in[1].i;
            scratch_in[2].i = -scratch_in[2].i;
            scratch_in[3].i = -scratch_in[3].i;
        }

        Fout_s[0 * out_step] = scratch_in[0];
        Fout_s[1 * out_step] = scratch_in[1];
        Fout_s[2 * out_step] = scratch_in[2];
        Fout_s[3 * out_step] = scratch_in[3];

        Fin_s += 4;
        Fout_s += 1;
        twiddles += 1;
    }
}

#endif
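
A side note on the fixed-point arithmetic used throughout this file: the
snippet below is a minimal, self-contained scalar model of the per-lane
behaviour of vqrdmulhq_s32. It is an illustration, not part of Ne10; the
helper name sqrdmulh_s32 and the sample values are hypothetical. With the
Q31 encoding x_q31 = round(x * 2^31) for x in [-1, 1), the intrinsic's
(2*a*b + 2^31) >> 32 is exactly a rounding Q31 multiply, which is why the
butterflies above can use it for both twiddle and scaling multiplications.

#include <cstdint>
#include <cstdio>

// Scalar model of one lane of vqrdmulhq_s32: saturating, rounding,
// doubling multiply returning the high 32 bits. Hypothetical helper for
// illustration only.
static int32_t sqrdmulh_s32 (int32_t a, int32_t b)
{
    // 2*a*b only overflows 64 bits (and the intrinsic only saturates)
    // when a == b == INT32_MIN.
    if (a == INT32_MIN && b == INT32_MIN)
        return INT32_MAX;
    int64_t product = (int64_t) a * b;
    return (int32_t) ((2 * product + (1LL << 31)) >> 32);
}

int main ()
{
    const double scale = 2147483648.0; // 2^31
    int32_t a = (int32_t) (0.5 * scale);  // 0.5 in Q31
    int32_t b = (int32_t) (0.25 * scale); // 0.25 in Q31
    printf ("0.5 * 0.25 ~= %f\n", sqrdmulh_s32 (a, b) / scale); // ~0.125
    return 0;
}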