Project Ne10
An Open Optimized Software Library Project for the ARM Architecture
NE10_fft_float32.neonintrinsic.c
1/*
2 * Copyright 2014-15 ARM Limited and Contributors.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 * * Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of ARM Limited nor the
13 * names of its contributors may be used to endorse or promote products
14 * derived from this software without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 * DISCLAIMED. IN NO EVENT SHALL ARM LIMITED AND CONTRIBUTORS BE LIABLE FOR ANY
20 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28/*
 29 * NE10 Library : dsp/NE10_fft_float32.neonintrinsic.c
30 */
31
32#include <arm_neon.h>
33
34#include "NE10_types.h"
35#include "NE10_macros.h"
36#include "NE10_fft.h"
37#include "NE10_dsp.h"
38
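/*
 * Note (inferred from the code below): this file holds the NEON-intrinsic
 * kernels for Ne10's power-of-two complex FFT. Dedicated 4-, 8- and 16-point
 * butterflies are followed by a mixed radix-2/4 path for larger sizes, whose
 * first stage is a radix-8 butterfly (when log2(nfft) is odd) or a radix-4
 * butterfly (when it is even) without twiddles, and whose remaining stages
 * are radix-4 with twiddles. Complex samples are stored interleaved (r, i);
 * vld2q_f32/vst2q_f32 de-interleave them into separate real and imaginary
 * vectors so that four butterflies are computed per NEON instruction.
 */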
 39static inline void ne10_fft4_forward_float32 (ne10_fft_cpx_float32_t * Fout,
 40 ne10_fft_cpx_float32_t * Fin)
 41{
42 ne10_float32_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i;
43 ne10_float32_t tmp_r, tmp_i;
44
45 s2_r = Fin[0].r - Fin[2].r;
46 s2_i = Fin[0].i - Fin[2].i;
47
48 tmp_r = Fin[0].r + Fin[2].r;
49 tmp_i = Fin[0].i + Fin[2].i;
50
51 s0_r = Fin[1].r + Fin[3].r;
52 s0_i = Fin[1].i + Fin[3].i;
53
54 s1_r = Fin[1].r - Fin[3].r;
55 s1_i = Fin[1].i - Fin[3].i;
56 Fout[2].r = tmp_r - s0_r;
57 Fout[2].i = tmp_i - s0_i;
58 Fout[0].r = tmp_r + s0_r;
59 Fout[0].i = tmp_i + s0_i;
60
61 Fout[1].r = s2_r + s1_i;
62 Fout[1].i = s2_i - s1_r;
63 Fout[3].r = s2_r - s1_i;
64 Fout[3].i = s2_i + s1_r;
65}
66
 67static inline void ne10_fft4_backward_float32 (ne10_fft_cpx_float32_t * Fout,
 68 ne10_fft_cpx_float32_t * Fin)
 69{
70 ne10_float32_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i;
71 ne10_float32_t tmp_r, tmp_i;
72
73 s2_r = Fin[0].r - Fin[2].r;
74 s2_i = Fin[0].i - Fin[2].i;
75
76 tmp_r = Fin[0].r + Fin[2].r;
77 tmp_i = Fin[0].i + Fin[2].i;
78
79 s0_r = Fin[1].r + Fin[3].r;
80 s0_i = Fin[1].i + Fin[3].i;
81
82 s1_r = Fin[1].r - Fin[3].r;
83 s1_i = Fin[1].i - Fin[3].i;
84 Fout[2].r = (tmp_r - s0_r) * 0.25f;
85 Fout[2].i = (tmp_i - s0_i) * 0.25f;
86 Fout[0].r = (tmp_r + s0_r) * 0.25f;
87 Fout[0].i = (tmp_i + s0_i) * 0.25f;
88
89 Fout[1].r = (s2_r - s1_i) * 0.25f;
90 Fout[1].i = (s2_i + s1_r) * 0.25f;
91 Fout[3].r = (s2_r + s1_i) * 0.25f;
92 Fout[3].i = (s2_i - s1_r) * 0.25f;
93}
94
95
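/*
 * The backward (inverse) kernels mirror the forward ones with the signs of
 * the imaginary cross-terms flipped (conjugate twiddles) and the result
 * scaled by 1/nfft: 0.25f for the 4-point, 0.125f for the 8-point and
 * 0.0625f for the 16-point transforms below.
 */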
 96static inline void ne10_fft8_forward_float32 (ne10_fft_cpx_float32_t * Fout,
 97 ne10_fft_cpx_float32_t * Fin)
 98{
99 ne10_float32_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i, s3_r, s3_i, s4_r, s4_i, s5_r, s5_i, s6_r, s6_i, s7_r, s7_i;
100 ne10_float32_t t0_r, t0_i, t1_r, t1_i, t2_r, t2_i, t3_r, t3_i, t4_r, t4_i, t5_r, t5_i;
101 const ne10_float32_t TW_81 = 0.70710678;
102
103 s0_r = Fin[0].r + Fin[4].r;
104 s0_i = Fin[0].i + Fin[4].i;
105 s1_r = Fin[0].r - Fin[4].r;
106 s1_i = Fin[0].i - Fin[4].i;
107 s2_r = Fin[1].r + Fin[5].r;
108 s2_i = Fin[1].i + Fin[5].i;
109 s3_r = Fin[1].r - Fin[5].r;
110 s3_i = Fin[1].i - Fin[5].i;
111 s4_r = Fin[2].r + Fin[6].r;
112 s4_i = Fin[2].i + Fin[6].i;
113 s5_r = Fin[2].r - Fin[6].r;
114 s5_i = Fin[2].i - Fin[6].i;
115 s6_r = Fin[3].r + Fin[7].r;
116 s6_i = Fin[3].i + Fin[7].i;
117 s7_r = Fin[3].r - Fin[7].r;
118 s7_i = Fin[3].i - Fin[7].i;
119
120 t0_r = s0_r - s4_r;
121 t0_i = s0_i - s4_i;
122 t1_r = s0_r + s4_r;
123 t1_i = s0_i + s4_i;
124 t2_r = s2_r + s6_r;
125 t2_i = s2_i + s6_i;
126 t3_r = s2_r - s6_r;
127 t3_i = s2_i - s6_i;
128 Fout[0].r = t1_r + t2_r;
129 Fout[0].i = t1_i + t2_i;
130 Fout[4].r = t1_r - t2_r;
131 Fout[4].i = t1_i - t2_i;
132 Fout[2].r = t0_r + t3_i;
133 Fout[2].i = t0_i - t3_r;
134 Fout[6].r = t0_r - t3_i;
135 Fout[6].i = t0_i + t3_r;
136
137 t4_r = (s3_r + s3_i) * TW_81;
138 t4_i = - (s3_r - s3_i) * TW_81;
139 t5_r = (s7_r - s7_i) * TW_81;
140 t5_i = (s7_r + s7_i) * TW_81;
141
142 t0_r = s1_r - s5_i;
143 t0_i = s1_i + s5_r;
144 t1_r = s1_r + s5_i;
145 t1_i = s1_i - s5_r;
146 t2_r = t4_r - t5_r;
147 t2_i = t4_i - t5_i;
148 t3_r = t4_r + t5_r;
149 t3_i = t4_i + t5_i;
150 Fout[1].r = t1_r + t2_r;
151 Fout[1].i = t1_i + t2_i;
152 Fout[5].r = t1_r - t2_r;
153 Fout[5].i = t1_i - t2_i;
154 Fout[3].r = t0_r + t3_i;
155 Fout[3].i = t0_i - t3_r;
156 Fout[7].r = t0_r - t3_i;
157 Fout[7].i = t0_i + t3_r;
158}
159
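/*
 * TW_81 = 0.70710678 is cos(pi/4) = sin(pi/4) = 1/sqrt(2), the magnitude of
 * the real and imaginary parts of the eighth root of unity e^(-i*pi/4). In
 * the radix-8 butterflies above and below it rotates the odd partial sums,
 * e.g. (s3_r + i*s3_i) * e^(-i*pi/4) = TW_81 * ((s3_r + s3_i) + i*(s3_i - s3_r)).
 */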
 160static inline void ne10_fft8_backward_float32 (ne10_fft_cpx_float32_t * Fout,
 161 ne10_fft_cpx_float32_t * Fin)
 162{
163 ne10_float32_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i, s3_r, s3_i, s4_r, s4_i, s5_r, s5_i, s6_r, s6_i, s7_r, s7_i;
164 ne10_float32_t t0_r, t0_i, t1_r, t1_i, t2_r, t2_i, t3_r, t3_i, t4_r, t4_i, t5_r, t5_i;
165 const ne10_float32_t TW_81 = 0.70710678;
166
167 s0_r = Fin[0].r + Fin[4].r;
168 s0_i = Fin[0].i + Fin[4].i;
169 s1_r = Fin[0].r - Fin[4].r;
170 s1_i = Fin[0].i - Fin[4].i;
171 s2_r = Fin[1].r + Fin[5].r;
172 s2_i = Fin[1].i + Fin[5].i;
173 s3_r = Fin[1].r - Fin[5].r;
174 s3_i = Fin[1].i - Fin[5].i;
175 s4_r = Fin[2].r + Fin[6].r;
176 s4_i = Fin[2].i + Fin[6].i;
177 s5_r = Fin[2].r - Fin[6].r;
178 s5_i = Fin[2].i - Fin[6].i;
179 s6_r = Fin[3].r + Fin[7].r;
180 s6_i = Fin[3].i + Fin[7].i;
181 s7_r = Fin[3].r - Fin[7].r;
182 s7_i = Fin[3].i - Fin[7].i;
183
184 t0_r = s0_r - s4_r;
185 t0_i = s0_i - s4_i;
186 t1_r = s0_r + s4_r;
187 t1_i = s0_i + s4_i;
188 t2_r = s2_r + s6_r;
189 t2_i = s2_i + s6_i;
190 t3_r = s2_r - s6_r;
191 t3_i = s2_i - s6_i;
192 Fout[0].r = (t1_r + t2_r) * 0.125f;
193 Fout[0].i = (t1_i + t2_i) * 0.125f;
194 Fout[4].r = (t1_r - t2_r) * 0.125f;
195 Fout[4].i = (t1_i - t2_i) * 0.125f;
196 Fout[2].r = (t0_r - t3_i) * 0.125f;
197 Fout[2].i = (t0_i + t3_r) * 0.125f;
198 Fout[6].r = (t0_r + t3_i) * 0.125f;
199 Fout[6].i = (t0_i - t3_r) * 0.125f;
200
201 t4_r = (s3_r - s3_i) * TW_81;
202 t4_i = (s3_r + s3_i) * TW_81;
203 t5_r = (s7_r + s7_i) * TW_81;
204 t5_i = - (s7_r - s7_i) * TW_81;
205
206 t0_r = s1_r + s5_i;
207 t0_i = s1_i - s5_r;
208 t1_r = s1_r - s5_i;
209 t1_i = s1_i + s5_r;
210 t2_r = t4_r - t5_r;
211 t2_i = t4_i - t5_i;
212 t3_r = t4_r + t5_r;
213 t3_i = t4_i + t5_i;
214 Fout[1].r = (t1_r + t2_r) * 0.125f;
215 Fout[1].i = (t1_i + t2_i) * 0.125f;
216 Fout[5].r = (t1_r - t2_r) * 0.125f;
217 Fout[5].i = (t1_i - t2_i) * 0.125f;
218 Fout[3].r = (t0_r - t3_i) * 0.125f;
219 Fout[3].i = (t0_i + t3_r) * 0.125f;
220 Fout[7].r = (t0_r + t3_i) * 0.125f;
221 Fout[7].i = (t0_i - t3_r) * 0.125f;
222}
223
 224static void ne10_fft16_forward_float32_neon (ne10_fft_cpx_float32_t * Fout,
 225 ne10_fft_cpx_float32_t * Fin,
 226 ne10_fft_cpx_float32_t * twiddles)
227{
228 ne10_fft_cpx_float32_t *tw1, *tw2, *tw3;
229
230 // the first stage
231 float32_t *p_src0, *p_src4, *p_src8, *p_src12;
232 float32x4x2_t q2_in_0123, q2_in_4567, q2_in_89ab, q2_in_cdef;
233 float32x4_t q_t0_r, q_t0_i, q_t1_r, q_t1_i, q_t2_r, q_t2_i, q_t3_r, q_t3_i;
234 float32x4_t q_out_r048c, q_out_i048c, q_out_r159d, q_out_i159d;
235 float32x4_t q_out_r26ae, q_out_i26ae, q_out_r37bf, q_out_i37bf;
236 p_src0 = (float32_t*) (& (Fin[0]));
237 p_src4 = (float32_t*) (& (Fin[4]));
238 p_src8 = (float32_t*) (& (Fin[8]));
239 p_src12 = (float32_t*) (& (Fin[12]));
240 q2_in_0123 = vld2q_f32 (p_src0);
241 q2_in_4567 = vld2q_f32 (p_src4);
242 q2_in_89ab = vld2q_f32 (p_src8);
243 q2_in_cdef = vld2q_f32 (p_src12);
244
245 q_t2_r = vsubq_f32 (q2_in_0123.val[0], q2_in_89ab.val[0]);
246 q_t2_i = vsubq_f32 (q2_in_0123.val[1], q2_in_89ab.val[1]);
247 q_t3_r = vaddq_f32 (q2_in_0123.val[0], q2_in_89ab.val[0]);
248 q_t3_i = vaddq_f32 (q2_in_0123.val[1], q2_in_89ab.val[1]);
249
250 q_t0_r = vaddq_f32 (q2_in_4567.val[0], q2_in_cdef.val[0]);
251 q_t0_i = vaddq_f32 (q2_in_4567.val[1], q2_in_cdef.val[1]);
252 q_t1_r = vsubq_f32 (q2_in_4567.val[0], q2_in_cdef.val[0]);
253 q_t1_i = vsubq_f32 (q2_in_4567.val[1], q2_in_cdef.val[1]);
254
255 q_out_r26ae = vsubq_f32 (q_t3_r, q_t0_r);
256 q_out_i26ae = vsubq_f32 (q_t3_i, q_t0_i);
257 q_out_r048c = vaddq_f32 (q_t3_r, q_t0_r);
258 q_out_i048c = vaddq_f32 (q_t3_i, q_t0_i);
259 q_out_r159d = vaddq_f32 (q_t2_r, q_t1_i);
260 q_out_i159d = vsubq_f32 (q_t2_i, q_t1_r);
261 q_out_r37bf = vsubq_f32 (q_t2_r, q_t1_i);
262 q_out_i37bf = vaddq_f32 (q_t2_i, q_t1_r);
263
 264 // the second stage
265 float32_t *p_dst0, *p_dst1, *p_dst2, *p_dst3;
266 float32_t *p_tw1, *p_tw2, *p_tw3;
267 float32x4_t q_s0_r, q_s0_i, q_s1_r, q_s1_i, q_s2_r, q_s2_i;
268 float32x4_t q_s3_r, q_s3_i, q_s4_r, q_s4_i, q_s5_r, q_s5_i;
269 float32x4x2_t q2_tmp_0, q2_tmp_1, q2_tmp_2, q2_tmp_3;
270 float32x4_t q_in_r0123, q_in_r4567, q_in_r89ab, q_in_rcdef;
271 float32x4_t q_in_i0123, q_in_i4567, q_in_i89ab, q_in_icdef;
272 float32x4x2_t q2_tw1, q2_tw2, q2_tw3;
273 float32x4x2_t q2_out_0123, q2_out_4567, q2_out_89ab, q2_out_cdef;
274 tw1 = twiddles;
275 tw2 = twiddles + 4;
276 tw3 = twiddles + 8;
277 p_dst0 = (float32_t*) (&Fout[0]);
278 p_dst1 = (float32_t*) (&Fout[4]);
279 p_dst2 = (float32_t*) (&Fout[8]);
280 p_dst3 = (float32_t*) (&Fout[12]);
281 p_tw1 = (float32_t*) tw1;
282 p_tw2 = (float32_t*) tw2;
283 p_tw3 = (float32_t*) tw3;
284 q2_tmp_0 = vzipq_f32 (q_out_r048c, q_out_r159d);
285 q2_tmp_1 = vzipq_f32 (q_out_i048c, q_out_i159d);
286 q2_tmp_2 = vzipq_f32 (q_out_r26ae, q_out_r37bf);
287 q2_tmp_3 = vzipq_f32 (q_out_i26ae, q_out_i37bf);
288 q_in_r0123 = vcombine_f32 (vget_low_f32 (q2_tmp_0.val[0]), vget_low_f32 (q2_tmp_2.val[0]));
289 q_in_i0123 = vcombine_f32 (vget_low_f32 (q2_tmp_1.val[0]), vget_low_f32 (q2_tmp_3.val[0]));
290 q_in_r4567 = vcombine_f32 (vget_high_f32 (q2_tmp_0.val[0]), vget_high_f32 (q2_tmp_2.val[0]));
291 q_in_i4567 = vcombine_f32 (vget_high_f32 (q2_tmp_1.val[0]), vget_high_f32 (q2_tmp_3.val[0]));
292 q_in_r89ab = vcombine_f32 (vget_low_f32 (q2_tmp_0.val[1]), vget_low_f32 (q2_tmp_2.val[1]));
293 q_in_i89ab = vcombine_f32 (vget_low_f32 (q2_tmp_1.val[1]), vget_low_f32 (q2_tmp_3.val[1]));
294 q_in_rcdef = vcombine_f32 (vget_high_f32 (q2_tmp_0.val[1]), vget_high_f32 (q2_tmp_2.val[1]));
295 q_in_icdef = vcombine_f32 (vget_high_f32 (q2_tmp_1.val[1]), vget_high_f32 (q2_tmp_3.val[1]));
296 q2_tw1 = vld2q_f32 (p_tw1);
297 q2_tw2 = vld2q_f32 (p_tw2);
298 q2_tw3 = vld2q_f32 (p_tw3);
299
300 q_s0_r = vmulq_f32 (q_in_r4567, q2_tw1.val[0]);
301 q_s0_i = vmulq_f32 (q_in_r4567, q2_tw1.val[1]);
302 q_s1_r = vmulq_f32 (q_in_r89ab, q2_tw2.val[0]);
303 q_s1_i = vmulq_f32 (q_in_r89ab, q2_tw2.val[1]);
304 q_s2_r = vmulq_f32 (q_in_rcdef, q2_tw3.val[0]);
305 q_s2_i = vmulq_f32 (q_in_rcdef, q2_tw3.val[1]);
306 q_s0_r = vmlsq_f32 (q_s0_r, q_in_i4567, q2_tw1.val[1]);
307 q_s0_i = vmlaq_f32 (q_s0_i, q_in_i4567, q2_tw1.val[0]);
308 q_s1_r = vmlsq_f32 (q_s1_r, q_in_i89ab, q2_tw2.val[1]);
309 q_s1_i = vmlaq_f32 (q_s1_i, q_in_i89ab, q2_tw2.val[0]);
310 q_s2_r = vmlsq_f32 (q_s2_r, q_in_icdef, q2_tw3.val[1]);
311 q_s2_i = vmlaq_f32 (q_s2_i, q_in_icdef, q2_tw3.val[0]);
312
313
314 q_s5_r = vsubq_f32 (q_in_r0123, q_s1_r);
315 q_s5_i = vsubq_f32 (q_in_i0123, q_s1_i);
316 q2_out_0123.val[0] = vaddq_f32 (q_in_r0123, q_s1_r);
317 q2_out_0123.val[1] = vaddq_f32 (q_in_i0123, q_s1_i);
318
319 q_s3_r = vaddq_f32 (q_s0_r, q_s2_r);
320 q_s3_i = vaddq_f32 (q_s0_i, q_s2_i);
321 q_s4_r = vsubq_f32 (q_s0_r, q_s2_r);
322 q_s4_i = vsubq_f32 (q_s0_i, q_s2_i);
323
324 q2_out_89ab.val[0] = vsubq_f32 (q2_out_0123.val[0], q_s3_r);
325 q2_out_89ab.val[1] = vsubq_f32 (q2_out_0123.val[1], q_s3_i);
326 q2_out_0123.val[0] = vaddq_f32 (q2_out_0123.val[0], q_s3_r);
327 q2_out_0123.val[1] = vaddq_f32 (q2_out_0123.val[1], q_s3_i);
328
329 q2_out_4567.val[0] = vaddq_f32 (q_s5_r, q_s4_i);
330 q2_out_4567.val[1] = vsubq_f32 (q_s5_i, q_s4_r);
331 q2_out_cdef.val[0] = vsubq_f32 (q_s5_r, q_s4_i);
332 q2_out_cdef.val[1] = vaddq_f32 (q_s5_i, q_s4_r);
333
334 vst2q_f32 (p_dst0, q2_out_0123);
335 vst2q_f32 (p_dst1, q2_out_4567);
336 vst2q_f32 (p_dst2, q2_out_89ab);
337 vst2q_f32 (p_dst3, q2_out_cdef);
338}
339
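/*
 * The vmulq/vmlsq/vmlaq sequence in the second stage is a vectorized complex
 * multiplication by the twiddle factors; per element the forward kernel above
 * computes
 *
 *     s_r = in_r * tw_r - in_i * tw_i;
 *     s_i = in_r * tw_i + in_i * tw_r;
 *
 * while the backward kernel below multiplies by the conjugate twiddle:
 *
 *     s_r = in_r * tw_r + in_i * tw_i;
 *     s_i = in_i * tw_r - in_r * tw_i;
 */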
 340static void ne10_fft16_backward_float32_neon (ne10_fft_cpx_float32_t * Fout,
 341 ne10_fft_cpx_float32_t * Fin,
 342 ne10_fft_cpx_float32_t * twiddles)
343{
344 ne10_fft_cpx_float32_t *tw1, *tw2, *tw3;
345
346 // the first stage
347 float32_t *p_src0, *p_src4, *p_src8, *p_src12;
348 float32x4x2_t q2_in_0123, q2_in_4567, q2_in_89ab, q2_in_cdef;
349 float32x4_t q_t0_r, q_t0_i, q_t1_r, q_t1_i, q_t2_r, q_t2_i, q_t3_r, q_t3_i;
350 float32x4_t q_out_r048c, q_out_i048c, q_out_r159d, q_out_i159d;
351 float32x4_t q_out_r26ae, q_out_i26ae, q_out_r37bf, q_out_i37bf;
352 p_src0 = (float32_t*) (& (Fin[0]));
353 p_src4 = (float32_t*) (& (Fin[4]));
354 p_src8 = (float32_t*) (& (Fin[8]));
355 p_src12 = (float32_t*) (& (Fin[12]));
356 q2_in_0123 = vld2q_f32 (p_src0);
357 q2_in_4567 = vld2q_f32 (p_src4);
358 q2_in_89ab = vld2q_f32 (p_src8);
359 q2_in_cdef = vld2q_f32 (p_src12);
360
361 q_t2_r = vsubq_f32 (q2_in_0123.val[0], q2_in_89ab.val[0]);
362 q_t2_i = vsubq_f32 (q2_in_0123.val[1], q2_in_89ab.val[1]);
363 q_t3_r = vaddq_f32 (q2_in_0123.val[0], q2_in_89ab.val[0]);
364 q_t3_i = vaddq_f32 (q2_in_0123.val[1], q2_in_89ab.val[1]);
365
366 q_t0_r = vaddq_f32 (q2_in_4567.val[0], q2_in_cdef.val[0]);
367 q_t0_i = vaddq_f32 (q2_in_4567.val[1], q2_in_cdef.val[1]);
368 q_t1_r = vsubq_f32 (q2_in_4567.val[0], q2_in_cdef.val[0]);
369 q_t1_i = vsubq_f32 (q2_in_4567.val[1], q2_in_cdef.val[1]);
370
371 q_out_r26ae = vsubq_f32 (q_t3_r, q_t0_r);
372 q_out_i26ae = vsubq_f32 (q_t3_i, q_t0_i);
373 q_out_r048c = vaddq_f32 (q_t3_r, q_t0_r);
374 q_out_i048c = vaddq_f32 (q_t3_i, q_t0_i);
375 q_out_r159d = vsubq_f32 (q_t2_r, q_t1_i);
376 q_out_i159d = vaddq_f32 (q_t2_i, q_t1_r);
377 q_out_r37bf = vaddq_f32 (q_t2_r, q_t1_i);
378 q_out_i37bf = vsubq_f32 (q_t2_i, q_t1_r);
379
 380 // the second stage
381 float32_t *p_dst0, *p_dst1, *p_dst2, *p_dst3;
382 float32_t *p_tw1, *p_tw2, *p_tw3;
383 float32x4_t q_s0_r, q_s0_i, q_s1_r, q_s1_i, q_s2_r, q_s2_i;
384 float32x4_t q_s3_r, q_s3_i, q_s4_r, q_s4_i, q_s5_r, q_s5_i;
385 float32x4x2_t q2_tmp_0, q2_tmp_1, q2_tmp_2, q2_tmp_3;
386 float32x4_t q_in_r0123, q_in_r4567, q_in_r89ab, q_in_rcdef;
387 float32x4_t q_in_i0123, q_in_i4567, q_in_i89ab, q_in_icdef;
388 float32x4x2_t q2_tw1, q2_tw2, q2_tw3;
389 float32x4x2_t q2_out_0123, q2_out_4567, q2_out_89ab, q2_out_cdef;
390 float32x4_t q_one_by_nfft;
391 tw1 = twiddles;
392 tw2 = twiddles + 4;
393 tw3 = twiddles + 8;
394 p_dst0 = (float32_t*) (&Fout[0]);
395 p_dst1 = (float32_t*) (&Fout[4]);
396 p_dst2 = (float32_t*) (&Fout[8]);
397 p_dst3 = (float32_t*) (&Fout[12]);
398 p_tw1 = (float32_t*) tw1;
399 p_tw2 = (float32_t*) tw2;
400 p_tw3 = (float32_t*) tw3;
401 q2_tmp_0 = vzipq_f32 (q_out_r048c, q_out_r159d);
402 q2_tmp_1 = vzipq_f32 (q_out_i048c, q_out_i159d);
403 q2_tmp_2 = vzipq_f32 (q_out_r26ae, q_out_r37bf);
404 q2_tmp_3 = vzipq_f32 (q_out_i26ae, q_out_i37bf);
405 q_in_r0123 = vcombine_f32 (vget_low_f32 (q2_tmp_0.val[0]), vget_low_f32 (q2_tmp_2.val[0]));
406 q_in_i0123 = vcombine_f32 (vget_low_f32 (q2_tmp_1.val[0]), vget_low_f32 (q2_tmp_3.val[0]));
407 q_in_r4567 = vcombine_f32 (vget_high_f32 (q2_tmp_0.val[0]), vget_high_f32 (q2_tmp_2.val[0]));
408 q_in_i4567 = vcombine_f32 (vget_high_f32 (q2_tmp_1.val[0]), vget_high_f32 (q2_tmp_3.val[0]));
409 q_in_r89ab = vcombine_f32 (vget_low_f32 (q2_tmp_0.val[1]), vget_low_f32 (q2_tmp_2.val[1]));
410 q_in_i89ab = vcombine_f32 (vget_low_f32 (q2_tmp_1.val[1]), vget_low_f32 (q2_tmp_3.val[1]));
411 q_in_rcdef = vcombine_f32 (vget_high_f32 (q2_tmp_0.val[1]), vget_high_f32 (q2_tmp_2.val[1]));
412 q_in_icdef = vcombine_f32 (vget_high_f32 (q2_tmp_1.val[1]), vget_high_f32 (q2_tmp_3.val[1]));
413 q2_tw1 = vld2q_f32 (p_tw1);
414 q2_tw2 = vld2q_f32 (p_tw2);
415 q2_tw3 = vld2q_f32 (p_tw3);
416
417 q_s0_r = vmulq_f32 (q_in_r4567, q2_tw1.val[0]);
418 q_s0_i = vmulq_f32 (q_in_i4567, q2_tw1.val[0]);
419 q_s1_r = vmulq_f32 (q_in_r89ab, q2_tw2.val[0]);
420 q_s1_i = vmulq_f32 (q_in_i89ab, q2_tw2.val[0]);
421 q_s2_r = vmulq_f32 (q_in_rcdef, q2_tw3.val[0]);
422 q_s2_i = vmulq_f32 (q_in_icdef, q2_tw3.val[0]);
423 q_s0_r = vmlaq_f32 (q_s0_r, q_in_i4567, q2_tw1.val[1]);
424 q_s0_i = vmlsq_f32 (q_s0_i, q_in_r4567, q2_tw1.val[1]);
425 q_s1_r = vmlaq_f32 (q_s1_r, q_in_i89ab, q2_tw2.val[1]);
426 q_s1_i = vmlsq_f32 (q_s1_i, q_in_r89ab, q2_tw2.val[1]);
427 q_s2_r = vmlaq_f32 (q_s2_r, q_in_icdef, q2_tw3.val[1]);
428 q_s2_i = vmlsq_f32 (q_s2_i, q_in_rcdef, q2_tw3.val[1]);
429
430 q_s5_r = vsubq_f32 (q_in_r0123, q_s1_r);
431 q_s5_i = vsubq_f32 (q_in_i0123, q_s1_i);
432 q2_out_0123.val[0] = vaddq_f32 (q_in_r0123, q_s1_r);
433 q2_out_0123.val[1] = vaddq_f32 (q_in_i0123, q_s1_i);
434
435 q_s3_r = vaddq_f32 (q_s0_r, q_s2_r);
436 q_s3_i = vaddq_f32 (q_s0_i, q_s2_i);
437 q_s4_r = vsubq_f32 (q_s0_r, q_s2_r);
438 q_s4_i = vsubq_f32 (q_s0_i, q_s2_i);
439
440 q_one_by_nfft = vdupq_n_f32 (0.0625f);
441 q2_out_89ab.val[0] = vsubq_f32 (q2_out_0123.val[0], q_s3_r);
442 q2_out_89ab.val[1] = vsubq_f32 (q2_out_0123.val[1], q_s3_i);
443 q2_out_0123.val[0] = vaddq_f32 (q2_out_0123.val[0], q_s3_r);
444 q2_out_0123.val[1] = vaddq_f32 (q2_out_0123.val[1], q_s3_i);
445
446 q2_out_4567.val[0] = vsubq_f32 (q_s5_r, q_s4_i);
447 q2_out_4567.val[1] = vaddq_f32 (q_s5_i, q_s4_r);
448 q2_out_cdef.val[0] = vaddq_f32 (q_s5_r, q_s4_i);
449 q2_out_cdef.val[1] = vsubq_f32 (q_s5_i, q_s4_r);
450
451 q2_out_89ab.val[0] = vmulq_f32 (q2_out_89ab.val[0], q_one_by_nfft);
452 q2_out_89ab.val[1] = vmulq_f32 (q2_out_89ab.val[1], q_one_by_nfft);
453 q2_out_0123.val[0] = vmulq_f32 (q2_out_0123.val[0], q_one_by_nfft);
454 q2_out_0123.val[1] = vmulq_f32 (q2_out_0123.val[1], q_one_by_nfft);
455 q2_out_4567.val[0] = vmulq_f32 (q2_out_4567.val[0], q_one_by_nfft);
456 q2_out_4567.val[1] = vmulq_f32 (q2_out_4567.val[1], q_one_by_nfft);
457 q2_out_cdef.val[0] = vmulq_f32 (q2_out_cdef.val[0], q_one_by_nfft);
458 q2_out_cdef.val[1] = vmulq_f32 (q2_out_cdef.val[1], q_one_by_nfft);
459
460 vst2q_f32 (p_dst0, q2_out_0123);
461 vst2q_f32 (p_dst1, q2_out_4567);
462 vst2q_f32 (p_dst2, q2_out_89ab);
463 vst2q_f32 (p_dst3, q2_out_cdef);
464}
465
 466static inline void ne10_radix8x4_neon (ne10_fft_cpx_float32_t * Fout,
 467 ne10_fft_cpx_float32_t * Fin,
468 ne10_int32_t stride)
469{
470 ne10_int32_t f_count;
471 ne10_int32_t src_step = stride << 1;
472
473 const ne10_float32_t TW_81 = 0.70710678;
474 const ne10_float32_t TW_81N = -0.70710678;
475
476 float32_t *p_src, *p_dst;
477 float32x4x2_t q2_in0, q2_in1, q2_in2, q2_in3, q2_in4, q2_in5, q2_in6, q2_in7;
478 float32x4_t q_sin0_r, q_sin0_i, q_sin1_r, q_sin1_i, q_sin2_r, q_sin2_i, q_sin3_r, q_sin3_i;
479 float32x4_t q_sin4_r, q_sin4_i, q_sin5_r, q_sin5_i, q_sin6_r, q_sin6_i, q_sin7_r, q_sin7_i;
480 float32x4_t q_s3_r, q_s3_i, q_s5_r, q_s5_i, q_s7_r, q_s7_i;
481 float32x4_t q_s8_r, q_s8_i, q_s9_r, q_s9_i, q_s10_r, q_s10_i, q_s11_r, q_s11_i;
482 float32x4_t q_s12_r, q_s12_i, q_s13_r, q_s13_i, q_s14_r, q_s14_i, q_s15_r, q_s15_i;
483 float32x4_t q_out0_r, q_out0_i, q_out1_r, q_out1_i, q_out2_r, q_out2_i, q_out3_r, q_out3_i;
484 float32x4_t q_out4_r, q_out4_i, q_out5_r, q_out5_i, q_out6_r, q_out6_i, q_out7_r, q_out7_i;
485 float32x4x2_t q2_tmp0, q2_tmp1, q2_tmp2, q2_tmp3, q2_tmp4, q2_tmp5, q2_tmp6, q2_tmp7;
486 float32x4x2_t q2_out0, q2_out1, q2_out2, q2_out3, q2_out4, q2_out5, q2_out6, q2_out7;
487 float32x4_t q_tw_81, q_tw_81n;
488
489 p_src = (float32_t *) Fin;
490 p_dst = (float32_t *) Fout;
491
492 for (f_count = 0; f_count < stride; f_count += 4)
493 {
494 q2_in0 = vld2q_f32 (p_src);
495 p_src += src_step;
496 q2_in2 = vld2q_f32 (p_src);
497 p_src += src_step;
498 q2_in4 = vld2q_f32 (p_src);
499 p_src += src_step;
500 q2_in6 = vld2q_f32 (p_src);
501 p_src += src_step;
502 q2_in1 = vld2q_f32 (p_src);
503 p_src += src_step;
504 q2_in3 = vld2q_f32 (p_src);
505 p_src += src_step;
506 q2_in5 = vld2q_f32 (p_src);
507 p_src += src_step;
508 q2_in7 = vld2q_f32 (p_src);
509 p_src += src_step;
510
511 q_sin0_r = vaddq_f32 (q2_in0.val[0], q2_in1.val[0]);
512 q_sin0_i = vaddq_f32 (q2_in0.val[1], q2_in1.val[1]);
513 q_sin1_r = vsubq_f32 (q2_in0.val[0], q2_in1.val[0]);
514 q_sin1_i = vsubq_f32 (q2_in0.val[1], q2_in1.val[1]);
515 q_sin2_r = vaddq_f32 (q2_in2.val[0], q2_in3.val[0]);
516 q_sin2_i = vaddq_f32 (q2_in2.val[1], q2_in3.val[1]);
517 q_sin3_r = vsubq_f32 (q2_in2.val[0], q2_in3.val[0]);
518 q_sin3_i = vsubq_f32 (q2_in2.val[1], q2_in3.val[1]);
519 q_sin4_r = vaddq_f32 (q2_in4.val[0], q2_in5.val[0]);
520 q_sin4_i = vaddq_f32 (q2_in4.val[1], q2_in5.val[1]);
521 q_sin5_r = vsubq_f32 (q2_in4.val[0], q2_in5.val[0]);
522 q_sin5_i = vsubq_f32 (q2_in4.val[1], q2_in5.val[1]);
523 q_sin6_r = vaddq_f32 (q2_in6.val[0], q2_in7.val[0]);
524 q_sin6_i = vaddq_f32 (q2_in6.val[1], q2_in7.val[1]);
525 q_sin7_r = vsubq_f32 (q2_in6.val[0], q2_in7.val[0]);
526 q_sin7_i = vsubq_f32 (q2_in6.val[1], q2_in7.val[1]);
527
528 // radix 4 butterfly without twiddles
529 q_tw_81 = vdupq_n_f32 (TW_81);
530 q_tw_81n = vdupq_n_f32 (TW_81N);
531 q_s5_r = q_sin5_i;
532 q_s5_i = vnegq_f32 (q_sin5_r);
533 q_s3_r = vaddq_f32 (q_sin3_r, q_sin3_i);
534 q_s3_i = vsubq_f32 (q_sin3_i, q_sin3_r);
535 q_s7_r = vsubq_f32 (q_sin7_r, q_sin7_i);
536 q_s7_i = vaddq_f32 (q_sin7_i, q_sin7_r);
537 q_s3_r = vmulq_f32 (q_s3_r, q_tw_81);
538 q_s3_i = vmulq_f32 (q_s3_i, q_tw_81);
539 q_s7_r = vmulq_f32 (q_s7_r, q_tw_81n);
540 q_s7_i = vmulq_f32 (q_s7_i, q_tw_81n);
541
542 // radix 2 butterfly
543 q_s8_r = vaddq_f32 (q_sin0_r, q_sin4_r);
544 q_s8_i = vaddq_f32 (q_sin0_i, q_sin4_i);
545 q_s9_r = vaddq_f32 (q_sin1_r, q_s5_r);
546 q_s9_i = vaddq_f32 (q_sin1_i, q_s5_i);
547 q_s10_r = vsubq_f32 (q_sin0_r, q_sin4_r);
548 q_s10_i = vsubq_f32 (q_sin0_i, q_sin4_i);
549 q_s11_r = vsubq_f32 (q_sin1_r, q_s5_r);
550 q_s11_i = vsubq_f32 (q_sin1_i, q_s5_i);
551
552 // radix 2 butterfly
553 q_s12_r = vaddq_f32 (q_sin2_r, q_sin6_r);
554 q_s12_i = vaddq_f32 (q_sin2_i, q_sin6_i);
555 q_s13_r = vaddq_f32 (q_s3_r, q_s7_r);
556 q_s13_i = vaddq_f32 (q_s3_i, q_s7_i);
557 q_s14_r = vsubq_f32 (q_sin2_r, q_sin6_r);
558 q_s14_i = vsubq_f32 (q_sin2_i, q_sin6_i);
559 q_s15_r = vsubq_f32 (q_s3_r, q_s7_r);
560 q_s15_i = vsubq_f32 (q_s3_i, q_s7_i);
561
562 // third result
563 q_out4_r = vsubq_f32 (q_s8_r, q_s12_r);
564 q_out4_i = vsubq_f32 (q_s8_i, q_s12_i);
565 q_out5_r = vsubq_f32 (q_s9_r, q_s13_r);
566 q_out5_i = vsubq_f32 (q_s9_i, q_s13_i);
567
568 // first result
569 q_out0_r = vaddq_f32 (q_s8_r, q_s12_r);
570 q_out0_i = vaddq_f32 (q_s8_i, q_s12_i);
571 q_out1_r = vaddq_f32 (q_s9_r, q_s13_r);
572 q_out1_i = vaddq_f32 (q_s9_i, q_s13_i);
573
574 // second result
575 q_out2_r = vaddq_f32 (q_s10_r, q_s14_i);
576 q_out2_i = vsubq_f32 (q_s10_i, q_s14_r);
577 q_out3_r = vaddq_f32 (q_s11_r, q_s15_i);
578 q_out3_i = vsubq_f32 (q_s11_i, q_s15_r);
579
 580 // fourth result
581 q_out6_r = vsubq_f32 (q_s10_r, q_s14_i);
582 q_out6_i = vaddq_f32 (q_s10_i, q_s14_r);
583 q_out7_r = vsubq_f32 (q_s11_r, q_s15_i);
584 q_out7_i = vaddq_f32 (q_s11_i, q_s15_r);
585
586 q2_tmp0 = vtrnq_f32 (q_out0_r, q_out1_r);
587 q2_tmp1 = vtrnq_f32 (q_out0_i, q_out1_i);
588 q2_tmp2 = vtrnq_f32 (q_out2_r, q_out3_r);
589 q2_tmp3 = vtrnq_f32 (q_out2_i, q_out3_i);
590 q2_tmp4 = vtrnq_f32 (q_out4_r, q_out5_r);
591 q2_tmp5 = vtrnq_f32 (q_out4_i, q_out5_i);
592 q2_tmp6 = vtrnq_f32 (q_out6_r, q_out7_r);
593 q2_tmp7 = vtrnq_f32 (q_out6_i, q_out7_i);
594
595 q2_out0.val[0] = vcombine_f32 (vget_low_f32 (q2_tmp0.val[0]), vget_low_f32 (q2_tmp2.val[0]));
596 q2_out0.val[1] = vcombine_f32 (vget_low_f32 (q2_tmp1.val[0]), vget_low_f32 (q2_tmp3.val[0]));
597 q2_out2.val[0] = vcombine_f32 (vget_low_f32 (q2_tmp0.val[1]), vget_low_f32 (q2_tmp2.val[1]));
598 q2_out2.val[1] = vcombine_f32 (vget_low_f32 (q2_tmp1.val[1]), vget_low_f32 (q2_tmp3.val[1]));
599 q2_out4.val[0] = vcombine_f32 (vget_high_f32 (q2_tmp0.val[0]), vget_high_f32 (q2_tmp2.val[0]));
600 q2_out4.val[1] = vcombine_f32 (vget_high_f32 (q2_tmp1.val[0]), vget_high_f32 (q2_tmp3.val[0]));
601 q2_out6.val[0] = vcombine_f32 (vget_high_f32 (q2_tmp0.val[1]), vget_high_f32 (q2_tmp2.val[1]));
602 q2_out6.val[1] = vcombine_f32 (vget_high_f32 (q2_tmp1.val[1]), vget_high_f32 (q2_tmp3.val[1]));
603
604 q2_out1.val[0] = vcombine_f32 (vget_low_f32 (q2_tmp4.val[0]), vget_low_f32 (q2_tmp6.val[0]));
605 q2_out1.val[1] = vcombine_f32 (vget_low_f32 (q2_tmp5.val[0]), vget_low_f32 (q2_tmp7.val[0]));
606 q2_out3.val[0] = vcombine_f32 (vget_low_f32 (q2_tmp4.val[1]), vget_low_f32 (q2_tmp6.val[1]));
607 q2_out3.val[1] = vcombine_f32 (vget_low_f32 (q2_tmp5.val[1]), vget_low_f32 (q2_tmp7.val[1]));
608 q2_out5.val[0] = vcombine_f32 (vget_high_f32 (q2_tmp4.val[0]), vget_high_f32 (q2_tmp6.val[0]));
609 q2_out5.val[1] = vcombine_f32 (vget_high_f32 (q2_tmp5.val[0]), vget_high_f32 (q2_tmp7.val[0]));
610 q2_out7.val[0] = vcombine_f32 (vget_high_f32 (q2_tmp4.val[1]), vget_high_f32 (q2_tmp6.val[1]));
611 q2_out7.val[1] = vcombine_f32 (vget_high_f32 (q2_tmp5.val[1]), vget_high_f32 (q2_tmp7.val[1]));
612
613 // store
614 vst2q_f32 (p_dst, q2_out0);
615 p_dst += 8;
616 vst2q_f32 (p_dst, q2_out1);
617 p_dst += 8;
618 vst2q_f32 (p_dst, q2_out2);
619 p_dst += 8;
620 vst2q_f32 (p_dst, q2_out3);
621 p_dst += 8;
622 vst2q_f32 (p_dst, q2_out4);
623 p_dst += 8;
624 vst2q_f32 (p_dst, q2_out5);
625 p_dst += 8;
626 vst2q_f32 (p_dst, q2_out6);
627 p_dst += 8;
628 vst2q_f32 (p_dst, q2_out7);
629 p_dst += 8;
630
631 p_src = p_src - src_step * 8 + 8;
632 } // f_count
633}
634
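/*
 * Pointer arithmetic in these column loops: each complex sample is two
 * floats, so src_step = stride << 1 advances by 'stride' complex samples.
 * After the loads have moved p_src forward by (radix * src_step) floats,
 * "p_src = p_src - src_step * 8 + 8" (or "* 4 + 8" in the radix-4 loops)
 * rewinds to the starting position and steps to the next group of four
 * columns (8 floats = 4 complex samples). The vtrnq_f32/vcombine_f32 block
 * transposes the 4x4 tile of results so the outputs of each butterfly are
 * stored contiguously with vst2q_f32.
 */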
 635static inline void ne10_radix4x4_without_twiddles_neon (ne10_fft_cpx_float32_t * Fout,
 636 ne10_fft_cpx_float32_t * Fin,
637 ne10_int32_t stride)
638{
639 ne10_int32_t f_count;
640 ne10_int32_t src_step = stride << 1;
641
642 float32_t *p_src, *p_dst;
643 float32x4x2_t q2_in0, q2_in1, q2_in2, q2_in3;
644 float32x4_t q_s0_r, q_s0_i, q_s1_r, q_s1_i, q_s2_r, q_s2_i, q_s3_r, q_s3_i;
645 float32x4_t q_out0_r, q_out0_i, q_out1_r, q_out1_i, q_out2_r, q_out2_i, q_out3_r, q_out3_i;
646 float32x4x2_t q2_tmp0, q2_tmp1, q2_tmp2, q2_tmp3;
647 float32x4x2_t q2_out0, q2_out1, q2_out2, q2_out3;
648
649 p_src = (float32_t *) Fin;
650 p_dst = (float32_t *) Fout;
651
652 for (f_count = 0; f_count < stride; f_count += 4)
653 {
654 // load
655 q2_in0 = vld2q_f32 (p_src);
656 p_src += src_step;
657 q2_in1 = vld2q_f32 (p_src);
658 p_src += src_step;
659 q2_in2 = vld2q_f32 (p_src);
660 p_src += src_step;
661 q2_in3 = vld2q_f32 (p_src);
662 p_src += src_step;
663
664 // radix 4 butterfly without twiddles
665 q_s0_r = vaddq_f32 (q2_in0.val[0], q2_in2.val[0]);
666 q_s0_i = vaddq_f32 (q2_in0.val[1], q2_in2.val[1]);
667 q_s1_r = vsubq_f32 (q2_in0.val[0], q2_in2.val[0]);
668 q_s1_i = vsubq_f32 (q2_in0.val[1], q2_in2.val[1]);
669 q_s2_r = vaddq_f32 (q2_in1.val[0], q2_in3.val[0]);
670 q_s2_i = vaddq_f32 (q2_in1.val[1], q2_in3.val[1]);
671 q_s3_r = vsubq_f32 (q2_in1.val[0], q2_in3.val[0]);
672 q_s3_i = vsubq_f32 (q2_in1.val[1], q2_in3.val[1]);
673
674 // third result
675 q_out2_r = vsubq_f32 (q_s0_r, q_s2_r);
676 q_out2_i = vsubq_f32 (q_s0_i, q_s2_i);
677 q_out0_r = vaddq_f32 (q_s0_r, q_s2_r);
678 q_out0_i = vaddq_f32 (q_s0_i, q_s2_i);
679
680 q_out1_r = vaddq_f32 (q_s1_r, q_s3_i);
681 q_out1_i = vsubq_f32 (q_s1_i, q_s3_r);
682 q_out3_r = vsubq_f32 (q_s1_r, q_s3_i);
683 q_out3_i = vaddq_f32 (q_s1_i, q_s3_r);
684
685 q2_tmp0 = vtrnq_f32 (q_out0_r, q_out1_r);
686 q2_tmp1 = vtrnq_f32 (q_out0_i, q_out1_i);
687 q2_tmp2 = vtrnq_f32 (q_out2_r, q_out3_r);
688 q2_tmp3 = vtrnq_f32 (q_out2_i, q_out3_i);
689 q2_out0.val[0] = vcombine_f32 (vget_low_f32 (q2_tmp0.val[0]), vget_low_f32 (q2_tmp2.val[0]));
690 q2_out0.val[1] = vcombine_f32 (vget_low_f32 (q2_tmp1.val[0]), vget_low_f32 (q2_tmp3.val[0]));
691 q2_out1.val[0] = vcombine_f32 (vget_low_f32 (q2_tmp0.val[1]), vget_low_f32 (q2_tmp2.val[1]));
692 q2_out1.val[1] = vcombine_f32 (vget_low_f32 (q2_tmp1.val[1]), vget_low_f32 (q2_tmp3.val[1]));
693 q2_out2.val[0] = vcombine_f32 (vget_high_f32 (q2_tmp0.val[0]), vget_high_f32 (q2_tmp2.val[0]));
694 q2_out2.val[1] = vcombine_f32 (vget_high_f32 (q2_tmp1.val[0]), vget_high_f32 (q2_tmp3.val[0]));
695 q2_out3.val[0] = vcombine_f32 (vget_high_f32 (q2_tmp0.val[1]), vget_high_f32 (q2_tmp2.val[1]));
696 q2_out3.val[1] = vcombine_f32 (vget_high_f32 (q2_tmp1.val[1]), vget_high_f32 (q2_tmp3.val[1]));
697
698 // store
699 vst2q_f32 (p_dst, q2_out0);
700 p_dst += 8;
701 vst2q_f32 (p_dst, q2_out1);
702 p_dst += 8;
703 vst2q_f32 (p_dst, q2_out2);
704 p_dst += 8;
705 vst2q_f32 (p_dst, q2_out3);
706 p_dst += 8;
707
708 p_src = p_src - src_step * 4 + 8;
709 }
710}
711
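/*
 * The radix-4 butterfly computed above, written out per column (forward
 * direction):
 *
 *     s0 = x0 + x2;        s1 = x0 - x2;
 *     s2 = x1 + x3;        s3 = x1 - x3;
 *     out0 = s0 + s2;      out2 = s0 - s2;
 *     out1 = s1 - j*s3;    out3 = s1 + j*s3;
 *
 * Multiplying by -j maps (a + j*b) to (b - j*a), which is why the real part
 * of out1 is s1_r + s3_i and its imaginary part is s1_i - s3_r.
 */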
 712static inline void ne10_radix4x4_with_twiddles_neon (ne10_fft_cpx_float32_t * Fout,
 713 ne10_fft_cpx_float32_t * Fin,
 714 ne10_fft_cpx_float32_t * tw,
715 ne10_int32_t src_stride,
716 ne10_int32_t dst_stride,
717 ne10_int32_t mstride)
718{
719 ne10_int32_t m_count;
720 ne10_int32_t src_step = src_stride << 1;
721 ne10_int32_t dst_step = dst_stride << 1;
722 ne10_int32_t tw_step = mstride << 1;
723
724 float32_t *p_src, *p_dst, *p_tw;
725 float32x4x2_t q2_in0, q2_in1, q2_in2, q2_in3;
726 float32x4x2_t q2_tw0, q2_tw1, q2_tw2;
727 float32x4_t q_s1_r, q_s1_i, q_s2_r, q_s2_i, q_s3_r, q_s3_i;
728 float32x4_t q_s4_r, q_s4_i, q_s5_r, q_s5_i, q_s6_r, q_s6_i, q_s7_r, q_s7_i;
729 float32x4x2_t q2_out0, q2_out1, q2_out2, q2_out3;
730
731 p_src = (float32_t *) Fin;
732 p_dst = (float32_t *) Fout;
733 p_tw = (float32_t *) tw;
734
735 for (m_count = 0; m_count < mstride; m_count += 4)
736 {
737 // load
738 q2_in0 = vld2q_f32 (p_src);
739 p_src += src_step;
740 q2_in1 = vld2q_f32 (p_src);
741 p_src += src_step;
742 q2_in2 = vld2q_f32 (p_src);
743 p_src += src_step;
744 q2_in3 = vld2q_f32 (p_src);
745 p_src += src_step;
746
747 q2_tw0 = vld2q_f32 (p_tw);
748 p_tw += tw_step;
749 q2_tw1 = vld2q_f32 (p_tw);
750 p_tw += tw_step;
751 q2_tw2 = vld2q_f32 (p_tw);
752
753 q_s1_r = vmulq_f32 (q2_in1.val[0], q2_tw0.val[0]);
754 q_s1_i = vmulq_f32 (q2_in1.val[1], q2_tw0.val[0]);
755 q_s2_r = vmulq_f32 (q2_in2.val[0], q2_tw1.val[0]);
756 q_s2_i = vmulq_f32 (q2_in2.val[1], q2_tw1.val[0]);
757 q_s3_r = vmulq_f32 (q2_in3.val[0], q2_tw2.val[0]);
758 q_s3_i = vmulq_f32 (q2_in3.val[1], q2_tw2.val[0]);
759 q_s1_r = vmlsq_f32 (q_s1_r, q2_in1.val[1], q2_tw0.val[1]);
760 q_s1_i = vmlaq_f32 (q_s1_i, q2_in1.val[0], q2_tw0.val[1]);
761 q_s2_r = vmlsq_f32 (q_s2_r, q2_in2.val[1], q2_tw1.val[1]);
762 q_s2_i = vmlaq_f32 (q_s2_i, q2_in2.val[0], q2_tw1.val[1]);
763 q_s3_r = vmlsq_f32 (q_s3_r, q2_in3.val[1], q2_tw2.val[1]);
764 q_s3_i = vmlaq_f32 (q_s3_i, q2_in3.val[0], q2_tw2.val[1]);
765
766 q_s4_r = vaddq_f32 (q2_in0.val[0], q_s2_r);
767 q_s4_i = vaddq_f32 (q2_in0.val[1], q_s2_i);
768 q_s5_r = vsubq_f32 (q2_in0.val[0], q_s2_r);
769 q_s5_i = vsubq_f32 (q2_in0.val[1], q_s2_i);
770
771 q_s6_r = vaddq_f32 (q_s1_r, q_s3_r);
772 q_s6_i = vaddq_f32 (q_s1_i, q_s3_i);
773 q_s7_r = vsubq_f32 (q_s1_r, q_s3_r);
774 q_s7_i = vsubq_f32 (q_s1_i, q_s3_i);
775
776 q2_out2.val[0] = vsubq_f32 (q_s4_r, q_s6_r);
777 q2_out2.val[1] = vsubq_f32 (q_s4_i, q_s6_i);
778 q2_out0.val[0] = vaddq_f32 (q_s4_r, q_s6_r);
779 q2_out0.val[1] = vaddq_f32 (q_s4_i, q_s6_i);
780
781 q2_out1.val[0] = vaddq_f32 (q_s5_r, q_s7_i);
782 q2_out1.val[1] = vsubq_f32 (q_s5_i, q_s7_r);
783 q2_out3.val[0] = vsubq_f32 (q_s5_r, q_s7_i);
784 q2_out3.val[1] = vaddq_f32 (q_s5_i, q_s7_r);
785
786 // store
787 vst2q_f32 (p_dst, q2_out0);
788 p_dst += dst_step;
789 vst2q_f32 (p_dst, q2_out1);
790 p_dst += dst_step;
791 vst2q_f32 (p_dst, q2_out2);
792 p_dst += dst_step;
793 vst2q_f32 (p_dst, q2_out3);
794 p_dst += dst_step;
795
796 p_src = p_src - src_step * 4 + 8;
797 p_dst = p_dst - dst_step * 4 + 8;
798 p_tw = p_tw - tw_step * 2 + 8;
799 }
800}
 801static inline void ne10_radix8x4_inverse_neon (ne10_fft_cpx_float32_t * Fout,
 802 ne10_fft_cpx_float32_t * Fin,
803 ne10_int32_t stride)
804{
805 ne10_int32_t f_count;
806 ne10_int32_t src_step = stride << 1;
807
808 const ne10_float32_t TW_81 = 0.70710678;
809 const ne10_float32_t TW_81N = -0.70710678;
810
811 float32_t *p_src, *p_dst;
812 float32x4x2_t q2_in0, q2_in1, q2_in2, q2_in3, q2_in4, q2_in5, q2_in6, q2_in7;
813 float32x4_t q_sin0_r, q_sin0_i, q_sin1_r, q_sin1_i, q_sin2_r, q_sin2_i, q_sin3_r, q_sin3_i;
814 float32x4_t q_sin4_r, q_sin4_i, q_sin5_r, q_sin5_i, q_sin6_r, q_sin6_i, q_sin7_r, q_sin7_i;
815 float32x4_t q_s3_r, q_s3_i, q_s5_r, q_s5_i, q_s7_r, q_s7_i;
816 float32x4_t q_s8_r, q_s8_i, q_s9_r, q_s9_i, q_s10_r, q_s10_i, q_s11_r, q_s11_i;
817 float32x4_t q_s12_r, q_s12_i, q_s13_r, q_s13_i, q_s14_r, q_s14_i, q_s15_r, q_s15_i;
818 float32x4_t q_out0_r, q_out0_i, q_out1_r, q_out1_i, q_out2_r, q_out2_i, q_out3_r, q_out3_i;
819 float32x4_t q_out4_r, q_out4_i, q_out5_r, q_out5_i, q_out6_r, q_out6_i, q_out7_r, q_out7_i;
820 float32x4x2_t q2_tmp0, q2_tmp1, q2_tmp2, q2_tmp3, q2_tmp4, q2_tmp5, q2_tmp6, q2_tmp7;
821 float32x4x2_t q2_out0, q2_out1, q2_out2, q2_out3, q2_out4, q2_out5, q2_out6, q2_out7;
822 float32x4_t q_tw_81, q_tw_81n;
823
824 p_src = (float32_t *) Fin;
825 p_dst = (float32_t *) Fout;
826
827 for (f_count = 0; f_count < stride; f_count += 4)
828 {
829 q2_in0 = vld2q_f32 (p_src);
830 p_src += src_step;
831 q2_in2 = vld2q_f32 (p_src);
832 p_src += src_step;
833 q2_in4 = vld2q_f32 (p_src);
834 p_src += src_step;
835 q2_in6 = vld2q_f32 (p_src);
836 p_src += src_step;
837 q2_in1 = vld2q_f32 (p_src);
838 p_src += src_step;
839 q2_in3 = vld2q_f32 (p_src);
840 p_src += src_step;
841 q2_in5 = vld2q_f32 (p_src);
842 p_src += src_step;
843 q2_in7 = vld2q_f32 (p_src);
844 p_src += src_step;
845
846 q_sin0_r = vaddq_f32 (q2_in0.val[0], q2_in1.val[0]);
847 q_sin0_i = vaddq_f32 (q2_in0.val[1], q2_in1.val[1]);
848 q_sin1_r = vsubq_f32 (q2_in0.val[0], q2_in1.val[0]);
849 q_sin1_i = vsubq_f32 (q2_in0.val[1], q2_in1.val[1]);
850 q_sin2_r = vaddq_f32 (q2_in2.val[0], q2_in3.val[0]);
851 q_sin2_i = vaddq_f32 (q2_in2.val[1], q2_in3.val[1]);
852 q_sin3_r = vsubq_f32 (q2_in2.val[0], q2_in3.val[0]);
853 q_sin3_i = vsubq_f32 (q2_in2.val[1], q2_in3.val[1]);
854 q_sin4_r = vaddq_f32 (q2_in4.val[0], q2_in5.val[0]);
855 q_sin4_i = vaddq_f32 (q2_in4.val[1], q2_in5.val[1]);
856 q_sin5_r = vsubq_f32 (q2_in4.val[0], q2_in5.val[0]);
857 q_sin5_i = vsubq_f32 (q2_in4.val[1], q2_in5.val[1]);
858 q_sin6_r = vaddq_f32 (q2_in6.val[0], q2_in7.val[0]);
859 q_sin6_i = vaddq_f32 (q2_in6.val[1], q2_in7.val[1]);
860 q_sin7_r = vsubq_f32 (q2_in6.val[0], q2_in7.val[0]);
861 q_sin7_i = vsubq_f32 (q2_in6.val[1], q2_in7.val[1]);
862
863 // radix 4 butterfly without twiddles
864 q_tw_81 = vdupq_n_f32 (TW_81);
865 q_tw_81n = vdupq_n_f32 (TW_81N);
866 q_s5_r = vnegq_f32 (q_sin5_i);
867 q_s5_i = q_sin5_r;
868 q_s3_r = vsubq_f32 (q_sin3_r, q_sin3_i);
869 q_s3_i = vaddq_f32 (q_sin3_i, q_sin3_r);
870 q_s7_r = vaddq_f32 (q_sin7_r, q_sin7_i);
871 q_s7_i = vsubq_f32 (q_sin7_i, q_sin7_r);
872 q_s3_r = vmulq_f32 (q_s3_r, q_tw_81);
873 q_s3_i = vmulq_f32 (q_s3_i, q_tw_81);
874 q_s7_r = vmulq_f32 (q_s7_r, q_tw_81n);
875 q_s7_i = vmulq_f32 (q_s7_i, q_tw_81n);
876
877 // radix 2 butterfly
878 q_s8_r = vaddq_f32 (q_sin0_r, q_sin4_r);
879 q_s8_i = vaddq_f32 (q_sin0_i, q_sin4_i);
880 q_s9_r = vaddq_f32 (q_sin1_r, q_s5_r);
881 q_s9_i = vaddq_f32 (q_sin1_i, q_s5_i);
882 q_s10_r = vsubq_f32 (q_sin0_r, q_sin4_r);
883 q_s10_i = vsubq_f32 (q_sin0_i, q_sin4_i);
884 q_s11_r = vsubq_f32 (q_sin1_r, q_s5_r);
885 q_s11_i = vsubq_f32 (q_sin1_i, q_s5_i);
886
887 // radix 2 butterfly
888 q_s12_r = vaddq_f32 (q_sin2_r, q_sin6_r);
889 q_s12_i = vaddq_f32 (q_sin2_i, q_sin6_i);
890 q_s13_r = vaddq_f32 (q_s3_r, q_s7_r);
891 q_s13_i = vaddq_f32 (q_s3_i, q_s7_i);
892 q_s14_r = vsubq_f32 (q_sin2_r, q_sin6_r);
893 q_s14_i = vsubq_f32 (q_sin2_i, q_sin6_i);
894 q_s15_r = vsubq_f32 (q_s3_r, q_s7_r);
895 q_s15_i = vsubq_f32 (q_s3_i, q_s7_i);
896
897 // third result
898 q_out4_r = vsubq_f32 (q_s8_r, q_s12_r);
899 q_out4_i = vsubq_f32 (q_s8_i, q_s12_i);
900 q_out5_r = vsubq_f32 (q_s9_r, q_s13_r);
901 q_out5_i = vsubq_f32 (q_s9_i, q_s13_i);
902
903 // first result
904 q_out0_r = vaddq_f32 (q_s8_r, q_s12_r);
905 q_out0_i = vaddq_f32 (q_s8_i, q_s12_i);
906 q_out1_r = vaddq_f32 (q_s9_r, q_s13_r);
907 q_out1_i = vaddq_f32 (q_s9_i, q_s13_i);
908
909 // second result
910 q_out2_r = vsubq_f32 (q_s10_r, q_s14_i);
911 q_out2_i = vaddq_f32 (q_s10_i, q_s14_r);
912 q_out3_r = vsubq_f32 (q_s11_r, q_s15_i);
913 q_out3_i = vaddq_f32 (q_s11_i, q_s15_r);
914
 915 // fourth result
916 q_out6_r = vaddq_f32 (q_s10_r, q_s14_i);
917 q_out6_i = vsubq_f32 (q_s10_i, q_s14_r);
918 q_out7_r = vaddq_f32 (q_s11_r, q_s15_i);
919 q_out7_i = vsubq_f32 (q_s11_i, q_s15_r);
920
921 q2_tmp0 = vtrnq_f32 (q_out0_r, q_out1_r);
922 q2_tmp1 = vtrnq_f32 (q_out0_i, q_out1_i);
923 q2_tmp2 = vtrnq_f32 (q_out2_r, q_out3_r);
924 q2_tmp3 = vtrnq_f32 (q_out2_i, q_out3_i);
925 q2_tmp4 = vtrnq_f32 (q_out4_r, q_out5_r);
926 q2_tmp5 = vtrnq_f32 (q_out4_i, q_out5_i);
927 q2_tmp6 = vtrnq_f32 (q_out6_r, q_out7_r);
928 q2_tmp7 = vtrnq_f32 (q_out6_i, q_out7_i);
929
930 q2_out0.val[0] = vcombine_f32 (vget_low_f32 (q2_tmp0.val[0]), vget_low_f32 (q2_tmp2.val[0]));
931 q2_out0.val[1] = vcombine_f32 (vget_low_f32 (q2_tmp1.val[0]), vget_low_f32 (q2_tmp3.val[0]));
932 q2_out2.val[0] = vcombine_f32 (vget_low_f32 (q2_tmp0.val[1]), vget_low_f32 (q2_tmp2.val[1]));
933 q2_out2.val[1] = vcombine_f32 (vget_low_f32 (q2_tmp1.val[1]), vget_low_f32 (q2_tmp3.val[1]));
934 q2_out4.val[0] = vcombine_f32 (vget_high_f32 (q2_tmp0.val[0]), vget_high_f32 (q2_tmp2.val[0]));
935 q2_out4.val[1] = vcombine_f32 (vget_high_f32 (q2_tmp1.val[0]), vget_high_f32 (q2_tmp3.val[0]));
936 q2_out6.val[0] = vcombine_f32 (vget_high_f32 (q2_tmp0.val[1]), vget_high_f32 (q2_tmp2.val[1]));
937 q2_out6.val[1] = vcombine_f32 (vget_high_f32 (q2_tmp1.val[1]), vget_high_f32 (q2_tmp3.val[1]));
938
939 q2_out1.val[0] = vcombine_f32 (vget_low_f32 (q2_tmp4.val[0]), vget_low_f32 (q2_tmp6.val[0]));
940 q2_out1.val[1] = vcombine_f32 (vget_low_f32 (q2_tmp5.val[0]), vget_low_f32 (q2_tmp7.val[0]));
941 q2_out3.val[0] = vcombine_f32 (vget_low_f32 (q2_tmp4.val[1]), vget_low_f32 (q2_tmp6.val[1]));
942 q2_out3.val[1] = vcombine_f32 (vget_low_f32 (q2_tmp5.val[1]), vget_low_f32 (q2_tmp7.val[1]));
943 q2_out5.val[0] = vcombine_f32 (vget_high_f32 (q2_tmp4.val[0]), vget_high_f32 (q2_tmp6.val[0]));
944 q2_out5.val[1] = vcombine_f32 (vget_high_f32 (q2_tmp5.val[0]), vget_high_f32 (q2_tmp7.val[0]));
945 q2_out7.val[0] = vcombine_f32 (vget_high_f32 (q2_tmp4.val[1]), vget_high_f32 (q2_tmp6.val[1]));
946 q2_out7.val[1] = vcombine_f32 (vget_high_f32 (q2_tmp5.val[1]), vget_high_f32 (q2_tmp7.val[1]));
947
948 // store
949 vst2q_f32 (p_dst, q2_out0);
950 p_dst += 8;
951 vst2q_f32 (p_dst, q2_out1);
952 p_dst += 8;
953 vst2q_f32 (p_dst, q2_out2);
954 p_dst += 8;
955 vst2q_f32 (p_dst, q2_out3);
956 p_dst += 8;
957 vst2q_f32 (p_dst, q2_out4);
958 p_dst += 8;
959 vst2q_f32 (p_dst, q2_out5);
960 p_dst += 8;
961 vst2q_f32 (p_dst, q2_out6);
962 p_dst += 8;
963 vst2q_f32 (p_dst, q2_out7);
964 p_dst += 8;
965
966 p_src = p_src - src_step * 8 + 8;
967 } // f_count
968}
969
 970static inline void ne10_radix4x4_inverse_without_twiddles_neon (ne10_fft_cpx_float32_t * Fout,
 971 ne10_fft_cpx_float32_t * Fin,
972 ne10_int32_t stride)
973{
974 ne10_int32_t f_count;
975 ne10_int32_t src_step = stride << 1;
976
977 float32_t *p_src, *p_dst;
978 float32x4x2_t q2_in0, q2_in1, q2_in2, q2_in3;
979 float32x4_t q_s0_r, q_s0_i, q_s1_r, q_s1_i, q_s2_r, q_s2_i, q_s3_r, q_s3_i;
980 float32x4_t q_out0_r, q_out0_i, q_out1_r, q_out1_i, q_out2_r, q_out2_i, q_out3_r, q_out3_i;
981 float32x4x2_t q2_tmp0, q2_tmp1, q2_tmp2, q2_tmp3;
982 float32x4x2_t q2_out0, q2_out1, q2_out2, q2_out3;
983
984 p_src = (float32_t *) Fin;
985 p_dst = (float32_t *) Fout;
986
987 for (f_count = 0; f_count < stride; f_count += 4)
988 {
989 // load
990 q2_in0 = vld2q_f32 (p_src);
991 p_src += src_step;
992 q2_in1 = vld2q_f32 (p_src);
993 p_src += src_step;
994 q2_in2 = vld2q_f32 (p_src);
995 p_src += src_step;
996 q2_in3 = vld2q_f32 (p_src);
997 p_src += src_step;
998
999 // radix 4 butterfly without twiddles
1000 q_s0_r = vaddq_f32 (q2_in0.val[0], q2_in2.val[0]);
1001 q_s0_i = vaddq_f32 (q2_in0.val[1], q2_in2.val[1]);
1002 q_s1_r = vsubq_f32 (q2_in0.val[0], q2_in2.val[0]);
1003 q_s1_i = vsubq_f32 (q2_in0.val[1], q2_in2.val[1]);
1004 q_s2_r = vaddq_f32 (q2_in1.val[0], q2_in3.val[0]);
1005 q_s2_i = vaddq_f32 (q2_in1.val[1], q2_in3.val[1]);
1006 q_s3_r = vsubq_f32 (q2_in1.val[0], q2_in3.val[0]);
1007 q_s3_i = vsubq_f32 (q2_in1.val[1], q2_in3.val[1]);
1008
1009 q_out2_r = vsubq_f32 (q_s0_r, q_s2_r);
1010 q_out2_i = vsubq_f32 (q_s0_i, q_s2_i);
1011 q_out0_r = vaddq_f32 (q_s0_r, q_s2_r);
1012 q_out0_i = vaddq_f32 (q_s0_i, q_s2_i);
1013
1014 q_out1_r = vsubq_f32 (q_s1_r, q_s3_i);
1015 q_out1_i = vaddq_f32 (q_s1_i, q_s3_r);
1016 q_out3_r = vaddq_f32 (q_s1_r, q_s3_i);
1017 q_out3_i = vsubq_f32 (q_s1_i, q_s3_r);
1018
1019 q2_tmp0 = vtrnq_f32 (q_out0_r, q_out1_r);
1020 q2_tmp1 = vtrnq_f32 (q_out0_i, q_out1_i);
1021 q2_tmp2 = vtrnq_f32 (q_out2_r, q_out3_r);
1022 q2_tmp3 = vtrnq_f32 (q_out2_i, q_out3_i);
1023 q2_out0.val[0] = vcombine_f32 (vget_low_f32 (q2_tmp0.val[0]), vget_low_f32 (q2_tmp2.val[0]));
1024 q2_out0.val[1] = vcombine_f32 (vget_low_f32 (q2_tmp1.val[0]), vget_low_f32 (q2_tmp3.val[0]));
1025 q2_out1.val[0] = vcombine_f32 (vget_low_f32 (q2_tmp0.val[1]), vget_low_f32 (q2_tmp2.val[1]));
1026 q2_out1.val[1] = vcombine_f32 (vget_low_f32 (q2_tmp1.val[1]), vget_low_f32 (q2_tmp3.val[1]));
1027 q2_out2.val[0] = vcombine_f32 (vget_high_f32 (q2_tmp0.val[0]), vget_high_f32 (q2_tmp2.val[0]));
1028 q2_out2.val[1] = vcombine_f32 (vget_high_f32 (q2_tmp1.val[0]), vget_high_f32 (q2_tmp3.val[0]));
1029 q2_out3.val[0] = vcombine_f32 (vget_high_f32 (q2_tmp0.val[1]), vget_high_f32 (q2_tmp2.val[1]));
1030 q2_out3.val[1] = vcombine_f32 (vget_high_f32 (q2_tmp1.val[1]), vget_high_f32 (q2_tmp3.val[1]));
1031
1032 // store
1033 vst2q_f32 (p_dst, q2_out0);
1034 p_dst += 8;
1035 vst2q_f32 (p_dst, q2_out1);
1036 p_dst += 8;
1037 vst2q_f32 (p_dst, q2_out2);
1038 p_dst += 8;
1039 vst2q_f32 (p_dst, q2_out3);
1040 p_dst += 8;
1041
1042 p_src = p_src - src_step * 4 + 8;
1043 }
1044}
1045
 1046static inline void ne10_radix4x4_inverse_with_twiddles_neon (ne10_fft_cpx_float32_t * Fout,
 1047 ne10_fft_cpx_float32_t * Fin,
 1048 ne10_fft_cpx_float32_t * tw,
1049 ne10_int32_t src_stride,
1050 ne10_int32_t dst_stride,
1051 ne10_int32_t mstride)
1052{
1053 ne10_int32_t m_count;
1054 ne10_int32_t src_step = src_stride << 1;
1055 ne10_int32_t dst_step = dst_stride << 1;
1056 ne10_int32_t tw_step = mstride << 1;
1057
1058 float32_t *p_src, *p_dst, *p_tw;
1059 float32x4x2_t q2_in0, q2_in1, q2_in2, q2_in3;
1060 float32x4x2_t q2_tw0, q2_tw1, q2_tw2;
1061 float32x4_t q_s1_r, q_s1_i, q_s2_r, q_s2_i, q_s3_r, q_s3_i;
1062 float32x4_t q_s4_r, q_s4_i, q_s5_r, q_s5_i, q_s6_r, q_s6_i, q_s7_r, q_s7_i;
1063 float32x4x2_t q2_out0, q2_out1, q2_out2, q2_out3;
1064
1065 p_src = (float32_t *) Fin;
1066 p_dst = (float32_t *) Fout;
1067 p_tw = (float32_t *) tw;
1068
1069 for (m_count = 0; m_count < mstride; m_count += 4)
1070 {
1071 // load
1072 q2_in0 = vld2q_f32 (p_src);
1073 p_src += src_step;
1074 q2_in1 = vld2q_f32 (p_src);
1075 p_src += src_step;
1076 q2_in2 = vld2q_f32 (p_src);
1077 p_src += src_step;
1078 q2_in3 = vld2q_f32 (p_src);
1079 p_src += src_step;
1080
1081 q2_tw0 = vld2q_f32 (p_tw);
1082 p_tw += tw_step;
1083 q2_tw1 = vld2q_f32 (p_tw);
1084 p_tw += tw_step;
1085 q2_tw2 = vld2q_f32 (p_tw);
1086
1087 q_s1_r = vmulq_f32 (q2_in1.val[0], q2_tw0.val[0]);
1088 q_s1_i = vmulq_f32 (q2_in1.val[1], q2_tw0.val[0]);
1089 q_s2_r = vmulq_f32 (q2_in2.val[0], q2_tw1.val[0]);
1090 q_s2_i = vmulq_f32 (q2_in2.val[1], q2_tw1.val[0]);
1091 q_s3_r = vmulq_f32 (q2_in3.val[0], q2_tw2.val[0]);
1092 q_s3_i = vmulq_f32 (q2_in3.val[1], q2_tw2.val[0]);
1093 q_s1_r = vmlaq_f32 (q_s1_r, q2_in1.val[1], q2_tw0.val[1]);
1094 q_s1_i = vmlsq_f32 (q_s1_i, q2_in1.val[0], q2_tw0.val[1]);
1095 q_s2_r = vmlaq_f32 (q_s2_r, q2_in2.val[1], q2_tw1.val[1]);
1096 q_s2_i = vmlsq_f32 (q_s2_i, q2_in2.val[0], q2_tw1.val[1]);
1097 q_s3_r = vmlaq_f32 (q_s3_r, q2_in3.val[1], q2_tw2.val[1]);
1098 q_s3_i = vmlsq_f32 (q_s3_i, q2_in3.val[0], q2_tw2.val[1]);
1099
1100 q_s4_r = vaddq_f32 (q2_in0.val[0], q_s2_r);
1101 q_s4_i = vaddq_f32 (q2_in0.val[1], q_s2_i);
1102 q_s5_r = vsubq_f32 (q2_in0.val[0], q_s2_r);
1103 q_s5_i = vsubq_f32 (q2_in0.val[1], q_s2_i);
1104
1105 q_s6_r = vaddq_f32 (q_s1_r, q_s3_r);
1106 q_s6_i = vaddq_f32 (q_s1_i, q_s3_i);
1107 q_s7_r = vsubq_f32 (q_s1_r, q_s3_r);
1108 q_s7_i = vsubq_f32 (q_s1_i, q_s3_i);
1109
1110 q2_out2.val[0] = vsubq_f32 (q_s4_r, q_s6_r);
1111 q2_out2.val[1] = vsubq_f32 (q_s4_i, q_s6_i);
1112 q2_out0.val[0] = vaddq_f32 (q_s4_r, q_s6_r);
1113 q2_out0.val[1] = vaddq_f32 (q_s4_i, q_s6_i);
1114
1115 q2_out1.val[0] = vsubq_f32 (q_s5_r, q_s7_i);
1116 q2_out1.val[1] = vaddq_f32 (q_s5_i, q_s7_r);
1117 q2_out3.val[0] = vaddq_f32 (q_s5_r, q_s7_i);
1118 q2_out3.val[1] = vsubq_f32 (q_s5_i, q_s7_r);
1119
1120 // store
1121 vst2q_f32 (p_dst, q2_out0);
1122 p_dst += dst_step;
1123 vst2q_f32 (p_dst, q2_out1);
1124 p_dst += dst_step;
1125 vst2q_f32 (p_dst, q2_out2);
1126 p_dst += dst_step;
1127 vst2q_f32 (p_dst, q2_out3);
1128 p_dst += dst_step;
1129
1130 p_src = p_src - src_step * 4 + 8;
1131 p_dst = p_dst - dst_step * 4 + 8;
1132 p_tw = p_tw - tw_step * 2 + 8;
1133 }
1134}
1135
 1136static inline void ne10_radix4x4_inverse_with_twiddles_last_stage_neon (ne10_fft_cpx_float32_t * Fout,
 1137 ne10_fft_cpx_float32_t * Fin,
 1138 ne10_fft_cpx_float32_t * tw,
1139 ne10_int32_t src_stride,
1140 ne10_int32_t dst_stride,
1141 ne10_int32_t mstride,
1142 ne10_int32_t nfft)
1143{
1144 ne10_int32_t m_count;
1145 ne10_int32_t src_step = src_stride << 1;
1146 ne10_int32_t dst_step = dst_stride << 1;
1147 ne10_int32_t tw_step = mstride << 1;
1148 ne10_float32_t one_by_nfft = (1.0f / (ne10_float32_t) nfft);
1149
1150 float32_t *p_src, *p_dst, *p_tw;
1151 float32x4x2_t q2_in0, q2_in1, q2_in2, q2_in3;
1152 float32x4x2_t q2_tw0, q2_tw1, q2_tw2;
1153 float32x4_t q_s1_r, q_s1_i, q_s2_r, q_s2_i, q_s3_r, q_s3_i;
1154 float32x4_t q_s4_r, q_s4_i, q_s5_r, q_s5_i, q_s6_r, q_s6_i, q_s7_r, q_s7_i;
1155 float32x4x2_t q2_out0, q2_out1, q2_out2, q2_out3;
1156 float32x4_t q_one_by_nfft = vdupq_n_f32 (one_by_nfft);
1157
1158 p_src = (float32_t *) Fin;
1159 p_dst = (float32_t *) Fout;
1160 p_tw = (float32_t *) tw;
1161
1162 for (m_count = 0; m_count < mstride; m_count += 4)
1163 {
1164 // load
1165 q2_in0 = vld2q_f32 (p_src);
1166 p_src += src_step;
1167 q2_in1 = vld2q_f32 (p_src);
1168 p_src += src_step;
1169 q2_in2 = vld2q_f32 (p_src);
1170 p_src += src_step;
1171 q2_in3 = vld2q_f32 (p_src);
1172 p_src += src_step;
1173
1174 q2_tw0 = vld2q_f32 (p_tw);
1175 p_tw += tw_step;
1176 q2_tw1 = vld2q_f32 (p_tw);
1177 p_tw += tw_step;
1178 q2_tw2 = vld2q_f32 (p_tw);
1179
1180 q_s1_r = vmulq_f32 (q2_in1.val[0], q2_tw0.val[0]);
1181 q_s1_i = vmulq_f32 (q2_in1.val[1], q2_tw0.val[0]);
1182 q_s2_r = vmulq_f32 (q2_in2.val[0], q2_tw1.val[0]);
1183 q_s2_i = vmulq_f32 (q2_in2.val[1], q2_tw1.val[0]);
1184 q_s3_r = vmulq_f32 (q2_in3.val[0], q2_tw2.val[0]);
1185 q_s3_i = vmulq_f32 (q2_in3.val[1], q2_tw2.val[0]);
1186 q_s1_r = vmlaq_f32 (q_s1_r, q2_in1.val[1], q2_tw0.val[1]);
1187 q_s1_i = vmlsq_f32 (q_s1_i, q2_in1.val[0], q2_tw0.val[1]);
1188 q_s2_r = vmlaq_f32 (q_s2_r, q2_in2.val[1], q2_tw1.val[1]);
1189 q_s2_i = vmlsq_f32 (q_s2_i, q2_in2.val[0], q2_tw1.val[1]);
1190 q_s3_r = vmlaq_f32 (q_s3_r, q2_in3.val[1], q2_tw2.val[1]);
1191 q_s3_i = vmlsq_f32 (q_s3_i, q2_in3.val[0], q2_tw2.val[1]);
1192
1193 q_s4_r = vaddq_f32 (q2_in0.val[0], q_s2_r);
1194 q_s4_i = vaddq_f32 (q2_in0.val[1], q_s2_i);
1195 q_s5_r = vsubq_f32 (q2_in0.val[0], q_s2_r);
1196 q_s5_i = vsubq_f32 (q2_in0.val[1], q_s2_i);
1197
1198 q_s6_r = vaddq_f32 (q_s1_r, q_s3_r);
1199 q_s6_i = vaddq_f32 (q_s1_i, q_s3_i);
1200 q_s7_r = vsubq_f32 (q_s1_r, q_s3_r);
1201 q_s7_i = vsubq_f32 (q_s1_i, q_s3_i);
1202
1203 q2_out2.val[0] = vsubq_f32 (q_s4_r, q_s6_r);
1204 q2_out2.val[1] = vsubq_f32 (q_s4_i, q_s6_i);
1205 q2_out0.val[0] = vaddq_f32 (q_s4_r, q_s6_r);
1206 q2_out0.val[1] = vaddq_f32 (q_s4_i, q_s6_i);
1207
1208 q2_out1.val[0] = vsubq_f32 (q_s5_r, q_s7_i);
1209 q2_out1.val[1] = vaddq_f32 (q_s5_i, q_s7_r);
1210 q2_out3.val[0] = vaddq_f32 (q_s5_r, q_s7_i);
1211 q2_out3.val[1] = vsubq_f32 (q_s5_i, q_s7_r);
1212
1213 q2_out0.val[0] = vmulq_f32 (q2_out0.val[0], q_one_by_nfft);
1214 q2_out0.val[1] = vmulq_f32 (q2_out0.val[1], q_one_by_nfft);
1215 q2_out1.val[0] = vmulq_f32 (q2_out1.val[0], q_one_by_nfft);
1216 q2_out1.val[1] = vmulq_f32 (q2_out1.val[1], q_one_by_nfft);
1217 q2_out2.val[0] = vmulq_f32 (q2_out2.val[0], q_one_by_nfft);
1218 q2_out2.val[1] = vmulq_f32 (q2_out2.val[1], q_one_by_nfft);
1219 q2_out3.val[0] = vmulq_f32 (q2_out3.val[0], q_one_by_nfft);
1220 q2_out3.val[1] = vmulq_f32 (q2_out3.val[1], q_one_by_nfft);
1221
1222 // store
1223 vst2q_f32 (p_dst, q2_out0);
1224 p_dst += dst_step;
1225 vst2q_f32 (p_dst, q2_out1);
1226 p_dst += dst_step;
1227 vst2q_f32 (p_dst, q2_out2);
1228 p_dst += dst_step;
1229 vst2q_f32 (p_dst, q2_out3);
1230 p_dst += dst_step;
1231
1232 p_src = p_src - src_step * 4 + 8;
1233 p_dst = p_dst - dst_step * 4 + 8;
1234 p_tw = p_tw - tw_step * 2 + 8;
1235 }
1236}
1237
 1238void ne10_mixed_radix_fft_forward_float32_neon (ne10_fft_cpx_float32_t * Fout,
 1239 ne10_fft_cpx_float32_t * Fin,
1240 ne10_int32_t * factors,
1241 ne10_fft_cpx_float32_t * twiddles,
1242 ne10_fft_cpx_float32_t * buffer)
1243{
1244 ne10_int32_t fstride, mstride, N;
1245 ne10_int32_t fstride1;
1246 ne10_int32_t f_count;
1247 ne10_int32_t stage_count;
1248
1249 ne10_fft_cpx_float32_t *Fin1, *Fout1;
 1250 ne10_fft_cpx_float32_t *Fout_ls = Fout;
 1251 ne10_fft_cpx_float32_t *Ftmp;
1252 ne10_fft_cpx_float32_t *tw, *tw1;
1253
1254 // init fstride, mstride, N
1255 stage_count = factors[0];
1256 fstride = factors[1];
1257 mstride = factors[ (stage_count << 1) - 1 ];
1258 N = factors[ stage_count << 1 ]; // radix
1259
1260 // the first stage
1261 Fin1 = Fin;
1262 Fout1 = Fout;
1263 if (N == 2) // length of FFT is 2^n (n is odd)
1264 {
1265 // radix 8
1266 N = fstride >> 1; // 1/4 of length of FFT
1267 tw = twiddles;
1268 fstride1 = fstride >> 2;
1269
1270 ne10_radix8x4_neon (Fout, Fin, fstride1);
1271
1272 tw += 6;
1273 mstride <<= 2;
1274 fstride >>= 4;
1275 stage_count -= 2;
1276
1277 // swap
1278 Ftmp = buffer;
1279 buffer = Fout;
1280 Fout = Ftmp;
1281 }
1282 else if (N == 4) // length of FFT is 2^n (n is even)
1283 {
1284 //fstride is nfft>>2
1285 ne10_radix4x4_without_twiddles_neon (Fout, Fin, fstride);
1286
1287 N = fstride; // 1/4 of length of FFT
1288
1289 // swap
1290 Ftmp = buffer;
1291 buffer = Fout;
1292 Fout = Ftmp;
1293
1294 // update address for other stages
1295 stage_count--;
1296 tw = twiddles;
1297 fstride >>= 2;
1298 // end of first stage
1299 }
1300
1301
 1302 // all stages except the last one
1303 for (; stage_count > 1 ; stage_count--)
1304 {
1305 Fin1 = buffer;
1306 for (f_count = 0; f_count < fstride; f_count ++)
1307 {
1308 Fout1 = & Fout[ f_count * mstride << 2 ];
1309 tw1 = tw;
1310 ne10_radix4x4_with_twiddles_neon (Fout1, Fin1, tw1, N, mstride, mstride);
1311 Fin1 += mstride;
1312 } // f_count
1313 tw += mstride * 3;
1314 mstride <<= 2;
1315 fstride >>= 2;
1316
1317 // swap
1318 Ftmp = buffer;
1319 buffer = Fout;
1320 Fout = Ftmp;
1321 } // stage_count
1322
1323 // the last one
1324 if (stage_count)
1325 {
1326 Fin1 = buffer;
1327 // if stage count is even, output to the input array
1328 Fout1 = Fout_ls;
1329
1330 for (f_count = 0; f_count < fstride; f_count ++)
1331 {
1332 tw1 = tw;
1333 ne10_radix4x4_with_twiddles_neon (Fout1, Fin1, tw1, N, N, mstride);
1334 Fin1 += mstride;
1335 Fout1 += mstride;
1336 } // f_count
1337 } // last stage
1338}
1339
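/*
 * The factors[] array (filled in when the FFT plan is created) is consumed
 * here as follows: factors[0] is the stage count, factors[1] the initial
 * fstride (nfft divided by the first-stage radix), factors[2*stage_count]
 * the radix recorded for the first stage (2 is handled as a radix-8 first
 * stage, 4 as radix-4), and factors[2*stage_count - 1] the initial mstride.
 * Fout and buffer are ping-ponged after every stage so that the final stage
 * writes into the caller's output array (Fout_ls).
 */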
 1340void ne10_mixed_radix_fft_backward_float32_neon (ne10_fft_cpx_float32_t * Fout,
 1341 ne10_fft_cpx_float32_t * Fin,
1342 ne10_int32_t * factors,
1343 ne10_fft_cpx_float32_t * twiddles,
1344 ne10_fft_cpx_float32_t * buffer)
1345{
1346 ne10_int32_t fstride, mstride, N;
1347 ne10_int32_t fstride1;
1348 ne10_int32_t f_count;
1349 ne10_int32_t stage_count;
1350 ne10_int32_t nfft;
1351
1352 ne10_fft_cpx_float32_t *Fin1, *Fout1;
 1353 ne10_fft_cpx_float32_t *Fout_ls = Fout;
 1354 ne10_fft_cpx_float32_t *Ftmp;
1355 ne10_fft_cpx_float32_t *tw, *tw1;
1356
1357 // init fstride, mstride, N
1358 stage_count = factors[0];
1359 fstride = factors[1];
1360 mstride = factors[ (stage_count << 1) - 1 ];
1361 N = factors[ stage_count << 1 ]; // radix
1362 nfft = fstride * N;
1363
1364 // the first stage
1365 Fin1 = Fin;
1366 Fout1 = Fout;
1367 if (N == 2) // length of FFT is 2^n (n is odd)
1368 {
1369 // radix 8
1370 N = fstride >> 1; // 1/4 of length of FFT
1371 tw = twiddles;
1372 fstride1 = fstride >> 2;
1373
1374 ne10_radix8x4_inverse_neon (Fout, Fin, fstride1);
1375
1376 tw += 6;
1377 mstride <<= 2;
1378 fstride >>= 4;
1379 stage_count -= 2;
1380
1381 // swap
1382 Ftmp = buffer;
1383 buffer = Fout;
1384 Fout = Ftmp;
1385 }
1386 else if (N == 4) // length of FFT is 2^n (n is even)
1387 {
1388 //fstride is nfft>>2
1389 ne10_radix4x4_inverse_without_twiddles_neon (Fout, Fin, fstride);
1390
1391 N = fstride; // 1/4 of length of FFT
1392
1393 // swap
1394 Ftmp = buffer;
1395 buffer = Fout;
1396 Fout = Ftmp;
1397
1398 // update address for other stages
1399 stage_count--;
1400 tw = twiddles;
1401 fstride >>= 2;
1402 // end of first stage
1403 }
1404
 1405 // all stages except the last one
1406 for (; stage_count > 1 ; stage_count--)
1407 {
1408 Fin1 = buffer;
1409 for (f_count = 0; f_count < fstride; f_count ++)
1410 {
1411 Fout1 = & Fout[ f_count * mstride << 2 ];
1412 tw1 = tw;
1413 ne10_radix4x4_inverse_with_twiddles_neon (Fout1, Fin1, tw1, N, mstride, mstride);
1414 Fin1 += mstride;
1415 } // f_count
1416 tw += mstride * 3;
1417 mstride <<= 2;
1418 fstride >>= 2;
1419
1420 // swap
1421 Ftmp = buffer;
1422 buffer = Fout;
1423 Fout = Ftmp;
1424 } // stage_count
1425
1426 // the last one
1427 if (stage_count)
1428 {
1429 Fin1 = buffer;
1430 // if stage count is even, output to the input array
1431 Fout1 = Fout_ls;
1432
1433 for (f_count = 0; f_count < fstride; f_count ++)
1434 {
1435 tw1 = tw;
1436 ne10_radix4x4_inverse_with_twiddles_last_stage_neon (Fout1, Fin1, tw1, N, N, mstride, nfft);
1437 Fin1 += mstride;
1438 Fout1 += mstride;
1439 } // f_count
1440 } // last stage
1441}
 1442
 1459void ne10_fft_c2c_1d_float32_neon (ne10_fft_cpx_float32_t *fout,
 1460 ne10_fft_cpx_float32_t *fin,
 1461 ne10_fft_cfg_float32_t cfg,
1462 ne10_int32_t inverse_fft)
1463{
 1464 // For inputs shorter than 16, fall back to the C version.
 1465 // We would not get much improvement from NEON for these cases.
1466 if (cfg->nfft < 16)
1467 {
1468 ne10_fft_c2c_1d_float32_c (fout, fin, cfg, inverse_fft);
1469 return;
1470 }
1471
1472 ne10_int32_t stage_count = cfg->factors[0];
1473 ne10_int32_t algorithm_flag = cfg->factors[2 * (stage_count + 1)];
1474
1475 assert ((algorithm_flag == NE10_FFT_ALG_24)
1476 || (algorithm_flag == NE10_FFT_ALG_ANY));
1477
1478 // For NE10_FFT_ALG_ANY.
 1479 // The function returns inside this branch.
1480 if (algorithm_flag == NE10_FFT_ALG_ANY)
1481 {
1482 if (inverse_fft)
1483 {
1484 ne10_mixed_radix_generic_butterfly_inverse_float32_neon (fout, fin,
1485 cfg->factors, cfg->twiddles, cfg->buffer, cfg->is_backward_scaled);
1486 }
1487 else
1488 {
1489 ne10_mixed_radix_generic_butterfly_float32_neon (fout, fin,
1490 cfg->factors, cfg->twiddles, cfg->buffer, cfg->is_forward_scaled);
1491 }
1492 return;
1493 }
1494
 1495 // Since the function has passed the assertion and skipped the branch above,
 1496 // algorithm_flag must be NE10_FFT_ALG_24.
1497 if (inverse_fft)
1498 {
1499 switch (cfg->nfft)
1500 {
1501 case 4:
1502 ne10_fft4_backward_float32 (fout, fin);
1503 break;
1504 case 8:
1505 ne10_fft8_backward_float32 (fout, fin);
1506 break;
1507 case 16:
1508 ne10_fft16_backward_float32_neon (fout, fin, cfg->twiddles);
1509 break;
1510 default:
1511 ne10_mixed_radix_fft_backward_float32_neon (fout, fin, cfg->factors, cfg->twiddles, cfg->buffer);
1512 break;
1513 }
1514 }
1515 else
1516 {
1517 switch (cfg->nfft)
1518 {
1519 case 4:
1520 ne10_fft4_forward_float32 (fout, fin);
1521 break;
1522 case 8:
1523 ne10_fft8_forward_float32 (fout, fin);
1524 break;
1525 case 16:
1526 ne10_fft16_forward_float32_neon (fout, fin, cfg->twiddles);
1527 break;
1528 default:
1529 ne10_mixed_radix_fft_forward_float32_neon (fout, fin, cfg->factors, cfg->twiddles, cfg->buffer);
1530 break;
1531 }
1532 }
1533}
1534
//end of C2C_FFT_IFFT group
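/*
 * Usage sketch (not part of this file). Names follow the Ne10 C2C FFT API;
 * depending on the Ne10 version the plan may instead be created with a
 * _c/_neon-suffixed alloc function:
 *
 *     #include "NE10.h"
 *
 *     ne10_fft_cfg_float32_t cfg = ne10_fft_alloc_c2c_float32 (1024);
 *     ne10_fft_cpx_float32_t src[1024], dst[1024];
 *     // ... fill src with interleaved (r, i) samples ...
 *     ne10_fft_c2c_1d_float32_neon (dst, src, cfg, 0);   // forward FFT
 *     ne10_fft_c2c_1d_float32_neon (src, dst, cfg, 1);   // inverse FFT
 *     ne10_fft_destroy_c2c_float32 (cfg);
 */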