/*
 * Project Ne10 — An Open Optimized Software Library Project for the
 * ARM Architecture.
 * File: NE10_fft.neonintrinsic.h
 */
1/*
2 * Copyright 2014-15 ARM Limited and Contributors.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 * * Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of ARM Limited nor the
13 * names of its contributors may be used to endorse or promote products
14 * derived from this software without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 * DISCLAIMED. IN NO EVENT SHALL ARM LIMITED AND CONTRIBUTORS BE LIABLE FOR ANY
20 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28/*
29 * NE10 Library : dsp/NE10_fft.neonintrinsic.h
30 */
31
32#ifndef NE10_FFT_NEONINTRINSIC_H
33#define NE10_FFT_NEONINTRINSIC_H
34
35#include "NE10_fft.h"
36#include <arm_neon.h>
37
/*
 * Element-wise complex add/subtract over four packed complex values.
 * Z, A, B are float32x4x2_t: .val[0] holds the four real parts and
 * .val[1] the four imaginary parts. Z = A + B and Z = A - B respectively.
 *
 * Fix: no semicolon after "while (0)" — the stray semicolon defeated the
 * do{}while(0) idiom and broke use inside an unbraced if/else. Callers
 * terminate the invocation with their own ';' (as all call sites in this
 * header already do).
 */
#define NE10_CPX_ADD_NEON_F32(Z,A,B) do {   \
    Z.val[0] = A.val[0] + B.val[0];         \
    Z.val[1] = A.val[1] + B.val[1];         \
} while (0)

#define NE10_CPX_SUB_NEON_F32(Z,A,B) do {   \
    Z.val[0] = A.val[0] - B.val[0];         \
    Z.val[1] = A.val[1] - B.val[1];         \
} while (0)
47
/*
 * Complex multiply, four lanes at once: Z = A * B.
 *   Re(Z) = Re(A)*Re(B) - Im(A)*Im(B)   (vmlsq folds the subtract-multiply)
 *   Im(Z) = Re(A)*Im(B) + Im(A)*Re(B)   (vmlaq folds the add-multiply)
 * Fix: removed the stray semicolon after while(0) so the macro is a single
 * statement safe in unbraced if/else; callers supply their own ';'.
 */
#define NE10_CPX_MUL_NEON_F32(Z,A,B) do {                   \
    float32x4_t ARBR = vmulq_f32( A.val[0], B.val[0] );     \
    float32x4_t ARBI = vmulq_f32( A.val[0], B.val[1] );     \
    Z.val[0] = vmlsq_f32(ARBR, A.val[1], B.val[1]);         \
    Z.val[1] = vmlaq_f32(ARBI, A.val[1], B.val[0]);         \
} while (0)

/*
 * Conjugate complex multiply, four lanes at once: Z = A * conj(B).
 *   Re(Z) = Re(A)*Re(B) + Im(A)*Im(B)
 *   Im(Z) = Im(A)*Re(B) - Re(A)*Im(B)
 * Used by the inverse (C2R) twiddle stages below.
 */
#define NE10_CPX_MUL_INV_NEON_F32(Z,A,B) do {               \
    float32x4_t ARBR = vmulq_f32( A.val[0], B.val[0] );     \
    float32x4_t AIBI = vmulq_f32( A.val[1], B.val[1] );     \
    float32x4_t ARBI = vmulq_f32( A.val[0], B.val[1] );     \
    float32x4_t AIBR = vmulq_f32( A.val[1], B.val[0] );     \
    Z.val[0] = vaddq_f32(ARBR,AIBI);                        \
    Z.val[1] = vsubq_f32(AIBR,ARBI);                        \
} while (0)
63
/*
 * Radix-2 butterfly: O1 = I1 + I2, O2 = I1 - I2 (complex, 4 lanes).
 * Fix: the original expanded to NE10_CPX_ADD_NEON / NE10_CPX_SUB_NEON,
 * names that are not defined in this header — any use failed to compile.
 * Redirected to the _F32 variants defined above. Also dropped the stray
 * semicolon after while(0).
 */
#define NE10_BUTTERFLY_NEON_F32(O1,O2,I1,I2) do {   \
    NE10_CPX_ADD_NEON_F32(O1,I1,I2);                \
    NE10_CPX_SUB_NEON_F32(O2,I1,I2);                \
} while (0)
68
/*
 * Declare N sequentially numbered variables NAME0 .. NAME{N-1} of TYPE,
 * e.g. NE10_DECLARE_4(float32x4_t, q_in) yields q_in0..q_in3.
 * Used by the kernel macros below to create banks of NEON registers.
 * Deliberately NOT wrapped in do{}while(0): the declarations must land in
 * the invoking scope so later statements can reference them.
 */
#define NE10_DECLARE_2(TYPE,NAME)   TYPE NAME ## 0; \
                                    TYPE NAME ## 1;

#define NE10_DECLARE_3(TYPE,NAME)   NE10_DECLARE_2(TYPE,NAME); \
                                    TYPE NAME ## 2;

#define NE10_DECLARE_4(TYPE,NAME)   NE10_DECLARE_3(TYPE,NAME); \
                                    TYPE NAME ## 3;

#define NE10_DECLARE_8(TYPE,NAME)   NE10_DECLARE_4(TYPE,NAME); \
                                    TYPE NAME ## 4; \
                                    TYPE NAME ## 5; \
                                    TYPE NAME ## 6; \
                                    TYPE NAME ## 7;
83
/*
 * Reverse the four lanes of a float32x4_t (a,b,c,d) -> (d,c,b,a):
 * vrev64q swaps within each 64-bit half, then the halves are swapped.
 * In-place and copying variants. Fix: removed the stray semicolon after
 * while(0) (do{}while(0) idiom); callers supply their own ';'.
 */
#define NE10_REVERSE_FLOAT32X4(VECTOR4F) do {                                           \
    VECTOR4F = vrev64q_f32(VECTOR4F);                                                   \
    VECTOR4F = vcombine_f32( vget_high_f32( VECTOR4F ), vget_low_f32( VECTOR4F ) );     \
} while (0)

#define NE10_REVERSE_OUT_FLOAT32X4(VECTOR4F_OUT,VECTOR4F) do {                          \
    float32x4_t Q_TMP = vrev64q_f32(VECTOR4F);                                          \
    VECTOR4F_OUT = vcombine_f32( vget_high_f32( Q_TMP ), vget_low_f32( Q_TMP ) );       \
} while (0)
93
/*
 * 4x4 transpose of four complex quad-vectors (float32x4x2_t), performed
 * independently on the real (.val[0]) and imaginary (.val[1]) planes via
 * vtrnq + vcombine. Q2_OUT##k lane j receives Q2_IN##j lane k.
 * Fix: removed the stray semicolon after while(0); callers supply ';'.
 */
#define NE10_RADIX4X4C_TRANSPOSE_NEON(Q2_OUT,Q2_IN) do {                                                    \
    NE10_DECLARE_4(float32x4x2_t,q2_tmp);                                                                   \
    q2_tmp0 = vtrnq_f32 (Q2_IN ## 0 .val[0], Q2_IN ## 1 .val[0]);                                           \
    q2_tmp1 = vtrnq_f32 (Q2_IN ## 0 .val[1], Q2_IN ## 1 .val[1]);                                           \
    q2_tmp2 = vtrnq_f32 (Q2_IN ## 2 .val[0], Q2_IN ## 3 .val[0]);                                           \
    q2_tmp3 = vtrnq_f32 (Q2_IN ## 2 .val[1], Q2_IN ## 3 .val[1]);                                           \
    Q2_OUT ## 0 .val[0] = vcombine_f32 (vget_low_f32 (q2_tmp0.val[0]), vget_low_f32 (q2_tmp2.val[0]));      \
    Q2_OUT ## 0 .val[1] = vcombine_f32 (vget_low_f32 (q2_tmp1.val[0]), vget_low_f32 (q2_tmp3.val[0]));      \
    Q2_OUT ## 1 .val[0] = vcombine_f32 (vget_low_f32 (q2_tmp0.val[1]), vget_low_f32 (q2_tmp2.val[1]));      \
    Q2_OUT ## 1 .val[1] = vcombine_f32 (vget_low_f32 (q2_tmp1.val[1]), vget_low_f32 (q2_tmp3.val[1]));      \
    Q2_OUT ## 2 .val[0] = vcombine_f32 (vget_high_f32 (q2_tmp0.val[0]), vget_high_f32 (q2_tmp2.val[0]));    \
    Q2_OUT ## 2 .val[1] = vcombine_f32 (vget_high_f32 (q2_tmp1.val[0]), vget_high_f32 (q2_tmp3.val[0]));    \
    Q2_OUT ## 3 .val[0] = vcombine_f32 (vget_high_f32 (q2_tmp0.val[1]), vget_high_f32 (q2_tmp2.val[1]));    \
    Q2_OUT ## 3 .val[1] = vcombine_f32 (vget_high_f32 (q2_tmp1.val[1]), vget_high_f32 (q2_tmp3.val[1]));    \
} while (0)
109
110#define VDUPQ_N_F32(VAR) { VAR, VAR, VAR, VAR }
111
/* 0.70710678 ~= 1/sqrt(2) = cos(pi/4): the radix-8 twiddle factor, and
 * its negation. Kept as double literals (converted to float at init). */
#define CONST_TW_81 0.70710678
#define CONST_TW_81N -0.70710678

/* Four-lane splats of the twiddles; static const per translation unit. */
static const float32x4_t Q_TW_81 = VDUPQ_N_F32(CONST_TW_81 );
static const float32x4_t Q_TW_81N = VDUPQ_N_F32(CONST_TW_81N);

/* 1.4142136f ~= sqrt(2): scaling used by the inverse (C2R) kernels. */
#define DIV_TW81 1.4142136f
#define DIV_TW81N -1.4142136f

static const float32x4_t DIV_TW81_NEON = VDUPQ_N_F32(DIV_TW81);
static const float32x4_t DIV_TW81N_NEON = VDUPQ_N_F32(DIV_TW81N);
123
/*
 * Radix-8 real-to-complex kernel, stage 1: butterflies between vector
 * pairs (k, k+4); outputs 3 and 7 are pre-scaled by +/- 1/sqrt(2)
 * (Q_TW_81 / Q_TW_81N). Q_IN/Q_OUT name banks of 8 float32x4_t created
 * with NE10_DECLARE_8. Fix: removed the stray semicolon after while(0).
 */
#define NE10_RADIX8x4_R2C_NEON_KERNEL_S1(Q_OUT,Q_IN) do {   \
    Q_OUT ## 0 = vaddq_f32 (Q_IN ## 0, Q_IN ## 4);          \
    Q_OUT ## 1 = vsubq_f32 (Q_IN ## 0, Q_IN ## 4);          \
    Q_OUT ## 2 = vaddq_f32 (Q_IN ## 1, Q_IN ## 5);          \
    Q_OUT ## 3 = vsubq_f32 (Q_IN ## 1, Q_IN ## 5);          \
    Q_OUT ## 4 = vaddq_f32 (Q_IN ## 2, Q_IN ## 6);          \
    Q_OUT ## 5 = vsubq_f32 (Q_IN ## 2, Q_IN ## 6);          \
    Q_OUT ## 6 = vaddq_f32 (Q_IN ## 3, Q_IN ## 7);          \
    Q_OUT ## 7 = vsubq_f32 (Q_IN ## 3, Q_IN ## 7);          \
    Q_OUT ## 3 = vmulq_f32 (Q_OUT ## 3, Q_TW_81 );          \
    Q_OUT ## 7 = vmulq_f32 (Q_OUT ## 7, Q_TW_81N);          \
} while (0)

/*
 * Radix-8 real-to-complex kernel, stage 2: combines the stage-1 outputs
 * into the final packed spectrum ordering (uses 4 scratch vectors).
 */
#define NE10_RADIX8x4_R2C_NEON_KERNEL_S2(Q_OUT,Q_IN) do {   \
    NE10_DECLARE_4(float32x4_t,Q_S);                        \
    Q_S0 = vaddq_f32 (Q_IN ## 0, Q_IN ## 4);                \
    Q_S1 = vaddq_f32 (Q_IN ## 2, Q_IN ## 6);                \
    Q_S2 = vsubq_f32 (Q_IN ## 7, Q_IN ## 3);                \
    Q_S3 = vaddq_f32 (Q_IN ## 3, Q_IN ## 7);                \
    Q_OUT ## 0 = vaddq_f32 ( Q_S0, Q_S1 );                  \
    Q_OUT ## 1 = vaddq_f32 ( Q_IN ## 1, Q_S3 );             \
    Q_OUT ## 2 = vsubq_f32 ( Q_S2, Q_IN ## 5 );             \
    Q_OUT ## 3 = vsubq_f32 ( Q_IN ## 0, Q_IN ## 4 );        \
    Q_OUT ## 4 = vsubq_f32 ( Q_IN ## 6, Q_IN ## 2 );        \
    Q_OUT ## 5 = vsubq_f32 ( Q_IN ## 1, Q_S3 );             \
    Q_OUT ## 6 = vaddq_f32 ( Q_IN ## 5, Q_S2 );             \
    Q_OUT ## 7 = vsubq_f32 ( Q_S0, Q_S1 );                  \
} while (0)
152
/*
 * Radix-8 complex-to-real (inverse) kernel, stage 1: undoes the R2C
 * stage-2 combination using 8 scratch vectors. Note inputs 3 and 4 are
 * doubled (x+x) rather than scaled later. Fix: removed the stray
 * semicolon after while(0); callers supply their own ';'.
 */
#define NE10_RADIX8x4_C2R_NEON_KERNEL_S1(Q_OUT,Q_IN) do {   \
    NE10_DECLARE_8(float32x4_t,Q_S_IN);                     \
    Q_S_IN0 = vaddq_f32(Q_IN ## 0, Q_IN ## 7);              \
    Q_S_IN1 = vsubq_f32(Q_IN ## 0, Q_IN ## 7);              \
    Q_S_IN2 = vaddq_f32(Q_IN ## 1, Q_IN ## 5);              \
    Q_S_IN3 = vsubq_f32(Q_IN ## 1, Q_IN ## 5);              \
    Q_S_IN4 = vaddq_f32(Q_IN ## 6, Q_IN ## 2);              \
    Q_S_IN5 = vsubq_f32(Q_IN ## 6, Q_IN ## 2);              \
    Q_S_IN6 = vaddq_f32(Q_IN ## 3, Q_IN ## 3);              \
    Q_S_IN7 = vaddq_f32(Q_IN ## 4, Q_IN ## 4);              \
    Q_OUT ## 0 = vaddq_f32(Q_S_IN0, Q_S_IN6);               \
    Q_OUT ## 1 = vaddq_f32(Q_S_IN2, Q_S_IN2);               \
    Q_OUT ## 2 = vsubq_f32(Q_S_IN1, Q_S_IN7);               \
    Q_OUT ## 3 = vsubq_f32(Q_S_IN3, Q_S_IN4);               \
    Q_OUT ## 4 = vsubq_f32(Q_S_IN0, Q_S_IN6);               \
    Q_OUT ## 5 = vaddq_f32(Q_S_IN5, Q_S_IN5);               \
    Q_OUT ## 6 = vaddq_f32(Q_S_IN1, Q_S_IN7);               \
    Q_OUT ## 7 = vaddq_f32(Q_S_IN4, Q_S_IN3);               \
} while (0)

/*
 * Radix-8 complex-to-real (inverse) kernel, stage 2: scales inputs 3/7
 * by +/- sqrt(2) (DIV_TW81_NEON / DIV_TW81N_NEON), then performs the
 * final add/sub butterflies. NOTE: clobbers Q_IN##3 and Q_IN##7.
 */
#define NE10_RADIX8x4_C2R_NEON_KERNEL_S2(Q_OUT,Q_IN) do {   \
    Q_IN ## 3 = vmulq_f32(Q_IN ## 3,DIV_TW81_NEON);         \
    Q_IN ## 7 = vmulq_f32(Q_IN ## 7,DIV_TW81N_NEON);        \
    Q_OUT ## 0 = vaddq_f32(Q_IN ## 0, Q_IN ## 1);           \
    Q_OUT ## 4 = vsubq_f32(Q_IN ## 0, Q_IN ## 1);           \
    Q_OUT ## 1 = vaddq_f32(Q_IN ## 2, Q_IN ## 3);           \
    Q_OUT ## 5 = vsubq_f32(Q_IN ## 2, Q_IN ## 3);           \
    Q_OUT ## 2 = vaddq_f32(Q_IN ## 4, Q_IN ## 5);           \
    Q_OUT ## 6 = vsubq_f32(Q_IN ## 4, Q_IN ## 5);           \
    Q_OUT ## 3 = vaddq_f32(Q_IN ## 6, Q_IN ## 7);           \
    Q_OUT ## 7 = vsubq_f32(Q_IN ## 6, Q_IN ## 7);           \
} while (0)
185
/*
 * Post-scaling helpers for the inverse transforms. EIGH_NEON and
 * QUAD_NEON are expected to be defined by the including translation
 * unit (presumably 1/8 and 1/4 splat vectors — confirm at call sites).
 * Fix: removed the stray semicolon after while(0) on each macro.
 */
#define NE10_RADIX8x4_C2R_NEON_KERNEL_SCALE(Q_OUT) do {     \
    Q_OUT ## 0 = vmulq_f32( Q_OUT ## 0, EIGH_NEON);         \
    Q_OUT ## 1 = vmulq_f32( Q_OUT ## 1, EIGH_NEON);         \
    Q_OUT ## 2 = vmulq_f32( Q_OUT ## 2, EIGH_NEON);         \
    Q_OUT ## 3 = vmulq_f32( Q_OUT ## 3, EIGH_NEON);         \
    Q_OUT ## 4 = vmulq_f32( Q_OUT ## 4, EIGH_NEON);         \
    Q_OUT ## 5 = vmulq_f32( Q_OUT ## 5, EIGH_NEON);         \
    Q_OUT ## 6 = vmulq_f32( Q_OUT ## 6, EIGH_NEON);         \
    Q_OUT ## 7 = vmulq_f32( Q_OUT ## 7, EIGH_NEON);         \
} while (0)

#define NE10_RADIX4x4_C2R_NEON_KERNEL_SCALE(Q_OUT) do {     \
    Q_OUT ## 0 = vmulq_f32( Q_OUT ## 0, QUAD_NEON);         \
    Q_OUT ## 1 = vmulq_f32( Q_OUT ## 1, QUAD_NEON);         \
    Q_OUT ## 2 = vmulq_f32( Q_OUT ## 2, QUAD_NEON);         \
    Q_OUT ## 3 = vmulq_f32( Q_OUT ## 3, QUAD_NEON);         \
} while (0)

/* Same as above, for complex (float32x4x2_t) banks: scales both the
 * real (.val[0]) and imaginary (.val[1]) planes by QUAD_NEON. */
#define NE10_RADIX4x4_C2R_TW_NEON_KERNEL_SCALE(Q2_OUT) do {             \
    Q2_OUT ## 0 .val[0] = vmulq_f32( Q2_OUT ## 0 .val[0], QUAD_NEON);   \
    Q2_OUT ## 1 .val[0] = vmulq_f32( Q2_OUT ## 1 .val[0], QUAD_NEON);   \
    Q2_OUT ## 2 .val[0] = vmulq_f32( Q2_OUT ## 2 .val[0], QUAD_NEON);   \
    Q2_OUT ## 3 .val[0] = vmulq_f32( Q2_OUT ## 3 .val[0], QUAD_NEON);   \
    Q2_OUT ## 0 .val[1] = vmulq_f32( Q2_OUT ## 0 .val[1], QUAD_NEON);   \
    Q2_OUT ## 1 .val[1] = vmulq_f32( Q2_OUT ## 1 .val[1], QUAD_NEON);   \
    Q2_OUT ## 2 .val[1] = vmulq_f32( Q2_OUT ## 2 .val[1], QUAD_NEON);   \
    Q2_OUT ## 3 .val[1] = vmulq_f32( Q2_OUT ## 3 .val[1], QUAD_NEON);   \
} while (0)
214
/*
 * Full radix-8 R2C kernel: stage 1 into a scratch bank, stage 2 into
 * Q_OUT. Fix: removed the stray semicolon after while(0) on each macro.
 */
#define NE10_RADIX8x4_R2C_NEON_KERNEL(Q_OUT,Q_IN) do {      \
    NE10_DECLARE_8(float32x4_t,Q_S_IN);                     \
    NE10_RADIX8x4_R2C_NEON_KERNEL_S1(Q_S_IN,Q_IN);          \
    NE10_RADIX8x4_R2C_NEON_KERNEL_S2(Q_OUT,Q_S_IN);         \
} while (0)

/*
 * Radix-4 R2C kernel over 4 real vectors:
 * out0 = sum, out1/out2 = the two difference terms, out3 = (0+2)-(1+3).
 */
#define NE10_RADIX4x4_R2C_NEON_KERNEL(Q_OUT,Q_IN) do {      \
    NE10_DECLARE_4(float32x4_t,Q_S_IN);                     \
    Q_S_IN0 = vaddq_f32 (Q_IN ## 0, Q_IN ## 2);             \
    Q_S_IN1 = vaddq_f32 (Q_IN ## 1, Q_IN ## 3);             \
    Q_OUT ## 0 = vaddq_f32 (Q_S_IN0, Q_S_IN1);              \
    Q_OUT ## 1 = vsubq_f32 (Q_IN ## 0, Q_IN ## 2);          \
    Q_OUT ## 2 = vsubq_f32 (Q_IN ## 3, Q_IN ## 1);          \
    Q_OUT ## 3 = vsubq_f32 (Q_S_IN0, Q_S_IN1);              \
} while (0)

/*
 * Radix-4 C2R (inverse) kernel: inputs 1 and 2 are doubled (x+x) as part
 * of the inverse scaling scheme, then recombined by add/sub butterflies.
 */
#define NE10_RADIX4x4_C2R_NEON_KERNEL(Q_OUT,Q_IN) do {      \
    NE10_DECLARE_4(float32x4_t,Q_S_IN);                     \
    Q_S_IN0 = vaddq_f32 (Q_IN ## 0, Q_IN ## 3);             \
    Q_S_IN1 = vsubq_f32 (Q_IN ## 0, Q_IN ## 3);             \
    Q_S_IN2 = vaddq_f32 (Q_IN ## 1, Q_IN ## 1);             \
    Q_S_IN3 = vaddq_f32 (Q_IN ## 2, Q_IN ## 2);             \
    Q_OUT ## 0 = vaddq_f32 (Q_S_IN0, Q_S_IN2);              \
    Q_OUT ## 1 = vsubq_f32 (Q_S_IN1, Q_S_IN3);              \
    Q_OUT ## 2 = vsubq_f32 (Q_S_IN0, Q_S_IN2);              \
    Q_OUT ## 3 = vaddq_f32 (Q_S_IN1, Q_S_IN3);              \
} while (0)

/*
 * Full radix-8 C2R kernel: stage 1 into a scratch bank, stage 2 into
 * Q_OUT. (Scratch name differs from the R2C variant so the two kernels
 * can be expanded in the same scope without shadowing.)
 */
#define NE10_RADIX8x4_C2R_NEON_KERNEL(Q_OUT,Q_IN) do {              \
    NE10_DECLARE_8(float32x4_t,Q_S_IN_C2R_KERNEL);                  \
    NE10_RADIX8x4_C2R_NEON_KERNEL_S1(Q_S_IN_C2R_KERNEL,Q_IN);       \
    NE10_RADIX8x4_C2R_NEON_KERNEL_S2(Q_OUT,Q_S_IN_C2R_KERNEL);      \
} while (0)
248
/*
 * Strided load of 8 (resp. 4) float32x4_t vectors. NOTE: mutates PTR_IN,
 * advancing it by IN_STEP floats after every load — the pointer points
 * one stride past the last loaded vector on exit.
 * Fix: removed the stray semicolon after while(0) on each macro.
 */
#define NE10_RADIX8x4_R2C_NEON_LOAD(PTR_IN,Q_IN,IN_STEP) do {   \
    Q_IN ## 0 = vld1q_f32( (ne10_float32_t*) ( PTR_IN ) );      \
    PTR_IN += IN_STEP;                                          \
    Q_IN ## 1 = vld1q_f32( (ne10_float32_t*) ( PTR_IN ) );      \
    PTR_IN += IN_STEP;                                          \
    Q_IN ## 2 = vld1q_f32( (ne10_float32_t*) ( PTR_IN ) );      \
    PTR_IN += IN_STEP;                                          \
    Q_IN ## 3 = vld1q_f32( (ne10_float32_t*) ( PTR_IN ) );      \
    PTR_IN += IN_STEP;                                          \
    Q_IN ## 4 = vld1q_f32( (ne10_float32_t*) ( PTR_IN ) );      \
    PTR_IN += IN_STEP;                                          \
    Q_IN ## 5 = vld1q_f32( (ne10_float32_t*) ( PTR_IN ) );      \
    PTR_IN += IN_STEP;                                          \
    Q_IN ## 6 = vld1q_f32( (ne10_float32_t*) ( PTR_IN ) );      \
    PTR_IN += IN_STEP;                                          \
    Q_IN ## 7 = vld1q_f32( (ne10_float32_t*) ( PTR_IN ) );      \
    PTR_IN += IN_STEP;                                          \
} while (0)

#define NE10_RADIX4x4_R2C_NEON_LOAD(PTR_IN,Q_IN,IN_STEP) do {   \
    Q_IN ## 0 = vld1q_f32( (ne10_float32_t*) ( PTR_IN ) );      \
    PTR_IN += IN_STEP;                                          \
    Q_IN ## 1 = vld1q_f32( (ne10_float32_t*) ( PTR_IN ) );      \
    PTR_IN += IN_STEP;                                          \
    Q_IN ## 2 = vld1q_f32( (ne10_float32_t*) ( PTR_IN ) );      \
    PTR_IN += IN_STEP;                                          \
    Q_IN ## 3 = vld1q_f32( (ne10_float32_t*) ( PTR_IN ) );      \
    PTR_IN += IN_STEP;                                          \
} while (0)

/*
 * Strided store of 8 (resp. 4) float32x4_t vectors. Unlike the LOAD
 * macros, PTR_OUT is NOT modified — offsets are computed per store.
 */
#define NE10_RADIX8x4_R2C_NEON_STORE(PTR_OUT,Q_OUT,OUT_STEP) do {           \
    vst1q_f32( (ne10_float32_t*) ( PTR_OUT + 0 * OUT_STEP ), Q_OUT ## 0);   \
    vst1q_f32( (ne10_float32_t*) ( PTR_OUT + 1 * OUT_STEP ), Q_OUT ## 1);   \
    vst1q_f32( (ne10_float32_t*) ( PTR_OUT + 2 * OUT_STEP ), Q_OUT ## 2);   \
    vst1q_f32( (ne10_float32_t*) ( PTR_OUT + 3 * OUT_STEP ), Q_OUT ## 3);   \
    vst1q_f32( (ne10_float32_t*) ( PTR_OUT + 4 * OUT_STEP ), Q_OUT ## 4);   \
    vst1q_f32( (ne10_float32_t*) ( PTR_OUT + 5 * OUT_STEP ), Q_OUT ## 5);   \
    vst1q_f32( (ne10_float32_t*) ( PTR_OUT + 6 * OUT_STEP ), Q_OUT ## 6);   \
    vst1q_f32( (ne10_float32_t*) ( PTR_OUT + 7 * OUT_STEP ), Q_OUT ## 7);   \
} while (0)

#define NE10_RADIX4x4_R2C_NEON_STORE(PTR_OUT,Q_OUT,OUT_STEP) do {           \
    vst1q_f32( (ne10_float32_t*) ( PTR_OUT + 0 * OUT_STEP ), Q_OUT ## 0);   \
    vst1q_f32( (ne10_float32_t*) ( PTR_OUT + 1 * OUT_STEP ), Q_OUT ## 1);   \
    vst1q_f32( (ne10_float32_t*) ( PTR_OUT + 2 * OUT_STEP ), Q_OUT ## 2);   \
    vst1q_f32( (ne10_float32_t*) ( PTR_OUT + 3 * OUT_STEP ), Q_OUT ## 3);   \
} while (0)
296
/*
 * Twiddle multiplication for the radix-4 stages. Vector 0 passes through
 * (its twiddle is 1+0i); vectors 1..3 are multiplied by twiddles 0..2.
 * The C2R variant uses the conjugate multiply for the inverse transform.
 * Fix: removed the stray semicolon after while(0) on each macro.
 */
#define NE10_RADIX4x4_R2C_TW_MUL_NEON(Q2_OUT,Q2_IN,Q2_TW) do {      \
    Q2_OUT ## 0 = Q2_IN ## 0;                                       \
    NE10_CPX_MUL_NEON_F32(Q2_OUT ## 1,Q2_IN ## 1,Q2_TW ## 0);       \
    NE10_CPX_MUL_NEON_F32(Q2_OUT ## 2,Q2_IN ## 2,Q2_TW ## 1);       \
    NE10_CPX_MUL_NEON_F32(Q2_OUT ## 3,Q2_IN ## 3,Q2_TW ## 2);       \
} while (0)

#define NE10_RADIX4x4_C2R_TW_MUL_NEON(Q2_OUT,Q2_IN,Q2_TW) do {      \
    Q2_OUT ## 0 = Q2_IN ## 0;                                       \
    NE10_CPX_MUL_INV_NEON_F32(Q2_OUT ## 1,Q2_IN ## 1,Q2_TW ## 0);   \
    NE10_CPX_MUL_INV_NEON_F32(Q2_OUT ## 2,Q2_IN ## 2,Q2_TW ## 1);   \
    NE10_CPX_MUL_INV_NEON_F32(Q2_OUT ## 3,Q2_IN ## 3,Q2_TW ## 2);   \
} while (0)
310
/*
 * Radix-4 R2C twiddled kernel, stage 1: complex butterflies between
 * vector pairs (0,2) and (1,3). Fix: removed the stray semicolon after
 * while(0) on each macro.
 */
#define NE10_RADIX4x4_R2C_TW_NEON_KERNEL_S1(Q2_OUT,Q2_IN) do {  \
    NE10_CPX_ADD_NEON_F32(Q2_OUT ## 0,Q2_IN ## 0,Q2_IN ## 2);   \
    NE10_CPX_SUB_NEON_F32(Q2_OUT ## 1,Q2_IN ## 0,Q2_IN ## 2);   \
    NE10_CPX_ADD_NEON_F32(Q2_OUT ## 2,Q2_IN ## 1,Q2_IN ## 3);   \
    NE10_CPX_SUB_NEON_F32(Q2_OUT ## 3,Q2_IN ## 1,Q2_IN ## 3);   \
} while (0)

/*
 * Radix-4 R2C twiddled kernel, stage 2: recombination with the +/-i
 * rotations baked in as real/imag cross terms; output 3's imaginary part
 * is negated at the end (conjugate-symmetric packing).
 */
#define NE10_RADIX4x4_R2C_TW_NEON_KERNEL_S2(Q2_OUT,Q2_IN) do {              \
    Q2_OUT ## 0 .val[0] = vaddq_f32(Q2_IN ## 0 .val[0] , Q2_IN ## 2 .val[0]); \
    Q2_OUT ## 0 .val[1] = vaddq_f32(Q2_IN ## 0 .val[1] , Q2_IN ## 2 .val[1]); \
    Q2_OUT ## 2 .val[0] = vsubq_f32(Q2_IN ## 0 .val[0] , Q2_IN ## 2 .val[0]); \
    Q2_OUT ## 2 .val[1] = vsubq_f32(Q2_IN ## 2 .val[1] , Q2_IN ## 0 .val[1]); \
    Q2_OUT ## 1 .val[0] = vaddq_f32(Q2_IN ## 1 .val[0] , Q2_IN ## 3 .val[1]); \
    Q2_OUT ## 1 .val[1] = vsubq_f32(Q2_IN ## 1 .val[1] , Q2_IN ## 3 .val[0]); \
    Q2_OUT ## 3 .val[0] = vsubq_f32(Q2_IN ## 1 .val[0] , Q2_IN ## 3 .val[1]); \
    Q2_OUT ## 3 .val[1] = vaddq_f32(Q2_IN ## 3 .val[0] , Q2_IN ## 1 .val[1]); \
    Q2_OUT ## 3 .val[1] = vnegq_f32(Q2_OUT ## 3 .val[1]);                     \
} while (0)
329
/*
 * Final R2C twiddle stage: rotates inputs 1/3 by 1/sqrt(2) (Q_TW_81),
 * forms sum/difference, then the closing butterflies; output 1 is
 * negated. NOTE: clobbers Q_IN##1 and Q_IN##3.
 * Fix: removed the stray semicolon after while(0) on each macro.
 */
#define NE10_RADIX4x4_R2C_TW_NEON_KERNEL_LAST(Q_OUT,Q_IN) do {  \
    float32x4_t Q_TMP;                                          \
    Q_IN ## 1 = vmulq_f32(Q_IN ## 1, Q_TW_81);                  \
    Q_IN ## 3 = vmulq_f32(Q_IN ## 3, Q_TW_81);                  \
    Q_TMP = vsubq_f32(Q_IN ## 1, Q_IN ## 3);                    \
    Q_IN ## 3 = vaddq_f32(Q_IN ## 1, Q_IN ## 3);                \
    Q_IN ## 1 = Q_TMP;                                          \
    Q_OUT ## 0 = vaddq_f32(Q_IN ## 0, Q_IN ## 1);               \
    Q_OUT ## 1 = vaddq_f32(Q_IN ## 2, Q_IN ## 3);               \
    Q_OUT ## 2 = vsubq_f32(Q_IN ## 0, Q_IN ## 1);               \
    Q_OUT ## 3 = vsubq_f32(Q_IN ## 2, Q_IN ## 3);               \
    Q_OUT ## 1 = vnegq_f32(Q_OUT ## 1);                         \
} while (0)

/*
 * Final C2R (inverse) twiddle stage: mirror of the kernel above —
 * negates input 1, butterflies, rotates outputs 1/3 by sqrt(2)
 * (DIV_TW81_NEON) and doubles outputs 0/2. NOTE: clobbers Q_IN##1.
 */
#define NE10_RADIX4x4_C2R_TW_NEON_KERNEL_LAST(Q_OUT,Q_IN) do {  \
    float32x4_t Q_TMP;                                          \
    Q_IN ## 1 = vnegq_f32(Q_IN ## 1 );                          \
    Q_OUT ## 0 = vaddq_f32(Q_IN ## 0, Q_IN ## 2);               \
    Q_OUT ## 1 = vsubq_f32(Q_IN ## 0, Q_IN ## 2);               \
    Q_OUT ## 2 = vaddq_f32(Q_IN ## 1, Q_IN ## 3);               \
    Q_OUT ## 3 = vsubq_f32(Q_IN ## 1, Q_IN ## 3);               \
    Q_TMP = vaddq_f32(Q_OUT ## 1, Q_OUT ## 3);                  \
    Q_OUT ## 3 = vsubq_f32(Q_OUT ## 3, Q_OUT ## 1);             \
    Q_OUT ## 1 = Q_TMP;                                         \
    Q_OUT ## 1 = vmulq_f32( Q_OUT ## 1, DIV_TW81_NEON);         \
    Q_OUT ## 3 = vmulq_f32( Q_OUT ## 3, DIV_TW81_NEON);         \
    Q_OUT ## 0 = vaddq_f32( Q_OUT ## 0, Q_OUT ## 0 );           \
    Q_OUT ## 2 = vaddq_f32( Q_OUT ## 2, Q_OUT ## 2 );           \
} while (0)
359
/*
 * Radix-4 C2R twiddled kernel, stage 1: inverse of the R2C stage-2
 * recombination (note Q2_IN##3 .val[1] is negated in place first).
 * NOTE: clobbers Q2_IN##3. Fix: removed the stray semicolon after
 * while(0) on each macro.
 */
#define NE10_RADIX4x4_C2R_TW_NEON_KERNEL_S1(Q2_OUT,Q2_IN) do {                \
    Q2_IN ## 3 .val[1] = vnegq_f32(Q2_IN ## 3 .val[1]);                       \
    Q2_OUT ## 0 .val[0] = vaddq_f32(Q2_IN ## 0 .val[0] , Q2_IN ## 2 .val[0]); \
    Q2_OUT ## 0 .val[1] = vsubq_f32(Q2_IN ## 0 .val[1] , Q2_IN ## 2 .val[1]); \
    Q2_OUT ## 2 .val[0] = vsubq_f32(Q2_IN ## 0 .val[0] , Q2_IN ## 2 .val[0]); \
    Q2_OUT ## 2 .val[1] = vaddq_f32(Q2_IN ## 2 .val[1] , Q2_IN ## 0 .val[1]); \
    Q2_OUT ## 1 .val[0] = vaddq_f32(Q2_IN ## 1 .val[0] , Q2_IN ## 3 .val[0]); \
    Q2_OUT ## 1 .val[1] = vaddq_f32(Q2_IN ## 1 .val[1] , Q2_IN ## 3 .val[1]); \
    Q2_OUT ## 3 .val[0] = vsubq_f32(Q2_IN ## 3 .val[1] , Q2_IN ## 1 .val[1]); \
    Q2_OUT ## 3 .val[1] = vsubq_f32(Q2_IN ## 1 .val[0] , Q2_IN ## 3 .val[0]); \
} while (0)

/*
 * Radix-4 C2R twiddled kernel, stage 2: complex butterflies between
 * vector pairs (0,1) and (2,3), mirroring the R2C stage 1.
 */
#define NE10_RADIX4x4_C2R_TW_NEON_KERNEL_S2(Q2_OUT,Q2_IN) do {  \
    NE10_CPX_ADD_NEON_F32(Q2_OUT ## 0,Q2_IN ## 0,Q2_IN ## 1);   \
    NE10_CPX_SUB_NEON_F32(Q2_OUT ## 2,Q2_IN ## 0,Q2_IN ## 1);   \
    NE10_CPX_ADD_NEON_F32(Q2_OUT ## 1,Q2_IN ## 2,Q2_IN ## 3);   \
    NE10_CPX_SUB_NEON_F32(Q2_OUT ## 3,Q2_IN ## 2,Q2_IN ## 3);   \
} while (0)

/*
 * Full twiddled kernels. NOTE: both register banks are clobbered —
 * Q2_IN is reused as scratch between the stages, so neither Q2_IN nor
 * Q2_OUT survives with its original contents.
 */
#define NE10_RADIX4x4_R2C_TW_NEON_KERNEL(Q2_OUT,Q2_IN,Q2_TW) do {   \
    NE10_RADIX4x4_R2C_TW_MUL_NEON(Q2_OUT,Q2_IN,Q2_TW);              \
    NE10_RADIX4x4_R2C_TW_NEON_KERNEL_S1(Q2_IN,Q2_OUT);              \
    NE10_RADIX4x4_R2C_TW_NEON_KERNEL_S2(Q2_OUT,Q2_IN);              \
} while (0)

#define NE10_RADIX4x4_C2R_TW_NEON_KERNEL(Q2_OUT,Q2_IN,Q2_TW) do {   \
    NE10_RADIX4x4_C2R_TW_NEON_KERNEL_S1(Q2_OUT,Q2_IN);              \
    NE10_RADIX4x4_C2R_TW_NEON_KERNEL_S2(Q2_IN,Q2_OUT);              \
    NE10_RADIX4x4_C2R_TW_MUL_NEON(Q2_OUT,Q2_IN,Q2_TW);              \
} while (0)
390
#ifdef NE10_VERBOSE
    /* Debug dumps to stderr: 8 or 4 plain float32x4_t vectors, or 4
     * complex float32x4x2_t vectors (real rows "R" then imaginary rows
     * "I"). Lane access via [] is a GCC/Clang vector extension.
     * NOTE(review): relies on <stdio.h> being visible in NE10_VERBOSE
     * builds — this header does not include it; confirm in consumers.
     * Fix: dropped the stray semicolon after while(0); the non-verbose
     * stubs expand to nothing, so in both configurations an invocation
     * is terminated by the caller's own ';'. */
    #define NE10_PRINT_Qx8_VECTOR(Q_VECTOR) do { \
        fprintf(stderr,"inside %s\n", __FUNCTION__ ); \
        fprintf(stderr, #Q_VECTOR "\n"); \
        fprintf(stderr,"0:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 0[0], Q_VECTOR ## 0[1], Q_VECTOR ## 0[2], Q_VECTOR ## 0[3] ); \
        fprintf(stderr,"1:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 1[0], Q_VECTOR ## 1[1], Q_VECTOR ## 1[2], Q_VECTOR ## 1[3] ); \
        fprintf(stderr,"2:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 2[0], Q_VECTOR ## 2[1], Q_VECTOR ## 2[2], Q_VECTOR ## 2[3] ); \
        fprintf(stderr,"3:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 3[0], Q_VECTOR ## 3[1], Q_VECTOR ## 3[2], Q_VECTOR ## 3[3] ); \
        fprintf(stderr,"4:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 4[0], Q_VECTOR ## 4[1], Q_VECTOR ## 4[2], Q_VECTOR ## 4[3] ); \
        fprintf(stderr,"5:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 5[0], Q_VECTOR ## 5[1], Q_VECTOR ## 5[2], Q_VECTOR ## 5[3] ); \
        fprintf(stderr,"6:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 6[0], Q_VECTOR ## 6[1], Q_VECTOR ## 6[2], Q_VECTOR ## 6[3] ); \
        fprintf(stderr,"7:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 7[0], Q_VECTOR ## 7[1], Q_VECTOR ## 7[2], Q_VECTOR ## 7[3] ); \
    } while (0)
    #define NE10_PRINT_Qx4_VECTOR(Q_VECTOR) do { \
        fprintf(stderr,"inside %s\n", __FUNCTION__ ); \
        fprintf(stderr, #Q_VECTOR "\n"); \
        fprintf(stderr,"0:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 0[0], Q_VECTOR ## 0[1], Q_VECTOR ## 0[2], Q_VECTOR ## 0[3] ); \
        fprintf(stderr,"1:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 1[0], Q_VECTOR ## 1[1], Q_VECTOR ## 1[2], Q_VECTOR ## 1[3] ); \
        fprintf(stderr,"2:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 2[0], Q_VECTOR ## 2[1], Q_VECTOR ## 2[2], Q_VECTOR ## 2[3] ); \
        fprintf(stderr,"3:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 3[0], Q_VECTOR ## 3[1], Q_VECTOR ## 3[2], Q_VECTOR ## 3[3] ); \
    } while (0)
    #define NE10_PRINT_Q2x4_VECTOR(Q_VECTOR) do { \
        fprintf(stderr,"inside %s\n", __FUNCTION__ ); \
        fprintf(stderr, #Q_VECTOR "\n"); \
        fprintf(stderr,"0R:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 0 .val[0][0], Q_VECTOR ## 0 .val[0][1], Q_VECTOR ## 0 .val[0][2], Q_VECTOR ## 0 .val[0][3] ); \
        fprintf(stderr,"1R:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 1 .val[0][0], Q_VECTOR ## 1 .val[0][1], Q_VECTOR ## 1 .val[0][2], Q_VECTOR ## 1 .val[0][3] ); \
        fprintf(stderr,"2R:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 2 .val[0][0], Q_VECTOR ## 2 .val[0][1], Q_VECTOR ## 2 .val[0][2], Q_VECTOR ## 2 .val[0][3] ); \
        fprintf(stderr,"3R:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 3 .val[0][0], Q_VECTOR ## 3 .val[0][1], Q_VECTOR ## 3 .val[0][2], Q_VECTOR ## 3 .val[0][3] ); \
        fprintf(stderr,"0I:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 0 .val[1][0], Q_VECTOR ## 0 .val[1][1], Q_VECTOR ## 0 .val[1][2], Q_VECTOR ## 0 .val[1][3] ); \
        fprintf(stderr,"1I:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 1 .val[1][0], Q_VECTOR ## 1 .val[1][1], Q_VECTOR ## 1 .val[1][2], Q_VECTOR ## 1 .val[1][3] ); \
        fprintf(stderr,"2I:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 2 .val[1][0], Q_VECTOR ## 2 .val[1][1], Q_VECTOR ## 2 .val[1][2], Q_VECTOR ## 2 .val[1][3] ); \
        fprintf(stderr,"3I:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 3 .val[1][0], Q_VECTOR ## 3 .val[1][1], Q_VECTOR ## 3 .val[1][2], Q_VECTOR ## 3 .val[1][3] ); \
    } while (0)
#else // NE10_VERBOSE not defined
    /* No-op stubs; invocations compile away to the caller's ';'. */
    #define NE10_PRINT_Qx8_VECTOR(Q_VECTOR)
    #define NE10_PRINT_Qx4_VECTOR(Q_VECTOR)
    #define NE10_PRINT_Q2x4_VECTOR(Q_VECTOR)
#endif // NE10_VERBOSE
429#endif // header