Project Ne10
An Open Optimized Software Library Project for the ARM Architecture
NE10_fft_int32.neonintrinsic.c
1/*
2 * Copyright 2013-15 ARM Limited and Contributors.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 * * Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of ARM Limited nor the
13 * names of its contributors may be used to endorse or promote products
14 * derived from this software without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 * DISCLAIMED. IN NO EVENT SHALL ARM LIMITED AND CONTRIBUTORS BE LIABLE FOR ANY
20 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28/*
29 * NE10 Library : dsp/NE10_fft_int32.neon.c
30 */
31
32#include <arm_neon.h>
33
34#include "NE10_types.h"
35#include "NE10_macros.h"
36#include "NE10_fft.h"
37#include "NE10_dsp.h"
38
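/*
 * Note: the macros below build fixed-size 4- and 8-point butterflies for Q31
 * (ne10_int32_t) complex data. The "_FS" macros compute the first-stage
 * sums/differences, the "_LS" macros the last-stage combination; the
 * "_SCALED" variants shift the inputs right (by 2 for the 4-point case and
 * by 3 for the 8-point case) to keep the fixed-point results from
 * overflowing.
 */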
39#define FFT4_FS_START \
40 ne10_int32_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i; \
41 ne10_int32_t tmp_r, tmp_i;
42
43
44#define FFT4_FS \
45 s2_r = Fin[0].r - Fin[2].r; \
46 s2_i = Fin[0].i - Fin[2].i; \
47 tmp_r = Fin[0].r + Fin[2].r; \
48 tmp_i = Fin[0].i + Fin[2].i; \
49 s0_r = Fin[1].r + Fin[3].r; \
50 s0_i = Fin[1].i + Fin[3].i; \
51 s1_r = Fin[1].r - Fin[3].r; \
52 s1_i = Fin[1].i - Fin[3].i;
53
54#define FFT4_FS_SCALED \
55 s2_r = (Fin[0].r - Fin[2].r) >> 2; \
56 s2_i = (Fin[0].i - Fin[2].i) >> 2; \
57 tmp_r = (Fin[0].r + Fin[2].r) >> 2; \
58 tmp_i = (Fin[0].i + Fin[2].i) >> 2; \
59 s0_r = (Fin[1].r + Fin[3].r) >> 2; \
60 s0_i = (Fin[1].i + Fin[3].i) >> 2; \
61 s1_r = (Fin[1].r - Fin[3].r) >> 2; \
62 s1_i = (Fin[1].i - Fin[3].i) >> 2;
63
64#define FFT4_FWD_LS \
65 Fout[2].r = tmp_r - s0_r; \
66 Fout[2].i = tmp_i - s0_i; \
67 Fout[0].r = tmp_r + s0_r; \
68 Fout[0].i = tmp_i + s0_i; \
69 Fout[1].r = s2_r + s1_i; \
70 Fout[1].i = s2_i - s1_r; \
71 Fout[3].r = s2_r - s1_i; \
72 Fout[3].i = s2_i + s1_r;
73
74#define FFT4_INV_LS \
75 Fout[2].r = tmp_r - s0_r; \
76 Fout[2].i = tmp_i - s0_i; \
77 Fout[0].r = tmp_r + s0_r; \
78 Fout[0].i = tmp_i + s0_i; \
79 Fout[1].r = s2_r - s1_i; \
80 Fout[1].i = s2_i + s1_r; \
81 Fout[3].r = s2_r + s1_i; \
82 Fout[3].i = s2_i - s1_r;
83
84static inline void ne10_fft4_forward_int32_unscaled (ne10_fft_cpx_int32_t * Fout,
85 ne10_fft_cpx_int32_t * Fin)
86
87{
88 FFT4_FS_START
89 FFT4_FS
90 FFT4_FWD_LS
91}
92
93static inline void ne10_fft4_backward_int32_unscaled (ne10_fft_cpx_int32_t * Fout,
94 ne10_fft_cpx_int32_t * Fin)
95
96{
97 FFT4_FS_START
98 FFT4_FS
99 FFT4_INV_LS
100}
101static inline void ne10_fft4_forward_int32_scaled (ne10_fft_cpx_int32_t * Fout,
102 ne10_fft_cpx_int32_t * Fin)
103
104{
105 FFT4_FS_START
106 FFT4_FS_SCALED
107 FFT4_FWD_LS
108}
109
110static inline void ne10_fft4_backward_int32_scaled (ne10_fft_cpx_int32_t * Fout,
111 ne10_fft_cpx_int32_t * Fin)
112
113{
114 FFT4_FS_START
115 FFT4_FS_SCALED
116 FFT4_INV_LS
117}
118
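/*
 * TW_81 below is cos(pi/4) = sqrt(2)/2 ~= 0.7071 expressed in Q31 format
 * (0.7071... * 2^31 ~= 1518500249); it is the +/-45-degree twiddle factor
 * needed by the 8-point butterfly.
 */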
119#define FFT8_FS_START \
120 ne10_int32_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i, s3_r, s3_i, s4_r, s4_i, s5_r, s5_i, s6_r, s6_i, s7_r, s7_i; \
121 ne10_int32_t t0_r, t0_i, t1_r, t1_i, t2_r, t2_i, t3_r, t3_i, t4_r, t4_i, t5_r, t5_i; \
122 const ne10_int32_t TW_81 = 1518500249;
123
124#define FFT8_FS \
125 s0_r = Fin[0].r + Fin[4].r; \
126 s0_i = Fin[0].i + Fin[4].i; \
127 s1_r = Fin[0].r - Fin[4].r; \
128 s1_i = Fin[0].i - Fin[4].i; \
129 s2_r = Fin[1].r + Fin[5].r; \
130 s2_i = Fin[1].i + Fin[5].i; \
131 s3_r = Fin[1].r - Fin[5].r; \
132 s3_i = Fin[1].i - Fin[5].i; \
133 s4_r = Fin[2].r + Fin[6].r; \
134 s4_i = Fin[2].i + Fin[6].i; \
135 s5_r = Fin[2].r - Fin[6].r; \
136 s5_i = Fin[2].i - Fin[6].i; \
137 s6_r = Fin[3].r + Fin[7].r; \
138 s6_i = Fin[3].i + Fin[7].i; \
139 s7_r = Fin[3].r - Fin[7].r; \
140 s7_i = Fin[3].i - Fin[7].i;
141
142#define FFT8_FS_SCALED \
143 s0_r = (Fin[0].r + Fin[4].r) >> 3; \
144 s0_i = (Fin[0].i + Fin[4].i) >> 3; \
145 s1_r = (Fin[0].r - Fin[4].r) >> 3; \
146 s1_i = (Fin[0].i - Fin[4].i) >> 3; \
147 s2_r = (Fin[1].r + Fin[5].r) >> 3; \
148 s2_i = (Fin[1].i + Fin[5].i) >> 3; \
149 s3_r = (Fin[1].r - Fin[5].r) >> 3; \
150 s3_i = (Fin[1].i - Fin[5].i) >> 3; \
151 s4_r = (Fin[2].r + Fin[6].r) >> 3; \
152 s4_i = (Fin[2].i + Fin[6].i) >> 3; \
153 s5_r = (Fin[2].r - Fin[6].r) >> 3; \
154 s5_i = (Fin[2].i - Fin[6].i) >> 3; \
155 s6_r = (Fin[3].r + Fin[7].r) >> 3; \
156 s6_i = (Fin[3].i + Fin[7].i) >> 3; \
157 s7_r = (Fin[3].r - Fin[7].r) >> 3; \
158 s7_i = (Fin[3].i - Fin[7].i) >> 3;
159
160
161#define FFT8_FWD_LS \
162 t0_r = s0_r - s4_r; \
163 t0_i = s0_i - s4_i; \
164 t1_r = s0_r + s4_r; \
165 t1_i = s0_i + s4_i; \
166 t2_r = s2_r + s6_r; \
167 t2_i = s2_i + s6_i; \
168 t3_r = s2_r - s6_r; \
169 t3_i = s2_i - s6_i; \
170 Fout[0].r = t1_r + t2_r; \
171 Fout[0].i = t1_i + t2_i; \
172 Fout[4].r = t1_r - t2_r; \
173 Fout[4].i = t1_i - t2_i; \
174 Fout[2].r = t0_r + t3_i; \
175 Fout[2].i = t0_i - t3_r; \
176 Fout[6].r = t0_r - t3_i; \
177 Fout[6].i = t0_i + t3_r; \
178 t4_r = (ne10_int32_t) ( ( (ne10_int64_t) (s3_r + s3_i) * TW_81) >> 31); \
179 t4_i = - (ne10_int32_t) ( ( (ne10_int64_t) (s3_r - s3_i) * TW_81) >> 31); \
180 t5_r = (ne10_int32_t) ( ( (ne10_int64_t) (s7_r - s7_i) * TW_81) >> 31); \
181 t5_i = (ne10_int32_t) ( ( (ne10_int64_t) (s7_r + s7_i) * TW_81) >> 31); \
182 t0_r = s1_r - s5_i; \
183 t0_i = s1_i + s5_r; \
184 t1_r = s1_r + s5_i; \
185 t1_i = s1_i - s5_r; \
186 t2_r = t4_r - t5_r; \
187 t2_i = t4_i - t5_i; \
188 t3_r = t4_r + t5_r; \
189 t3_i = t4_i + t5_i; \
190 Fout[1].r = t1_r + t2_r; \
191 Fout[1].i = t1_i + t2_i; \
192 Fout[5].r = t1_r - t2_r; \
193 Fout[5].i = t1_i - t2_i; \
194 Fout[3].r = t0_r + t3_i; \
195 Fout[3].i = t0_i - t3_r; \
196 Fout[7].r = t0_r - t3_i; \
197 Fout[7].i = t0_i + t3_r;
198
199#define FFT8_INV_LS \
200 t0_r = s0_r - s4_r; \
201 t0_i = s0_i - s4_i; \
202 t1_r = s0_r + s4_r; \
203 t1_i = s0_i + s4_i; \
204 t2_r = s2_r + s6_r; \
205 t2_i = s2_i + s6_i; \
206 t3_r = s2_r - s6_r; \
207 t3_i = s2_i - s6_i; \
208 Fout[0].r = t1_r + t2_r; \
209 Fout[0].i = t1_i + t2_i; \
210 Fout[4].r = t1_r - t2_r; \
211 Fout[4].i = t1_i - t2_i; \
212 Fout[2].r = t0_r - t3_i; \
213 Fout[2].i = t0_i + t3_r; \
214 Fout[6].r = t0_r + t3_i; \
215 Fout[6].i = t0_i - t3_r; \
216 t4_r = (ne10_int32_t) ( ( (ne10_int64_t) (s3_r - s3_i) * TW_81) >> 31); \
217 t4_i = (ne10_int32_t) ( ( (ne10_int64_t) (s3_r + s3_i) * TW_81) >> 31); \
218 t5_r = (ne10_int32_t) ( ( (ne10_int64_t) (s7_r + s7_i) * TW_81) >> 31); \
219 t5_i = - (ne10_int32_t) ( ( (ne10_int64_t) (s7_r - s7_i) * TW_81) >> 31); \
220 t0_r = s1_r + s5_i; \
221 t0_i = s1_i - s5_r; \
222 t1_r = s1_r - s5_i; \
223 t1_i = s1_i + s5_r; \
224 t2_r = t4_r - t5_r; \
225 t2_i = t4_i - t5_i; \
226 t3_r = t4_r + t5_r; \
227 t3_i = t4_i + t5_i; \
228 Fout[1].r = t1_r + t2_r; \
229 Fout[1].i = t1_i + t2_i; \
230 Fout[5].r = t1_r - t2_r; \
231 Fout[5].i = t1_i - t2_i; \
232 Fout[3].r = t0_r - t3_i; \
233 Fout[3].i = t0_i + t3_r; \
234 Fout[7].r = t0_r + t3_i; \
235 Fout[7].i = t0_i - t3_r;
236
237static inline void ne10_fft8_forward_int32_unscaled (ne10_fft_cpx_int32_t * Fout,
238 ne10_fft_cpx_int32_t * Fin)
239
240{
241 FFT8_FS_START
242 FFT8_FS
243 FFT8_FWD_LS
244}
245
246static inline void ne10_fft8_backward_int32_unscaled (ne10_fft_cpx_int32_t * Fout,
247 ne10_fft_cpx_int32_t * Fin)
248
249{
250 FFT8_FS_START
251 FFT8_FS
252 FFT8_INV_LS
253}
254static inline void ne10_fft8_forward_int32_scaled (ne10_fft_cpx_int32_t * Fout,
255 ne10_fft_cpx_int32_t * Fin)
256
257{
258 FFT8_FS_START
259 FFT8_FS_SCALED
260 FFT8_FWD_LS
261}
262
263static inline void ne10_fft8_backward_int32_scaled (ne10_fft_cpx_int32_t * Fout,
264 ne10_fft_cpx_int32_t * Fin)
265
266{
267 FFT8_FS_START
268 FFT8_FS_SCALED
269 FFT8_INV_LS
270}
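/*
 * 16-point NEON kernels: the first stage is a radix-4 butterfly computed
 * across four lanes at once (complex data is loaded de-interleaved with
 * vld2q_s32); the second stage applies the twiddle factors with
 * vqrdmulhq_s32 (saturating rounding doubling multiply returning the high
 * half), which behaves as a Q31 multiply. The "_SCALED" variants use the
 * halving add/subtract instructions (vhaddq_s32/vhsubq_s32) to divide by 2
 * at each stage.
 */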
271#define FFT16_FS_START \
272 ne10_fft_cpx_int32_t *tw1, *tw2, *tw3; \
273 int32_t *p_src0, *p_src4, *p_src8, *p_src12; \
274 int32x4x2_t q2_in_0123, q2_in_4567, q2_in_89ab, q2_in_cdef; \
275 int32x4_t q_t0_r, q_t0_i, q_t1_r, q_t1_i, q_t2_r, q_t2_i, q_t3_r, q_t3_i; \
276 int32x4_t q_out_r048c, q_out_i048c, q_out_r159d, q_out_i159d; \
277 int32x4_t q_out_r26ae, q_out_i26ae, q_out_r37bf, q_out_i37bf;
278
279#define FFT16_LS_START \
280 int32_t *p_dst0, *p_dst1, *p_dst2, *p_dst3; \
281 int32_t *p_tw1, *p_tw2, *p_tw3; \
282 int32x4_t q_s0_r, q_s0_i, q_s1_r, q_s1_i, q_s2_r, q_s2_i; \
283 int32x4_t q_s3_r, q_s3_i, q_s4_r, q_s4_i, q_s5_r, q_s5_i; \
284 int32x4x2_t q2_tmp_0, q2_tmp_1, q2_tmp_2, q2_tmp_3; \
285 int32x4_t q_in_r0123, q_in_r4567, q_in_r89ab, q_in_rcdef; \
286 int32x4_t q_in_i0123, q_in_i4567, q_in_i89ab, q_in_icdef; \
287 int32x4x2_t q2_tw1, q2_tw2, q2_tw3; \
288 int32x4_t q_tmp0, q_tmp1, q_tmp2, q_tmp3, q_tmp4, q_tmp5; \
289 int32x4x2_t q2_out_0123, q2_out_4567, q2_out_89ab, q2_out_cdef;
290
291#define FFT16_FS \
292 p_src0 = (int32_t*) (& (Fin[0])); \
293 p_src4 = (int32_t*) (& (Fin[4])); \
294 p_src8 = (int32_t*) (& (Fin[8])); \
295 p_src12 = (int32_t*) (& (Fin[12])); \
296 q2_in_0123 = vld2q_s32 (p_src0); \
297 q2_in_4567 = vld2q_s32 (p_src4); \
298 q2_in_89ab = vld2q_s32 (p_src8); \
299 q2_in_cdef = vld2q_s32 (p_src12); \
300 q_t2_r = vsubq_s32 (q2_in_0123.val[0], q2_in_89ab.val[0]); \
301 q_t2_i = vsubq_s32 (q2_in_0123.val[1], q2_in_89ab.val[1]); \
302 q_t3_r = vaddq_s32 (q2_in_0123.val[0], q2_in_89ab.val[0]); \
303 q_t3_i = vaddq_s32 (q2_in_0123.val[1], q2_in_89ab.val[1]); \
304 q_t0_r = vaddq_s32 (q2_in_4567.val[0], q2_in_cdef.val[0]); \
305 q_t0_i = vaddq_s32 (q2_in_4567.val[1], q2_in_cdef.val[1]); \
306 q_t1_r = vsubq_s32 (q2_in_4567.val[0], q2_in_cdef.val[0]); \
307 q_t1_i = vsubq_s32 (q2_in_4567.val[1], q2_in_cdef.val[1]); \
308 q_out_r26ae = vsubq_s32 (q_t3_r, q_t0_r); \
309 q_out_i26ae = vsubq_s32 (q_t3_i, q_t0_i); \
310 q_out_r048c = vaddq_s32 (q_t3_r, q_t0_r); \
311 q_out_i048c = vaddq_s32 (q_t3_i, q_t0_i);
312
313#define FFT16_FS_SCALED \
314 p_src0 = (int32_t*) (& (Fin[0])); \
315 p_src4 = (int32_t*) (& (Fin[4])); \
316 p_src8 = (int32_t*) (& (Fin[8])); \
317 p_src12 = (int32_t*) (& (Fin[12])); \
318 q2_in_0123 = vld2q_s32 (p_src0); \
319 q2_in_4567 = vld2q_s32 (p_src4); \
320 q2_in_89ab = vld2q_s32 (p_src8); \
321 q2_in_cdef = vld2q_s32 (p_src12); \
322 q_t2_r = vhsubq_s32 (q2_in_0123.val[0], q2_in_89ab.val[0]); \
323 q_t2_i = vhsubq_s32 (q2_in_0123.val[1], q2_in_89ab.val[1]); \
324 q_t3_r = vhaddq_s32 (q2_in_0123.val[0], q2_in_89ab.val[0]); \
325 q_t3_i = vhaddq_s32 (q2_in_0123.val[1], q2_in_89ab.val[1]); \
326 q_t0_r = vhaddq_s32 (q2_in_4567.val[0], q2_in_cdef.val[0]); \
327 q_t0_i = vhaddq_s32 (q2_in_4567.val[1], q2_in_cdef.val[1]); \
328 q_t1_r = vhsubq_s32 (q2_in_4567.val[0], q2_in_cdef.val[0]); \
329 q_t1_i = vhsubq_s32 (q2_in_4567.val[1], q2_in_cdef.val[1]); \
330 q_out_r26ae = vhsubq_s32 (q_t3_r, q_t0_r); \
331 q_out_i26ae = vhsubq_s32 (q_t3_i, q_t0_i); \
332 q_out_r048c = vhaddq_s32 (q_t3_r, q_t0_r); \
333 q_out_i048c = vhaddq_s32 (q_t3_i, q_t0_i);
334
335#define FFT16_LS_LOAD \
336 tw1 = twiddles; \
337 tw2 = twiddles + 4; \
338 tw3 = twiddles + 8; \
339 p_dst0 = (int32_t*) (&Fout[0]); \
340 p_dst1 = (int32_t*) (&Fout[4]); \
341 p_dst2 = (int32_t*) (&Fout[8]); \
342 p_dst3 = (int32_t*) (&Fout[12]); \
343 p_tw1 = (int32_t*) tw1; \
344 p_tw2 = (int32_t*) tw2; \
345 p_tw3 = (int32_t*) tw3; \
346 q2_tmp_0 = vzipq_s32 (q_out_r048c, q_out_r159d); \
347 q2_tmp_1 = vzipq_s32 (q_out_i048c, q_out_i159d); \
348 q2_tmp_2 = vzipq_s32 (q_out_r26ae, q_out_r37bf); \
349 q2_tmp_3 = vzipq_s32 (q_out_i26ae, q_out_i37bf); \
350 q_in_r0123 = vcombine_s32 (vget_low_s32 (q2_tmp_0.val[0]), vget_low_s32 (q2_tmp_2.val[0])); \
351 q_in_i0123 = vcombine_s32 (vget_low_s32 (q2_tmp_1.val[0]), vget_low_s32 (q2_tmp_3.val[0])); \
352 q_in_r4567 = vcombine_s32 (vget_high_s32 (q2_tmp_0.val[0]), vget_high_s32 (q2_tmp_2.val[0])); \
353 q_in_i4567 = vcombine_s32 (vget_high_s32 (q2_tmp_1.val[0]), vget_high_s32 (q2_tmp_3.val[0])); \
354 q_in_r89ab = vcombine_s32 (vget_low_s32 (q2_tmp_0.val[1]), vget_low_s32 (q2_tmp_2.val[1])); \
355 q_in_i89ab = vcombine_s32 (vget_low_s32 (q2_tmp_1.val[1]), vget_low_s32 (q2_tmp_3.val[1])); \
356 q_in_rcdef = vcombine_s32 (vget_high_s32 (q2_tmp_0.val[1]), vget_high_s32 (q2_tmp_2.val[1])); \
357 q_in_icdef = vcombine_s32 (vget_high_s32 (q2_tmp_1.val[1]), vget_high_s32 (q2_tmp_3.val[1])); \
358 q2_tw1 = vld2q_s32 (p_tw1); \
359 q2_tw2 = vld2q_s32 (p_tw2); \
360 q2_tw3 = vld2q_s32 (p_tw3);
361
362#define FFT16_FWD_LS \
363 q_s0_r = vqrdmulhq_s32 (q_in_r4567, q2_tw1.val[0]); \
364 q_s0_i = vqrdmulhq_s32 (q_in_r4567, q2_tw1.val[1]); \
365 q_s1_r = vqrdmulhq_s32 (q_in_r89ab, q2_tw2.val[0]); \
366 q_s1_i = vqrdmulhq_s32 (q_in_r89ab, q2_tw2.val[1]); \
367 q_s2_r = vqrdmulhq_s32 (q_in_rcdef, q2_tw3.val[0]); \
368 q_s2_i = vqrdmulhq_s32 (q_in_rcdef, q2_tw3.val[1]); \
369 q_tmp0 = vqrdmulhq_s32 (q_in_i4567, q2_tw1.val[1]); \
370 q_tmp1 = vqrdmulhq_s32 (q_in_i4567, q2_tw1.val[0]); \
371 q_tmp2 = vqrdmulhq_s32 (q_in_i89ab, q2_tw2.val[1]); \
372 q_tmp3 = vqrdmulhq_s32 (q_in_i89ab, q2_tw2.val[0]); \
373 q_tmp4 = vqrdmulhq_s32 (q_in_icdef, q2_tw3.val[1]); \
374 q_tmp5 = vqrdmulhq_s32 (q_in_icdef, q2_tw3.val[0]);
375
376#define FFT16_INV_LS \
377 q_s0_r = vqrdmulhq_s32 (q_in_r4567, q2_tw1.val[0]); \
378 q_s0_i = vqrdmulhq_s32 (q_in_i4567, q2_tw1.val[0]); \
379 q_s1_r = vqrdmulhq_s32 (q_in_r89ab, q2_tw2.val[0]); \
380 q_s1_i = vqrdmulhq_s32 (q_in_i89ab, q2_tw2.val[0]); \
381 q_s2_r = vqrdmulhq_s32 (q_in_rcdef, q2_tw3.val[0]); \
382 q_s2_i = vqrdmulhq_s32 (q_in_icdef, q2_tw3.val[0]); \
383 q_tmp0 = vqrdmulhq_s32 (q_in_i4567, q2_tw1.val[1]); \
384 q_tmp1 = vqrdmulhq_s32 (q_in_r4567, q2_tw1.val[1]); \
385 q_tmp2 = vqrdmulhq_s32 (q_in_i89ab, q2_tw2.val[1]); \
386 q_tmp3 = vqrdmulhq_s32 (q_in_r89ab, q2_tw2.val[1]); \
387 q_tmp4 = vqrdmulhq_s32 (q_in_icdef, q2_tw3.val[1]); \
388 q_tmp5 = vqrdmulhq_s32 (q_in_rcdef, q2_tw3.val[1]);
389
390#define FFT16_FWD_LS_S0 \
391 q_s0_r = vsubq_s32 (q_s0_r, q_tmp0); \
392 q_s0_i = vaddq_s32 (q_s0_i, q_tmp1); \
393 q_s1_r = vsubq_s32 (q_s1_r, q_tmp2); \
394 q_s1_i = vaddq_s32 (q_s1_i, q_tmp3); \
395 q_s2_r = vsubq_s32 (q_s2_r, q_tmp4); \
396 q_s2_i = vaddq_s32 (q_s2_i, q_tmp5);
397
398#define FFT16_INV_LS_S0 \
399 q_s0_r = vaddq_s32 (q_s0_r, q_tmp0); \
400 q_s0_i = vsubq_s32 (q_s0_i, q_tmp1); \
401 q_s1_r = vaddq_s32 (q_s1_r, q_tmp2); \
402 q_s1_i = vsubq_s32 (q_s1_i, q_tmp3); \
403 q_s2_r = vaddq_s32 (q_s2_r, q_tmp4); \
404 q_s2_i = vsubq_s32 (q_s2_i, q_tmp5);
405
406#define FFT16_LS_02 \
407 q_s5_r = vsubq_s32 (q_in_r0123, q_s1_r); \
408 q_s5_i = vsubq_s32 (q_in_i0123, q_s1_i); \
409 q2_out_0123.val[0] = vaddq_s32 (q_in_r0123, q_s1_r); \
410 q2_out_0123.val[1] = vaddq_s32 (q_in_i0123, q_s1_i); \
411 q_s3_r = vaddq_s32 (q_s0_r, q_s2_r); \
412 q_s3_i = vaddq_s32 (q_s0_i, q_s2_i); \
413 q_s4_r = vsubq_s32 (q_s0_r, q_s2_r); \
414 q_s4_i = vsubq_s32 (q_s0_i, q_s2_i); \
415 q2_out_89ab.val[0] = vsubq_s32 (q2_out_0123.val[0], q_s3_r); \
416 q2_out_89ab.val[1] = vsubq_s32 (q2_out_0123.val[1], q_s3_i); \
417 q2_out_0123.val[0] = vaddq_s32 (q2_out_0123.val[0], q_s3_r); \
418 q2_out_0123.val[1] = vaddq_s32 (q2_out_0123.val[1], q_s3_i);
419
420
421#define FFT16_LS_02_SCALED \
422 q_s5_r = vhsubq_s32 (q_in_r0123, q_s1_r); \
423 q_s5_i = vhsubq_s32 (q_in_i0123, q_s1_i); \
424 q2_out_0123.val[0] = vhaddq_s32 (q_in_r0123, q_s1_r); \
425 q2_out_0123.val[1] = vhaddq_s32 (q_in_i0123, q_s1_i); \
426 q_s3_r = vhaddq_s32 (q_s0_r, q_s2_r); \
427 q_s3_i = vhaddq_s32 (q_s0_i, q_s2_i); \
428 q_s4_r = vhsubq_s32 (q_s0_r, q_s2_r); \
429 q_s4_i = vhsubq_s32 (q_s0_i, q_s2_i); \
430 q2_out_89ab.val[0] = vhsubq_s32 (q2_out_0123.val[0], q_s3_r); \
431 q2_out_89ab.val[1] = vhsubq_s32 (q2_out_0123.val[1], q_s3_i); \
432 q2_out_0123.val[0] = vhaddq_s32 (q2_out_0123.val[0], q_s3_r); \
433 q2_out_0123.val[1] = vhaddq_s32 (q2_out_0123.val[1], q_s3_i);
434
435#define FFT16_ST \
436 vst2q_s32 (p_dst0, q2_out_0123); \
437 vst2q_s32 (p_dst1, q2_out_4567); \
438 vst2q_s32 (p_dst2, q2_out_89ab); \
439 vst2q_s32 (p_dst3, q2_out_cdef);
440
441static void ne10_fft16_forward_int32_unscaled_neon (ne10_fft_cpx_int32_t * Fout,
442 ne10_fft_cpx_int32_t * Fin,
443 ne10_fft_cpx_int32_t * twiddles)
444{
445 // the first stage
446 FFT16_FS_START
447 FFT16_FS
448 q_out_r159d = vaddq_s32 (q_t2_r, q_t1_i);
449 q_out_i159d = vsubq_s32 (q_t2_i, q_t1_r);
450 q_out_r37bf = vsubq_s32 (q_t2_r, q_t1_i);
451 q_out_i37bf = vaddq_s32 (q_t2_i, q_t1_r);
452
453 // second stages
454 FFT16_LS_START
455 FFT16_LS_LOAD
456 FFT16_FWD_LS
457 FFT16_FWD_LS_S0
458 FFT16_LS_02
459
460 q2_out_4567.val[0] = vaddq_s32 (q_s5_r, q_s4_i);
461 q2_out_4567.val[1] = vsubq_s32 (q_s5_i, q_s4_r);
462 q2_out_cdef.val[0] = vsubq_s32 (q_s5_r, q_s4_i);
463 q2_out_cdef.val[1] = vaddq_s32 (q_s5_i, q_s4_r);
464
465 FFT16_ST
466}
467
468static void ne10_fft16_backward_int32_unscaled_neon (ne10_fft_cpx_int32_t * Fout,
469 ne10_fft_cpx_int32_t * Fin,
470 ne10_fft_cpx_int32_t * twiddles)
471{
472 // the first stage
473 FFT16_FS_START
474 FFT16_FS
475 q_out_r159d = vsubq_s32 (q_t2_r, q_t1_i);
476 q_out_i159d = vaddq_s32 (q_t2_i, q_t1_r);
477 q_out_r37bf = vaddq_s32 (q_t2_r, q_t1_i);
478 q_out_i37bf = vsubq_s32 (q_t2_i, q_t1_r);
479
480 // second stages
481 FFT16_LS_START
482 FFT16_LS_LOAD
483 FFT16_INV_LS
484 FFT16_INV_LS_S0
485 FFT16_LS_02
486
487 q2_out_4567.val[0] = vsubq_s32 (q_s5_r, q_s4_i);
488 q2_out_4567.val[1] = vaddq_s32 (q_s5_i, q_s4_r);
489 q2_out_cdef.val[0] = vaddq_s32 (q_s5_r, q_s4_i);
490 q2_out_cdef.val[1] = vsubq_s32 (q_s5_i, q_s4_r);
491
492 FFT16_ST
493}
494
495static void ne10_fft16_forward_int32_scaled_neon (ne10_fft_cpx_int32_t * Fout,
496 ne10_fft_cpx_int32_t * Fin,
497 ne10_fft_cpx_int32_t * twiddles)
498{
499 // the first stage
500 FFT16_FS_START
501 FFT16_FS_SCALED
502 q_out_r159d = vhaddq_s32 (q_t2_r, q_t1_i);
503 q_out_i159d = vhsubq_s32 (q_t2_i, q_t1_r);
504 q_out_r37bf = vhsubq_s32 (q_t2_r, q_t1_i);
505 q_out_i37bf = vhaddq_s32 (q_t2_i, q_t1_r);
506
507 // second stages
508 FFT16_LS_START
509 FFT16_LS_LOAD
510 FFT16_FWD_LS
511 FFT16_FWD_LS_S0
512 FFT16_LS_02_SCALED
513
514 q2_out_4567.val[0] = vhaddq_s32 (q_s5_r, q_s4_i);
515 q2_out_4567.val[1] = vhsubq_s32 (q_s5_i, q_s4_r);
516 q2_out_cdef.val[0] = vhsubq_s32 (q_s5_r, q_s4_i);
517 q2_out_cdef.val[1] = vhaddq_s32 (q_s5_i, q_s4_r);
518
519 FFT16_ST
520}
521
522static void ne10_fft16_backward_int32_scaled_neon (ne10_fft_cpx_int32_t * Fout,
523 ne10_fft_cpx_int32_t * Fin,
524 ne10_fft_cpx_int32_t * twiddles)
525{
526 // the first stage
527 FFT16_FS_START
528 FFT16_FS_SCALED
529 q_out_r159d = vhsubq_s32 (q_t2_r, q_t1_i);
530 q_out_i159d = vhaddq_s32 (q_t2_i, q_t1_r);
531 q_out_r37bf = vhaddq_s32 (q_t2_r, q_t1_i);
532 q_out_i37bf = vhsubq_s32 (q_t2_i, q_t1_r);
533
534 // second stages
535 FFT16_LS_START
536 FFT16_LS_LOAD
537 FFT16_INV_LS
538 FFT16_INV_LS_S0
539 FFT16_LS_02_SCALED
540
541 q2_out_4567.val[0] = vhsubq_s32 (q_s5_r, q_s4_i);
542 q2_out_4567.val[1] = vhaddq_s32 (q_s5_i, q_s4_r);
543 q2_out_cdef.val[0] = vhaddq_s32 (q_s5_r, q_s4_i);
544 q2_out_cdef.val[1] = vhsubq_s32 (q_s5_i, q_s4_r);
545
546 FFT16_ST
547}
548
549
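/*
 * Generic first-stage kernel: a radix-8 butterfly applied to four columns of
 * the input per loop iteration. RADIX8x4_LOAD gathers eight strided complex
 * vectors, the *_S357 macros apply the +/-45-degree twiddles (TW_81/TW_81N)
 * with vqdmulhq_s32, and RADIX8x4_STORE transposes the results back with
 * vtrnq_s32/vcombine_s32 before storing them contiguously.
 */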
550#define RADIX8x4_START \
551 ne10_int32_t f_count; \
552 ne10_int32_t src_step = stride << 1; \
553 const ne10_int32_t TW_81 = 1518500249; \
554 const ne10_int32_t TW_81N = -1518500249; \
555 int32_t *p_src, *p_dst; \
556 int32x4x2_t q2_in0, q2_in1, q2_in2, q2_in3, q2_in4, q2_in5, q2_in6, q2_in7; \
557 int32x4_t q_sin0_r, q_sin0_i, q_sin1_r, q_sin1_i, q_sin2_r, q_sin2_i, q_sin3_r, q_sin3_i; \
558 int32x4_t q_sin4_r, q_sin4_i, q_sin5_r, q_sin5_i, q_sin6_r, q_sin6_i, q_sin7_r, q_sin7_i; \
559 int32x4_t q_s3_r, q_s3_i, q_s5_r, q_s5_i, q_s7_r, q_s7_i; \
560 int32x4_t q_s8_r, q_s8_i, q_s9_r, q_s9_i, q_s10_r, q_s10_i, q_s11_r, q_s11_i; \
561 int32x4_t q_s12_r, q_s12_i, q_s13_r, q_s13_i, q_s14_r, q_s14_i, q_s15_r, q_s15_i; \
562 int32x4_t q_out0_r, q_out0_i, q_out1_r, q_out1_i, q_out2_r, q_out2_i, q_out3_r, q_out3_i; \
563 int32x4_t q_out4_r, q_out4_i, q_out5_r, q_out5_i, q_out6_r, q_out6_i, q_out7_r, q_out7_i; \
564 int32x4x2_t q2_tmp0, q2_tmp1, q2_tmp2, q2_tmp3, q2_tmp4, q2_tmp5, q2_tmp6, q2_tmp7; \
565 int32x4x2_t q2_out0, q2_out1, q2_out2, q2_out3, q2_out4, q2_out5, q2_out6, q2_out7; \
566 int32x4_t q_tw_81, q_tw_81n; \
567 p_src = (int32_t *) Fin; \
568 p_dst = (int32_t *) Fout;
569
570
571#define RADIX8x4_LOAD \
572 q2_in0 = vld2q_s32 (p_src); \
573 p_src += src_step; \
574 q2_in2 = vld2q_s32 (p_src); \
575 p_src += src_step; \
576 q2_in4 = vld2q_s32 (p_src); \
577 p_src += src_step; \
578 q2_in6 = vld2q_s32 (p_src); \
579 p_src += src_step; \
580 q2_in1 = vld2q_s32 (p_src); \
581 p_src += src_step; \
582 q2_in3 = vld2q_s32 (p_src); \
583 p_src += src_step; \
584 q2_in5 = vld2q_s32 (p_src); \
585 p_src += src_step; \
586 q2_in7 = vld2q_s32 (p_src); \
587 p_src += src_step;
588
589#define RADIX8x4_STORE \
590 q2_tmp0 = vtrnq_s32 (q_out0_r, q_out1_r); \
591 q2_tmp1 = vtrnq_s32 (q_out0_i, q_out1_i); \
592 q2_tmp2 = vtrnq_s32 (q_out2_r, q_out3_r); \
593 q2_tmp3 = vtrnq_s32 (q_out2_i, q_out3_i); \
594 q2_tmp4 = vtrnq_s32 (q_out4_r, q_out5_r); \
595 q2_tmp5 = vtrnq_s32 (q_out4_i, q_out5_i); \
596 q2_tmp6 = vtrnq_s32 (q_out6_r, q_out7_r); \
597 q2_tmp7 = vtrnq_s32 (q_out6_i, q_out7_i); \
598 q2_out0.val[0] = vcombine_s32 (vget_low_s32 (q2_tmp0.val[0]), vget_low_s32 (q2_tmp2.val[0])); \
599 q2_out0.val[1] = vcombine_s32 (vget_low_s32 (q2_tmp1.val[0]), vget_low_s32 (q2_tmp3.val[0])); \
600 q2_out2.val[0] = vcombine_s32 (vget_low_s32 (q2_tmp0.val[1]), vget_low_s32 (q2_tmp2.val[1])); \
601 q2_out2.val[1] = vcombine_s32 (vget_low_s32 (q2_tmp1.val[1]), vget_low_s32 (q2_tmp3.val[1])); \
602 q2_out4.val[0] = vcombine_s32 (vget_high_s32 (q2_tmp0.val[0]), vget_high_s32 (q2_tmp2.val[0])); \
603 q2_out4.val[1] = vcombine_s32 (vget_high_s32 (q2_tmp1.val[0]), vget_high_s32 (q2_tmp3.val[0])); \
604 q2_out6.val[0] = vcombine_s32 (vget_high_s32 (q2_tmp0.val[1]), vget_high_s32 (q2_tmp2.val[1])); \
605 q2_out6.val[1] = vcombine_s32 (vget_high_s32 (q2_tmp1.val[1]), vget_high_s32 (q2_tmp3.val[1])); \
606 q2_out1.val[0] = vcombine_s32 (vget_low_s32 (q2_tmp4.val[0]), vget_low_s32 (q2_tmp6.val[0])); \
607 q2_out1.val[1] = vcombine_s32 (vget_low_s32 (q2_tmp5.val[0]), vget_low_s32 (q2_tmp7.val[0])); \
608 q2_out3.val[0] = vcombine_s32 (vget_low_s32 (q2_tmp4.val[1]), vget_low_s32 (q2_tmp6.val[1])); \
609 q2_out3.val[1] = vcombine_s32 (vget_low_s32 (q2_tmp5.val[1]), vget_low_s32 (q2_tmp7.val[1])); \
610 q2_out5.val[0] = vcombine_s32 (vget_high_s32 (q2_tmp4.val[0]), vget_high_s32 (q2_tmp6.val[0])); \
611 q2_out5.val[1] = vcombine_s32 (vget_high_s32 (q2_tmp5.val[0]), vget_high_s32 (q2_tmp7.val[0])); \
612 q2_out7.val[0] = vcombine_s32 (vget_high_s32 (q2_tmp4.val[1]), vget_high_s32 (q2_tmp6.val[1])); \
613 q2_out7.val[1] = vcombine_s32 (vget_high_s32 (q2_tmp5.val[1]), vget_high_s32 (q2_tmp7.val[1])); \
614 vst2q_s32 (p_dst, q2_out0); \
615 p_dst += 8; \
616 vst2q_s32 (p_dst, q2_out1); \
617 p_dst += 8; \
618 vst2q_s32 (p_dst, q2_out2); \
619 p_dst += 8; \
620 vst2q_s32 (p_dst, q2_out3); \
621 p_dst += 8; \
622 vst2q_s32 (p_dst, q2_out4); \
623 p_dst += 8; \
624 vst2q_s32 (p_dst, q2_out5); \
625 p_dst += 8; \
626 vst2q_s32 (p_dst, q2_out6); \
627 p_dst += 8; \
628 vst2q_s32 (p_dst, q2_out7); \
629 p_dst += 8; \
630 p_src = p_src - src_step * 8 + 8;
631
632#define RADIX8x4_FS_S0 \
633 q_sin0_r = vaddq_s32 (q2_in0.val[0], q2_in1.val[0]); \
634 q_sin0_i = vaddq_s32 (q2_in0.val[1], q2_in1.val[1]); \
635 q_sin1_r = vsubq_s32 (q2_in0.val[0], q2_in1.val[0]); \
636 q_sin1_i = vsubq_s32 (q2_in0.val[1], q2_in1.val[1]); \
637 q_sin2_r = vaddq_s32 (q2_in2.val[0], q2_in3.val[0]); \
638 q_sin2_i = vaddq_s32 (q2_in2.val[1], q2_in3.val[1]); \
639 q_sin3_r = vsubq_s32 (q2_in2.val[0], q2_in3.val[0]); \
640 q_sin3_i = vsubq_s32 (q2_in2.val[1], q2_in3.val[1]); \
641 q_sin4_r = vaddq_s32 (q2_in4.val[0], q2_in5.val[0]); \
642 q_sin4_i = vaddq_s32 (q2_in4.val[1], q2_in5.val[1]); \
643 q_sin5_r = vsubq_s32 (q2_in4.val[0], q2_in5.val[0]); \
644 q_sin5_i = vsubq_s32 (q2_in4.val[1], q2_in5.val[1]); \
645 q_sin6_r = vaddq_s32 (q2_in6.val[0], q2_in7.val[0]); \
646 q_sin6_i = vaddq_s32 (q2_in6.val[1], q2_in7.val[1]); \
647 q_sin7_r = vsubq_s32 (q2_in6.val[0], q2_in7.val[0]); \
648 q_sin7_i = vsubq_s32 (q2_in6.val[1], q2_in7.val[1]);
649
650#define RADIX8x4_FWD_S357 \
651 q_tw_81 = vdupq_n_s32 (TW_81); \
652 q_tw_81n = vdupq_n_s32 (TW_81N); \
653 q_s5_r = q_sin5_i; \
654 q_s5_i = vnegq_s32 (q_sin5_r); \
655 q_s3_r = vaddq_s32 (q_sin3_r, q_sin3_i); \
656 q_s3_i = vsubq_s32 (q_sin3_i, q_sin3_r); \
657 q_s7_r = vsubq_s32 (q_sin7_r, q_sin7_i); \
658 q_s7_i = vaddq_s32 (q_sin7_i, q_sin7_r); \
659 q_s3_r = vqdmulhq_s32 (q_s3_r, q_tw_81); \
660 q_s3_i = vqdmulhq_s32 (q_s3_i, q_tw_81); \
661 q_s7_r = vqdmulhq_s32 (q_s7_r, q_tw_81n); \
662 q_s7_i = vqdmulhq_s32 (q_s7_i, q_tw_81n);
663
664#define RADIX8x4_INV_S357 \
665 q_tw_81 = vdupq_n_s32 (TW_81); \
666 q_tw_81n = vdupq_n_s32 (TW_81N); \
667 q_s5_r = vnegq_s32 (q_sin5_i); \
668 q_s5_i = q_sin5_r; \
669 q_s3_r = vsubq_s32 (q_sin3_r, q_sin3_i); \
670 q_s3_i = vaddq_s32 (q_sin3_i, q_sin3_r); \
671 q_s7_r = vaddq_s32 (q_sin7_r, q_sin7_i); \
672 q_s7_i = vsubq_s32 (q_sin7_i, q_sin7_r); \
673 q_s3_r = vqdmulhq_s32 (q_s3_r, q_tw_81); \
674 q_s3_i = vqdmulhq_s32 (q_s3_i, q_tw_81); \
675 q_s7_r = vqdmulhq_s32 (q_s7_r, q_tw_81n); \
676 q_s7_i = vqdmulhq_s32 (q_s7_i, q_tw_81n);
677
678#define RADIX8x4_LS_02 \
679 q_s8_r = vaddq_s32 (q_sin0_r, q_sin4_r); \
680 q_s8_i = vaddq_s32 (q_sin0_i, q_sin4_i); \
681 q_s9_r = vaddq_s32 (q_sin1_r, q_s5_r); \
682 q_s9_i = vaddq_s32 (q_sin1_i, q_s5_i); \
683 q_s10_r = vsubq_s32 (q_sin0_r, q_sin4_r); \
684 q_s10_i = vsubq_s32 (q_sin0_i, q_sin4_i); \
685 q_s11_r = vsubq_s32 (q_sin1_r, q_s5_r); \
686 q_s11_i = vsubq_s32 (q_sin1_i, q_s5_i); \
687 q_s12_r = vaddq_s32 (q_sin2_r, q_sin6_r); \
688 q_s12_i = vaddq_s32 (q_sin2_i, q_sin6_i); \
689 q_s13_r = vaddq_s32 (q_s3_r, q_s7_r); \
690 q_s13_i = vaddq_s32 (q_s3_i, q_s7_i); \
691 q_s14_r = vsubq_s32 (q_sin2_r, q_sin6_r); \
692 q_s14_i = vsubq_s32 (q_sin2_i, q_sin6_i); \
693 q_s15_r = vsubq_s32 (q_s3_r, q_s7_r); \
694 q_s15_i = vsubq_s32 (q_s3_i, q_s7_i); \
695 q_out4_r = vsubq_s32 (q_s8_r, q_s12_r); \
696 q_out4_i = vsubq_s32 (q_s8_i, q_s12_i); \
697 q_out5_r = vsubq_s32 (q_s9_r, q_s13_r); \
698 q_out5_i = vsubq_s32 (q_s9_i, q_s13_i); \
699 q_out0_r = vaddq_s32 (q_s8_r, q_s12_r); \
700 q_out0_i = vaddq_s32 (q_s8_i, q_s12_i); \
701 q_out1_r = vaddq_s32 (q_s9_r, q_s13_r); \
702 q_out1_i = vaddq_s32 (q_s9_i, q_s13_i);
703
704#define RADIX8x4_FS_S0_SCALED \
705 q_sin0_r = vhaddq_s32 (q2_in0.val[0], q2_in1.val[0]); \
706 q_sin0_i = vhaddq_s32 (q2_in0.val[1], q2_in1.val[1]); \
707 q_sin1_r = vhsubq_s32 (q2_in0.val[0], q2_in1.val[0]); \
708 q_sin1_i = vhsubq_s32 (q2_in0.val[1], q2_in1.val[1]); \
709 q_sin2_r = vhaddq_s32 (q2_in2.val[0], q2_in3.val[0]); \
710 q_sin2_i = vhaddq_s32 (q2_in2.val[1], q2_in3.val[1]); \
711 q_sin3_r = vhsubq_s32 (q2_in2.val[0], q2_in3.val[0]); \
712 q_sin3_i = vhsubq_s32 (q2_in2.val[1], q2_in3.val[1]); \
713 q_sin4_r = vhaddq_s32 (q2_in4.val[0], q2_in5.val[0]); \
714 q_sin4_i = vhaddq_s32 (q2_in4.val[1], q2_in5.val[1]); \
715 q_sin5_r = vhsubq_s32 (q2_in4.val[0], q2_in5.val[0]); \
716 q_sin5_i = vhsubq_s32 (q2_in4.val[1], q2_in5.val[1]); \
717 q_sin6_r = vhaddq_s32 (q2_in6.val[0], q2_in7.val[0]); \
718 q_sin6_i = vhaddq_s32 (q2_in6.val[1], q2_in7.val[1]); \
719 q_sin7_r = vhsubq_s32 (q2_in6.val[0], q2_in7.val[0]); \
720 q_sin7_i = vhsubq_s32 (q2_in6.val[1], q2_in7.val[1]);
721
722#define RADIX8x4_LS_02_SCALED \
723 q_s8_r = vhaddq_s32 (q_sin0_r, q_sin4_r); \
724 q_s8_i = vhaddq_s32 (q_sin0_i, q_sin4_i); \
725 q_s9_r = vhaddq_s32 (q_sin1_r, q_s5_r); \
726 q_s9_i = vhaddq_s32 (q_sin1_i, q_s5_i); \
727 q_s10_r = vhsubq_s32 (q_sin0_r, q_sin4_r); \
728 q_s10_i = vhsubq_s32 (q_sin0_i, q_sin4_i); \
729 q_s11_r = vhsubq_s32 (q_sin1_r, q_s5_r); \
730 q_s11_i = vhsubq_s32 (q_sin1_i, q_s5_i); \
731 q_s12_r = vhaddq_s32 (q_sin2_r, q_sin6_r); \
732 q_s12_i = vhaddq_s32 (q_sin2_i, q_sin6_i); \
733 q_s13_r = vhaddq_s32 (q_s3_r, q_s7_r); \
734 q_s13_i = vhaddq_s32 (q_s3_i, q_s7_i); \
735 q_s14_r = vhsubq_s32 (q_sin2_r, q_sin6_r); \
736 q_s14_i = vhsubq_s32 (q_sin2_i, q_sin6_i); \
737 q_s15_r = vhsubq_s32 (q_s3_r, q_s7_r); \
738 q_s15_i = vhsubq_s32 (q_s3_i, q_s7_i); \
739 q_out4_r = vhsubq_s32 (q_s8_r, q_s12_r); \
740 q_out4_i = vhsubq_s32 (q_s8_i, q_s12_i); \
741 q_out5_r = vhsubq_s32 (q_s9_r, q_s13_r); \
742 q_out5_i = vhsubq_s32 (q_s9_i, q_s13_i); \
743 q_out0_r = vhaddq_s32 (q_s8_r, q_s12_r); \
744 q_out0_i = vhaddq_s32 (q_s8_i, q_s12_i); \
745 q_out1_r = vhaddq_s32 (q_s9_r, q_s13_r); \
746 q_out1_i = vhaddq_s32 (q_s9_i, q_s13_i);
747
748
749static inline void ne10_radix8x4_forward_unscaled_neon (ne10_fft_cpx_int32_t * Fout,
750 ne10_fft_cpx_int32_t * Fin,
751 ne10_int32_t stride)
752{
753 RADIX8x4_START
754
755 for (f_count = 0; f_count < stride; f_count += 4)
756 {
757 RADIX8x4_LOAD
758 RADIX8x4_FS_S0
759
760
761 // radix 4 butterfly without twiddles
762 RADIX8x4_FWD_S357
763 RADIX8x4_LS_02
764
765 q_out2_r = vaddq_s32 (q_s10_r, q_s14_i);
766 q_out2_i = vsubq_s32 (q_s10_i, q_s14_r);
767 q_out3_r = vaddq_s32 (q_s11_r, q_s15_i);
768 q_out3_i = vsubq_s32 (q_s11_i, q_s15_r);
769 q_out6_r = vsubq_s32 (q_s10_r, q_s14_i);
770 q_out6_i = vaddq_s32 (q_s10_i, q_s14_r);
771 q_out7_r = vsubq_s32 (q_s11_r, q_s15_i);
772 q_out7_i = vaddq_s32 (q_s11_i, q_s15_r);
773
774 RADIX8x4_STORE
775 } // f_count
776}
777
778static inline void ne10_radix8x4_backward_unscaled_neon (ne10_fft_cpx_int32_t * Fout,
779 ne10_fft_cpx_int32_t * Fin,
780 ne10_int32_t stride)
781{
782 RADIX8x4_START
783
784 for (f_count = 0; f_count < stride; f_count += 4)
785 {
786 RADIX8x4_LOAD
787 RADIX8x4_FS_S0
788
789 // radix 4 butterfly without twiddles
790 RADIX8x4_INV_S357
791 RADIX8x4_LS_02
792
793 q_out2_r = vsubq_s32 (q_s10_r, q_s14_i);
794 q_out2_i = vaddq_s32 (q_s10_i, q_s14_r);
795 q_out3_r = vsubq_s32 (q_s11_r, q_s15_i);
796 q_out3_i = vaddq_s32 (q_s11_i, q_s15_r);
797 q_out6_r = vaddq_s32 (q_s10_r, q_s14_i);
798 q_out6_i = vsubq_s32 (q_s10_i, q_s14_r);
799 q_out7_r = vaddq_s32 (q_s11_r, q_s15_i);
800 q_out7_i = vsubq_s32 (q_s11_i, q_s15_r);
801
802 RADIX8x4_STORE
803 } // f_count
804}
805static inline void ne10_radix8x4_forward_scaled_neon (ne10_fft_cpx_int32_t * Fout,
806 ne10_fft_cpx_int32_t * Fin,
807 ne10_int32_t stride)
808{
809 RADIX8x4_START
810
811 for (f_count = 0; f_count < stride; f_count += 4)
812 {
813 RADIX8x4_LOAD
814 RADIX8x4_FS_S0_SCALED
815
816 // radix 4 butterfly without twiddles
817 RADIX8x4_FWD_S357
818 RADIX8x4_LS_02_SCALED
819
820 q_out2_r = vhaddq_s32 (q_s10_r, q_s14_i);
821 q_out2_i = vhsubq_s32 (q_s10_i, q_s14_r);
822 q_out3_r = vhaddq_s32 (q_s11_r, q_s15_i);
823 q_out3_i = vhsubq_s32 (q_s11_i, q_s15_r);
824 q_out6_r = vhsubq_s32 (q_s10_r, q_s14_i);
825 q_out6_i = vhaddq_s32 (q_s10_i, q_s14_r);
826 q_out7_r = vhsubq_s32 (q_s11_r, q_s15_i);
827 q_out7_i = vhaddq_s32 (q_s11_i, q_s15_r);
828
829 RADIX8x4_STORE
830 } // f_count
831}
832
833static inline void ne10_radix8x4_backward_scaled_neon (ne10_fft_cpx_int32_t * Fout,
834 ne10_fft_cpx_int32_t * Fin,
835 ne10_int32_t stride)
836{
837 RADIX8x4_START
838
839 for (f_count = 0; f_count < stride; f_count += 4)
840 {
841 RADIX8x4_LOAD
842 RADIX8x4_FS_S0_SCALED
843
844 // radix 4 butterfly without twiddles
845 RADIX8x4_INV_S357
846 RADIX8x4_LS_02_SCALED
847
848 q_out2_r = vhsubq_s32 (q_s10_r, q_s14_i);
849 q_out2_i = vhaddq_s32 (q_s10_i, q_s14_r);
850 q_out3_r = vhsubq_s32 (q_s11_r, q_s15_i);
851 q_out3_i = vhaddq_s32 (q_s11_i, q_s15_r);
852 q_out6_r = vhaddq_s32 (q_s10_r, q_s14_i);
853 q_out6_i = vhsubq_s32 (q_s10_i, q_s14_r);
854 q_out7_r = vhaddq_s32 (q_s11_r, q_s15_i);
855 q_out7_i = vhsubq_s32 (q_s11_i, q_s15_r);
856
857 RADIX8x4_STORE
858 } // f_count
859}
860
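/*
 * First-stage radix-4 kernel without twiddle factors, again processing four
 * columns per iteration; it is selected by the mixed-radix functions below
 * when the first stage is a plain radix-4 pass rather than the radix-8 pass
 * above.
 */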
861#define RADIX4x4_WITHOUT_TW_START \
862 ne10_int32_t f_count; \
863 ne10_int32_t src_step = stride << 1; \
864 int32_t *p_src, *p_dst; \
865 int32x4x2_t q2_in0, q2_in1, q2_in2, q2_in3; \
866 int32x4_t q_s0_r, q_s0_i, q_s1_r, q_s1_i, q_s2_r, q_s2_i, q_s3_r, q_s3_i; \
867 int32x4_t q_out0_r, q_out0_i, q_out1_r, q_out1_i, q_out2_r, q_out2_i, q_out3_r, q_out3_i; \
868 int32x4x2_t q2_tmp0, q2_tmp1, q2_tmp2, q2_tmp3; \
869 int32x4x2_t q2_out0, q2_out1, q2_out2, q2_out3; \
870 p_src = (int32_t *) Fin; \
871 p_dst = (int32_t *) Fout;
872
873#define RADIX4x4_WITHOUT_TW_LOAD \
874 q2_in0 = vld2q_s32 (p_src); \
875 p_src += src_step; \
876 q2_in1 = vld2q_s32 (p_src); \
877 p_src += src_step; \
878 q2_in2 = vld2q_s32 (p_src); \
879 p_src += src_step; \
880 q2_in3 = vld2q_s32 (p_src); \
881 p_src += src_step;
882
883#define RADIX4x4_WITHOUT_TW_STORE \
884 q2_tmp0 = vtrnq_s32 (q_out0_r, q_out1_r); \
885 q2_tmp1 = vtrnq_s32 (q_out0_i, q_out1_i); \
886 q2_tmp2 = vtrnq_s32 (q_out2_r, q_out3_r); \
887 q2_tmp3 = vtrnq_s32 (q_out2_i, q_out3_i); \
888 q2_out0.val[0] = vcombine_s32 (vget_low_s32 (q2_tmp0.val[0]), vget_low_s32 (q2_tmp2.val[0])); \
889 q2_out0.val[1] = vcombine_s32 (vget_low_s32 (q2_tmp1.val[0]), vget_low_s32 (q2_tmp3.val[0])); \
890 q2_out1.val[0] = vcombine_s32 (vget_low_s32 (q2_tmp0.val[1]), vget_low_s32 (q2_tmp2.val[1])); \
891 q2_out1.val[1] = vcombine_s32 (vget_low_s32 (q2_tmp1.val[1]), vget_low_s32 (q2_tmp3.val[1])); \
892 q2_out2.val[0] = vcombine_s32 (vget_high_s32 (q2_tmp0.val[0]), vget_high_s32 (q2_tmp2.val[0])); \
893 q2_out2.val[1] = vcombine_s32 (vget_high_s32 (q2_tmp1.val[0]), vget_high_s32 (q2_tmp3.val[0])); \
894 q2_out3.val[0] = vcombine_s32 (vget_high_s32 (q2_tmp0.val[1]), vget_high_s32 (q2_tmp2.val[1])); \
895 q2_out3.val[1] = vcombine_s32 (vget_high_s32 (q2_tmp1.val[1]), vget_high_s32 (q2_tmp3.val[1])); \
896 vst2q_s32 (p_dst, q2_out0); \
897 p_dst += 8; \
898 vst2q_s32 (p_dst, q2_out1); \
899 p_dst += 8; \
900 vst2q_s32 (p_dst, q2_out2); \
901 p_dst += 8; \
902 vst2q_s32 (p_dst, q2_out3); \
903 p_dst += 8; \
904 p_src = p_src - src_step * 4 + 8;
905
906#define RADIX4x4_WITHOUT_TW_S0 \
907 q_s0_r = vaddq_s32 (q2_in0.val[0], q2_in2.val[0]); \
908 q_s0_i = vaddq_s32 (q2_in0.val[1], q2_in2.val[1]); \
909 q_s1_r = vsubq_s32 (q2_in0.val[0], q2_in2.val[0]); \
910 q_s1_i = vsubq_s32 (q2_in0.val[1], q2_in2.val[1]); \
911 q_s2_r = vaddq_s32 (q2_in1.val[0], q2_in3.val[0]); \
912 q_s2_i = vaddq_s32 (q2_in1.val[1], q2_in3.val[1]); \
913 q_s3_r = vsubq_s32 (q2_in1.val[0], q2_in3.val[0]); \
914 q_s3_i = vsubq_s32 (q2_in1.val[1], q2_in3.val[1]); \
915 q_out2_r = vsubq_s32 (q_s0_r, q_s2_r); \
916 q_out2_i = vsubq_s32 (q_s0_i, q_s2_i); \
917 q_out0_r = vaddq_s32 (q_s0_r, q_s2_r); \
918 q_out0_i = vaddq_s32 (q_s0_i, q_s2_i);
919
920#define RADIX4x4_WITHOUT_TW_S0_SCALED \
921 q_s0_r = vhaddq_s32 (q2_in0.val[0], q2_in2.val[0]); \
922 q_s0_i = vhaddq_s32 (q2_in0.val[1], q2_in2.val[1]); \
923 q_s1_r = vhsubq_s32 (q2_in0.val[0], q2_in2.val[0]); \
924 q_s1_i = vhsubq_s32 (q2_in0.val[1], q2_in2.val[1]); \
925 q_s2_r = vhaddq_s32 (q2_in1.val[0], q2_in3.val[0]); \
926 q_s2_i = vhaddq_s32 (q2_in1.val[1], q2_in3.val[1]); \
927 q_s3_r = vhsubq_s32 (q2_in1.val[0], q2_in3.val[0]); \
928 q_s3_i = vhsubq_s32 (q2_in1.val[1], q2_in3.val[1]); \
929 q_out2_r = vhsubq_s32 (q_s0_r, q_s2_r); \
930 q_out2_i = vhsubq_s32 (q_s0_i, q_s2_i); \
931 q_out0_r = vhaddq_s32 (q_s0_r, q_s2_r); \
932 q_out0_i = vhaddq_s32 (q_s0_i, q_s2_i);
933
934
935static inline void ne10_radix4x4_without_twiddles_forward_unscaled_neon (ne10_fft_cpx_int32_t * Fout,
936 ne10_fft_cpx_int32_t * Fin,
937 ne10_int32_t stride)
938{
939 RADIX4x4_WITHOUT_TW_START
940
941 for (f_count = 0; f_count < stride; f_count += 4)
942 {
943 // load
944 RADIX4x4_WITHOUT_TW_LOAD
945
946 // radix 4 butterfly without twiddles
947 RADIX4x4_WITHOUT_TW_S0
948
949 q_out1_r = vaddq_s32 (q_s1_r, q_s3_i);
950 q_out1_i = vsubq_s32 (q_s1_i, q_s3_r);
951 q_out3_r = vsubq_s32 (q_s1_r, q_s3_i);
952 q_out3_i = vaddq_s32 (q_s1_i, q_s3_r);
953
954 RADIX4x4_WITHOUT_TW_STORE
955 }
956}
957
958static inline void ne10_radix4x4_without_twiddles_backward_unscaled_neon (ne10_fft_cpx_int32_t * Fout,
959 ne10_fft_cpx_int32_t * Fin,
960 ne10_int32_t stride)
961{
962 RADIX4x4_WITHOUT_TW_START
963
964 for (f_count = 0; f_count < stride; f_count += 4)
965 {
966 // load
967 RADIX4x4_WITHOUT_TW_LOAD
968
969 // radix 4 butterfly without twiddles
970 RADIX4x4_WITHOUT_TW_S0
971
972 q_out1_r = vsubq_s32 (q_s1_r, q_s3_i);
973 q_out1_i = vaddq_s32 (q_s1_i, q_s3_r);
974 q_out3_r = vaddq_s32 (q_s1_r, q_s3_i);
975 q_out3_i = vsubq_s32 (q_s1_i, q_s3_r);
976
977 RADIX4x4_WITHOUT_TW_STORE
978 }
979}
980
981static inline void ne10_radix4x4_without_twiddles_forward_scaled_neon (ne10_fft_cpx_int32_t * Fout,
982 ne10_fft_cpx_int32_t * Fin,
983 ne10_int32_t stride)
984{
985 RADIX4x4_WITHOUT_TW_START
986
987 for (f_count = 0; f_count < stride; f_count += 4)
988 {
989 // load
990 RADIX4x4_WITHOUT_TW_LOAD
991
992 // radix 4 butterfly without twiddles
993 RADIX4x4_WITHOUT_TW_S0_SCALED
994
995 q_out1_r = vhaddq_s32 (q_s1_r, q_s3_i);
996 q_out1_i = vhsubq_s32 (q_s1_i, q_s3_r);
997 q_out3_r = vhsubq_s32 (q_s1_r, q_s3_i);
998 q_out3_i = vhaddq_s32 (q_s1_i, q_s3_r);
999
1000 RADIX4x4_WITHOUT_TW_STORE
1001 }
1002}
1003
1004static inline void ne10_radix4x4_without_twiddles_backward_scaled_neon (ne10_fft_cpx_int32_t * Fout,
1005 ne10_fft_cpx_int32_t * Fin,
1006 ne10_int32_t stride)
1007{
1008 RADIX4x4_WITHOUT_TW_START
1009
1010 for (f_count = 0; f_count < stride; f_count += 4)
1011 {
1012 // load
1013 RADIX4x4_WITHOUT_TW_LOAD
1014
1015 // radix 4 butterfly without twiddles
1016 RADIX4x4_WITHOUT_TW_S0_SCALED
1017
1018 q_out1_r = vhsubq_s32 (q_s1_r, q_s3_i);
1019 q_out1_i = vhaddq_s32 (q_s1_i, q_s3_r);
1020 q_out3_r = vhaddq_s32 (q_s1_r, q_s3_i);
1021 q_out3_i = vhsubq_s32 (q_s1_i, q_s3_r);
1022
1023 RADIX4x4_WITHOUT_TW_STORE
1024 }
1025}
1026
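/*
 * Radix-4 stage with twiddle factors, used for every stage after the first.
 * The twiddle multiply is done with vqdmulhq_s32 (Q31 doubling multiply,
 * high half); _S1_FWD combines the partial products by multiplying with the
 * stored twiddles, while _S1_INV multiplies with their complex conjugates
 * for the inverse transform.
 */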
1027#define RADIX4x4_WITH_TW_START \
1028 ne10_int32_t m_count; \
1029 ne10_int32_t src_step = src_stride << 1; \
1030 ne10_int32_t dst_step = dst_stride << 1; \
1031 ne10_int32_t tw_step = mstride << 1; \
1032 int32_t *p_src, *p_dst, *p_tw; \
1033 int32x4x2_t q2_in0, q2_in1, q2_in2, q2_in3; \
1034 int32x4x2_t q2_tw0, q2_tw1, q2_tw2; \
1035 int32x4_t q_s1_r, q_s1_i, q_s2_r, q_s2_i, q_s3_r, q_s3_i; \
1036 int32x4_t q_tmp0, q_tmp1, q_tmp2, q_tmp3, q_tmp4, q_tmp5; \
1037 int32x4_t q_s4_r, q_s4_i, q_s5_r, q_s5_i, q_s6_r, q_s6_i, q_s7_r, q_s7_i; \
1038 int32x4x2_t q2_out0, q2_out1, q2_out2, q2_out3; \
1039 p_src = (int32_t *) Fin; \
1040 p_dst = (int32_t *) Fout; \
1041 p_tw = (int32_t *) tw;
1042
1043#define RADIX4x4_WITH_TW_LOAD \
1044 q2_in0 = vld2q_s32 (p_src); \
1045 p_src += src_step; \
1046 q2_in1 = vld2q_s32 (p_src); \
1047 p_src += src_step; \
1048 q2_in2 = vld2q_s32 (p_src); \
1049 p_src += src_step; \
1050 q2_in3 = vld2q_s32 (p_src); \
1051 p_src += src_step; \
1052 q2_tw0 = vld2q_s32 (p_tw); \
1053 p_tw += tw_step; \
1054 q2_tw1 = vld2q_s32 (p_tw); \
1055 p_tw += tw_step; \
1056 q2_tw2 = vld2q_s32 (p_tw); \
1057 q_s1_r = vqdmulhq_s32 (q2_in1.val[0], q2_tw0.val[0]); \
1058 q_s1_i = vqdmulhq_s32 (q2_in1.val[1], q2_tw0.val[0]); \
1059 q_s2_r = vqdmulhq_s32 (q2_in2.val[0], q2_tw1.val[0]); \
1060 q_s2_i = vqdmulhq_s32 (q2_in2.val[1], q2_tw1.val[0]); \
1061 q_s3_r = vqdmulhq_s32 (q2_in3.val[0], q2_tw2.val[0]); \
1062 q_s3_i = vqdmulhq_s32 (q2_in3.val[1], q2_tw2.val[0]); \
1063 q_tmp0 = vqdmulhq_s32 (q2_in1.val[1], q2_tw0.val[1]); \
1064 q_tmp1 = vqdmulhq_s32 (q2_in1.val[0], q2_tw0.val[1]); \
1065 q_tmp2 = vqdmulhq_s32 (q2_in2.val[1], q2_tw1.val[1]); \
1066 q_tmp3 = vqdmulhq_s32 (q2_in2.val[0], q2_tw1.val[1]); \
1067 q_tmp4 = vqdmulhq_s32 (q2_in3.val[1], q2_tw2.val[1]); \
1068 q_tmp5 = vqdmulhq_s32 (q2_in3.val[0], q2_tw2.val[1]);
1069
1070#define RADIX4x4_WITH_TW_STORE \
1071 vst2q_s32 (p_dst, q2_out0); \
1072 p_dst += dst_step; \
1073 vst2q_s32 (p_dst, q2_out1); \
1074 p_dst += dst_step; \
1075 vst2q_s32 (p_dst, q2_out2); \
1076 p_dst += dst_step; \
1077 vst2q_s32 (p_dst, q2_out3); \
1078 p_dst += dst_step; \
1079 p_src = p_src - src_step * 4 + 8; \
1080 p_dst = p_dst - dst_step * 4 + 8; \
1081 p_tw = p_tw - tw_step * 2 + 8;
1082
1083#define RADIX4x4_WITH_TW_S1_FWD \
1084 q_s1_r = vsubq_s32 (q_s1_r, q_tmp0); \
1085 q_s1_i = vaddq_s32 (q_s1_i, q_tmp1); \
1086 q_s2_r = vsubq_s32 (q_s2_r, q_tmp2); \
1087 q_s2_i = vaddq_s32 (q_s2_i, q_tmp3); \
1088 q_s3_r = vsubq_s32 (q_s3_r, q_tmp4); \
1089 q_s3_i = vaddq_s32 (q_s3_i, q_tmp5);
1090
1091#define RADIX4x4_WITH_TW_S1_INV \
1092 q_s1_r = vaddq_s32 (q_s1_r, q_tmp0); \
1093 q_s1_i = vsubq_s32 (q_s1_i, q_tmp1); \
1094 q_s2_r = vaddq_s32 (q_s2_r, q_tmp2); \
1095 q_s2_i = vsubq_s32 (q_s2_i, q_tmp3); \
1096 q_s3_r = vaddq_s32 (q_s3_r, q_tmp4); \
1097 q_s3_i = vsubq_s32 (q_s3_i, q_tmp5);
1098
1099
1100#define RADIX4x4_WITH_TW_LS_02 \
1101 q_s4_r = vaddq_s32 (q2_in0.val[0], q_s2_r); \
1102 q_s4_i = vaddq_s32 (q2_in0.val[1], q_s2_i); \
1103 q_s5_r = vsubq_s32 (q2_in0.val[0], q_s2_r); \
1104 q_s5_i = vsubq_s32 (q2_in0.val[1], q_s2_i); \
1105 q_s6_r = vaddq_s32 (q_s1_r, q_s3_r); \
1106 q_s6_i = vaddq_s32 (q_s1_i, q_s3_i); \
1107 q_s7_r = vsubq_s32 (q_s1_r, q_s3_r); \
1108 q_s7_i = vsubq_s32 (q_s1_i, q_s3_i); \
1109 q2_out2.val[0] = vsubq_s32 (q_s4_r, q_s6_r); \
1110 q2_out2.val[1] = vsubq_s32 (q_s4_i, q_s6_i); \
1111 q2_out0.val[0] = vaddq_s32 (q_s4_r, q_s6_r); \
1112 q2_out0.val[1] = vaddq_s32 (q_s4_i, q_s6_i);
1113
1114#define RADIX4x4_WITH_TW_LS_02_SCALED \
1115 q_s4_r = vhaddq_s32 (q2_in0.val[0], q_s2_r); \
1116 q_s4_i = vhaddq_s32 (q2_in0.val[1], q_s2_i); \
1117 q_s5_r = vhsubq_s32 (q2_in0.val[0], q_s2_r); \
1118 q_s5_i = vhsubq_s32 (q2_in0.val[1], q_s2_i); \
1119 q_s6_r = vhaddq_s32 (q_s1_r, q_s3_r); \
1120 q_s6_i = vhaddq_s32 (q_s1_i, q_s3_i); \
1121 q_s7_r = vhsubq_s32 (q_s1_r, q_s3_r); \
1122 q_s7_i = vhsubq_s32 (q_s1_i, q_s3_i); \
1123 q2_out2.val[0] = vhsubq_s32 (q_s4_r, q_s6_r); \
1124 q2_out2.val[1] = vhsubq_s32 (q_s4_i, q_s6_i); \
1125 q2_out0.val[0] = vhaddq_s32 (q_s4_r, q_s6_r); \
1126 q2_out0.val[1] = vhaddq_s32 (q_s4_i, q_s6_i);
1127
1128
1129static inline void ne10_radix4x4_with_twiddles_forward_unscaled_neon (ne10_fft_cpx_int32_t * Fout,
1130 ne10_fft_cpx_int32_t * Fin,
1131 ne10_fft_cpx_int32_t * tw,
1132 ne10_int32_t src_stride,
1133 ne10_int32_t dst_stride,
1134 ne10_int32_t mstride)
1135{
1136 RADIX4x4_WITH_TW_START
1137
1138 for (m_count = 0; m_count < mstride; m_count += 4)
1139 {
1140 // load
1141 RADIX4x4_WITH_TW_LOAD
1142 RADIX4x4_WITH_TW_S1_FWD
1143
1144 RADIX4x4_WITH_TW_LS_02
1145
1146 q2_out1.val[0] = vaddq_s32 (q_s5_r, q_s7_i);
1147 q2_out1.val[1] = vsubq_s32 (q_s5_i, q_s7_r);
1148 q2_out3.val[0] = vsubq_s32 (q_s5_r, q_s7_i);
1149 q2_out3.val[1] = vaddq_s32 (q_s5_i, q_s7_r);
1150
1151 // store
1152 RADIX4x4_WITH_TW_STORE
1153 }
1154}
1155
1156
1157static inline void ne10_radix4x4_with_twiddles_backward_unscaled_neon (ne10_fft_cpx_int32_t * Fout,
1158 ne10_fft_cpx_int32_t * Fin,
1159 ne10_fft_cpx_int32_t * tw,
1160 ne10_int32_t src_stride,
1161 ne10_int32_t dst_stride,
1162 ne10_int32_t mstride)
1163{
1164 RADIX4x4_WITH_TW_START
1165
1166 for (m_count = 0; m_count < mstride; m_count += 4)
1167 {
1168 // load
1169 RADIX4x4_WITH_TW_LOAD
1170 RADIX4x4_WITH_TW_S1_INV
1171
1172 RADIX4x4_WITH_TW_LS_02
1173
1174 q2_out1.val[0] = vsubq_s32 (q_s5_r, q_s7_i);
1175 q2_out1.val[1] = vaddq_s32 (q_s5_i, q_s7_r);
1176 q2_out3.val[0] = vaddq_s32 (q_s5_r, q_s7_i);
1177 q2_out3.val[1] = vsubq_s32 (q_s5_i, q_s7_r);
1178
1179 // store
1180 RADIX4x4_WITH_TW_STORE
1181 }
1182}
1183
1184
1185
1186static inline void ne10_radix4x4_with_twiddles_forward_scaled_neon (ne10_fft_cpx_int32_t * Fout,
1187 ne10_fft_cpx_int32_t * Fin,
1188 ne10_fft_cpx_int32_t * tw,
1189 ne10_int32_t src_stride,
1190 ne10_int32_t dst_stride,
1191 ne10_int32_t mstride)
1192{
1193 RADIX4x4_WITH_TW_START
1194
1195 for (m_count = 0; m_count < mstride; m_count += 4)
1196 {
1197 // load
1198 RADIX4x4_WITH_TW_LOAD
1199 RADIX4x4_WITH_TW_S1_FWD
1200
1201 RADIX4x4_WITH_TW_LS_02_SCALED
1202
1203 q2_out1.val[0] = vhaddq_s32 (q_s5_r, q_s7_i);
1204 q2_out1.val[1] = vhsubq_s32 (q_s5_i, q_s7_r);
1205 q2_out3.val[0] = vhsubq_s32 (q_s5_r, q_s7_i);
1206 q2_out3.val[1] = vhaddq_s32 (q_s5_i, q_s7_r);
1207
1208 // store
1209 RADIX4x4_WITH_TW_STORE
1210 }
1211}
1212
1213static inline void ne10_radix4x4_with_twiddles_backward_scaled_neon (ne10_fft_cpx_int32_t * Fout,
1214 ne10_fft_cpx_int32_t * Fin,
1215 ne10_fft_cpx_int32_t * tw,
1216 ne10_int32_t src_stride,
1217 ne10_int32_t dst_stride,
1218 ne10_int32_t mstride)
1219{
1220 RADIX4x4_WITH_TW_START
1221
1222 for (m_count = 0; m_count < mstride; m_count += 4)
1223 {
1224 // load
1225 RADIX4x4_WITH_TW_LOAD
1226 RADIX4x4_WITH_TW_S1_INV
1227
1228 RADIX4x4_WITH_TW_LS_02_SCALED
1229
1230 q2_out1.val[0] = vhsubq_s32 (q_s5_r, q_s7_i);
1231 q2_out1.val[1] = vhaddq_s32 (q_s5_i, q_s7_r);
1232 q2_out3.val[0] = vhaddq_s32 (q_s5_r, q_s7_i);
1233 q2_out3.val[1] = vhsubq_s32 (q_s5_i, q_s7_r);
1234
1235 // store
1236 RADIX4x4_WITH_TW_STORE
1237 }
1238}
1239
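/*
 * The two macros below expand into the scaled and unscaled mixed-radix
 * transforms (see the instantiations after the macro definitions). The first
 * stage uses the radix-8 or plain radix-4 kernel depending on the
 * factorization recorded in factors[], the remaining stages use the radix-4
 * kernel with twiddles, and the stages ping-pong between "buffer" and "Fout"
 * so that the last stage writes into the caller's output array.
 */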
1240#define ne10_mixed_radix_fft_forward_int32_neon(scaled) \
1241void ne10_mixed_radix_fft_forward_int32_##scaled##_neon (ne10_fft_cpx_int32_t * Fout, \
1242 ne10_fft_cpx_int32_t * Fin, \
1243 ne10_int32_t * factors, \
1244 ne10_fft_cpx_int32_t * twiddles, \
1245 ne10_fft_cpx_int32_t * buffer) \
1246{ \
1247 ne10_int32_t fstride, mstride, N; \
1248 ne10_int32_t fstride1; \
1249 ne10_int32_t f_count; \
1250 ne10_int32_t stage_count; \
1251 \
1252 ne10_fft_cpx_int32_t *Fin1, *Fout1; \
1253 ne10_fft_cpx_int32_t *Fout_ls = Fout; \
1254 ne10_fft_cpx_int32_t *Ftmp; \
1255 ne10_fft_cpx_int32_t *tw, *tw1; \
1256 \
1257 /* init fstride, mstride, N */ \
1258 stage_count = factors[0]; \
1259 fstride = factors[1]; \
1260 mstride = factors[ (stage_count << 1) - 1 ]; \
1261 N = factors[ stage_count << 1 ]; \
1262 \
1263 /* the first stage */ \
1264 Fin1 = Fin; \
1265 Fout1 = Fout; \
1266 if (N == 2) \
1267 { \
1268 N = fstride >> 1;\
1269 tw = twiddles; \
1270 fstride1 = fstride >> 2; \
1271 ne10_radix8x4_forward_##scaled##_neon (Fout, Fin, fstride1);\
1272 \
1273 tw += 6; \
1274 mstride <<= 2; \
1275 fstride >>= 4; \
1276 stage_count -= 2; \
1277 \
1278 Ftmp = buffer; \
1279 buffer = Fout; \
1280 Fout = Ftmp; \
1281 } \
1282 else if (N == 4) \
1283 { \
1284 ne10_radix4x4_without_twiddles_forward_##scaled##_neon (Fout, Fin, fstride); \
1285 N = fstride; \
1286 Ftmp = buffer; \
1287 buffer = Fout; \
1288 Fout = Ftmp; \
1289 /* update address for other stages*/ \
1290 stage_count--; \
1291 tw = twiddles; \
1292 fstride >>= 2; \
1293 } \
1294 /* others but the last one*/ \
1295 for (; stage_count > 1 ; stage_count--) \
1296 { \
1297 Fin1 = buffer; \
1298 for (f_count = 0; f_count < fstride; f_count ++) \
1299 { \
1300 Fout1 = & Fout[ f_count * mstride << 2 ]; \
1301 tw1 = tw; \
1302 ne10_radix4x4_with_twiddles_forward_##scaled##_neon (Fout1, Fin1, tw1, N, mstride, mstride); \
1303 Fin1 += mstride; \
1304 } \
1305 tw += mstride * 3; \
1306 mstride <<= 2; \
1307 Ftmp = buffer; \
1308 buffer = Fout; \
1309 Fout = Ftmp; \
1310 fstride >>= 2; \
1311 }\
1312 /* the last one*/ \
1313 if (stage_count) \
1314 { \
1315 Fin1 = buffer; \
1316 Fout1 = Fout_ls; \
1317 for (f_count = 0; f_count < fstride; f_count ++) \
1318 { \
1319 tw1 = tw; \
1320 ne10_radix4x4_with_twiddles_forward_##scaled##_neon (Fout1, Fin1, tw1, N, N, mstride); \
1321 Fin1 += mstride; \
1322 Fout1 += mstride; \
1323 } \
1324 } \
1325}
1326
1327#define ne10_mixed_radix_fft_backward_int32_neon(scaled) \
1328void ne10_mixed_radix_fft_backward_int32_##scaled##_neon (ne10_fft_cpx_int32_t * Fout, \
1329 ne10_fft_cpx_int32_t * Fin, \
1330 ne10_int32_t * factors, \
1331 ne10_fft_cpx_int32_t * twiddles, \
1332 ne10_fft_cpx_int32_t * buffer) \
1333{ \
1334 ne10_int32_t fstride, mstride, N; \
1335 ne10_int32_t fstride1; \
1336 ne10_int32_t f_count; \
1337 ne10_int32_t stage_count; \
1338 \
1339 ne10_fft_cpx_int32_t *Fin1, *Fout1; \
1340 ne10_fft_cpx_int32_t *Fout_ls = Fout; \
1341 ne10_fft_cpx_int32_t *Ftmp; \
1342 ne10_fft_cpx_int32_t *tw, *tw1; \
1343 \
1344 /* init fstride, mstride, N */ \
1345 stage_count = factors[0]; \
1346 fstride = factors[1]; \
1347 mstride = factors[ (stage_count << 1) - 1 ]; \
1348 N = factors[ stage_count << 1 ]; \
1349 \
1350 /* the first stage */ \
1351 Fin1 = Fin; \
1352 Fout1 = Fout; \
1353 if (N == 2) \
1354 { \
1355 N = fstride >> 1;\
1356 tw = twiddles; \
1357 fstride1 = fstride >> 2; \
1358 ne10_radix8x4_backward_##scaled##_neon (Fout, Fin, fstride1);\
1359 \
1360 tw += 6; \
1361 mstride <<= 2; \
1362 fstride >>= 4; \
1363 stage_count -= 2; \
1364 \
1365 Ftmp = buffer; \
1366 buffer = Fout; \
1367 Fout = Ftmp; \
1368 } \
1369 else if (N == 4) \
1370 { \
1371 ne10_radix4x4_without_twiddles_backward_##scaled##_neon (Fout, Fin, fstride); \
1372 N = fstride; \
1373 Ftmp = buffer; \
1374 buffer = Fout; \
1375 Fout = Ftmp; \
1376 /* update address for other stages*/ \
1377 stage_count--; \
1378 tw = twiddles; \
1379 fstride >>= 2; \
1380 } \
1381 /* others but the last one*/ \
1382 for (; stage_count > 1 ; stage_count--) \
1383 { \
1384 Fin1 = buffer; \
1385 for (f_count = 0; f_count < fstride; f_count ++) \
1386 { \
1387 Fout1 = & Fout[ f_count * mstride << 2 ]; \
1388 tw1 = tw; \
1389 ne10_radix4x4_with_twiddles_backward_##scaled##_neon (Fout1, Fin1, tw1, N, mstride, mstride); \
1390 Fin1 += mstride; \
1391 } \
1392 tw += mstride * 3; \
1393 mstride <<= 2; \
1394 Ftmp = buffer; \
1395 buffer = Fout; \
1396 Fout = Ftmp; \
1397 fstride >>= 2; \
1398 }\
1399 /* the last one*/ \
1400 if (stage_count) \
1401 { \
1402 Fin1 = buffer; \
1403 Fout1 = Fout_ls; \
1404 for (f_count = 0; f_count < fstride; f_count ++) \
1405 { \
1406 tw1 = tw; \
1407 ne10_radix4x4_with_twiddles_backward_##scaled##_neon (Fout1, Fin1, tw1, N, N, mstride); \
1408 Fin1 += mstride; \
1409 Fout1 += mstride; \
1410 } \
1411 } \
1412}
1413
1414ne10_mixed_radix_fft_forward_int32_neon (unscaled)
1415ne10_mixed_radix_fft_forward_int32_neon (scaled)
1416ne10_mixed_radix_fft_backward_int32_neon (unscaled)
1417ne10_mixed_radix_fft_backward_int32_neon (scaled)
1418
1419
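/*
 * Split step of the packed-real forward FFT: it combines the output of the
 * ncfft-point complex FFT (run on the even/odd-packed real samples) into the
 * ncfft+1 unique bins of the real transform. The NEON path handles four bins
 * per iteration; the scalar loop at the bottom covers small ncfft.
 */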
1420static void ne10_fft_split_r2c_1d_int32_neon (ne10_fft_cpx_int32_t *dst,
1421 const ne10_fft_cpx_int32_t *src,
1422 ne10_fft_cpx_int32_t *twiddles,
1423 ne10_int32_t ncfft,
1424 ne10_int32_t scaled_flag)
1425{
1426 ne10_int32_t k;
1427 ne10_int32_t count = ncfft / 2;
1428 ne10_fft_cpx_int32_t fpnk, fpk, f1k, f2k, tw, tdc;
1429 int32x4x2_t q2_fpk, q2_fpnk, q2_tw, q2_dst, q2_dst2;
1430 int32x4_t q_fpnk_r, q_fpnk_i;
1431 int32x4_t q_f1k_r, q_f1k_i, q_f2k_r, q_f2k_i;
1432 int32x4_t q_tw_r, q_tw_i;
1433 int32x4_t q_tmp0, q_tmp1, q_tmp2, q_tmp3;
1434 int32x4_t q_dst2_r, q_dst2_i;
1435 int32_t *p_src, *p_src2, *p_dst, *p_dst2, *p_twiddles;
1436
1437 tdc.r = src[0].r;
1438 tdc.i = src[0].i;
1439
1440 if (scaled_flag)
1441 NE10_F2I32_FIXDIV (tdc, 2);
1442
1443 dst[0].r = tdc.r + tdc.i;
1444 dst[ncfft].r = tdc.r - tdc.i;
1445 dst[ncfft].i = dst[0].i = 0;
1446 if (count >= 4)
1447 {
1448
1449 if (scaled_flag)
1450 {
1451 for (k = 1; k <= count ; k += 4)
1452 {
1453 p_src = (int32_t*) (& (src[k]));
1454 p_src2 = (int32_t*) (& (src[ncfft - k - 3]));
1455 p_twiddles = (int32_t*) (& (twiddles[k - 1]));
1456 p_dst = (int32_t*) (& (dst[k]));
1457 p_dst2 = (int32_t*) (& (dst[ncfft - k - 3]));
1458
1459 q2_fpk = vld2q_s32 (p_src);
1460 q2_fpnk = vld2q_s32 (p_src2);
1461
1462 q2_tw = vld2q_s32 (p_twiddles);
1463 q2_fpnk.val[0] = vrev64q_s32 (q2_fpnk.val[0]);
1464 q2_fpnk.val[1] = vrev64q_s32 (q2_fpnk.val[1]);
1465 q_fpnk_r = vcombine_s32 (vget_high_s32 (q2_fpnk.val[0]), vget_low_s32 (q2_fpnk.val[0]));
1466 q_fpnk_i = vcombine_s32 (vget_high_s32 (q2_fpnk.val[1]), vget_low_s32 (q2_fpnk.val[1]));
1467 q_fpnk_i = vnegq_s32 (q_fpnk_i);
1468
1469 q_f1k_r = vhaddq_s32 (q2_fpk.val[0], q_fpnk_r);
1470 q_f1k_i = vhaddq_s32 (q2_fpk.val[1], q_fpnk_i);
1471
1472 q_f2k_r = vhsubq_s32 (q2_fpk.val[0], q_fpnk_r);
1473 q_f2k_i = vhsubq_s32 (q2_fpk.val[1], q_fpnk_i);
1474
1475 q_tmp0 = vqdmulhq_s32 (q_f2k_r, q2_tw.val[0]);
1476 q_tmp1 = vqdmulhq_s32 (q_f2k_i, q2_tw.val[1]);
1477 q_tmp2 = vqdmulhq_s32 (q_f2k_r, q2_tw.val[1]);
1478 q_tmp3 = vqdmulhq_s32 (q_f2k_i, q2_tw.val[0]);
1479 q_tw_r = vsubq_s32 (q_tmp0, q_tmp1);
1480 q_tw_i = vaddq_s32 (q_tmp2, q_tmp3);
1481
1482 q_dst2_r = vhsubq_s32 (q_f1k_r, q_tw_r);
1483 q_dst2_i = vhsubq_s32 (q_tw_i, q_f1k_i);
1484 q2_dst.val[0] = vhaddq_s32 (q_f1k_r, q_tw_r);
1485 q2_dst.val[1] = vhaddq_s32 (q_f1k_i, q_tw_i);
1486 q_dst2_r = vrev64q_s32 (q_dst2_r);
1487 q_dst2_i = vrev64q_s32 (q_dst2_i);
1488 q2_dst2.val[0] = vcombine_s32 (vget_high_s32 (q_dst2_r), vget_low_s32 (q_dst2_r));
1489 q2_dst2.val[1] = vcombine_s32 (vget_high_s32 (q_dst2_i), vget_low_s32 (q_dst2_i));
1490 vst2q_s32 (p_dst, q2_dst);
1491 vst2q_s32 (p_dst2, q2_dst2);
1492
1493 }
1494 }
1495 else
1496 {
1497 for (k = 1; k <= count ; k += 4)
1498 {
1499 p_src = (int32_t*) (& (src[k]));
1500 p_src2 = (int32_t*) (& (src[ncfft - k - 3]));
1501 p_twiddles = (int32_t*) (& (twiddles[k - 1]));
1502 p_dst = (int32_t*) (& (dst[k]));
1503 p_dst2 = (int32_t*) (& (dst[ncfft - k - 3]));
1504
1505 q2_fpk = vld2q_s32 (p_src);
1506 q2_fpnk = vld2q_s32 (p_src2);
1507
1508 q2_tw = vld2q_s32 (p_twiddles);
1509 q2_fpnk.val[0] = vrev64q_s32 (q2_fpnk.val[0]);
1510 q2_fpnk.val[1] = vrev64q_s32 (q2_fpnk.val[1]);
1511 q_fpnk_r = vcombine_s32 (vget_high_s32 (q2_fpnk.val[0]), vget_low_s32 (q2_fpnk.val[0]));
1512 q_fpnk_i = vcombine_s32 (vget_high_s32 (q2_fpnk.val[1]), vget_low_s32 (q2_fpnk.val[1]));
1513 q_fpnk_i = vnegq_s32 (q_fpnk_i);
1514
1515 q_f1k_r = vaddq_s32 (q2_fpk.val[0], q_fpnk_r);
1516 q_f1k_i = vaddq_s32 (q2_fpk.val[1], q_fpnk_i);
1517
1518 q_f2k_r = vsubq_s32 (q2_fpk.val[0], q_fpnk_r);
1519 q_f2k_i = vsubq_s32 (q2_fpk.val[1], q_fpnk_i);
1520
1521 q_tmp0 = vqdmulhq_s32 (q_f2k_r, q2_tw.val[0]);
1522 q_tmp1 = vqdmulhq_s32 (q_f2k_i, q2_tw.val[1]);
1523 q_tmp2 = vqdmulhq_s32 (q_f2k_r, q2_tw.val[1]);
1524 q_tmp3 = vqdmulhq_s32 (q_f2k_i, q2_tw.val[0]);
1525 q_tw_r = vsubq_s32 (q_tmp0, q_tmp1);
1526 q_tw_i = vaddq_s32 (q_tmp2, q_tmp3);
1527
1528 q_dst2_r = vhsubq_s32 (q_f1k_r, q_tw_r);
1529 q_dst2_i = vhsubq_s32 (q_tw_i, q_f1k_i);
1530 q2_dst.val[0] = vhaddq_s32 (q_f1k_r, q_tw_r);
1531 q2_dst.val[1] = vhaddq_s32 (q_f1k_i, q_tw_i);
1532 q_dst2_r = vrev64q_s32 (q_dst2_r);
1533 q_dst2_i = vrev64q_s32 (q_dst2_i);
1534 q2_dst2.val[0] = vcombine_s32 (vget_high_s32 (q_dst2_r), vget_low_s32 (q_dst2_r));
1535 q2_dst2.val[1] = vcombine_s32 (vget_high_s32 (q_dst2_i), vget_low_s32 (q_dst2_i));
1536 vst2q_s32 (p_dst, q2_dst);
1537 vst2q_s32 (p_dst2, q2_dst2);
1538
1539 }
1540 }
1541 }
1542 else
1543 {
1544
1545 for (k = 1; k <= ncfft / 2 ; ++k)
1546 {
1547 fpk = src[k];
1548 fpnk.r = src[ncfft - k].r;
1549 fpnk.i = - src[ncfft - k].i;
1550 if (scaled_flag)
1551 {
1552 NE10_F2I32_FIXDIV (fpk, 2);
1553 NE10_F2I32_FIXDIV (fpnk, 2);
1554 }
1555
1556 f1k.r = fpk.r + fpnk.r;
1557 f1k.i = fpk.i + fpnk.i;
1558
1559 f2k.r = fpk.r - fpnk.r;
1560 f2k.i = fpk.i - fpnk.i;
1561
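 // Q31 complex multiply f2k * twiddles[k-1]: each 64-bit product is shifted
 // right by 32 and doubled, mirroring the vqdmulhq_s32 used in the NEON path.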
1562 tw.r = ( ( (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) f2k.r * (twiddles[k - 1]).r) >> 32)) - ( (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) f2k.i * (twiddles[k - 1]).i) >> 32))) << 1;
1563 tw.i = ( ( (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) f2k.r * (twiddles[k - 1]).i) >> 32)) + ( (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) f2k.i * (twiddles[k - 1]).r) >> 32))) << 1;
1564
1565 dst[k].r = (f1k.r + tw.r) >> 1;
1566 dst[k].i = (f1k.i + tw.i) >> 1;
1567 dst[ncfft - k].r = (f1k.r - tw.r) >> 1;
1568 dst[ncfft - k].i = (tw.i - f1k.i) >> 1;
1569 }
1570 }
1571}
1572
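/*
 * Inverse of the split step above: it rebuilds the half-length complex
 * spectrum from the real-FFT bins so that an ncfft-point inverse complex FFT
 * recovers the packed real samples. The structure mirrors
 * ne10_fft_split_r2c_1d_int32_neon (NEON path for count >= 4, scalar
 * fallback otherwise).
 */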
1573static void ne10_fft_split_c2r_1d_int32_neon (ne10_fft_cpx_int32_t *dst,
1574 const ne10_fft_cpx_int32_t *src,
1575 ne10_fft_cpx_int32_t *twiddles,
1576 ne10_int32_t ncfft,
1577 ne10_int32_t scaled_flag)
1578{
1579
1580 ne10_int32_t k;
1581 ne10_int32_t count = ncfft / 2;
1582 ne10_fft_cpx_int32_t fk, fnkc, fek, fok, tmp;
1583 int32x4x2_t q2_fk, q2_fnkc, q2_tw, q2_dst, q2_dst2;
1584 int32x4_t q_fnkc_r, q_fnkc_i;
1585 int32x4_t q_fek_r, q_fek_i, q_fok_r, q_fok_i;
1586 int32x4_t q_tmp0, q_tmp1, q_tmp2, q_tmp3;
1587 int32x4_t q_dst2_r, q_dst2_i;
1588 int32_t *p_src, *p_src2, *p_dst, *p_dst2, *p_twiddles;
1589
1590
1591 dst[0].r = src[0].r + src[ncfft].r;
1592 dst[0].i = src[0].r - src[ncfft].r;
1593 if (scaled_flag)
1594 NE10_F2I32_FIXDIV (dst[0], 2);
1595 if (count >= 4)
1596 {
1597 if (scaled_flag)
1598 {
1599 for (k = 1; k <= count ; k += 4)
1600 {
1601 p_src = (int32_t*) (& (src[k]));
1602 p_src2 = (int32_t*) (& (src[ncfft - k - 3]));
1603 p_twiddles = (int32_t*) (& (twiddles[k - 1]));
1604 p_dst = (int32_t*) (& (dst[k]));
1605 p_dst2 = (int32_t*) (& (dst[ncfft - k - 3]));
1606
1607 q2_fk = vld2q_s32 (p_src);
1608 q2_fnkc = vld2q_s32 (p_src2);
1609 q2_tw = vld2q_s32 (p_twiddles);
1610 q2_fnkc.val[0] = vrev64q_s32 (q2_fnkc.val[0]);
1611 q2_fnkc.val[1] = vrev64q_s32 (q2_fnkc.val[1]);
1612 q_fnkc_r = vcombine_s32 (vget_high_s32 (q2_fnkc.val[0]), vget_low_s32 (q2_fnkc.val[0]));
1613 q_fnkc_i = vcombine_s32 (vget_high_s32 (q2_fnkc.val[1]), vget_low_s32 (q2_fnkc.val[1]));
1614 q_fnkc_i = vnegq_s32 (q_fnkc_i);
1615
1616 q_fek_r = vhaddq_s32 (q2_fk.val[0], q_fnkc_r);
1617 q_fek_i = vhaddq_s32 (q2_fk.val[1], q_fnkc_i);
1618 q_tmp0 = vhsubq_s32 (q2_fk.val[0], q_fnkc_r);
1619 q_tmp1 = vhsubq_s32 (q2_fk.val[1], q_fnkc_i);
1620
1621 q_fok_r = vqdmulhq_s32 (q_tmp0, q2_tw.val[0]);
1622 q_fok_i = vqdmulhq_s32 (q_tmp1, q2_tw.val[0]);
1623 q_tmp2 = vqdmulhq_s32 (q_tmp1, q2_tw.val[1]);
1624 q_tmp3 = vqdmulhq_s32 (q_tmp0, q2_tw.val[1]);
1625 q_fok_r = vaddq_s32 (q_fok_r, q_tmp2);
1626 q_fok_i = vsubq_s32 (q_fok_i, q_tmp3);
1627
1628 q_dst2_r = vsubq_s32 (q_fek_r, q_fok_r);
1629 q_dst2_i = vsubq_s32 (q_fok_i, q_fek_i);
1630 q2_dst.val[0] = vaddq_s32 (q_fek_r, q_fok_r);
1631 q2_dst.val[1] = vaddq_s32 (q_fek_i, q_fok_i);
1632 q_dst2_r = vrev64q_s32 (q_dst2_r);
1633 q_dst2_i = vrev64q_s32 (q_dst2_i);
1634 q2_dst2.val[0] = vcombine_s32 (vget_high_s32 (q_dst2_r), vget_low_s32 (q_dst2_r));
1635 q2_dst2.val[1] = vcombine_s32 (vget_high_s32 (q_dst2_i), vget_low_s32 (q_dst2_i));
1636 vst2q_s32 (p_dst, q2_dst);
1637 vst2q_s32 (p_dst2, q2_dst2);
1638
1639 }
1640
1641 }
1642 else
1643 {
1644 for (k = 1; k <= count ; k += 4)
1645 {
1646 p_src = (int32_t*) (& (src[k]));
1647 p_src2 = (int32_t*) (& (src[ncfft - k - 3]));
1648 p_twiddles = (int32_t*) (& (twiddles[k - 1]));
1649 p_dst = (int32_t*) (& (dst[k]));
1650 p_dst2 = (int32_t*) (& (dst[ncfft - k - 3]));
1651
1652 q2_fk = vld2q_s32 (p_src);
1653 q2_fnkc = vld2q_s32 (p_src2);
1654 q2_tw = vld2q_s32 (p_twiddles);
1655 q2_fnkc.val[0] = vrev64q_s32 (q2_fnkc.val[0]);
1656 q2_fnkc.val[1] = vrev64q_s32 (q2_fnkc.val[1]);
1657 q_fnkc_r = vcombine_s32 (vget_high_s32 (q2_fnkc.val[0]), vget_low_s32 (q2_fnkc.val[0]));
1658 q_fnkc_i = vcombine_s32 (vget_high_s32 (q2_fnkc.val[1]), vget_low_s32 (q2_fnkc.val[1]));
1659 q_fnkc_i = vnegq_s32 (q_fnkc_i);
1660
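                // Unscaled variant of the loop above: identical recombination, but with
                // full-width adds/subs since no extra scaling is requested.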
1661 q_fek_r = vaddq_s32 (q2_fk.val[0], q_fnkc_r);
1662 q_fek_i = vaddq_s32 (q2_fk.val[1], q_fnkc_i);
1663 q_tmp0 = vsubq_s32 (q2_fk.val[0], q_fnkc_r);
1664 q_tmp1 = vsubq_s32 (q2_fk.val[1], q_fnkc_i);
1665
1666 q_fok_r = vqdmulhq_s32 (q_tmp0, q2_tw.val[0]);
1667 q_fok_i = vqdmulhq_s32 (q_tmp1, q2_tw.val[0]);
1668 q_tmp2 = vqdmulhq_s32 (q_tmp1, q2_tw.val[1]);
1669 q_tmp3 = vqdmulhq_s32 (q_tmp0, q2_tw.val[1]);
1670 q_fok_r = vaddq_s32 (q_fok_r, q_tmp2);
1671 q_fok_i = vsubq_s32 (q_fok_i, q_tmp3);
1672
1673 q_dst2_r = vsubq_s32 (q_fek_r, q_fok_r);
1674 q_dst2_i = vsubq_s32 (q_fok_i, q_fek_i);
1675 q2_dst.val[0] = vaddq_s32 (q_fek_r, q_fok_r);
1676 q2_dst.val[1] = vaddq_s32 (q_fek_i, q_fok_i);
1677 q_dst2_r = vrev64q_s32 (q_dst2_r);
1678 q_dst2_i = vrev64q_s32 (q_dst2_i);
1679 q2_dst2.val[0] = vcombine_s32 (vget_high_s32 (q_dst2_r), vget_low_s32 (q_dst2_r));
1680 q2_dst2.val[1] = vcombine_s32 (vget_high_s32 (q_dst2_i), vget_low_s32 (q_dst2_i));
1681 vst2q_s32 (p_dst, q2_dst);
1682 vst2q_s32 (p_dst2, q2_dst2);
1683
1684 }
1685 }
1686 }
1687 else
1688 {
1689
1690 for (k = 1; k <= ncfft / 2; k++)
1691 {
1692 fk = src[k];
1693 fnkc.r = src[ncfft - k].r;
1694 fnkc.i = -src[ncfft - k].i;
1695 if (scaled_flag)
1696 {
1697 NE10_F2I32_FIXDIV (fk, 2);
1698 NE10_F2I32_FIXDIV (fnkc, 2);
1699 }
1700
1701 fek.r = fk.r + fnkc.r;
1702 fek.i = fk.i + fnkc.i;
1703
1704 tmp.r = fk.r - fnkc.r;
1705 tmp.i = fk.i - fnkc.i;
1706
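            // Odd part fok = tmp * conj(twiddle) in Q31: 64-bit product, >> 32, doubled --
            // the scalar counterpart of the vqdmulhq_s32 multiplies in the NEON loops above.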
1707 fok.r = ( ( (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) tmp.r * (twiddles[k - 1]).r) >> 32)) + ( (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) tmp.i * (twiddles[k - 1]).i) >> 32))) << 1;
1708 fok.i = ( ( (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) tmp.i * (twiddles[k - 1]).r) >> 32)) - ( (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) tmp.r * (twiddles[k - 1]).i) >> 32))) << 1;
1709
1710 dst[k].r = fek.r + fok.r;
1711 dst[k].i = fek.i + fok.i;
1712
1713 dst[ncfft - k].r = fek.r - fok.r;
1714 dst[ncfft - k].i = fok.i - fek.i;
1715 }
1716 }
1717}
1718
1719
void ne10_fft_c2c_1d_int32_neon (ne10_fft_cpx_int32_t *fout,
                                 ne10_fft_cpx_int32_t *fin,
                                 ne10_fft_cfg_int32_t cfg,
1742 ne10_int32_t inverse_fft,
1743 ne10_int32_t scaled_flag)
1744{
1745 // For inputs shorter than 16 points, fall back to the plain C version:
1746 // NEON yields little improvement at these sizes.
1747 if (cfg->nfft < 16)
1748 {
1749 ne10_fft_c2c_1d_int32_c (fout, fin, cfg, inverse_fft, scaled_flag);
1750 return;
1751 }
1752
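    // cfg->factors[0] holds the stage count; the entry after the per-stage factor
    // pairs records which algorithm the plan was built for.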
1753 ne10_int32_t stage_count = cfg->factors[0];
1754 ne10_int32_t algorithm_flag = cfg->factors[2 * (stage_count + 1)];
1755
1756 assert ((algorithm_flag == NE10_FFT_ALG_24)
1757 || (algorithm_flag == NE10_FFT_ALG_ANY));
1758
1759 // Generic lengths (NE10_FFT_ALG_ANY) are handled here;
1760 // the function returns from within this branch.
1761 if (algorithm_flag == NE10_FFT_ALG_ANY)
1762 {
1763 if (inverse_fft)
1764 {
1765 ne10_mixed_radix_generic_butterfly_inverse_int32_neon (fout, fin,
1766 cfg->factors, cfg->twiddles, cfg->buffer, scaled_flag);
1767 }
1768 else
1769 {
1770 ne10_mixed_radix_generic_butterfly_int32_neon (fout, fin,
1771 cfg->factors, cfg->twiddles, cfg->buffer, scaled_flag);
1772 }
1773 return;
1774 }
1775
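    // Radix-2/4 sizes: 4, 8 and 16 points use dedicated kernels, larger sizes the
    // mixed-radix NEON butterflies; the scaled variants apply per-stage scaling to
    // guard against fixed-point overflow.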
1776 if (scaled_flag)
1777 {
1778 if (inverse_fft)
1779 {
1780 switch (cfg->nfft)
1781 {
1782 case 4:
1783 ne10_fft4_backward_int32_scaled (fout, fin);
1784 break;
1785 case 8:
1786 ne10_fft8_backward_int32_scaled (fout, fin);
1787 break;
1788 case 16:
1789 ne10_fft16_backward_int32_scaled_neon (fout, fin, cfg->twiddles);
1790 break;
1791 default:
1792 ne10_mixed_radix_fft_backward_int32_scaled_neon (fout, fin, cfg->factors, cfg->twiddles, cfg->buffer);
1793 break;
1794 }
1795 }
1796 else
1797 {
1798 switch (cfg->nfft)
1799 {
1800 case 4:
1801 ne10_fft4_forward_int32_scaled (fout, fin);
1802 break;
1803 case 8:
1804 ne10_fft8_forward_int32_scaled (fout, fin);
1805 break;
1806 case 16:
1807 ne10_fft16_forward_int32_scaled_neon (fout, fin, cfg->twiddles);
1808 break;
1809 default:
1810 ne10_mixed_radix_fft_forward_int32_scaled_neon (fout, fin, cfg->factors, cfg->twiddles, cfg->buffer);
1811 break;
1812 }
1813 }
1814 }
1815 else
1816 {
1817 if (inverse_fft)
1818 {
1819 switch (cfg->nfft)
1820 {
1821 case 4:
1822 ne10_fft4_backward_int32_unscaled (fout, fin);
1823 break;
1824 case 8:
1825 ne10_fft8_backward_int32_unscaled (fout, fin);
1826 break;
1827 case 16:
1828 ne10_fft16_backward_int32_unscaled_neon (fout, fin, cfg->twiddles);
1829 break;
1830 default:
1831 ne10_mixed_radix_fft_backward_int32_unscaled_neon (fout, fin, cfg->factors, cfg->twiddles, cfg->buffer);
1832 break;
1833 }
1834 }
1835 else
1836 {
1837 switch (cfg->nfft)
1838 {
1839 case 4:
1840 ne10_fft4_forward_int32_unscaled (fout, fin);
1841 break;
1842 case 8:
1843 ne10_fft8_forward_int32_unscaled (fout, fin);
1844 break;
1845 case 16:
1846 ne10_fft16_forward_int32_unscaled_neon (fout, fin, cfg->twiddles);
1847 break;
1848 default:
1849 ne10_mixed_radix_fft_forward_int32_unscaled_neon (fout, fin, cfg->factors, cfg->twiddles, cfg->buffer);
1850 break;
1851 }
1852 }
1853 }
1854}
1855
//end of C2C_FFT_IFFT group
1859
void ne10_fft_r2c_1d_int32_neon (ne10_fft_cpx_int32_t *fout,
1877 ne10_int32_t *fin,
                                 ne10_fft_r2c_cfg_int32_t cfg,
1879 ne10_int32_t scaled_flag)
1880{
1881 ne10_fft_cpx_int32_t * tmpbuf1 = cfg->buffer;
1882 ne10_fft_cpx_int32_t * tmpbuf2 = cfg->buffer + cfg->ncfft;
1883 ne10_fft_state_int32_t c2c_state;
1884
1885 c2c_state.nfft = cfg->ncfft;
1886 c2c_state.factors = cfg->factors;
1887 c2c_state.twiddles = cfg->twiddles;
1888 c2c_state.buffer = tmpbuf2;
1889
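    // Real-to-complex via packing: an ncfft-point complex FFT over the real input
    // viewed as interleaved (even, odd) pairs, then the split step with
    // cfg->super_twiddles unpacks it into the half-spectrum of the real FFT.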
1890 ne10_fft_c2c_1d_int32_neon (tmpbuf1, (ne10_fft_cpx_int32_t*) fin, &c2c_state, 0, scaled_flag);
1891 ne10_fft_split_r2c_1d_int32_neon (fout, tmpbuf1, cfg->super_twiddles, cfg->ncfft, scaled_flag);
1892}
1893
1905void ne10_fft_c2r_1d_int32_neon (ne10_int32_t *fout,
                                 ne10_fft_cpx_int32_t *fin,
                                 ne10_fft_r2c_cfg_int32_t cfg,
1908 ne10_int32_t scaled_flag)
1909{
1910 ne10_fft_cpx_int32_t * tmpbuf1 = cfg->buffer;
1911 ne10_fft_cpx_int32_t * tmpbuf2 = cfg->buffer + cfg->ncfft;
1912 ne10_fft_state_int32_t c2c_state;
1913
1914 c2c_state.nfft = cfg->ncfft;
1915 c2c_state.factors = cfg->factors;
1916 c2c_state.twiddles = cfg->twiddles;
1917 c2c_state.buffer = tmpbuf2;
1918
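    // Complex-to-real reverses those two steps: the split step repacks the
    // conjugate-symmetric half-spectrum, then an inverse complex FFT produces the
    // interleaved real output.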
1919 ne10_fft_split_c2r_1d_int32_neon (tmpbuf1, fin, cfg->super_twiddles, cfg->ncfft, scaled_flag);
1920 ne10_fft_c2c_1d_int32_neon ( (ne10_fft_cpx_int32_t*) fout, tmpbuf1, &c2c_state, 1, scaled_flag);
1921}
1922