Project Ne10
An Open Optimized Software Library Project for the ARM Architecture
NE10_fft_int32.neon.c
/*
 * Copyright 2013-15 ARM Limited and Contributors.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *   * Neither the name of ARM Limited nor the
 *     names of its contributors may be used to endorse or promote products
 *     derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL ARM LIMITED AND CONTRIBUTORS BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * NE10 Library : dsp/NE10_fft_int32.neon.c
 */

#include <arm_neon.h>

#include "NE10_types.h"
#include "NE10_macros.h"
#include "NE10_fft.h"
#include "NE10_dsp.h"

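/*
 * Editor's note: fixed-size kernels for the smallest transforms follow. Each
 * computes a single radix butterfly directly; the forward kernels rotate the
 * odd outputs by -j, the backward kernels by +j. The "unscaled" variants do
 * no overflow guarding, so callers must supply inputs with enough headroom.
 */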
static inline void ne10_fft4_forward_int32_unscaled (ne10_fft_cpx_int32_t * Fout,
                                                     ne10_fft_cpx_int32_t * Fin)
{
    ne10_int32_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i;
    ne10_int32_t tmp_r, tmp_i;

    s2_r = Fin[0].r - Fin[2].r;
    s2_i = Fin[0].i - Fin[2].i;

    tmp_r = Fin[0].r + Fin[2].r;
    tmp_i = Fin[0].i + Fin[2].i;

    s0_r = Fin[1].r + Fin[3].r;
    s0_i = Fin[1].i + Fin[3].i;

    s1_r = Fin[1].r - Fin[3].r;
    s1_i = Fin[1].i - Fin[3].i;
    Fout[2].r = tmp_r - s0_r;
    Fout[2].i = tmp_i - s0_i;
    Fout[0].r = tmp_r + s0_r;
    Fout[0].i = tmp_i + s0_i;

    Fout[1].r = s2_r + s1_i;
    Fout[1].i = s2_i - s1_r;
    Fout[3].r = s2_r - s1_i;
    Fout[3].i = s2_i + s1_r;
}

static inline void ne10_fft4_backward_int32_unscaled (ne10_fft_cpx_int32_t * Fout,
                                                      ne10_fft_cpx_int32_t * Fin)
{
    ne10_int32_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i;
    ne10_int32_t tmp_r, tmp_i;

    s2_r = Fin[0].r - Fin[2].r;
    s2_i = Fin[0].i - Fin[2].i;

    tmp_r = Fin[0].r + Fin[2].r;
    tmp_i = Fin[0].i + Fin[2].i;

    s0_r = Fin[1].r + Fin[3].r;
    s0_i = Fin[1].i + Fin[3].i;

    s1_r = Fin[1].r - Fin[3].r;
    s1_i = Fin[1].i - Fin[3].i;

    Fout[2].r = tmp_r - s0_r;
    Fout[2].i = tmp_i - s0_i;
    Fout[0].r = tmp_r + s0_r;
    Fout[0].i = tmp_i + s0_i;

    Fout[1].r = s2_r - s1_i;
    Fout[1].i = s2_i + s1_r;
    Fout[3].r = s2_r + s1_i;
    Fout[3].i = s2_i - s1_r;
}
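/*
 * Editor's note: the "scaled" variants below shift each input term right by
 * log2(N) bits (>> 2 for the 4-point kernels), folding a 1/N scaling into the
 * butterfly so the fixed-point intermediates keep headroom and cannot grow
 * past 32 bits.
 */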
static inline void ne10_fft4_forward_int32_scaled (ne10_fft_cpx_int32_t * Fout,
                                                   ne10_fft_cpx_int32_t * Fin)
{
    ne10_int32_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i;
    ne10_int32_t tmp_r, tmp_i;

    s2_r = (Fin[0].r - Fin[2].r) >> 2;
    s2_i = (Fin[0].i - Fin[2].i) >> 2;
    tmp_r = (Fin[0].r + Fin[2].r) >> 2;
    tmp_i = (Fin[0].i + Fin[2].i) >> 2;

    s0_r = (Fin[1].r + Fin[3].r) >> 2;
    s0_i = (Fin[1].i + Fin[3].i) >> 2;
    s1_r = (Fin[1].r - Fin[3].r) >> 2;
    s1_i = (Fin[1].i - Fin[3].i) >> 2;

    Fout[2].r = tmp_r - s0_r;
    Fout[2].i = tmp_i - s0_i;
    Fout[0].r = tmp_r + s0_r;
    Fout[0].i = tmp_i + s0_i;

    Fout[1].r = s2_r + s1_i;
    Fout[1].i = s2_i - s1_r;
    Fout[3].r = s2_r - s1_i;
    Fout[3].i = s2_i + s1_r;
}

static inline void ne10_fft4_backward_int32_scaled (ne10_fft_cpx_int32_t * Fout,
                                                    ne10_fft_cpx_int32_t * Fin)
{
    ne10_int32_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i;
    ne10_int32_t tmp_r, tmp_i;

    s2_r = (Fin[0].r - Fin[2].r) >> 2;
    s2_i = (Fin[0].i - Fin[2].i) >> 2;
    tmp_r = (Fin[0].r + Fin[2].r) >> 2;
    tmp_i = (Fin[0].i + Fin[2].i) >> 2;

    s0_r = (Fin[1].r + Fin[3].r) >> 2;
    s0_i = (Fin[1].i + Fin[3].i) >> 2;
    s1_r = (Fin[1].r - Fin[3].r) >> 2;
    s1_i = (Fin[1].i - Fin[3].i) >> 2;

    Fout[2].r = tmp_r - s0_r;
    Fout[2].i = tmp_i - s0_i;
    Fout[0].r = tmp_r + s0_r;
    Fout[0].i = tmp_i + s0_i;

    Fout[1].r = s2_r - s1_i;
    Fout[1].i = s2_i + s1_r;
    Fout[3].r = s2_r + s1_i;
    Fout[3].i = s2_i - s1_r;
}
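/*
 * Editor's note: the 8-point kernels need one non-trivial twiddle factor.
 * TW_81 = 1518500249 is cos(pi/4) = sqrt(2)/2 in Q31 format
 * (0.70710678... * 2^31). Twiddle products are formed in 64 bits and shifted
 * right by 31 to return to Q31.
 */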
static inline void ne10_fft8_forward_int32_unscaled (ne10_fft_cpx_int32_t * Fout,
                                                     ne10_fft_cpx_int32_t * Fin)
{
    ne10_int32_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i, s3_r, s3_i, s4_r, s4_i, s5_r, s5_i, s6_r, s6_i, s7_r, s7_i;
    ne10_int32_t t0_r, t0_i, t1_r, t1_i, t2_r, t2_i, t3_r, t3_i, t4_r, t4_i, t5_r, t5_i;
    const ne10_int32_t TW_81 = 1518500249;

    s0_r = Fin[0].r + Fin[4].r;
    s0_i = Fin[0].i + Fin[4].i;
    s1_r = Fin[0].r - Fin[4].r;
    s1_i = Fin[0].i - Fin[4].i;
    s2_r = Fin[1].r + Fin[5].r;
    s2_i = Fin[1].i + Fin[5].i;
    s3_r = Fin[1].r - Fin[5].r;
    s3_i = Fin[1].i - Fin[5].i;
    s4_r = Fin[2].r + Fin[6].r;
    s4_i = Fin[2].i + Fin[6].i;
    s5_r = Fin[2].r - Fin[6].r;
    s5_i = Fin[2].i - Fin[6].i;
    s6_r = Fin[3].r + Fin[7].r;
    s6_i = Fin[3].i + Fin[7].i;
    s7_r = Fin[3].r - Fin[7].r;
    s7_i = Fin[3].i - Fin[7].i;

    t0_r = s0_r - s4_r;
    t0_i = s0_i - s4_i;
    t1_r = s0_r + s4_r;
    t1_i = s0_i + s4_i;
    t2_r = s2_r + s6_r;
    t2_i = s2_i + s6_i;
    t3_r = s2_r - s6_r;
    t3_i = s2_i - s6_i;
    Fout[0].r = t1_r + t2_r;
    Fout[0].i = t1_i + t2_i;
    Fout[4].r = t1_r - t2_r;
    Fout[4].i = t1_i - t2_i;
    Fout[2].r = t0_r + t3_i;
    Fout[2].i = t0_i - t3_r;
    Fout[6].r = t0_r - t3_i;
    Fout[6].i = t0_i + t3_r;

    t4_r = (ne10_int32_t) ( ( (ne10_int64_t) (s3_r + s3_i) * TW_81) >> 31);
    t4_i = - (ne10_int32_t) ( ( (ne10_int64_t) (s3_r - s3_i) * TW_81) >> 31);
    t5_r = (ne10_int32_t) ( ( (ne10_int64_t) (s7_r - s7_i) * TW_81) >> 31);
    t5_i = (ne10_int32_t) ( ( (ne10_int64_t) (s7_r + s7_i) * TW_81) >> 31);

    t0_r = s1_r - s5_i;
    t0_i = s1_i + s5_r;
    t1_r = s1_r + s5_i;
    t1_i = s1_i - s5_r;
    t2_r = t4_r - t5_r;
    t2_i = t4_i - t5_i;
    t3_r = t4_r + t5_r;
    t3_i = t4_i + t5_i;
    Fout[1].r = t1_r + t2_r;
    Fout[1].i = t1_i + t2_i;
    Fout[5].r = t1_r - t2_r;
    Fout[5].i = t1_i - t2_i;
    Fout[3].r = t0_r + t3_i;
    Fout[3].i = t0_i - t3_r;
    Fout[7].r = t0_r - t3_i;
    Fout[7].i = t0_i + t3_r;
}

static inline void ne10_fft8_backward_int32_unscaled (ne10_fft_cpx_int32_t * Fout,
                                                      ne10_fft_cpx_int32_t * Fin)
{
    ne10_int32_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i, s3_r, s3_i, s4_r, s4_i, s5_r, s5_i, s6_r, s6_i, s7_r, s7_i;
    ne10_int32_t t0_r, t0_i, t1_r, t1_i, t2_r, t2_i, t3_r, t3_i, t4_r, t4_i, t5_r, t5_i;
    const ne10_int32_t TW_81 = 1518500249;

    s0_r = Fin[0].r + Fin[4].r;
    s0_i = Fin[0].i + Fin[4].i;
    s1_r = Fin[0].r - Fin[4].r;
    s1_i = Fin[0].i - Fin[4].i;
    s2_r = Fin[1].r + Fin[5].r;
    s2_i = Fin[1].i + Fin[5].i;
    s3_r = Fin[1].r - Fin[5].r;
    s3_i = Fin[1].i - Fin[5].i;
    s4_r = Fin[2].r + Fin[6].r;
    s4_i = Fin[2].i + Fin[6].i;
    s5_r = Fin[2].r - Fin[6].r;
    s5_i = Fin[2].i - Fin[6].i;
    s6_r = Fin[3].r + Fin[7].r;
    s6_i = Fin[3].i + Fin[7].i;
    s7_r = Fin[3].r - Fin[7].r;
    s7_i = Fin[3].i - Fin[7].i;

    t0_r = s0_r - s4_r;
    t0_i = s0_i - s4_i;
    t1_r = s0_r + s4_r;
    t1_i = s0_i + s4_i;
    t2_r = s2_r + s6_r;
    t2_i = s2_i + s6_i;
    t3_r = s2_r - s6_r;
    t3_i = s2_i - s6_i;
    Fout[0].r = t1_r + t2_r;
    Fout[0].i = t1_i + t2_i;
    Fout[4].r = t1_r - t2_r;
    Fout[4].i = t1_i - t2_i;
    Fout[2].r = t0_r - t3_i;
    Fout[2].i = t0_i + t3_r;
    Fout[6].r = t0_r + t3_i;
    Fout[6].i = t0_i - t3_r;

    t4_r = (ne10_int32_t) ( ( (ne10_int64_t) (s3_r - s3_i) * TW_81) >> 31);
    t4_i = (ne10_int32_t) ( ( (ne10_int64_t) (s3_r + s3_i) * TW_81) >> 31);
    t5_r = (ne10_int32_t) ( ( (ne10_int64_t) (s7_r + s7_i) * TW_81) >> 31);
    t5_i = - (ne10_int32_t) ( ( (ne10_int64_t) (s7_r - s7_i) * TW_81) >> 31);

    t0_r = s1_r + s5_i;
    t0_i = s1_i - s5_r;
    t1_r = s1_r - s5_i;
    t1_i = s1_i + s5_r;
    t2_r = t4_r - t5_r;
    t2_i = t4_i - t5_i;
    t3_r = t4_r + t5_r;
    t3_i = t4_i + t5_i;
    Fout[1].r = t1_r + t2_r;
    Fout[1].i = t1_i + t2_i;
    Fout[5].r = t1_r - t2_r;
    Fout[5].i = t1_i - t2_i;
    Fout[3].r = t0_r - t3_i;
    Fout[3].i = t0_i + t3_r;
    Fout[7].r = t0_r + t3_i;
    Fout[7].i = t0_i - t3_r;
}
static inline void ne10_fft8_forward_int32_scaled (ne10_fft_cpx_int32_t * Fout,
                                                   ne10_fft_cpx_int32_t * Fin)
{
    ne10_int32_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i, s3_r, s3_i, s4_r, s4_i, s5_r, s5_i, s6_r, s6_i, s7_r, s7_i;
    ne10_int32_t t0_r, t0_i, t1_r, t1_i, t2_r, t2_i, t3_r, t3_i, t4_r, t4_i, t5_r, t5_i;
    const ne10_int32_t TW_81 = 1518500249;

    s0_r = (Fin[0].r + Fin[4].r) >> 3;
    s0_i = (Fin[0].i + Fin[4].i) >> 3;
    s1_r = (Fin[0].r - Fin[4].r) >> 3;
    s1_i = (Fin[0].i - Fin[4].i) >> 3;
    s2_r = (Fin[1].r + Fin[5].r) >> 3;
    s2_i = (Fin[1].i + Fin[5].i) >> 3;
    s3_r = (Fin[1].r - Fin[5].r) >> 3;
    s3_i = (Fin[1].i - Fin[5].i) >> 3;
    s4_r = (Fin[2].r + Fin[6].r) >> 3;
    s4_i = (Fin[2].i + Fin[6].i) >> 3;
    s5_r = (Fin[2].r - Fin[6].r) >> 3;
    s5_i = (Fin[2].i - Fin[6].i) >> 3;
    s6_r = (Fin[3].r + Fin[7].r) >> 3;
    s6_i = (Fin[3].i + Fin[7].i) >> 3;
    s7_r = (Fin[3].r - Fin[7].r) >> 3;
    s7_i = (Fin[3].i - Fin[7].i) >> 3;

    t0_r = s0_r - s4_r;
    t0_i = s0_i - s4_i;
    t1_r = s0_r + s4_r;
    t1_i = s0_i + s4_i;
    t2_r = s2_r + s6_r;
    t2_i = s2_i + s6_i;
    t3_r = s2_r - s6_r;
    t3_i = s2_i - s6_i;
    Fout[0].r = t1_r + t2_r;
    Fout[0].i = t1_i + t2_i;
    Fout[4].r = t1_r - t2_r;
    Fout[4].i = t1_i - t2_i;
    Fout[2].r = t0_r + t3_i;
    Fout[2].i = t0_i - t3_r;
    Fout[6].r = t0_r - t3_i;
    Fout[6].i = t0_i + t3_r;

    t4_r = (ne10_int32_t) ( ( (ne10_int64_t) (s3_r + s3_i) * TW_81) >> 31);
    t4_i = - (ne10_int32_t) ( ( (ne10_int64_t) (s3_r - s3_i) * TW_81) >> 31);
    t5_r = (ne10_int32_t) ( ( (ne10_int64_t) (s7_r - s7_i) * TW_81) >> 31);
    t5_i = (ne10_int32_t) ( ( (ne10_int64_t) (s7_r + s7_i) * TW_81) >> 31);

    t0_r = s1_r - s5_i;
    t0_i = s1_i + s5_r;
    t1_r = s1_r + s5_i;
    t1_i = s1_i - s5_r;
    t2_r = t4_r - t5_r;
    t2_i = t4_i - t5_i;
    t3_r = t4_r + t5_r;
    t3_i = t4_i + t5_i;
    Fout[1].r = t1_r + t2_r;
    Fout[1].i = t1_i + t2_i;
    Fout[5].r = t1_r - t2_r;
    Fout[5].i = t1_i - t2_i;
    Fout[3].r = t0_r + t3_i;
    Fout[3].i = t0_i - t3_r;
    Fout[7].r = t0_r - t3_i;
    Fout[7].i = t0_i + t3_r;
}

static inline void ne10_fft8_backward_int32_scaled (ne10_fft_cpx_int32_t * Fout,
                                                    ne10_fft_cpx_int32_t * Fin)
{
    ne10_int32_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i, s3_r, s3_i, s4_r, s4_i, s5_r, s5_i, s6_r, s6_i, s7_r, s7_i;
    ne10_int32_t t0_r, t0_i, t1_r, t1_i, t2_r, t2_i, t3_r, t3_i, t4_r, t4_i, t5_r, t5_i;
    const ne10_int32_t TW_81 = 1518500249;

    s0_r = (Fin[0].r + Fin[4].r) >> 3;
    s0_i = (Fin[0].i + Fin[4].i) >> 3;
    s1_r = (Fin[0].r - Fin[4].r) >> 3;
    s1_i = (Fin[0].i - Fin[4].i) >> 3;
    s2_r = (Fin[1].r + Fin[5].r) >> 3;
    s2_i = (Fin[1].i + Fin[5].i) >> 3;
    s3_r = (Fin[1].r - Fin[5].r) >> 3;
    s3_i = (Fin[1].i - Fin[5].i) >> 3;
    s4_r = (Fin[2].r + Fin[6].r) >> 3;
    s4_i = (Fin[2].i + Fin[6].i) >> 3;
    s5_r = (Fin[2].r - Fin[6].r) >> 3;
    s5_i = (Fin[2].i - Fin[6].i) >> 3;
    s6_r = (Fin[3].r + Fin[7].r) >> 3;
    s6_i = (Fin[3].i + Fin[7].i) >> 3;
    s7_r = (Fin[3].r - Fin[7].r) >> 3;
    s7_i = (Fin[3].i - Fin[7].i) >> 3;

    t0_r = s0_r - s4_r;
    t0_i = s0_i - s4_i;
    t1_r = s0_r + s4_r;
    t1_i = s0_i + s4_i;
    t2_r = s2_r + s6_r;
    t2_i = s2_i + s6_i;
    t3_r = s2_r - s6_r;
    t3_i = s2_i - s6_i;
    Fout[0].r = t1_r + t2_r;
    Fout[0].i = t1_i + t2_i;
    Fout[4].r = t1_r - t2_r;
    Fout[4].i = t1_i - t2_i;
    Fout[2].r = t0_r - t3_i;
    Fout[2].i = t0_i + t3_r;
    Fout[6].r = t0_r + t3_i;
    Fout[6].i = t0_i - t3_r;

    t4_r = (ne10_int32_t) ( ( (ne10_int64_t) (s3_r - s3_i) * TW_81) >> 31);
    t4_i = (ne10_int32_t) ( ( (ne10_int64_t) (s3_r + s3_i) * TW_81) >> 31);
    t5_r = (ne10_int32_t) ( ( (ne10_int64_t) (s7_r + s7_i) * TW_81) >> 31);
    t5_i = - (ne10_int32_t) ( ( (ne10_int64_t) (s7_r - s7_i) * TW_81) >> 31);

    t0_r = s1_r + s5_i;
    t0_i = s1_i - s5_r;
    t1_r = s1_r - s5_i;
    t1_i = s1_i + s5_r;
    t2_r = t4_r - t5_r;
    t2_i = t4_i - t5_i;
    t3_r = t4_r + t5_r;
    t3_i = t4_i + t5_i;
    Fout[1].r = t1_r + t2_r;
    Fout[1].i = t1_i + t2_i;
    Fout[5].r = t1_r - t2_r;
    Fout[5].i = t1_i - t2_i;
    Fout[3].r = t0_r - t3_i;
    Fout[3].i = t0_i + t3_r;
    Fout[7].r = t0_r + t3_i;
    Fout[7].i = t0_i - t3_r;
}

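/*
 * Editor's note: the 16-point NEON kernels run the transform as two radix-4
 * stages. vld2q_s32 de-interleaves complex data into separate real/imaginary
 * vectors, a vzipq_s32/vcombine_s32 sequence transposes the 4x4 block of
 * stage-1 results between the stages, and vqrdmulhq_s32 (saturating rounding
 * doubling multiply returning the high half) performs the Q31 twiddle
 * multiplications.
 */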
static void ne10_fft16_forward_int32_unscaled_neon (ne10_fft_cpx_int32_t * Fout,
                                                    ne10_fft_cpx_int32_t * Fin,
                                                    ne10_fft_cpx_int32_t * twiddles)
{
    ne10_fft_cpx_int32_t *tw1, *tw2, *tw3;

    // the first stage
    int32_t *p_src0, *p_src4, *p_src8, *p_src12;
    int32x4x2_t q2_in_0123, q2_in_4567, q2_in_89ab, q2_in_cdef;
    int32x4_t q_t0_r, q_t0_i, q_t1_r, q_t1_i, q_t2_r, q_t2_i, q_t3_r, q_t3_i;
    int32x4_t q_out_r048c, q_out_i048c, q_out_r159d, q_out_i159d;
    int32x4_t q_out_r26ae, q_out_i26ae, q_out_r37bf, q_out_i37bf;
    p_src0 = (int32_t*) (& (Fin[0]));
    p_src4 = (int32_t*) (& (Fin[4]));
    p_src8 = (int32_t*) (& (Fin[8]));
    p_src12 = (int32_t*) (& (Fin[12]));
    q2_in_0123 = vld2q_s32 (p_src0);
    q2_in_4567 = vld2q_s32 (p_src4);
    q2_in_89ab = vld2q_s32 (p_src8);
    q2_in_cdef = vld2q_s32 (p_src12);

    q_t2_r = vsubq_s32 (q2_in_0123.val[0], q2_in_89ab.val[0]);
    q_t2_i = vsubq_s32 (q2_in_0123.val[1], q2_in_89ab.val[1]);
    q_t3_r = vaddq_s32 (q2_in_0123.val[0], q2_in_89ab.val[0]);
    q_t3_i = vaddq_s32 (q2_in_0123.val[1], q2_in_89ab.val[1]);

    q_t0_r = vaddq_s32 (q2_in_4567.val[0], q2_in_cdef.val[0]);
    q_t0_i = vaddq_s32 (q2_in_4567.val[1], q2_in_cdef.val[1]);
    q_t1_r = vsubq_s32 (q2_in_4567.val[0], q2_in_cdef.val[0]);
    q_t1_i = vsubq_s32 (q2_in_4567.val[1], q2_in_cdef.val[1]);

    q_out_r26ae = vsubq_s32 (q_t3_r, q_t0_r);
    q_out_i26ae = vsubq_s32 (q_t3_i, q_t0_i);
    q_out_r048c = vaddq_s32 (q_t3_r, q_t0_r);
    q_out_i048c = vaddq_s32 (q_t3_i, q_t0_i);
    q_out_r159d = vaddq_s32 (q_t2_r, q_t1_i);
    q_out_i159d = vsubq_s32 (q_t2_i, q_t1_r);
    q_out_r37bf = vsubq_s32 (q_t2_r, q_t1_i);
    q_out_i37bf = vaddq_s32 (q_t2_i, q_t1_r);

    // the second stage
    int32_t *p_dst0, *p_dst1, *p_dst2, *p_dst3;
    int32_t *p_tw1, *p_tw2, *p_tw3;
    int32x4_t q_s0_r, q_s0_i, q_s1_r, q_s1_i, q_s2_r, q_s2_i;
    int32x4_t q_s3_r, q_s3_i, q_s4_r, q_s4_i, q_s5_r, q_s5_i;
    int32x4x2_t q2_tmp_0, q2_tmp_1, q2_tmp_2, q2_tmp_3;
    int32x4_t q_in_r0123, q_in_r4567, q_in_r89ab, q_in_rcdef;
    int32x4_t q_in_i0123, q_in_i4567, q_in_i89ab, q_in_icdef;
    int32x4x2_t q2_tw1, q2_tw2, q2_tw3;
    int32x4_t q_tmp0, q_tmp1, q_tmp2, q_tmp3, q_tmp4, q_tmp5;
    int32x4x2_t q2_out_0123, q2_out_4567, q2_out_89ab, q2_out_cdef;
    tw1 = twiddles;
    tw2 = twiddles + 4;
    tw3 = twiddles + 8;
    p_dst0 = (int32_t*) (&Fout[0]);
    p_dst1 = (int32_t*) (&Fout[4]);
    p_dst2 = (int32_t*) (&Fout[8]);
    p_dst3 = (int32_t*) (&Fout[12]);
    p_tw1 = (int32_t*) tw1;
    p_tw2 = (int32_t*) tw2;
    p_tw3 = (int32_t*) tw3;
    q2_tmp_0 = vzipq_s32 (q_out_r048c, q_out_r159d);
    q2_tmp_1 = vzipq_s32 (q_out_i048c, q_out_i159d);
    q2_tmp_2 = vzipq_s32 (q_out_r26ae, q_out_r37bf);
    q2_tmp_3 = vzipq_s32 (q_out_i26ae, q_out_i37bf);
    q_in_r0123 = vcombine_s32 (vget_low_s32 (q2_tmp_0.val[0]), vget_low_s32 (q2_tmp_2.val[0]));
    q_in_i0123 = vcombine_s32 (vget_low_s32 (q2_tmp_1.val[0]), vget_low_s32 (q2_tmp_3.val[0]));
    q_in_r4567 = vcombine_s32 (vget_high_s32 (q2_tmp_0.val[0]), vget_high_s32 (q2_tmp_2.val[0]));
    q_in_i4567 = vcombine_s32 (vget_high_s32 (q2_tmp_1.val[0]), vget_high_s32 (q2_tmp_3.val[0]));
    q_in_r89ab = vcombine_s32 (vget_low_s32 (q2_tmp_0.val[1]), vget_low_s32 (q2_tmp_2.val[1]));
    q_in_i89ab = vcombine_s32 (vget_low_s32 (q2_tmp_1.val[1]), vget_low_s32 (q2_tmp_3.val[1]));
    q_in_rcdef = vcombine_s32 (vget_high_s32 (q2_tmp_0.val[1]), vget_high_s32 (q2_tmp_2.val[1]));
    q_in_icdef = vcombine_s32 (vget_high_s32 (q2_tmp_1.val[1]), vget_high_s32 (q2_tmp_3.val[1]));
    q2_tw1 = vld2q_s32 (p_tw1);
    q2_tw2 = vld2q_s32 (p_tw2);
    q2_tw3 = vld2q_s32 (p_tw3);

    q_s0_r = vqrdmulhq_s32 (q_in_r4567, q2_tw1.val[0]);
    q_s0_i = vqrdmulhq_s32 (q_in_r4567, q2_tw1.val[1]);
    q_s1_r = vqrdmulhq_s32 (q_in_r89ab, q2_tw2.val[0]);
    q_s1_i = vqrdmulhq_s32 (q_in_r89ab, q2_tw2.val[1]);
    q_s2_r = vqrdmulhq_s32 (q_in_rcdef, q2_tw3.val[0]);
    q_s2_i = vqrdmulhq_s32 (q_in_rcdef, q2_tw3.val[1]);
    q_tmp0 = vqrdmulhq_s32 (q_in_i4567, q2_tw1.val[1]);
    q_tmp1 = vqrdmulhq_s32 (q_in_i4567, q2_tw1.val[0]);
    q_tmp2 = vqrdmulhq_s32 (q_in_i89ab, q2_tw2.val[1]);
    q_tmp3 = vqrdmulhq_s32 (q_in_i89ab, q2_tw2.val[0]);
    q_tmp4 = vqrdmulhq_s32 (q_in_icdef, q2_tw3.val[1]);
    q_tmp5 = vqrdmulhq_s32 (q_in_icdef, q2_tw3.val[0]);
    q_s0_r = vsubq_s32 (q_s0_r, q_tmp0);
    q_s0_i = vaddq_s32 (q_s0_i, q_tmp1);
    q_s1_r = vsubq_s32 (q_s1_r, q_tmp2);
    q_s1_i = vaddq_s32 (q_s1_i, q_tmp3);
    q_s2_r = vsubq_s32 (q_s2_r, q_tmp4);
    q_s2_i = vaddq_s32 (q_s2_i, q_tmp5);

    q_s5_r = vsubq_s32 (q_in_r0123, q_s1_r);
    q_s5_i = vsubq_s32 (q_in_i0123, q_s1_i);
    q2_out_0123.val[0] = vaddq_s32 (q_in_r0123, q_s1_r);
    q2_out_0123.val[1] = vaddq_s32 (q_in_i0123, q_s1_i);

    q_s3_r = vaddq_s32 (q_s0_r, q_s2_r);
    q_s3_i = vaddq_s32 (q_s0_i, q_s2_i);
    q_s4_r = vsubq_s32 (q_s0_r, q_s2_r);
    q_s4_i = vsubq_s32 (q_s0_i, q_s2_i);

    q2_out_89ab.val[0] = vsubq_s32 (q2_out_0123.val[0], q_s3_r);
    q2_out_89ab.val[1] = vsubq_s32 (q2_out_0123.val[1], q_s3_i);
    q2_out_0123.val[0] = vaddq_s32 (q2_out_0123.val[0], q_s3_r);
    q2_out_0123.val[1] = vaddq_s32 (q2_out_0123.val[1], q_s3_i);

    q2_out_4567.val[0] = vaddq_s32 (q_s5_r, q_s4_i);
    q2_out_4567.val[1] = vsubq_s32 (q_s5_i, q_s4_r);
    q2_out_cdef.val[0] = vsubq_s32 (q_s5_r, q_s4_i);
    q2_out_cdef.val[1] = vaddq_s32 (q_s5_i, q_s4_r);

    vst2q_s32 (p_dst0, q2_out_0123);
    vst2q_s32 (p_dst1, q2_out_4567);
    vst2q_s32 (p_dst2, q2_out_89ab);
    vst2q_s32 (p_dst3, q2_out_cdef);
}

static void ne10_fft16_backward_int32_unscaled_neon (ne10_fft_cpx_int32_t * Fout,
                                                     ne10_fft_cpx_int32_t * Fin,
                                                     ne10_fft_cpx_int32_t * twiddles)
{
    ne10_fft_cpx_int32_t *tw1, *tw2, *tw3;

    // the first stage
    int32_t *p_src0, *p_src4, *p_src8, *p_src12;
    int32x4x2_t q2_in_0123, q2_in_4567, q2_in_89ab, q2_in_cdef;
    int32x4_t q_t0_r, q_t0_i, q_t1_r, q_t1_i, q_t2_r, q_t2_i, q_t3_r, q_t3_i;
    int32x4_t q_out_r048c, q_out_i048c, q_out_r159d, q_out_i159d;
    int32x4_t q_out_r26ae, q_out_i26ae, q_out_r37bf, q_out_i37bf;
    p_src0 = (int32_t*) (& (Fin[0]));
    p_src4 = (int32_t*) (& (Fin[4]));
    p_src8 = (int32_t*) (& (Fin[8]));
    p_src12 = (int32_t*) (& (Fin[12]));
    q2_in_0123 = vld2q_s32 (p_src0);
    q2_in_4567 = vld2q_s32 (p_src4);
    q2_in_89ab = vld2q_s32 (p_src8);
    q2_in_cdef = vld2q_s32 (p_src12);

    q_t2_r = vsubq_s32 (q2_in_0123.val[0], q2_in_89ab.val[0]);
    q_t2_i = vsubq_s32 (q2_in_0123.val[1], q2_in_89ab.val[1]);
    q_t3_r = vaddq_s32 (q2_in_0123.val[0], q2_in_89ab.val[0]);
    q_t3_i = vaddq_s32 (q2_in_0123.val[1], q2_in_89ab.val[1]);

    q_t0_r = vaddq_s32 (q2_in_4567.val[0], q2_in_cdef.val[0]);
    q_t0_i = vaddq_s32 (q2_in_4567.val[1], q2_in_cdef.val[1]);
    q_t1_r = vsubq_s32 (q2_in_4567.val[0], q2_in_cdef.val[0]);
    q_t1_i = vsubq_s32 (q2_in_4567.val[1], q2_in_cdef.val[1]);

    q_out_r26ae = vsubq_s32 (q_t3_r, q_t0_r);
    q_out_i26ae = vsubq_s32 (q_t3_i, q_t0_i);
    q_out_r048c = vaddq_s32 (q_t3_r, q_t0_r);
    q_out_i048c = vaddq_s32 (q_t3_i, q_t0_i);
    q_out_r159d = vsubq_s32 (q_t2_r, q_t1_i);
    q_out_i159d = vaddq_s32 (q_t2_i, q_t1_r);
    q_out_r37bf = vaddq_s32 (q_t2_r, q_t1_i);
    q_out_i37bf = vsubq_s32 (q_t2_i, q_t1_r);

    // the second stage
    int32_t *p_dst0, *p_dst1, *p_dst2, *p_dst3;
    int32_t *p_tw1, *p_tw2, *p_tw3;
    int32x4_t q_s0_r, q_s0_i, q_s1_r, q_s1_i, q_s2_r, q_s2_i;
    int32x4_t q_s3_r, q_s3_i, q_s4_r, q_s4_i, q_s5_r, q_s5_i;
    int32x4x2_t q2_tmp_0, q2_tmp_1, q2_tmp_2, q2_tmp_3;
    int32x4_t q_in_r0123, q_in_r4567, q_in_r89ab, q_in_rcdef;
    int32x4_t q_in_i0123, q_in_i4567, q_in_i89ab, q_in_icdef;
    int32x4x2_t q2_tw1, q2_tw2, q2_tw3;
    int32x4_t q_tmp0, q_tmp1, q_tmp2, q_tmp3, q_tmp4, q_tmp5;
    int32x4x2_t q2_out_0123, q2_out_4567, q2_out_89ab, q2_out_cdef;
    tw1 = twiddles;
    tw2 = twiddles + 4;
    tw3 = twiddles + 8;
    p_dst0 = (int32_t*) (&Fout[0]);
    p_dst1 = (int32_t*) (&Fout[4]);
    p_dst2 = (int32_t*) (&Fout[8]);
    p_dst3 = (int32_t*) (&Fout[12]);
    p_tw1 = (int32_t*) tw1;
    p_tw2 = (int32_t*) tw2;
    p_tw3 = (int32_t*) tw3;
    q2_tmp_0 = vzipq_s32 (q_out_r048c, q_out_r159d);
    q2_tmp_1 = vzipq_s32 (q_out_i048c, q_out_i159d);
    q2_tmp_2 = vzipq_s32 (q_out_r26ae, q_out_r37bf);
    q2_tmp_3 = vzipq_s32 (q_out_i26ae, q_out_i37bf);
    q_in_r0123 = vcombine_s32 (vget_low_s32 (q2_tmp_0.val[0]), vget_low_s32 (q2_tmp_2.val[0]));
    q_in_i0123 = vcombine_s32 (vget_low_s32 (q2_tmp_1.val[0]), vget_low_s32 (q2_tmp_3.val[0]));
    q_in_r4567 = vcombine_s32 (vget_high_s32 (q2_tmp_0.val[0]), vget_high_s32 (q2_tmp_2.val[0]));
    q_in_i4567 = vcombine_s32 (vget_high_s32 (q2_tmp_1.val[0]), vget_high_s32 (q2_tmp_3.val[0]));
    q_in_r89ab = vcombine_s32 (vget_low_s32 (q2_tmp_0.val[1]), vget_low_s32 (q2_tmp_2.val[1]));
    q_in_i89ab = vcombine_s32 (vget_low_s32 (q2_tmp_1.val[1]), vget_low_s32 (q2_tmp_3.val[1]));
    q_in_rcdef = vcombine_s32 (vget_high_s32 (q2_tmp_0.val[1]), vget_high_s32 (q2_tmp_2.val[1]));
    q_in_icdef = vcombine_s32 (vget_high_s32 (q2_tmp_1.val[1]), vget_high_s32 (q2_tmp_3.val[1]));
    q2_tw1 = vld2q_s32 (p_tw1);
    q2_tw2 = vld2q_s32 (p_tw2);
    q2_tw3 = vld2q_s32 (p_tw3);

    q_s0_r = vqrdmulhq_s32 (q_in_r4567, q2_tw1.val[0]);
    q_s0_i = vqrdmulhq_s32 (q_in_i4567, q2_tw1.val[0]);
    q_s1_r = vqrdmulhq_s32 (q_in_r89ab, q2_tw2.val[0]);
    q_s1_i = vqrdmulhq_s32 (q_in_i89ab, q2_tw2.val[0]);
    q_s2_r = vqrdmulhq_s32 (q_in_rcdef, q2_tw3.val[0]);
    q_s2_i = vqrdmulhq_s32 (q_in_icdef, q2_tw3.val[0]);
    q_tmp0 = vqrdmulhq_s32 (q_in_i4567, q2_tw1.val[1]);
    q_tmp1 = vqrdmulhq_s32 (q_in_r4567, q2_tw1.val[1]);
    q_tmp2 = vqrdmulhq_s32 (q_in_i89ab, q2_tw2.val[1]);
    q_tmp3 = vqrdmulhq_s32 (q_in_r89ab, q2_tw2.val[1]);
    q_tmp4 = vqrdmulhq_s32 (q_in_icdef, q2_tw3.val[1]);
    q_tmp5 = vqrdmulhq_s32 (q_in_rcdef, q2_tw3.val[1]);
    q_s0_r = vaddq_s32 (q_s0_r, q_tmp0);
    q_s0_i = vsubq_s32 (q_s0_i, q_tmp1);
    q_s1_r = vaddq_s32 (q_s1_r, q_tmp2);
    q_s1_i = vsubq_s32 (q_s1_i, q_tmp3);
    q_s2_r = vaddq_s32 (q_s2_r, q_tmp4);
    q_s2_i = vsubq_s32 (q_s2_i, q_tmp5);

    q_s5_r = vsubq_s32 (q_in_r0123, q_s1_r);
    q_s5_i = vsubq_s32 (q_in_i0123, q_s1_i);
    q2_out_0123.val[0] = vaddq_s32 (q_in_r0123, q_s1_r);
    q2_out_0123.val[1] = vaddq_s32 (q_in_i0123, q_s1_i);

    q_s3_r = vaddq_s32 (q_s0_r, q_s2_r);
    q_s3_i = vaddq_s32 (q_s0_i, q_s2_i);
    q_s4_r = vsubq_s32 (q_s0_r, q_s2_r);
    q_s4_i = vsubq_s32 (q_s0_i, q_s2_i);

    q2_out_89ab.val[0] = vsubq_s32 (q2_out_0123.val[0], q_s3_r);
    q2_out_89ab.val[1] = vsubq_s32 (q2_out_0123.val[1], q_s3_i);
    q2_out_0123.val[0] = vaddq_s32 (q2_out_0123.val[0], q_s3_r);
    q2_out_0123.val[1] = vaddq_s32 (q2_out_0123.val[1], q_s3_i);

    q2_out_4567.val[0] = vsubq_s32 (q_s5_r, q_s4_i);
    q2_out_4567.val[1] = vaddq_s32 (q_s5_i, q_s4_r);
    q2_out_cdef.val[0] = vaddq_s32 (q_s5_r, q_s4_i);
    q2_out_cdef.val[1] = vsubq_s32 (q_s5_i, q_s4_r);

    vst2q_s32 (p_dst0, q2_out_0123);
    vst2q_s32 (p_dst1, q2_out_4567);
    vst2q_s32 (p_dst2, q2_out_89ab);
    vst2q_s32 (p_dst3, q2_out_cdef);
}

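/*
 * Editor's note: the scaled NEON variants use the halving operations
 * vhaddq_s32/vhsubq_s32 for the butterfly add/sub levels, so each level
 * divides by 2 and the complete 16-point transform comes out scaled by 1/16
 * with no extra shift instructions.
 */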
static void ne10_fft16_forward_int32_scaled_neon (ne10_fft_cpx_int32_t * Fout,
                                                  ne10_fft_cpx_int32_t * Fin,
                                                  ne10_fft_cpx_int32_t * twiddles)
{
    ne10_fft_cpx_int32_t *tw1, *tw2, *tw3;

    // the first stage
    int32_t *p_src0, *p_src4, *p_src8, *p_src12;
    int32x4x2_t q2_in_0123, q2_in_4567, q2_in_89ab, q2_in_cdef;
    int32x4_t q_t0_r, q_t0_i, q_t1_r, q_t1_i, q_t2_r, q_t2_i, q_t3_r, q_t3_i;
    int32x4_t q_out_r048c, q_out_i048c, q_out_r159d, q_out_i159d;
    int32x4_t q_out_r26ae, q_out_i26ae, q_out_r37bf, q_out_i37bf;
    p_src0 = (int32_t*) (& (Fin[0]));
    p_src4 = (int32_t*) (& (Fin[4]));
    p_src8 = (int32_t*) (& (Fin[8]));
    p_src12 = (int32_t*) (& (Fin[12]));
    q2_in_0123 = vld2q_s32 (p_src0);
    q2_in_4567 = vld2q_s32 (p_src4);
    q2_in_89ab = vld2q_s32 (p_src8);
    q2_in_cdef = vld2q_s32 (p_src12);

    q_t2_r = vhsubq_s32 (q2_in_0123.val[0], q2_in_89ab.val[0]);
    q_t2_i = vhsubq_s32 (q2_in_0123.val[1], q2_in_89ab.val[1]);
    q_t3_r = vhaddq_s32 (q2_in_0123.val[0], q2_in_89ab.val[0]);
    q_t3_i = vhaddq_s32 (q2_in_0123.val[1], q2_in_89ab.val[1]);

    q_t0_r = vhaddq_s32 (q2_in_4567.val[0], q2_in_cdef.val[0]);
    q_t0_i = vhaddq_s32 (q2_in_4567.val[1], q2_in_cdef.val[1]);
    q_t1_r = vhsubq_s32 (q2_in_4567.val[0], q2_in_cdef.val[0]);
    q_t1_i = vhsubq_s32 (q2_in_4567.val[1], q2_in_cdef.val[1]);

    q_out_r26ae = vhsubq_s32 (q_t3_r, q_t0_r);
    q_out_i26ae = vhsubq_s32 (q_t3_i, q_t0_i);
    q_out_r048c = vhaddq_s32 (q_t3_r, q_t0_r);
    q_out_i048c = vhaddq_s32 (q_t3_i, q_t0_i);
    q_out_r159d = vhaddq_s32 (q_t2_r, q_t1_i);
    q_out_i159d = vhsubq_s32 (q_t2_i, q_t1_r);
    q_out_r37bf = vhsubq_s32 (q_t2_r, q_t1_i);
    q_out_i37bf = vhaddq_s32 (q_t2_i, q_t1_r);


    // the second stage
    int32_t *p_dst0, *p_dst1, *p_dst2, *p_dst3;
    int32_t *p_tw1, *p_tw2, *p_tw3;
    int32x4_t q_s0_r, q_s0_i, q_s1_r, q_s1_i, q_s2_r, q_s2_i;
    int32x4_t q_s3_r, q_s3_i, q_s4_r, q_s4_i, q_s5_r, q_s5_i;
    int32x4x2_t q2_tmp_0, q2_tmp_1, q2_tmp_2, q2_tmp_3;
    int32x4_t q_in_r0123, q_in_r4567, q_in_r89ab, q_in_rcdef;
    int32x4_t q_in_i0123, q_in_i4567, q_in_i89ab, q_in_icdef;
    int32x4x2_t q2_tw1, q2_tw2, q2_tw3;
    int32x4_t q_tmp0, q_tmp1, q_tmp2, q_tmp3, q_tmp4, q_tmp5;
    int32x4x2_t q2_out_0123, q2_out_4567, q2_out_89ab, q2_out_cdef;
    tw1 = twiddles;
    tw2 = twiddles + 4;
    tw3 = twiddles + 8;
    p_dst0 = (int32_t*) (&Fout[0]);
    p_dst1 = (int32_t*) (&Fout[4]);
    p_dst2 = (int32_t*) (&Fout[8]);
    p_dst3 = (int32_t*) (&Fout[12]);
    p_tw1 = (int32_t*) tw1;
    p_tw2 = (int32_t*) tw2;
    p_tw3 = (int32_t*) tw3;
    q2_tmp_0 = vzipq_s32 (q_out_r048c, q_out_r159d);
    q2_tmp_1 = vzipq_s32 (q_out_i048c, q_out_i159d);
    q2_tmp_2 = vzipq_s32 (q_out_r26ae, q_out_r37bf);
    q2_tmp_3 = vzipq_s32 (q_out_i26ae, q_out_i37bf);
    q_in_r0123 = vcombine_s32 (vget_low_s32 (q2_tmp_0.val[0]), vget_low_s32 (q2_tmp_2.val[0]));
    q_in_i0123 = vcombine_s32 (vget_low_s32 (q2_tmp_1.val[0]), vget_low_s32 (q2_tmp_3.val[0]));
    q_in_r4567 = vcombine_s32 (vget_high_s32 (q2_tmp_0.val[0]), vget_high_s32 (q2_tmp_2.val[0]));
    q_in_i4567 = vcombine_s32 (vget_high_s32 (q2_tmp_1.val[0]), vget_high_s32 (q2_tmp_3.val[0]));
    q_in_r89ab = vcombine_s32 (vget_low_s32 (q2_tmp_0.val[1]), vget_low_s32 (q2_tmp_2.val[1]));
    q_in_i89ab = vcombine_s32 (vget_low_s32 (q2_tmp_1.val[1]), vget_low_s32 (q2_tmp_3.val[1]));
    q_in_rcdef = vcombine_s32 (vget_high_s32 (q2_tmp_0.val[1]), vget_high_s32 (q2_tmp_2.val[1]));
    q_in_icdef = vcombine_s32 (vget_high_s32 (q2_tmp_1.val[1]), vget_high_s32 (q2_tmp_3.val[1]));
    q2_tw1 = vld2q_s32 (p_tw1);
    q2_tw2 = vld2q_s32 (p_tw2);
    q2_tw3 = vld2q_s32 (p_tw3);

    q_s0_r = vqrdmulhq_s32 (q_in_r4567, q2_tw1.val[0]);
    q_s0_i = vqrdmulhq_s32 (q_in_r4567, q2_tw1.val[1]);
    q_s1_r = vqrdmulhq_s32 (q_in_r89ab, q2_tw2.val[0]);
    q_s1_i = vqrdmulhq_s32 (q_in_r89ab, q2_tw2.val[1]);
    q_s2_r = vqrdmulhq_s32 (q_in_rcdef, q2_tw3.val[0]);
    q_s2_i = vqrdmulhq_s32 (q_in_rcdef, q2_tw3.val[1]);
    q_tmp0 = vqrdmulhq_s32 (q_in_i4567, q2_tw1.val[1]);
    q_tmp1 = vqrdmulhq_s32 (q_in_i4567, q2_tw1.val[0]);
    q_tmp2 = vqrdmulhq_s32 (q_in_i89ab, q2_tw2.val[1]);
    q_tmp3 = vqrdmulhq_s32 (q_in_i89ab, q2_tw2.val[0]);
    q_tmp4 = vqrdmulhq_s32 (q_in_icdef, q2_tw3.val[1]);
    q_tmp5 = vqrdmulhq_s32 (q_in_icdef, q2_tw3.val[0]);

    q_s0_r = vsubq_s32 (q_s0_r, q_tmp0);
    q_s0_i = vaddq_s32 (q_s0_i, q_tmp1);
    q_s1_r = vsubq_s32 (q_s1_r, q_tmp2);
    q_s1_i = vaddq_s32 (q_s1_i, q_tmp3);
    q_s2_r = vsubq_s32 (q_s2_r, q_tmp4);
    q_s2_i = vaddq_s32 (q_s2_i, q_tmp5);

    q_s5_r = vhsubq_s32 (q_in_r0123, q_s1_r);
    q_s5_i = vhsubq_s32 (q_in_i0123, q_s1_i);
    q2_out_0123.val[0] = vhaddq_s32 (q_in_r0123, q_s1_r);
    q2_out_0123.val[1] = vhaddq_s32 (q_in_i0123, q_s1_i);

    q_s3_r = vhaddq_s32 (q_s0_r, q_s2_r);
    q_s3_i = vhaddq_s32 (q_s0_i, q_s2_i);
    q_s4_r = vhsubq_s32 (q_s0_r, q_s2_r);
    q_s4_i = vhsubq_s32 (q_s0_i, q_s2_i);

    q2_out_89ab.val[0] = vhsubq_s32 (q2_out_0123.val[0], q_s3_r);
    q2_out_89ab.val[1] = vhsubq_s32 (q2_out_0123.val[1], q_s3_i);
    q2_out_0123.val[0] = vhaddq_s32 (q2_out_0123.val[0], q_s3_r);
    q2_out_0123.val[1] = vhaddq_s32 (q2_out_0123.val[1], q_s3_i);

    q2_out_4567.val[0] = vhaddq_s32 (q_s5_r, q_s4_i);
    q2_out_4567.val[1] = vhsubq_s32 (q_s5_i, q_s4_r);
    q2_out_cdef.val[0] = vhsubq_s32 (q_s5_r, q_s4_i);
    q2_out_cdef.val[1] = vhaddq_s32 (q_s5_i, q_s4_r);

    vst2q_s32 (p_dst0, q2_out_0123);
    vst2q_s32 (p_dst1, q2_out_4567);
    vst2q_s32 (p_dst2, q2_out_89ab);
    vst2q_s32 (p_dst3, q2_out_cdef);
}

static void ne10_fft16_backward_int32_scaled_neon (ne10_fft_cpx_int32_t * Fout,
                                                   ne10_fft_cpx_int32_t * Fin,
                                                   ne10_fft_cpx_int32_t * twiddles)
{
    ne10_fft_cpx_int32_t *tw1, *tw2, *tw3;

    // the first stage
    int32_t *p_src0, *p_src4, *p_src8, *p_src12;
    int32x4x2_t q2_in_0123, q2_in_4567, q2_in_89ab, q2_in_cdef;
    int32x4_t q_t0_r, q_t0_i, q_t1_r, q_t1_i, q_t2_r, q_t2_i, q_t3_r, q_t3_i;
    int32x4_t q_out_r048c, q_out_i048c, q_out_r159d, q_out_i159d;
    int32x4_t q_out_r26ae, q_out_i26ae, q_out_r37bf, q_out_i37bf;
    p_src0 = (int32_t*) (& (Fin[0]));
    p_src4 = (int32_t*) (& (Fin[4]));
    p_src8 = (int32_t*) (& (Fin[8]));
    p_src12 = (int32_t*) (& (Fin[12]));
    q2_in_0123 = vld2q_s32 (p_src0);
    q2_in_4567 = vld2q_s32 (p_src4);
    q2_in_89ab = vld2q_s32 (p_src8);
    q2_in_cdef = vld2q_s32 (p_src12);

    q_t2_r = vhsubq_s32 (q2_in_0123.val[0], q2_in_89ab.val[0]);
    q_t2_i = vhsubq_s32 (q2_in_0123.val[1], q2_in_89ab.val[1]);
    q_t3_r = vhaddq_s32 (q2_in_0123.val[0], q2_in_89ab.val[0]);
    q_t3_i = vhaddq_s32 (q2_in_0123.val[1], q2_in_89ab.val[1]);

    q_t0_r = vhaddq_s32 (q2_in_4567.val[0], q2_in_cdef.val[0]);
    q_t0_i = vhaddq_s32 (q2_in_4567.val[1], q2_in_cdef.val[1]);
    q_t1_r = vhsubq_s32 (q2_in_4567.val[0], q2_in_cdef.val[0]);
    q_t1_i = vhsubq_s32 (q2_in_4567.val[1], q2_in_cdef.val[1]);

    q_out_r26ae = vhsubq_s32 (q_t3_r, q_t0_r);
    q_out_i26ae = vhsubq_s32 (q_t3_i, q_t0_i);
    q_out_r048c = vhaddq_s32 (q_t3_r, q_t0_r);
    q_out_i048c = vhaddq_s32 (q_t3_i, q_t0_i);
    q_out_r159d = vhsubq_s32 (q_t2_r, q_t1_i);
    q_out_i159d = vhaddq_s32 (q_t2_i, q_t1_r);
    q_out_r37bf = vhaddq_s32 (q_t2_r, q_t1_i);
    q_out_i37bf = vhsubq_s32 (q_t2_i, q_t1_r);

    // the second stage
    int32_t *p_dst0, *p_dst1, *p_dst2, *p_dst3;
    int32_t *p_tw1, *p_tw2, *p_tw3;
    int32x4_t q_s0_r, q_s0_i, q_s1_r, q_s1_i, q_s2_r, q_s2_i;
    int32x4_t q_s3_r, q_s3_i, q_s4_r, q_s4_i, q_s5_r, q_s5_i;
    int32x4x2_t q2_tmp_0, q2_tmp_1, q2_tmp_2, q2_tmp_3;
    int32x4_t q_in_r0123, q_in_r4567, q_in_r89ab, q_in_rcdef;
    int32x4_t q_in_i0123, q_in_i4567, q_in_i89ab, q_in_icdef;
    int32x4x2_t q2_tw1, q2_tw2, q2_tw3;
    int32x4_t q_tmp0, q_tmp1, q_tmp2, q_tmp3, q_tmp4, q_tmp5;
    int32x4x2_t q2_out_0123, q2_out_4567, q2_out_89ab, q2_out_cdef;
    tw1 = twiddles;
    tw2 = twiddles + 4;
    tw3 = twiddles + 8;
    p_dst0 = (int32_t*) (&Fout[0]);
    p_dst1 = (int32_t*) (&Fout[4]);
    p_dst2 = (int32_t*) (&Fout[8]);
    p_dst3 = (int32_t*) (&Fout[12]);
    p_tw1 = (int32_t*) tw1;
    p_tw2 = (int32_t*) tw2;
    p_tw3 = (int32_t*) tw3;
    q2_tmp_0 = vzipq_s32 (q_out_r048c, q_out_r159d);
    q2_tmp_1 = vzipq_s32 (q_out_i048c, q_out_i159d);
    q2_tmp_2 = vzipq_s32 (q_out_r26ae, q_out_r37bf);
    q2_tmp_3 = vzipq_s32 (q_out_i26ae, q_out_i37bf);
    q_in_r0123 = vcombine_s32 (vget_low_s32 (q2_tmp_0.val[0]), vget_low_s32 (q2_tmp_2.val[0]));
    q_in_i0123 = vcombine_s32 (vget_low_s32 (q2_tmp_1.val[0]), vget_low_s32 (q2_tmp_3.val[0]));
    q_in_r4567 = vcombine_s32 (vget_high_s32 (q2_tmp_0.val[0]), vget_high_s32 (q2_tmp_2.val[0]));
    q_in_i4567 = vcombine_s32 (vget_high_s32 (q2_tmp_1.val[0]), vget_high_s32 (q2_tmp_3.val[0]));
    q_in_r89ab = vcombine_s32 (vget_low_s32 (q2_tmp_0.val[1]), vget_low_s32 (q2_tmp_2.val[1]));
    q_in_i89ab = vcombine_s32 (vget_low_s32 (q2_tmp_1.val[1]), vget_low_s32 (q2_tmp_3.val[1]));
    q_in_rcdef = vcombine_s32 (vget_high_s32 (q2_tmp_0.val[1]), vget_high_s32 (q2_tmp_2.val[1]));
    q_in_icdef = vcombine_s32 (vget_high_s32 (q2_tmp_1.val[1]), vget_high_s32 (q2_tmp_3.val[1]));
    q2_tw1 = vld2q_s32 (p_tw1);
    q2_tw2 = vld2q_s32 (p_tw2);
    q2_tw3 = vld2q_s32 (p_tw3);

    q_s0_r = vqrdmulhq_s32 (q_in_r4567, q2_tw1.val[0]);
    q_s0_i = vqrdmulhq_s32 (q_in_i4567, q2_tw1.val[0]);
    q_s1_r = vqrdmulhq_s32 (q_in_r89ab, q2_tw2.val[0]);
    q_s1_i = vqrdmulhq_s32 (q_in_i89ab, q2_tw2.val[0]);
    q_s2_r = vqrdmulhq_s32 (q_in_rcdef, q2_tw3.val[0]);
    q_s2_i = vqrdmulhq_s32 (q_in_icdef, q2_tw3.val[0]);
    q_tmp0 = vqrdmulhq_s32 (q_in_i4567, q2_tw1.val[1]);
    q_tmp1 = vqrdmulhq_s32 (q_in_r4567, q2_tw1.val[1]);
    q_tmp2 = vqrdmulhq_s32 (q_in_i89ab, q2_tw2.val[1]);
    q_tmp3 = vqrdmulhq_s32 (q_in_r89ab, q2_tw2.val[1]);
    q_tmp4 = vqrdmulhq_s32 (q_in_icdef, q2_tw3.val[1]);
    q_tmp5 = vqrdmulhq_s32 (q_in_rcdef, q2_tw3.val[1]);
    q_s0_r = vaddq_s32 (q_s0_r, q_tmp0);
    q_s0_i = vsubq_s32 (q_s0_i, q_tmp1);
    q_s1_r = vaddq_s32 (q_s1_r, q_tmp2);
    q_s1_i = vsubq_s32 (q_s1_i, q_tmp3);
    q_s2_r = vaddq_s32 (q_s2_r, q_tmp4);
    q_s2_i = vsubq_s32 (q_s2_i, q_tmp5);

    q_s5_r = vhsubq_s32 (q_in_r0123, q_s1_r);
    q_s5_i = vhsubq_s32 (q_in_i0123, q_s1_i);
    q2_out_0123.val[0] = vhaddq_s32 (q_in_r0123, q_s1_r);
    q2_out_0123.val[1] = vhaddq_s32 (q_in_i0123, q_s1_i);

    q_s3_r = vhaddq_s32 (q_s0_r, q_s2_r);
    q_s3_i = vhaddq_s32 (q_s0_i, q_s2_i);
    q_s4_r = vhsubq_s32 (q_s0_r, q_s2_r);
    q_s4_i = vhsubq_s32 (q_s0_i, q_s2_i);

    q2_out_89ab.val[0] = vhsubq_s32 (q2_out_0123.val[0], q_s3_r);
    q2_out_89ab.val[1] = vhsubq_s32 (q2_out_0123.val[1], q_s3_i);
    q2_out_0123.val[0] = vhaddq_s32 (q2_out_0123.val[0], q_s3_r);
    q2_out_0123.val[1] = vhaddq_s32 (q2_out_0123.val[1], q_s3_i);

    q2_out_4567.val[0] = vhsubq_s32 (q_s5_r, q_s4_i);
    q2_out_4567.val[1] = vhaddq_s32 (q_s5_i, q_s4_r);
    q2_out_cdef.val[0] = vhaddq_s32 (q_s5_r, q_s4_i);
    q2_out_cdef.val[1] = vhsubq_s32 (q_s5_i, q_s4_r);

    vst2q_s32 (p_dst0, q2_out_0123);
    vst2q_s32 (p_dst1, q2_out_4567);
    vst2q_s32 (p_dst2, q2_out_89ab);
    vst2q_s32 (p_dst3, q2_out_cdef);
}

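/*
 * Editor's note: split step of the real FFT. An N-point real transform is
 * evaluated as an N/2-point complex transform of the packed even/odd
 * samples, then unpacked here using the "super twiddles". The NEON loops
 * handle four conjugate bin pairs (k, ncfft - k) per iteration; short
 * transforms take the scalar tail.
 */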
static void ne10_fft_split_r2c_1d_int32_neon (ne10_fft_cpx_int32_t *dst,
                                              const ne10_fft_cpx_int32_t *src,
                                              ne10_fft_cpx_int32_t *twiddles,
                                              ne10_int32_t ncfft,
                                              ne10_int32_t scaled_flag)
{
    ne10_int32_t k;
    ne10_int32_t count = ncfft / 2;
    ne10_fft_cpx_int32_t fpnk, fpk, f1k, f2k, tw, tdc;
    int32x4x2_t q2_fpk, q2_fpnk, q2_tw, q2_dst, q2_dst2;
    int32x4_t q_fpnk_r, q_fpnk_i;
    int32x4_t q_f1k_r, q_f1k_i, q_f2k_r, q_f2k_i;
    int32x4_t q_tw_r, q_tw_i;
    int32x4_t q_tmp0, q_tmp1, q_tmp2, q_tmp3;
    int32x4_t q_dst2_r, q_dst2_i;
    int32_t *p_src, *p_src2, *p_dst, *p_dst2, *p_twiddles;

    tdc.r = src[0].r;
    tdc.i = src[0].i;

    if (scaled_flag)
        NE10_F2I32_FIXDIV (tdc, 2);

    dst[0].r = tdc.r + tdc.i;
    dst[ncfft].r = tdc.r - tdc.i;
    dst[ncfft].i = dst[0].i = 0;
    if (count >= 4)
    {
        if (scaled_flag)
        {
            for (k = 1; k <= count ; k += 4)
            {
                p_src = (int32_t*) (& (src[k]));
                p_src2 = (int32_t*) (& (src[ncfft - k - 3]));
                p_twiddles = (int32_t*) (& (twiddles[k - 1]));
                p_dst = (int32_t*) (& (dst[k]));
                p_dst2 = (int32_t*) (& (dst[ncfft - k - 3]));

                q2_fpk = vld2q_s32 (p_src);
                q2_fpnk = vld2q_s32 (p_src2);

                q2_tw = vld2q_s32 (p_twiddles);
                q2_fpnk.val[0] = vrev64q_s32 (q2_fpnk.val[0]);
                q2_fpnk.val[1] = vrev64q_s32 (q2_fpnk.val[1]);
                q_fpnk_r = vcombine_s32 (vget_high_s32 (q2_fpnk.val[0]), vget_low_s32 (q2_fpnk.val[0]));
                q_fpnk_i = vcombine_s32 (vget_high_s32 (q2_fpnk.val[1]), vget_low_s32 (q2_fpnk.val[1]));
                q_fpnk_i = vnegq_s32 (q_fpnk_i);

                q_f1k_r = vhaddq_s32 (q2_fpk.val[0], q_fpnk_r);
                q_f1k_i = vhaddq_s32 (q2_fpk.val[1], q_fpnk_i);

                q_f2k_r = vhsubq_s32 (q2_fpk.val[0], q_fpnk_r);
                q_f2k_i = vhsubq_s32 (q2_fpk.val[1], q_fpnk_i);

                q_tmp0 = vqdmulhq_s32 (q_f2k_r, q2_tw.val[0]);
                q_tmp1 = vqdmulhq_s32 (q_f2k_i, q2_tw.val[1]);
                q_tmp2 = vqdmulhq_s32 (q_f2k_r, q2_tw.val[1]);
                q_tmp3 = vqdmulhq_s32 (q_f2k_i, q2_tw.val[0]);
                q_tw_r = vsubq_s32 (q_tmp0, q_tmp1);
                q_tw_i = vaddq_s32 (q_tmp2, q_tmp3);

                q_dst2_r = vhsubq_s32 (q_f1k_r, q_tw_r);
                q_dst2_i = vhsubq_s32 (q_tw_i, q_f1k_i);
                q2_dst.val[0] = vhaddq_s32 (q_f1k_r, q_tw_r);
                q2_dst.val[1] = vhaddq_s32 (q_f1k_i, q_tw_i);
                q_dst2_r = vrev64q_s32 (q_dst2_r);
                q_dst2_i = vrev64q_s32 (q_dst2_i);
                q2_dst2.val[0] = vcombine_s32 (vget_high_s32 (q_dst2_r), vget_low_s32 (q_dst2_r));
                q2_dst2.val[1] = vcombine_s32 (vget_high_s32 (q_dst2_i), vget_low_s32 (q_dst2_i));
                vst2q_s32 (p_dst, q2_dst);
                vst2q_s32 (p_dst2, q2_dst2);
            }
        }
        else
        {
            for (k = 1; k <= count ; k += 4)
            {
                p_src = (int32_t*) (& (src[k]));
                p_src2 = (int32_t*) (& (src[ncfft - k - 3]));
                p_twiddles = (int32_t*) (& (twiddles[k - 1]));
                p_dst = (int32_t*) (& (dst[k]));
                p_dst2 = (int32_t*) (& (dst[ncfft - k - 3]));

                q2_fpk = vld2q_s32 (p_src);
                q2_fpnk = vld2q_s32 (p_src2);

                q2_tw = vld2q_s32 (p_twiddles);
                q2_fpnk.val[0] = vrev64q_s32 (q2_fpnk.val[0]);
                q2_fpnk.val[1] = vrev64q_s32 (q2_fpnk.val[1]);
                q_fpnk_r = vcombine_s32 (vget_high_s32 (q2_fpnk.val[0]), vget_low_s32 (q2_fpnk.val[0]));
                q_fpnk_i = vcombine_s32 (vget_high_s32 (q2_fpnk.val[1]), vget_low_s32 (q2_fpnk.val[1]));
                q_fpnk_i = vnegq_s32 (q_fpnk_i);

                q_f1k_r = vaddq_s32 (q2_fpk.val[0], q_fpnk_r);
                q_f1k_i = vaddq_s32 (q2_fpk.val[1], q_fpnk_i);

                q_f2k_r = vsubq_s32 (q2_fpk.val[0], q_fpnk_r);
                q_f2k_i = vsubq_s32 (q2_fpk.val[1], q_fpnk_i);

                q_tmp0 = vqdmulhq_s32 (q_f2k_r, q2_tw.val[0]);
                q_tmp1 = vqdmulhq_s32 (q_f2k_i, q2_tw.val[1]);
                q_tmp2 = vqdmulhq_s32 (q_f2k_r, q2_tw.val[1]);
                q_tmp3 = vqdmulhq_s32 (q_f2k_i, q2_tw.val[0]);
                q_tw_r = vsubq_s32 (q_tmp0, q_tmp1);
                q_tw_i = vaddq_s32 (q_tmp2, q_tmp3);

                q_dst2_r = vhsubq_s32 (q_f1k_r, q_tw_r);
                q_dst2_i = vhsubq_s32 (q_tw_i, q_f1k_i);
                q2_dst.val[0] = vhaddq_s32 (q_f1k_r, q_tw_r);
                q2_dst.val[1] = vhaddq_s32 (q_f1k_i, q_tw_i);
                q_dst2_r = vrev64q_s32 (q_dst2_r);
                q_dst2_i = vrev64q_s32 (q_dst2_i);
                q2_dst2.val[0] = vcombine_s32 (vget_high_s32 (q_dst2_r), vget_low_s32 (q_dst2_r));
                q2_dst2.val[1] = vcombine_s32 (vget_high_s32 (q_dst2_i), vget_low_s32 (q_dst2_i));
                vst2q_s32 (p_dst, q2_dst);
                vst2q_s32 (p_dst2, q2_dst2);
            }
        }
    }
    else
    {
        for (k = 1; k <= ncfft / 2 ; ++k)
        {
            fpk = src[k];
            fpnk.r = src[ncfft - k].r;
            fpnk.i = - src[ncfft - k].i;
            if (scaled_flag)
            {
                NE10_F2I32_FIXDIV (fpk, 2);
                NE10_F2I32_FIXDIV (fpnk, 2);
            }

            f1k.r = fpk.r + fpnk.r;
            f1k.i = fpk.i + fpnk.i;

            f2k.r = fpk.r - fpnk.r;
            f2k.i = fpk.i - fpnk.i;

            tw.r = ( ( (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) f2k.r * (twiddles[k - 1]).r) >> 32)) - ( (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) f2k.i * (twiddles[k - 1]).i) >> 32))) << 1;
            tw.i = ( ( (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) f2k.r * (twiddles[k - 1]).i) >> 32)) + ( (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) f2k.i * (twiddles[k - 1]).r) >> 32))) << 1;

            dst[k].r = (f1k.r + tw.r) >> 1;
            dst[k].i = (f1k.i + tw.i) >> 1;
            dst[ncfft - k].r = (f1k.r - tw.r) >> 1;
            dst[ncfft - k].i = (tw.i - f1k.i) >> 1;
        }
    }
}

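/*
 * Editor's note: inverse split step. Folds the conjugate-symmetric spectrum
 * (bins 0..ncfft) back into the packed half-length form consumed by the
 * inverse complex FFT.
 */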
static void ne10_fft_split_c2r_1d_int32_neon (ne10_fft_cpx_int32_t *dst,
                                              const ne10_fft_cpx_int32_t *src,
                                              ne10_fft_cpx_int32_t *twiddles,
                                              ne10_int32_t ncfft,
                                              ne10_int32_t scaled_flag)
{

    ne10_int32_t k;
    ne10_int32_t count = ncfft / 2;
    ne10_fft_cpx_int32_t fk, fnkc, fek, fok, tmp;
    int32x4x2_t q2_fk, q2_fnkc, q2_tw, q2_dst, q2_dst2;
    int32x4_t q_fnkc_r, q_fnkc_i;
    int32x4_t q_fek_r, q_fek_i, q_fok_r, q_fok_i;
    int32x4_t q_tmp0, q_tmp1, q_tmp2, q_tmp3;
    int32x4_t q_dst2_r, q_dst2_i;
    int32_t *p_src, *p_src2, *p_dst, *p_dst2, *p_twiddles;


    dst[0].r = src[0].r + src[ncfft].r;
    dst[0].i = src[0].r - src[ncfft].r;
    if (scaled_flag)
        NE10_F2I32_FIXDIV (dst[0], 2);
    if (count >= 4)
    {
        if (scaled_flag)
        {
            for (k = 1; k <= count ; k += 4)
            {
                p_src = (int32_t*) (& (src[k]));
                p_src2 = (int32_t*) (& (src[ncfft - k - 3]));
                p_twiddles = (int32_t*) (& (twiddles[k - 1]));
                p_dst = (int32_t*) (& (dst[k]));
                p_dst2 = (int32_t*) (& (dst[ncfft - k - 3]));

                q2_fk = vld2q_s32 (p_src);
                q2_fnkc = vld2q_s32 (p_src2);
                q2_tw = vld2q_s32 (p_twiddles);
                q2_fnkc.val[0] = vrev64q_s32 (q2_fnkc.val[0]);
                q2_fnkc.val[1] = vrev64q_s32 (q2_fnkc.val[1]);
                q_fnkc_r = vcombine_s32 (vget_high_s32 (q2_fnkc.val[0]), vget_low_s32 (q2_fnkc.val[0]));
                q_fnkc_i = vcombine_s32 (vget_high_s32 (q2_fnkc.val[1]), vget_low_s32 (q2_fnkc.val[1]));
                q_fnkc_i = vnegq_s32 (q_fnkc_i);

                q_fek_r = vhaddq_s32 (q2_fk.val[0], q_fnkc_r);
                q_fek_i = vhaddq_s32 (q2_fk.val[1], q_fnkc_i);
                q_tmp0 = vhsubq_s32 (q2_fk.val[0], q_fnkc_r);
                q_tmp1 = vhsubq_s32 (q2_fk.val[1], q_fnkc_i);

                q_fok_r = vqdmulhq_s32 (q_tmp0, q2_tw.val[0]);
                q_fok_i = vqdmulhq_s32 (q_tmp1, q2_tw.val[0]);
                q_tmp2 = vqdmulhq_s32 (q_tmp1, q2_tw.val[1]);
                q_tmp3 = vqdmulhq_s32 (q_tmp0, q2_tw.val[1]);
                q_fok_r = vaddq_s32 (q_fok_r, q_tmp2);
                q_fok_i = vsubq_s32 (q_fok_i, q_tmp3);

                q_dst2_r = vsubq_s32 (q_fek_r, q_fok_r);
                q_dst2_i = vsubq_s32 (q_fok_i, q_fek_i);
                q2_dst.val[0] = vaddq_s32 (q_fek_r, q_fok_r);
                q2_dst.val[1] = vaddq_s32 (q_fek_i, q_fok_i);
                q_dst2_r = vrev64q_s32 (q_dst2_r);
                q_dst2_i = vrev64q_s32 (q_dst2_i);
                q2_dst2.val[0] = vcombine_s32 (vget_high_s32 (q_dst2_r), vget_low_s32 (q_dst2_r));
                q2_dst2.val[1] = vcombine_s32 (vget_high_s32 (q_dst2_i), vget_low_s32 (q_dst2_i));
                vst2q_s32 (p_dst, q2_dst);
                vst2q_s32 (p_dst2, q2_dst2);
            }

        }
        else
        {
            for (k = 1; k <= count ; k += 4)
            {
                p_src = (int32_t*) (& (src[k]));
                p_src2 = (int32_t*) (& (src[ncfft - k - 3]));
                p_twiddles = (int32_t*) (& (twiddles[k - 1]));
                p_dst = (int32_t*) (& (dst[k]));
                p_dst2 = (int32_t*) (& (dst[ncfft - k - 3]));

                q2_fk = vld2q_s32 (p_src);
                q2_fnkc = vld2q_s32 (p_src2);
                q2_tw = vld2q_s32 (p_twiddles);
                q2_fnkc.val[0] = vrev64q_s32 (q2_fnkc.val[0]);
                q2_fnkc.val[1] = vrev64q_s32 (q2_fnkc.val[1]);
                q_fnkc_r = vcombine_s32 (vget_high_s32 (q2_fnkc.val[0]), vget_low_s32 (q2_fnkc.val[0]));
                q_fnkc_i = vcombine_s32 (vget_high_s32 (q2_fnkc.val[1]), vget_low_s32 (q2_fnkc.val[1]));
                q_fnkc_i = vnegq_s32 (q_fnkc_i);

                q_fek_r = vaddq_s32 (q2_fk.val[0], q_fnkc_r);
                q_fek_i = vaddq_s32 (q2_fk.val[1], q_fnkc_i);
                q_tmp0 = vsubq_s32 (q2_fk.val[0], q_fnkc_r);
                q_tmp1 = vsubq_s32 (q2_fk.val[1], q_fnkc_i);

                q_fok_r = vqdmulhq_s32 (q_tmp0, q2_tw.val[0]);
                q_fok_i = vqdmulhq_s32 (q_tmp1, q2_tw.val[0]);
                q_tmp2 = vqdmulhq_s32 (q_tmp1, q2_tw.val[1]);
                q_tmp3 = vqdmulhq_s32 (q_tmp0, q2_tw.val[1]);
                q_fok_r = vaddq_s32 (q_fok_r, q_tmp2);
                q_fok_i = vsubq_s32 (q_fok_i, q_tmp3);

                q_dst2_r = vsubq_s32 (q_fek_r, q_fok_r);
                q_dst2_i = vsubq_s32 (q_fok_i, q_fek_i);
                q2_dst.val[0] = vaddq_s32 (q_fek_r, q_fok_r);
                q2_dst.val[1] = vaddq_s32 (q_fek_i, q_fok_i);
                q_dst2_r = vrev64q_s32 (q_dst2_r);
                q_dst2_i = vrev64q_s32 (q_dst2_i);
                q2_dst2.val[0] = vcombine_s32 (vget_high_s32 (q_dst2_r), vget_low_s32 (q_dst2_r));
                q2_dst2.val[1] = vcombine_s32 (vget_high_s32 (q_dst2_i), vget_low_s32 (q_dst2_i));
                vst2q_s32 (p_dst, q2_dst);
                vst2q_s32 (p_dst2, q2_dst2);
            }
        }
    }
    else
    {

        for (k = 1; k <= ncfft / 2; k++)
        {
            fk = src[k];
            fnkc.r = src[ncfft - k].r;
            fnkc.i = -src[ncfft - k].i;
            if (scaled_flag)
            {
                NE10_F2I32_FIXDIV (fk, 2);
                NE10_F2I32_FIXDIV (fnkc, 2);
            }

            fek.r = fk.r + fnkc.r;
            fek.i = fk.i + fnkc.i;

            tmp.r = fk.r - fnkc.r;
            tmp.i = fk.i - fnkc.i;

            fok.r = ( ( (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) tmp.r * (twiddles[k - 1]).r) >> 32)) + ( (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) tmp.i * (twiddles[k - 1]).i) >> 32))) << 1;
            fok.i = ( ( (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) tmp.i * (twiddles[k - 1]).r) >> 32)) - ( (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) tmp.r * (twiddles[k - 1]).i) >> 32))) << 1;

            dst[k].r = fek.r + fok.r;
            dst[k].i = fek.i + fok.i;

            dst[ncfft - k].r = fek.r - fok.r;
            dst[ncfft - k].i = fok.i - fek.i;
        }
    }
}

/**
 * Mixed radix-2/4 complex FFT/IFFT of 32-bit fixed point data.
 */
void ne10_fft_c2c_1d_int32_neon (ne10_fft_cpx_int32_t *fout,
                                 ne10_fft_cpx_int32_t *fin,
                                 ne10_fft_cfg_int32_t cfg,
                                 ne10_int32_t inverse_fft,
                                 ne10_int32_t scaled_flag)
{
    // For input shorter than 16, fall back to the C version.
    // We would not get much improvement from NEON for these cases.
    if (cfg->nfft < 16)
    {
        ne10_fft_c2c_1d_int32_c (fout, fin, cfg, inverse_fft, scaled_flag);
        return;
    }

    ne10_int32_t stage_count = cfg->factors[0];
    ne10_int32_t algorithm_flag = cfg->factors[2 * (stage_count + 1)];

    assert ((algorithm_flag == NE10_FFT_ALG_24)
            || (algorithm_flag == NE10_FFT_ALG_ANY));

    // For NE10_FFT_ALG_ANY.
    // Function will return inside this branch.
    if (algorithm_flag == NE10_FFT_ALG_ANY)
    {
        if (inverse_fft)
        {
            ne10_mixed_radix_generic_butterfly_inverse_int32_neon (fout, fin,
                    cfg->factors, cfg->twiddles, cfg->buffer, scaled_flag);
        }
        else
        {
            ne10_mixed_radix_generic_butterfly_int32_neon (fout, fin,
                    cfg->factors, cfg->twiddles, cfg->buffer, scaled_flag);
        }
        return;
    }

    if (scaled_flag)
    {
        if (inverse_fft)
        {
            switch (cfg->nfft)
            {
            case 4:
                ne10_fft4_backward_int32_scaled (fout, fin);
                break;
            case 8:
                ne10_fft8_backward_int32_scaled (fout, fin);
                break;
            case 16:
                ne10_fft16_backward_int32_scaled_neon (fout, fin, cfg->twiddles);
                break;
            default:
                ne10_mixed_radix_fft_backward_int32_scaled_neon (fout, fin, cfg->factors, cfg->twiddles, cfg->buffer);
                break;
            }
        }
        else
        {
            switch (cfg->nfft)
            {
            case 4:
                ne10_fft4_forward_int32_scaled (fout, fin);
                break;
            case 8:
                ne10_fft8_forward_int32_scaled (fout, fin);
                break;
            case 16:
                ne10_fft16_forward_int32_scaled_neon (fout, fin, cfg->twiddles);
                break;
            default:
                ne10_mixed_radix_fft_forward_int32_scaled_neon (fout, fin, cfg->factors, cfg->twiddles, cfg->buffer);
                break;
            }
        }
    }
    else
    {
        if (inverse_fft)
        {
            switch (cfg->nfft)
            {
            case 4:
                ne10_fft4_backward_int32_unscaled (fout, fin);
                break;
            case 8:
                ne10_fft8_backward_int32_unscaled (fout, fin);
                break;
            case 16:
                ne10_fft16_backward_int32_unscaled_neon (fout, fin, cfg->twiddles);
                break;
            default:
                ne10_mixed_radix_fft_backward_int32_unscaled_neon (fout, fin, cfg->factors, cfg->twiddles, cfg->buffer);
                break;
            }
        }
        else
        {
            switch (cfg->nfft)
            {
            case 4:
                ne10_fft4_forward_int32_unscaled (fout, fin);
                break;
            case 8:
                ne10_fft8_forward_int32_unscaled (fout, fin);
                break;
            case 16:
                ne10_fft16_forward_int32_unscaled_neon (fout, fin, cfg->twiddles);
                break;
            default:
                ne10_mixed_radix_fft_forward_int32_unscaled_neon (fout, fin, cfg->factors, cfg->twiddles, cfg->buffer);
                break;
            }
        }
    }
}
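/*
 * Editor's note: a minimal usage sketch for the dispatcher above (not part
 * of the original file, guarded out of compilation). It assumes the plan is
 * created with ne10_fft_alloc_c2c_int32_neon() and released with NE10_FREE();
 * check the NE10_dsp.h of your Ne10 version for the exact names.
 */
#if 0
static void example_c2c_int32 (void)
{
    ne10_fft_cfg_int32_t cfg = ne10_fft_alloc_c2c_int32_neon (64); // 64-point plan
    ne10_fft_cpx_int32_t in[64] = { {0, 0} };                      // Q31 samples
    ne10_fft_cpx_int32_t out[64];

    ne10_fft_c2c_1d_int32_neon (out, in, cfg, 0, 1); // forward, scaled by 1/N
    ne10_fft_c2c_1d_int32_neon (in, out, cfg, 1, 1); // inverse, scaled

    NE10_FREE (cfg);
}
#endif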

/**
 * @} end of C2C_FFT_IFFT group
 */

/**
 * Mixed radix-2/4 FFT (real to complex) of int32 data.
 */
void ne10_fft_r2c_1d_int32_neon (ne10_fft_cpx_int32_t *fout,
                                 ne10_int32_t *fin,
                                 ne10_fft_r2c_cfg_int32_t cfg,
                                 ne10_int32_t scaled_flag)
{
    ne10_fft_cpx_int32_t * tmpbuf1 = cfg->buffer;
    ne10_fft_cpx_int32_t * tmpbuf2 = cfg->buffer + cfg->ncfft;
    ne10_fft_state_int32_t c2c_state;

    c2c_state.nfft = cfg->ncfft;
    c2c_state.factors = cfg->factors;
    c2c_state.twiddles = cfg->twiddles;
    c2c_state.buffer = tmpbuf2;

    ne10_fft_c2c_1d_int32_neon (tmpbuf1, (ne10_fft_cpx_int32_t*) fin, &c2c_state, 0, scaled_flag);
    ne10_fft_split_r2c_1d_int32_neon (fout, tmpbuf1, cfg->super_twiddles, cfg->ncfft, scaled_flag);
}
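/*
 * Editor's note: the complex-to-real inverse below mirrors the r2c pipeline:
 * the split step runs first, then a half-length inverse complex FFT. fin
 * holds ncfft + 1 complex bins (DC through Nyquist).
 */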
/**
 * Mixed radix-2/4 IFFT (complex to real) of int32 data.
 */
void ne10_fft_c2r_1d_int32_neon (ne10_int32_t *fout,
                                 ne10_fft_cpx_int32_t *fin,
                                 ne10_fft_r2c_cfg_int32_t cfg,
                                 ne10_int32_t scaled_flag)
{
    ne10_fft_cpx_int32_t * tmpbuf1 = cfg->buffer;
    ne10_fft_cpx_int32_t * tmpbuf2 = cfg->buffer + cfg->ncfft;
    ne10_fft_state_int32_t c2c_state;

    c2c_state.nfft = cfg->ncfft;
    c2c_state.factors = cfg->factors;
    c2c_state.twiddles = cfg->twiddles;
    c2c_state.buffer = tmpbuf2;

    ne10_fft_split_c2r_1d_int32_neon (tmpbuf1, fin, cfg->super_twiddles, cfg->ncfft, scaled_flag);
    ne10_fft_c2c_1d_int32_neon ( (ne10_fft_cpx_int32_t*) fout, tmpbuf1, &c2c_state, 1, scaled_flag);
}
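/*
 * Editor's note: companion sketch for the real transforms (again guarded
 * out; assumes the allocator ne10_fft_alloc_r2c_int32() from NE10_dsp.h --
 * verify against your Ne10 version).
 */
#if 0
static void example_r2c_int32 (void)
{
    ne10_fft_r2c_cfg_int32_t cfg = ne10_fft_alloc_r2c_int32 (128); // 128 real samples
    ne10_int32_t time_in[128] = {0};   // Q31 time-domain input
    ne10_fft_cpx_int32_t freq[65];     // nfft/2 + 1 spectral bins
    ne10_int32_t time_out[128];

    ne10_fft_r2c_1d_int32_neon (freq, time_in, cfg, 1);   // forward, scaled
    ne10_fft_c2r_1d_int32_neon (time_out, freq, cfg, 1);  // inverse, scaled

    NE10_FREE (cfg);
}
#endif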