Project Ne10
An Open Optimized Software Library Project for the ARM Architecture
NE10_fft_int16.neonintrinsic.c
1/*
2 * Copyright 2013-15 ARM Limited and Contributors.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 * * Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of ARM Limited nor the
13 * names of its contributors may be used to endorse or promote products
14 * derived from this software without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 * DISCLAIMED. IN NO EVENT SHALL ARM LIMITED AND CONTRIBUTORS BE LIABLE FOR ANY
20 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28/*
29 * NE10 Library : dsp/NE10_fft_int16.neon.c
30 */
31
32#include <arm_neon.h>
33
34#include "NE10_types.h"
35#include "NE10_macros.h"
36#include "NE10_fft.h"
37
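/*
 * Fixed-size 4-point kernels. FFT4_FS computes the first butterfly stage
 * (pairwise sums and differences of the four inputs) and FFT4_FWD_LS /
 * FFT4_INV_LS the last stage of the forward/inverse transform. The _SCALED
 * variant shifts the stage-1 results right by 2, so the whole length-4
 * transform is divided by 4 to keep the result within int16 range.
 */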
38#define FFT4_FS_START \
39 ne10_int16_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i; \
40 ne10_int16_t tmp_r, tmp_i;
41
42
43#define FFT4_FS \
44 s2_r = Fin[0].r - Fin[2].r; \
45 s2_i = Fin[0].i - Fin[2].i; \
46 tmp_r = Fin[0].r + Fin[2].r; \
47 tmp_i = Fin[0].i + Fin[2].i; \
48 s0_r = Fin[1].r + Fin[3].r; \
49 s0_i = Fin[1].i + Fin[3].i; \
50 s1_r = Fin[1].r - Fin[3].r; \
51 s1_i = Fin[1].i - Fin[3].i;
52
53#define FFT4_FS_SCALED \
54 s2_r = (Fin[0].r - Fin[2].r) >> 2; \
55 s2_i = (Fin[0].i - Fin[2].i) >> 2; \
56 tmp_r = (Fin[0].r + Fin[2].r) >> 2; \
57 tmp_i = (Fin[0].i + Fin[2].i) >> 2; \
58 s0_r = (Fin[1].r + Fin[3].r) >> 2; \
59 s0_i = (Fin[1].i + Fin[3].i) >> 2; \
60 s1_r = (Fin[1].r - Fin[3].r) >> 2; \
61 s1_i = (Fin[1].i - Fin[3].i) >> 2;
62
63#define FFT4_FWD_LS \
64 Fout[2].r = tmp_r - s0_r; \
65 Fout[2].i = tmp_i - s0_i; \
66 Fout[0].r = tmp_r + s0_r; \
67 Fout[0].i = tmp_i + s0_i; \
68 Fout[1].r = s2_r + s1_i; \
69 Fout[1].i = s2_i - s1_r; \
70 Fout[3].r = s2_r - s1_i; \
71 Fout[3].i = s2_i + s1_r;
72
73#define FFT4_INV_LS \
74 Fout[2].r = tmp_r - s0_r; \
75 Fout[2].i = tmp_i - s0_i; \
76 Fout[0].r = tmp_r + s0_r; \
77 Fout[0].i = tmp_i + s0_i; \
78 Fout[1].r = s2_r - s1_i; \
79 Fout[1].i = s2_i + s1_r; \
80 Fout[3].r = s2_r + s1_i; \
81 Fout[3].i = s2_i - s1_r;
82
static inline void ne10_fft4_forward_int16_unscaled (ne10_fft_cpx_int16_t * Fout,
                                                     ne10_fft_cpx_int16_t * Fin)
{
87 FFT4_FS_START
88 FFT4_FS
89 FFT4_FWD_LS
90}
91
static inline void ne10_fft4_backward_int16_unscaled (ne10_fft_cpx_int16_t * Fout,
                                                      ne10_fft_cpx_int16_t * Fin)
{
96 FFT4_FS_START
97 FFT4_FS
98 FFT4_INV_LS
99}
static inline void ne10_fft4_forward_int16_scaled (ne10_fft_cpx_int16_t * Fout,
                                                   ne10_fft_cpx_int16_t * Fin)
{
104 FFT4_FS_START
105 FFT4_FS_SCALED
106 FFT4_FWD_LS
107}
108
static inline void ne10_fft4_backward_int16_scaled (ne10_fft_cpx_int16_t * Fout,
                                                    ne10_fft_cpx_int16_t * Fin)
{
113 FFT4_FS_START
114 FFT4_FS_SCALED
115 FFT4_INV_LS
116}
117
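/*
 * Fixed-size 8-point kernels. TW_81 is sqrt(2)/2 (~0.7071) in Q15 format and
 * provides the non-trivial 8-point twiddle factors. The _SCALED first stage
 * shifts right by 3 so the length-8 transform is divided by 8.
 */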
118#define FFT8_FS_START \
119 ne10_int16_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i, s3_r, s3_i, s4_r, s4_i, s5_r, s5_i, s6_r, s6_i, s7_r, s7_i; \
120 ne10_int16_t t0_r, t0_i, t1_r, t1_i, t2_r, t2_i, t3_r, t3_i, t4_r, t4_i, t5_r, t5_i; \
121 const ne10_int16_t TW_81 = 23169;
122
123#define FFT8_FS \
124 s0_r = Fin[0].r + Fin[4].r; \
125 s0_i = Fin[0].i + Fin[4].i; \
126 s1_r = Fin[0].r - Fin[4].r; \
127 s1_i = Fin[0].i - Fin[4].i; \
128 s2_r = Fin[1].r + Fin[5].r; \
129 s2_i = Fin[1].i + Fin[5].i; \
130 s3_r = Fin[1].r - Fin[5].r; \
131 s3_i = Fin[1].i - Fin[5].i; \
132 s4_r = Fin[2].r + Fin[6].r; \
133 s4_i = Fin[2].i + Fin[6].i; \
134 s5_r = Fin[2].r - Fin[6].r; \
135 s5_i = Fin[2].i - Fin[6].i; \
136 s6_r = Fin[3].r + Fin[7].r; \
137 s6_i = Fin[3].i + Fin[7].i; \
138 s7_r = Fin[3].r - Fin[7].r; \
139 s7_i = Fin[3].i - Fin[7].i;
140
141#define FFT8_FS_SCALED \
142 s0_r = (Fin[0].r + Fin[4].r) >> 3; \
143 s0_i = (Fin[0].i + Fin[4].i) >> 3; \
144 s1_r = (Fin[0].r - Fin[4].r) >> 3; \
145 s1_i = (Fin[0].i - Fin[4].i) >> 3; \
146 s2_r = (Fin[1].r + Fin[5].r) >> 3; \
147 s2_i = (Fin[1].i + Fin[5].i) >> 3; \
148 s3_r = (Fin[1].r - Fin[5].r) >> 3; \
149 s3_i = (Fin[1].i - Fin[5].i) >> 3; \
150 s4_r = (Fin[2].r + Fin[6].r) >> 3; \
151 s4_i = (Fin[2].i + Fin[6].i) >> 3; \
152 s5_r = (Fin[2].r - Fin[6].r) >> 3; \
153 s5_i = (Fin[2].i - Fin[6].i) >> 3; \
154 s6_r = (Fin[3].r + Fin[7].r) >> 3; \
155 s6_i = (Fin[3].i + Fin[7].i) >> 3; \
156 s7_r = (Fin[3].r - Fin[7].r) >> 3; \
157 s7_i = (Fin[3].i - Fin[7].i) >> 3;
158
159
160#define FFT8_FWD_LS \
161 t0_r = s0_r - s4_r; \
162 t0_i = s0_i - s4_i; \
163 t1_r = s0_r + s4_r; \
164 t1_i = s0_i + s4_i; \
165 t2_r = s2_r + s6_r; \
166 t2_i = s2_i + s6_i; \
167 t3_r = s2_r - s6_r; \
168 t3_i = s2_i - s6_i; \
169 Fout[0].r = t1_r + t2_r; \
170 Fout[0].i = t1_i + t2_i; \
171 Fout[4].r = t1_r - t2_r; \
172 Fout[4].i = t1_i - t2_i; \
173 Fout[2].r = t0_r + t3_i; \
174 Fout[2].i = t0_i - t3_r; \
175 Fout[6].r = t0_r - t3_i; \
176 Fout[6].i = t0_i + t3_r; \
177 t4_r = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s3_r + s3_i) * TW_81) >> NE10_F2I16_SHIFT); \
178 t4_i = - (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s3_r - s3_i) * TW_81) >> NE10_F2I16_SHIFT); \
179 t5_r = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s7_r - s7_i) * TW_81) >> NE10_F2I16_SHIFT); \
180 t5_i = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s7_r + s7_i) * TW_81) >> NE10_F2I16_SHIFT); \
181 t0_r = s1_r - s5_i; \
182 t0_i = s1_i + s5_r; \
183 t1_r = s1_r + s5_i; \
184 t1_i = s1_i - s5_r; \
185 t2_r = t4_r - t5_r; \
186 t2_i = t4_i - t5_i; \
187 t3_r = t4_r + t5_r; \
188 t3_i = t4_i + t5_i; \
189 Fout[1].r = t1_r + t2_r; \
190 Fout[1].i = t1_i + t2_i; \
191 Fout[5].r = t1_r - t2_r; \
192 Fout[5].i = t1_i - t2_i; \
193 Fout[3].r = t0_r + t3_i; \
194 Fout[3].i = t0_i - t3_r; \
195 Fout[7].r = t0_r - t3_i; \
196 Fout[7].i = t0_i + t3_r;
197
198#define FFT8_INV_LS \
199 t0_r = s0_r - s4_r; \
200 t0_i = s0_i - s4_i; \
201 t1_r = s0_r + s4_r; \
202 t1_i = s0_i + s4_i; \
203 t2_r = s2_r + s6_r; \
204 t2_i = s2_i + s6_i; \
205 t3_r = s2_r - s6_r; \
206 t3_i = s2_i - s6_i; \
207 Fout[0].r = t1_r + t2_r; \
208 Fout[0].i = t1_i + t2_i; \
209 Fout[4].r = t1_r - t2_r; \
210 Fout[4].i = t1_i - t2_i; \
211 Fout[2].r = t0_r - t3_i; \
212 Fout[2].i = t0_i + t3_r; \
213 Fout[6].r = t0_r + t3_i; \
214 Fout[6].i = t0_i - t3_r; \
215 t4_r = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s3_r - s3_i) * TW_81) >> NE10_F2I16_SHIFT); \
216 t4_i = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s3_r + s3_i) * TW_81) >> NE10_F2I16_SHIFT); \
217 t5_r = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s7_r + s7_i) * TW_81) >> NE10_F2I16_SHIFT); \
218 t5_i = - (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s7_r - s7_i) * TW_81) >> NE10_F2I16_SHIFT); \
219 t0_r = s1_r + s5_i; \
220 t0_i = s1_i - s5_r; \
221 t1_r = s1_r - s5_i; \
222 t1_i = s1_i + s5_r; \
223 t2_r = t4_r - t5_r; \
224 t2_i = t4_i - t5_i; \
225 t3_r = t4_r + t5_r; \
226 t3_i = t4_i + t5_i; \
227 Fout[1].r = t1_r + t2_r; \
228 Fout[1].i = t1_i + t2_i; \
229 Fout[5].r = t1_r - t2_r; \
230 Fout[5].i = t1_i - t2_i; \
231 Fout[3].r = t0_r - t3_i; \
232 Fout[3].i = t0_i + t3_r; \
233 Fout[7].r = t0_r + t3_i; \
234 Fout[7].i = t0_i - t3_r;
235
static inline void ne10_fft8_forward_int16_unscaled (ne10_fft_cpx_int16_t * Fout,
                                                     ne10_fft_cpx_int16_t * Fin)
{
240 FFT8_FS_START
241 FFT8_FS
242 FFT8_FWD_LS
243}
244
static inline void ne10_fft8_backward_int16_unscaled (ne10_fft_cpx_int16_t * Fout,
                                                      ne10_fft_cpx_int16_t * Fin)
{
249 FFT8_FS_START
250 FFT8_FS
251 FFT8_INV_LS
252}
static inline void ne10_fft8_forward_int16_scaled (ne10_fft_cpx_int16_t * Fout,
                                                   ne10_fft_cpx_int16_t * Fin)
{
257 FFT8_FS_START
258 FFT8_FS_SCALED
259 FFT8_FWD_LS
260}
261
static inline void ne10_fft8_backward_int16_scaled (ne10_fft_cpx_int16_t * Fout,
                                                    ne10_fft_cpx_int16_t * Fin)
{
266 FFT8_FS_START
267 FFT8_FS_SCALED
268 FFT8_INV_LS
269}
270
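/*
 * NEON radix-8 first-stage kernel, processing four butterflies (columns) per
 * loop iteration in 64-bit vectors of four int16 lanes. vld2_s16/vst2_s16
 * de-interleave and re-interleave the real and imaginary parts. The _SCALED
 * variants below use halving adds/subtracts (vhadd/vhsub) at every butterfly
 * level to avoid overflow.
 */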
271#define RADIX8x4_START \
272 ne10_int32_t f_count; \
273 ne10_int32_t src_step = stride << 1; \
274 const ne10_int16_t TW_81 = 23169; \
275 const ne10_int16_t TW_81N = -23169; \
276 int16_t *p_src, *p_dst; \
277 int16x4x2_t d2_in0, d2_in1, d2_in2, d2_in3, d2_in4, d2_in5, d2_in6, d2_in7; \
278 int16x4_t d_sin0_r, d_sin0_i, d_sin1_r, d_sin1_i, d_sin2_r, d_sin2_i, d_sin3_r, d_sin3_i; \
279 int16x4_t d_sin4_r, d_sin4_i, d_sin5_r, d_sin5_i, d_sin6_r, d_sin6_i, d_sin7_r, d_sin7_i; \
280 int16x4_t d_s3_r, d_s3_i, d_s5_r, d_s5_i, d_s7_r, d_s7_i; \
281 int16x4_t d_s8_r, d_s8_i, d_s9_r, d_s9_i, d_s10_r, d_s10_i, d_s11_r, d_s11_i; \
282 int16x4_t d_s12_r, d_s12_i, d_s13_r, d_s13_i, d_s14_r, d_s14_i, d_s15_r, d_s15_i; \
283 int16x4_t d_out0_r, d_out0_i, d_out1_r, d_out1_i, d_out2_r, d_out2_i, d_out3_r, d_out3_i; \
284 int16x4_t d_out4_r, d_out4_i, d_out5_r, d_out5_i, d_out6_r, d_out6_i, d_out7_r, d_out7_i; \
285 int16x4x2_t d2_out0, d2_out1, d2_out2, d2_out3, d2_out4, d2_out5, d2_out6, d2_out7; \
286 int16x8x2_t q2_tmp0, q2_tmp1, q2_tmp2, q2_tmp3; \
287 int32x4x2_t q2_tmp4, q2_tmp5, q2_tmp6, q2_tmp7; \
288 int16x4_t d_tw_81, d_tw_81n; \
289 p_src = (int16_t *) Fin; \
290 p_dst = (int16_t *) Fout;
291
292
293#define RADIX8x4_LOAD \
294 d2_in0 = vld2_s16 (p_src); \
295 p_src += src_step; \
296 d2_in2 = vld2_s16 (p_src); \
297 p_src += src_step; \
298 d2_in4 = vld2_s16 (p_src); \
299 p_src += src_step; \
300 d2_in6 = vld2_s16 (p_src); \
301 p_src += src_step; \
302 d2_in1 = vld2_s16 (p_src); \
303 p_src += src_step; \
304 d2_in3 = vld2_s16 (p_src); \
305 p_src += src_step; \
306 d2_in5 = vld2_s16 (p_src); \
307 p_src += src_step; \
308 d2_in7 = vld2_s16 (p_src); \
309 p_src += src_step;
310
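/*
 * RADIX8x4_STORE regroups the eight result vectors from the
 * "four butterflies per register" layout back to consecutive complex outputs
 * (the vtrnq_s16/vtrnq_s32 pairs implement the transpose), stores them with
 * vst2_s16, and rewinds p_src to the next group of four columns.
 */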
311#define RADIX8x4_STORE \
312 q2_tmp0 = vtrnq_s16 (vcombine_s16(d_out0_r, d_out0_i), vcombine_s16(d_out1_r, d_out1_i)); \
313 q2_tmp1 = vtrnq_s16 (vcombine_s16(d_out2_r, d_out2_i), vcombine_s16(d_out3_r, d_out3_i)); \
314 q2_tmp2 = vtrnq_s16 (vcombine_s16(d_out4_r, d_out4_i), vcombine_s16(d_out5_r, d_out5_i)); \
315 q2_tmp3 = vtrnq_s16 (vcombine_s16(d_out6_r, d_out6_i), vcombine_s16(d_out7_r, d_out7_i)); \
316 q2_tmp4 = vtrnq_s32 (vreinterpretq_s32_s16(q2_tmp0.val[0]), vreinterpretq_s32_s16(q2_tmp1.val[0])); \
317 q2_tmp5 = vtrnq_s32 (vreinterpretq_s32_s16(q2_tmp0.val[1]), vreinterpretq_s32_s16(q2_tmp1.val[1])); \
318 q2_tmp6 = vtrnq_s32 (vreinterpretq_s32_s16(q2_tmp2.val[0]), vreinterpretq_s32_s16(q2_tmp3.val[0])); \
319 q2_tmp7 = vtrnq_s32 (vreinterpretq_s32_s16(q2_tmp2.val[1]), vreinterpretq_s32_s16(q2_tmp3.val[1])); \
320 d2_out0.val[0] = vget_low_s16 (vreinterpretq_s16_s32(q2_tmp4.val[0])); \
321 d2_out0.val[1] = vget_high_s16 (vreinterpretq_s16_s32(q2_tmp4.val[0])); \
322 d2_out1.val[0] = vget_low_s16 (vreinterpretq_s16_s32(q2_tmp6.val[0])); \
323 d2_out1.val[1] = vget_high_s16 (vreinterpretq_s16_s32(q2_tmp6.val[0])); \
324 d2_out2.val[0] = vget_low_s16 (vreinterpretq_s16_s32(q2_tmp5.val[0])); \
325 d2_out2.val[1] = vget_high_s16 (vreinterpretq_s16_s32(q2_tmp5.val[0])); \
326 d2_out3.val[0] = vget_low_s16 (vreinterpretq_s16_s32(q2_tmp7.val[0])); \
327 d2_out3.val[1] = vget_high_s16 (vreinterpretq_s16_s32(q2_tmp7.val[0])); \
328 d2_out4.val[0] = vget_low_s16 (vreinterpretq_s16_s32(q2_tmp4.val[1])); \
329 d2_out4.val[1] = vget_high_s16 (vreinterpretq_s16_s32(q2_tmp4.val[1])); \
330 d2_out5.val[0] = vget_low_s16 (vreinterpretq_s16_s32(q2_tmp6.val[1])); \
331 d2_out5.val[1] = vget_high_s16 (vreinterpretq_s16_s32(q2_tmp6.val[1])); \
332 d2_out6.val[0] = vget_low_s16 (vreinterpretq_s16_s32(q2_tmp5.val[1])); \
333 d2_out6.val[1] = vget_high_s16 (vreinterpretq_s16_s32(q2_tmp5.val[1])); \
334 d2_out7.val[0] = vget_low_s16 (vreinterpretq_s16_s32(q2_tmp7.val[1])); \
335 d2_out7.val[1] = vget_high_s16 (vreinterpretq_s16_s32(q2_tmp7.val[1])); \
336 vst2_s16 (p_dst, d2_out0); \
337 p_dst += 8; \
338 vst2_s16 (p_dst, d2_out1); \
339 p_dst += 8; \
340 vst2_s16 (p_dst, d2_out2); \
341 p_dst += 8; \
342 vst2_s16 (p_dst, d2_out3); \
343 p_dst += 8; \
344 vst2_s16 (p_dst, d2_out4); \
345 p_dst += 8; \
346 vst2_s16 (p_dst, d2_out5); \
347 p_dst += 8; \
348 vst2_s16 (p_dst, d2_out6); \
349 p_dst += 8; \
350 vst2_s16 (p_dst, d2_out7); \
351 p_dst += 8; \
352 p_src = p_src - src_step * 8 + 8;
353
354#define RADIX8x4_FS_S0 \
355 d_sin0_r = vadd_s16 (d2_in0.val[0], d2_in1.val[0]); \
356 d_sin0_i = vadd_s16 (d2_in0.val[1], d2_in1.val[1]); \
357 d_sin1_r = vsub_s16 (d2_in0.val[0], d2_in1.val[0]); \
358 d_sin1_i = vsub_s16 (d2_in0.val[1], d2_in1.val[1]); \
359 d_sin2_r = vadd_s16 (d2_in2.val[0], d2_in3.val[0]); \
360 d_sin2_i = vadd_s16 (d2_in2.val[1], d2_in3.val[1]); \
361 d_sin3_r = vsub_s16 (d2_in2.val[0], d2_in3.val[0]); \
362 d_sin3_i = vsub_s16 (d2_in2.val[1], d2_in3.val[1]); \
363 d_sin4_r = vadd_s16 (d2_in4.val[0], d2_in5.val[0]); \
364 d_sin4_i = vadd_s16 (d2_in4.val[1], d2_in5.val[1]); \
365 d_sin5_r = vsub_s16 (d2_in4.val[0], d2_in5.val[0]); \
366 d_sin5_i = vsub_s16 (d2_in4.val[1], d2_in5.val[1]); \
367 d_sin6_r = vadd_s16 (d2_in6.val[0], d2_in7.val[0]); \
368 d_sin6_i = vadd_s16 (d2_in6.val[1], d2_in7.val[1]); \
369 d_sin7_r = vsub_s16 (d2_in6.val[0], d2_in7.val[0]); \
370 d_sin7_i = vsub_s16 (d2_in6.val[1], d2_in7.val[1]);
371
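/*
 * FWD_S357/INV_S357 apply the fixed 8-point twiddles to the odd branches:
 * s5 is multiplied by -j (forward) or +j (inverse), while s3 and s7 are
 * multiplied by the remaining unit-magnitude twiddles using
 * TW_81 = 1/sqrt(2) in Q15 and vqdmulh_s16, which computes (a*b)>>15 with
 * saturation.
 */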
372#define RADIX8x4_FWD_S357 \
373 d_tw_81 = vdup_n_s16 (TW_81); \
374 d_tw_81n = vdup_n_s16 (TW_81N); \
375 d_s5_r = d_sin5_i; \
376 d_s5_i = vneg_s16 (d_sin5_r); \
377 d_s3_r = vadd_s16 (d_sin3_r, d_sin3_i); \
378 d_s3_i = vsub_s16 (d_sin3_i, d_sin3_r); \
379 d_s7_r = vsub_s16 (d_sin7_r, d_sin7_i); \
380 d_s7_i = vadd_s16 (d_sin7_i, d_sin7_r); \
381 d_s3_r = vqdmulh_s16 (d_s3_r, d_tw_81); \
382 d_s3_i = vqdmulh_s16 (d_s3_i, d_tw_81); \
383 d_s7_r = vqdmulh_s16 (d_s7_r, d_tw_81n); \
384 d_s7_i = vqdmulh_s16 (d_s7_i, d_tw_81n);
385
386#define RADIX8x4_INV_S357 \
387 d_tw_81 = vdup_n_s16 (TW_81); \
388 d_tw_81n = vdup_n_s16 (TW_81N); \
389 d_s5_r = vneg_s16 (d_sin5_i); \
390 d_s5_i = d_sin5_r; \
391 d_s3_r = vsub_s16 (d_sin3_r, d_sin3_i); \
392 d_s3_i = vadd_s16 (d_sin3_i, d_sin3_r); \
393 d_s7_r = vadd_s16 (d_sin7_r, d_sin7_i); \
394 d_s7_i = vsub_s16 (d_sin7_i, d_sin7_r); \
395 d_s3_r = vqdmulh_s16 (d_s3_r, d_tw_81); \
396 d_s3_i = vqdmulh_s16 (d_s3_i, d_tw_81); \
397 d_s7_r = vqdmulh_s16 (d_s7_r, d_tw_81n); \
398 d_s7_i = vqdmulh_s16 (d_s7_i, d_tw_81n);
399
400#define RADIX8x4_LS_02 \
401 d_s8_r = vadd_s16 (d_sin0_r, d_sin4_r); \
402 d_s8_i = vadd_s16 (d_sin0_i, d_sin4_i); \
403 d_s9_r = vadd_s16 (d_sin1_r, d_s5_r); \
404 d_s9_i = vadd_s16 (d_sin1_i, d_s5_i); \
405 d_s10_r = vsub_s16 (d_sin0_r, d_sin4_r); \
406 d_s10_i = vsub_s16 (d_sin0_i, d_sin4_i); \
407 d_s11_r = vsub_s16 (d_sin1_r, d_s5_r); \
408 d_s11_i = vsub_s16 (d_sin1_i, d_s5_i); \
409 d_s12_r = vadd_s16 (d_sin2_r, d_sin6_r); \
410 d_s12_i = vadd_s16 (d_sin2_i, d_sin6_i); \
411 d_s13_r = vadd_s16 (d_s3_r, d_s7_r); \
412 d_s13_i = vadd_s16 (d_s3_i, d_s7_i); \
413 d_s14_r = vsub_s16 (d_sin2_r, d_sin6_r); \
414 d_s14_i = vsub_s16 (d_sin2_i, d_sin6_i); \
415 d_s15_r = vsub_s16 (d_s3_r, d_s7_r); \
416 d_s15_i = vsub_s16 (d_s3_i, d_s7_i); \
417 d_out4_r = vsub_s16 (d_s8_r, d_s12_r); \
418 d_out4_i = vsub_s16 (d_s8_i, d_s12_i); \
419 d_out5_r = vsub_s16 (d_s9_r, d_s13_r); \
420 d_out5_i = vsub_s16 (d_s9_i, d_s13_i); \
421 d_out0_r = vadd_s16 (d_s8_r, d_s12_r); \
422 d_out0_i = vadd_s16 (d_s8_i, d_s12_i); \
423 d_out1_r = vadd_s16 (d_s9_r, d_s13_r); \
424 d_out1_i = vadd_s16 (d_s9_i, d_s13_i);
425
426#define RADIX8x4_FS_S0_SCALED \
427 d_sin0_r = vhadd_s16 (d2_in0.val[0], d2_in1.val[0]); \
428 d_sin0_i = vhadd_s16 (d2_in0.val[1], d2_in1.val[1]); \
429 d_sin1_r = vhsub_s16 (d2_in0.val[0], d2_in1.val[0]); \
430 d_sin1_i = vhsub_s16 (d2_in0.val[1], d2_in1.val[1]); \
431 d_sin2_r = vhadd_s16 (d2_in2.val[0], d2_in3.val[0]); \
432 d_sin2_i = vhadd_s16 (d2_in2.val[1], d2_in3.val[1]); \
433 d_sin3_r = vhsub_s16 (d2_in2.val[0], d2_in3.val[0]); \
434 d_sin3_i = vhsub_s16 (d2_in2.val[1], d2_in3.val[1]); \
435 d_sin4_r = vhadd_s16 (d2_in4.val[0], d2_in5.val[0]); \
436 d_sin4_i = vhadd_s16 (d2_in4.val[1], d2_in5.val[1]); \
437 d_sin5_r = vhsub_s16 (d2_in4.val[0], d2_in5.val[0]); \
438 d_sin5_i = vhsub_s16 (d2_in4.val[1], d2_in5.val[1]); \
439 d_sin6_r = vhadd_s16 (d2_in6.val[0], d2_in7.val[0]); \
440 d_sin6_i = vhadd_s16 (d2_in6.val[1], d2_in7.val[1]); \
441 d_sin7_r = vhsub_s16 (d2_in6.val[0], d2_in7.val[0]); \
442 d_sin7_i = vhsub_s16 (d2_in6.val[1], d2_in7.val[1]);
443
444#define RADIX8x4_LS_02_SCALED \
445 d_s8_r = vhadd_s16 (d_sin0_r, d_sin4_r); \
446 d_s8_i = vhadd_s16 (d_sin0_i, d_sin4_i); \
447 d_s9_r = vhadd_s16 (d_sin1_r, d_s5_r); \
448 d_s9_i = vhadd_s16 (d_sin1_i, d_s5_i); \
449 d_s10_r = vhsub_s16 (d_sin0_r, d_sin4_r); \
450 d_s10_i = vhsub_s16 (d_sin0_i, d_sin4_i); \
451 d_s11_r = vhsub_s16 (d_sin1_r, d_s5_r); \
452 d_s11_i = vhsub_s16 (d_sin1_i, d_s5_i); \
453 d_s12_r = vhadd_s16 (d_sin2_r, d_sin6_r); \
454 d_s12_i = vhadd_s16 (d_sin2_i, d_sin6_i); \
455 d_s13_r = vhadd_s16 (d_s3_r, d_s7_r); \
456 d_s13_i = vhadd_s16 (d_s3_i, d_s7_i); \
457 d_s14_r = vhsub_s16 (d_sin2_r, d_sin6_r); \
458 d_s14_i = vhsub_s16 (d_sin2_i, d_sin6_i); \
459 d_s15_r = vhsub_s16 (d_s3_r, d_s7_r); \
460 d_s15_i = vhsub_s16 (d_s3_i, d_s7_i); \
461 d_out4_r = vhsub_s16 (d_s8_r, d_s12_r); \
462 d_out4_i = vhsub_s16 (d_s8_i, d_s12_i); \
463 d_out5_r = vhsub_s16 (d_s9_r, d_s13_r); \
464 d_out5_i = vhsub_s16 (d_s9_i, d_s13_i); \
465 d_out0_r = vhadd_s16 (d_s8_r, d_s12_r); \
466 d_out0_i = vhadd_s16 (d_s8_i, d_s12_i); \
467 d_out1_r = vhadd_s16 (d_s9_r, d_s13_r); \
468 d_out1_i = vhadd_s16 (d_s9_i, d_s13_i);
469
470
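/*
 * First-stage radix-8 butterflies over the whole input: each iteration reads
 * eight elements spaced 'stride' complex samples apart for four consecutive
 * columns, and writes the eight outputs of each butterfly consecutively in
 * Fout.
 */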
static inline void ne10_radix8x4_forward_unscaled_neon (ne10_fft_cpx_int16_t * Fout,
                                                        ne10_fft_cpx_int16_t * Fin,
                                                        ne10_int32_t stride)
{
475 RADIX8x4_START
476
477 for (f_count = 0; f_count < stride; f_count += 4)
478 {
479 RADIX8x4_LOAD
480 RADIX8x4_FS_S0
481
482
483 // radix 4 butterfly without twiddles
484 RADIX8x4_FWD_S357
485 RADIX8x4_LS_02
486
487 d_out2_r = vadd_s16 (d_s10_r, d_s14_i);
488 d_out2_i = vsub_s16 (d_s10_i, d_s14_r);
489 d_out3_r = vadd_s16 (d_s11_r, d_s15_i);
490 d_out3_i = vsub_s16 (d_s11_i, d_s15_r);
491 d_out6_r = vsub_s16 (d_s10_r, d_s14_i);
492 d_out6_i = vadd_s16 (d_s10_i, d_s14_r);
493 d_out7_r = vsub_s16 (d_s11_r, d_s15_i);
494 d_out7_i = vadd_s16 (d_s11_i, d_s15_r);
495
496 RADIX8x4_STORE
497 } // f_count
498}
499
static inline void ne10_radix8x4_backward_unscaled_neon (ne10_fft_cpx_int16_t * Fout,
                                                         ne10_fft_cpx_int16_t * Fin,
                                                         ne10_int32_t stride)
{
504 RADIX8x4_START
505
506 for (f_count = 0; f_count < stride; f_count += 4)
507 {
508 RADIX8x4_LOAD
509 RADIX8x4_FS_S0
510
511 // radix 4 butterfly without twiddles
512 RADIX8x4_INV_S357
513 RADIX8x4_LS_02
514
515 d_out2_r = vsub_s16 (d_s10_r, d_s14_i);
516 d_out2_i = vadd_s16 (d_s10_i, d_s14_r);
517 d_out3_r = vsub_s16 (d_s11_r, d_s15_i);
518 d_out3_i = vadd_s16 (d_s11_i, d_s15_r);
519 d_out6_r = vadd_s16 (d_s10_r, d_s14_i);
520 d_out6_i = vsub_s16 (d_s10_i, d_s14_r);
521 d_out7_r = vadd_s16 (d_s11_r, d_s15_i);
522 d_out7_i = vsub_s16 (d_s11_i, d_s15_r);
523
524 RADIX8x4_STORE
525 } // f_count
526}
static inline void ne10_radix8x4_forward_scaled_neon (ne10_fft_cpx_int16_t * Fout,
                                                      ne10_fft_cpx_int16_t * Fin,
                                                      ne10_int32_t stride)
{
531 RADIX8x4_START
532
533 for (f_count = 0; f_count < stride; f_count += 4)
534 {
535 RADIX8x4_LOAD
536 RADIX8x4_FS_S0_SCALED
537
538 // radix 4 butterfly without twiddles
539 RADIX8x4_FWD_S357
540 RADIX8x4_LS_02_SCALED
541
542 d_out2_r = vhadd_s16 (d_s10_r, d_s14_i);
543 d_out2_i = vhsub_s16 (d_s10_i, d_s14_r);
544 d_out3_r = vhadd_s16 (d_s11_r, d_s15_i);
545 d_out3_i = vhsub_s16 (d_s11_i, d_s15_r);
546 d_out6_r = vhsub_s16 (d_s10_r, d_s14_i);
547 d_out6_i = vhadd_s16 (d_s10_i, d_s14_r);
548 d_out7_r = vhsub_s16 (d_s11_r, d_s15_i);
549 d_out7_i = vhadd_s16 (d_s11_i, d_s15_r);
550
551 RADIX8x4_STORE
552 } // f_count
553}
554
static inline void ne10_radix8x4_backward_scaled_neon (ne10_fft_cpx_int16_t * Fout,
                                                       ne10_fft_cpx_int16_t * Fin,
                                                       ne10_int32_t stride)
{
559 RADIX8x4_START
560
561 for (f_count = 0; f_count < stride; f_count += 4)
562 {
563 RADIX8x4_LOAD
564 RADIX8x4_FS_S0_SCALED
565
566 // radix 4 butterfly without twiddles
567 RADIX8x4_INV_S357
568 RADIX8x4_LS_02_SCALED
569
570 d_out2_r = vhsub_s16 (d_s10_r, d_s14_i);
571 d_out2_i = vhadd_s16 (d_s10_i, d_s14_r);
572 d_out3_r = vhsub_s16 (d_s11_r, d_s15_i);
573 d_out3_i = vhadd_s16 (d_s11_i, d_s15_r);
574 d_out6_r = vhadd_s16 (d_s10_r, d_s14_i);
575 d_out6_i = vhsub_s16 (d_s10_i, d_s14_r);
576 d_out7_r = vhadd_s16 (d_s11_r, d_s15_i);
577 d_out7_i = vhsub_s16 (d_s11_i, d_s15_r);
578
579 RADIX8x4_STORE
580 } // f_count
581}
582
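/*
 * First-stage radix-4 kernel without twiddle factors, used when the
 * first-stage radix is 4 rather than 8 (see the mixed-radix drivers below).
 * Layout and scaling conventions match the radix-8 kernel above.
 */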
583#define RADIX4x4_WITHOUT_TW_START \
584 ne10_int32_t f_count; \
585 ne10_int32_t src_step = stride << 1; \
586 int16_t *p_src, *p_dst; \
587 int16x4x2_t d2_in0, d2_in1, d2_in2, d2_in3; \
588 int16x4_t d_s0_r, d_s0_i, d_s1_r, d_s1_i, d_s2_r, d_s2_i, d_s3_r, d_s3_i; \
589 int16x4_t d_out0_r, d_out0_i, d_out1_r, d_out1_i, d_out2_r, d_out2_i, d_out3_r, d_out3_i; \
590 int16x4x2_t d2_out0, d2_out1, d2_out2, d2_out3; \
591 int16x8x2_t q2_tmp0, q2_tmp1; \
592 int32x4x2_t q2_tmp2, q2_tmp3; \
593 p_src = (int16_t *) Fin; \
594 p_dst = (int16_t *) Fout;
595
596#define RADIX4x4_WITHOUT_TW_LOAD \
597 d2_in0 = vld2_s16 (p_src); \
598 p_src += src_step; \
599 d2_in1 = vld2_s16 (p_src); \
600 p_src += src_step; \
601 d2_in2 = vld2_s16 (p_src); \
602 p_src += src_step; \
603 d2_in3 = vld2_s16 (p_src); \
604 p_src += src_step;
605
606#define RADIX4x4_WITHOUT_TW_STORE \
607 q2_tmp0 = vtrnq_s16 (vcombine_s16(d_out0_r, d_out0_i), vcombine_s16(d_out1_r, d_out1_i)); \
608 q2_tmp1 = vtrnq_s16 (vcombine_s16(d_out2_r, d_out2_i), vcombine_s16(d_out3_r, d_out3_i)); \
609 q2_tmp2 = vtrnq_s32 (vreinterpretq_s32_s16(q2_tmp0.val[0]), vreinterpretq_s32_s16(q2_tmp1.val[0])); \
610 q2_tmp3 = vtrnq_s32 (vreinterpretq_s32_s16(q2_tmp0.val[1]), vreinterpretq_s32_s16(q2_tmp1.val[1])); \
611 d2_out0.val[0] = vget_low_s16 (vreinterpretq_s16_s32(q2_tmp2.val[0])); \
612 d2_out0.val[1] = vget_high_s16 (vreinterpretq_s16_s32(q2_tmp2.val[0])); \
613 d2_out1.val[0] = vget_low_s16 (vreinterpretq_s16_s32(q2_tmp3.val[0])); \
614 d2_out1.val[1] = vget_high_s16 (vreinterpretq_s16_s32(q2_tmp3.val[0])); \
615 d2_out2.val[0] = vget_low_s16 (vreinterpretq_s16_s32(q2_tmp2.val[1])); \
616 d2_out2.val[1] = vget_high_s16 (vreinterpretq_s16_s32(q2_tmp2.val[1])); \
617 d2_out3.val[0] = vget_low_s16 (vreinterpretq_s16_s32(q2_tmp3.val[1])); \
618 d2_out3.val[1] = vget_high_s16 (vreinterpretq_s16_s32(q2_tmp3.val[1])); \
619 vst2_s16 (p_dst, d2_out0); \
620 p_dst += 8; \
621 vst2_s16 (p_dst, d2_out1); \
622 p_dst += 8; \
623 vst2_s16 (p_dst, d2_out2); \
624 p_dst += 8; \
625 vst2_s16 (p_dst, d2_out3); \
626 p_dst += 8; \
627 p_src = p_src - src_step * 4 + 8;
628
629#define RADIX4x4_WITHOUT_TW_S0 \
630 d_s0_r = vadd_s16 (d2_in0.val[0], d2_in2.val[0]); \
631 d_s0_i = vadd_s16 (d2_in0.val[1], d2_in2.val[1]); \
632 d_s1_r = vsub_s16 (d2_in0.val[0], d2_in2.val[0]); \
633 d_s1_i = vsub_s16 (d2_in0.val[1], d2_in2.val[1]); \
634 d_s2_r = vadd_s16 (d2_in1.val[0], d2_in3.val[0]); \
635 d_s2_i = vadd_s16 (d2_in1.val[1], d2_in3.val[1]); \
636 d_s3_r = vsub_s16 (d2_in1.val[0], d2_in3.val[0]); \
637 d_s3_i = vsub_s16 (d2_in1.val[1], d2_in3.val[1]); \
638 d_out2_r = vsub_s16 (d_s0_r, d_s2_r); \
639 d_out2_i = vsub_s16 (d_s0_i, d_s2_i); \
640 d_out0_r = vadd_s16 (d_s0_r, d_s2_r); \
641 d_out0_i = vadd_s16 (d_s0_i, d_s2_i);
642
643#define RADIX4x4_WITHOUT_TW_S0_SCALED \
644 d_s0_r = vhadd_s16 (d2_in0.val[0], d2_in2.val[0]); \
645 d_s0_i = vhadd_s16 (d2_in0.val[1], d2_in2.val[1]); \
646 d_s1_r = vhsub_s16 (d2_in0.val[0], d2_in2.val[0]); \
647 d_s1_i = vhsub_s16 (d2_in0.val[1], d2_in2.val[1]); \
648 d_s2_r = vhadd_s16 (d2_in1.val[0], d2_in3.val[0]); \
649 d_s2_i = vhadd_s16 (d2_in1.val[1], d2_in3.val[1]); \
650 d_s3_r = vhsub_s16 (d2_in1.val[0], d2_in3.val[0]); \
651 d_s3_i = vhsub_s16 (d2_in1.val[1], d2_in3.val[1]); \
652 d_out2_r = vhsub_s16 (d_s0_r, d_s2_r); \
653 d_out2_i = vhsub_s16 (d_s0_i, d_s2_i); \
654 d_out0_r = vhadd_s16 (d_s0_r, d_s2_r); \
655 d_out0_i = vhadd_s16 (d_s0_i, d_s2_i);
656
657
static inline void ne10_radix4x4_without_twiddles_forward_unscaled_neon (ne10_fft_cpx_int16_t * Fout,
                                                                         ne10_fft_cpx_int16_t * Fin,
                                                                         ne10_int32_t stride)
{
662 RADIX4x4_WITHOUT_TW_START
663
664 for (f_count = 0; f_count < stride; f_count += 4)
665 {
666 // load
667 RADIX4x4_WITHOUT_TW_LOAD
668
669 // radix 4 butterfly without twiddles
670 RADIX4x4_WITHOUT_TW_S0
671
672 d_out1_r = vadd_s16 (d_s1_r, d_s3_i);
673 d_out1_i = vsub_s16 (d_s1_i, d_s3_r);
674 d_out3_r = vsub_s16 (d_s1_r, d_s3_i);
675 d_out3_i = vadd_s16 (d_s1_i, d_s3_r);
676
677 RADIX4x4_WITHOUT_TW_STORE
678 }
679}
680
static inline void ne10_radix4x4_without_twiddles_backward_unscaled_neon (ne10_fft_cpx_int16_t * Fout,
                                                                          ne10_fft_cpx_int16_t * Fin,
                                                                          ne10_int32_t stride)
{
685 RADIX4x4_WITHOUT_TW_START
686
687 for (f_count = 0; f_count < stride; f_count += 4)
688 {
689 // load
690 RADIX4x4_WITHOUT_TW_LOAD
691
692 // radix 4 butterfly without twiddles
693 RADIX4x4_WITHOUT_TW_S0
694
695 d_out1_r = vsub_s16 (d_s1_r, d_s3_i);
696 d_out1_i = vadd_s16 (d_s1_i, d_s3_r);
697 d_out3_r = vadd_s16 (d_s1_r, d_s3_i);
698 d_out3_i = vsub_s16 (d_s1_i, d_s3_r);
699
700 RADIX4x4_WITHOUT_TW_STORE
701 }
702}
703
static inline void ne10_radix4x4_without_twiddles_forward_scaled_neon (ne10_fft_cpx_int16_t * Fout,
                                                                       ne10_fft_cpx_int16_t * Fin,
                                                                       ne10_int32_t stride)
{
708 RADIX4x4_WITHOUT_TW_START
709
710 for (f_count = 0; f_count < stride; f_count += 4)
711 {
712 // load
713 RADIX4x4_WITHOUT_TW_LOAD
714
715 // radix 4 butterfly without twiddles
716 RADIX4x4_WITHOUT_TW_S0_SCALED
717
718 d_out1_r = vhadd_s16 (d_s1_r, d_s3_i);
719 d_out1_i = vhsub_s16 (d_s1_i, d_s3_r);
720 d_out3_r = vhsub_s16 (d_s1_r, d_s3_i);
721 d_out3_i = vhadd_s16 (d_s1_i, d_s3_r);
722
723 RADIX4x4_WITHOUT_TW_STORE
724 }
725}
726
static inline void ne10_radix4x4_without_twiddles_backward_scaled_neon (ne10_fft_cpx_int16_t * Fout,
                                                                        ne10_fft_cpx_int16_t * Fin,
                                                                        ne10_int32_t stride)
{
731 RADIX4x4_WITHOUT_TW_START
732
733 for (f_count = 0; f_count < stride; f_count += 4)
734 {
735 // load
736 RADIX4x4_WITHOUT_TW_LOAD
737
738 // radix 4 butterfly without twiddles
739 RADIX4x4_WITHOUT_TW_S0_SCALED
740
741 d_out1_r = vhsub_s16 (d_s1_r, d_s3_i);
742 d_out1_i = vhadd_s16 (d_s1_i, d_s3_r);
743 d_out3_r = vhadd_s16 (d_s1_r, d_s3_i);
744 d_out3_i = vhsub_s16 (d_s1_i, d_s3_r);
745
746 RADIX4x4_WITHOUT_TW_STORE
747 }
748}
749
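/*
 * General radix-4 stage with twiddle factors. The load macro multiplies
 * inputs 1..3 by their twiddles using vqdmulh_s16 (Q15 multiply);
 * S1_FWD/S1_INV then combine the partial products into the full complex
 * products for the forward or inverse (conjugated twiddle) transform.
 */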
750#define RADIX4x4_WITH_TW_START \
751 ne10_int32_t m_count; \
752 ne10_int32_t src_step = src_stride << 1; \
753 ne10_int32_t dst_step = dst_stride << 1; \
754 ne10_int32_t tw_step = mstride << 1; \
755 int16_t *p_src, *p_dst, *p_tw; \
756 int16x4x2_t d2_in0, d2_in1, d2_in2, d2_in3; \
757 int16x4x2_t d2_tw0, d2_tw1, d2_tw2; \
758 int16x4_t d_s1_r, d_s1_i, d_s2_r, d_s2_i, d_s3_r, d_s3_i; \
759 int16x4_t d_tmp0, d_tmp1, d_tmp2, d_tmp3, d_tmp4, d_tmp5; \
760 int16x4_t d_s4_r, d_s4_i, d_s5_r, d_s5_i, d_s6_r, d_s6_i, d_s7_r, d_s7_i; \
761 int16x4x2_t d2_out0, d2_out1, d2_out2, d2_out3; \
762 p_src = (int16_t *) Fin; \
763 p_dst = (int16_t *) Fout; \
764 p_tw = (int16_t *) tw;
765
766#define RADIX4x4_WITH_TW_LOAD \
767 d2_in0 = vld2_s16 (p_src); \
768 p_src += src_step; \
769 d2_in1 = vld2_s16 (p_src); \
770 p_src += src_step; \
771 d2_in2 = vld2_s16 (p_src); \
772 p_src += src_step; \
773 d2_in3 = vld2_s16 (p_src); \
774 p_src += src_step; \
775 d2_tw0 = vld2_s16 (p_tw); \
776 p_tw += tw_step; \
777 d2_tw1 = vld2_s16 (p_tw); \
778 p_tw += tw_step; \
779 d2_tw2 = vld2_s16 (p_tw); \
780 d_s1_r = vqdmulh_s16 (d2_in1.val[0], d2_tw0.val[0]); \
781 d_s1_i = vqdmulh_s16 (d2_in1.val[1], d2_tw0.val[0]); \
782 d_s2_r = vqdmulh_s16 (d2_in2.val[0], d2_tw1.val[0]); \
783 d_s2_i = vqdmulh_s16 (d2_in2.val[1], d2_tw1.val[0]); \
784 d_s3_r = vqdmulh_s16 (d2_in3.val[0], d2_tw2.val[0]); \
785 d_s3_i = vqdmulh_s16 (d2_in3.val[1], d2_tw2.val[0]); \
786 d_tmp0 = vqdmulh_s16 (d2_in1.val[1], d2_tw0.val[1]); \
787 d_tmp1 = vqdmulh_s16 (d2_in1.val[0], d2_tw0.val[1]); \
788 d_tmp2 = vqdmulh_s16 (d2_in2.val[1], d2_tw1.val[1]); \
789 d_tmp3 = vqdmulh_s16 (d2_in2.val[0], d2_tw1.val[1]); \
790 d_tmp4 = vqdmulh_s16 (d2_in3.val[1], d2_tw2.val[1]); \
791 d_tmp5 = vqdmulh_s16 (d2_in3.val[0], d2_tw2.val[1]);
792
793#define RADIX4x4_WITH_TW_STORE \
794 vst2_s16 (p_dst, d2_out0); \
795 p_dst += dst_step; \
796 vst2_s16 (p_dst, d2_out1); \
797 p_dst += dst_step; \
798 vst2_s16 (p_dst, d2_out2); \
799 p_dst += dst_step; \
800 vst2_s16 (p_dst, d2_out3); \
801 p_dst += dst_step; \
802 p_src = p_src - src_step * 4 + 8; \
803 p_dst = p_dst - dst_step * 4 + 8; \
804 p_tw = p_tw - tw_step * 2 + 8;
805
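/*
 * Complete the complex twiddle multiplies started in RADIX4x4_WITH_TW_LOAD:
 * the forward path forms (a.r*w.r - a.i*w.i, a.i*w.r + a.r*w.i); the inverse
 * path flips the signs, i.e. multiplies by the conjugate twiddle.
 */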
806#define RADIX4x4_WITH_TW_S1_FWD \
807 d_s1_r = vsub_s16 (d_s1_r, d_tmp0); \
808 d_s1_i = vadd_s16 (d_s1_i, d_tmp1); \
809 d_s2_r = vsub_s16 (d_s2_r, d_tmp2); \
810 d_s2_i = vadd_s16 (d_s2_i, d_tmp3); \
811 d_s3_r = vsub_s16 (d_s3_r, d_tmp4); \
812 d_s3_i = vadd_s16 (d_s3_i, d_tmp5);
813
814#define RADIX4x4_WITH_TW_S1_INV \
815 d_s1_r = vadd_s16 (d_s1_r, d_tmp0); \
816 d_s1_i = vsub_s16 (d_s1_i, d_tmp1); \
817 d_s2_r = vadd_s16 (d_s2_r, d_tmp2); \
818 d_s2_i = vsub_s16 (d_s2_i, d_tmp3); \
819 d_s3_r = vadd_s16 (d_s3_r, d_tmp4); \
820 d_s3_i = vsub_s16 (d_s3_i, d_tmp5);
821
822
823#define RADIX4x4_WITH_TW_LS_02 \
824 d_s4_r = vadd_s16 (d2_in0.val[0], d_s2_r); \
825 d_s4_i = vadd_s16 (d2_in0.val[1], d_s2_i); \
826 d_s5_r = vsub_s16 (d2_in0.val[0], d_s2_r); \
827 d_s5_i = vsub_s16 (d2_in0.val[1], d_s2_i); \
828 d_s6_r = vadd_s16 (d_s1_r, d_s3_r); \
829 d_s6_i = vadd_s16 (d_s1_i, d_s3_i); \
830 d_s7_r = vsub_s16 (d_s1_r, d_s3_r); \
831 d_s7_i = vsub_s16 (d_s1_i, d_s3_i); \
832 d2_out2.val[0] = vsub_s16 (d_s4_r, d_s6_r); \
833 d2_out2.val[1] = vsub_s16 (d_s4_i, d_s6_i); \
834 d2_out0.val[0] = vadd_s16 (d_s4_r, d_s6_r); \
835 d2_out0.val[1] = vadd_s16 (d_s4_i, d_s6_i);
836
837#define RADIX4x4_WITH_TW_LS_02_SCALED \
838 d_s4_r = vhadd_s16 (d2_in0.val[0], d_s2_r); \
839 d_s4_i = vhadd_s16 (d2_in0.val[1], d_s2_i); \
840 d_s5_r = vhsub_s16 (d2_in0.val[0], d_s2_r); \
841 d_s5_i = vhsub_s16 (d2_in0.val[1], d_s2_i); \
842 d_s6_r = vhadd_s16 (d_s1_r, d_s3_r); \
843 d_s6_i = vhadd_s16 (d_s1_i, d_s3_i); \
844 d_s7_r = vhsub_s16 (d_s1_r, d_s3_r); \
845 d_s7_i = vhsub_s16 (d_s1_i, d_s3_i); \
846 d2_out2.val[0] = vhsub_s16 (d_s4_r, d_s6_r); \
847 d2_out2.val[1] = vhsub_s16 (d_s4_i, d_s6_i); \
848 d2_out0.val[0] = vhadd_s16 (d_s4_r, d_s6_r); \
849 d2_out0.val[1] = vhadd_s16 (d_s4_i, d_s6_i);
850
851
static inline void ne10_radix4x4_with_twiddles_forward_unscaled_neon (ne10_fft_cpx_int16_t * Fout,
                                                                      ne10_fft_cpx_int16_t * Fin,
                                                                      ne10_fft_cpx_int16_t * tw,
                                                                      ne10_int32_t src_stride,
                                                                      ne10_int32_t dst_stride,
                                                                      ne10_int32_t mstride)
{
859 RADIX4x4_WITH_TW_START
860
861 for (m_count = 0; m_count < mstride; m_count += 4)
862 {
863 // load
864 RADIX4x4_WITH_TW_LOAD
865 RADIX4x4_WITH_TW_S1_FWD
866
867 RADIX4x4_WITH_TW_LS_02
868
869 d2_out1.val[0] = vadd_s16 (d_s5_r, d_s7_i);
870 d2_out1.val[1] = vsub_s16 (d_s5_i, d_s7_r);
871 d2_out3.val[0] = vsub_s16 (d_s5_r, d_s7_i);
872 d2_out3.val[1] = vadd_s16 (d_s5_i, d_s7_r);
873
874 // store
875 RADIX4x4_WITH_TW_STORE
876 }
877}
878
879
static inline void ne10_radix4x4_with_twiddles_backward_unscaled_neon (ne10_fft_cpx_int16_t * Fout,
                                                                       ne10_fft_cpx_int16_t * Fin,
                                                                       ne10_fft_cpx_int16_t * tw,
                                                                       ne10_int32_t src_stride,
                                                                       ne10_int32_t dst_stride,
                                                                       ne10_int32_t mstride)
{
887 RADIX4x4_WITH_TW_START
888
889 for (m_count = 0; m_count < mstride; m_count += 4)
890 {
891 // load
892 RADIX4x4_WITH_TW_LOAD
893 RADIX4x4_WITH_TW_S1_INV
894
895 RADIX4x4_WITH_TW_LS_02
896
897 d2_out1.val[0] = vsub_s16 (d_s5_r, d_s7_i);
898 d2_out1.val[1] = vadd_s16 (d_s5_i, d_s7_r);
899 d2_out3.val[0] = vadd_s16 (d_s5_r, d_s7_i);
900 d2_out3.val[1] = vsub_s16 (d_s5_i, d_s7_r);
901
902 // store
903 RADIX4x4_WITH_TW_STORE
904 }
905}
906
907
908
static inline void ne10_radix4x4_with_twiddles_forward_scaled_neon (ne10_fft_cpx_int16_t * Fout,
                                                                    ne10_fft_cpx_int16_t * Fin,
                                                                    ne10_fft_cpx_int16_t * tw,
                                                                    ne10_int32_t src_stride,
                                                                    ne10_int32_t dst_stride,
                                                                    ne10_int32_t mstride)
{
916 RADIX4x4_WITH_TW_START
917
918 for (m_count = 0; m_count < mstride; m_count += 4)
919 {
920 // load
921 RADIX4x4_WITH_TW_LOAD
922 RADIX4x4_WITH_TW_S1_FWD
923
924 RADIX4x4_WITH_TW_LS_02_SCALED
925
926 d2_out1.val[0] = vhadd_s16 (d_s5_r, d_s7_i);
927 d2_out1.val[1] = vhsub_s16 (d_s5_i, d_s7_r);
928 d2_out3.val[0] = vhsub_s16 (d_s5_r, d_s7_i);
929 d2_out3.val[1] = vhadd_s16 (d_s5_i, d_s7_r);
930
931 // store
932 RADIX4x4_WITH_TW_STORE
933 }
934}
935
static inline void ne10_radix4x4_with_twiddles_backward_scaled_neon (ne10_fft_cpx_int16_t * Fout,
                                                                     ne10_fft_cpx_int16_t * Fin,
                                                                     ne10_fft_cpx_int16_t * tw,
                                                                     ne10_int32_t src_stride,
                                                                     ne10_int32_t dst_stride,
                                                                     ne10_int32_t mstride)
{
943 RADIX4x4_WITH_TW_START
944
945 for (m_count = 0; m_count < mstride; m_count += 4)
946 {
947 // load
948 RADIX4x4_WITH_TW_LOAD
949 RADIX4x4_WITH_TW_S1_INV
950
951 RADIX4x4_WITH_TW_LS_02_SCALED
952
953 d2_out1.val[0] = vhsub_s16 (d_s5_r, d_s7_i);
954 d2_out1.val[1] = vhadd_s16 (d_s5_i, d_s7_r);
955 d2_out3.val[0] = vhadd_s16 (d_s5_r, d_s7_i);
956 d2_out3.val[1] = vhsub_s16 (d_s5_i, d_s7_r);
957
958 // store
959 RADIX4x4_WITH_TW_STORE
960 }
961}
962
963
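/*
 * Mixed-radix stage drivers, instantiated below for the scaled and unscaled
 * paths. factors[0] holds the stage count and factors[1] the first-stage
 * stride; the first stage uses the radix-8 kernel when a factor of 2 remains
 * (first radix == 2) and the radix-4 kernel otherwise. Later stages multiply
 * mstride by 4 and ping-pong the data between the input and output arrays,
 * with the last stage always writing to the caller's Fout.
 */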
964#define ne10_mixed_radix_fft_forward_int16_neon(scaled) \
965void ne10_mixed_radix_fft_forward_int16_##scaled##_neon (ne10_fft_cpx_int16_t * Fout, \
966 ne10_fft_cpx_int16_t * Fin, \
967 ne10_int32_t * factors, \
968 ne10_fft_cpx_int16_t * twiddles, \
969 ne10_fft_cpx_int16_t * buffer) \
970{ \
971 ne10_int32_t fstride, mstride, N; \
972 ne10_int32_t fstride1; \
973 ne10_int32_t f_count; \
974 ne10_int32_t stage_count; \
975 \
976 ne10_fft_cpx_int16_t *Fin1, *Fout1; \
977 ne10_fft_cpx_int16_t *Fout_ls = Fout; \
978 ne10_fft_cpx_int16_t *Ftmp; \
979 ne10_fft_cpx_int16_t *tw, *tw1; \
980 \
981 /* init fstride, mstride, N */ \
982 stage_count = factors[0]; \
983 fstride = factors[1]; \
984 mstride = factors[ (stage_count << 1) - 1 ]; \
985 N = factors[ stage_count << 1 ]; \
986 \
987 /* the first stage */ \
988 Fin1 = Fin; \
989 Fout1 = Fout; \
990 if (N == 2) \
991 { \
992 N = fstride >> 1;\
993 tw = twiddles; \
994 fstride1 = fstride >> 2; \
995 ne10_radix8x4_forward_##scaled##_neon (Fout, Fin, fstride1);\
996 \
997 tw += 6; \
998 mstride <<= 2; \
999 fstride >>= 4; \
1000 stage_count -= 2; \
1001 \
1002 Ftmp = Fin; \
1003 Fin = Fout; \
1004 Fout = Ftmp; \
1005 } \
1006 else if (N == 4) \
1007 { \
1008 ne10_radix4x4_without_twiddles_forward_##scaled##_neon (Fout, Fin, fstride); \
1009 N = fstride; \
1010 Ftmp = Fin; \
1011 Fin = Fout; \
1012 Fout = Ftmp; \
1013 /* update address for other stages*/ \
1014 stage_count--; \
1015 tw = twiddles; \
1016 fstride >>= 2; \
1017 } \
1018 /* others but the last one*/ \
1019 for (; stage_count > 1 ; stage_count--) \
1020 { \
1021 Fin1 = Fin; \
1022 for (f_count = 0; f_count < fstride; f_count ++) \
1023 { \
1024 Fout1 = & Fout[ f_count * mstride << 2 ]; \
1025 tw1 = tw; \
1026 ne10_radix4x4_with_twiddles_forward_##scaled##_neon (Fout1, Fin1, tw1, N, mstride, mstride); \
1027 Fin1 += mstride; \
1028 } \
1029 tw += mstride * 3; \
1030 mstride <<= 2; \
1031 Ftmp = Fin; \
1032 Fin = Fout; \
1033 Fout = Ftmp; \
1034 fstride >>= 2; \
1035 }\
1036 /* the last one*/ \
1037 if (stage_count) \
1038 { \
1039 Fin1 = Fin; \
1040 Fout1 = Fout_ls; \
1041 for (f_count = 0; f_count < fstride; f_count ++) \
1042 { \
1043 tw1 = tw; \
1044 ne10_radix4x4_with_twiddles_forward_##scaled##_neon (Fout1, Fin1, tw1, N, N, mstride); \
1045 Fin1 += mstride; \
1046 Fout1 += mstride; \
1047 } \
1048 } \
1049}
1050
1051#define ne10_mixed_radix_fft_backward_int16_neon(scaled) \
1052void ne10_mixed_radix_fft_backward_int16_##scaled##_neon (ne10_fft_cpx_int16_t * Fout, \
1053 ne10_fft_cpx_int16_t * Fin, \
1054 ne10_int32_t * factors, \
1055 ne10_fft_cpx_int16_t * twiddles, \
1056 ne10_fft_cpx_int16_t * buffer) \
1057{ \
1058 ne10_int32_t fstride, mstride, N; \
1059 ne10_int32_t fstride1; \
1060 ne10_int32_t f_count; \
1061 ne10_int32_t stage_count; \
1062 \
1063 ne10_fft_cpx_int16_t *Fin1, *Fout1; \
1064 ne10_fft_cpx_int16_t *Fout_ls = Fout; \
1065 ne10_fft_cpx_int16_t *Ftmp; \
1066 ne10_fft_cpx_int16_t *tw, *tw1; \
1067 \
1068 /* init fstride, mstride, N */ \
1069 stage_count = factors[0]; \
1070 fstride = factors[1]; \
1071 mstride = factors[ (stage_count << 1) - 1 ]; \
1072 N = factors[ stage_count << 1 ]; \
1073 \
1074 /* the first stage */ \
1075 Fin1 = Fin; \
1076 Fout1 = Fout; \
1077 if (N == 2) \
1078 { \
1079 N = fstride >> 1;\
1080 tw = twiddles; \
1081 fstride1 = fstride >> 2; \
1082 ne10_radix8x4_backward_##scaled##_neon (Fout, Fin, fstride1);\
1083 \
1084 tw += 6; \
1085 mstride <<= 2; \
1086 fstride >>= 4; \
1087 stage_count -= 2; \
1088 \
1089 Ftmp = Fin; \
1090 Fin = Fout; \
1091 Fout = Ftmp; \
1092 } \
1093 else if (N == 4) \
1094 { \
1095 ne10_radix4x4_without_twiddles_backward_##scaled##_neon (Fout, Fin, fstride); \
1096 N = fstride; \
1097 Ftmp = Fin; \
1098 Fin = Fout; \
1099 Fout = Ftmp; \
1100 /* update address for other stages*/ \
1101 stage_count--; \
1102 tw = twiddles; \
1103 fstride >>= 2; \
1104 } \
1105 /* others but the last one*/ \
1106 for (; stage_count > 1 ; stage_count--) \
1107 { \
1108 Fin1 = Fin; \
1109 for (f_count = 0; f_count < fstride; f_count ++) \
1110 { \
1111 Fout1 = & Fout[ f_count * mstride << 2 ]; \
1112 tw1 = tw; \
1113 ne10_radix4x4_with_twiddles_backward_##scaled##_neon (Fout1, Fin1, tw1, N, mstride, mstride); \
1114 Fin1 += mstride; \
1115 } \
1116 tw += mstride * 3; \
1117 mstride <<= 2; \
1118 Ftmp = Fin; \
1119 Fin = Fout; \
1120 Fout = Ftmp; \
1121 fstride >>= 2; \
1122 }\
1123 /* the last one*/ \
1124 if (stage_count) \
1125 { \
1126 Fin1 = Fin; \
1127 Fout1 = Fout_ls; \
1128 for (f_count = 0; f_count < fstride; f_count ++) \
1129 { \
1130 tw1 = tw; \
1131 ne10_radix4x4_with_twiddles_backward_##scaled##_neon (Fout1, Fin1, tw1, N, N, mstride); \
1132 Fin1 += mstride; \
1133 Fout1 += mstride; \
1134 } \
1135 } \
1136}
1137
1138
1139ne10_mixed_radix_fft_forward_int16_neon (unscaled)
1140ne10_mixed_radix_fft_forward_int16_neon (scaled)
1141ne10_mixed_radix_fft_backward_int16_neon (unscaled)
1142ne10_mixed_radix_fft_backward_int16_neon (scaled)
1143
1144
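/*
 * Split step for the real-to-complex FFT: recombines the ncfft-point complex
 * FFT of the packed (even/odd) real samples into the spectrum of the
 * 2*ncfft-point real transform, using the super twiddles. The NEON loops
 * process eight bins per iteration; the scalar path handles transforms too
 * small for the vector loop (ncfft/2 < 8).
 */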
1145static void ne10_fft_split_r2c_1d_int16_neon (ne10_fft_cpx_int16_t *dst,
1146 const ne10_fft_cpx_int16_t *src,
1147 ne10_fft_cpx_int16_t *twiddles,
1148 ne10_int32_t ncfft,
1149 ne10_int32_t scaled_flag)
1150{
1151 ne10_int32_t k;
1152 ne10_int32_t count = ncfft / 2;
1153 ne10_fft_cpx_int16_t fpnk, fpk, f1k, f2k, tw, tdc;
1154 int16x8x2_t q2_fpk, q2_fpnk, q2_tw, q2_dst, q2_dst2;
1155 int16x8_t q_fpnk_r, q_fpnk_i;
1156 int16x8_t q_f1k_r, q_f1k_i, q_f2k_r, q_f2k_i;
1157 int16x8_t q_tw_r, q_tw_i;
1158 int16x8_t q_tmp0, q_tmp1, q_tmp2, q_tmp3;
1159 int16x8_t q_dst2_r, q_dst2_i;
1160 int16_t *p_src, *p_src2, *p_dst, *p_dst2, *p_twiddles;
1161
1162 tdc.r = src[0].r;
1163 tdc.i = src[0].i;
1164
1165 if (scaled_flag)
1166 NE10_F2I16_FIXDIV (tdc, 2);
1167
1168 dst[0].r = tdc.r + tdc.i;
1169 dst[ncfft].r = tdc.r - tdc.i;
1170 dst[ncfft].i = dst[0].i = 0;
1171 if (count >= 8)
1172 {
1173
1174 if (scaled_flag)
1175 {
1176 for (k = 1; k <= count ; k += 8)
1177 {
1178 p_src = (int16_t*) (& (src[k]));
1179 p_src2 = (int16_t*) (& (src[ncfft - k - 7]));
1180 p_twiddles = (int16_t*) (& (twiddles[k - 1]));
1181 p_dst = (int16_t*) (& (dst[k]));
1182 p_dst2 = (int16_t*) (& (dst[ncfft - k - 7]));
1183
1184 q2_fpk = vld2q_s16 (p_src);
1185 q2_fpnk = vld2q_s16 (p_src2);
1186
1187 q2_tw = vld2q_s16 (p_twiddles);
1188 q2_fpnk.val[0] = vrev32q_s16 (q2_fpnk.val[0]);
1189 q2_fpnk.val[1] = vrev32q_s16 (q2_fpnk.val[1]);
1190 q2_fpnk.val[0] = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q2_fpnk.val[0])));
1191 q2_fpnk.val[1] = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q2_fpnk.val[1])));
1192 q_fpnk_r = vcombine_s16 (vget_high_s16 (q2_fpnk.val[0]), vget_low_s16 (q2_fpnk.val[0]));
1193 q_fpnk_i = vcombine_s16 (vget_high_s16 (q2_fpnk.val[1]), vget_low_s16 (q2_fpnk.val[1]));
1194 q_fpnk_i = vnegq_s16 (q_fpnk_i);
1195
1196 q_f1k_r = vhaddq_s16 (q2_fpk.val[0], q_fpnk_r);
1197 q_f1k_i = vhaddq_s16 (q2_fpk.val[1], q_fpnk_i);
1198
1199 q_f2k_r = vhsubq_s16 (q2_fpk.val[0], q_fpnk_r);
1200 q_f2k_i = vhsubq_s16 (q2_fpk.val[1], q_fpnk_i);
1201
1202 q_tmp0 = vqdmulhq_s16 (q_f2k_r, q2_tw.val[0]);
1203 q_tmp1 = vqdmulhq_s16 (q_f2k_i, q2_tw.val[1]);
1204 q_tmp2 = vqdmulhq_s16 (q_f2k_r, q2_tw.val[1]);
1205 q_tmp3 = vqdmulhq_s16 (q_f2k_i, q2_tw.val[0]);
1206 q_tw_r = vsubq_s16 (q_tmp0, q_tmp1);
1207 q_tw_i = vaddq_s16 (q_tmp2, q_tmp3);
1208
1209 q_dst2_r = vhsubq_s16 (q_f1k_r, q_tw_r);
1210 q_dst2_i = vhsubq_s16 (q_tw_i, q_f1k_i);
1211 q2_dst.val[0] = vhaddq_s16 (q_f1k_r, q_tw_r);
1212 q2_dst.val[1] = vhaddq_s16 (q_f1k_i, q_tw_i);
1213 q_dst2_r = vrev32q_s16 (q_dst2_r);
1214 q_dst2_i = vrev32q_s16 (q_dst2_i);
1215 q_dst2_r = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q_dst2_r))) ;
1216 q_dst2_i = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q_dst2_i)));
1217 q2_dst2.val[0] = vcombine_s16 (vget_high_s16 (q_dst2_r), vget_low_s16 (q_dst2_r));
1218 q2_dst2.val[1] = vcombine_s16 (vget_high_s16 (q_dst2_i), vget_low_s16 (q_dst2_i));
1219 vst2q_s16 (p_dst, q2_dst);
1220 vst2q_s16 (p_dst2, q2_dst2);
1221
1222 }
1223 }
1224 else
1225 {
1226 for (k = 1; k <= count ; k += 8)
1227 {
1228 p_src = (int16_t*) (& (src[k]));
1229 p_src2 = (int16_t*) (& (src[ncfft - k - 7]));
1230 p_twiddles = (int16_t*) (& (twiddles[k - 1]));
1231 p_dst = (int16_t*) (& (dst[k]));
1232 p_dst2 = (int16_t*) (& (dst[ncfft - k - 7]));
1233
1234 q2_fpk = vld2q_s16 (p_src);
1235 q2_fpnk = vld2q_s16 (p_src2);
1236
1237 q2_tw = vld2q_s16 (p_twiddles);
1238 q2_fpnk.val[0] = vrev32q_s16 (q2_fpnk.val[0]);
1239 q2_fpnk.val[1] = vrev32q_s16 (q2_fpnk.val[1]);
1240 q2_fpnk.val[0] = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q2_fpnk.val[0])));
1241 q2_fpnk.val[1] = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q2_fpnk.val[1])));
1242 q_fpnk_r = vcombine_s16 (vget_high_s16 (q2_fpnk.val[0]), vget_low_s16 (q2_fpnk.val[0]));
1243 q_fpnk_i = vcombine_s16 (vget_high_s16 (q2_fpnk.val[1]), vget_low_s16 (q2_fpnk.val[1]));
1244 q_fpnk_i = vnegq_s16 (q_fpnk_i);
1245
1246 q_f1k_r = vaddq_s16 (q2_fpk.val[0], q_fpnk_r);
1247 q_f1k_i = vaddq_s16 (q2_fpk.val[1], q_fpnk_i);
1248
1249 q_f2k_r = vsubq_s16 (q2_fpk.val[0], q_fpnk_r);
1250 q_f2k_i = vsubq_s16 (q2_fpk.val[1], q_fpnk_i);
1251
1252 q_tmp0 = vqdmulhq_s16 (q_f2k_r, q2_tw.val[0]);
1253 q_tmp1 = vqdmulhq_s16 (q_f2k_i, q2_tw.val[1]);
1254 q_tmp2 = vqdmulhq_s16 (q_f2k_r, q2_tw.val[1]);
1255 q_tmp3 = vqdmulhq_s16 (q_f2k_i, q2_tw.val[0]);
1256 q_tw_r = vsubq_s16 (q_tmp0, q_tmp1);
1257 q_tw_i = vaddq_s16 (q_tmp2, q_tmp3);
1258
1259 q_dst2_r = vhsubq_s16 (q_f1k_r, q_tw_r);
1260 q_dst2_i = vhsubq_s16 (q_tw_i, q_f1k_i);
1261 q2_dst.val[0] = vhaddq_s16 (q_f1k_r, q_tw_r);
1262 q2_dst.val[1] = vhaddq_s16 (q_f1k_i, q_tw_i);
1263 q_dst2_r = vrev32q_s16 (q_dst2_r);
1264 q_dst2_i = vrev32q_s16 (q_dst2_i);
1265 q_dst2_r = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q_dst2_r))) ;
1266 q_dst2_i = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q_dst2_i)));
1267 q2_dst2.val[0] = vcombine_s16 (vget_high_s16 (q_dst2_r), vget_low_s16 (q_dst2_r));
1268 q2_dst2.val[1] = vcombine_s16 (vget_high_s16 (q_dst2_i), vget_low_s16 (q_dst2_i));
1269 vst2q_s16 (p_dst, q2_dst);
1270 vst2q_s16 (p_dst2, q2_dst2);
1271
1272 }
1273 }
1274 }
1275 else
1276 {
1277
1278 for (k = 1; k <= ncfft / 2 ; ++k)
1279 {
1280 fpk = src[k];
1281 fpnk.r = src[ncfft - k].r;
1282 fpnk.i = - src[ncfft - k].i;
1283 if (scaled_flag)
1284 {
1285 NE10_F2I16_FIXDIV (fpk, 2);
1286 NE10_F2I16_FIXDIV (fpnk, 2);
1287 }
1288
1289 f1k.r = fpk.r + fpnk.r;
1290 f1k.i = fpk.i + fpnk.i;
1291
1292 f2k.r = fpk.r - fpnk.r;
1293 f2k.i = fpk.i - fpnk.i;
1294
1295 tw.r = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) f2k.r * (twiddles[k - 1]).r
1296 - (NE10_F2I16_SAMPPROD) f2k.i * (twiddles[k - 1]).i) >> NE10_F2I16_SHIFT);
1297 tw.i = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) f2k.r * (twiddles[k - 1]).i
1298 + (NE10_F2I16_SAMPPROD) f2k.i * (twiddles[k - 1]).r) >> NE10_F2I16_SHIFT);
1299
1300 dst[k].r = (f1k.r + tw.r) >> 1;
1301 dst[k].i = (f1k.i + tw.i) >> 1;
1302 dst[ncfft - k].r = (f1k.r - tw.r) >> 1;
1303 dst[ncfft - k].i = (tw.i - f1k.i) >> 1;
1304 }
1305 }
1306}
1307
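/*
 * Inverse split step for the complex-to-real IFFT: converts the
 * positive-frequency half of a 2*ncfft-point real spectrum back into the
 * ncfft-point complex sequence whose inverse C2C FFT yields the interleaved
 * real output.
 */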
1308static void ne10_fft_split_c2r_1d_int16_neon (ne10_fft_cpx_int16_t *dst,
1309 const ne10_fft_cpx_int16_t *src,
1310 ne10_fft_cpx_int16_t *twiddles,
1311 ne10_int32_t ncfft,
1312 ne10_int32_t scaled_flag)
1313{
1314
1315 ne10_int32_t k;
1316 ne10_int32_t count = ncfft / 2;
1317 ne10_fft_cpx_int16_t fk, fnkc, fek, fok, tmp;
1318 int16x8x2_t q2_fk, q2_fnkc, q2_tw, q2_dst, q2_dst2;
1319 int16x8_t q_fnkc_r, q_fnkc_i;
1320 int16x8_t q_fek_r, q_fek_i, q_fok_r, q_fok_i;
1321 int16x8_t q_tmp0, q_tmp1, q_tmp2, q_tmp3;
1322 int16x8_t q_dst2_r, q_dst2_i;
1323 int16_t *p_src, *p_src2, *p_dst, *p_dst2, *p_twiddles;
1324
1325
1326 dst[0].r = src[0].r + src[ncfft].r;
1327 dst[0].i = src[0].r - src[ncfft].r;
1328
1329 if (scaled_flag)
1330 NE10_F2I16_FIXDIV (dst[0], 2);
1331 if (count >= 8)
1332 {
1333 if (scaled_flag)
1334 {
1335 for (k = 1; k <= count ; k += 8)
1336 {
1337 p_src = (int16_t*) (& (src[k]));
1338 p_src2 = (int16_t*) (& (src[ncfft - k - 7]));
1339 p_twiddles = (int16_t*) (& (twiddles[k - 1]));
1340 p_dst = (int16_t*) (& (dst[k]));
1341 p_dst2 = (int16_t*) (& (dst[ncfft - k - 7]));
1342
1343 q2_fk = vld2q_s16 (p_src);
1344 q2_fnkc = vld2q_s16 (p_src2);
1345 q2_tw = vld2q_s16 (p_twiddles);
1346 q2_fnkc.val[0] = vrev32q_s16 (q2_fnkc.val[0]);
1347 q2_fnkc.val[1] = vrev32q_s16 (q2_fnkc.val[1]);
1348 q2_fnkc.val[0] = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q2_fnkc.val[0])));
1349 q2_fnkc.val[1] = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q2_fnkc.val[1])));
1350 q_fnkc_r = vcombine_s16 (vget_high_s16 (q2_fnkc.val[0]), vget_low_s16 (q2_fnkc.val[0]));
1351 q_fnkc_i = vcombine_s16 (vget_high_s16 (q2_fnkc.val[1]), vget_low_s16 (q2_fnkc.val[1]));
1352 q_fnkc_i = vnegq_s16 (q_fnkc_i);
1353
1354 q_fek_r = vhaddq_s16 (q2_fk.val[0], q_fnkc_r);
1355 q_fek_i = vhaddq_s16 (q2_fk.val[1], q_fnkc_i);
1356 q_tmp0 = vhsubq_s16 (q2_fk.val[0], q_fnkc_r);
1357 q_tmp1 = vhsubq_s16 (q2_fk.val[1], q_fnkc_i);
1358
1359 q_fok_r = vqdmulhq_s16 (q_tmp0, q2_tw.val[0]);
1360 q_fok_i = vqdmulhq_s16 (q_tmp1, q2_tw.val[0]);
1361 q_tmp2 = vqdmulhq_s16 (q_tmp1, q2_tw.val[1]);
1362 q_tmp3 = vqdmulhq_s16 (q_tmp0, q2_tw.val[1]);
1363 q_fok_r = vaddq_s16 (q_fok_r, q_tmp2);
1364 q_fok_i = vsubq_s16 (q_fok_i, q_tmp3);
1365
1366 q_dst2_r = vsubq_s16 (q_fek_r, q_fok_r);
1367 q_dst2_i = vsubq_s16 (q_fok_i, q_fek_i);
1368 q2_dst.val[0] = vaddq_s16 (q_fek_r, q_fok_r);
1369 q2_dst.val[1] = vaddq_s16 (q_fek_i, q_fok_i);
1370 q_dst2_r = vrev32q_s16 (q_dst2_r);
1371 q_dst2_i = vrev32q_s16 (q_dst2_i);
1372 q_dst2_r = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q_dst2_r))) ;
1373 q_dst2_i = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q_dst2_i)));
1374 q2_dst2.val[0] = vcombine_s16 (vget_high_s16 (q_dst2_r), vget_low_s16 (q_dst2_r));
1375 q2_dst2.val[1] = vcombine_s16 (vget_high_s16 (q_dst2_i), vget_low_s16 (q_dst2_i));
1376 vst2q_s16 (p_dst, q2_dst);
1377 vst2q_s16 (p_dst2, q2_dst2);
1378
1379 }
1380
1381 }
1382 else
1383 {
1384 for (k = 1; k <= count ; k += 8)
1385 {
1386 p_src = (int16_t*) (& (src[k]));
1387 p_src2 = (int16_t*) (& (src[ncfft - k - 7]));
1388 p_twiddles = (int16_t*) (& (twiddles[k - 1]));
1389 p_dst = (int16_t*) (& (dst[k]));
1390 p_dst2 = (int16_t*) (& (dst[ncfft - k - 7]));
1391
1392 q2_fk = vld2q_s16 (p_src);
1393 q2_fnkc = vld2q_s16 (p_src2);
1394 q2_tw = vld2q_s16 (p_twiddles);
1395 q2_fnkc.val[0] = vrev32q_s16 (q2_fnkc.val[0]);
1396 q2_fnkc.val[1] = vrev32q_s16 (q2_fnkc.val[1]);
1397 q2_fnkc.val[0] = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q2_fnkc.val[0])));
1398 q2_fnkc.val[1] = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q2_fnkc.val[1])));
1399 q_fnkc_r = vcombine_s16 (vget_high_s16 (q2_fnkc.val[0]), vget_low_s16 (q2_fnkc.val[0]));
1400 q_fnkc_i = vcombine_s16 (vget_high_s16 (q2_fnkc.val[1]), vget_low_s16 (q2_fnkc.val[1]));
1401 q_fnkc_i = vnegq_s16 (q_fnkc_i);
1402
1403 q_fek_r = vaddq_s16 (q2_fk.val[0], q_fnkc_r);
1404 q_fek_i = vaddq_s16 (q2_fk.val[1], q_fnkc_i);
1405 q_tmp0 = vsubq_s16 (q2_fk.val[0], q_fnkc_r);
1406 q_tmp1 = vsubq_s16 (q2_fk.val[1], q_fnkc_i);
1407
1408 q_fok_r = vqdmulhq_s16 (q_tmp0, q2_tw.val[0]);
1409 q_fok_i = vqdmulhq_s16 (q_tmp1, q2_tw.val[0]);
1410 q_tmp2 = vqdmulhq_s16 (q_tmp1, q2_tw.val[1]);
1411 q_tmp3 = vqdmulhq_s16 (q_tmp0, q2_tw.val[1]);
1412 q_fok_r = vaddq_s16 (q_fok_r, q_tmp2);
1413 q_fok_i = vsubq_s16 (q_fok_i, q_tmp3);
1414
1415 q_dst2_r = vsubq_s16 (q_fek_r, q_fok_r);
1416 q_dst2_i = vsubq_s16 (q_fok_i, q_fek_i);
1417 q2_dst.val[0] = vaddq_s16 (q_fek_r, q_fok_r);
1418 q2_dst.val[1] = vaddq_s16 (q_fek_i, q_fok_i);
1419 q_dst2_r = vrev32q_s16 (q_dst2_r);
1420 q_dst2_i = vrev32q_s16 (q_dst2_i);
1421 q_dst2_r = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q_dst2_r))) ;
1422 q_dst2_i = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q_dst2_i)));
1423 q2_dst2.val[0] = vcombine_s16 (vget_high_s16 (q_dst2_r), vget_low_s16 (q_dst2_r));
1424 q2_dst2.val[1] = vcombine_s16 (vget_high_s16 (q_dst2_i), vget_low_s16 (q_dst2_i));
1425 vst2q_s16 (p_dst, q2_dst);
1426 vst2q_s16 (p_dst2, q2_dst2);
1427
1428 }
1429 }
1430 }
1431 else
1432 {
1433
1434 for (k = 1; k <= ncfft / 2; k++)
1435 {
1436 fk = src[k];
1437 fnkc.r = src[ncfft - k].r;
1438 fnkc.i = -src[ncfft - k].i;
1439 if (scaled_flag)
1440 {
1441 NE10_F2I16_FIXDIV (fk, 2);
1442 NE10_F2I16_FIXDIV (fnkc, 2);
1443 }
1444
1445 fek.r = fk.r + fnkc.r;
1446 fek.i = fk.i + fnkc.i;
1447
1448 tmp.r = fk.r - fnkc.r;
1449 tmp.i = fk.i - fnkc.i;
1450
1451 fok.r = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) tmp.r * (twiddles[k - 1]).r
1452 + (NE10_F2I16_SAMPPROD) tmp.i * (twiddles[k - 1]).i) >> NE10_F2I16_SHIFT);
1453 fok.i = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) tmp.i * (twiddles[k - 1]).r
1454 - (NE10_F2I16_SAMPPROD) tmp.r * (twiddles[k - 1]).i) >> NE10_F2I16_SHIFT);
1455
1456 dst[k].r = fek.r + fok.r;
1457 dst[k].i = fek.i + fok.i;
1458
1459 dst[ncfft - k].r = fek.r - fok.r;
1460 dst[ncfft - k].i = fok.i - fek.i;
1461 }
1462 }
1463}
1464
/* Mixed radix-2/4 complex FFT/IFFT of 16-bit fixed point data. */
void ne10_fft_c2c_1d_int16_neon (ne10_fft_cpx_int16_t *fout,
                                 ne10_fft_cpx_int16_t *fin,
                                 ne10_fft_cfg_int16_t cfg,
                                 ne10_int32_t inverse_fft,
                                 ne10_int32_t scaled_flag)
{
1493 if (scaled_flag)
1494 {
1495 if (inverse_fft)
1496 {
1497 switch (cfg->nfft)
1498 {
1499 case 4:
1500 ne10_fft4_backward_int16_scaled (fout, fin);
1501 break;
1502 case 8:
1503 ne10_fft8_backward_int16_scaled (fout, fin);
1504 break;
1505 default:
1506 ne10_mixed_radix_fft_backward_int16_scaled_neon (fout, fin, cfg->factors, cfg->twiddles, cfg->buffer);
1507 break;
1508 }
1509 }
1510 else
1511 {
1512 switch (cfg->nfft)
1513 {
1514 case 4:
1515 ne10_fft4_forward_int16_scaled (fout, fin);
1516 break;
1517 case 8:
1518 ne10_fft8_forward_int16_scaled (fout, fin);
1519 break;
1520 default:
1521 ne10_mixed_radix_fft_forward_int16_scaled_neon (fout, fin, cfg->factors, cfg->twiddles, cfg->buffer);
1522 break;
1523 }
1524 }
1525 }
1526 else
1527 {
1528 if (inverse_fft)
1529 {
1530 switch (cfg->nfft)
1531 {
1532 case 4:
1533 ne10_fft4_backward_int16_unscaled (fout, fin);
1534 break;
1535 case 8:
1536 ne10_fft8_backward_int16_unscaled (fout, fin);
1537 break;
1538 default:
1539 ne10_mixed_radix_fft_backward_int16_unscaled_neon (fout, fin, cfg->factors, cfg->twiddles, cfg->buffer);
1540 break;
1541 }
1542 }
1543 else
1544 {
1545 switch (cfg->nfft)
1546 {
1547 case 4:
1548 ne10_fft4_forward_int16_unscaled (fout, fin);
1549 break;
1550 case 8:
1551 ne10_fft8_forward_int16_unscaled (fout, fin);
1552 break;
1553 default:
1554 ne10_mixed_radix_fft_forward_int16_unscaled_neon (fout, fin, cfg->factors, cfg->twiddles, cfg->buffer);
1555 break;
1556 }
1557 }
1558 }
1559}
1560
//end of C2C_FFT_IFFT group
1564
/* Mixed radix-2/4 FFT (real to complex) of int16 data. */
void ne10_fft_r2c_1d_int16_neon (ne10_fft_cpx_int16_t *fout,
                                 ne10_int16_t *fin,
                                 ne10_fft_r2c_cfg_int16_t cfg,
                                 ne10_int32_t scaled_flag)
{
1589 ne10_fft_cpx_int16_t * tmpbuf1 = cfg->buffer;
1590 ne10_fft_cpx_int16_t * tmpbuf2 = cfg->buffer + cfg->ncfft;
1591 ne10_fft_state_int16_t c2c_state;
1592
1593 c2c_state.nfft = cfg->ncfft;
1594 c2c_state.factors = cfg->factors;
1595 c2c_state.twiddles = cfg->twiddles;
1596 c2c_state.buffer = tmpbuf2;
1597
1598 ne10_fft_c2c_1d_int16_neon (tmpbuf1, (ne10_fft_cpx_int16_t*) fin, &c2c_state, 0, scaled_flag);
1599 ne10_fft_split_r2c_1d_int16_neon (fout, tmpbuf1, cfg->super_twiddles, cfg->ncfft, scaled_flag);
1600}
1601
/* Mixed radix-2/4 IFFT (complex to real) of int16 data. */
void ne10_fft_c2r_1d_int16_neon (ne10_int16_t *fout,
                                 ne10_fft_cpx_int16_t *fin,
                                 ne10_fft_r2c_cfg_int16_t cfg,
                                 ne10_int32_t scaled_flag)
{
1621 ne10_fft_cpx_int16_t * tmpbuf1 = cfg->buffer;
1622 ne10_fft_cpx_int16_t * tmpbuf2 = cfg->buffer + cfg->ncfft;
1623 ne10_fft_state_int16_t c2c_state;
1624
1625 c2c_state.nfft = cfg->ncfft;
1626 c2c_state.factors = cfg->factors;
1627 c2c_state.twiddles = cfg->twiddles;
1628 c2c_state.buffer = tmpbuf2;
1629
1630 ne10_fft_split_c2r_1d_int16_neon (tmpbuf1, fin, cfg->super_twiddles, cfg->ncfft, scaled_flag);
1631 ne10_fft_c2c_1d_int16_neon ( (ne10_fft_cpx_int16_t*) fout, tmpbuf1, &c2c_state, 1, scaled_flag);
1632}
1633
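/*
 * Usage sketch: a minimal forward, scaled 1024-point complex transform. It
 * assumes the companion allocation helper ne10_fft_alloc_c2c_int16() and the
 * NE10_FREE() macro from the Ne10 headers; check NE10_dsp.h for the exact
 * prototypes.
 *
 *   ne10_fft_cfg_int16_t cfg = ne10_fft_alloc_c2c_int16 (1024);
 *   ne10_fft_cpx_int16_t src[1024], dst[1024];
 *   // ... fill src with Q15 samples ...
 *   ne10_fft_c2c_1d_int16_neon (dst, src, cfg, 0, 1);  // inverse_fft = 0, scaled_flag = 1
 *   NE10_FREE (cfg);
 */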