Project Ne10
An Open Optimized Software Library Project for the ARM Architecture
NE10_fft_int16.neon.c
1/*
2 * Copyright 2013-15 ARM Limited and Contributors.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 * * Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of ARM Limited nor the
13 * names of its contributors may be used to endorse or promote products
14 * derived from this software without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 * DISCLAIMED. IN NO EVENT SHALL ARM LIMITED AND CONTRIBUTORS BE LIABLE FOR ANY
20 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28/*
29 * NE10 Library : dsp/NE10_fft_int16.neon.c
30 */
31
32#include <arm_neon.h>
33
34#include "NE10_types.h"
35#include "NE10_macros.h"
36#include "NE10_fft.h"
37
38static inline void ne10_fft4_forward_int16_unscaled (ne10_fft_cpx_int16_t * Fout,
39 ne10_fft_cpx_int16_t * Fin)
40
41{
42 ne10_int16_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i;
43 ne10_int16_t tmp_r, tmp_i;
44
45 s2_r = Fin[0].r - Fin[2].r;
46 s2_i = Fin[0].i - Fin[2].i;
47
48 tmp_r = Fin[0].r + Fin[2].r;
49 tmp_i = Fin[0].i + Fin[2].i;
50
51 s0_r = Fin[1].r + Fin[3].r;
52 s0_i = Fin[1].i + Fin[3].i;
53
54 s1_r = Fin[1].r - Fin[3].r;
55 s1_i = Fin[1].i - Fin[3].i;
56 Fout[2].r = tmp_r - s0_r;
57 Fout[2].i = tmp_i - s0_i;
58 Fout[0].r = tmp_r + s0_r;
59 Fout[0].i = tmp_i + s0_i;
60
61 Fout[1].r = s2_r + s1_i;
62 Fout[1].i = s2_i - s1_r;
63 Fout[3].r = s2_r - s1_i;
64 Fout[3].i = s2_i + s1_r;
65}
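For readers checking the butterfly arithmetic, the eight assignments above are just the length-4 DFT written out directly; this is a sketch of the identity, not part of the source, and the backward variants below simply flip the sign of \(j\):

\[
\begin{aligned}
X_0 &= (x_0 + x_2) + (x_1 + x_3), & X_2 &= (x_0 + x_2) - (x_1 + x_3),\\
X_1 &= (x_0 - x_2) - j\,(x_1 - x_3), & X_3 &= (x_0 - x_2) + j\,(x_1 - x_3),
\end{aligned}
\]

with \(x_n\) = Fin[n] and \(X_k\) = Fout[k]; tmp, s0, s2 and s1 hold the four bracketed sums and differences.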
66
67static inline void ne10_fft4_backward_int16_unscaled (ne10_fft_cpx_int16_t * Fout,
68 ne10_fft_cpx_int16_t * Fin)
69
70{
71 ne10_int16_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i;
72 ne10_int16_t tmp_r, tmp_i;
73
74 s2_r = Fin[0].r - Fin[2].r;
75 s2_i = Fin[0].i - Fin[2].i;
76
77 tmp_r = Fin[0].r + Fin[2].r;
78 tmp_i = Fin[0].i + Fin[2].i;
79
80 s0_r = Fin[1].r + Fin[3].r;
81 s0_i = Fin[1].i + Fin[3].i;
82
83 s1_r = Fin[1].r - Fin[3].r;
84 s1_i = Fin[1].i - Fin[3].i;
85
86 Fout[2].r = tmp_r - s0_r;
87 Fout[2].i = tmp_i - s0_i;
88 Fout[0].r = tmp_r + s0_r;
89 Fout[0].i = tmp_i + s0_i;
90
91 Fout[1].r = s2_r - s1_i;
92 Fout[1].i = s2_i + s1_r;
93 Fout[3].r = s2_r + s1_i;
94 Fout[3].i = s2_i - s1_r;
95}
96static inline void ne10_fft4_forward_int16_scaled (ne10_fft_cpx_int16_t * Fout,
97 ne10_fft_cpx_int16_t * Fin)
98
99{
100 ne10_int16_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i;
101 ne10_int16_t tmp_r, tmp_i;
102
103 s2_r = (Fin[0].r - Fin[2].r) >> 2;
104 s2_i = (Fin[0].i - Fin[2].i) >> 2;
105 tmp_r = (Fin[0].r + Fin[2].r) >> 2;
106 tmp_i = (Fin[0].i + Fin[2].i) >> 2;
107
108 s0_r = (Fin[1].r + Fin[3].r) >> 2;
109 s0_i = (Fin[1].i + Fin[3].i) >> 2;
110 s1_r = (Fin[1].r - Fin[3].r) >> 2;
111 s1_i = (Fin[1].i - Fin[3].i) >> 2;
112
113 Fout[2].r = tmp_r - s0_r;
114 Fout[2].i = tmp_i - s0_i;
115 Fout[0].r = tmp_r + s0_r;
116 Fout[0].i = tmp_i + s0_i;
117
118 Fout[1].r = s2_r + s1_i;
119 Fout[1].i = s2_i - s1_r;
120 Fout[3].r = s2_r - s1_i;
121 Fout[3].i = s2_i + s1_r;
122}
123
124static inline void ne10_fft4_backward_int16_scaled (ne10_fft_cpx_int16_t * Fout,
125 ne10_fft_cpx_int16_t * Fin)
126
127{
128 ne10_int16_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i;
129 ne10_int16_t tmp_r, tmp_i;
130
131 s2_r = (Fin[0].r - Fin[2].r) >> 2;
132 s2_i = (Fin[0].i - Fin[2].i) >> 2;
133 tmp_r = (Fin[0].r + Fin[2].r) >> 2;
134 tmp_i = (Fin[0].i + Fin[2].i) >> 2;
135
136 s0_r = (Fin[1].r + Fin[3].r) >> 2;
137 s0_i = (Fin[1].i + Fin[3].i) >> 2;
138 s1_r = (Fin[1].r - Fin[3].r) >> 2;
139 s1_i = (Fin[1].i - Fin[3].i) >> 2;
140
141 Fout[2].r = tmp_r - s0_r;
142 Fout[2].i = tmp_i - s0_i;
143 Fout[0].r = tmp_r + s0_r;
144 Fout[0].i = tmp_i + s0_i;
145
146 Fout[1].r = s2_r - s1_i;
147 Fout[1].i = s2_i + s1_r;
148 Fout[3].r = s2_r + s1_i;
149 Fout[3].i = s2_i - s1_r;
150}
151static inline void ne10_fft8_forward_int16_unscaled (ne10_fft_cpx_int16_t * Fout,
152 ne10_fft_cpx_int16_t * Fin)
153
154{
155 ne10_int16_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i, s3_r, s3_i, s4_r, s4_i, s5_r, s5_i, s6_r, s6_i, s7_r, s7_i;
156 ne10_int16_t t0_r, t0_i, t1_r, t1_i, t2_r, t2_i, t3_r, t3_i, t4_r, t4_i, t5_r, t5_i;
157 const ne10_int16_t TW_81 = 23169;
158
159 s0_r = Fin[0].r + Fin[4].r;
160 s0_i = Fin[0].i + Fin[4].i;
161 s1_r = Fin[0].r - Fin[4].r;
162 s1_i = Fin[0].i - Fin[4].i;
163 s2_r = Fin[1].r + Fin[5].r;
164 s2_i = Fin[1].i + Fin[5].i;
165 s3_r = Fin[1].r - Fin[5].r;
166 s3_i = Fin[1].i - Fin[5].i;
167 s4_r = Fin[2].r + Fin[6].r;
168 s4_i = Fin[2].i + Fin[6].i;
169 s5_r = Fin[2].r - Fin[6].r;
170 s5_i = Fin[2].i - Fin[6].i;
171 s6_r = Fin[3].r + Fin[7].r;
172 s6_i = Fin[3].i + Fin[7].i;
173 s7_r = Fin[3].r - Fin[7].r;
174 s7_i = Fin[3].i - Fin[7].i;
175
176 t0_r = s0_r - s4_r;
177 t0_i = s0_i - s4_i;
178 t1_r = s0_r + s4_r;
179 t1_i = s0_i + s4_i;
180 t2_r = s2_r + s6_r;
181 t2_i = s2_i + s6_i;
182 t3_r = s2_r - s6_r;
183 t3_i = s2_i - s6_i;
184 Fout[0].r = t1_r + t2_r;
185 Fout[0].i = t1_i + t2_i;
186 Fout[4].r = t1_r - t2_r;
187 Fout[4].i = t1_i - t2_i;
188 Fout[2].r = t0_r + t3_i;
189 Fout[2].i = t0_i - t3_r;
190 Fout[6].r = t0_r - t3_i;
191 Fout[6].i = t0_i + t3_r;
192
193 t4_r = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s3_r + s3_i) * TW_81) >> NE10_F2I16_SHIFT);
194 t4_i = - (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s3_r - s3_i) * TW_81) >> NE10_F2I16_SHIFT);
195 t5_r = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s7_r - s7_i) * TW_81) >> NE10_F2I16_SHIFT);
196 t5_i = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s7_r + s7_i) * TW_81) >> NE10_F2I16_SHIFT);
197
198 t0_r = s1_r - s5_i;
199 t0_i = s1_i + s5_r;
200 t1_r = s1_r + s5_i;
201 t1_i = s1_i - s5_r;
202 t2_r = t4_r - t5_r;
203 t2_i = t4_i - t5_i;
204 t3_r = t4_r + t5_r;
205 t3_i = t4_i + t5_i;
206 Fout[1].r = t1_r + t2_r;
207 Fout[1].i = t1_i + t2_i;
208 Fout[5].r = t1_r - t2_r;
209 Fout[5].i = t1_i - t2_i;
210 Fout[3].r = t0_r + t3_i;
211 Fout[3].i = t0_i - t3_r;
212 Fout[7].r = t0_r - t3_i;
213 Fout[7].i = t0_i + t3_r;
214}
215
216static inline void ne10_fft8_backward_int16_unscaled (ne10_fft_cpx_int16_t * Fout,
217 ne10_fft_cpx_int16_t * Fin)
218
219{
220 ne10_int16_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i, s3_r, s3_i, s4_r, s4_i, s5_r, s5_i, s6_r, s6_i, s7_r, s7_i;
221 ne10_int16_t t0_r, t0_i, t1_r, t1_i, t2_r, t2_i, t3_r, t3_i, t4_r, t4_i, t5_r, t5_i;
222 const ne10_int16_t TW_81 = 23169;
223
224 s0_r = Fin[0].r + Fin[4].r;
225 s0_i = Fin[0].i + Fin[4].i;
226 s1_r = Fin[0].r - Fin[4].r;
227 s1_i = Fin[0].i - Fin[4].i;
228 s2_r = Fin[1].r + Fin[5].r;
229 s2_i = Fin[1].i + Fin[5].i;
230 s3_r = Fin[1].r - Fin[5].r;
231 s3_i = Fin[1].i - Fin[5].i;
232 s4_r = Fin[2].r + Fin[6].r;
233 s4_i = Fin[2].i + Fin[6].i;
234 s5_r = Fin[2].r - Fin[6].r;
235 s5_i = Fin[2].i - Fin[6].i;
236 s6_r = Fin[3].r + Fin[7].r;
237 s6_i = Fin[3].i + Fin[7].i;
238 s7_r = Fin[3].r - Fin[7].r;
239 s7_i = Fin[3].i - Fin[7].i;
240
241 t0_r = s0_r - s4_r;
242 t0_i = s0_i - s4_i;
243 t1_r = s0_r + s4_r;
244 t1_i = s0_i + s4_i;
245 t2_r = s2_r + s6_r;
246 t2_i = s2_i + s6_i;
247 t3_r = s2_r - s6_r;
248 t3_i = s2_i - s6_i;
249 Fout[0].r = t1_r + t2_r;
250 Fout[0].i = t1_i + t2_i;
251 Fout[4].r = t1_r - t2_r;
252 Fout[4].i = t1_i - t2_i;
253 Fout[2].r = t0_r - t3_i;
254 Fout[2].i = t0_i + t3_r;
255 Fout[6].r = t0_r + t3_i;
256 Fout[6].i = t0_i - t3_r;
257
258 t4_r = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s3_r - s3_i) * TW_81) >> NE10_F2I16_SHIFT);
259 t4_i = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s3_r + s3_i) * TW_81) >> NE10_F2I16_SHIFT);
260 t5_r = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s7_r + s7_i) * TW_81) >> NE10_F2I16_SHIFT);
261 t5_i = - (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s7_r - s7_i) * TW_81) >> NE10_F2I16_SHIFT);
262
263 t0_r = s1_r + s5_i;
264 t0_i = s1_i - s5_r;
265 t1_r = s1_r - s5_i;
266 t1_i = s1_i + s5_r;
267 t2_r = t4_r - t5_r;
268 t2_i = t4_i - t5_i;
269 t3_r = t4_r + t5_r;
270 t3_i = t4_i + t5_i;
271 Fout[1].r = t1_r + t2_r;
272 Fout[1].i = t1_i + t2_i;
273 Fout[5].r = t1_r - t2_r;
274 Fout[5].i = t1_i - t2_i;
275 Fout[3].r = t0_r - t3_i;
276 Fout[3].i = t0_i + t3_r;
277 Fout[7].r = t0_r + t3_i;
278 Fout[7].i = t0_i - t3_r;
279}
280static inline void ne10_fft8_forward_int16_scaled (ne10_fft_cpx_int16_t * Fout,
281 ne10_fft_cpx_int16_t * Fin)
282
283{
284 ne10_int16_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i, s3_r, s3_i, s4_r, s4_i, s5_r, s5_i, s6_r, s6_i, s7_r, s7_i;
285 ne10_int16_t t0_r, t0_i, t1_r, t1_i, t2_r, t2_i, t3_r, t3_i, t4_r, t4_i, t5_r, t5_i;
286 const ne10_int16_t TW_81 = 23169;
287
288 s0_r = (Fin[0].r + Fin[4].r) >> 3;
289 s0_i = (Fin[0].i + Fin[4].i) >> 3;
290 s1_r = (Fin[0].r - Fin[4].r) >> 3;
291 s1_i = (Fin[0].i - Fin[4].i) >> 3;
292 s2_r = (Fin[1].r + Fin[5].r) >> 3;
293 s2_i = (Fin[1].i + Fin[5].i) >> 3;
294 s3_r = (Fin[1].r - Fin[5].r) >> 3;
295 s3_i = (Fin[1].i - Fin[5].i) >> 3;
296 s4_r = (Fin[2].r + Fin[6].r) >> 3;
297 s4_i = (Fin[2].i + Fin[6].i) >> 3;
298 s5_r = (Fin[2].r - Fin[6].r) >> 3;
299 s5_i = (Fin[2].i - Fin[6].i) >> 3;
300 s6_r = (Fin[3].r + Fin[7].r) >> 3;
301 s6_i = (Fin[3].i + Fin[7].i) >> 3;
302 s7_r = (Fin[3].r - Fin[7].r) >> 3;
303 s7_i = (Fin[3].i - Fin[7].i) >> 3;
304
305 t0_r = s0_r - s4_r;
306 t0_i = s0_i - s4_i;
307 t1_r = s0_r + s4_r;
308 t1_i = s0_i + s4_i;
309 t2_r = s2_r + s6_r;
310 t2_i = s2_i + s6_i;
311 t3_r = s2_r - s6_r;
312 t3_i = s2_i - s6_i;
313 Fout[0].r = t1_r + t2_r;
314 Fout[0].i = t1_i + t2_i;
315 Fout[4].r = t1_r - t2_r;
316 Fout[4].i = t1_i - t2_i;
317 Fout[2].r = t0_r + t3_i;
318 Fout[2].i = t0_i - t3_r;
319 Fout[6].r = t0_r - t3_i;
320 Fout[6].i = t0_i + t3_r;
321
322 t4_r = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s3_r + s3_i) * TW_81) >> NE10_F2I16_SHIFT);
323 t4_i = - (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s3_r - s3_i) * TW_81) >> NE10_F2I16_SHIFT);
324 t5_r = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s7_r - s7_i) * TW_81) >> NE10_F2I16_SHIFT);
325 t5_i = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s7_r + s7_i) * TW_81) >> NE10_F2I16_SHIFT);
326
327 t0_r = s1_r - s5_i;
328 t0_i = s1_i + s5_r;
329 t1_r = s1_r + s5_i;
330 t1_i = s1_i - s5_r;
331 t2_r = t4_r - t5_r;
332 t2_i = t4_i - t5_i;
333 t3_r = t4_r + t5_r;
334 t3_i = t4_i + t5_i;
335 Fout[1].r = t1_r + t2_r;
336 Fout[1].i = t1_i + t2_i;
337 Fout[5].r = t1_r - t2_r;
338 Fout[5].i = t1_i - t2_i;
339 Fout[3].r = t0_r + t3_i;
340 Fout[3].i = t0_i - t3_r;
341 Fout[7].r = t0_r - t3_i;
342 Fout[7].i = t0_i + t3_r;
343}
344
345static inline void ne10_fft8_backward_int16_scaled (ne10_fft_cpx_int16_t * Fout,
346 ne10_fft_cpx_int16_t * Fin)
347
348{
349 ne10_int16_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i, s3_r, s3_i, s4_r, s4_i, s5_r, s5_i, s6_r, s6_i, s7_r, s7_i;
350 ne10_int16_t t0_r, t0_i, t1_r, t1_i, t2_r, t2_i, t3_r, t3_i, t4_r, t4_i, t5_r, t5_i;
351 const ne10_int16_t TW_81 = 23169;
352
353 s0_r = (Fin[0].r + Fin[4].r) >> 3;
354 s0_i = (Fin[0].i + Fin[4].i) >> 3;
355 s1_r = (Fin[0].r - Fin[4].r) >> 3;
356 s1_i = (Fin[0].i - Fin[4].i) >> 3;
357 s2_r = (Fin[1].r + Fin[5].r) >> 3;
358 s2_i = (Fin[1].i + Fin[5].i) >> 3;
359 s3_r = (Fin[1].r - Fin[5].r) >> 3;
360 s3_i = (Fin[1].i - Fin[5].i) >> 3;
361 s4_r = (Fin[2].r + Fin[6].r) >> 3;
362 s4_i = (Fin[2].i + Fin[6].i) >> 3;
363 s5_r = (Fin[2].r - Fin[6].r) >> 3;
364 s5_i = (Fin[2].i - Fin[6].i) >> 3;
365 s6_r = (Fin[3].r + Fin[7].r) >> 3;
366 s6_i = (Fin[3].i + Fin[7].i) >> 3;
367 s7_r = (Fin[3].r - Fin[7].r) >> 3;
368 s7_i = (Fin[3].i - Fin[7].i) >> 3;
369
370 t0_r = s0_r - s4_r;
371 t0_i = s0_i - s4_i;
372 t1_r = s0_r + s4_r;
373 t1_i = s0_i + s4_i;
374 t2_r = s2_r + s6_r;
375 t2_i = s2_i + s6_i;
376 t3_r = s2_r - s6_r;
377 t3_i = s2_i - s6_i;
378 Fout[0].r = t1_r + t2_r;
379 Fout[0].i = t1_i + t2_i;
380 Fout[4].r = t1_r - t2_r;
381 Fout[4].i = t1_i - t2_i;
382 Fout[2].r = t0_r - t3_i;
383 Fout[2].i = t0_i + t3_r;
384 Fout[6].r = t0_r + t3_i;
385 Fout[6].i = t0_i - t3_r;
386
387 t4_r = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s3_r - s3_i) * TW_81) >> NE10_F2I16_SHIFT);
388 t4_i = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s3_r + s3_i) * TW_81) >> NE10_F2I16_SHIFT);
389 t5_r = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s7_r + s7_i) * TW_81) >> NE10_F2I16_SHIFT);
390 t5_i = - (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s7_r - s7_i) * TW_81) >> NE10_F2I16_SHIFT);
391
392 t0_r = s1_r + s5_i;
393 t0_i = s1_i - s5_r;
394 t1_r = s1_r - s5_i;
395 t1_i = s1_i + s5_r;
396 t2_r = t4_r - t5_r;
397 t2_i = t4_i - t5_i;
398 t3_r = t4_r + t5_r;
399 t3_i = t4_i + t5_i;
400 Fout[1].r = t1_r + t2_r;
401 Fout[1].i = t1_i + t2_i;
402 Fout[5].r = t1_r - t2_r;
403 Fout[5].i = t1_i - t2_i;
404 Fout[3].r = t0_r - t3_i;
405 Fout[3].i = t0_i + t3_r;
406 Fout[7].r = t0_r + t3_i;
407 Fout[7].i = t0_i - t3_r;
408}
409static void ne10_fft_split_r2c_1d_int16_neon (ne10_fft_cpx_int16_t *dst,
410 const ne10_fft_cpx_int16_t *src,
411 ne10_fft_cpx_int16_t *twiddles,
412 ne10_int32_t ncfft,
413 ne10_int32_t scaled_flag)
414{
415 ne10_int32_t k;
416 ne10_int32_t count = ncfft / 2;
417 ne10_fft_cpx_int16_t fpnk, fpk, f1k, f2k, tw, tdc;
418 int16x8x2_t q2_fpk, q2_fpnk, q2_tw, q2_dst, q2_dst2;
419 int16x8_t q_fpnk_r, q_fpnk_i;
420 int16x8_t q_f1k_r, q_f1k_i, q_f2k_r, q_f2k_i;
421 int16x8_t q_tw_r, q_tw_i;
422 int16x8_t q_tmp0, q_tmp1, q_tmp2, q_tmp3;
423 int16x8_t q_dst2_r, q_dst2_i;
424 int16_t *p_src, *p_src2, *p_dst, *p_dst2, *p_twiddles;
425
426 tdc.r = src[0].r;
427 tdc.i = src[0].i;
428
429 if (scaled_flag)
430 NE10_F2I16_FIXDIV (tdc, 2);
431
432 dst[0].r = tdc.r + tdc.i;
433 dst[ncfft].r = tdc.r - tdc.i;
434 dst[ncfft].i = dst[0].i = 0;
435 if (count >= 8)
436 {
437
438 if (scaled_flag)
439 {
440 for (k = 1; k <= count ; k += 8)
441 {
442 p_src = (int16_t*) (& (src[k]));
443 p_src2 = (int16_t*) (& (src[ncfft - k - 7]));
444 p_twiddles = (int16_t*) (& (twiddles[k - 1]));
445 p_dst = (int16_t*) (& (dst[k]));
446 p_dst2 = (int16_t*) (& (dst[ncfft - k - 7]));
447
448 q2_fpk = vld2q_s16 (p_src);
449 q2_fpnk = vld2q_s16 (p_src2);
450
451 q2_tw = vld2q_s16 (p_twiddles);
452 q2_fpnk.val[0] = vrev32q_s16 (q2_fpnk.val[0]);
453 q2_fpnk.val[1] = vrev32q_s16 (q2_fpnk.val[1]);
454 q2_fpnk.val[0] = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q2_fpnk.val[0])));
455 q2_fpnk.val[1] = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q2_fpnk.val[1])));
456 q_fpnk_r = vcombine_s16 (vget_high_s16 (q2_fpnk.val[0]), vget_low_s16 (q2_fpnk.val[0]));
457 q_fpnk_i = vcombine_s16 (vget_high_s16 (q2_fpnk.val[1]), vget_low_s16 (q2_fpnk.val[1]));
458 q_fpnk_i = vnegq_s16 (q_fpnk_i);
459
460 q_f1k_r = vhaddq_s16 (q2_fpk.val[0], q_fpnk_r);
461 q_f1k_i = vhaddq_s16 (q2_fpk.val[1], q_fpnk_i);
462
463 q_f2k_r = vhsubq_s16 (q2_fpk.val[0], q_fpnk_r);
464 q_f2k_i = vhsubq_s16 (q2_fpk.val[1], q_fpnk_i);
465
466 q_tmp0 = vqdmulhq_s16 (q_f2k_r, q2_tw.val[0]);
467 q_tmp1 = vqdmulhq_s16 (q_f2k_i, q2_tw.val[1]);
468 q_tmp2 = vqdmulhq_s16 (q_f2k_r, q2_tw.val[1]);
469 q_tmp3 = vqdmulhq_s16 (q_f2k_i, q2_tw.val[0]);
470 q_tw_r = vsubq_s16 (q_tmp0, q_tmp1);
471 q_tw_i = vaddq_s16 (q_tmp2, q_tmp3);
472
473 q_dst2_r = vhsubq_s16 (q_f1k_r, q_tw_r);
474 q_dst2_i = vhsubq_s16 (q_tw_i, q_f1k_i);
475 q2_dst.val[0] = vhaddq_s16 (q_f1k_r, q_tw_r);
476 q2_dst.val[1] = vhaddq_s16 (q_f1k_i, q_tw_i);
477 q_dst2_r = vrev32q_s16 (q_dst2_r);
478 q_dst2_i = vrev32q_s16 (q_dst2_i);
479 q_dst2_r = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q_dst2_r))) ;
480 q_dst2_i = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q_dst2_i)));
481 q2_dst2.val[0] = vcombine_s16 (vget_high_s16 (q_dst2_r), vget_low_s16 (q_dst2_r));
482 q2_dst2.val[1] = vcombine_s16 (vget_high_s16 (q_dst2_i), vget_low_s16 (q_dst2_i));
483 vst2q_s16 (p_dst, q2_dst);
484 vst2q_s16 (p_dst2, q2_dst2);
485
486 }
487 }
488 else
489 {
490 for (k = 1; k <= count ; k += 8)
491 {
492 p_src = (int16_t*) (& (src[k]));
493 p_src2 = (int16_t*) (& (src[ncfft - k - 7]));
494 p_twiddles = (int16_t*) (& (twiddles[k - 1]));
495 p_dst = (int16_t*) (& (dst[k]));
496 p_dst2 = (int16_t*) (& (dst[ncfft - k - 7]));
497
498 q2_fpk = vld2q_s16 (p_src);
499 q2_fpnk = vld2q_s16 (p_src2);
500
501 q2_tw = vld2q_s16 (p_twiddles);
502 q2_fpnk.val[0] = vrev32q_s16 (q2_fpnk.val[0]);
503 q2_fpnk.val[1] = vrev32q_s16 (q2_fpnk.val[1]);
504 q2_fpnk.val[0] = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q2_fpnk.val[0])));
505 q2_fpnk.val[1] = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q2_fpnk.val[1])));
506 q_fpnk_r = vcombine_s16 (vget_high_s16 (q2_fpnk.val[0]), vget_low_s16 (q2_fpnk.val[0]));
507 q_fpnk_i = vcombine_s16 (vget_high_s16 (q2_fpnk.val[1]), vget_low_s16 (q2_fpnk.val[1]));
508 q_fpnk_i = vnegq_s16 (q_fpnk_i);
509
510 q_f1k_r = vaddq_s16 (q2_fpk.val[0], q_fpnk_r);
511 q_f1k_i = vaddq_s16 (q2_fpk.val[1], q_fpnk_i);
512
513 q_f2k_r = vsubq_s16 (q2_fpk.val[0], q_fpnk_r);
514 q_f2k_i = vsubq_s16 (q2_fpk.val[1], q_fpnk_i);
515
516 q_tmp0 = vqdmulhq_s16 (q_f2k_r, q2_tw.val[0]);
517 q_tmp1 = vqdmulhq_s16 (q_f2k_i, q2_tw.val[1]);
518 q_tmp2 = vqdmulhq_s16 (q_f2k_r, q2_tw.val[1]);
519 q_tmp3 = vqdmulhq_s16 (q_f2k_i, q2_tw.val[0]);
520 q_tw_r = vsubq_s16 (q_tmp0, q_tmp1);
521 q_tw_i = vaddq_s16 (q_tmp2, q_tmp3);
522
523 q_dst2_r = vhsubq_s16 (q_f1k_r, q_tw_r);
524 q_dst2_i = vhsubq_s16 (q_tw_i, q_f1k_i);
525 q2_dst.val[0] = vhaddq_s16 (q_f1k_r, q_tw_r);
526 q2_dst.val[1] = vhaddq_s16 (q_f1k_i, q_tw_i);
527 q_dst2_r = vrev32q_s16 (q_dst2_r);
528 q_dst2_i = vrev32q_s16 (q_dst2_i);
529 q_dst2_r = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q_dst2_r))) ;
530 q_dst2_i = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q_dst2_i)));
531 q2_dst2.val[0] = vcombine_s16 (vget_high_s16 (q_dst2_r), vget_low_s16 (q_dst2_r));
532 q2_dst2.val[1] = vcombine_s16 (vget_high_s16 (q_dst2_i), vget_low_s16 (q_dst2_i));
533 vst2q_s16 (p_dst, q2_dst);
534 vst2q_s16 (p_dst2, q2_dst2);
535
536 }
537 }
538 }
539 else
540 {
541
542 for (k = 1; k <= ncfft / 2 ; ++k)
543 {
544 fpk = src[k];
545 fpnk.r = src[ncfft - k].r;
546 fpnk.i = - src[ncfft - k].i;
547 if (scaled_flag)
548 {
549 NE10_F2I16_FIXDIV (fpk, 2);
550 NE10_F2I16_FIXDIV (fpnk, 2);
551 }
552
553 f1k.r = fpk.r + fpnk.r;
554 f1k.i = fpk.i + fpnk.i;
555
556 f2k.r = fpk.r - fpnk.r;
557 f2k.i = fpk.i - fpnk.i;
558
559 tw.r = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) f2k.r * (twiddles[k - 1]).r
560 - (NE10_F2I16_SAMPPROD) f2k.i * (twiddles[k - 1]).i) >> NE10_F2I16_SHIFT);
561 tw.i = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) f2k.r * (twiddles[k - 1]).i
562 + (NE10_F2I16_SAMPPROD) f2k.i * (twiddles[k - 1]).r) >> NE10_F2I16_SHIFT);
563
564 dst[k].r = (f1k.r + tw.r) >> 1;
565 dst[k].i = (f1k.i + tw.i) >> 1;
566 dst[ncfft - k].r = (f1k.r - tw.r) >> 1;
567 dst[ncfft - k].i = (tw.i - f1k.i) >> 1;
568 }
569 }
570}
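Both the NEON body and the scalar tail loop of ne10_fft_split_r2c_1d_int16_neon implement the standard recombination that turns the length-ncfft complex FFT of the packed real input into the real-signal spectrum. In the loop's own variable names, each scalar iteration computes (the NEON path produces the same quantities eight at a time, with extra halving shifts in the scaled variant, and all results agree up to fixed-point rounding):

\[
\begin{aligned}
f_{1k} &= F[k] + \overline{F[n_c - k]}, \qquad f_{2k} = F[k] - \overline{F[n_c - k]}, \qquad tw = f_{2k}\,W_k,\\
dst[k] &= \tfrac{1}{2}\bigl(f_{1k} + tw\bigr), \qquad dst[n_c - k] = \tfrac{1}{2}\,\overline{\bigl(f_{1k} - tw\bigr)},
\end{aligned}
\]

where \(F\) is src, \(n_c\) is ncfft, \(W_k\) is twiddles[k-1], and the overline denotes complex conjugation. ne10_fft_split_c2r_1d_int16_neon below applies the corresponding inverse mapping.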
571
572static void ne10_fft_split_c2r_1d_int16_neon (ne10_fft_cpx_int16_t *dst,
573 const ne10_fft_cpx_int16_t *src,
574 ne10_fft_cpx_int16_t *twiddles,
575 ne10_int32_t ncfft,
576 ne10_int32_t scaled_flag)
577{
578
579 ne10_int32_t k;
580 ne10_int32_t count = ncfft / 2;
581 ne10_fft_cpx_int16_t fk, fnkc, fek, fok, tmp;
582 int16x8x2_t q2_fk, q2_fnkc, q2_tw, q2_dst, q2_dst2;
583 int16x8_t q_fnkc_r, q_fnkc_i;
584 int16x8_t q_fek_r, q_fek_i, q_fok_r, q_fok_i;
585 int16x8_t q_tmp0, q_tmp1, q_tmp2, q_tmp3;
586 int16x8_t q_dst2_r, q_dst2_i;
587 int16_t *p_src, *p_src2, *p_dst, *p_dst2, *p_twiddles;
588
589
590 dst[0].r = src[0].r + src[ncfft].r;
591 dst[0].i = src[0].r - src[ncfft].r;
592
593 if (scaled_flag)
594 NE10_F2I16_FIXDIV (dst[0], 2);
595 if (count >= 8)
596 {
597 if (scaled_flag)
598 {
599 for (k = 1; k <= count ; k += 8)
600 {
601 p_src = (int16_t*) (& (src[k]));
602 p_src2 = (int16_t*) (& (src[ncfft - k - 7]));
603 p_twiddles = (int16_t*) (& (twiddles[k - 1]));
604 p_dst = (int16_t*) (& (dst[k]));
605 p_dst2 = (int16_t*) (& (dst[ncfft - k - 7]));
606
607 q2_fk = vld2q_s16 (p_src);
608 q2_fnkc = vld2q_s16 (p_src2);
609 q2_tw = vld2q_s16 (p_twiddles);
610 q2_fnkc.val[0] = vrev32q_s16 (q2_fnkc.val[0]);
611 q2_fnkc.val[1] = vrev32q_s16 (q2_fnkc.val[1]);
612 q2_fnkc.val[0] = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q2_fnkc.val[0])));
613 q2_fnkc.val[1] = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q2_fnkc.val[1])));
614 q_fnkc_r = vcombine_s16 (vget_high_s16 (q2_fnkc.val[0]), vget_low_s16 (q2_fnkc.val[0]));
615 q_fnkc_i = vcombine_s16 (vget_high_s16 (q2_fnkc.val[1]), vget_low_s16 (q2_fnkc.val[1]));
616 q_fnkc_i = vnegq_s16 (q_fnkc_i);
617
618 q_fek_r = vhaddq_s16 (q2_fk.val[0], q_fnkc_r);
619 q_fek_i = vhaddq_s16 (q2_fk.val[1], q_fnkc_i);
620 q_tmp0 = vhsubq_s16 (q2_fk.val[0], q_fnkc_r);
621 q_tmp1 = vhsubq_s16 (q2_fk.val[1], q_fnkc_i);
622
623 q_fok_r = vqdmulhq_s16 (q_tmp0, q2_tw.val[0]);
624 q_fok_i = vqdmulhq_s16 (q_tmp1, q2_tw.val[0]);
625 q_tmp2 = vqdmulhq_s16 (q_tmp1, q2_tw.val[1]);
626 q_tmp3 = vqdmulhq_s16 (q_tmp0, q2_tw.val[1]);
627 q_fok_r = vaddq_s16 (q_fok_r, q_tmp2);
628 q_fok_i = vsubq_s16 (q_fok_i, q_tmp3);
629
630 q_dst2_r = vsubq_s16 (q_fek_r, q_fok_r);
631 q_dst2_i = vsubq_s16 (q_fok_i, q_fek_i);
632 q2_dst.val[0] = vaddq_s16 (q_fek_r, q_fok_r);
633 q2_dst.val[1] = vaddq_s16 (q_fek_i, q_fok_i);
634 q_dst2_r = vrev32q_s16 (q_dst2_r);
635 q_dst2_i = vrev32q_s16 (q_dst2_i);
636 q_dst2_r = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q_dst2_r))) ;
637 q_dst2_i = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q_dst2_i)));
638 q2_dst2.val[0] = vcombine_s16 (vget_high_s16 (q_dst2_r), vget_low_s16 (q_dst2_r));
639 q2_dst2.val[1] = vcombine_s16 (vget_high_s16 (q_dst2_i), vget_low_s16 (q_dst2_i));
640 vst2q_s16 (p_dst, q2_dst);
641 vst2q_s16 (p_dst2, q2_dst2);
642
643 }
644
645 }
646 else
647 {
648 for (k = 1; k <= count ; k += 8)
649 {
650 p_src = (int16_t*) (& (src[k]));
651 p_src2 = (int16_t*) (& (src[ncfft - k - 7]));
652 p_twiddles = (int16_t*) (& (twiddles[k - 1]));
653 p_dst = (int16_t*) (& (dst[k]));
654 p_dst2 = (int16_t*) (& (dst[ncfft - k - 7]));
655
656 q2_fk = vld2q_s16 (p_src);
657 q2_fnkc = vld2q_s16 (p_src2);
658 q2_tw = vld2q_s16 (p_twiddles);
659 q2_fnkc.val[0] = vrev32q_s16 (q2_fnkc.val[0]);
660 q2_fnkc.val[1] = vrev32q_s16 (q2_fnkc.val[1]);
661 q2_fnkc.val[0] = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q2_fnkc.val[0])));
662 q2_fnkc.val[1] = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q2_fnkc.val[1])));
663 q_fnkc_r = vcombine_s16 (vget_high_s16 (q2_fnkc.val[0]), vget_low_s16 (q2_fnkc.val[0]));
664 q_fnkc_i = vcombine_s16 (vget_high_s16 (q2_fnkc.val[1]), vget_low_s16 (q2_fnkc.val[1]));
665 q_fnkc_i = vnegq_s16 (q_fnkc_i);
666
667 q_fek_r = vaddq_s16 (q2_fk.val[0], q_fnkc_r);
668 q_fek_i = vaddq_s16 (q2_fk.val[1], q_fnkc_i);
669 q_tmp0 = vsubq_s16 (q2_fk.val[0], q_fnkc_r);
670 q_tmp1 = vsubq_s16 (q2_fk.val[1], q_fnkc_i);
671
672 q_fok_r = vqdmulhq_s16 (q_tmp0, q2_tw.val[0]);
673 q_fok_i = vqdmulhq_s16 (q_tmp1, q2_tw.val[0]);
674 q_tmp2 = vqdmulhq_s16 (q_tmp1, q2_tw.val[1]);
675 q_tmp3 = vqdmulhq_s16 (q_tmp0, q2_tw.val[1]);
676 q_fok_r = vaddq_s16 (q_fok_r, q_tmp2);
677 q_fok_i = vsubq_s16 (q_fok_i, q_tmp3);
678
679 q_dst2_r = vsubq_s16 (q_fek_r, q_fok_r);
680 q_dst2_i = vsubq_s16 (q_fok_i, q_fek_i);
681 q2_dst.val[0] = vaddq_s16 (q_fek_r, q_fok_r);
682 q2_dst.val[1] = vaddq_s16 (q_fek_i, q_fok_i);
683 q_dst2_r = vrev32q_s16 (q_dst2_r);
684 q_dst2_i = vrev32q_s16 (q_dst2_i);
685 q_dst2_r = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q_dst2_r))) ;
686 q_dst2_i = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q_dst2_i)));
687 q2_dst2.val[0] = vcombine_s16 (vget_high_s16 (q_dst2_r), vget_low_s16 (q_dst2_r));
688 q2_dst2.val[1] = vcombine_s16 (vget_high_s16 (q_dst2_i), vget_low_s16 (q_dst2_i));
689 vst2q_s16 (p_dst, q2_dst);
690 vst2q_s16 (p_dst2, q2_dst2);
691
692 }
693 }
694 }
695 else
696 {
697
698 for (k = 1; k <= ncfft / 2; k++)
699 {
700 fk = src[k];
701 fnkc.r = src[ncfft - k].r;
702 fnkc.i = -src[ncfft - k].i;
703 if (scaled_flag)
704 {
705 NE10_F2I16_FIXDIV (fk, 2);
706 NE10_F2I16_FIXDIV (fnkc, 2);
707 }
708
709 fek.r = fk.r + fnkc.r;
710 fek.i = fk.i + fnkc.i;
711
712 tmp.r = fk.r - fnkc.r;
713 tmp.i = fk.i - fnkc.i;
714
715 fok.r = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) tmp.r * (twiddles[k - 1]).r
716 + (NE10_F2I16_SAMPPROD) tmp.i * (twiddles[k - 1]).i) >> NE10_F2I16_SHIFT);
717 fok.i = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) tmp.i * (twiddles[k - 1]).r
718 - (NE10_F2I16_SAMPPROD) tmp.r * (twiddles[k - 1]).i) >> NE10_F2I16_SHIFT);
719
720 dst[k].r = fek.r + fok.r;
721 dst[k].i = fek.i + fok.i;
722
723 dst[ncfft - k].r = fek.r - fok.r;
724 dst[ncfft - k].i = fok.i - fek.i;
725 }
726 }
727}
728
746void ne10_fft_c2c_1d_int16_neon (ne10_fft_cpx_int16_t *fout,
747 ne10_fft_cpx_int16_t *fin,
748 ne10_fft_cfg_int16_t cfg,
749 ne10_int32_t inverse_fft,
750 ne10_int32_t scaled_flag)
751{
752 if (scaled_flag)
753 {
754 if (inverse_fft)
755 {
756 switch (cfg->nfft)
757 {
758 case 4:
759 ne10_fft4_backward_int16_scaled (fout, fin);
760 break;
761 case 8:
762 ne10_fft8_backward_int16_scaled (fout, fin);
763 break;
764 default:
765 ne10_mixed_radix_fft_backward_int16_scaled_neon (fout, fin, cfg->factors, cfg->twiddles, cfg->buffer);
766 break;
767 }
768 }
769 else
770 {
771 switch (cfg->nfft)
772 {
773 case 4:
774 ne10_fft4_forward_int16_scaled (fout, fin);
775 break;
776 case 8:
777 ne10_fft8_forward_int16_scaled (fout, fin);
778 break;
779 default:
780 ne10_mixed_radix_fft_forward_int16_scaled_neon (fout, fin, cfg->factors, cfg->twiddles, cfg->buffer);
781 break;
782 }
783 }
784 }
785 else
786 {
787 if (inverse_fft)
788 {
789 switch (cfg->nfft)
790 {
791 case 4:
792 ne10_fft4_backward_int16_unscaled (fout, fin);
793 break;
794 case 8:
795 ne10_fft8_backward_int16_unscaled (fout, fin);
796 break;
797 default:
798 ne10_mixed_radix_fft_backward_int16_unscaled_neon (fout, fin, cfg->factors, cfg->twiddles, cfg->buffer);
799 break;
800 }
801 }
802 else
803 {
804 switch (cfg->nfft)
805 {
806 case 4:
807 ne10_fft4_forward_int16_unscaled (fout, fin);
808 break;
809 case 8:
810 ne10_fft8_forward_int16_unscaled (fout, fin);
811 break;
812 default:
813 ne10_mixed_radix_fft_forward_int16_unscaled_neon (fout, fin, cfg->factors, cfg->twiddles, cfg->buffer);
814 break;
815 }
816 }
817 }
818}
819
//end of C2C_FFT_IFFT group
823
840void ne10_fft_r2c_1d_int16_neon (ne10_fft_cpx_int16_t *fout,
841 ne10_int16_t *fin,
842 ne10_fft_r2c_cfg_int16_t cfg,
843 ne10_int32_t scaled_flag)
844{
845 ne10_fft_cpx_int16_t * tmpbuf1 = cfg->buffer;
846 ne10_fft_cpx_int16_t * tmpbuf2 = cfg->buffer + cfg->ncfft;
847 ne10_fft_state_int16_t c2c_state;
848
849 c2c_state.nfft = cfg->ncfft;
850 c2c_state.factors = cfg->factors;
851 c2c_state.twiddles = cfg->twiddles;
852 c2c_state.buffer = tmpbuf2;
853
854 ne10_fft_c2c_1d_int16_neon (tmpbuf1, (ne10_fft_cpx_int16_t*) fin, &c2c_state, 0, scaled_flag);
855 ne10_fft_split_r2c_1d_int16_neon (fout, tmpbuf1, cfg->super_twiddles, cfg->ncfft, scaled_flag);
856}
868void ne10_fft_c2r_1d_int16_neon (ne10_int16_t *fout,
869 ne10_fft_cpx_int16_t *fin,
870 ne10_fft_r2c_cfg_int16_t cfg,
871 ne10_int32_t scaled_flag)
872{
873 ne10_fft_cpx_int16_t * tmpbuf1 = cfg->buffer;
874 ne10_fft_cpx_int16_t * tmpbuf2 = cfg->buffer + cfg->ncfft;
875 ne10_fft_state_int16_t c2c_state;
876
877 c2c_state.nfft = cfg->ncfft;
878 c2c_state.factors = cfg->factors;
879 c2c_state.twiddles = cfg->twiddles;
880 c2c_state.buffer = tmpbuf2;
881
882 ne10_fft_split_c2r_1d_int16_neon (tmpbuf1, fin, cfg->super_twiddles, cfg->ncfft, scaled_flag);
883 ne10_fft_c2c_1d_int16_neon ( (ne10_fft_cpx_int16_t*) fout, tmpbuf1, &c2c_state, 1, scaled_flag);
884}
void ne10_fft_c2c_1d_int16_neon(ne10_fft_cpx_int16_t *fout, ne10_fft_cpx_int16_t *fin, ne10_fft_cfg_int16_t cfg, ne10_int32_t inverse_fft, ne10_int32_t scaled_flag)
Mixed radix-2/4 complex FFT/IFFT of 16-bit fixed point data.
void ne10_fft_r2c_1d_int16_neon(ne10_fft_cpx_int16_t *fout, ne10_int16_t *fin, ne10_fft_r2c_cfg_int16_t cfg, ne10_int32_t scaled_flag)
Mixed radix-2/4 FFT (real to complex) of int16 data.
void ne10_fft_c2r_1d_int16_neon(ne10_int16_t *fout, ne10_fft_cpx_int16_t *fin, ne10_fft_r2c_cfg_int16_t cfg, ne10_int32_t scaled_flag)
Mixed radix-2/4 IFFT (complex to real) of int16 data.
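A minimal usage sketch for the three public entry points in this file. It is not part of the library source: the allocation and teardown helpers (ne10_fft_alloc_c2c_int16, ne10_fft_destroy_c2c_int16) and the umbrella header name are assumptions drawn from the companion C implementation, so check NE10_dsp.h for the exact declarations in your Ne10 version.

/* Usage sketch (not part of NE10_fft_int16.neon.c). The alloc/destroy helpers
 * named below are assumed from the companion C implementation; verify the
 * exact prototypes in NE10_dsp.h before use. */
#include "NE10.h"

void example_int16_fft (void)
{
    const ne10_int32_t nfft = 1024;               /* power of two, >= 4 */
    ne10_fft_cpx_int16_t in[1024];
    ne10_fft_cpx_int16_t out[1024];

    /* Assumed allocation helper: builds factors, twiddles and scratch buffer. */
    ne10_fft_cfg_int16_t cfg = ne10_fft_alloc_c2c_int16 (nfft);
    if (cfg == NULL)
        return;

    /* ... fill in[] with Q15 samples ... */

    /* scaled_flag = 1: right-shift at each stage so int16 cannot overflow. */
    ne10_fft_c2c_1d_int16_neon (out, in, cfg, 0, 1);   /* forward */
    ne10_fft_c2c_1d_int16_neon (in, out, cfg, 1, 1);   /* inverse */

    ne10_fft_destroy_c2c_int16 (cfg);                  /* assumed teardown helper */
}

For the real-valued pair, ne10_fft_r2c_1d_int16_neon and ne10_fft_c2r_1d_int16_neon take an ne10_fft_r2c_cfg_int16_t configuration instead, and the complex half-spectrum buffer holds ncfft + 1 bins (indices 0..ncfft), since the split routines above read and write dst[ncfft].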