Project Ne10
An Open Optimized Software Library Project for the ARM Architecture
NE10_fft_generic_float32.neonintrinsic.cpp
/*
 * Copyright 2014-15 ARM Limited and Contributors.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *   * Neither the name of ARM Limited nor the
 *     names of its contributors may be used to endorse or promote products
 *     derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL ARM LIMITED BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/* license of Kiss FFT */
/*
Copyright (c) 2003-2010, Mark Borgerding

All rights reserved.

Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
    * Neither the author nor the names of any contributors may be used to endorse or promote products derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/*
 * NE10 Library : dsp/NE10_fft_generic_float32.neonintrinsic.cpp
 *
 * This file must be compiled with a C++ toolchain because some functions are
 * written as templates, which makes it easier for the compiler to eliminate
 * branches.
 */

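// A minimal sketch (illustration only, not part of the library) of the
// template rationale above: with a parameter such as is_inverse fixed at
// compile time, each instantiation constant-folds its "if (is_inverse == 1)"
// tests away, leaving branch-free hot loops.
#if 0
template<int is_inverse>
static void demo_kernel (float *x, int n)
{
    for (int i = 0; i < n; i++)
    {
        if (is_inverse == 1) // folded away per instantiation
        {
            x[i] = -x[i];
        }
        x[i] *= 2.0f;
    }
}
// demo_kernel<0> and demo_kernel<1> compile to two separate, branch-free loops.
#endif
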
#include "NE10_types.h"
#include "NE10_macros.h"
#include "NE10_fft.neonintrinsic.h"
#include "NE10_fft_generic_float32.h"

typedef float32x4x2_t CPLX;
typedef float32x4_t REAL;
#define NE10_REAL_DUP_NEON_F32 vdupq_n_f32
#define NE10_CPLX_LOAD(PTR) vld2q_f32 ((ne10_float32_t*) (PTR))
#define NE10_CPLX_STORE(PTR,OUT) \
    do { \
        vst2q_f32 ((ne10_float32_t*) (PTR), OUT); \
    } while (0)

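// Layout note: CPLX holds four complex values in planar form. vld2q_f32
// de-interleaves r0,i0,r1,i1,r2,i2,r3,i3 into val[0] = {r0,r1,r2,r3} and
// val[1] = {i0,i1,i2,i3}, so each vector operation below advances four
// independent sub-transforms in lockstep; vst2q_f32 re-interleaves on store.
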
static inline void NE10_LOAD_TW_AND_MUL (CPLX &scratch_in,
        const ne10_fft_cpx_float32_t *ptr_in)
{
    CPLX scratch_tw;
    float32x2_t d2_tmp = vld1_f32 ((ne10_float32_t *) ptr_in);
    scratch_tw.val[0] = NE10_REAL_DUP_NEON_F32 (d2_tmp[0]);
    scratch_tw.val[1] = NE10_REAL_DUP_NEON_F32 (d2_tmp[1]);
    NE10_CPX_MUL_NEON_F32 (scratch_in, scratch_in, scratch_tw);
}

static inline REAL NE10_S_MUL_NEON_F32 (const REAL vec,
        const ne10_float32_t scalar)
{
    REAL scalar_neon = NE10_REAL_DUP_NEON_F32 (scalar);
    REAL result = scalar_neon * vec;
    return result;
}

static inline REAL NE10_S_MLA_NEON_F32 (const REAL dst,
        const REAL src,
        const ne10_float32_t scalar)
{
    REAL scalar_neon = NE10_REAL_DUP_NEON_F32 (scalar);
    return vmlaq_f32 (dst, src, scalar_neon);
}

static inline REAL NE10_S_MLS_NEON_F32 (const REAL dst,
        const REAL src,
        const ne10_float32_t scalar)
{
    REAL scalar_neon = NE10_REAL_DUP_NEON_F32 (scalar);
    return vmlsq_f32 (dst, src, scalar_neon);
}

////////////////
// Multiply input with twiddles
////////////////
static inline void NE10_FFT2_MUL_TW_NEON (CPLX scratch_out[2],
        const CPLX scratch_in[2],
        const CPLX scratch_tw[1])
{
    scratch_out[0] = scratch_in[0];
    NE10_CPX_MUL_NEON_F32 (scratch_out[1], scratch_in[1], scratch_tw[0]);
}

static inline void NE10_FFT3_MUL_TW_NEON (CPLX scratch_out[3],
        const CPLX scratch_in[3],
        const CPLX scratch_tw[2])
{
    NE10_FFT2_MUL_TW_NEON (scratch_out, scratch_in, scratch_tw);
    NE10_CPX_MUL_NEON_F32 (scratch_out[2], scratch_in[2], scratch_tw[1]);
}

static inline void NE10_FFT4_MUL_TW_NEON (CPLX scratch_out[4],
        const CPLX scratch_in[4],
        const CPLX scratch_tw[3])
{
    NE10_FFT3_MUL_TW_NEON (scratch_out, scratch_in, scratch_tw);
    NE10_CPX_MUL_NEON_F32 (scratch_out[3], scratch_in[3], scratch_tw[2]);
}

static inline void NE10_FFT5_MUL_TW_NEON (CPLX scratch_out[5],
        const CPLX scratch_in[5],
        const CPLX scratch_tw[4])
{
    NE10_FFT4_MUL_TW_NEON (scratch_out, scratch_in, scratch_tw);
    NE10_CPX_MUL_NEON_F32 (scratch_out[4], scratch_in[4], scratch_tw[3]);
}

////////////////
// Conjugate in place.
////////////////
static inline void NE10_FFT2_CONJ (CPLX scratch_out[2])
{
    scratch_out[0].val[1] = -scratch_out[0].val[1];
    scratch_out[1].val[1] = -scratch_out[1].val[1];
}

static inline void NE10_FFT3_CONJ (CPLX scratch_out[3])
{
    NE10_FFT2_CONJ (scratch_out);
    scratch_out[2].val[1] = -scratch_out[2].val[1];
}

static inline void NE10_FFT4_CONJ (CPLX scratch_out[4])
{
    NE10_FFT3_CONJ (scratch_out);
    scratch_out[3].val[1] = -scratch_out[3].val[1];
}

static inline void NE10_FFT5_CONJ (CPLX scratch_out[5])
{
    NE10_FFT4_CONJ (scratch_out);
    scratch_out[4].val[1] = -scratch_out[4].val[1];
}

static inline void NE10_FFT8_CONJ (CPLX scratch_out[8])
{
    NE10_FFT5_CONJ (scratch_out);
    scratch_out[5].val[1] = -scratch_out[5].val[1];
    scratch_out[6].val[1] = -scratch_out[6].val[1];
    scratch_out[7].val[1] = -scratch_out[7].val[1];
}

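// Note: these conjugate helpers let the inverse transform reuse the forward
// kernels via the identity IDFT(x) = conj(DFT(conj(x))) / N. The is_inverse
// paths below conjugate the inputs, run the forward butterfly, then
// conjugate (and optionally scale) the outputs.
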
////////////////
// Scaling
// If the macro NE10_DSP_CFFT_SCALING is not defined, these functions do nothing.
////////////////
static inline void NE10_FFT2_SCALING (CPLX scratch_out[2],
        const REAL one_by_fft_neon)
{
#ifdef NE10_DSP_CFFT_SCALING
    scratch_out[0].val[0] *= one_by_fft_neon;
    scratch_out[0].val[1] *= one_by_fft_neon;
    scratch_out[1].val[0] *= one_by_fft_neon;
    scratch_out[1].val[1] *= one_by_fft_neon;
#endif
}

static inline void NE10_FFT3_SCALING (CPLX scratch_out[3],
        const REAL one_by_fft_neon)
{
#ifdef NE10_DSP_CFFT_SCALING
    NE10_FFT2_SCALING (scratch_out, one_by_fft_neon);
    scratch_out[2].val[0] *= one_by_fft_neon;
    scratch_out[2].val[1] *= one_by_fft_neon;
#endif
}

static inline void NE10_FFT4_SCALING (CPLX scratch_out[4],
        const REAL one_by_fft_neon)
{
#ifdef NE10_DSP_CFFT_SCALING
    NE10_FFT3_SCALING (scratch_out, one_by_fft_neon);
    scratch_out[3].val[0] *= one_by_fft_neon;
    scratch_out[3].val[1] *= one_by_fft_neon;
#endif
}

static inline void NE10_FFT5_SCALING (CPLX scratch_out[5],
        const REAL one_by_fft_neon)
{
#ifdef NE10_DSP_CFFT_SCALING
    NE10_FFT4_SCALING (scratch_out, one_by_fft_neon);
    scratch_out[4].val[0] *= one_by_fft_neon;
    scratch_out[4].val[1] *= one_by_fft_neon;
#endif
}

static inline void NE10_FFT8_SCALING (CPLX scratch_out[8],
        const REAL one_by_fft_neon)
{
#ifdef NE10_DSP_CFFT_SCALING
    NE10_FFT5_SCALING (scratch_out, one_by_fft_neon);
    scratch_out[5].val[0] *= one_by_fft_neon;
    scratch_out[5].val[1] *= one_by_fft_neon;
    scratch_out[6].val[0] *= one_by_fft_neon;
    scratch_out[6].val[1] *= one_by_fft_neon;
    scratch_out[7].val[0] *= one_by_fft_neon;
    scratch_out[7].val[1] *= one_by_fft_neon;
#endif
}

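// Build note: scaling in this intrinsics path is gated at compile time.
// Unless NE10_DSP_CFFT_SCALING is defined (e.g. passed on the compiler
// command line, which is an assumption about the build setup), the helpers
// above compile to nothing even when the is_scaled template argument is
// true. The NE10_INLINE_ASM_OPT paths below scale on is_scaled alone.
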
////////////////
// FFT Kernel
// F: Forward
// C: Complex
// U: Unscaled
////////////////
static inline void NE10_FFT2_FUC_NEON_F32 (CPLX scratch_out[2],
        const CPLX scratch_in[2])
{
    NE10_CPX_ADD_NEON_F32 (scratch_out[0], scratch_in[0], scratch_in[1]);
    NE10_CPX_SUB_NEON_F32 (scratch_out[1], scratch_in[0], scratch_in[1]);
}

static inline void NE10_FFT3_FUC_NEON_F32 (CPLX Fout[3],
        const CPLX Fin[3])
{
    const float32x4_t TW_3IN_NEON_F32 = vdupq_n_f32 (TW_3IN_F32);
    const float32x4_t HALF_NEON_F32 = vdupq_n_f32 (0.5f);

    NE10_CPX_ADD_NEON_F32 (Fout[2], Fin[1], Fin[2]);
    NE10_CPX_SUB_NEON_F32 (Fout[0], Fin[1], Fin[2]);

    Fout[1].val[0] = Fin[0].val[0] - Fout[2].val[0] * HALF_NEON_F32;
    Fout[1].val[1] = Fin[0].val[1] - Fout[2].val[1] * HALF_NEON_F32;

    Fout[0].val[0] = Fout[0].val[0] * TW_3IN_NEON_F32;
    Fout[0].val[1] = Fout[0].val[1] * TW_3IN_NEON_F32;
}

static inline void NE10_FFT4_FUC_NEON_F32 (CPLX scratch_out[4],
        const CPLX scratch_in[4])
{
    CPLX scratch[4];

    NE10_CPX_ADD_NEON_F32 (scratch[0], scratch_in[0], scratch_in[2]);
    NE10_CPX_SUB_NEON_F32 (scratch[1], scratch_in[0], scratch_in[2]);
    NE10_CPX_ADD_NEON_F32 (scratch[2], scratch_in[1], scratch_in[3]);
    NE10_CPX_SUB_NEON_F32 (scratch[3], scratch_in[1], scratch_in[3]);

    NE10_CPX_SUB_NEON_F32 (scratch_out[2], scratch[0], scratch[2]);
    NE10_CPX_ADD_NEON_F32 (scratch_out[0], scratch[0], scratch[2]);

    scratch_out[1].val[0] = scratch[1].val[0] + scratch[3].val[1];
    scratch_out[1].val[1] = scratch[1].val[1] - scratch[3].val[0];
    scratch_out[3].val[0] = scratch[1].val[0] - scratch[3].val[1];
    scratch_out[3].val[1] = scratch[1].val[1] + scratch[3].val[0];
}

static inline void NE10_FFT4_FUC_INPLACE_NEON_F32 (CPLX scratch_out[4])
{
    CPLX scratch[4];

    NE10_CPX_ADD_NEON_F32 (scratch[0], scratch_out[0], scratch_out[2]);
    NE10_CPX_SUB_NEON_F32 (scratch[1], scratch_out[0], scratch_out[2]);
    NE10_CPX_ADD_NEON_F32 (scratch[2], scratch_out[1], scratch_out[3]);
    NE10_CPX_SUB_NEON_F32 (scratch[3], scratch_out[1], scratch_out[3]);

    NE10_CPX_SUB_NEON_F32 (scratch_out[2], scratch[0], scratch[2]);
    NE10_CPX_ADD_NEON_F32 (scratch_out[0], scratch[0], scratch[2]);

    scratch_out[1].val[0] = scratch[1].val[0] + scratch[3].val[1];
    scratch_out[1].val[1] = scratch[1].val[1] - scratch[3].val[0];
    scratch_out[3].val[0] = scratch[1].val[0] - scratch[3].val[1];
    scratch_out[3].val[1] = scratch[1].val[1] + scratch[3].val[0];
}

static inline void NE10_FFT5_FUC_INPLACE_NEON_F32 (CPLX Fout[5])
{
    CPLX s[6];

    NE10_CPX_ADD_NEON_F32 (s[1], Fout[1], Fout[4]);
    NE10_CPX_ADD_NEON_F32 (s[2], Fout[2], Fout[3]);

    s[0] = Fout[0];
    s[5] = Fout[0];

    Fout[0].val[0] = Fout[0].val[0] + s[1].val[0] + s[2].val[0];
    Fout[0].val[1] = Fout[0].val[1] + s[1].val[1] + s[2].val[1];

    s[0].val[0] = NE10_S_MLA_NEON_F32 (s[0].val[0], s[1].val[0], TW_5A_F32.r);
    s[0].val[1] = NE10_S_MLA_NEON_F32 (s[0].val[1], s[1].val[1], TW_5A_F32.r);
    s[5].val[0] = NE10_S_MLA_NEON_F32 (s[5].val[0], s[1].val[0], TW_5B_F32.r);
    s[5].val[1] = NE10_S_MLA_NEON_F32 (s[5].val[1], s[1].val[1], TW_5B_F32.r);

    s[0].val[0] = NE10_S_MLA_NEON_F32 (s[0].val[0], s[2].val[0], TW_5B_F32.r);
    s[0].val[1] = NE10_S_MLA_NEON_F32 (s[0].val[1], s[2].val[1], TW_5B_F32.r);
    s[5].val[0] = NE10_S_MLA_NEON_F32 (s[5].val[0], s[2].val[0], TW_5A_F32.r);
    s[5].val[1] = NE10_S_MLA_NEON_F32 (s[5].val[1], s[2].val[1], TW_5A_F32.r);

    NE10_CPX_SUB_NEON_F32 (s[4], Fout[1], Fout[4]);
    NE10_CPX_SUB_NEON_F32 (s[3], Fout[2], Fout[3]);

    s[1].val[0] = NE10_S_MUL_NEON_F32 (s[4].val[1], TW_5A_F32.i);
    s[1].val[1] = -NE10_S_MUL_NEON_F32 (s[4].val[0], TW_5A_F32.i);
    s[2].val[0] = -NE10_S_MUL_NEON_F32 (s[4].val[1], TW_5B_F32.i);
    s[2].val[1] = NE10_S_MUL_NEON_F32 (s[4].val[0], TW_5B_F32.i);

    s[1].val[0] = NE10_S_MLA_NEON_F32 (s[1].val[0], s[3].val[1], TW_5B_F32.i);
    s[1].val[1] = NE10_S_MLS_NEON_F32 (s[1].val[1], s[3].val[0], TW_5B_F32.i);
    s[2].val[0] = NE10_S_MLA_NEON_F32 (s[2].val[0], s[3].val[1], TW_5A_F32.i);
    s[2].val[1] = NE10_S_MLS_NEON_F32 (s[2].val[1], s[3].val[0], TW_5A_F32.i);

    NE10_CPX_SUB_NEON_F32 (Fout[1], s[0], s[1]);
    NE10_CPX_ADD_NEON_F32 (Fout[4], s[0], s[1]);
    NE10_CPX_ADD_NEON_F32 (Fout[2], s[5], s[2]);
    NE10_CPX_SUB_NEON_F32 (Fout[3], s[5], s[2]);
}
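
// Note: TW_5A_F32 and TW_5B_F32 (defined elsewhere in the library) are the
// radix-5 twiddle factors, presumably e^(-2*pi*i/5) and e^(-4*pi*i/5): their
// real parts weight the symmetric sums s[1] and s[2], while their imaginary
// parts weight the differences s[4] and s[3].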

#define NE10_BUTTERFLY_INDEX_NEON_F32(OUT,IN,OUT_I,OUT_J,IN_I,IN_J) \
    do { \
        NE10_CPX_ADD_NEON_F32 (OUT[OUT_I],IN[IN_I],IN[IN_J]); \
        NE10_CPX_SUB_NEON_F32 (OUT[OUT_J],IN[IN_I],IN[IN_J]); \
    } while (0)

static inline void NE10_FFT8_FUC_NEON_F32 (CPLX out[8],
        const CPLX in[8])
{
    CPLX s[8];
    static const ne10_fft_cpx_float32_t TW_8[4] =
    {
        {  1.00000,  0.00000 },
        {  0.70711, -0.70711 },
        {  0.00000, -1.00000 },
        { -0.70711, -0.70711 },
    };

    // STAGE - 1
    // in -> s
    {
        NE10_BUTTERFLY_INDEX_NEON_F32 (s, in, 0, 4, 0, 4);
        NE10_BUTTERFLY_INDEX_NEON_F32 (s, in, 1, 5, 1, 5);
        NE10_BUTTERFLY_INDEX_NEON_F32 (s, in, 2, 6, 2, 6);
        NE10_BUTTERFLY_INDEX_NEON_F32 (s, in, 3, 7, 3, 7);
    }

    // STAGE - 2
    // s -> out
    {
        // TW
#define NE10_CPX_MUL_TW8_NEON_F32(OUT,TW_8_TABLE,OUT_I,TW_J) \
        do { \
            ne10_fft_cpx_float32_t TW_TMP = TW_8_TABLE[TW_J]; \
            CPLX TW_TMP_NEON; \
            TW_TMP_NEON.val[0] = NE10_REAL_DUP_NEON_F32 (TW_TMP.r); \
            TW_TMP_NEON.val[1] = NE10_REAL_DUP_NEON_F32 (TW_TMP.i); \
            NE10_CPX_MUL_NEON_F32 (OUT[OUT_I],OUT[OUT_I],TW_TMP_NEON); \
        } while (0)

        NE10_CPX_MUL_TW8_NEON_F32 (s, TW_8, 4, 0);
        NE10_CPX_MUL_TW8_NEON_F32 (s, TW_8, 5, 1);
        NE10_CPX_MUL_TW8_NEON_F32 (s, TW_8, 6, 2);
        NE10_CPX_MUL_TW8_NEON_F32 (s, TW_8, 7, 3);

        NE10_BUTTERFLY_INDEX_NEON_F32 (out, s, 0, 2, 0, 2);
        NE10_BUTTERFLY_INDEX_NEON_F32 (out, s, 1, 3, 1, 3);
        NE10_BUTTERFLY_INDEX_NEON_F32 (out, s, 4, 6, 4, 6);
        NE10_BUTTERFLY_INDEX_NEON_F32 (out, s, 5, 7, 5, 7);
    }
    // STAGE - 3
    // out -> s
    {
        // TW
        NE10_CPX_MUL_TW8_NEON_F32 (out, TW_8, 2, 0);
        NE10_CPX_MUL_TW8_NEON_F32 (out, TW_8, 3, 2);
        NE10_CPX_MUL_TW8_NEON_F32 (out, TW_8, 6, 0);
        NE10_CPX_MUL_TW8_NEON_F32 (out, TW_8, 7, 2);
#undef NE10_CPX_MUL_TW8_NEON_F32

        NE10_BUTTERFLY_INDEX_NEON_F32 (s, out, 0, 4, 0, 1);
        NE10_BUTTERFLY_INDEX_NEON_F32 (s, out, 2, 6, 2, 3);
        NE10_BUTTERFLY_INDEX_NEON_F32 (s, out, 1, 5, 4, 5);
        NE10_BUTTERFLY_INDEX_NEON_F32 (s, out, 3, 7, 6, 7);
    }

    out[0] = s[0];
    out[1] = s[1];
    out[2] = s[2];
    out[3] = s[3];
    out[4] = s[4];
    out[5] = s[5];
    out[6] = s[6];
    out[7] = s[7];
}
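
// Note: the radix-8 kernel above is three radix-2 stages with hard-coded
// twiddles; the +-0.70711 entries in TW_8 are +-sqrt(2)/2, i.e. e^(-i*pi/4)
// and e^(-i*3*pi/4) rounded to five decimal places.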

////////////////
// Following are butterfly functions
////////////////
template<ne10_int32_t is_first_stage, ne10_int32_t is_inverse, bool is_scaled>
static void ne10_radix_2_butterfly_float32_neon (CPLX *Fout,
        const CPLX *Fin,
        const ne10_fft_cpx_float32_t *twiddles,
        const ne10_int32_t fstride,
        const ne10_int32_t out_step,
        const ne10_int32_t nfft)
{
    CPLX in[2];
    CPLX out[2];

    const ne10_int32_t in_step = nfft / 2;
    ne10_int32_t f_count;
    ne10_int32_t m_count;

    // Note: nfft here is 1/4 of the actual FFT length (four sub-FFTs run in
    // the vector lanes), so 0.25 / nfft equals 1 / (actual length).
    const REAL one_by_fft_neon = NE10_REAL_DUP_NEON_F32 (0.25 / nfft);

    for (f_count = fstride; f_count > 0; f_count--)
    {
        for (m_count = out_step; m_count > 0; m_count--)
        {
#ifndef NE10_INLINE_ASM_OPT
            in[0] = NE10_CPLX_LOAD (Fin + 0 * in_step);
            in[1] = NE10_CPLX_LOAD (Fin + 1 * in_step);

            if (is_inverse == 1)
            {
                NE10_FFT2_CONJ (in);
            }

            if (is_first_stage == 0)
            {
                NE10_LOAD_TW_AND_MUL (in[1], twiddles);
            }

            NE10_FFT2_FUC_NEON_F32 (out, in);

            if (is_inverse == 1)
            {
                NE10_FFT2_CONJ (out);

                if (is_scaled)
                {
                    NE10_FFT2_SCALING (out, one_by_fft_neon);
                }
            }

            NE10_CPLX_STORE (Fout + 0 * out_step, out[0]);
            NE10_CPLX_STORE (Fout + 1 * out_step, out[1]);
#else // NE10_INLINE_ASM_OPT
#ifndef __aarch64__
#error Currently, inline assembly optimizations are only available on AArch64.
#else // __aarch64__
            asm volatile (
                "ld2 {v0.4s, v1.4s}, [%[pin0]] \n\t"
                "ld2 {v2.4s, v3.4s}, [%[pin1]] \n\t"
                :
                : [pin0]"r"(Fin),
                  [pin1]"r"(Fin + in_step)
                : "memory", "v0", "v1", "v2", "v3");

            if (is_inverse == 1)
            {
                asm volatile (
                    "fneg v1.4s, v1.4s \n\t"
                    "fneg v3.4s, v3.4s \n\t"
                    :
                    :
                    : "v0", "v1", "v2", "v3");
            }

            if (is_first_stage == 0)
            {
                asm volatile (
                    "ld1 {v4.1d}, [%[ptw]] \n\t" // tw0

                    "fmul v14.4s, v2.4s, v4.s[1] \n\t" // RI
                    "fmul v2.4s, v2.4s, v4.s[0] \n\t" // RR
                    "fmls v2.4s, v3.4s, v4.s[1] \n\t" // RR - II
                    "fmul v15.4s, v3.4s, v4.s[0] \n\t" // IR
                    "fadd v3.4s, v14.4s, v15.4s \n\t" // RI + IR
                    :
                    : [ptw]"r"(twiddles)
                    : "memory", "v0", "v1", "v2", "v3", "v4", "v14", "v15");
            }

            asm volatile (
                "fadd v4.4s, v0.4s, v2.4s \n\t"
                "fadd v5.4s, v1.4s, v3.4s \n\t"
                "fsub v6.4s, v0.4s, v2.4s \n\t"
                "fsub v7.4s, v1.4s, v3.4s \n\t"
                :
                :
                : "memory",
                  "v0", "v1", "v2", "v3", // in
                  "v4", "v5", "v6", "v7"); // out

            if (is_inverse == 1)
            {
                asm volatile (
                    "fneg v5.4s, v5.4s \n\t"
                    "fneg v7.4s, v7.4s \n\t"
                    :
                    :
                    : "v4", "v5", "v6", "v7");
            }

            if (is_scaled == 1)
            {
                asm volatile (
                    "fmul v4.4s, v4.4s, %[one_by_nfft].4s \n\t"
                    "fmul v5.4s, v5.4s, %[one_by_nfft].4s \n\t"
                    "fmul v6.4s, v6.4s, %[one_by_nfft].4s \n\t"
                    "fmul v7.4s, v7.4s, %[one_by_nfft].4s \n\t"
                    :
                    : [one_by_nfft]"w"(one_by_fft_neon)
                    : "v4", "v5", "v6", "v7");
            }

            asm volatile (
                "st2 {v4.4s, v5.4s}, [%[pout0]] \n\t"
                "st2 {v6.4s, v7.4s}, [%[pout1]] \n\t"
                :
                : [pout0]"r"(Fout),
                  [pout1]"r"(Fout + out_step)
                : "memory", "v4", "v5", "v6", "v7");
#endif // __aarch64__
#endif // NE10_INLINE_ASM_OPT

            Fin++;

            if (is_first_stage == 0)
            {
                Fout++;
                twiddles++;
            }
            else
            {
                Fout += 2;
            }
        }
        if (is_first_stage == 0)
        {
            twiddles -= out_step;
            Fout += (2 - 1) * out_step;
        }
    }
}

template<ne10_int32_t is_first_stage, ne10_int32_t is_inverse, bool is_scaled>
static void ne10_radix_4_butterfly_float32_neon (CPLX *Fout,
        const CPLX *Fin,
        const ne10_fft_cpx_float32_t *twiddles,
        const ne10_int32_t fstride,
        const ne10_int32_t out_step,
        const ne10_int32_t nfft)
{
    CPLX in[4];
#ifdef NE10_INLINE_ASM_OPT
    CPLX s[4];
#endif

    const ne10_int32_t in_step = nfft / 4;
    ne10_int32_t f_count;
    ne10_int32_t m_count;

    const REAL one_by_fft_neon = NE10_REAL_DUP_NEON_F32 (0.25 / nfft);

    for (f_count = fstride; f_count > 0; f_count--)
    {
        for (m_count = out_step; m_count > 0; m_count--)
        {
#ifndef NE10_INLINE_ASM_OPT
            in[0] = NE10_CPLX_LOAD (Fin + 0 * in_step);
            in[1] = NE10_CPLX_LOAD (Fin + 1 * in_step);
            in[2] = NE10_CPLX_LOAD (Fin + 2 * in_step);
            in[3] = NE10_CPLX_LOAD (Fin + 3 * in_step);

            if (is_inverse == 1)
            {
                NE10_FFT4_CONJ (in);
            }

            if (is_first_stage == 0)
            {
                NE10_LOAD_TW_AND_MUL (in[1], twiddles);
                NE10_LOAD_TW_AND_MUL (in[2], twiddles + out_step);
                NE10_LOAD_TW_AND_MUL (in[3], twiddles + out_step * 2);
            }

            NE10_FFT4_FUC_INPLACE_NEON_F32 (in);

            if (is_inverse == 1)
            {
                NE10_FFT4_CONJ (in);
            }
            if (is_scaled)
            {
                NE10_FFT4_SCALING (in, one_by_fft_neon);
            }

            NE10_CPLX_STORE (Fout + 0 * out_step, in[0]);
            NE10_CPLX_STORE (Fout + 1 * out_step, in[1]);
            NE10_CPLX_STORE (Fout + 2 * out_step, in[2]);
            NE10_CPLX_STORE (Fout + 3 * out_step, in[3]);
#else // NE10_INLINE_ASM_OPT
#ifndef __aarch64__
#error Currently, inline assembly optimizations are only available on AArch64.
#else // __aarch64__
#define NEON_REGISTERS_C2C_R4 \
    "v0", "v1", \
    "v2", "v3", \
    "v4", "v5", \
    "v6", "v7"
#define NEON_REGISTERS_C2C_TW_R4 \
    "v8", "v9", \
    "v10", "v11", \
    "v12", "v13"

            asm volatile (
                "ld2 {v0.4s, v1.4s}, [%[pin0]] \n\t" // in[0]
                "ld2 {v2.4s, v3.4s}, [%[pin1]] \n\t" // in[1]
                "ld2 {v4.4s, v5.4s}, [%[pin2]] \n\t" // in[2]
                "ld2 {v6.4s, v7.4s}, [%[pin3]] \n\t" // in[3]
                :
                : [pin0]"r"(Fin),
                  [pin1]"r"(Fin + in_step),
                  [pin2]"r"(Fin + in_step * 2),
                  [pin3]"r"(Fin + in_step * 3)
                : "memory", NEON_REGISTERS_C2C_R4);

            if (is_inverse == 1)
            {
                asm volatile (
                    "fneg v1.4s, v1.4s \n\t"
                    "fneg v3.4s, v3.4s \n\t"
                    "fneg v5.4s, v5.4s \n\t"
                    "fneg v7.4s, v7.4s \n\t"
                    :
                    :
                    : NEON_REGISTERS_C2C_R4);
            }

            if (is_first_stage == 0)
            {
                asm volatile (
                    "ld1 { v8.1d}, [%[ptw0]] \n\t" // tw0
                    "ld1 { v9.1d}, [%[ptw1]] \n\t" // tw1
                    "ld1 {v10.1d}, [%[ptw2]] \n\t" // tw2

                    "fmul v14.4s, v2.4s, v8.s[1] \n\t" // RI
                    "fmul v2.4s, v2.4s, v8.s[0] \n\t" // RR
                    "fmls v2.4s, v3.4s, v8.s[1] \n\t" // RR - II
                    "fmul v15.4s, v3.4s, v8.s[0] \n\t" // IR
                    "fadd v3.4s, v14.4s, v15.4s \n\t" // RI + IR

                    "fmul v14.4s, v4.4s, v9.s[1] \n\t" // RI
                    "fmul v4.4s, v4.4s, v9.s[0] \n\t" // RR
                    "fmls v4.4s, v5.4s, v9.s[1] \n\t" // RR - II
                    "fmul v15.4s, v5.4s, v9.s[0] \n\t" // IR
                    "fadd v5.4s, v14.4s, v15.4s \n\t" // RI + IR

                    "fmul v14.4s, v6.4s, v10.s[1] \n\t" // RI
                    "fmul v6.4s, v6.4s, v10.s[0] \n\t" // RR
                    "fmls v6.4s, v7.4s, v10.s[1] \n\t" // RR - II
                    "fmul v15.4s, v7.4s, v10.s[0] \n\t" // IR
                    "fadd v7.4s, v14.4s, v15.4s \n\t" // RI + IR
                    :
                    : [ptw0]"r"(twiddles),
                      [ptw1]"r"(twiddles + out_step),
                      [ptw2]"r"(twiddles + out_step * 2)
                    : "memory", NEON_REGISTERS_C2C_R4,
                      NEON_REGISTERS_C2C_TW_R4, "v14", "v15");
            }

            asm volatile (
                "fadd %[s0r].4s, v0.4s, v4.4s \n\t"
                "fadd %[s0i].4s, v1.4s, v5.4s \n\t"

                "fsub %[s1r].4s, v0.4s, v4.4s \n\t"
                "fsub %[s1i].4s, v1.4s, v5.4s \n\t"

                "fadd %[s2r].4s, v2.4s, v6.4s \n\t"
                "fadd %[s2i].4s, v3.4s, v7.4s \n\t"

                "fsub %[s3r].4s, v2.4s, v6.4s \n\t"
                "fsub %[s3i].4s, v3.4s, v7.4s \n\t"
                : [s0r]"+w"(s[0].val[0]),
                  [s0i]"+w"(s[0].val[1]),
                  [s1r]"+w"(s[1].val[0]),
                  [s1i]"+w"(s[1].val[1]),
                  [s2r]"+w"(s[2].val[0]),
                  [s2i]"+w"(s[2].val[1]),
                  [s3r]"+w"(s[3].val[0]),
                  [s3i]"+w"(s[3].val[1])
                :
                : NEON_REGISTERS_C2C_R4);

            asm volatile (
                "fadd v0.4s, %[s0r].4s, %[s2r].4s \n\t"
                "fadd v1.4s, %[s0i].4s, %[s2i].4s \n\t"
                "fsub v4.4s, %[s0r].4s, %[s2r].4s \n\t"
                "fsub v5.4s, %[s0i].4s, %[s2i].4s \n\t"

                "fadd v2.4s, %[s1r].4s, %[s3i].4s \n\t"
                "fsub v3.4s, %[s1i].4s, %[s3r].4s \n\t"
                "fsub v6.4s, %[s1r].4s, %[s3i].4s \n\t"
                "fadd v7.4s, %[s1i].4s, %[s3r].4s \n\t"
                :
                : [s0r]"w"(s[0].val[0]),
                  [s0i]"w"(s[0].val[1]),
                  [s1r]"w"(s[1].val[0]),
                  [s1i]"w"(s[1].val[1]),
                  [s2r]"w"(s[2].val[0]),
                  [s2i]"w"(s[2].val[1]),
                  [s3r]"w"(s[3].val[0]),
                  [s3i]"w"(s[3].val[1])
                : NEON_REGISTERS_C2C_R4);

            if (is_inverse == 1)
            {
                asm volatile (
                    "fneg v1.4s, v1.4s \n\t"
                    "fneg v3.4s, v3.4s \n\t"
                    "fneg v5.4s, v5.4s \n\t"
                    "fneg v7.4s, v7.4s \n\t"
                    :
                    :
                    : NEON_REGISTERS_C2C_R4);
            }

            if (is_scaled)
            {
                asm volatile (
                    "fmul v0.4s, v0.4s, %[one_by_nfft].4s \n\t"
                    "fmul v1.4s, v1.4s, %[one_by_nfft].4s \n\t"
                    "fmul v2.4s, v2.4s, %[one_by_nfft].4s \n\t"
                    "fmul v3.4s, v3.4s, %[one_by_nfft].4s \n\t"
                    "fmul v4.4s, v4.4s, %[one_by_nfft].4s \n\t"
                    "fmul v5.4s, v5.4s, %[one_by_nfft].4s \n\t"
                    "fmul v6.4s, v6.4s, %[one_by_nfft].4s \n\t"
                    "fmul v7.4s, v7.4s, %[one_by_nfft].4s \n\t"
                    :
                    : [one_by_nfft]"w"(one_by_fft_neon)
                    : NEON_REGISTERS_C2C_R4);
            }

            asm volatile (
                "st2 {v0.4s, v1.4s}, [%[pout0]] \n\t"
                "st2 {v2.4s, v3.4s}, [%[pout1]] \n\t"
                "st2 {v4.4s, v5.4s}, [%[pout2]] \n\t"
                "st2 {v6.4s, v7.4s}, [%[pout3]] \n\t"
                :
                : [pout0]"r"(Fout),
                  [pout1]"r"(Fout + out_step),
                  [pout2]"r"(Fout + out_step * 2),
                  [pout3]"r"(Fout + out_step * 3)
                : NEON_REGISTERS_C2C_R4);
#endif // __aarch64__
#endif // NE10_INLINE_ASM_OPT
            Fin++;

            if (is_first_stage == 0)
            {
                Fout++;
                twiddles++;
            }
            else
            {
                Fout += 4;
            }
        }
        if (is_first_stage == 0)
        {
            twiddles -= out_step;
            Fout += (4 - 1) * out_step;
        }
    }
}

template<ne10_int32_t is_first_stage, ne10_int32_t is_inverse, bool is_scaled>
static void ne10_radix_3_butterfly_float32_neon (CPLX *Fout,
        const CPLX *Fin,
        const ne10_fft_cpx_float32_t *twiddles,
        const ne10_int32_t fstride,
        const ne10_int32_t out_step,
        const ne10_int32_t nfft)
{
    CPLX in[3];
    CPLX out[3];
    CPLX s[4];

    const ne10_int32_t in_step = nfft / 3;
    ne10_int32_t f_count;
    ne10_int32_t m_count;

    const REAL one_by_fft_neon = NE10_REAL_DUP_NEON_F32 (0.25 / nfft);
    const float32x4_t TW_3IN_NEON_F32 = vdupq_n_f32 (TW_3IN_F32);
    const float32x4_t HALF_NEON_F32 = vdupq_n_f32 (0.5f);

    for (f_count = fstride; f_count > 0; f_count--)
    {
        for (m_count = out_step; m_count > 0; m_count--)
        {
#ifndef NE10_INLINE_ASM_OPT
            in[0] = NE10_CPLX_LOAD (Fin + 0 * in_step);
            in[1] = NE10_CPLX_LOAD (Fin + 1 * in_step);
            in[2] = NE10_CPLX_LOAD (Fin + 2 * in_step);

            if (is_inverse == 1)
            {
                NE10_FFT3_CONJ (in);
            }

            if (is_first_stage == 0)
            {
                NE10_LOAD_TW_AND_MUL (in[1], twiddles);
                NE10_LOAD_TW_AND_MUL (in[2], twiddles + out_step);
            }

            NE10_CPX_ADD_NEON_F32 (s[2], in[1], in[2]);
            NE10_CPX_SUB_NEON_F32 (s[0], in[1], in[2]);
            s[3] = in[0];

            s[1].val[0] = -s[2].val[0] * HALF_NEON_F32;
            s[1].val[1] = -s[2].val[1] * HALF_NEON_F32;

            s[1].val[0] += s[3].val[0];
            s[1].val[1] += s[3].val[1];
            s[0].val[0] *= TW_3IN_NEON_F32;
            s[0].val[1] *= TW_3IN_NEON_F32;

            out[0].val[0] = s[3].val[0] + s[2].val[0];
            out[0].val[1] = s[3].val[1] + s[2].val[1];
            out[1].val[0] = s[1].val[0] - s[0].val[1];
            out[1].val[1] = s[1].val[1] + s[0].val[0];
            out[2].val[0] = s[1].val[0] + s[0].val[1];
            out[2].val[1] = s[1].val[1] - s[0].val[0];

            if (is_inverse == 1)
            {
                NE10_FFT3_CONJ (out);
            }
            if (is_scaled)
            {
                NE10_FFT3_SCALING (out, one_by_fft_neon);
            }

            NE10_CPLX_STORE (Fout + 0 * out_step, out[0]);
            NE10_CPLX_STORE (Fout + 1 * out_step, out[1]);
            NE10_CPLX_STORE (Fout + 2 * out_step, out[2]);
#else // NE10_INLINE_ASM_OPT
#ifndef __aarch64__
#error Currently, inline assembly optimizations are only available on AArch64.
#else // __aarch64__
            asm volatile (
                "ld2 {v0.4s, v1.4s}, [%[pin0]] \n\t" // in[0]
                "ld2 {v2.4s, v3.4s}, [%[pin1]] \n\t" // in[1]
                "ld2 {v4.4s, v5.4s}, [%[pin2]] \n\t" // in[2]
                :
                : [pin0]"r"(Fin),
                  [pin1]"r"(Fin + in_step),
                  [pin2]"r"(Fin + in_step * 2)
                : "memory", "v0", "v1", "v2", "v3", "v4", "v5");

            if (is_inverse == 1)
            {
                asm volatile (
                    "fneg v1.4s, v1.4s \n\t"
                    "fneg v3.4s, v3.4s \n\t"
                    "fneg v5.4s, v5.4s \n\t"
                    :
                    :
                    : "v1", "v3", "v5");
            }

            if (is_first_stage == 0)
            {
                asm volatile (
                    // Load twiddles.
                    "ld1 {v6.1d}, [%[ptw0]] \n\t" // tw0
                    "ld1 {v7.1d}, [%[ptw1]] \n\t" // tw1
                    // in[1] = in[1] * tw[0]
                    "fmul v10.4s, v2.4s, v6.s[1] \n\t" // RI
                    "fmul v2.4s, v2.4s, v6.s[0] \n\t" // RR
                    "fmls v2.4s, v3.4s, v6.s[1] \n\t" // RR - II
                    "fmul v11.4s, v3.4s, v6.s[0] \n\t" // IR
                    "fadd v3.4s, v10.4s, v11.4s \n\t" // RI + IR
                    // in[2] = in[2] * tw[1]
                    "fmul v10.4s, v4.4s, v7.s[1] \n\t" // RI
                    "fmul v4.4s, v4.4s, v7.s[0] \n\t" // RR
                    "fmls v4.4s, v5.4s, v7.s[1] \n\t" // RR - II
                    "fmul v11.4s, v5.4s, v7.s[0] \n\t" // IR
                    "fadd v5.4s, v10.4s, v11.4s \n\t" // RI + IR
                    :
                    : [ptw0]"r"(twiddles),
                      [ptw1]"r"(twiddles + out_step)
                    : "memory", "v0", "v1", "v2", "v3", "v4", "v5",
                      "v6", "v7", "v8", "v9",
                      "v10", "v11");
            }

            asm volatile (
                "fadd %[s2r].4s, v2.4s, v4.4s \n\t"
                "fadd %[s2i].4s, v3.4s, v5.4s \n\t"

                "fsub %[s0r].4s, v2.4s, v4.4s \n\t"
                "fsub %[s0i].4s, v3.4s, v5.4s \n\t"

                "mov %[s3r].16b, v0.16b \n\t"
                "mov %[s3i].16b, v1.16b \n\t"
                : [s0r]"+w"(s[0].val[0]),
                  [s0i]"+w"(s[0].val[1]),
                  [s2r]"+w"(s[2].val[0]),
                  [s2i]"+w"(s[2].val[1]),
                  [s3r]"+w"(s[3].val[0]),
                  [s3i]"+w"(s[3].val[1])
                :
                : "v0", "v1", "v2", "v3", "v4", "v5");

            s[1].val[0] = -s[2].val[0] * HALF_NEON_F32;
            s[1].val[1] = -s[2].val[1] * HALF_NEON_F32;

            s[1].val[0] += s[3].val[0];
            s[1].val[1] += s[3].val[1];
            s[0].val[0] *= TW_3IN_NEON_F32;
            s[0].val[1] *= TW_3IN_NEON_F32;

            // out[0] - {v0.4s, v1.4s}
            // out[1] - {v2.4s, v3.4s}
            // out[2] - {v4.4s, v5.4s}
            asm volatile (
                "fadd v0.4s, %[s3r].4s, %[s2r].4s \n\t"
                "fadd v1.4s, %[s3i].4s, %[s2i].4s \n\t"
                "fsub v2.4s, %[s1r].4s, %[s0i].4s \n\t"
                "fadd v3.4s, %[s1i].4s, %[s0r].4s \n\t"
                "fadd v4.4s, %[s1r].4s, %[s0i].4s \n\t"
                "fsub v5.4s, %[s1i].4s, %[s0r].4s \n\t"
                :
                : [s0r]"w"(s[0].val[0]),
                  [s0i]"w"(s[0].val[1]),
                  [s1r]"w"(s[1].val[0]),
                  [s1i]"w"(s[1].val[1]),
                  [s2r]"w"(s[2].val[0]),
                  [s2i]"w"(s[2].val[1]),
                  [s3r]"w"(s[3].val[0]),
                  [s3i]"w"(s[3].val[1])
                : "v0", "v1", "v2", "v3", "v4", "v5");

            if (is_inverse == 1)
            {
                asm volatile (
                    "fneg v1.4s, v1.4s \n\t"
                    "fneg v3.4s, v3.4s \n\t"
                    "fneg v5.4s, v5.4s \n\t"
                    :
                    :
                    : "v1", "v3", "v5");
            }

            if (is_scaled)
            {
                asm volatile (
                    "fmul v0.4s, v0.4s, %[one_by_nfft].4s \n\t"
                    "fmul v1.4s, v1.4s, %[one_by_nfft].4s \n\t"
                    "fmul v2.4s, v2.4s, %[one_by_nfft].4s \n\t"
                    "fmul v3.4s, v3.4s, %[one_by_nfft].4s \n\t"
                    "fmul v4.4s, v4.4s, %[one_by_nfft].4s \n\t"
                    "fmul v5.4s, v5.4s, %[one_by_nfft].4s \n\t"
                    :
                    : [one_by_nfft]"w"(one_by_fft_neon)
                    : "v0", "v1", "v2", "v3", "v4", "v5");
            }

            asm volatile (
                "st2 {v0.4s, v1.4s}, [%[pout0]] \n\t"
                "st2 {v2.4s, v3.4s}, [%[pout1]] \n\t"
                "st2 {v4.4s, v5.4s}, [%[pout2]] \n\t"
                :
                : [pout0]"r"(Fout),
                  [pout1]"r"(Fout + out_step),
                  [pout2]"r"(Fout + 2 * out_step)
                : "memory", "v0", "v1", "v2", "v3", "v4", "v5");
#endif // __aarch64__
#endif // NE10_INLINE_ASM_OPT

            Fin++;

            if (is_first_stage == 0)
            {
                Fout++;
                twiddles++;
            }
            else
            {
                Fout += 3;
            }
        }
        if (is_first_stage == 0)
        {
            twiddles -= out_step;
            Fout += (3 - 1) * out_step;
        }
    }
}

template<ne10_int32_t is_first_stage, ne10_int32_t is_inverse, bool is_scaled>
static void ne10_radix_5_butterfly_float32_neon (CPLX *Fout,
        const CPLX *Fin,
        const ne10_fft_cpx_float32_t *twiddles,
        const ne10_int32_t fstride,
        const ne10_int32_t out_step,
        const ne10_int32_t nfft)
{
    CPLX in[5];
    CPLX s[6];

    const ne10_int32_t in_step = nfft / 5;
    ne10_int32_t f_count;
    ne10_int32_t m_count;

    const REAL one_by_fft_neon = NE10_REAL_DUP_NEON_F32 (0.25 / nfft);

    for (f_count = fstride; f_count > 0; f_count--)
    {
        for (m_count = out_step; m_count > 0; m_count--)
        {
            in[0] = NE10_CPLX_LOAD (Fin + 0 * in_step);
            in[1] = NE10_CPLX_LOAD (Fin + 1 * in_step);
            in[2] = NE10_CPLX_LOAD (Fin + 2 * in_step);
            in[3] = NE10_CPLX_LOAD (Fin + 3 * in_step);
            in[4] = NE10_CPLX_LOAD (Fin + 4 * in_step);

            if (is_inverse == 1)
            {
                NE10_FFT5_CONJ (in);
            }

            if (is_first_stage == 0)
            {
                NE10_LOAD_TW_AND_MUL (in[1], twiddles);
                NE10_LOAD_TW_AND_MUL (in[2], twiddles + out_step);
                NE10_LOAD_TW_AND_MUL (in[3], twiddles + out_step * 2);
                NE10_LOAD_TW_AND_MUL (in[4], twiddles + out_step * 3);
            }

            NE10_CPX_ADD_NEON_F32 (s[1], in[1], in[4]);
            NE10_CPX_ADD_NEON_F32 (s[2], in[2], in[3]);

            s[0] = in[0];
            s[5] = in[0];

            in[0].val[0] = in[0].val[0] + s[1].val[0] + s[2].val[0];
            in[0].val[1] = in[0].val[1] + s[1].val[1] + s[2].val[1];

            s[0].val[0] = NE10_S_MLA_NEON_F32 (s[0].val[0], s[1].val[0], TW_5A_F32.r);
            s[0].val[1] = NE10_S_MLA_NEON_F32 (s[0].val[1], s[1].val[1], TW_5A_F32.r);
            s[5].val[0] = NE10_S_MLA_NEON_F32 (s[5].val[0], s[1].val[0], TW_5B_F32.r);
            s[5].val[1] = NE10_S_MLA_NEON_F32 (s[5].val[1], s[1].val[1], TW_5B_F32.r);

            s[0].val[0] = NE10_S_MLA_NEON_F32 (s[0].val[0], s[2].val[0], TW_5B_F32.r);
            s[0].val[1] = NE10_S_MLA_NEON_F32 (s[0].val[1], s[2].val[1], TW_5B_F32.r);
            s[5].val[0] = NE10_S_MLA_NEON_F32 (s[5].val[0], s[2].val[0], TW_5A_F32.r);
            s[5].val[1] = NE10_S_MLA_NEON_F32 (s[5].val[1], s[2].val[1], TW_5A_F32.r);

            NE10_CPX_SUB_NEON_F32 (s[4], in[1], in[4]);
            NE10_CPX_SUB_NEON_F32 (s[3], in[2], in[3]);

            s[1].val[0] = NE10_S_MUL_NEON_F32 (s[4].val[1], TW_5A_F32.i);
            s[1].val[1] = -NE10_S_MUL_NEON_F32 (s[4].val[0], TW_5A_F32.i);
            s[2].val[0] = -NE10_S_MUL_NEON_F32 (s[4].val[1], TW_5B_F32.i);
            s[2].val[1] = NE10_S_MUL_NEON_F32 (s[4].val[0], TW_5B_F32.i);

            s[1].val[0] = NE10_S_MLA_NEON_F32 (s[1].val[0], s[3].val[1], TW_5B_F32.i);
            s[1].val[1] = NE10_S_MLS_NEON_F32 (s[1].val[1], s[3].val[0], TW_5B_F32.i);
            s[2].val[0] = NE10_S_MLA_NEON_F32 (s[2].val[0], s[3].val[1], TW_5A_F32.i);
            s[2].val[1] = NE10_S_MLS_NEON_F32 (s[2].val[1], s[3].val[0], TW_5A_F32.i);

            NE10_CPX_SUB_NEON_F32 (in[1], s[0], s[1]);
            NE10_CPX_ADD_NEON_F32 (in[4], s[0], s[1]);
            NE10_CPX_ADD_NEON_F32 (in[2], s[5], s[2]);
            NE10_CPX_SUB_NEON_F32 (in[3], s[5], s[2]);

            if (is_inverse == 1)
            {
                NE10_FFT5_CONJ (in);
            }
            if (is_scaled)
            {
                NE10_FFT5_SCALING (in, one_by_fft_neon);
            }

            NE10_CPLX_STORE (Fout + 0 * out_step, in[0]);
            NE10_CPLX_STORE (Fout + 1 * out_step, in[1]);
            NE10_CPLX_STORE (Fout + 2 * out_step, in[2]);
            NE10_CPLX_STORE (Fout + 3 * out_step, in[3]);
            NE10_CPLX_STORE (Fout + 4 * out_step, in[4]);

            Fin++;

            if (is_first_stage == 0)
            {
                Fout++;
                twiddles++;
            }
            else
            {
                Fout += 5;
            }
        }
        if (is_first_stage == 0)
        {
            twiddles -= out_step;
            Fout += (5 - 1) * out_step;
        }
    }
}

template<ne10_int32_t is_first_stage, ne10_int32_t is_inverse, bool is_scaled>
static void ne10_radix_8_butterfly_float32_neon (CPLX *Fout,
        const CPLX *Fin,
        const ne10_fft_cpx_float32_t *, // twiddles, unused: radix-8 only runs as the first stage
        const ne10_int32_t fstride,
        const ne10_int32_t out_step,
        const ne10_int32_t nfft)
{
    CPLX in[8];
    CPLX out[8];

    const ne10_int32_t in_step = nfft / 8;
    ne10_int32_t f_count;
    ne10_int32_t m_count;

    const REAL one_by_fft_neon = NE10_REAL_DUP_NEON_F32 (0.25 / nfft);

    for (f_count = fstride; f_count > 0; f_count--)
    {
        for (m_count = out_step; m_count > 0; m_count--)
        {
            in[0] = NE10_CPLX_LOAD (Fin + 0 * in_step);
            in[1] = NE10_CPLX_LOAD (Fin + 1 * in_step);
            in[2] = NE10_CPLX_LOAD (Fin + 2 * in_step);
            in[3] = NE10_CPLX_LOAD (Fin + 3 * in_step);
            in[4] = NE10_CPLX_LOAD (Fin + 4 * in_step);
            in[5] = NE10_CPLX_LOAD (Fin + 5 * in_step);
            in[6] = NE10_CPLX_LOAD (Fin + 6 * in_step);
            in[7] = NE10_CPLX_LOAD (Fin + 7 * in_step);

            if (is_inverse == 1)
            {
                NE10_FFT8_CONJ (in);
            }

            NE10_FFT8_FUC_NEON_F32 (out, in);

            if (is_inverse == 1)
            {
                NE10_FFT8_CONJ (out);
            }
            if (is_scaled)
            {
                NE10_FFT8_SCALING (out, one_by_fft_neon);
            }

            NE10_CPLX_STORE (Fout + 0 * out_step, out[0]);
            NE10_CPLX_STORE (Fout + 1 * out_step, out[1]);
            NE10_CPLX_STORE (Fout + 2 * out_step, out[2]);
            NE10_CPLX_STORE (Fout + 3 * out_step, out[3]);
            NE10_CPLX_STORE (Fout + 4 * out_step, out[4]);
            NE10_CPLX_STORE (Fout + 5 * out_step, out[5]);
            NE10_CPLX_STORE (Fout + 6 * out_step, out[6]);
            NE10_CPLX_STORE (Fout + 7 * out_step, out[7]);

            Fin++;
            Fout += 8;
        }
    }
}

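// Note on the factors array, as consumed here (not a full specification):
// factors[0] holds the stage count, factors[1] the first-stage fstride, and
// factors[stage_count << 1] the radix of the stage about to execute; later
// stages re-read the radix the same way as stage_count decreases.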
template<ne10_int32_t is_inverse, bool is_scaled>
static void ne10_mixed_radix_generic_butterfly_float32_neon_impl (CPLX *Fout,
        const CPLX *Fin,
        const ne10_int32_t *factors,
        const ne10_fft_cpx_float32_t *twiddles,
        CPLX *buffer)
{
    ne10_int32_t fstride, mstride, radix;
    ne10_int32_t stage_count;
    ne10_int32_t nfft;

    // init fstride, mstride, radix, nfft
    stage_count = factors[0];
    fstride = factors[1];
    mstride = 1;
    radix = factors[stage_count << 1]; // radix of first stage
    nfft = fstride * radix;

    // Swap to ensure the final output ends up in Fout.
    if (stage_count % 2 == 0)
    {
        ne10_swap_ptr (buffer, Fout);
    }

    // first stage
    switch (radix)
    {
    case 2:
        ne10_radix_2_butterfly_float32_neon<1, is_inverse, is_scaled> (Fout, Fin, NULL,
                fstride, 1, nfft);
        break;
    case 4:
        ne10_radix_4_butterfly_float32_neon<1, is_inverse, is_scaled> (Fout, Fin, NULL,
                fstride, 1, nfft);
        break;
    case 3:
        ne10_radix_3_butterfly_float32_neon<1, is_inverse, is_scaled> (Fout, Fin, NULL,
                fstride, 1, nfft);
        break;
    case 5:
        ne10_radix_5_butterfly_float32_neon<1, is_inverse, is_scaled> (Fout, Fin, NULL,
                fstride, 1, nfft);
        break;
    case 8:
        ne10_radix_8_butterfly_float32_neon<1, is_inverse, is_scaled> (Fout, Fin, NULL,
                fstride, 1, nfft);
        break;
    }

    stage_count--;
    if (!stage_count) // finished
    {
        return;
    }

    mstride *= radix;

    // update radix
    if (radix % 2)
    {
        twiddles += radix;
    }
    radix = factors[stage_count << 1];

    // other stages
    while (stage_count > 0)
    {
        // The radix of any remaining stage should be one of {2, 3, 4, 5}.
        assert ((radix > 1) && (radix < 6));

        ne10_swap_ptr (buffer, Fout);

        fstride /= radix;
        switch (radix)
        {
        case 2:
            ne10_radix_2_butterfly_float32_neon<0, is_inverse, false> (Fout, buffer,
                    twiddles, fstride, mstride, nfft);
            break;
        case 3:
            ne10_radix_3_butterfly_float32_neon<0, is_inverse, false> (Fout, buffer,
                    twiddles, fstride, mstride, nfft);
            break;
        case 4:
            ne10_radix_4_butterfly_float32_neon<0, is_inverse, false> (Fout, buffer,
                    twiddles, fstride, mstride, nfft);
            break;
        case 5:
            ne10_radix_5_butterfly_float32_neon<0, is_inverse, false> (Fout, buffer,
                    twiddles, fstride, mstride, nfft);
            break;
        } // switch (radix)

        twiddles += mstride * (radix - 1);
        mstride *= radix;

        stage_count--;
        radix = factors[stage_count << 1];
    } // while (stage_count)
}

template<ne10_int32_t is_inverse>
static void ne10_c2c_1d_last_stage_neon (CPLX *Fout,
        const CPLX *Fin,
        const ne10_fft_cpx_float32_t *twiddles,
        const ne10_int32_t fstride,
        const ne10_int32_t out_step,
        const ne10_int32_t) // nfft, unused here
{
    ne10_int32_t f_count;
    ne10_int32_t m_count;

    for (f_count = fstride; f_count > 0; f_count--)
    {
        CPLX scratch_in[4];
        CPLX scratch_out[4];
        CPLX scratch[4];

        for (m_count = out_step / NE10_FFT_PARA_LEVEL; m_count > 0; m_count--)
        {
#ifndef NE10_INLINE_ASM_OPT
            scratch_in[0] = NE10_CPLX_LOAD (Fin + 0);
            scratch_in[1] = NE10_CPLX_LOAD (Fin + 1);
            scratch_in[2] = NE10_CPLX_LOAD (Fin + 2);
            scratch_in[3] = NE10_CPLX_LOAD (Fin + 3);

            // Transpose
            {
                CPLX scratch0, scratch_in0;
                CPLX scratch1, scratch_in1;
                CPLX scratch2, scratch_in2;
                CPLX scratch3, scratch_in3;

                scratch_in0 = scratch_in[0];
                scratch_in1 = scratch_in[1];
                scratch_in2 = scratch_in[2];
                scratch_in3 = scratch_in[3];

                NE10_RADIX4X4C_TRANSPOSE_NEON (scratch, scratch_in);

                scratch_in[0] = scratch0;
                scratch_in[1] = scratch1;
                scratch_in[2] = scratch2;
                scratch_in[3] = scratch3;
            }
#else // NE10_INLINE_ASM_OPT
#ifndef __aarch64__
#error Currently, inline assembly optimizations are only available on AArch64.
#else // __aarch64__
            const float *pin = (const float *) Fin;
            asm volatile (
                "ld2 {v0.4s, v1.4s}, [%[pin]], %[offset] \n\t"
                "ld2 {v2.4s, v3.4s}, [%[pin]], %[offset] \n\t"
                "ld2 {v4.4s, v5.4s}, [%[pin]], %[offset] \n\t"
                "ld2 {v6.4s, v7.4s}, [%[pin]] \n\t"

                // NE10_RADIX4X4C_TRANSPOSE_NEON (q2_in,q2_out);
                "trn1 v8.4s, v0.4s, v2.4s \n\t"
                "trn2 v9.4s, v0.4s, v2.4s \n\t"
                "trn1 v10.4s, v4.4s, v6.4s \n\t"
                "trn2 v11.4s, v4.4s, v6.4s \n\t"

                "trn1 %[in0r].2d, v8.2d, v10.2d \n\t"
                "trn1 %[in1r].2d, v9.2d, v11.2d \n\t"
                "trn2 %[in2r].2d, v8.2d, v10.2d \n\t"
                "trn2 %[in3r].2d, v9.2d, v11.2d \n\t"

                "trn1 v8.4s, v1.4s, v3.4s \n\t"
                "trn2 v9.4s, v1.4s, v3.4s \n\t"
                "trn1 v10.4s, v5.4s, v7.4s \n\t"
                "trn2 v11.4s, v5.4s, v7.4s \n\t"

                "trn1 %[in0i].2d, v8.2d, v10.2d \n\t"
                "trn1 %[in1i].2d, v9.2d, v11.2d \n\t"
                "trn2 %[in2i].2d, v8.2d, v10.2d \n\t"
                "trn2 %[in3i].2d, v9.2d, v11.2d \n\t"
                : [in0r]"+w"(scratch_in[0].val[0]),
                  [in0i]"+w"(scratch_in[0].val[1]),
                  [in1r]"+w"(scratch_in[1].val[0]),
                  [in1i]"+w"(scratch_in[1].val[1]),
                  [in2r]"+w"(scratch_in[2].val[0]),
                  [in2i]"+w"(scratch_in[2].val[1]),
                  [in3r]"+w"(scratch_in[3].val[0]),
                  [in3i]"+w"(scratch_in[3].val[1]),
                  [pin]"+r"(pin)
                : [offset]"r"(32)
                : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
                  "v8", "v9", "v10", "v11");
#endif // __aarch64__
#endif // NE10_INLINE_ASM_OPT

            if (is_inverse)
            {
                NE10_FFT4_CONJ (scratch_in);
            }

            // Not the first stage, so twiddles are always applied.
            {
#ifndef NE10_INLINE_ASM_OPT
                CPLX scratch_tw[3];

                scratch_tw[0] = NE10_CPLX_LOAD (twiddles + 0 * out_step);
                scratch_tw[1] = NE10_CPLX_LOAD (twiddles + 1 * out_step);
                scratch_tw[2] = NE10_CPLX_LOAD (twiddles + 2 * out_step);

                NE10_FFT4_MUL_TW_NEON (scratch_in, scratch_in, scratch_tw);
#else // NE10_INLINE_ASM_OPT
#ifndef __aarch64__
#error Currently, inline assembly optimizations are only available on AArch64.
#else // __aarch64__
                const float *tw = (const float *) twiddles;
                asm volatile (
                    "ld2 {v0.4s, v1.4s}, [%[tw]], %[offset] \n\t"
                    "ld2 {v2.4s, v3.4s}, [%[tw]], %[offset] \n\t"
                    "ld2 {v4.4s, v5.4s}, [%[tw]] \n\t"

                    "fmul v6.4s, %[in1r].4s, v1.4s \n\t" // RI
                    "fmul %[in1r].4s, %[in1r].4s, v0.4s \n\t" // RR
                    "fmls %[in1r].4s, %[in1i].4s, v1.4s \n\t" // RR - II
                    "fmul v7.4s, %[in1i].4s, v0.4s \n\t" // IR
                    "fadd %[in1i].4s, v6.4s, v7.4s \n\t" // RI + IR

                    "fmul v6.4s, %[in2r].4s, v3.4s \n\t" // RI
                    "fmul %[in2r].4s, %[in2r].4s, v2.4s \n\t" // RR
                    "fmls %[in2r].4s, %[in2i].4s, v3.4s \n\t" // RR - II
                    "fmul v7.4s, %[in2i].4s, v2.4s \n\t" // IR
                    "fadd %[in2i].4s, v6.4s, v7.4s \n\t" // RI + IR

                    "fmul v6.4s, %[in3r].4s, v5.4s \n\t" // RI
                    "fmul %[in3r].4s, %[in3r].4s, v4.4s \n\t" // RR
                    "fmls %[in3r].4s, %[in3i].4s, v5.4s \n\t" // RR - II
                    "fmul v7.4s, %[in3i].4s, v4.4s \n\t" // IR
                    "fadd %[in3i].4s, v6.4s, v7.4s \n\t" // RI + IR
                    : [tw]"+r"(tw),
                      [in1r]"+w"(scratch_in[1].val[0]),
                      [in1i]"+w"(scratch_in[1].val[1]),
                      [in2r]"+w"(scratch_in[2].val[0]),
                      [in2i]"+w"(scratch_in[2].val[1]),
                      [in3r]"+w"(scratch_in[3].val[0]),
                      [in3i]"+w"(scratch_in[3].val[1])
                    : [offset]"r"(out_step * 8)
                    : "memory", "v0", "v1", "v2", "v3", "v4", "v5",
                      "v6", "v7");
#endif // __aarch64__
#endif // NE10_INLINE_ASM_OPT
            }

            NE10_CPX_ADD_NEON_F32 (scratch[0], scratch_in[0], scratch_in[2]);
            NE10_CPX_SUB_NEON_F32 (scratch[1], scratch_in[0], scratch_in[2]);
            NE10_CPX_ADD_NEON_F32 (scratch[2], scratch_in[1], scratch_in[3]);
            NE10_CPX_SUB_NEON_F32 (scratch[3], scratch_in[1], scratch_in[3]);

#ifndef NE10_INLINE_ASM_OPT
            NE10_CPX_ADD_NEON_F32 (scratch_out[0], scratch[0], scratch[2]);
            NE10_CPX_SUB_NEON_F32 (scratch_out[2], scratch[0], scratch[2]);

            scratch_out[1].val[0] = scratch[1].val[0] + scratch[3].val[1];
            scratch_out[1].val[1] = scratch[1].val[1] - scratch[3].val[0];
            scratch_out[3].val[0] = scratch[1].val[0] - scratch[3].val[1];
            scratch_out[3].val[1] = scratch[1].val[1] + scratch[3].val[0];

            if (is_inverse == 1)
            {
                NE10_FFT4_CONJ (scratch_out);
            }

            // Store.
            {
                ne10_fft_cpx_float32_t *Fout_cpx;
                Fout_cpx = (ne10_fft_cpx_float32_t *) Fout;

                NE10_CPLX_STORE (Fout_cpx + 0 * out_step, scratch_out[0]);
                NE10_CPLX_STORE (Fout_cpx + 1 * out_step, scratch_out[1]);
                NE10_CPLX_STORE (Fout_cpx + 2 * out_step, scratch_out[2]);
                NE10_CPLX_STORE (Fout_cpx + 3 * out_step, scratch_out[3]);
            }
#else // NE10_INLINE_ASM_OPT
#ifndef __aarch64__
#error Currently, inline assembly optimizations are only available on AArch64.
#else // __aarch64__
            asm volatile (
                "fadd v0.4s, %[s0r].4s, %[s2r].4s \n\t"
                "fadd v1.4s, %[s0i].4s, %[s2i].4s \n\t"
                "fsub v4.4s, %[s0r].4s, %[s2r].4s \n\t"
                "fsub v5.4s, %[s0i].4s, %[s2i].4s \n\t"
                "fadd v2.4s, %[s1r].4s, %[s3i].4s \n\t"
                "fsub v3.4s, %[s1i].4s, %[s3r].4s \n\t"
                "fsub v6.4s, %[s1r].4s, %[s3i].4s \n\t"
                "fadd v7.4s, %[s1i].4s, %[s3r].4s \n\t"
                :
                : [s0r]"w"(scratch[0].val[0]),
                  [s0i]"w"(scratch[0].val[1]),
                  [s1r]"w"(scratch[1].val[0]),
                  [s1i]"w"(scratch[1].val[1]),
                  [s2r]"w"(scratch[2].val[0]),
                  [s2i]"w"(scratch[2].val[1]),
                  [s3r]"w"(scratch[3].val[0]),
                  [s3i]"w"(scratch[3].val[1])
                : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");

            if (is_inverse == 1)
            {
                asm volatile (
                    "fneg v1.4s, v1.4s \n\t"
                    "fneg v3.4s, v3.4s \n\t"
                    "fneg v5.4s, v5.4s \n\t"
                    "fneg v7.4s, v7.4s \n\t"
                    :
                    :
                    : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
            }

            float *pout = (float *) Fout;
            asm volatile (
                "st2 {v0.4s, v1.4s}, [%[pout]], %[offset] \n\t"
                "st2 {v2.4s, v3.4s}, [%[pout]], %[offset] \n\t"
                "st2 {v4.4s, v5.4s}, [%[pout]], %[offset] \n\t"
                "st2 {v6.4s, v7.4s}, [%[pout]] \n\t"
                : [pout]"+r"(pout)
                : [offset]"r"(out_step * 8)
                : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
#endif // __aarch64__
#endif // NE10_INLINE_ASM_OPT
            Fin += 4;
            Fout += 1;
            twiddles += 4;
        }
    }

    ne10_int32_t left_over = out_step % 4;
    if (left_over == 0)
    {
        return;
    }

    // Left over.
    const ne10_fft_cpx_float32_t *Fin_s = (ne10_fft_cpx_float32_t *) Fin;
    ne10_fft_cpx_float32_t *Fout_s = (ne10_fft_cpx_float32_t *) Fout;
    for (m_count = out_step % 4; m_count > 0; m_count--)
    {
        ne10_fft_cpx_float32_t scratch_in[4];
        ne10_fft_cpx_float32_t scratch_tw[4];

        scratch_in[0] = Fin_s[0];
        scratch_in[1] = Fin_s[1];
        scratch_in[2] = Fin_s[2];
        scratch_in[3] = Fin_s[3];

        if (is_inverse)
        {
            scratch_in[0].i = -scratch_in[0].i;
            scratch_in[1].i = -scratch_in[1].i;
            scratch_in[2].i = -scratch_in[2].i;
            scratch_in[3].i = -scratch_in[3].i;
        }

        scratch_tw[0] = twiddles[0 * out_step];
        scratch_tw[1] = twiddles[1 * out_step];
        scratch_tw[2] = twiddles[2 * out_step];

        FFT4_MUL_TW (scratch_in, scratch_in, scratch_tw);

        FFT4_FCU_INPLACE (scratch_in);

        if (is_inverse)
        {
            scratch_in[0].i = -scratch_in[0].i;
            scratch_in[1].i = -scratch_in[1].i;
            scratch_in[2].i = -scratch_in[2].i;
            scratch_in[3].i = -scratch_in[3].i;
        }

        Fout_s[0 * out_step] = scratch_in[0];
        Fout_s[1 * out_step] = scratch_in[1];
        Fout_s[2 * out_step] = scratch_in[2];
        Fout_s[3 * out_step] = scratch_in[3];

        Fin_s += 4;
        Fout_s += 1;
        twiddles += 1;
    }
}

typedef void (*NE10_MIXED_RADIX_FUNC) (CPLX*, const CPLX *, const ne10_int32_t *,
        const ne10_fft_cpx_float32_t *, CPLX *);

void ne10_mixed_radix_generic_butterfly_float32_neon (
        ne10_fft_cpx_float32_t *Fout,
        const ne10_fft_cpx_float32_t *Fin,
        const ne10_int32_t *factors,
        const ne10_fft_cpx_float32_t *twiddles,
        ne10_fft_cpx_float32_t *buffer,
        const ne10_int32_t is_scaled)
{
    ne10_int32_t stage_count = factors[0];
    ne10_int32_t fstride = factors[1];
    ne10_int32_t radix = factors[stage_count << 1]; // radix of first stage

    NE10_MIXED_RADIX_FUNC ne10_mixed_radix_impl = NULL;

    // Note that nfft below is not the actual FFT length; it is 1/4 of it.
    ne10_int32_t nfft = fstride * radix;

    if (is_scaled)
    {
        ne10_mixed_radix_impl = ne10_mixed_radix_generic_butterfly_float32_neon_impl<0, true>;
    }
    else
    {
        ne10_mixed_radix_impl = ne10_mixed_radix_generic_butterfly_float32_neon_impl<0, false>;
    }

    ne10_mixed_radix_impl ((CPLX *) buffer,
            (const CPLX *) Fin, // From Fin to buffer
            factors,
            twiddles,
            (CPLX *) Fout); // Fout is "buffer" for these stages.

    ne10_c2c_1d_last_stage_neon<0> ((CPLX *) Fout,
            (const CPLX *) buffer, // From buffer to Fout
            twiddles + nfft,
            1, // out_step == fstride == 1
            nfft, // in_step == mstride == nfft
            nfft * 4); // Actual length of FFT
}

void ne10_mixed_radix_generic_butterfly_inverse_float32_neon (
        ne10_fft_cpx_float32_t *Fout,
        const ne10_fft_cpx_float32_t *Fin,
        const ne10_int32_t *factors,
        const ne10_fft_cpx_float32_t *twiddles,
        ne10_fft_cpx_float32_t *buffer,
        const ne10_int32_t is_scaled)
{
    ne10_int32_t stage_count = factors[0];
    ne10_int32_t fstride = factors[1];
    ne10_int32_t radix = factors[stage_count << 1]; // radix of first stage

    NE10_MIXED_RADIX_FUNC ne10_mixed_radix_impl = NULL;

    // Note that nfft below is not the actual FFT length; it is 1/4 of it.
    ne10_int32_t nfft = fstride * radix;

    if (is_scaled)
    {
        ne10_mixed_radix_impl = ne10_mixed_radix_generic_butterfly_float32_neon_impl<1, true>;
    }
    else
    {
        ne10_mixed_radix_impl = ne10_mixed_radix_generic_butterfly_float32_neon_impl<1, false>;
    }

    ne10_mixed_radix_impl ((CPLX *) buffer,
            (const CPLX *) Fin, // From Fin to buffer
            factors,
            twiddles,
            (CPLX *) Fout); // Fout is "buffer" for these stages.

    ne10_c2c_1d_last_stage_neon<1> ((CPLX *) Fout,
            (const CPLX *) buffer, // From buffer to Fout
            twiddles + nfft,
            1, // out_step == fstride == 1
            nfft, // in_step == mstride == nfft
            nfft * 4); // Actual length of FFT
}
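
// A minimal usage sketch (illustration only). These butterflies are internal;
// applications normally reach them through the plan-based C2C API declared in
// NE10_dsp.h. The exact entry points below are an assumption, so check that
// header for the signatures your Ne10 version provides.
#if 0
#include "NE10_dsp.h"

void demo (ne10_fft_cpx_float32_t *dst, ne10_fft_cpx_float32_t *src,
        ne10_int32_t nfft)
{
    ne10_fft_cfg_float32_t cfg = ne10_fft_alloc_c2c_float32 (nfft); // plan
    ne10_fft_c2c_1d_float32_neon (dst, src, cfg, 0); // forward
    ne10_fft_c2c_1d_float32_neon (src, dst, cfg, 1); // inverse
    ne10_fft_destroy_c2c_float32 (cfg);
}
#endif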