Project Ne10
An Open Optimized Software Library Project for the ARM Architecture
Loading...
Searching...
No Matches
factor.h
1/*
2 * Copyright 2011-15 ARM Limited and Contributors.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 * * Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of ARM Limited nor the
13 * names of its contributors may be used to endorse or promote products
14 * derived from this software without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 * DISCLAIMED. IN NO EVENT SHALL ARM LIMITED AND CONTRIBUTORS BE LIABLE FOR ANY
20 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28/*
29 * NE10 Library : common/factor.h
30 */
31
/*
 * Type-building macros.
 * There are slight differences between toolchain versions on intrinsics, so
 * the initializer layout is kept in one place.
 *
 * FLOAT32_2x3 expands to a brace initializer holding three two-element rows,
 * {x1, y1}, {x2, y2} and {x3, y3}, wrapped in two outer levels of braces.
 */
#define FLOAT32_2x3(x1, y1, x2, y2, x3, y3) \
    {{ \
        {x1, y1}, \
        {x2, y2}, \
        {x3, y3} \
    }}
38
/*
 * Unit tests use this macro to index into their function table.
 * "opc" stands for the operation's code (which function) and "imp" stands for
 * the implementation (which implementation of that function). Both are offset
 * by one, so (opc = 1, imp = 1) maps to index 0.
 *
 * IMPL_COUNT is not defined in this header; it must be visible where the
 * macro is used. Every argument is parenthesized so that expressions such as
 * "a >> 1" or "c ? x : y" can be passed safely.
 */
#define FTBL_IDX(opc, imp) ((((opc) - 1) * (IMPL_COUNT)) + ((imp) - 1))
43
/*
 * MEASURE(res, code): run the statements passed as "code" between two
 * gettimeofday() calls and store the elapsed time, in seconds, into "res".
 * It is used in the unit tests.
 *
 * The expansion uses variables named before, after, lapsed and zone that must
 * already exist at the point of use (presumably struct timeval / struct
 * timezone objects -- confirm against the caller).
 */
#define MEASURE(res, code) \
    { \
        gettimeofday (&before, &zone); \
        code \
        gettimeofday (&after, &zone); \
        /* borrow one second if the microsecond difference would be negative */ \
        if (after.tv_usec < before.tv_usec) \
        { \
            after.tv_sec--; \
            after.tv_usec += 1000000; \
        } \
        lapsed.tv_sec = after.tv_sec - before.tv_sec; \
        lapsed.tv_usec = after.tv_usec - before.tv_usec; \
        res = lapsed.tv_sec + ((double)lapsed.tv_usec / 1000000.0); \
    }
60
// There are several categories of functions that share common code:

// Different groups of functions take different numbers of inputs
//
// Group 1 = Functions that take a dst, a src, and a cst ("DstSrcCst" for short)
// Group 2 = Those that take a dst, an acc, a src, and a cst ("DstAccSrcCst" for short)
// Group 3 = The ones that take a dst, and a cst only ("DstCst" for short)
//
// Group 4 = These take a dst, and two src inputs, src1 and src2 ("DstSrc1Src2")
// Group 5 = These take a dst, an acc, and two src inputs ("DstAccSrc1Src2")
// Group 6 = These take a dst, and a src ("DstSrc")
//
73
// The naming convention used in the following macros is as follows:
// NE10_<A>_OPERATION_<T>_<I>
// where
// <A> Stands for the title of the operation (add, mul, etc) followed by its type (C = const as in addc).
// The letter X - if used - means any such operation.
// <T> Indicates the type of the operation (float, vec2, etc.)
// The letter X - if used - means any type.
// <I> This indicates the implementation (it can be C, ASM, or NEON).
82
/*
 * A few macros to check pointers and their address ranges, to make sure there
 * is no unwanted overlap between any two of them.
 *
 * All of them expect a variable named "count" (the number of elements) to be
 * in scope and rely on assert() from <assert.h>. Two identical pointers pass
 * the check, because neither the "<" nor the ">" branch is taken.
 */

/*
 * Helper: assert that the range starting at the lower of the two pointers ends
 * at or before the higher one. The length of a range is "count" elements of
 * that pointer's own pointee type, converted to bytes with sizeof; the
 * arithmetic is done on char pointers, which is standard C.
 * Both arguments are evaluated more than once.
 */
#define NE10_CHECKPOINTER_PAIR(p, q) \
    if ( (const char *)(p) < (const char *)(q) ) \
    { assert ( (const char *)(p) + (count) * sizeof (*(p)) <= (const char *)(q) ); } \
    else if ( (const char *)(p) > (const char *)(q) ) \
    { assert ( (const char *)(q) + (count) * sizeof (*(q)) <= (const char *)(p) ); }

/* dst versus src */
#define NE10_CHECKPOINTER_DstSrcCst_OPERATION \
    NE10_CHECKPOINTER_PAIR(dst, src)

#define NE10_CHECKPOINTER_DstSrc_OPERATION NE10_CHECKPOINTER_DstSrcCst_OPERATION

/* All three pairs that can be formed from three pointers. */
#define NE10_CHECKPOINTER_3POINTER_OPERATION(arg1, arg2, arg3) \
    NE10_CHECKPOINTER_PAIR(arg1, arg2) \
    NE10_CHECKPOINTER_PAIR(arg1, arg3) \
    NE10_CHECKPOINTER_PAIR(arg3, arg2)

/* All six pairs that can be formed from four pointers. */
#define NE10_CHECKPOINTER_4POINTER_OPERATION(arg1, arg2, arg3, arg4) \
    NE10_CHECKPOINTER_3POINTER_OPERATION(arg1, arg2, arg3) \
    NE10_CHECKPOINTER_PAIR(arg1, arg4) \
    NE10_CHECKPOINTER_PAIR(arg2, arg4) \
    NE10_CHECKPOINTER_PAIR(arg4, arg3)

#define NE10_CHECKPOINTER_DstAccSrcCst_OPERATION { \
    NE10_CHECKPOINTER_3POINTER_OPERATION(dst, acc, src); }

/* Only one pointer is involved, so there is nothing to compare. */
#define NE10_CHECKPOINTER_DstCst_OPERATION {}

#define NE10_CHECKPOINTER_DstSrc1Src2_OPERATION { \
    NE10_CHECKPOINTER_3POINTER_OPERATION(dst, src1, src2); }

#define NE10_CHECKPOINTER_DstAccSrc1Src2_OPERATION { \
    NE10_CHECKPOINTER_4POINTER_OPERATION(dst, acc, src1, src2); }
134
/* These macros generalise the implementation of the functions. */

/*
 * Macro used in the C implementations.
 * Expands to a block that runs checkPointer once, executes loopCode "count"
 * times with the running index available as "itr" (one item per iteration),
 * and then returns res, which is initialised to NE10_OK.
 * A variable named "count" must be in scope where the macro is expanded.
 */
#define NE10_TEMPLATE_XC_OPERATION_X_C(checkPointer, loopCode) { \
    unsigned int itr = 0; \
    ne10_result_t res = NE10_OK; \
    checkPointer; \
    for (itr = 0; itr < count; itr++) \
    { \
        loopCode; /* handles exactly one item */ \
    } \
    return res; \
}
147
148// macros used in the NEON implementations
149
150// Main Loop = The loop where the number of items to be processed is exactly the
151// number that we can process in a single iteration.
152//
153// Secondary Loop = The loop that follows a Main Loop to fill in the entries that
154// did not fit into the Main Loop. This is needed when the number of
155// input items is not a multiple of the number of items that we
156// process in every iteration of the Main Loop.
157
158
159/****************************************************
160 * *
161 * The "DstSrcCst" group of functions *
162 * *
163 ****************************************************/
164
166
/*
 * DstSrcCst / FLOAT -- main loop body.
 * Loads four floats from src into n_src, advances src by 4 elements, runs
 * loopCode (the actual operation, expected to produce n_dst), stores n_dst to
 * dst and advances dst by 4 elements.
 */
#define NE10_DstSrcCst_MAINLOOP_FLOAT_NEON(loopCode) { \
    n_src = vld1q_f32 ((float32_t*) src); /* fetch four values */ \
    src += 4; \
    loopCode; /* four float values are handled per pass */ \
    vst1q_f32 ((float32_t*) dst, n_dst); /* write the results back */ \
    dst += 4; \
}

/*
 * DstSrcCst / FLOAT -- secondary loop body, for counts that are not a multiple
 * of four. One float is loaded into lane 0 of n_tmp_src, n_tmp_cst holds
 * { cst, cst }, loopCode runs, and lane 0 of n_tmp_src is stored to dst.
 * src and dst each advance by one element.
 */
#define NE10_DstSrcCst_SECONDLOOP_FLOAT_NEON(loopCode) { \
    float32x2_t n_tmp_cst = { cst, cst }; /* constant for the NEON operation */ \
    float32x2_t n_tmp_src = { 0.0f, 0.0f }; /* scratch for the lane load/store */ \
    n_tmp_src = vld1_lane_f32 ((float32_t*) src, n_tmp_src, 0); \
    loopCode; \
    vst1_lane_f32 ((float32_t*) dst, n_tmp_src, 0); \
    src++; \
    dst++; \
}

/*
 * DstSrcCst / FLOAT -- complete operation.
 * Declares n_src and n_dst, runs checkPointer, executes loopCode1 while more
 * than (count % 4) items remain, decreasing count by 4 per pass, then runs
 * loopCode2 once for each of the remaining (count % 4) items. Returns res
 * (NE10_OK). "count" must be a modifiable variable in scope.
 */
#define NE10_DstSrcCst_OPERATION_FLOAT_NEON(checkPointer, loopCode1, loopCode2) { \
    ne10_result_t res = NE10_OK; \
    float32x4_t n_src, n_dst; \
    checkPointer; \
    int dif = count % 4; /* 0, 1, 2 or 3 leftover items */ \
    for (; count > dif; count -= 4) { \
        loopCode1; \
    } \
    if (dif != 0) { \
        unsigned int idx; \
        for (idx = 0; idx < dif; idx++) { \
            loopCode2; \
        } \
    } \
    return res; \
}
205
207
/*
 * DstSrcCst / VEC2F -- main loop body.
 * Loads four floats (two 2D vectors) from src into n_src, advances src by 2
 * elements, runs loopCode (expected to produce n_dst), stores n_dst to dst
 * and advances dst by 2 elements.
 */
#define NE10_DstSrcCst_MAINLOOP_VEC2F_NEON(loopCode) { \
    n_src = vld1q_f32 ((float32_t*) src); /* two vectors at once */ \
    src += 2; \
    loopCode; /* actual operation */ \
    vst1q_f32 ((float32_t*) dst, n_dst); /* store back */ \
    dst += 2; \
}

/*
 * DstSrcCst / VEC2F -- leftover item when count is not a multiple of two.
 * Loads two floats from src into n_tmp_src, provides n_tmp_cst built from
 * cst->x and cst->y, runs loopCode and stores n_tmp_src to dst.
 * The pointers are not advanced.
 */
#define NE10_DstSrcCst_SECONDLOOP_VEC2F_NEON(loopCode) { \
    float32x2_t n_tmp_cst = { cst->x, cst->y }; \
    float32x2_t n_tmp_src = vld1_f32 ((float32_t*) src); \
    loopCode; \
    vst1_f32 ((float32_t*) dst, n_tmp_src); \
}

/*
 * DstSrcCst / VEC2F -- complete operation.
 * Builds n_cst = { x, y, x, y } from cst, declares n_src and n_dst, runs
 * checkPointer, executes loopCode1 while more than (count % 2) items remain
 * (count decreases by 2 per pass) and runs loopCode2 once if one item is
 * left over. Returns res (NE10_OK).
 */
#define NE10_DstSrcCst_OPERATION_VEC2F_NEON(checkPointer, loopCode1, loopCode2) { \
    ne10_result_t res = NE10_OK; \
    float32x4_t n_cst = { cst->x, cst->y, cst->x, cst->y }; \
    float32x4_t n_src, n_dst; \
    checkPointer; \
    int dif = count % 2; \
    for (; count > dif; count -= 2) { \
        loopCode1; \
    } \
    if (dif != 0) { \
        loopCode2; \
    } \
    return res; \
}
239
241
/*
 * DstSrcCst / VEC3F -- main loop body.
 * Reads 12 consecutive floats from src as three 4-float registers (n_src1,
 * n_src2, n_src3), runs loopCode (expected to produce n_dst1..n_dst3) and
 * writes 12 floats to dst. The matching OPERATION macro decreases count by 4
 * per pass, i.e. four 3D vectors are handled each time.
 * Pointers are advanced in bytes through char*, because arithmetic on void*
 * is not standard C.
 */
#define NE10_DstSrcCst_MAINLOOP_VEC3F_NEON(loopCode) { \
    n_src1 = vld1q_f32 ((float32_t*) src); \
    src = (void*) ((char*) src + 4 * sizeof (ne10_float32_t)); \
    n_src2 = vld1q_f32 ((float32_t*) src); \
    src = (void*) ((char*) src + 4 * sizeof (ne10_float32_t)); \
    n_src3 = vld1q_f32 ((float32_t*) src); \
    src = (void*) ((char*) src + 4 * sizeof (ne10_float32_t)); \
    loopCode; /* the main loop handles four 3D vectors each time */ \
    vst1q_f32 ((float32_t*) dst, n_dst1); \
    dst = (void*) ((char*) dst + 4 * sizeof (ne10_float32_t)); \
    vst1q_f32 ((float32_t*) dst, n_dst2); \
    dst = (void*) ((char*) dst + 4 * sizeof (ne10_float32_t)); \
    vst1q_f32 ((float32_t*) dst, n_dst3); \
    dst = (void*) ((char*) dst + 4 * sizeof (ne10_float32_t)); \
}

/*
 * DstSrcCst / VEC3F -- secondary loop body, one 3D vector per invocation.
 * Loads x, y, z from src into lane 0 of the three registers of n_tmp_src,
 * provides n_tmp_cst with cst->x, cst->y, cst->z in lane 0 of its registers,
 * runs loopCode and stores lane 0 of n_tmp_src to dst. src and dst advance by
 * one element each.
 */
#define NE10_DstSrcCst_SECONDLOOP_VEC3F_NEON(loopCode) { \
    float32x2x3_t n_tmp_src = FLOAT32_2x3( \
        0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f); \
    float32x2x3_t n_tmp_cst = { (const float32x2_t){cst->x, 0}, \
        (const float32x2_t){cst->y, 0}, (const float32x2_t){cst->z, 0} }; \
    n_tmp_src = vld3_lane_f32 ((float32_t*) src, n_tmp_src, 0); \
    loopCode; /* exceptional cases where the count isn't a multiple of 4 */ \
    vst3_lane_f32 ((float32_t*) dst, n_tmp_src, 0); \
    src++; \
    dst++; \
}

/*
 * DstSrcCst / VEC3F -- complete operation.
 * Builds n_cst1..n_cst3, which hold the x, y, z components of cst repeated
 * across 12 lanes, declares the source and destination registers, runs
 * checkPointer, executes loopCode1 while more than (count % 4) items remain
 * (count decreases by 4 per pass) and then runs loopCode2 once per leftover
 * item. Returns res (NE10_OK).
 */
#define NE10_DstSrcCst_OPERATION_VEC3F_NEON(checkPointer, loopCode1, loopCode2) { \
    ne10_result_t res = NE10_OK; \
    float32x4_t n_cst1 = { cst->x, cst->y, cst->z, cst->x }; \
    float32x4_t n_cst2 = { cst->y, cst->z, cst->x, cst->y }; \
    float32x4_t n_cst3 = { cst->z, cst->x, cst->y, cst->z }; \
    float32x4_t n_src1, n_src2, n_src3; \
    float32x4_t n_dst1, n_dst2, n_dst3; \
    checkPointer; \
    int dif = count % 4; \
    for (; count > dif; count -= 4) { \
        loopCode1; \
    } \
    if ( 0 != dif ) { \
        unsigned int idx; \
        for ( idx = 0 ; idx < dif; idx++ ) { \
            loopCode2; \
        } \
    } \
    return res; \
}
290
292
/*
 * Note that for the VEC4* types we do not need a second loop, as the number
 * of input items is always a multiple of four.
 */

/*
 * DstSrcCst / VEC4F -- main loop body.
 * Loads four floats (one 4D vector) from src into n_src, advances src by one
 * element, runs loopCode (expected to produce n_dst), stores n_dst to dst and
 * advances dst by one element.
 */
#define NE10_DstSrcCst_MAINLOOP_VEC4F_NEON(loopCode) { \
    n_src = vld1q_f32 ((float32_t*) src); \
    src++; \
    loopCode; \
    vst1q_f32 ((float32_t*) dst, n_dst); /* one 4D vector per pass */ \
    dst++; \
}

/*
 * DstSrcCst / VEC4F -- complete operation.
 * Builds n_cst from cst->x, y, z, w, declares n_src and n_dst, runs
 * checkPointer, then executes loopCode once per item, counting "count" down
 * to zero. Returns res (NE10_OK).
 */
#define NE10_DstSrcCst_OPERATION_VEC4F_NEON(checkPointer, loopCode) { \
    ne10_result_t res = NE10_OK; \
    float32x4_t n_cst = { cst->x, cst->y, cst->z, cst->w }; \
    float32x4_t n_src, n_dst; \
    checkPointer; \
    for (; count != 0; count--) { \
        loopCode; \
    } \
    return res; \
}
315
316/****************************************************
317 * *
318 * The "DstAccSrcCst" group of functions *
319 * *
320 ****************************************************/
321
323
/*
 * DstAccSrcCst / FLOAT -- main loop body.
 * Loads four floats from acc into n_acc and four from src into n_src,
 * advances both pointers by 4 elements, runs loopCode (expected to produce
 * n_dst), stores n_dst to dst and advances dst by 4 elements.
 */
#define NE10_DstAccSrcCst_MAINLOOP_FLOAT_NEON(loopCode) { \
    n_acc = vld1q_f32 ((float32_t*) acc); /* four accumulator values */ \
    acc += 4; \
    n_src = vld1q_f32 ((float32_t*) src); /* four source values */ \
    src += 4; \
    loopCode; /* four float values are handled per pass */ \
    vst1q_f32 ((float32_t*) dst, n_dst); /* store the results back */ \
    dst += 4; \
}

/*
 * DstAccSrcCst / FLOAT -- secondary loop body, for counts that are not a
 * multiple of four. One float from acc and one from src are loaded into lane 0
 * of n_tmp_acc and n_tmp_src, n_tmp_cst holds { cst, cst }, loopCode runs, and
 * lane 0 of n_tmp_src is stored to dst. acc, src and dst advance by one.
 */
#define NE10_DstAccSrcCst_SECONDLOOP_FLOAT_NEON(loopCode) { \
    float32x2_t n_tmp_cst = { cst, cst }; /* constant for the NEON operation */ \
    float32x2_t n_tmp_acc = { 0.0f, 0.0f }; /* scratch for the lane load */ \
    float32x2_t n_tmp_src = { 0.0f, 0.0f }; /* scratch for the lane load/store */ \
    n_tmp_acc = vld1_lane_f32 ((float32_t*) acc, n_tmp_acc, 0); \
    n_tmp_src = vld1_lane_f32 ((float32_t*) src, n_tmp_src, 0); \
    loopCode; /* the actual operation is placed here */ \
    vst1_lane_f32 ((float32_t*) dst, n_tmp_src, 0); \
    acc++; \
    src++; \
    dst++; \
}

/* The loop driver is shared with the DstSrcCst group. */
#define NE10_DstAccSrcCst_OPERATION_FLOAT_NEON NE10_DstSrcCst_OPERATION_FLOAT_NEON
350
352
/*
 * DstAccSrcCst / VEC2F -- main loop body.
 * Loads four floats (two 2D vectors) from acc into n_acc and from src into
 * n_src, advances both pointers by 2 elements, runs loopCode (expected to
 * produce n_dst), stores n_dst to dst and advances dst by 2 elements.
 */
#define NE10_DstAccSrcCst_MAINLOOP_VEC2F_NEON(loopCode) { \
    n_acc = vld1q_f32 ((float32_t*) acc); /* two accumulator vectors */ \
    acc += 2; \
    n_src = vld1q_f32 ((float32_t*) src); /* two source vectors */ \
    src += 2; \
    loopCode; /* actual operation */ \
    vst1q_f32 ((float32_t*) dst, n_dst); /* store back */ \
    dst += 2; \
}

/*
 * DstAccSrcCst / VEC2F -- leftover item when count is not a multiple of two.
 * Loads two floats from acc and from src, provides n_tmp_cst built from
 * cst->x and cst->y, runs loopCode and stores n_tmp_src to dst.
 * The pointers are not advanced.
 */
#define NE10_DstAccSrcCst_SECONDLOOP_VEC2F_NEON(loopCode) { \
    float32x2_t n_tmp_cst = { cst->x, cst->y }; \
    float32x2_t n_tmp_acc = vld1_f32 ((float32_t*) acc); \
    float32x2_t n_tmp_src = vld1_f32 ((float32_t*) src); \
    loopCode; \
    vst1_f32 ((float32_t*) dst, n_tmp_src); \
}

/* The loop driver is shared with the DstSrcCst group. */
#define NE10_DstAccSrcCst_OPERATION_VEC2F_NEON NE10_DstSrcCst_OPERATION_VEC2F_NEON
374
376
/*
 * DstAccSrcCst / VEC3F -- main loop body.
 * Reads 12 consecutive floats from acc (n_acc1..n_acc3) and 12 from src
 * (n_src1..n_src3), runs loopCode (expected to produce n_dst1..n_dst3) and
 * writes 12 floats to dst. The aliased OPERATION macro decreases count by 4
 * per pass, i.e. four 3D vectors are handled each time.
 * Pointers are advanced in bytes through char*, because arithmetic on void*
 * is not standard C.
 */
#define NE10_DstAccSrcCst_MAINLOOP_VEC3F_NEON(loopCode) { \
    n_acc1 = vld1q_f32 ((float32_t*) acc); /* load accumulator values */ \
    acc = (void*) ((char*) acc + 4 * sizeof (ne10_float32_t)); \
    n_acc2 = vld1q_f32 ((float32_t*) acc); \
    acc = (void*) ((char*) acc + 4 * sizeof (ne10_float32_t)); \
    n_acc3 = vld1q_f32 ((float32_t*) acc); \
    acc = (void*) ((char*) acc + 4 * sizeof (ne10_float32_t)); \
    n_src1 = vld1q_f32 ((float32_t*) src); /* load source values */ \
    src = (void*) ((char*) src + 4 * sizeof (ne10_float32_t)); \
    n_src2 = vld1q_f32 ((float32_t*) src); \
    src = (void*) ((char*) src + 4 * sizeof (ne10_float32_t)); \
    n_src3 = vld1q_f32 ((float32_t*) src); \
    src = (void*) ((char*) src + 4 * sizeof (ne10_float32_t)); \
    loopCode; /* the main loop handles four 3D vectors each time */ \
    vst1q_f32 ((float32_t*) dst, n_dst1); /* store the results back into memory */ \
    dst = (void*) ((char*) dst + 4 * sizeof (ne10_float32_t)); \
    vst1q_f32 ((float32_t*) dst, n_dst2); \
    dst = (void*) ((char*) dst + 4 * sizeof (ne10_float32_t)); \
    vst1q_f32 ((float32_t*) dst, n_dst3); \
    dst = (void*) ((char*) dst + 4 * sizeof (ne10_float32_t)); \
}

/*
 * DstAccSrcCst / VEC3F -- secondary loop body, one 3D vector per invocation.
 * Loads x, y, z from acc and from src into lane 0 of the registers of
 * n_tmp_acc and n_tmp_src, provides n_tmp_cst with cst->x, cst->y, cst->z in
 * lane 0, runs loopCode and stores lane 0 of n_tmp_src to dst. acc, src and
 * dst advance by one element each.
 */
#define NE10_DstAccSrcCst_SECONDLOOP_VEC3F_NEON(loopCode) { \
    float32x2x3_t n_tmp_acc = FLOAT32_2x3( \
        0.0f, 0.0f, \
        0.0f, 0.0f, \
        0.0f, 0.0f \
    ); \
    float32x2x3_t n_tmp_src = FLOAT32_2x3( \
        0.0f, 0.0f, \
        0.0f, 0.0f, \
        0.0f, 0.0f \
    ); \
    float32x2x3_t n_tmp_cst = { (const float32x2_t){cst->x, 0}, \
                                (const float32x2_t){cst->y, 0}, \
                                (const float32x2_t){cst->z, 0} }; \
    n_tmp_acc = vld3_lane_f32 ((float32_t*) acc, n_tmp_acc, 0); \
    n_tmp_src = vld3_lane_f32 ((float32_t*) src, n_tmp_src, 0); \
    loopCode; /* exceptional cases where the count isn't a multiple of 4 */ \
    vst3_lane_f32 ((float32_t*) dst, n_tmp_src, 0); \
    acc++; \
    src++; \
    dst++; \
}

/* The loop driver is shared with the DstSrcCst group. */
#define NE10_DstAccSrcCst_OPERATION_VEC3F_NEON NE10_DstSrcCst_OPERATION_VEC3F_NEON
423
425
/*
 * DstAccSrcCst / VEC4F -- main loop body.
 * Loads four floats (one 4D vector) from acc into n_acc and from src into
 * n_src, advances both pointers by one element, runs loopCode (expected to
 * produce n_dst), stores n_dst to dst and advances dst by one element.
 */
#define NE10_DstAccSrcCst_MAINLOOP_VEC4F_NEON(loopCode) { \
    n_acc = vld1q_f32 ((float32_t*) acc); \
    acc++; \
    n_src = vld1q_f32 ((float32_t*) src); \
    src++; \
    loopCode; \
    vst1q_f32 ((float32_t*) dst, n_dst); /* one 4D vector per pass */ \
    dst++; \
}

/* The loop driver is shared with the DstSrcCst group. */
#define NE10_DstAccSrcCst_OPERATION_VEC4F_NEON NE10_DstSrcCst_OPERATION_VEC4F_NEON
437
438/****************************************************
439 * *
440 * The "DstCst" group of functions *
441 * *
442 ****************************************************/
443
445
/*
 * DstCst / FLOAT -- main loop body.
 * Nothing is loaded: loopCode runs, then n_cst is stored as four floats at dst
 * and dst advances by 4 elements. n_cst is not declared by the OPERATION
 * macro below, so it has to be provided where the macros are expanded.
 */
#define NE10_DstCst_MAINLOOP_FLOAT_NEON(loopCode) { \
    loopCode; /* the actual operation; four float values per pass */ \
    vst1q_f32 ((float32_t*) dst, n_cst); /* store the results back */ \
    dst += 4; \
}

/*
 * DstCst / FLOAT -- secondary loop body, for counts that are not a multiple of
 * four. Builds n_tmp_cst = { cst, cst }, runs loopCode and stores lane 0 of
 * n_tmp_cst to dst; dst advances by one element.
 */
#define NE10_DstCst_SECONDLOOP_FLOAT_NEON(loopCode) { \
    float32x2_t n_tmp_cst = { cst, cst }; /* constant for the NEON operation */ \
    loopCode; /* the actual operation is placed here */ \
    vst1_lane_f32 ((float32_t*) dst, n_tmp_cst, 0); \
    dst++; \
}

/*
 * DstCst / FLOAT -- complete operation.
 * Runs checkPointer, executes loopCode1 while more than (count % 4) items
 * remain (count decreases by 4 per pass), then runs loopCode2 once for each
 * leftover item. Returns res (NE10_OK).
 */
#define NE10_DstCst_OPERATION_FLOAT_NEON(checkPointer, loopCode1, loopCode2) { \
    ne10_result_t res = NE10_OK; \
    checkPointer; \
    int dif = count % 4; /* 0, 1, 2 or 3 leftover items */ \
    for (; count > dif; count -= 4) { \
        loopCode1; \
    } \
    if (dif != 0) { \
        unsigned int idx; \
        for (idx = 0; idx < dif; idx++) { \
            loopCode2; \
        } \
    } \
    return res; \
}
477
479
480
/*
 * DstCst / VEC2F -- main loop body.
 * Runs loopCode, stores n_cst (four floats, two 2D vectors) to dst and
 * advances dst by 2 elements.
 */
#define NE10_DstCst_MAINLOOP_VEC2F_NEON(loopCode) { \
    loopCode; /* actual operation; two 2D vectors per pass */ \
    vst1q_f32 ((float32_t*) dst, n_cst); /* store back */ \
    dst += 2; \
}

/*
 * DstCst / VEC2F -- leftover item when count is not a multiple of two.
 * Builds n_tmp_cst from cst->x and cst->y, runs loopCode and stores n_tmp_cst
 * to dst. dst is not advanced.
 */
#define NE10_DstCst_SECONDLOOP_VEC2F_NEON(loopCode) { \
    float32x2_t n_tmp_cst = { cst->x, cst->y }; \
    loopCode; \
    vst1_f32 ((float32_t*) dst, n_tmp_cst); \
}

/*
 * DstCst / VEC2F -- complete operation.
 * Builds n_cst = { x, y, x, y } from cst, runs checkPointer, executes
 * loopCode1 while more than (count % 2) items remain (count decreases by 2
 * per pass) and runs loopCode2 once if one item is left over.
 * Returns res (NE10_OK).
 */
#define NE10_DstCst_OPERATION_VEC2F_NEON(checkPointer, loopCode1, loopCode2) { \
    ne10_result_t res = NE10_OK; \
    float32x4_t n_cst = { cst->x, cst->y, cst->x, cst->y }; \
    checkPointer; \
    int dif = count % 2; \
    for (; count > dif; count -= 2) { \
        loopCode1; \
    } \
    if (dif != 0) { \
        loopCode2; \
    } \
    return res; \
}
506
508
/*
 * DstCst / VEC3F -- main loop body.
 * Runs loopCode, then writes n_cst1, n_cst2 and n_cst3 (12 floats in total)
 * to dst. The matching OPERATION macro decreases count by 4 per pass, i.e.
 * four 3D vectors are written each time.
 * dst is advanced in bytes through char*, because arithmetic on void* is not
 * standard C.
 */
#define NE10_DstCst_MAINLOOP_VEC3F_NEON(loopCode) { \
    loopCode; /* the main loop handles four 3D vectors each time */ \
    vst1q_f32 ((float32_t*) dst, n_cst1); \
    dst = (void*) ((char*) dst + 4 * sizeof (ne10_float32_t)); \
    vst1q_f32 ((float32_t*) dst, n_cst2); \
    dst = (void*) ((char*) dst + 4 * sizeof (ne10_float32_t)); \
    vst1q_f32 ((float32_t*) dst, n_cst3); \
    dst = (void*) ((char*) dst + 4 * sizeof (ne10_float32_t)); \
}

/*
 * DstCst / VEC3F -- secondary loop body, one 3D vector per invocation.
 * Builds n_tmp_cst with cst->x, cst->y, cst->z in lane 0 of its three
 * registers, runs loopCode and stores lane 0 of n_tmp_cst to dst; dst advances
 * by one element.
 */
#define NE10_DstCst_SECONDLOOP_VEC3F_NEON(loopCode) { \
    float32x2x3_t n_tmp_cst = { (const float32x2_t){cst->x, 0}, \
        (const float32x2_t){cst->y, 0}, (const float32x2_t){cst->z, 0} }; \
    loopCode; /* exceptional cases where the count isn't a multiple of 4 */ \
    vst3_lane_f32 ((float32_t*) dst, n_tmp_cst, 0); \
    dst++; \
}

/*
 * DstCst / VEC3F -- complete operation.
 * Builds n_cst1..n_cst3, which hold the x, y, z components of cst repeated
 * across 12 lanes, runs checkPointer, executes loopCode1 while more than
 * (count % 4) items remain (count decreases by 4 per pass) and then runs
 * loopCode2 once per leftover item. Returns res (NE10_OK).
 */
#define NE10_DstCst_OPERATION_VEC3F_NEON(checkPointer, loopCode1, loopCode2) { \
    ne10_result_t res = NE10_OK; \
    float32x4_t n_cst1 = { cst->x, cst->y, cst->z, cst->x }; \
    float32x4_t n_cst2 = { cst->y, cst->z, cst->x, cst->y }; \
    float32x4_t n_cst3 = { cst->z, cst->x, cst->y, cst->z }; \
    checkPointer; \
    int dif = count % 4; \
    for (; count > dif; count -= 4) { \
        loopCode1; \
    } \
    if ( 0 != dif ) { \
        unsigned int idx; \
        for ( idx = 0 ; idx < dif; idx++ ) { \
            loopCode2; \
        } \
    } \
    return res; \
}
545
547
/*
 * DstCst / VEC4F -- main loop body.
 * Runs loopCode, stores n_cst (four floats, one 4D vector) to dst and advances
 * dst by one element.
 */
#define NE10_DstCst_MAINLOOP_VEC4F_NEON(loopCode) { \
    loopCode; \
    vst1q_f32 ((float32_t*) dst, n_cst); /* one 4D vector per pass */ \
    dst++; \
}

/*
 * DstCst / VEC4F -- complete operation.
 * Builds n_cst from cst->x, y, z, w, runs checkPointer, then executes loopCode
 * once per item, counting "count" down to zero. Returns res (NE10_OK).
 */
#define NE10_DstCst_OPERATION_VEC4F_NEON(checkPointer, loopCode) { \
    ne10_result_t res = NE10_OK; \
    float32x4_t n_cst = { cst->x, cst->y, cst->z, cst->w }; \
    checkPointer; \
    for (; count != 0; count--) { \
        loopCode; \
    } \
    return res; \
}
563
564/****************************************************
565 * *
566 * The "DstSrc1Src2" group of functions *
567 * *
568 ****************************************************/
569
571
/*
 * DstSrc1Src2 / FLOAT -- main loop body.
 * Loads four floats from src1 into n_src and four from src2 into n_src2,
 * advancing each pointer by 4 elements, runs loopCode (expected to produce
 * n_dst), stores n_dst to dst and advances dst by 4 elements.
 */
#define NE10_DstSrc1Src2_MAINLOOP_FLOAT_NEON(loopCode) { \
    n_src = vld1q_f32 ((float32_t*) src1); /* four values of the first input */ \
    src1 += 4; \
    n_src2 = vld1q_f32 ((float32_t*) src2); /* four values of the second input */ \
    src2 += 4; \
    loopCode; /* the actual operation; four float values per pass */ \
    vst1q_f32 ((float32_t*) dst, n_dst); /* store the results back */ \
    dst += 4; \
}

/*
 * DstSrc1Src2 / FLOAT -- secondary loop body, for counts that are not a
 * multiple of four. One float from src1 and one from src2 are loaded into
 * lane 0 of n_tmp_src and n_tmp_src2, loopCode runs, and lane 0 of n_tmp_src
 * is stored to dst. src1, src2 and dst advance by one element.
 *
 * Each lane load uses its own register as the base vector, so the unused lane
 * of n_tmp_src2 comes from n_tmp_src2 itself rather than from n_tmp_src.
 */
#define NE10_DstSrc1Src2_SECONDLOOP_FLOAT_NEON(loopCode) { \
    float32x2_t n_tmp_src = { 0.0f, 0.0f }; /* scratch for the lane load/store */ \
    float32x2_t n_tmp_src2 = { 0.0f, 0.0f }; \
    n_tmp_src = vld1_lane_f32 ((float32_t*) src1, n_tmp_src, 0); \
    n_tmp_src2 = vld1_lane_f32 ((float32_t*) src2, n_tmp_src2, 0); \
    loopCode; /* the actual operation is placed here */ \
    vst1_lane_f32 ((float32_t*) dst, n_tmp_src, 0); \
    src1++; \
    src2++; \
    dst++; \
}

/* The loop driver is shared with the DstSrcCst group. */
#define NE10_DstSrc1Src2_OPERATION_FLOAT_NEON NE10_DstSrcCst_OPERATION_FLOAT_NEON
597
598/****************************************************
599 * *
600 * The "DstAccSrc1Src2" group of functions *
601 * *
602 ****************************************************/
603
605
/*
 * DstAccSrc1Src2 / FLOAT -- main loop body.
 * Loads four floats each from acc, src1 and src2 into n_acc, n_src and
 * n_src2, advances the three pointers by 4 elements, runs loopCode (expected
 * to produce n_dst), stores n_dst to dst and advances dst by 4 elements.
 */
#define NE10_DstAccSrc1Src2_MAINLOOP_FLOAT_NEON(loopCode) { \
    n_acc = vld1q_f32 ((float32_t*) acc); /* four accumulator values */ \
    acc += 4; \
    n_src = vld1q_f32 ((float32_t*) src1); /* four values of the first input */ \
    src1 += 4; \
    n_src2 = vld1q_f32 ((float32_t*) src2); /* four values of the second input */ \
    src2 += 4; \
    loopCode; /* the actual operation; four float values per pass */ \
    vst1q_f32 ((float32_t*) dst, n_dst); /* store the results back */ \
    dst += 4; \
}

/*
 * DstAccSrc1Src2 / FLOAT -- secondary loop body, for counts that are not a
 * multiple of four. One float each from acc, src1 and src2 is loaded into
 * lane 0 of n_tmp_acc, n_tmp_src and n_tmp_src2, loopCode runs, and lane 0 of
 * n_tmp_src is stored to dst. All four pointers advance by one element.
 */
#define NE10_DstAccSrc1Src2_SECONDLOOP_FLOAT_NEON(loopCode) { \
    float32x2_t n_tmp_acc = { 0.0f, 0.0f }; /* scratch for the lane loads */ \
    float32x2_t n_tmp_src = { 0.0f, 0.0f }; \
    float32x2_t n_tmp_src2 = { 0.0f, 0.0f }; \
    n_tmp_acc = vld1_lane_f32 ((float32_t*) acc, n_tmp_acc, 0); \
    n_tmp_src = vld1_lane_f32 ((float32_t*) src1, n_tmp_src, 0); \
    n_tmp_src2 = vld1_lane_f32 ((float32_t*) src2, n_tmp_src2, 0); \
    loopCode; /* the actual operation is placed here */ \
    vst1_lane_f32 ((float32_t*) dst, n_tmp_src, 0); \
    acc++; \
    src1++; \
    src2++; \
    dst++; \
}

/* The loop driver is shared with the DstAccSrcCst group. */
#define NE10_DstAccSrc1Src2_OPERATION_FLOAT_NEON NE10_DstAccSrcCst_OPERATION_FLOAT_NEON
636
637/****************************************************
638 * *
639 * The "DstSrc" group of functions *
640 * *
641 ****************************************************/
642
644
/*
 * DstSrc / FLOAT -- all three pieces are plain aliases of the DstSrcCst
 * FLOAT macros.
 */
#define NE10_DstSrc_MAINLOOP_FLOAT_NEON   NE10_DstSrcCst_MAINLOOP_FLOAT_NEON
#define NE10_DstSrc_SECONDLOOP_FLOAT_NEON NE10_DstSrcCst_SECONDLOOP_FLOAT_NEON
#define NE10_DstSrc_OPERATION_FLOAT_NEON  NE10_DstSrcCst_OPERATION_FLOAT_NEON
650
652
/*
 * DstSrc / VEC2F -- main loop body.
 * Loads from src into n_src with vld2_f32, advances src by 2 elements and
 * runs loopCode. Storing the results and advancing dst is left to loopCode.
 */
#define NE10_DstSrc_MAINLOOP_VEC2F_NEON(loopCode) { \
    n_src = vld2_f32 ((float32_t*) src); /* two vectors at once */ \
    src += 2; \
    loopCode; /* actual operation, including the store and the dst increment */ \
}

/*
 * DstSrc / VEC2F -- leftover item when count is not a multiple of two.
 * Only runs loopCode, which is responsible for storing its own results.
 */
#define NE10_DstSrc_SECONDLOOP_VEC2F_NEON(loopCode) { \
    loopCode; \
}

/*
 * DstSrc / VEC2F -- complete operation.
 * Declares n_src (float32x2x2_t) and n_dst (float32x2_t), runs checkPointer,
 * executes loopCode1 while more than (count % 2) items remain (count
 * decreases by 2 per pass) and runs loopCode2 once if one item is left over.
 * Returns res (NE10_OK).
 */
#define NE10_DstSrc_OPERATION_VEC2F_NEON(checkPointer, loopCode1, loopCode2) { \
    ne10_result_t res = NE10_OK; \
    float32x2x2_t n_src; \
    float32x2_t n_dst; \
    checkPointer; \
    int dif = count % 2; \
    for (; count > dif; count -= 2) { \
        loopCode1; \
    } \
    if (dif != 0) { \
        loopCode2; \
    } \
    return res; \
}
679
681
/*
 * DstSrc / VEC3F -- main loop body.
 * Loads 12 floats (four 3D vectors) from src into n_src with vld3q_f32,
 * advances src by 12 floats and runs loopCode. Storing the results and
 * advancing dst is left to loopCode.
 * src is advanced in bytes through char*, because arithmetic on void* is not
 * standard C.
 */
#define NE10_DstSrc_MAINLOOP_VEC3F_NEON(loopCode) { \
    n_src = vld3q_f32 ((float32_t*) src); \
    src = (void*) ((char*) src + 12 * sizeof (ne10_float32_t)); \
    loopCode; /* the main loop handles four 3D vectors each time */ \
}

/*
 * DstSrc / VEC3F -- secondary loop body for counts that are not a multiple of
 * four. Only runs loopCode, which is responsible for storing its own results.
 */
#define NE10_DstSrc_SECONDLOOP_VEC3F_NEON(loopCode) { \
    loopCode; \
}

/*
 * DstSrc / VEC3F -- complete operation.
 * Declares n_src (float32x4x3_t) and n_dst (float32x4_t), runs checkPointer,
 * executes loopCode1 while more than (count % 4) items remain (count
 * decreases by 4 per pass) and then runs loopCode2 once per leftover item.
 * Returns res (NE10_OK).
 */
#define NE10_DstSrc_OPERATION_VEC3F_NEON(checkPointer, loopCode1, loopCode2) { \
    ne10_result_t res = NE10_OK; \
    float32x4x3_t n_src; \
    float32x4_t n_dst; \
    checkPointer; \
    int dif = count % 4; \
    for (; count > dif; count -= 4) { \
        loopCode1; \
    } \
    if ( 0 != dif ) { \
        unsigned int idx; \
        for ( idx = 0 ; idx < dif; idx++ ) { \
            loopCode2; \
        } \
    } \
    return res; \
}
711
713
/*
 * Note that for the VEC4* types we do not need a second loop, as the number
 * of input items is always a multiple of four.
 */

/*
 * DstSrc / VEC4F -- main loop body.
 * Loads four floats (one 4D vector) from src into n_src, advances src by one
 * element and runs loopCode. Storing the results and advancing dst is left to
 * loopCode.
 */
#define NE10_DstSrc_MAINLOOP_VEC4F_NEON(loopCode) { \
    n_src = vld1q_f32 ((float32_t*) src); \
    src++; \
    loopCode; /* includes the store and the dst increment */ \
}

/*
 * DstSrc / VEC4F -- complete operation.
 * Declares n_src, runs checkPointer, then executes loopCode once per item,
 * counting "count" down to zero. Returns res (NE10_OK).
 */
#define NE10_DstSrc_OPERATION_VEC4F_NEON(checkPointer, loopCode) { \
    ne10_result_t res = NE10_OK; \
    float32x4_t n_src; \
    checkPointer; \
    for (; count != 0; count--) { \
        loopCode; \
    } \
    return res; \
}
733