Project Ne10
An Open Optimized Software Library Project for the ARM Architecture
Loading...
Searching...
No Matches
NE10_divc.neon.c
1/*
2 * Copyright 2011-15 ARM Limited and Contributors.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 * * Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of ARM Limited nor the
13 * names of its contributors may be used to endorse or promote products
14 * derived from this software without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 * DISCLAIMED. IN NO EVENT SHALL ARM LIMITED AND CONTRIBUTORS BE LIABLE FOR ANY
20 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28/*
29 * NE10 Library : math/NE10_divc.neon.c
30 */
31
32#include "NE10_types.h"
33#include "macros.h"
34
35#include <assert.h>
36#include <arm_neon.h>
37
38#include <stdio.h>
39#include <stdlib.h>
40
41ne10_result_t ne10_divc_float_neon (ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count)
42{
43 NE10_XC_OPERATION_FLOAT_NEON
44 (
45 /* a single division operation */
46 float32x4_t rec = vrecpeq_f32 (n_cst);
47 rec = vmulq_f32 (vrecpsq_f32 (n_cst, rec), rec);
48 rec = vmulq_f32 (vrecpsq_f32 (n_cst, rec), rec);
49 n_dst = vmulq_f32 (n_src , rec);
50 ,
51 /* a single division operation */
52 float32x2_t rec = vrecpe_f32 (n_tmp_cst);
53 rec = vmul_f32 (vrecps_f32 (n_tmp_cst, rec), rec);
54 rec = vmul_f32 (vrecps_f32 (n_tmp_cst, rec), rec);
55 n_tmp_src = vmul_f32 (n_tmp_src, rec);
56 );
57}
58
59ne10_result_t ne10_divc_vec2f_neon (ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count)
60{
61 NE10_XC_OPERATION_VEC2F_NEON
62 (
63 /* a single division operation */
64 float32x4_t rec = vrecpeq_f32 (n_cst);
65 rec = vmulq_f32 (vrecpsq_f32 (n_cst, rec), rec);
66 rec = vmulq_f32 (vrecpsq_f32 (n_cst, rec), rec);
67 n_dst = vmulq_f32 (n_src , rec);
68 ,
69 /* a single division operation */
70 float32x2_t rec = vrecpe_f32 (n_tmp_cst);
71 rec = vmul_f32 (vrecps_f32 (n_tmp_cst, rec), rec);
72 rec = vmul_f32 (vrecps_f32 (n_tmp_cst, rec), rec);
73 n_tmp_src = vmul_f32 (n_tmp_src, rec);
74 );
75}
76
77ne10_result_t ne10_divc_vec3f_neon (ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count)
78{
79 NE10_XC_OPERATION_VEC3F_NEON
80 (
81 /* three division operations */
82 float32x4_t rec = vrecpeq_f32 (n_cst1);
83 rec = vmulq_f32 (vrecpsq_f32 (n_cst1, rec), rec);
84 rec = vmulq_f32 (vrecpsq_f32 (n_cst1, rec), rec);
85 n_dst1 = vmulq_f32 (n_src1 , rec);
86
87 rec = vrecpeq_f32 (n_cst2);
88 rec = vmulq_f32 (vrecpsq_f32 (n_cst2, rec), rec);
89 rec = vmulq_f32 (vrecpsq_f32 (n_cst2, rec), rec);
90 n_dst2 = vmulq_f32 (n_src2 , rec);
91
92 rec = vrecpeq_f32 (n_cst3);
93 rec = vmulq_f32 (vrecpsq_f32 (n_cst3, rec), rec);
94 rec = vmulq_f32 (vrecpsq_f32 (n_cst3, rec), rec);
95 n_dst3 = vmulq_f32 (n_src3 , rec);
96 ,
97 /* three division operations */
98 float32x2_t rec = vrecpe_f32 (n_tmp_cst.val[0]);
99 rec = vmul_f32 (vrecps_f32 (n_tmp_cst.val[0], rec), rec);
100 rec = vmul_f32 (vrecps_f32 (n_tmp_cst.val[0], rec), rec);
101 n_tmp_src.val[0] = vmul_f32 (n_tmp_src.val[0] , rec);
102
103 rec = vrecpe_f32 (n_tmp_cst.val[1]);
104 rec = vmul_f32 (vrecps_f32 (n_tmp_cst.val[1], rec), rec);
105 rec = vmul_f32 (vrecps_f32 (n_tmp_cst.val[1], rec), rec);
106 n_tmp_src.val[1] = vmul_f32 (n_tmp_src.val[1] , rec);
107
108 rec = vrecpe_f32 (n_tmp_cst.val[2]);
109 rec = vmul_f32 (vrecps_f32 (n_tmp_cst.val[2], rec), rec);
110 rec = vmul_f32 (vrecps_f32 (n_tmp_cst.val[2], rec), rec);
111 n_tmp_src.val[2] = vmul_f32 (n_tmp_src.val[2] , rec);
112 );
113}
114
115ne10_result_t ne10_divc_vec4f_neon (ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count)
116{
117 NE10_XC_OPERATION_VEC4F_NEON
118 (
119 /* a single division operation */
120 float32x4_t rec = vrecpeq_f32 (n_cst);
121 rec = vmulq_f32 (vrecpsq_f32 (n_cst, rec), rec);
122 rec = vmulq_f32 (vrecpsq_f32 (n_cst, rec), rec);
123 n_dst = vmulq_f32 (n_src , rec);
124 );
125}
a 2-tuple of ne10_float32_t values.
Definition NE10_types.h:88
a 3-tuple of ne10_float32_t values.
Definition NE10_types.h:97
a 4-tuple of ne10_float32_t values.
Definition NE10_types.h:107