Project Ne10
An Open Optimized Software Library Project for the ARM Architecture
NE10_boxfilter.neon.c
/*
 * Copyright 2013-15 ARM Limited and Contributors.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *   * Neither the name of ARM Limited nor the
 *     names of its contributors may be used to endorse or promote products
 *     derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL ARM LIMITED AND CONTRIBUTORS BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * NE10 Library : imgproc/NE10_boxfilter.neon.c
 */

#include "NE10.h"
#include <stdlib.h>
#include <math.h>
#include <stdio.h>
#include <assert.h>
#include <arm_neon.h>

extern void ne10_img_boxfilter_row_border (const ne10_uint8_t *src,
                                           ne10_uint8_t *dst,
                                           ne10_size_t src_sz,
                                           ne10_int32_t src_stride,
                                           ne10_int32_t dst_stride,
                                           ne10_size_t kernel,
                                           ne10_point_t anchor,
                                           ne10_int32_t *border_l_ptr,
                                           ne10_int32_t *border_r_ptr);

extern void ne10_img_boxfilter_col_border (const ne10_uint8_t *src,
                                           ne10_uint8_t *dst,
                                           ne10_size_t src_sz,
                                           ne10_int32_t src_stride,
                                           ne10_int32_t dst_stride,
                                           ne10_size_t kernel,
                                           ne10_point_t anchor,
                                           ne10_int32_t *border_t_ptr,
                                           ne10_int32_t *border_b_ptr);

extern void ne10_img_boxfilter_row_c (const ne10_uint8_t *src,
                                      ne10_uint8_t *dst,
                                      ne10_size_t src_sz,
                                      ne10_int32_t src_stride,
                                      ne10_int32_t dst_stride,
                                      ne10_size_t kernel,
                                      ne10_point_t anchor,
                                      ne10_int32_t border_l,
                                      ne10_int32_t border_r);

extern void ne10_img_boxfilter_col_c (const ne10_uint8_t *src,
                                      ne10_uint8_t *dst,
                                      ne10_size_t src_sz,
                                      ne10_int32_t src_stride,
                                      ne10_int32_t dst_stride,
                                      ne10_size_t kernel,
                                      ne10_point_t anchor,
                                      ne10_int32_t border_t,
                                      ne10_int32_t border_b);

/* An RGBA image has 4 channels per pixel */
#define RGBA_CH 4
/* DIV_SHIFT replaces a constant division with a multiply and a right shift */
#define DIV_SHIFT 15

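/* Illustrative worked example of the multiply-and-shift division used below
 * (the numbers are examples, not taken from the original source): for
 * kernel.x == 3, mul = (1 << 15) / 3 = 10922, and a window sum of 64 gives
 * (64 * 10922) >> 15 = 21, matching the integer division 64 / 3. Because mul
 * is rounded down, the result can occasionally be one grey level below the
 * exact average. A side effect of the kernel.x < (1 << 7) guard used below is
 * that the largest possible window sum, 255 * 127 = 32385, still fits in the
 * signed 16-bit lanes used by the NEON code.
 */
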
void ne10_img_boxfilter_row_neon (const ne10_uint8_t *src,
                                  ne10_uint8_t *dst,
                                  ne10_size_t src_sz,
                                  ne10_int32_t src_stride,
                                  ne10_int32_t dst_stride,
                                  ne10_size_t kernel,
                                  ne10_point_t anchor,
                                  ne10_int32_t border_l,
                                  ne10_int32_t border_r)
{
    /* In special cases, fall back to the plain C version of the row filter. */
    if (src_sz.y == 1 || kernel.x >= (1 << 7) || kernel.x == 1)
    {
        return ne10_img_boxfilter_row_c (src,
                                         dst,
                                         src_sz,
                                         src_stride,
                                         dst_stride,
                                         kernel,
                                         anchor,
                                         border_l,
                                         border_r);
    }

    assert (src != dst);
    assert (kernel.x > 0);
    assert ((kernel.x <= src_sz.x) && (kernel.y <= src_sz.y));
    /* If kernel.x were 1, mul would be (1 << 15) = 32768, which overflows
     * when it is cast to ne10_int16_t (INT16_MAX is 32767).
     */
    assert ((src_sz.y > 1) &&
            (kernel.x < (1 << 7)) &&
            (kernel.x > 1));

    ne10_int32_t x, y, k;
    ne10_int16_t mul = (1 << DIV_SHIFT) / kernel.x;
    int16x8_t mul_vec = vdupq_n_s16 (mul);

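    /* Two rows are filtered per iteration so that the 8 signed 16-bit lanes
     * of each NEON register hold one RGBA pixel sum from each row.
     */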
    for (y = 0; y < src_sz.y; y += 2)
    {
        /* When the image height is odd, step back one row at the last
         * iteration so that the final row is paired with the row above it.
         */
        if ((src_sz.y % 2 != 0) && (y == src_sz.y - 1))
            y--;

        const ne10_uint8_t *src_row1 = src + y * src_stride;
        const ne10_uint8_t *src_row2 = src + (y + 1) * src_stride;
        ne10_uint8_t *dst_row1 = dst + y * dst_stride;
        ne10_uint8_t *dst_row2 = dst + (y + 1) * dst_stride;
        ne10_int16_t sum[RGBA_CH * 2];

        for (k = 0; k < RGBA_CH; k++)
        {
            sum[k] = 0;
            sum[k + 4] = 0;

            for (x = 0; x < kernel.x; x++)
            {
                sum[k] += * (src_row1 + x * RGBA_CH + k);
                sum[k + 4] += * (src_row2 + x * RGBA_CH + k);
            }

            *(dst_row1 + border_l * RGBA_CH + k) = sum[k] * mul >>
                DIV_SHIFT;
            *(dst_row2 + border_l * RGBA_CH + k) = sum[k + 4] * mul >>
                DIV_SHIFT;
        }

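        /* prev and next are byte offsets from the current pixel to the sample
         * that drops out of the sliding window (subtracted below) and the
         * sample that enters it (added below).
         */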
        ne10_uint32_t prev = (anchor.x + 1) * RGBA_CH;
        ne10_uint32_t next = (kernel.x - anchor.x - 1) * RGBA_CH;
        const ne10_uint8_t *src_pixel1 = src_row1 + (1 + border_l) * RGBA_CH;
        const ne10_uint8_t *src_pixel2 = src_row2 + (1 + border_l) * RGBA_CH;
        const ne10_uint8_t *src_pixel_end = src_row1 + (src_sz.x - border_r) *
            RGBA_CH;
        ne10_uint8_t *dst_pixel1 = dst_row1 + (1 + border_l) * RGBA_CH;
        ne10_uint8_t *dst_pixel2 = dst_row2 + (1 + border_l) * RGBA_CH;

        int16x8_t sum_vec = vld1q_s16 (sum);
        int16x8_t sum_tmp;
        uint16x8_t sum_vec_u;
        uint8x8_t src_pixel_next_vec, src_pixel_prev_vec;
        uint32x2_t src_pixel_next_tmp_vec, src_pixel_prev_tmp_vec;
        uint32x2_t src_pixel_next_tmp_vec_pre, src_pixel_prev_tmp_vec_pre;
        uint32x2_t dst_pixel_vec;
        uint8x8_t dst_pixel_tmp_vec;

        /* preload */
        src_pixel_next_tmp_vec = vld1_lane_u32 (
            (const ne10_uint32_t*) (src_pixel1 + next),
            src_pixel_next_tmp_vec,
            0);
        src_pixel_prev_tmp_vec = vld1_lane_u32 (
            (const ne10_uint32_t*) (src_pixel1 - prev),
            src_pixel_prev_tmp_vec,
            0);
        src_pixel_next_tmp_vec = vld1_lane_u32 (
            (const ne10_uint32_t*) (src_pixel2 + next),
            src_pixel_next_tmp_vec,
            1);
        src_pixel_prev_tmp_vec = vld1_lane_u32 (
            (const ne10_uint32_t*) (src_pixel2 - prev),
            src_pixel_prev_tmp_vec,
            1);

        /* load two rows to do filtering */
        while (src_pixel1 < src_pixel_end)
        {
            /* preload */
            src_pixel_next_tmp_vec_pre = vld1_lane_u32 (
                (const ne10_uint32_t*) (src_pixel1 + 4 + next),
                src_pixel_next_tmp_vec_pre,
                0);
            src_pixel_prev_tmp_vec_pre = vld1_lane_u32 (
                (const ne10_uint32_t*) (src_pixel1 + 4 - prev),
                src_pixel_prev_tmp_vec_pre,
                0);
            src_pixel_next_tmp_vec_pre = vld1_lane_u32 (
                (const ne10_uint32_t*) (src_pixel2 + 4 + next),
                src_pixel_next_tmp_vec_pre,
                1);
            src_pixel_prev_tmp_vec_pre = vld1_lane_u32 (
                (const ne10_uint32_t*) (src_pixel2 + 4 - prev),
                src_pixel_prev_tmp_vec_pre,
                1);

            src_pixel_prev_vec = vreinterpret_u8_u32 (src_pixel_prev_tmp_vec);
            src_pixel_next_vec = vreinterpret_u8_u32 (src_pixel_next_tmp_vec);

            sum_vec_u = vreinterpretq_u16_s16 (sum_vec);
            sum_vec_u = vaddw_u8 (sum_vec_u, src_pixel_next_vec);
            sum_vec_u = vsubw_u8 (sum_vec_u, src_pixel_prev_vec);
            sum_vec = vreinterpretq_s16_u16 (sum_vec_u);
            /* vqdmulhq_s16 returns the high half of the doubled product,
             * i.e. (sum * mul * 2) >> 16, which equals (sum * mul) >> DIV_SHIFT.
             */
            sum_tmp = vqdmulhq_s16 (sum_vec, mul_vec);
            dst_pixel_tmp_vec = vqmovun_s16 (sum_tmp);
            dst_pixel_vec = vreinterpret_u32_u8 (dst_pixel_tmp_vec);
            vst1_lane_u32 ((ne10_uint32_t *) dst_pixel1, dst_pixel_vec, 0);
            vst1_lane_u32 ((ne10_uint32_t *) dst_pixel2, dst_pixel_vec, 1);

            src_pixel_prev_tmp_vec = src_pixel_prev_tmp_vec_pre;
            src_pixel_next_tmp_vec = src_pixel_next_tmp_vec_pre;

            src_pixel1 += 4;
            src_pixel2 += 4;
            dst_pixel1 += 4;
            dst_pixel2 += 4;
        }
    }
}

void ne10_img_boxfilter_col_neon (const ne10_uint8_t *src,
                                  ne10_uint8_t *dst,
                                  ne10_size_t src_sz,
                                  ne10_int32_t src_stride,
                                  ne10_int32_t dst_stride,
                                  ne10_size_t kernel,
                                  ne10_point_t anchor,
                                  ne10_int32_t border_t,
                                  ne10_int32_t border_b)
{
    /* In special cases, fall back to the plain C version of the column
     * filter. */
    if (kernel.y == 1 || kernel.y >= (1 << 7) || src_sz.x == 1)
    {
        return ne10_img_boxfilter_col_c (src,
                                         dst,
                                         src_sz,
                                         src_stride,
                                         dst_stride,
                                         kernel,
                                         anchor,
                                         border_t,
                                         border_b);
    }

    assert (src != dst);
    assert (kernel.y > 0);
    assert ((kernel.x <= src_sz.x) && (kernel.y <= src_sz.y));
    /* If kernel.y were 1, mul would be (1 << 15) = 32768, which overflows
     * when it is converted to a signed 16-bit value for the NEON multiply
     * (INT16_MAX is 32767).
     */
    assert ((src_sz.x > 1) &&
            (kernel.y < (1 << 7)) &&
            (kernel.y > 1));

    ne10_int32_t x, y, k;
    ne10_uint16_t *sum_row = (ne10_uint16_t *) malloc (src_sz.x *
                                                       RGBA_CH *
                                                       sizeof (ne10_uint16_t));
    ne10_uint16_t mul = (1 << DIV_SHIFT) / kernel.y;

    if (!sum_row)
    {
        fprintf (stderr,
                 "ERROR: buffer allocation fails!\nallocation size: %d\n",
                 (ne10_int32_t) (sizeof (ne10_uint16_t) *
                                 src_sz.x *
                                 RGBA_CH));
        return;
    }

    for (x = 0; x < src_sz.x * RGBA_CH; x++)
    {
        sum_row[x] = 0;
    }

    for (x = 0; x < src_sz.x; x++)
    {
        const ne10_uint8_t *src_col = src + x * RGBA_CH;
        ne10_uint8_t *dst_col = dst + x * RGBA_CH;
        ne10_uint8_t *dst_pixel = dst_col + border_t * dst_stride;
        ne10_uint16_t *sum = sum_row + x * RGBA_CH;

        for (y = 0; y < kernel.y; y++)
        {
            const ne10_uint8_t *src_pixel = src_col + y * src_stride;

            for (k = 0; k < RGBA_CH; k++)
            {
                sum[k] += src_pixel[k];
            }
        }

        for (k = 0; k < RGBA_CH; k++)
        {
            dst_pixel[k] = sum_row[x * RGBA_CH + k] * mul >> DIV_SHIFT;
        }
    }

    const ne10_uint8_t *src_row = src + (1 + border_t) * src_stride;
    const ne10_uint8_t *src_row_end = src + (src_sz.y - border_b) *
        src_stride;
    ne10_uint8_t *dst_row = dst + (1 + border_t) * dst_stride;
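    /* prev and next are byte offsets from the current row to the row whose
     * samples drop out of the vertical sliding sum and the row whose samples
     * enter it.
     */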
    ne10_uint32_t prev = (anchor.y + 1) * src_stride;
    ne10_uint32_t next = (kernel.y - anchor.y - 1) * src_stride;

    uint16x8_t sum_vec, sum_vec_pre;
    int16x8_t sum_vec_s;
    uint8x8_t src_pixel_prev_vec, src_pixel_next_vec;
    uint8x8_t src_pixel_prev_vec_pre, src_pixel_next_vec_pre;
    uint8x8_t dst_pixel_vec;

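    /* When the image width is odd, the trailing column is handled by a second
     * pass over the pair of columns at x = src_sz.x - 2, so the initial sums
     * of column src_sz.x - 2 are saved here and restored before that pass.
     */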
    ne10_uint16_t sum_val_backup[RGBA_CH];
    ne10_uint32_t src_sz_x_adjust = src_sz.x;
    int16x8_t mul_vec = vdupq_n_s16 (mul);

    if (src_sz.x % 2 != 0)
    {
        for (k = 0; k < RGBA_CH; k++)
            sum_val_backup[k] = sum_row[ (src_sz.x - 2) * RGBA_CH + k];
        src_sz_x_adjust = src_sz.x - 1;
    }

    /* The column filter walks the image row by row, which is more cache
     * friendly than walking it column by column.
     */
    while (src_row < src_row_end)
    {
        /* preload */
        const ne10_uint8_t *src_pixel = src_row;
        ne10_uint8_t *dst_pixel = dst_row;
        src_pixel_prev_vec = vld1_u8 (src_pixel - prev);
        src_pixel_next_vec = vld1_u8 (src_pixel + next);
        ne10_uint16_t *sum, *sum_pre;
        sum_vec = vld1q_u16 (sum_row);

        for (x = 0; x < src_sz_x_adjust; x += 2)
        {
            sum = sum_row + x * RGBA_CH;
            sum_pre = sum + 2 * RGBA_CH;
            sum_vec_pre = vld1q_u16 (sum_pre);
            /* preload */
            src_pixel = src_row + (x + 2) * RGBA_CH;
            src_pixel_prev_vec_pre = vld1_u8 (src_pixel - prev);
            src_pixel_next_vec_pre = vld1_u8 (src_pixel + next);

            sum_vec = vaddw_u8 (sum_vec, src_pixel_next_vec);
            sum_vec = vsubw_u8 (sum_vec, src_pixel_prev_vec);
            sum_vec_s = vreinterpretq_s16_u16 (sum_vec);
            /* vqdmulhq_s16 returns the high half of the doubled product,
             * i.e. (sum * mul * 2) >> 16, which equals (sum * mul) >> DIV_SHIFT.
             */
            sum_vec_s = vqdmulhq_s16 (sum_vec_s, mul_vec);
            dst_pixel_vec = vqmovun_s16 (sum_vec_s);
            dst_pixel = dst_row + x * RGBA_CH;
            vst1_u8 (dst_pixel, dst_pixel_vec);
            vst1q_u16 (sum, sum_vec);

            src_pixel_next_vec = src_pixel_next_vec_pre;
            src_pixel_prev_vec = src_pixel_prev_vec_pre;
            sum_vec = sum_vec_pre;
        }
        src_row += src_stride;
        dst_row += dst_stride;
    }

    if (src_sz.x % 2 != 0)
    {
        for (k = 0; k < RGBA_CH; k++)
            sum_row[ (src_sz.x - 2) * RGBA_CH + k] = sum_val_backup[k];

        src_row = src + (1 + border_t) * src_stride;
        dst_row = dst + (1 + border_t) * dst_stride;
        /* step back one column */
        x = src_sz.x - 2;
        sum_vec = vld1q_u16 (sum_row + x * RGBA_CH);

        while (src_row < src_row_end)
        {
            const ne10_uint8_t *src_pixel = src_row + x * RGBA_CH;
            ne10_uint8_t *dst_pixel = dst_row + x * RGBA_CH;
            src_pixel_prev_vec = vld1_u8 (src_pixel - prev);
            src_pixel_next_vec = vld1_u8 (src_pixel + next);
            sum_vec = vaddw_u8 (sum_vec, src_pixel_next_vec);
            sum_vec = vsubw_u8 (sum_vec, src_pixel_prev_vec);
            sum_vec_s = vreinterpretq_s16_u16 (sum_vec);
            /* vqdmulhq_s16 returns the high half of the doubled product,
             * i.e. (sum * mul * 2) >> 16, which equals (sum * mul) >> DIV_SHIFT.
             */
            sum_vec_s = vqdmulhq_s16 (sum_vec_s, mul_vec);
            dst_pixel_vec = vqmovun_s16 (sum_vec_s);
            vst1_u8 (dst_pixel, dst_pixel_vec);

            src_row += src_stride;
            dst_row += dst_stride;
        }
    }

    free (sum_row);
}

void ne10_img_boxfilter_rgba8888_neon (const ne10_uint8_t *src,
                                       ne10_uint8_t *dst,
                                       ne10_size_t src_sz,
                                       ne10_int32_t src_stride,
                                       ne10_int32_t dst_stride,
                                       ne10_size_t kernel)
{
    assert (src != 0 && dst != 0);
    assert (src_sz.x > 0 && src_sz.y > 0);
    assert (src_stride > 0 && dst_stride > 0);
    assert (kernel.x > 0 && kernel.x <= src_sz.x
            && kernel.y > 0 && kernel.y <= src_sz.y);

    ne10_int32_t border_l, border_r, border_t, border_b;
    ne10_point_t anchor;

    anchor.x = kernel.x / 2;
    anchor.y = kernel.y / 2;

    /* The extra 2 elements here are reserved for the preloads done by the row
     * and column filters.
     */
    ne10_uint32_t mem_bytes = (sizeof (ne10_uint8_t) * src_sz.x * src_sz.y + 2)
                              * RGBA_CH;

    ne10_uint8_t *dst_buf = (ne10_uint8_t *) malloc (mem_bytes);

    if (!dst_buf)
    {
        fprintf (stderr,
                 "ERROR: buffer allocation fails!\nallocation size: %d\n",
                 mem_bytes);
        return;
    }

    ne10_int32_t dst_buf_stride = src_sz.x * RGBA_CH;

    /* compute the row border of the dst image */
    ne10_img_boxfilter_row_border (src,
                                   dst_buf,
                                   src_sz,
                                   src_stride,
                                   dst_buf_stride,
                                   kernel,
                                   anchor,
                                   &border_l,
                                   &border_r);
    /* The box filter is separable, so the row filter and the column filter
     * can be applied sequentially. Here the row pass is applied to the image.
     */
    ne10_img_boxfilter_row_neon (src,
                                 dst_buf,
                                 src_sz,
                                 src_stride,
                                 dst_buf_stride,
                                 kernel,
                                 anchor,
                                 border_l,
                                 border_r);

    /* compute the column border of the dst image,
     * which is based on the previous row filter result
     */
    ne10_img_boxfilter_col_border (dst_buf,
                                   dst,
                                   src_sz,
                                   dst_buf_stride,
                                   dst_stride,
                                   kernel,
                                   anchor,
                                   &border_t,
                                   &border_b);

    /* apply the box filter's column pass to the image */
    ne10_img_boxfilter_col_neon (dst_buf,
                                 dst,
                                 src_sz,
                                 dst_buf_stride,
                                 dst_stride,
                                 kernel,
                                 anchor,
                                 border_t,
                                 border_b);

    free (dst_buf);
}
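
As a usage reference (not part of the library source itself), the following minimal sketch applies a 5x5 box filter to a tightly packed 64x48 RGBA8888 image and calls the NEON variant directly; the image size, kernel size, and strides are illustrative assumptions.

#include "NE10.h"
#include <stdlib.h>

int main (void)
{
    ne10_size_t img_sz;
    ne10_size_t kernel;

    img_sz.x = 64;   /* image width in pixels (example value) */
    img_sz.y = 48;   /* image height in pixels (example value) */
    kernel.x = 5;    /* 5x5 box filter; must not exceed the image size */
    kernel.y = 5;

    /* strides are in bytes; a tightly packed RGBA8888 row is width * 4 */
    ne10_int32_t stride = img_sz.x * 4;

    ne10_uint8_t *src = (ne10_uint8_t *) calloc (img_sz.x * img_sz.y * 4, 1);
    ne10_uint8_t *dst = (ne10_uint8_t *) malloc (img_sz.x * img_sz.y * 4);

    if (!src || !dst)
        return -1;

    ne10_img_boxfilter_rgba8888_neon (src, dst, img_sz, stride, stride, kernel);

    free (src);
    free (dst);
    return 0;
}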