ui32 width, bool even)
float* dpl = even ? ldst->f32 : hdst->f32;
float* dph = even ? hdst->f32 : ldst->f32;
float* sp = src->f32;
float* hp = hdst->f32, * lp = ldst->f32;
ui32 l_width = (width + (even ? 1 : 0)) >> 1;
ui32 h_width = (width + (even ? 0 : 1)) >> 1;
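// Forward lifting: the steps are applied from the last step down to the first;
// each pass updates one subband in place from its neighbor,
//   dp[i] += a * (sp[i] + sp[i +/- 1])
// and the two buffers (and their widths) are swapped after every step.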
for (ui32 j = num_steps; j > 0; --j)
  lp[l_width] = lp[l_width - 1];
  const float* sp = lp;
  int i = (int)h_width;
  v128_t f = wasm_f32x4_splat(a);
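  // first variant: the second neighbor is sp[i + 1]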
  for (; i > 0; i -= 4, sp += 4, dp += 4)
    v128_t m = wasm_v128_load(sp);
    v128_t n = wasm_v128_load(sp + 1);
    v128_t p = wasm_v128_load(dp);
    p = wasm_f32x4_add(p, wasm_f32x4_mul(f, wasm_f32x4_add(m, n)));
    wasm_v128_store(dp, p);
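  // second variant: the second neighbor is sp[i - 1]; which variant runs is
  // selected by the 'even' flag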
  for (; i > 0; i -= 4, sp += 4, dp += 4)
    v128_t m = wasm_v128_load(sp);
    v128_t n = wasm_v128_load(sp - 1);
    v128_t p = wasm_v128_load(dp);
    p = wasm_f32x4_add(p, wasm_f32x4_mul(f, wasm_f32x4_add(m, n)));
    wasm_v128_store(dp, p);
  float* t = lp; lp = hp; hp = t;
  ui32 w = l_width; l_width = h_width; h_width = w;
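// After the lifting steps, the irreversible transform applies the subband
// gain: one band is scaled by K (as reported by atk->get_K()) and the other
// by 1/K.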
float K = atk->get_K();
float K_inv = 1.0f / K;
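// width == 1 special case: the lone sample goes to the low-pass band as-is
// when 'even', otherwise it goes to the high-pass band doubled.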
ldst->f32[0] = src->f32[0];
hdst->f32[0] = src->f32[0] * 2.0f;
ui32 width, bool even)
float* oth = hsrc->f32, * aug = lsrc->f32;
ui32 aug_width = (width + (even ? 1 : 0)) >> 1;
ui32 oth_width = (width + (even ? 0 : 1)) >> 1;
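// Synthesis first undoes the subband gain, multiplying one band by K and the
// other by 1/K (K comes from atk->get_K()), before reversing the lifting steps.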
float K = atk->get_K();
float K_inv = 1.0f / K;
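// The lifting steps then run in the opposite order to analysis (j counts up
// here) and subtract the term that analysis added:
//   dp[i] -= a * (sp[i] + sp[i +/- 1])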
for (ui32 j = 0; j < num_steps; ++j)
  oth[oth_width] = oth[oth_width - 1];
  const float* sp = oth;
  int i = (int)aug_width;
  v128_t f = wasm_f32x4_splat(a);
  for ( ; i > 0; i -= 4, sp += 4, dp += 4)
    v128_t m = wasm_v128_load(sp);
    v128_t n = wasm_v128_load(sp - 1);
    v128_t p = wasm_v128_load(dp);
    p = wasm_f32x4_sub(p, wasm_f32x4_mul(f, wasm_f32x4_add(m, n)));
    wasm_v128_store(dp, p);
  for ( ; i > 0; i -= 4, sp += 4, dp += 4)
    v128_t m = wasm_v128_load(sp);
    v128_t n = wasm_v128_load(sp + 1);
    v128_t p = wasm_v128_load(dp);
    p = wasm_f32x4_sub(p, wasm_f32x4_mul(f, wasm_f32x4_add(m, n)));
    wasm_v128_store(dp, p);
  float* t = aug; aug = oth; oth = t;
  ui32 w = aug_width; aug_width = oth_width; oth_width = w;
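// Re-interleave the low-pass and high-pass samples into the output line.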
float* dp = dst->f32;
float* spl = even ? lsrc->f32 : hsrc->f32;
float* sph = even ? hsrc->f32 : lsrc->f32;
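// width == 1 special case: recover the sample from the low-pass band ('even')
// or halve the single high-pass sample.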
dst->f32[0] = lsrc->f32[0];
dst->f32[0] = hsrc->f32[0] * 0.5f;
ui32 repeat, bool synthesis)
v128_t va = wasm_i32x4_splat(a);
v128_t vb = wasm_i32x4_splat(b);
const si32* src1 = sig->i32, * src2 = other->i32;
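// Reversible (integer) vertical lifting step: for every sample,
//   w = (b + a * (src1[i] + src2[i])) >> e
// is subtracted from the destination row during synthesis and added during
// analysis. The branches below use cheaper, equivalent forms of w for common
// constants; this first pair of loops covers a == 1 (e.g. the 5/3 update step).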
for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
  v128_t s1 = wasm_v128_load((v128_t*)src1);
  v128_t s2 = wasm_v128_load((v128_t*)src2);
  v128_t d = wasm_v128_load((v128_t*)dst);
  v128_t t = wasm_i32x4_add(s1, s2);
  v128_t v = wasm_i32x4_add(vb, t);
  v128_t w = wasm_i32x4_shr(v, e);
  d = wasm_i32x4_sub(d, w);
  wasm_v128_store((v128_t*)dst, d);
for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
  v128_t s1 = wasm_v128_load((v128_t*)src1);
  v128_t s2 = wasm_v128_load((v128_t*)src2);
  v128_t d = wasm_v128_load((v128_t*)dst);
  v128_t t = wasm_i32x4_add(s1, s2);
  v128_t v = wasm_i32x4_add(vb, t);
  v128_t w = wasm_i32x4_shr(v, e);
  d = wasm_i32x4_add(d, w);
  wasm_v128_store((v128_t*)dst, d);
else if (a == -1 && b == 1 && e == 1)
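// 5/3 predict constants (a == -1, b == 1, e == 1): w = (s1 + s2) >> 1;
// one loop adds it, the other subtracts it (synthesis vs. analysis).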
for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
  v128_t s1 = wasm_v128_load((v128_t*)src1);
  v128_t s2 = wasm_v128_load((v128_t*)src2);
  v128_t d = wasm_v128_load((v128_t*)dst);
  v128_t t = wasm_i32x4_add(s1, s2);
  v128_t w = wasm_i32x4_shr(t, e);
  d = wasm_i32x4_add(d, w);
  wasm_v128_store((v128_t*)dst, d);
for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
  v128_t s1 = wasm_v128_load((v128_t*)src1);
  v128_t s2 = wasm_v128_load((v128_t*)src2);
  v128_t d = wasm_v128_load((v128_t*)dst);
  v128_t t = wasm_i32x4_add(s1, s2);
  v128_t w = wasm_i32x4_shr(t, e);
  d = wasm_i32x4_sub(d, w);
  wasm_v128_store((v128_t*)dst, d);
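// Other steps with a == -1: w = (b - (s1 + s2)) >> e.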
for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
  v128_t s1 = wasm_v128_load((v128_t*)src1);
  v128_t s2 = wasm_v128_load((v128_t*)src2);
  v128_t d = wasm_v128_load((v128_t*)dst);
  v128_t t = wasm_i32x4_add(s1, s2);
  v128_t v = wasm_i32x4_sub(vb, t);
  v128_t w = wasm_i32x4_shr(v, e);
  d = wasm_i32x4_sub(d, w);
  wasm_v128_store((v128_t*)dst, d);
for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
  v128_t s1 = wasm_v128_load((v128_t*)src1);
  v128_t s2 = wasm_v128_load((v128_t*)src2);
  v128_t d = wasm_v128_load((v128_t*)dst);
  v128_t t = wasm_i32x4_add(s1, s2);
  v128_t v = wasm_i32x4_sub(vb, t);
  v128_t w = wasm_i32x4_shr(v, e);
  d = wasm_i32x4_add(d, w);
  wasm_v128_store((v128_t*)dst, d);
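// General case: w = (b + a * (s1 + s2)) >> e.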
for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
  v128_t s1 = wasm_v128_load((v128_t*)src1);
  v128_t s2 = wasm_v128_load((v128_t*)src2);
  v128_t d = wasm_v128_load((v128_t*)dst);
  v128_t t = wasm_i32x4_add(s1, s2);
  v128_t u = wasm_i32x4_mul(va, t);
  v128_t v = wasm_i32x4_add(vb, u);
  v128_t w = wasm_i32x4_shr(v, e);
  d = wasm_i32x4_sub(d, w);
  wasm_v128_store((v128_t*)dst, d);
for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
  v128_t s1 = wasm_v128_load((v128_t*)src1);
  v128_t s2 = wasm_v128_load((v128_t*)src2);
  v128_t d = wasm_v128_load((v128_t*)dst);
  v128_t t = wasm_i32x4_add(s1, s2);
  v128_t u = wasm_i32x4_mul(va, t);
  v128_t v = wasm_i32x4_add(vb, u);
  v128_t w = wasm_i32x4_shr(v, e);
  d = wasm_i32x4_add(d, w);
  wasm_v128_store((v128_t*)dst, d);
ui32 repeat, bool synthesis)
v128_t va = wasm_i64x2_splat(a);
v128_t vb = wasm_i64x2_splat(b);
const si64* src1 = sig->i64, * src2 = other->i64;
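// 64-bit (si64) variant of the same vertical lifting step: identical case
// analysis (a == 1, 5/3 predict, a == -1, general), but two samples per
// i64x2 vector instead of four.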
for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
  v128_t s1 = wasm_v128_load((v128_t*)src1);
  v128_t s2 = wasm_v128_load((v128_t*)src2);
  v128_t d = wasm_v128_load((v128_t*)dst);
  v128_t t = wasm_i64x2_add(s1, s2);
  v128_t v = wasm_i64x2_add(vb, t);
  v128_t w = wasm_i64x2_shr(v, e);
  d = wasm_i64x2_sub(d, w);
  wasm_v128_store((v128_t*)dst, d);
for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
  v128_t s1 = wasm_v128_load((v128_t*)src1);
  v128_t s2 = wasm_v128_load((v128_t*)src2);
  v128_t d = wasm_v128_load((v128_t*)dst);
  v128_t t = wasm_i64x2_add(s1, s2);
  v128_t v = wasm_i64x2_add(vb, t);
  v128_t w = wasm_i64x2_shr(v, e);
  d = wasm_i64x2_add(d, w);
  wasm_v128_store((v128_t*)dst, d);
else if (a == -1 && b == 1 && e == 1)
for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
  v128_t s1 = wasm_v128_load((v128_t*)src1);
  v128_t s2 = wasm_v128_load((v128_t*)src2);
  v128_t d = wasm_v128_load((v128_t*)dst);
  v128_t t = wasm_i64x2_add(s1, s2);
  v128_t w = wasm_i64x2_shr(t, e);
  d = wasm_i64x2_add(d, w);
  wasm_v128_store((v128_t*)dst, d);
for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
  v128_t s1 = wasm_v128_load((v128_t*)src1);
  v128_t s2 = wasm_v128_load((v128_t*)src2);
  v128_t d = wasm_v128_load((v128_t*)dst);
  v128_t t = wasm_i64x2_add(s1, s2);
  v128_t w = wasm_i64x2_shr(t, e);
  d = wasm_i64x2_sub(d, w);
  wasm_v128_store((v128_t*)dst, d);
for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
  v128_t s1 = wasm_v128_load((v128_t*)src1);
  v128_t s2 = wasm_v128_load((v128_t*)src2);
  v128_t d = wasm_v128_load((v128_t*)dst);
  v128_t t = wasm_i64x2_add(s1, s2);
  v128_t v = wasm_i64x2_sub(vb, t);
  v128_t w = wasm_i64x2_shr(v, e);
  d = wasm_i64x2_sub(d, w);
  wasm_v128_store((v128_t*)dst, d);
for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
  v128_t s1 = wasm_v128_load((v128_t*)src1);
  v128_t s2 = wasm_v128_load((v128_t*)src2);
  v128_t d = wasm_v128_load((v128_t*)dst);
  v128_t t = wasm_i64x2_add(s1, s2);
  v128_t v = wasm_i64x2_sub(vb, t);
  v128_t w = wasm_i64x2_shr(v, e);
  d = wasm_i64x2_add(d, w);
  wasm_v128_store((v128_t*)dst, d);
for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
  v128_t s1 = wasm_v128_load((v128_t*)src1);
  v128_t s2 = wasm_v128_load((v128_t*)src2);
  v128_t d = wasm_v128_load((v128_t*)dst);
  v128_t t = wasm_i64x2_add(s1, s2);
  v128_t u = wasm_i64x2_mul(va, t);
  v128_t v = wasm_i64x2_add(vb, u);
  v128_t w = wasm_i64x2_shr(v, e);
  d = wasm_i64x2_sub(d, w);
  wasm_v128_store((v128_t*)dst, d);
for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
  v128_t s1 = wasm_v128_load((v128_t*)src1);
  v128_t s2 = wasm_v128_load((v128_t*)src2);
  v128_t d = wasm_v128_load((v128_t*)dst);
  v128_t t = wasm_i64x2_add(s1, s2);
  v128_t u = wasm_i64x2_mul(va, t);
  v128_t v = wasm_i64x2_add(vb, u);
  v128_t w = wasm_i64x2_shr(v, e);
  d = wasm_i64x2_add(d, w);
  wasm_v128_store((v128_t*)dst, d);
ui32 width, bool even)
float* dpl = even ? ldst->f32 : hdst->f32;
float* dph = even ? hdst->f32 : ldst->f32;
float* sp = src->f32;
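// The source line is first deinterleaved into the low-pass and high-pass
// buffers; float pointers appear to be used only to move raw 32-bit words,
// while the lifting below works on the si32 views of the same buffers.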
ui32 l_width = (width + (even ? 1 : 0)) >> 1;
ui32 h_width = (width + (even ? 0 : 1)) >> 1;
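// Reversible forward lifting: each step adds or subtracts
//   w = (b + a * (sp[i] + sp[i +/- 1])) >> e
// on the destination subband; the branches below use simpler, equivalent
// forms of w for a == 1, the 5/3 predict constants, and a == -1.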
for (ui32 j = num_steps; j > 0; --j)
  v128_t va = wasm_i32x4_splat(a);
  v128_t vb = wasm_i32x4_splat(b);
  lp[l_width] = lp[l_width - 1];
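  // a == 1 (e.g. the 5/3 update step): w = (b + s1 + s2) >> e, added; the
  // neighbor is sp[i + 1] in one parity and sp[i - 1] in the other ('even').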
    int i = (int)h_width;
    for (; i > 0; i -= 4, sp += 4, dp += 4)
      v128_t s1 = wasm_v128_load((v128_t*)sp);
      v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
      v128_t d = wasm_v128_load((v128_t*)dp);
      v128_t t = wasm_i32x4_add(s1, s2);
      v128_t v = wasm_i32x4_add(vb, t);
      v128_t w = wasm_i32x4_shr(v, e);
      d = wasm_i32x4_add(d, w);
      wasm_v128_store((v128_t*)dp, d);
    for (; i > 0; i -= 4, sp += 4, dp += 4)
      v128_t s1 = wasm_v128_load((v128_t*)sp);
      v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
      v128_t d = wasm_v128_load((v128_t*)dp);
      v128_t t = wasm_i32x4_add(s1, s2);
      v128_t v = wasm_i32x4_add(vb, t);
      v128_t w = wasm_i32x4_shr(v, e);
      d = wasm_i32x4_add(d, w);
      wasm_v128_store((v128_t*)dp, d);
  else if (a == -1 && b == 1 && e == 1)
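  // 5/3 predict constants (a == -1, b == 1, e == 1): w = (s1 + s2) >> 1,
  // subtracted from the destination subband.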
    int i = (int)h_width;
    for (; i > 0; i -= 4, sp += 4, dp += 4)
      v128_t s1 = wasm_v128_load((v128_t*)sp);
      v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
      v128_t d = wasm_v128_load((v128_t*)dp);
      v128_t t = wasm_i32x4_add(s1, s2);
      v128_t w = wasm_i32x4_shr(t, e);
      d = wasm_i32x4_sub(d, w);
      wasm_v128_store((v128_t*)dp, d);
    for (; i > 0; i -= 4, sp += 4, dp += 4)
      v128_t s1 = wasm_v128_load((v128_t*)sp);
      v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
      v128_t d = wasm_v128_load((v128_t*)dp);
      v128_t t = wasm_i32x4_add(s1, s2);
      v128_t w = wasm_i32x4_shr(t, e);
      d = wasm_i32x4_sub(d, w);
      wasm_v128_store((v128_t*)dp, d);
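  // other steps with a == -1: w = (b - (s1 + s2)) >> e, added.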
    int i = (int)h_width;
    for (; i > 0; i -= 4, sp += 4, dp += 4)
      v128_t s1 = wasm_v128_load((v128_t*)sp);
      v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
      v128_t d = wasm_v128_load((v128_t*)dp);
      v128_t t = wasm_i32x4_add(s1, s2);
      v128_t v = wasm_i32x4_sub(vb, t);
      v128_t w = wasm_i32x4_shr(v, e);
      d = wasm_i32x4_add(d, w);
      wasm_v128_store((v128_t*)dp, d);
    for (; i > 0; i -= 4, sp += 4, dp += 4)
      v128_t s1 = wasm_v128_load((v128_t*)sp);
      v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
      v128_t d = wasm_v128_load((v128_t*)dp);
      v128_t t = wasm_i32x4_add(s1, s2);
      v128_t v = wasm_i32x4_sub(vb, t);
      v128_t w = wasm_i32x4_shr(v, e);
      d = wasm_i32x4_add(d, w);
      wasm_v128_store((v128_t*)dp, d);
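  // general case: w = (b + a * (s1 + s2)) >> e, added.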
    int i = (int)h_width;
    for (; i > 0; i -= 4, sp += 4, dp += 4)
      v128_t s1 = wasm_v128_load((v128_t*)sp);
      v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
      v128_t d = wasm_v128_load((v128_t*)dp);
      v128_t t = wasm_i32x4_add(s1, s2);
      v128_t u = wasm_i32x4_mul(va, t);
      v128_t v = wasm_i32x4_add(vb, u);
      v128_t w = wasm_i32x4_shr(v, e);
      d = wasm_i32x4_add(d, w);
      wasm_v128_store((v128_t*)dp, d);
    for (; i > 0; i -= 4, sp += 4, dp += 4)
      v128_t s1 = wasm_v128_load((v128_t*)sp);
      v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
      v128_t d = wasm_v128_load((v128_t*)dp);
      v128_t t = wasm_i32x4_add(s1, s2);
      v128_t u = wasm_i32x4_mul(va, t);
      v128_t v = wasm_i32x4_add(vb, u);
      v128_t w = wasm_i32x4_shr(v, e);
      d = wasm_i32x4_add(d, w);
      wasm_v128_store((v128_t*)dp, d);
  si32* t = lp; lp = hp; hp = t;
  ui32 w = l_width; l_width = h_width; h_width = w;
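// width == 1 special case: the sample is copied to the low-pass band ('even')
// or doubled into the high-pass band.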
ldst->i32[0] = src->i32[0];
hdst->i32[0] = src->i32[0] << 1;
ui32 width, bool even)
double* dpl = (double*)(even ? ldst->p : hdst->p);
double* dph = (double*)(even ? hdst->p : ldst->p);
double* sp = (double*)src->p;
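// 64-bit forward pass: the deinterleave moves 64-bit words through double
// pointers, and the lifting then runs on the si64 views, two samples per
// i64x2 vector; the case analysis mirrors the 32-bit path above.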
ui32 l_width = (width + (even ? 1 : 0)) >> 1;
ui32 h_width = (width + (even ? 0 : 1)) >> 1;
for (ui32 j = num_steps; j > 0; --j)
  v128_t va = wasm_i64x2_splat(a);
  v128_t vb = wasm_i64x2_splat(b);
  lp[l_width] = lp[l_width - 1];
    int i = (int)h_width;
    for (; i > 0; i -= 2, sp += 2, dp += 2)
      v128_t s1 = wasm_v128_load((v128_t*)sp);
      v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
      v128_t d = wasm_v128_load((v128_t*)dp);
      v128_t t = wasm_i64x2_add(s1, s2);
      v128_t v = wasm_i64x2_add(vb, t);
      v128_t w = wasm_i64x2_shr(v, e);
      d = wasm_i64x2_add(d, w);
      wasm_v128_store((v128_t*)dp, d);
    for (; i > 0; i -= 2, sp += 2, dp += 2)
      v128_t s1 = wasm_v128_load((v128_t*)sp);
      v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
      v128_t d = wasm_v128_load((v128_t*)dp);
      v128_t t = wasm_i64x2_add(s1, s2);
      v128_t v = wasm_i64x2_add(vb, t);
      v128_t w = wasm_i64x2_shr(v, e);
      d = wasm_i64x2_add(d, w);
      wasm_v128_store((v128_t*)dp, d);
  else if (a == -1 && b == 1 && e == 1)
    int i = (int)h_width;
    for (; i > 0; i -= 2, sp += 2, dp += 2)
      v128_t s1 = wasm_v128_load((v128_t*)sp);
      v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
      v128_t d = wasm_v128_load((v128_t*)dp);
      v128_t t = wasm_i64x2_add(s1, s2);
      v128_t w = wasm_i64x2_shr(t, e);
      d = wasm_i64x2_sub(d, w);
      wasm_v128_store((v128_t*)dp, d);
    for (; i > 0; i -= 2, sp += 2, dp += 2)
      v128_t s1 = wasm_v128_load((v128_t*)sp);
      v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
      v128_t d = wasm_v128_load((v128_t*)dp);
      v128_t t = wasm_i64x2_add(s1, s2);
      v128_t w = wasm_i64x2_shr(t, e);
      d = wasm_i64x2_sub(d, w);
      wasm_v128_store((v128_t*)dp, d);
    int i = (int)h_width;
    for (; i > 0; i -= 2, sp += 2, dp += 2)
      v128_t s1 = wasm_v128_load((v128_t*)sp);
      v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
      v128_t d = wasm_v128_load((v128_t*)dp);
      v128_t t = wasm_i64x2_add(s1, s2);
      v128_t v = wasm_i64x2_sub(vb, t);
      v128_t w = wasm_i64x2_shr(v, e);
      d = wasm_i64x2_add(d, w);
      wasm_v128_store((v128_t*)dp, d);
    for (; i > 0; i -= 2, sp += 2, dp += 2)
      v128_t s1 = wasm_v128_load((v128_t*)sp);
      v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
      v128_t d = wasm_v128_load((v128_t*)dp);
      v128_t t = wasm_i64x2_add(s1, s2);
      v128_t v = wasm_i64x2_sub(vb, t);
      v128_t w = wasm_i64x2_shr(v, e);
      d = wasm_i64x2_add(d, w);
      wasm_v128_store((v128_t*)dp, d);
    int i = (int)h_width;
    for (; i > 0; i -= 2, sp += 2, dp += 2)
      v128_t s1 = wasm_v128_load((v128_t*)sp);
      v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
      v128_t d = wasm_v128_load((v128_t*)dp);
      v128_t t = wasm_i64x2_add(s1, s2);
      v128_t u = wasm_i64x2_mul(va, t);
      v128_t v = wasm_i64x2_add(vb, u);
      v128_t w = wasm_i64x2_shr(v, e);
      d = wasm_i64x2_add(d, w);
      wasm_v128_store((v128_t*)dp, d);
    for (; i > 0; i -= 2, sp += 2, dp += 2)
      v128_t s1 = wasm_v128_load((v128_t*)sp);
      v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
      v128_t d = wasm_v128_load((v128_t*)dp);
      v128_t t = wasm_i64x2_add(s1, s2);
      v128_t u = wasm_i64x2_mul(va, t);
      v128_t v = wasm_i64x2_add(vb, u);
      v128_t w = wasm_i64x2_shr(v, e);
      d = wasm_i64x2_add(d, w);
      wasm_v128_store((v128_t*)dp, d);
  si64* t = lp; lp = hp; hp = t;
  ui32 w = l_width; l_width = h_width; h_width = w;
ldst->i64[0] = src->i64[0];
hdst->i64[0] = src->i64[0] << 1;
ui32 width, bool even)
ui32 aug_width = (width + (even ? 1 : 0)) >> 1;
ui32 oth_width = (width + (even ? 0 : 1)) >> 1;
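// Reversible synthesis: the steps run in the reverse order of analysis
// (j counts up from 0) and apply each w with the opposite sign, undoing what
// the forward pass did; the same specializations of w are used.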
for (ui32 j = 0; j < num_steps; ++j)
  v128_t va = wasm_i32x4_splat(a);
  v128_t vb = wasm_i32x4_splat(b);
  oth[oth_width] = oth[oth_width - 1];
  const si32* sp = oth;
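  // a == 1 (e.g. the 5/3 update step): w = (b + s1 + s2) >> e, subtracted; the
  // neighbor is sp[i - 1] in one parity and sp[i + 1] in the other ('even').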
    int i = (int)aug_width;
    for (; i > 0; i -= 4, sp += 4, dp += 4)
      v128_t s1 = wasm_v128_load((v128_t*)sp);
      v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
      v128_t d = wasm_v128_load((v128_t*)dp);
      v128_t t = wasm_i32x4_add(s1, s2);
      v128_t v = wasm_i32x4_add(vb, t);
      v128_t w = wasm_i32x4_shr(v, e);
      d = wasm_i32x4_sub(d, w);
      wasm_v128_store((v128_t*)dp, d);
    for (; i > 0; i -= 4, sp += 4, dp += 4)
      v128_t s1 = wasm_v128_load((v128_t*)sp);
      v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
      v128_t d = wasm_v128_load((v128_t*)dp);
      v128_t t = wasm_i32x4_add(s1, s2);
      v128_t v = wasm_i32x4_add(vb, t);
      v128_t w = wasm_i32x4_shr(v, e);
      d = wasm_i32x4_sub(d, w);
      wasm_v128_store((v128_t*)dp, d);
  else if (a == -1 && b == 1 && e == 1)
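  // 5/3 predict constants (a == -1, b == 1, e == 1): w = (s1 + s2) >> 1,
  // added back during synthesis.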
    int i = (int)aug_width;
    for (; i > 0; i -= 4, sp += 4, dp += 4)
      v128_t s1 = wasm_v128_load((v128_t*)sp);
      v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
      v128_t d = wasm_v128_load((v128_t*)dp);
      v128_t t = wasm_i32x4_add(s1, s2);
      v128_t w = wasm_i32x4_shr(t, e);
      d = wasm_i32x4_add(d, w);
      wasm_v128_store((v128_t*)dp, d);
    for (; i > 0; i -= 4, sp += 4, dp += 4)
      v128_t s1 = wasm_v128_load((v128_t*)sp);
      v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
      v128_t d = wasm_v128_load((v128_t*)dp);
      v128_t t = wasm_i32x4_add(s1, s2);
      v128_t w = wasm_i32x4_shr(t, e);
      d = wasm_i32x4_add(d, w);
      wasm_v128_store((v128_t*)dp, d);
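  // other steps with a == -1: w = (b - (s1 + s2)) >> e, subtracted.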
    int i = (int)aug_width;
    for (; i > 0; i -= 4, sp += 4, dp += 4)
      v128_t s1 = wasm_v128_load((v128_t*)sp);
      v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
      v128_t d = wasm_v128_load((v128_t*)dp);
      v128_t t = wasm_i32x4_add(s1, s2);
      v128_t v = wasm_i32x4_sub(vb, t);
      v128_t w = wasm_i32x4_shr(v, e);
      d = wasm_i32x4_sub(d, w);
      wasm_v128_store((v128_t*)dp, d);
    for (; i > 0; i -= 4, sp += 4, dp += 4)
      v128_t s1 = wasm_v128_load((v128_t*)sp);
      v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
      v128_t d = wasm_v128_load((v128_t*)dp);
      v128_t t = wasm_i32x4_add(s1, s2);
      v128_t v = wasm_i32x4_sub(vb, t);
      v128_t w = wasm_i32x4_shr(v, e);
      d = wasm_i32x4_sub(d, w);
      wasm_v128_store((v128_t*)dp, d);
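  // general case: w = (b + a * (s1 + s2)) >> e, subtracted.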
    int i = (int)aug_width;
    for (; i > 0; i -= 4, sp += 4, dp += 4)
      v128_t s1 = wasm_v128_load((v128_t*)sp);
      v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
      v128_t d = wasm_v128_load((v128_t*)dp);
      v128_t t = wasm_i32x4_add(s1, s2);
      v128_t u = wasm_i32x4_mul(va, t);
      v128_t v = wasm_i32x4_add(vb, u);
      v128_t w = wasm_i32x4_shr(v, e);
      d = wasm_i32x4_sub(d, w);
      wasm_v128_store((v128_t*)dp, d);
    for (; i > 0; i -= 4, sp += 4, dp += 4)
      v128_t s1 = wasm_v128_load((v128_t*)sp);
      v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
      v128_t d = wasm_v128_load((v128_t*)dp);
      v128_t t = wasm_i32x4_add(s1, s2);
      v128_t u = wasm_i32x4_mul(va, t);
      v128_t v = wasm_i32x4_add(vb, u);
      v128_t w = wasm_i32x4_shr(v, e);
      d = wasm_i32x4_sub(d, w);
      wasm_v128_store((v128_t*)dp, d);
  si32* t = aug; aug = oth; oth = t;
  ui32 w = aug_width; aug_width = oth_width; oth_width = w;
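// After the steps, the low-pass and high-pass samples are re-interleaved into
// dst; float pointers appear to be used only to move raw 32-bit words.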
float* dp = dst->f32;
float* spl = even ? lsrc->f32 : hsrc->f32;
float* sph = even ? hsrc->f32 : lsrc->f32;
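// width == 1 special case: take the sample from the low-pass band ('even') or
// halve the single high-pass sample.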
dst->i32[0] = lsrc->i32[0];
dst->i32[0] = hsrc->i32[0] >> 1;
ui32 width, bool even)
ui32 aug_width = (width + (even ? 1 : 0)) >> 1;
ui32 oth_width = (width + (even ? 0 : 1)) >> 1;
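// 64-bit synthesis: same structure and case analysis as the 32-bit synthesis
// path above, processing two si64 samples per i64x2 vector.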
for (ui32 j = 0; j < num_steps; ++j)
  v128_t va = wasm_i64x2_splat(a);
  v128_t vb = wasm_i64x2_splat(b);
  oth[oth_width] = oth[oth_width - 1];
  const si64* sp = oth;
    int i = (int)aug_width;
    for (; i > 0; i -= 2, sp += 2, dp += 2)
      v128_t s1 = wasm_v128_load((v128_t*)sp);
      v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
      v128_t d = wasm_v128_load((v128_t*)dp);
      v128_t t = wasm_i64x2_add(s1, s2);
      v128_t v = wasm_i64x2_add(vb, t);
      v128_t w = wasm_i64x2_shr(v, e);
      d = wasm_i64x2_sub(d, w);
      wasm_v128_store((v128_t*)dp, d);
    for (; i > 0; i -= 2, sp += 2, dp += 2)
      v128_t s1 = wasm_v128_load((v128_t*)sp);
      v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
      v128_t d = wasm_v128_load((v128_t*)dp);
      v128_t t = wasm_i64x2_add(s1, s2);
      v128_t v = wasm_i64x2_add(vb, t);
      v128_t w = wasm_i64x2_shr(v, e);
      d = wasm_i64x2_sub(d, w);
      wasm_v128_store((v128_t*)dp, d);
  else if (a == -1 && b == 1 && e == 1)
    int i = (int)aug_width;
    for (; i > 0; i -= 2, sp += 2, dp += 2)
      v128_t s1 = wasm_v128_load((v128_t*)sp);
      v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
      v128_t d = wasm_v128_load((v128_t*)dp);
      v128_t t = wasm_i64x2_add(s1, s2);
      v128_t w = wasm_i64x2_shr(t, e);
      d = wasm_i64x2_add(d, w);
      wasm_v128_store((v128_t*)dp, d);
    for (; i > 0; i -= 2, sp += 2, dp += 2)
      v128_t s1 = wasm_v128_load((v128_t*)sp);
      v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
      v128_t d = wasm_v128_load((v128_t*)dp);
      v128_t t = wasm_i64x2_add(s1, s2);
      v128_t w = wasm_i64x2_shr(t, e);
      d = wasm_i64x2_add(d, w);
      wasm_v128_store((v128_t*)dp, d);
    int i = (int)aug_width;
    for (; i > 0; i -= 2, sp += 2, dp += 2)
      v128_t s1 = wasm_v128_load((v128_t*)sp);
      v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
      v128_t d = wasm_v128_load((v128_t*)dp);
      v128_t t = wasm_i64x2_add(s1, s2);
      v128_t v = wasm_i64x2_sub(vb, t);
      v128_t w = wasm_i64x2_shr(v, e);
      d = wasm_i64x2_sub(d, w);
      wasm_v128_store((v128_t*)dp, d);
    for (; i > 0; i -= 2, sp += 2, dp += 2)
      v128_t s1 = wasm_v128_load((v128_t*)sp);
      v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
      v128_t d = wasm_v128_load((v128_t*)dp);
      v128_t t = wasm_i64x2_add(s1, s2);
      v128_t v = wasm_i64x2_sub(vb, t);
      v128_t w = wasm_i64x2_shr(v, e);
      d = wasm_i64x2_sub(d, w);
      wasm_v128_store((v128_t*)dp, d);
    int i = (int)aug_width;
    for (; i > 0; i -= 2, sp += 2, dp += 2)
      v128_t s1 = wasm_v128_load((v128_t*)sp);
      v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
      v128_t d = wasm_v128_load((v128_t*)dp);
      v128_t t = wasm_i64x2_add(s1, s2);
      v128_t u = wasm_i64x2_mul(va, t);
      v128_t v = wasm_i64x2_add(vb, u);
      v128_t w = wasm_i64x2_shr(v, e);
      d = wasm_i64x2_sub(d, w);
      wasm_v128_store((v128_t*)dp, d);
    for (; i > 0; i -= 2, sp += 2, dp += 2)
      v128_t s1 = wasm_v128_load((v128_t*)sp);
      v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
      v128_t d = wasm_v128_load((v128_t*)dp);
      v128_t t = wasm_i64x2_add(s1, s2);
      v128_t u = wasm_i64x2_mul(va, t);
      v128_t v = wasm_i64x2_add(vb, u);
      v128_t w = wasm_i64x2_shr(v, e);
      d = wasm_i64x2_sub(d, w);
      wasm_v128_store((v128_t*)dp, d);
  si64* t = aug; aug = oth; oth = t;
  ui32 w = aug_width; aug_width = oth_width; oth_width = w;
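// Re-interleave the subbands into dst; double pointers appear to be used only
// to move raw 64-bit words.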
double* dp = (double*)dst->p;
double* spl = (double*)(even ? lsrc->p : hsrc->p);
double* sph = (double*)(even ? hsrc->p : lsrc->p);
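// width == 1 special case: copy from the low-pass band ('even') or halve the
// single high-pass sample.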
dst->i64[0] = lsrc->i64[0];
dst->i64[0] = hsrc->i64[0] >> 1;