Grok 10.0.5
wasm_256-inl.h
Go to the documentation of this file.
1// Copyright 2021 Google LLC
2// SPDX-License-Identifier: Apache-2.0
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8// http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15
16// 256-bit WASM vectors and operations. Experimental.
17// External include guard in highway.h - see comment there.
18
19// For half-width vectors. Already includes base.h and shared-inl.h.
21
23namespace hwy {
24namespace HWY_NAMESPACE {
25
26template <typename T>
27class Vec256 {
28 public:
29 using PrivateT = T; // only for DFromV
30 static constexpr size_t kPrivateN = 32 / sizeof(T); // only for DFromV
31
32 // Compound assignment. Only usable if there is a corresponding non-member
33 // binary operator overload. For example, only f32 and f64 support division.
35 return *this = (*this * other);
36 }
38 return *this = (*this / other);
39 }
41 return *this = (*this + other);
42 }
44 return *this = (*this - other);
45 }
47 return *this = (*this & other);
48 }
50 return *this = (*this | other);
51 }
53 return *this = (*this ^ other);
54 }
55
58};
59
60template <typename T>
61struct Mask256 {
64};
65
66// ------------------------------ BitCast
67
68template <typename T, typename FromT>
70 const Half<decltype(d)> dh;
71 Vec256<T> ret;
72 ret.v0 = BitCast(dh, v.v0);
73 ret.v1 = BitCast(dh, v.v1);
74 return ret;
75}
76
77// ------------------------------ Zero
78
79template <typename T>
81 const Half<decltype(d)> dh;
82 Vec256<T> ret;
83 ret.v0 = ret.v1 = Zero(dh);
84 return ret;
85}
86
87template <class D>
88using VFromD = decltype(Zero(D()));
89
90// ------------------------------ Set
91
92// Returns a vector/part with all lanes set to "t".
93template <typename T, typename T2>
95 const Half<decltype(d)> dh;
96 Vec256<T> ret;
97 ret.v0 = ret.v1 = Set(dh, static_cast<T>(t));
98 return ret;
99}
100
101template <typename T>
103 const Half<decltype(d)> dh;
104 Vec256<T> ret;
105 ret.v0 = ret.v1 = Undefined(dh);
106 return ret;
107}
108
109template <typename T, typename T2>
110Vec256<T> Iota(const Full256<T> d, const T2 first) {
111 const Half<decltype(d)> dh;
112 Vec256<T> ret;
113 ret.v0 = Iota(dh, first);
114 // NB: for floating types the gap between parts might be a bit uneven.
115 ret.v1 = Iota(dh, AddWithWraparound(hwy::IsFloatTag<T>(),
116 static_cast<T>(first), Lanes(dh)));
117 return ret;
118}
119
120// ================================================== ARITHMETIC
121
122template <typename T>
124 a.v0 += b.v0;
125 a.v1 += b.v1;
126 return a;
127}
128
129template <typename T>
131 a.v0 -= b.v0;
132 a.v1 -= b.v1;
133 return a;
134}
135
136// ------------------------------ SumsOf8
139 ret.v0 = SumsOf8(v.v0);
140 ret.v1 = SumsOf8(v.v1);
141 return ret;
142}
143
144template <typename T>
146 a.v0 = SaturatedAdd(a.v0, b.v0);
147 a.v1 = SaturatedAdd(a.v1, b.v1);
148 return a;
149}
150
151template <typename T>
153 a.v0 = SaturatedSub(a.v0, b.v0);
154 a.v1 = SaturatedSub(a.v1, b.v1);
155 return a;
156}
157
158template <typename T>
160 a.v0 = AverageRound(a.v0, b.v0);
161 a.v1 = AverageRound(a.v1, b.v1);
162 return a;
163}
164
165template <typename T>
167 v.v0 = Abs(v.v0);
168 v.v1 = Abs(v.v1);
169 return v;
170}
171
172// ------------------------------ Shift lanes by constant #bits
173
174template <int kBits, typename T>
176 v.v0 = ShiftLeft<kBits>(v.v0);
177 v.v1 = ShiftLeft<kBits>(v.v1);
178 return v;
179}
180
181template <int kBits, typename T>
183 v.v0 = ShiftRight<kBits>(v.v0);
184 v.v1 = ShiftRight<kBits>(v.v1);
185 return v;
186}
187
188// ------------------------------ RotateRight (ShiftRight, Or)
189template <int kBits, typename T>
191 constexpr size_t kSizeInBits = sizeof(T) * 8;
192 static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
193 if (kBits == 0) return v;
194 return Or(ShiftRight<kBits>(v), ShiftLeft<kSizeInBits - kBits>(v));
195}
196
197// ------------------------------ Shift lanes by same variable #bits
198
199template <typename T>
201 v.v0 = ShiftLeftSame(v.v0, bits);
202 v.v1 = ShiftLeftSame(v.v1, bits);
203 return v;
204}
205
206template <typename T>
208 v.v0 = ShiftRightSame(v.v0, bits);
209 v.v1 = ShiftRightSame(v.v1, bits);
210 return v;
211}
212
213// ------------------------------ Min, Max
214template <typename T>
216 a.v0 = Min(a.v0, b.v0);
217 a.v1 = Min(a.v1, b.v1);
218 return a;
219}
220
221template <typename T>
223 a.v0 = Max(a.v0, b.v0);
224 a.v1 = Max(a.v1, b.v1);
225 return a;
226}
227// ------------------------------ Integer multiplication
228
229template <typename T>
231 a.v0 *= b.v0;
232 a.v1 *= b.v1;
233 return a;
234}
235
236template <typename T>
238 a.v0 = MulHigh(a.v0, b.v0);
239 a.v1 = MulHigh(a.v1, b.v1);
240 return a;
241}
242
243template <typename T>
245 a.v0 = MulFixedPoint15(a.v0, b.v0);
246 a.v1 = MulFixedPoint15(a.v1, b.v1);
247 return a;
248}
249
250// Cannot use MakeWide because that returns uint128_t for uint64_t, but we want
251// uint64_t.
254 ret.v0 = MulEven(a.v0, b.v0);
255 ret.v1 = MulEven(a.v1, b.v1);
256 return ret;
257}
259 Vec256<int64_t> ret;
260 ret.v0 = MulEven(a.v0, b.v0);
261 ret.v1 = MulEven(a.v1, b.v1);
262 return ret;
263}
264
267 ret.v0 = MulEven(a.v0, b.v0);
268 ret.v1 = MulEven(a.v1, b.v1);
269 return ret;
270}
273 ret.v0 = MulOdd(a.v0, b.v0);
274 ret.v1 = MulOdd(a.v1, b.v1);
275 return ret;
276}
277
278// ------------------------------ Negate
279template <typename T>
281 v.v0 = Neg(v.v0);
282 v.v1 = Neg(v.v1);
283 return v;
284}
285
286// ------------------------------ Floating-point division
287template <typename T>
289 a.v0 /= b.v0;
290 a.v1 /= b.v1;
291 return a;
292}
293
294// Approximate reciprocal
296 const Vec256<float> one = Set(Full256<float>(), 1.0f);
297 return one / v;
298}
299
300// Absolute value of difference.
302 return Abs(a - b);
303}
304
305// ------------------------------ Floating-point multiply-add variants
306
307// Returns mul * x + add
309 const Vec256<float> add) {
310 // TODO(eustas): replace, when implemented in WASM.
311 // TODO(eustas): is it wasm_f32x4_qfma?
312 return mul * x + add;
313}
314
315// Returns add - mul * x
317 const Vec256<float> add) {
318 // TODO(eustas): replace, when implemented in WASM.
319 return add - mul * x;
320}
321
322// Returns mul * x - sub
324 const Vec256<float> sub) {
325 // TODO(eustas): replace, when implemented in WASM.
326 // TODO(eustas): is it wasm_f32x4_qfms?
327 return mul * x - sub;
328}
329
330// Returns -mul * x - sub
332 const Vec256<float> sub) {
333 // TODO(eustas): replace, when implemented in WASM.
334 return Neg(mul) * x - sub;
335}
336
337// ------------------------------ Floating-point square root
338
339template <typename T>
341 v.v0 = Sqrt(v.v0);
342 v.v1 = Sqrt(v.v1);
343 return v;
344}
345
346// Approximate reciprocal square root
348 // TODO(eustas): find a cheaper way to calculate this.
349 const Vec256<float> one = Set(Full256<float>(), 1.0f);
350 return one / Sqrt(v);
351}
352
353// ------------------------------ Floating-point rounding
354
355// Toward nearest integer, ties to even
357 v.v0 = Round(v.v0);
358 v.v1 = Round(v.v1);
359 return v;
360}
361
362// Toward zero, aka truncate
364 v.v0 = Trunc(v.v0);
365 v.v1 = Trunc(v.v1);
366 return v;
367}
368
369// Toward +infinity, aka ceiling
371 v.v0 = Ceil(v.v0);
372 v.v1 = Ceil(v.v1);
373 return v;
374}
375
376// Toward -infinity, aka floor
378 v.v0 = Floor(v.v0);
379 v.v1 = Floor(v.v1);
380 return v;
381}
382
383// ------------------------------ Floating-point classification
384
385template <typename T>
387 return v != v;
388}
389
390template <typename T, HWY_IF_FLOAT(T)>
392 const Full256<T> d;
393 const RebindToSigned<decltype(d)> di;
394 const VFromD<decltype(di)> vi = BitCast(di, v);
395 // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
396 return RebindMask(d, Eq(Add(vi, vi), Set(di, hwy::MaxExponentTimes2<T>())));
397}
398
399// Returns whether normal/subnormal/zero.
400template <typename T, HWY_IF_FLOAT(T)>
402 const Full256<T> d;
403 const RebindToUnsigned<decltype(d)> du;
404 const RebindToSigned<decltype(d)> di; // cheaper than unsigned comparison
405 const VFromD<decltype(du)> vu = BitCast(du, v);
406 // 'Shift left' to clear the sign bit, then right so we can compare with the
407 // max exponent (cannot compare with MaxExponentTimes2 directly because it is
408 // negative and non-negative floats would be greater).
409 const VFromD<decltype(di)> exp =
410 BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(Add(vu, vu)));
411 return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField<T>())));
412}
413
414// ================================================== COMPARE
415
416// Comparisons fill a lane with 1-bits if the condition is true, else 0.
417
418template <typename TFrom, typename TTo>
420 static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
421 return Mask256<TTo>{Mask128<TTo>{m.m0.raw}, Mask128<TTo>{m.m1.raw}};
422}
423
424template <typename T>
426 static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
427 return (v & bit) == bit;
428}
429
430template <typename T>
432 Mask256<T> m;
433 m.m0 = operator==(a.v0, b.v0);
434 m.m1 = operator==(a.v1, b.v1);
435 return m;
436}
437
438template <typename T>
440 Mask256<T> m;
441 m.m0 = operator!=(a.v0, b.v0);
442 m.m1 = operator!=(a.v1, b.v1);
443 return m;
444}
445
446template <typename T>
448 Mask256<T> m;
449 m.m0 = operator<(a.v0, b.v0);
450 m.m1 = operator<(a.v1, b.v1);
451 return m;
452}
453
454template <typename T>
456 Mask256<T> m;
457 m.m0 = operator>(a.v0, b.v0);
458 m.m1 = operator>(a.v1, b.v1);
459 return m;
460}
461
462template <typename T>
464 Mask256<T> m;
465 m.m0 = operator<=(a.v0, b.v0);
466 m.m1 = operator<=(a.v1, b.v1);
467 return m;
468}
469
470template <typename T>
472 Mask256<T> m;
473 m.m0 = operator>=(a.v0, b.v0);
474 m.m1 = operator>=(a.v1, b.v1);
475 return m;
476}
477
478// ------------------------------ FirstN (Iota, Lt)
479
480template <typename T>
482 const RebindToSigned<decltype(d)> di; // Signed comparisons may be cheaper.
483 return RebindMask(d, Iota(di, 0) < Set(di, static_cast<MakeSigned<T>>(num)));
484}
485
486// ================================================== LOGICAL
487
488template <typename T>
490 v.v0 = Not(v.v0);
491 v.v1 = Not(v.v1);
492 return v;
493}
494
495template <typename T>
497 a.v0 = And(a.v0, b.v0);
498 a.v1 = And(a.v1, b.v1);
499 return a;
500}
501
502template <typename T>
504 not_mask.v0 = AndNot(not_mask.v0, mask.v0);
505 not_mask.v1 = AndNot(not_mask.v1, mask.v1);
506 return not_mask;
507}
508
509template <typename T>
511 a.v0 = Or(a.v0, b.v0);
512 a.v1 = Or(a.v1, b.v1);
513 return a;
514}
515
516template <typename T>
518 a.v0 = Xor(a.v0, b.v0);
519 a.v1 = Xor(a.v1, b.v1);
520 return a;
521}
522
523template <typename T>
525 return Xor(x1, Xor(x2, x3));
526}
527
528template <typename T>
530 return Or(o1, Or(o2, o3));
531}
532
533template <typename T>
535 return Or(o, And(a1, a2));
536}
537
538template <typename T>
540 return IfThenElse(MaskFromVec(mask), yes, no);
541}
542
543// ------------------------------ Operator overloads (internal-only if float)
544
545template <typename T>
547 return And(a, b);
548}
549
550template <typename T>
552 return Or(a, b);
553}
554
555template <typename T>
557 return Xor(a, b);
558}
559
560// ------------------------------ CopySign
561
562template <typename T>
564 static_assert(IsFloat<T>(), "Only makes sense for floating-point");
565 const auto msb = SignBit(Full256<T>());
566 return Or(AndNot(msb, magn), And(msb, sign));
567}
568
569template <typename T>
571 static_assert(IsFloat<T>(), "Only makes sense for floating-point");
572 return Or(abs, And(SignBit(Full256<T>()), sign));
573}
574
575// ------------------------------ Mask
576
577// Mask and Vec are the same (true = FF..FF).
578template <typename T>
580 Mask256<T> m;
581 m.m0 = MaskFromVec(v.v0);
582 m.m1 = MaskFromVec(v.v1);
583 return m;
584}
585
586template <typename T>
588 const Half<decltype(d)> dh;
589 Vec256<T> v;
590 v.v0 = VecFromMask(dh, m.m0);
591 v.v1 = VecFromMask(dh, m.m1);
592 return v;
593}
594
595// mask ? yes : no
596template <typename T>
598 yes.v0 = IfThenElse(mask.m0, yes.v0, no.v0);
599 yes.v1 = IfThenElse(mask.m1, yes.v1, no.v1);
600 return yes;
601}
602
603// mask ? yes : 0
604template <typename T>
606 return yes & VecFromMask(Full256<T>(), mask);
607}
608
609// mask ? 0 : no
610template <typename T>
612 return AndNot(VecFromMask(Full256<T>(), mask), no);
613}
614
615template <typename T>
617 v.v0 = IfNegativeThenElse(v.v0, yes.v0, no.v0);
618 v.v1 = IfNegativeThenElse(v.v1, yes.v1, no.v1);
619 return v;
620}
621
622template <typename T, HWY_IF_FLOAT(T)>
624 return IfThenZeroElse(v < Zero(Full256<T>()), v);
625}
626
627// ------------------------------ Mask logical
628
629template <typename T>
631 return MaskFromVec(Not(VecFromMask(Full256<T>(), m)));
632}
633
634template <typename T>
636 const Full256<T> d;
637 return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b)));
638}
639
640template <typename T>
642 const Full256<T> d;
643 return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b)));
644}
645
646template <typename T>
648 const Full256<T> d;
649 return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b)));
650}
651
652template <typename T>
654 const Full256<T> d;
655 return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
656}
657
658template <typename T>
660 const Full256<T> d;
661 return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b))));
662}
663
664// ------------------------------ Shl (BroadcastSignBit, IfThenElse)
665template <typename T>
667 v.v0 = operator<<(v.v0, bits.v0);
668 v.v1 = operator<<(v.v1, bits.v1);
669 return v;
670}
671
672// ------------------------------ Shr (BroadcastSignBit, IfThenElse)
673template <typename T>
675 v.v0 = operator>>(v.v0, bits.v0);
676 v.v1 = operator>>(v.v1, bits.v1);
677 return v;
678}
679
680// ------------------------------ BroadcastSignBit (compare, VecFromMask)
681
682template <typename T, HWY_IF_NOT_LANE_SIZE(T, 1)>
684 return ShiftRight<sizeof(T) * 8 - 1>(v);
685}
687 const Full256<int8_t> d;
688 return VecFromMask(d, v < Zero(d));
689}
690
691// ================================================== MEMORY
692
693// ------------------------------ Load
694
695template <typename T>
697 const Half<decltype(d)> dh;
698 Vec256<T> ret;
699 ret.v0 = Load(dh, aligned);
700 ret.v1 = Load(dh, aligned + Lanes(dh));
701 return ret;
702}
703
704template <typename T>
706 const T* HWY_RESTRICT aligned) {
707 return IfThenElseZero(m, Load(d, aligned));
708}
709
710// LoadU == Load.
711template <typename T>
713 return Load(d, p);
714}
715
716template <typename T>
718 const Half<decltype(d)> dh;
719 Vec256<T> ret;
720 ret.v0 = ret.v1 = Load(dh, p);
721 return ret;
722}
723
724// ------------------------------ Store
725
726template <typename T>
728 const Half<decltype(d)> dh;
729 Store(v.v0, dh, aligned);
730 Store(v.v1, dh, aligned + Lanes(dh));
731}
732
733// StoreU == Store.
734template <typename T>
736 Store(v, d, p);
737}
738
739template <typename T>
741 T* HWY_RESTRICT p) {
742 StoreU(IfThenElse(m, v, LoadU(d, p)), d, p);
743}
744
745// ------------------------------ Stream
746template <typename T>
748 // Same as aligned stores.
749 Store(v, d, aligned);
750}
751
752// ------------------------------ Scatter (Store)
753
754template <typename T, typename Offset>
756 const Vec256<Offset> offset) {
757 constexpr size_t N = 32 / sizeof(T);
758 static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
759
760 alignas(32) T lanes[N];
761 Store(v, d, lanes);
762
763 alignas(32) Offset offset_lanes[N];
764 Store(offset, Full256<Offset>(), offset_lanes);
765
766 uint8_t* base_bytes = reinterpret_cast<uint8_t*>(base);
767 for (size_t i = 0; i < N; ++i) {
768 CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]);
769 }
770}
771
772template <typename T, typename Index>
774 const Vec256<Index> index) {
775 constexpr size_t N = 32 / sizeof(T);
776 static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
777
778 alignas(32) T lanes[N];
779 Store(v, d, lanes);
780
781 alignas(32) Index index_lanes[N];
782 Store(index, Full256<Index>(), index_lanes);
783
784 for (size_t i = 0; i < N; ++i) {
785 base[index_lanes[i]] = lanes[i];
786 }
787}
788
789// ------------------------------ Gather (Load/Store)
790
791template <typename T, typename Offset>
793 const Vec256<Offset> offset) {
794 constexpr size_t N = 32 / sizeof(T);
795 static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
796
797 alignas(32) Offset offset_lanes[N];
798 Store(offset, Full256<Offset>(), offset_lanes);
799
800 alignas(32) T lanes[N];
801 const uint8_t* base_bytes = reinterpret_cast<const uint8_t*>(base);
802 for (size_t i = 0; i < N; ++i) {
803 CopyBytes<sizeof(T)>(base_bytes + offset_lanes[i], &lanes[i]);
804 }
805 return Load(d, lanes);
806}
807
808template <typename T, typename Index>
810 const Vec256<Index> index) {
811 constexpr size_t N = 32 / sizeof(T);
812 static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
813
814 alignas(32) Index index_lanes[N];
815 Store(index, Full256<Index>(), index_lanes);
816
817 alignas(32) T lanes[N];
818 for (size_t i = 0; i < N; ++i) {
819 lanes[i] = base[index_lanes[i]];
820 }
821 return Load(d, lanes);
822}
823
824// ================================================== SWIZZLE
825
826// ------------------------------ ExtractLane
827template <typename T>
828HWY_API T ExtractLane(const Vec256<T> v, size_t i) {
829 alignas(32) T lanes[32 / sizeof(T)];
830 Store(v, Full256<T>(), lanes);
831 return lanes[i];
832}
833
834// ------------------------------ InsertLane
835template <typename T>
836HWY_API Vec256<T> InsertLane(const Vec256<T> v, size_t i, T t) {
838 alignas(32) T lanes[32 / sizeof(T)];
839 Store(v, d, lanes);
840 lanes[i] = t;
841 return Load(d, lanes);
842}
843
844// ------------------------------ LowerHalf
845
846template <typename T>
848 return v.v0;
849}
850
851template <typename T>
853 return v.v0;
854}
855
856// ------------------------------ GetLane (LowerHalf)
857template <typename T>
859 return GetLane(LowerHalf(v));
860}
861
862// ------------------------------ ShiftLeftBytes
863
864template <int kBytes, typename T>
866 const Half<decltype(d)> dh;
867 v.v0 = ShiftLeftBytes<kBytes>(dh, v.v0);
868 v.v1 = ShiftLeftBytes<kBytes>(dh, v.v1);
869 return v;
870}
871
872template <int kBytes, typename T>
874 return ShiftLeftBytes<kBytes>(Full256<T>(), v);
875}
876
877// ------------------------------ ShiftLeftLanes
878
879template <int kLanes, typename T>
881 const Repartition<uint8_t, decltype(d)> d8;
882 return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
883}
884
885template <int kLanes, typename T>
887 return ShiftLeftLanes<kLanes>(Full256<T>(), v);
888}
889
890// ------------------------------ ShiftRightBytes
891template <int kBytes, typename T>
893 const Half<decltype(d)> dh;
894 v.v0 = ShiftRightBytes<kBytes>(dh, v.v0);
895 v.v1 = ShiftRightBytes<kBytes>(dh, v.v1);
896 return v;
897}
898
899// ------------------------------ ShiftRightLanes
900template <int kLanes, typename T>
902 const Repartition<uint8_t, decltype(d)> d8;
903 return BitCast(d, ShiftRightBytes<kLanes * sizeof(T)>(d8, BitCast(d8, v)));
904}
905
906// ------------------------------ UpperHalf (ShiftRightBytes)
907
908template <typename T>
910 return v.v1;
911}
912
913// ------------------------------ CombineShiftRightBytes
914
915template <int kBytes, typename T, class V = Vec256<T>>
917 const Half<decltype(d)> dh;
918 hi.v0 = CombineShiftRightBytes<kBytes>(dh, hi.v0, lo.v0);
919 hi.v1 = CombineShiftRightBytes<kBytes>(dh, hi.v1, lo.v1);
920 return hi;
921}
922
923// ------------------------------ Broadcast/splat any lane
924
925template <int kLane, typename T>
927 Vec256<T> ret;
928 ret.v0 = Broadcast<kLane>(v.v0);
929 ret.v1 = Broadcast<kLane>(v.v1);
930 return ret;
931}
932
933// ------------------------------ TableLookupBytes
934
935// Both full
936template <typename T, typename TI>
938 from.v0 = TableLookupBytes(bytes.v0, from.v0);
939 from.v1 = TableLookupBytes(bytes.v1, from.v1);
940 return from;
941}
942
943// Partial index vector
944template <typename T, typename TI, size_t NI>
946 const Vec128<TI, NI> from) {
947 // First expand to full 128, then 256.
948 const auto from_256 = ZeroExtendVector(Full256<TI>(), Vec128<TI>{from.raw});
949 const auto tbl_full = TableLookupBytes(bytes, from_256);
950 // Shrink to 128, then partial.
951 return Vec128<TI, NI>{LowerHalf(Full128<TI>(), tbl_full).raw};
952}
953
954// Partial table vector
955template <typename T, size_t N, typename TI>
957 const Vec256<TI> from) {
958 // First expand to full 128, then 256.
959 const auto bytes_256 = ZeroExtendVector(Full256<T>(), Vec128<T>{bytes.raw});
960 return TableLookupBytes(bytes_256, from);
961}
962
963// Partial both are handled by wasm_128.
964
965template <class V, class VI>
966HWY_API VI TableLookupBytesOr0(const V bytes, VI from) {
967 // wasm out-of-bounds policy already zeros, so TableLookupBytes is fine.
968 return TableLookupBytes(bytes, from);
969}
970
971// ------------------------------ Hard-coded shuffles
972
973template <typename T>
975 v.v0 = Shuffle01(v.v0);
976 v.v1 = Shuffle01(v.v1);
977 return v;
978}
979
980template <typename T>
982 v.v0 = Shuffle2301(v.v0);
983 v.v1 = Shuffle2301(v.v1);
984 return v;
985}
986
987template <typename T>
989 v.v0 = Shuffle1032(v.v0);
990 v.v1 = Shuffle1032(v.v1);
991 return v;
992}
993
994template <typename T>
996 v.v0 = Shuffle0321(v.v0);
997 v.v1 = Shuffle0321(v.v1);
998 return v;
999}
1000
1001template <typename T>
1003 v.v0 = Shuffle2103(v.v0);
1004 v.v1 = Shuffle2103(v.v1);
1005 return v;
1006}
1007
1008template <typename T>
1010 v.v0 = Shuffle0123(v.v0);
1011 v.v1 = Shuffle0123(v.v1);
1012 return v;
1013}
1014
1015// Used by generic_ops-inl.h
1016namespace detail {
1017
1018template <typename T, HWY_IF_LANE_SIZE(T, 4)>
1020 a.v0 = Shuffle2301(a.v0, b.v0);
1021 a.v1 = Shuffle2301(a.v1, b.v1);
1022 return a;
1023}
1024template <typename T, HWY_IF_LANE_SIZE(T, 4)>
1026 a.v0 = Shuffle1230(a.v0, b.v0);
1027 a.v1 = Shuffle1230(a.v1, b.v1);
1028 return a;
1029}
1030template <typename T, HWY_IF_LANE_SIZE(T, 4)>
1032 a.v0 = Shuffle3012(a.v0, b.v0);
1033 a.v1 = Shuffle3012(a.v1, b.v1);
1034 return a;
1035}
1036
1037} // namespace detail
1038
1039// ------------------------------ TableLookupLanes
1040
1041// Returned by SetTableIndices for use by TableLookupLanes.
1042template <typename T>
1044 __v128_u i0;
1045 __v128_u i1;
1046};
1047
1048template <typename T, typename TI>
1050 static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
1051 Indices256<T> ret;
1052 ret.i0 = vec.v0.raw;
1053 ret.i1 = vec.v1.raw;
1054 return ret;
1055}
1056
1057template <typename T, typename TI>
1059 const Rebind<TI, decltype(d)> di;
1060 return IndicesFromVec(d, LoadU(di, idx));
1061}
1062
1063template <typename T>
1065 using TU = MakeUnsigned<T>;
1066 const Full128<T> dh;
1067 const Full128<TU> duh;
1068 constexpr size_t kLanesPerHalf = 16 / sizeof(TU);
1069
1070 const Vec128<TU> vi0{idx.i0};
1071 const Vec128<TU> vi1{idx.i1};
1072 const Vec128<TU> mask = Set(duh, static_cast<TU>(kLanesPerHalf - 1));
1073 const Vec128<TU> vmod0 = vi0 & mask;
1074 const Vec128<TU> vmod1 = vi1 & mask;
1075 // If ANDing did not change the index, it is for the lower half.
1076 const Mask128<T> is_lo0 = RebindMask(dh, vi0 == vmod0);
1077 const Mask128<T> is_lo1 = RebindMask(dh, vi1 == vmod1);
1078 const Indices128<T> mod0 = IndicesFromVec(dh, vmod0);
1079 const Indices128<T> mod1 = IndicesFromVec(dh, vmod1);
1080
1081 Vec256<T> ret;
1082 ret.v0 = IfThenElse(is_lo0, TableLookupLanes(v.v0, mod0),
1083 TableLookupLanes(v.v1, mod0));
1084 ret.v1 = IfThenElse(is_lo1, TableLookupLanes(v.v0, mod1),
1085 TableLookupLanes(v.v1, mod1));
1086 return ret;
1087}
1088
1089template <typename T>
1091 // The out of bounds behavior will already zero lanes.
1092 return TableLookupLanesOr0(v, idx);
1093}
1094
1095// ------------------------------ Reverse
1096template <typename T>
1098 const Half<decltype(d)> dh;
1099 Vec256<T> ret;
1100 ret.v1 = Reverse(dh, v.v0); // note reversed v1 member order
1101 ret.v0 = Reverse(dh, v.v1);
1102 return ret;
1103}
1104
1105// ------------------------------ Reverse2
1106template <typename T>
1108 const Half<decltype(d)> dh;
1109 v.v0 = Reverse2(dh, v.v0);
1110 v.v1 = Reverse2(dh, v.v1);
1111 return v;
1112}
1113
1114// ------------------------------ Reverse4
1115
1116// Each block has only 2 lanes, so swap blocks and their lanes.
1117template <typename T, HWY_IF_LANE_SIZE(T, 8)>
1119 const Half<decltype(d)> dh;
1120 Vec256<T> ret;
1121 ret.v0 = Reverse2(dh, v.v1); // swapped
1122 ret.v1 = Reverse2(dh, v.v0);
1123 return ret;
1124}
1125
1126template <typename T, HWY_IF_NOT_LANE_SIZE(T, 8)>
1127HWY_API Vec256<T> Reverse4(Full256<T> d, Vec256<T> v) {
1128 const Half<decltype(d)> dh;
1129 v.v0 = Reverse4(dh, v.v0);
1130 v.v1 = Reverse4(dh, v.v1);
1131 return v;
1132}
1133
1134// ------------------------------ Reverse8
1135
1136template <typename T, HWY_IF_LANE_SIZE(T, 8)>
1138 HWY_ASSERT(0); // don't have 8 u64 lanes
1139}
1140
1141// Each block has only 4 lanes, so swap blocks and their lanes.
1142template <typename T, HWY_IF_LANE_SIZE(T, 4)>
1143HWY_API Vec256<T> Reverse8(Full256<T> d, const Vec256<T> v) {
1144 const Half<decltype(d)> dh;
1145 Vec256<T> ret;
1146 ret.v0 = Reverse4(dh, v.v1); // swapped
1147 ret.v1 = Reverse4(dh, v.v0);
1148 return ret;
1149}
1150
1151template <typename T, HWY_IF_LANE_SIZE_ONE_OF(T, 0x6)> // 1 or 2 bytes
1152HWY_API Vec256<T> Reverse8(Full256<T> d, Vec256<T> v) {
1153 const Half<decltype(d)> dh;
1154 v.v0 = Reverse8(dh, v.v0);
1155 v.v1 = Reverse8(dh, v.v1);
1156 return v;
1157}
1158
1159// ------------------------------ InterleaveLower
1160
1161template <typename T>
1163 a.v0 = InterleaveLower(a.v0, b.v0);
1164 a.v1 = InterleaveLower(a.v1, b.v1);
1165 return a;
1166}
1167
1168// wasm_128 already defines a template with D, V, V args.
1169
1170// ------------------------------ InterleaveUpper (UpperHalf)
1171
1172template <typename T, class V = Vec256<T>>
1174 const Half<decltype(d)> dh;
1175 a.v0 = InterleaveUpper(dh, a.v0, b.v0);
1176 a.v1 = InterleaveUpper(dh, a.v1, b.v1);
1177 return a;
1178}
1179
1180// ------------------------------ ZipLower/ZipUpper (InterleaveLower)
1181
1182// Same as Interleave*, except that the return lanes are double-width integers;
1183// this is necessary because the single-lane scalar cannot return two values.
1184template <typename T, class DW = RepartitionToWide<Full256<T>>>
1186 return BitCast(DW(), InterleaveLower(a, b));
1187}
1188template <typename T, class D = Full256<T>, class DW = RepartitionToWide<D>>
1190 return BitCast(dw, InterleaveLower(D(), a, b));
1191}
1192
1193template <typename T, class D = Full256<T>, class DW = RepartitionToWide<D>>
1195 return BitCast(dw, InterleaveUpper(D(), a, b));
1196}
1197
1198// ================================================== COMBINE
1199
1200// ------------------------------ Combine (InterleaveLower)
1201template <typename T>
1203 Vec256<T> ret;
1204 ret.v1 = hi;
1205 ret.v0 = lo;
1206 return ret;
1207}
1208
1209// ------------------------------ ZeroExtendVector (Combine)
1210template <typename T>
1212 const Half<decltype(d)> dh;
1213 return Combine(d, Zero(dh), lo);
1214}
1215
1216// ------------------------------ ConcatLowerLower
1217template <typename T>
1219 const Vec256<T> lo) {
1220 Vec256<T> ret;
1221 ret.v1 = hi.v0;
1222 ret.v0 = lo.v0;
1223 return ret;
1224}
1225
1226// ------------------------------ ConcatUpperUpper
1227template <typename T>
1229 const Vec256<T> lo) {
1230 Vec256<T> ret;
1231 ret.v1 = hi.v1;
1232 ret.v0 = lo.v1;
1233 return ret;
1234}
1235
1236// ------------------------------ ConcatLowerUpper
1237template <typename T>
1239 const Vec256<T> lo) {
1240 Vec256<T> ret;
1241 ret.v1 = hi.v0;
1242 ret.v0 = lo.v1;
1243 return ret;
1244}
1245
1246// ------------------------------ ConcatUpperLower
1247template <typename T>
1249 const Vec256<T> lo) {
1250 Vec256<T> ret;
1251 ret.v1 = hi.v1;
1252 ret.v0 = lo.v0;
1253 return ret;
1254}
1255
1256// ------------------------------ ConcatOdd
1257template <typename T>
1259 const Vec256<T> lo) {
1260 const Half<decltype(d)> dh;
1261 Vec256<T> ret;
1262 ret.v0 = ConcatOdd(dh, lo.v1, lo.v0);
1263 ret.v1 = ConcatOdd(dh, hi.v1, hi.v0);
1264 return ret;
1265}
1266
1267// ------------------------------ ConcatEven
1268template <typename T>
1270 const Vec256<T> lo) {
1271 const Half<decltype(d)> dh;
1272 Vec256<T> ret;
1273 ret.v0 = ConcatEven(dh, lo.v1, lo.v0);
1274 ret.v1 = ConcatEven(dh, hi.v1, hi.v0);
1275 return ret;
1276}
1277
1278// ------------------------------ DupEven
1279template <typename T>
1281 v.v0 = DupEven(v.v0);
1282 v.v1 = DupEven(v.v1);
1283 return v;
1284}
1285
1286// ------------------------------ DupOdd
1287template <typename T>
1289 v.v0 = DupOdd(v.v0);
1290 v.v1 = DupOdd(v.v1);
1291 return v;
1292}
1293
1294// ------------------------------ OddEven
1295template <typename T>
1297 a.v0 = OddEven(a.v0, b.v0);
1298 a.v1 = OddEven(a.v1, b.v1);
1299 return a;
1300}
1301
1302// ------------------------------ OddEvenBlocks
1303template <typename T>
1305 odd.v0 = even.v0;
1306 return odd;
1307}
1308
1309// ------------------------------ SwapAdjacentBlocks
1310template <typename T>
1312 Vec256<T> ret;
1313 ret.v0 = v.v1; // swapped order
1314 ret.v1 = v.v0;
1315 return ret;
1316}
1317
1318// ------------------------------ ReverseBlocks
1319template <typename T>
1321 return SwapAdjacentBlocks(v); // 2 blocks, so Swap = Reverse
1322}
1323
1324// ================================================== CONVERT
1325
1326// ------------------------------ Promotions (part w/ narrow lanes -> full)
1327
// Internal helpers: each overload below widens the uppermost lanes of a
// 128-bit input (as many as fit into 128 bits after widening) and returns
// them as a full 128-bit vector. The 256-bit PromoteTo overloads call
// detail::PromoteUpperTo to produce their upper (v1) half.
1328namespace detail {
1329
1330// Unsigned: zero-extend.
// u8 -> u16.
1332 const Vec128<uint8_t> v) {
1333 return Vec128<uint16_t>{wasm_u16x8_extend_high_u8x16(v.raw)};
1334}
// u8 -> u32: two chained extend_high steps (8 -> 16 -> 32 bits).
1336 const Vec128<uint8_t> v) {
1337 return Vec128<uint32_t>{
1338 wasm_u32x4_extend_high_u16x8(wasm_u16x8_extend_high_u8x16(v.raw))};
1339}
// u8 -> i16: still zero-extended, because the source lanes are unsigned.
1341 const Vec128<uint8_t> v) {
1342 return Vec128<int16_t>{wasm_u16x8_extend_high_u8x16(v.raw)};
1343}
// u8 -> i32: zero-extended in two steps.
1345 const Vec128<uint8_t> v) {
1346 return Vec128<int32_t>{
1347 wasm_u32x4_extend_high_u16x8(wasm_u16x8_extend_high_u8x16(v.raw))};
1348}
// u16 -> u32.
1350 const Vec128<uint16_t> v) {
1351 return Vec128<uint32_t>{wasm_u32x4_extend_high_u16x8(v.raw)};
1352}
// u32 -> u64.
1354 const Vec128<uint32_t> v) {
1355 return Vec128<uint64_t>{wasm_u64x2_extend_high_u32x4(v.raw)};
1356}
// u16 -> i32: zero-extended.
1358 const Vec128<uint16_t> v) {
1359 return Vec128<int32_t>{wasm_u32x4_extend_high_u16x8(v.raw)};
1360}
1361
1362// Signed: replicate sign bit.
// i8 -> i16.
1364 const Vec128<int8_t> v) {
1365 return Vec128<int16_t>{wasm_i16x8_extend_high_i8x16(v.raw)};
1366}
// i8 -> i32: two chained sign-extending steps.
1368 const Vec128<int8_t> v) {
1369 return Vec128<int32_t>{
1370 wasm_i32x4_extend_high_i16x8(wasm_i16x8_extend_high_i8x16(v.raw))};
1371}
// i16 -> i32.
1373 const Vec128<int16_t> v) {
1374 return Vec128<int32_t>{wasm_i32x4_extend_high_i16x8(v.raw)};
1375}
// i32 -> i64.
1377 const Vec128<int32_t> v) {
1378 return Vec128<int64_t>{wasm_i64x2_extend_high_i32x4(v.raw)};
1379}
1380
// i32 -> f64: extracts the upper half first and reuses the 64-bit-input
// PromoteTo, since no "high" conversion intrinsic exists (see below).
1382 const Vec128<int32_t> v) {
1383 // There is no wasm_f64x2_convert_high_i32x4.
1384 const Full64<int32_t> di32h;
1385 return PromoteTo(dd, UpperHalf(di32h, v));
1386}
1387
// float16 -> f32, decoded with integer operations on the raw 16-bit pattern:
// bit 15 is the sign, bits 10..14 the biased exponent, bits 0..9 the mantissa.
1389 const Vec128<float16_t> v) {
1390 const RebindToSigned<decltype(df32)> di32;
1391 const RebindToUnsigned<decltype(df32)> du32;
1392 // Expand to u32 so we can shift.
1393 const auto bits16 = PromoteUpperTo(du32, Vec128<uint16_t>{v.raw});
1394 const auto sign = ShiftRight<15>(bits16);
1395 const auto biased_exp = ShiftRight<10>(bits16) & Set(du32, 0x1F);
1396 const auto mantissa = bits16 & Set(du32, 0x3FF);
 // Zero exponent: the value is mantissa * 2^-24 (1/16384/1024 == 2^-24),
 // computed in float and then reinterpreted as bits.
1397 const auto subnormal =
1398 BitCast(du32, ConvertTo(df32, BitCast(di32, mantissa)) *
1399 Set(df32, 1.0f / 16384 / 1024));
1400
 // Non-zero exponent: re-bias from 15 to 127 and move the 10 mantissa bits
 // to the top of the 23-bit f32 mantissa field.
 // NOTE(review): exponent 0x1F is not special-cased here; it is re-biased
 // like any other non-zero exponent - confirm this is intended for Inf/NaN.
1401 const auto biased_exp32 = biased_exp + Set(du32, 127 - 15);
1402 const auto mantissa32 = ShiftLeft<23 - 10>(mantissa);
1403 const auto normal = ShiftLeft<23>(biased_exp32) | mantissa32;
1404 const auto bits32 = IfThenElse(biased_exp == Zero(du32), subnormal, normal);
1405 return BitCast(df32, ShiftLeft<31>(sign) | bits32);
1406}
1407
// bfloat16 -> f32: widens the 16-bit patterns to 32 bits and shifts them into
// the upper 16 bits of each lane; the lower 16 bits become zero.
1409 const Vec128<bfloat16_t> v) {
1410 const Full128<uint16_t> du16;
1411 const RebindToSigned<decltype(df32)> di32;
1412 return BitCast(df32, ShiftLeft<16>(PromoteUpperTo(di32, BitCast(du16, v))));
1413}
1414
1415} // namespace detail
1416
// Promotes a 128-bit vector of narrow lanes (TN) to a 256-bit vector of lanes
// of type T. The lower half of `v` goes through the 128-bit PromoteTo and
// fills v0; the upper half goes through detail::PromoteUpperTo and fills v1.
1417template <typename T, typename TN>
1419 const Half<decltype(d)> dh;
1420 Vec256<T> ret;
1421 ret.v0 = PromoteTo(dh, LowerHalf(v));
1422 ret.v1 = detail::PromoteUpperTo(dh, v);
1423 return ret;
1424}
1425
1426// This is the only 4x promotion from 8 to 32-bit.
// 4x promotion done in two 2x stages: `v` is first promoted to lanes of type
// MakeWide<TN> (v16), then v16 is split and promoted again exactly like the
// 2x case (lower half -> v0, upper half -> v1).
1427template <typename TW, typename TN>
1429 const Half<decltype(d)> dh;
1430 const Rebind<MakeWide<TN>, decltype(d)> d2; // 16-bit lanes
1431 const auto v16 = PromoteTo(d2, v);
1432 Vec256<TW> ret;
1433 ret.v0 = PromoteTo(dh, LowerHalf(v16));
1434 ret.v1 = detail::PromoteUpperTo(dh, v16);
1435 return ret;
1436}
1437
1438// ------------------------------ DemoteTo
1439
// i32 -> u16: both 128-bit halves are packed into one u16x8 vector with a
// single WASM narrow instruction (v0 is the first operand, v1 the second).
1441 const Vec256<int32_t> v) {
1442 return Vec128<uint16_t>{wasm_u16x8_narrow_i32x4(v.v0.raw, v.v1.raw)};
1443}
1444
// i32 -> i16: both 128-bit halves are packed into one i16x8 vector with a
// single WASM narrow instruction (v0 is the first operand, v1 the second).
1446 const Vec256<int32_t> v) {
1447 return Vec128<int16_t>{wasm_i16x8_narrow_i32x4(v.v0.raw, v.v1.raw)};
1448}
1449
// i32 -> u8 in two narrowing steps: i32 -> i16 (both halves), then i16 -> u8.
// The intermediate is passed as both operands of the second narrow, so the
// eight results appear in the lower 8 bytes that the Vec64 return type uses.
1451 const Vec256<int32_t> v) {
1452 const auto intermediate = wasm_i16x8_narrow_i32x4(v.v0.raw, v.v1.raw);
1453 return Vec64<uint8_t>{wasm_u8x16_narrow_i16x8(intermediate, intermediate)};
1454}
1455
// i16 -> u8: both 128-bit halves are packed into one u8x16 vector with a
// single WASM narrow instruction.
1457 const Vec256<int16_t> v) {
1458 return Vec128<uint8_t>{wasm_u8x16_narrow_i16x8(v.v0.raw, v.v1.raw)};
1459}
1460
// i32 -> i8 in two narrowing steps: i32 -> i16 (both halves), then i16 -> i8.
// The intermediate is passed as both operands of the second narrow, so the
// eight results appear in the lower 8 bytes that the Vec64 return type uses.
1462 const Vec256<int32_t> v) {
1463 const auto intermediate = wasm_i16x8_narrow_i32x4(v.v0.raw, v.v1.raw);
1464 return Vec64<int8_t>{wasm_i8x16_narrow_i16x8(intermediate, intermediate)};
1465}
1466
// i16 -> i8: both 128-bit halves are packed into one i8x16 vector with a
// single WASM narrow instruction.
1468 const Vec256<int16_t> v) {
1469 return Vec128<int8_t>{wasm_i8x16_narrow_i16x8(v.v0.raw, v.v1.raw)};
1470}
1471
// f64 -> i32: each half (two doubles) is converted separately with the
// trunc_sat "_zero" intrinsic, yielding two i32 in a 64-bit vector; the two
// results are then joined with v0's lanes in the lower position.
1473 const Vec64<int32_t> lo{wasm_i32x4_trunc_sat_f64x2_zero(v.v0.raw)};
1474 const Vec64<int32_t> hi{wasm_i32x4_trunc_sat_f64x2_zero(v.v1.raw)};
1475 return Combine(di, hi, lo);
1476}
1477
// f32 -> float16: demotes each 128-bit half with the 128-bit DemoteTo and
// joins the two 64-bit results (v0's lanes in the lower position).
1479 const Vec256<float> v) {
1480 const Half<decltype(d16)> d16h;
1481 const Vec64<float16_t> lo = DemoteTo(d16h, v.v0);
1482 const Vec64<float16_t> hi = DemoteTo(d16h, v.v1);
1483 return Combine(d16, hi, lo);
1484}
1485
// f32 -> bfloat16: demotes each 128-bit half with the 128-bit DemoteTo and
// joins the two 64-bit results (v0's lanes in the lower position).
1487 const Vec256<float> v) {
1488 const Half<decltype(dbf16)> dbf16h;
1489 const Vec64<bfloat16_t> lo = DemoteTo(dbf16h, v.v0);
1490 const Vec64<bfloat16_t> hi = DemoteTo(dbf16h, v.v1);
1491 return Combine(dbf16, hi, lo);
1492}
1493
1494// For already range-limited input [0, 255].
// Reinterprets `v` as i32 and reuses the i32 -> u8 DemoteTo, because there is
// no DemoteTo taking unsigned input. The reinterpretation relies on the
// caller-guaranteed input range, which keeps every lane non-negative as i32.
1496 const Full64<uint8_t> du8;
1497 const Full256<int32_t> di32; // no unsigned DemoteTo
1498 return DemoteTo(du8, BitCast(di32, v));
1499}
1500
1501// ------------------------------ Truncations
1502
// u64 -> u8: byte shuffle across both halves (indices 0..15 address v0,
// 16..31 address v1). Picks byte 0 of each of the four 64-bit lanes; the
// 4-index pattern is repeated to fill all 16 shuffle slots, of which only
// the first four are covered by the Vec32 return type.
1504 const Vec256<uint64_t> v) {
1505 return Vec32<uint8_t>{wasm_i8x16_shuffle(v.v0.raw, v.v1.raw, 0, 8, 16, 24, 0,
1506 8, 16, 24, 0, 8, 16, 24, 0, 8, 16,
1507 24)};
1508}
1509
// u64 -> u16: byte shuffle across both halves (indices 0..15 address v0,
// 16..31 address v1). Picks bytes 0..1 of each 64-bit lane; the 8-index
// pattern is repeated once to fill all 16 shuffle slots.
1511 const Vec256<uint64_t> v) {
1512 return Vec64<uint16_t>{wasm_i8x16_shuffle(v.v0.raw, v.v1.raw, 0, 1, 8, 9, 16,
1513 17, 24, 25, 0, 1, 8, 9, 16, 17, 24,
1514 25)};
1515}
1516
// u64 -> u32: byte shuffle across both halves (indices 0..15 address v0,
// 16..31 address v1). Picks bytes 0..3 of each of the four 64-bit lanes,
// which exactly fills the 128-bit result.
1518 const Vec256<uint64_t> v) {
1519 return Vec128<uint32_t>{wasm_i8x16_shuffle(v.v0.raw, v.v1.raw, 0, 1, 2, 3, 8,
1520 9, 10, 11, 16, 17, 18, 19, 24, 25,
1521 26, 27)};
1522}
1523
// u32 -> u8: byte shuffle across both halves (indices 0..15 address v0,
// 16..31 address v1). Picks byte 0 of each of the eight 32-bit lanes; the
// 8-index pattern is repeated once to fill all 16 shuffle slots.
1525 const Vec256<uint32_t> v) {
1526 return Vec64<uint8_t>{wasm_i8x16_shuffle(v.v0.raw, v.v1.raw, 0, 4, 8, 12, 16,
1527 20, 24, 28, 0, 4, 8, 12, 16, 20, 24,
1528 28)};
1529}
1530
// u32 -> u16: byte shuffle across both halves (indices 0..15 address v0,
// 16..31 address v1). Picks bytes 0..1 of each of the eight 32-bit lanes,
// which exactly fills the 128-bit result.
1532 const Vec256<uint32_t> v) {
1533 return Vec128<uint16_t>{wasm_i8x16_shuffle(v.v0.raw, v.v1.raw, 0, 1, 4, 5, 8,
1534 9, 12, 13, 16, 17, 20, 21, 24, 25,
1535 28, 29)};
1536}
1537
// u16 -> u8: byte shuffle across both halves (indices 0..15 address v0,
// 16..31 address v1). Picks every even-indexed byte, i.e. byte 0 of each of
// the sixteen 16-bit lanes.
1539 const Vec256<uint16_t> v) {
1540 return Vec128<uint8_t>{wasm_i8x16_shuffle(v.v0.raw, v.v1.raw, 0, 2, 4, 6, 8,
1541 10, 12, 14, 16, 18, 20, 22, 24, 26,
1542 28, 30)};
1543}
1544
1545// ------------------------------ ReorderDemote2To
// Views `a` and `b` as 16-bit lanes and keeps only the odd-indexed ones, i.e.
// the upper 16 bits of every 32-bit lane. ConcatOdd takes (hi, lo), so the
// lanes taken from `a` land in the lower half and those from `b` in the upper.
1548 const RebindToUnsigned<decltype(dbf16)> du16;
1549 return BitCast(dbf16, ConcatOdd(du16, BitCast(du16, b), BitCast(du16, a)));
1550}
1551
// Demotes `a` into the lower half (v0) and `b` into the upper half (v1) of
// the result, each via the DemoteTo overload selected by d16h.
1554 const Half<decltype(d16)> d16h;
1555 Vec256<int16_t> demoted;
1556 demoted.v0 = DemoteTo(d16h, a);
1557 demoted.v1 = DemoteTo(d16h, b);
1558 return demoted;
1559}
1560
1561// ------------------------------ Convert i32 <=> f32 (Round)
1562
// Converts lanes from TFrom to TTo by applying the 128-bit ConvertTo to each
// half (v0, v1) independently.
1563template <typename TTo, typename TFrom>
1565 const Half<decltype(d)> dh;
1566 Vec256<TTo> ret;
1567 ret.v0 = ConvertTo(dh, v.v0);
1568 ret.v1 = ConvertTo(dh, v.v1);
1569 return ret;
1570}
1571
// Rounds `v` with Round() first, then converts the result to i32 lanes.
1573 return ConvertTo(Full256<int32_t>(), Round(v));
1574}
1575
1576// ================================================== MISC
1577
1578// ------------------------------ LoadMaskBits (TestBit)
1579
1580// `bits` points to at least 8 readable bytes, not all of which need be valid.
// Loads a 256-bit mask for 4- or 8-byte lanes. All mask bits of both halves
// are packed into bits[0]: the lower half's bits come first, followed by the
// upper half's bits starting at bit position kBitsPerHalf.
1581template <typename T, HWY_IF_LANE_SIZE_ONE_OF(T, 0x110)> // 4 or 8 bytes
1583 const uint8_t* HWY_RESTRICT bits) {
1584 const Half<decltype(d)> dh;
1585 Mask256<T> ret;
1586 ret.m0 = LoadMaskBits(dh, bits);
1587 // If size=4, one 128-bit vector has 4 mask bits; otherwise 2 for size=8.
1588 // Both halves fit in one byte's worth of mask bits.
1589 constexpr size_t kBitsPerHalf = 16 / sizeof(T);
 // Shift the upper half's bits down to bit 0 and place them in a local
 // 8-byte buffer (remaining bytes are zero-initialized) for the half-width
 // LoadMaskBits.
1590 const uint8_t bits_upper[8] = {static_cast<uint8_t>(bits[0] >> kBitsPerHalf)};
1591 ret.m1 = LoadMaskBits(dh, bits_upper);
1592 return ret;
1593}
1594
1595template <typename T, HWY_IF_LANE_SIZE_ONE_OF(T, 0x6)> // 1 or 2 bytes
1596HWY_API Mask256<T> LoadMaskBits(Full256<T> d,
1597 const uint8_t* HWY_RESTRICT bits) {
1598 const Half<decltype(d)> dh;
1599 Mask256<T> ret;
1600 ret.m0 = LoadMaskBits(dh, bits);
1601 constexpr size_t kLanesPerHalf = 16 / sizeof(T);
1602 constexpr size_t kBytesPerHalf = kLanesPerHalf / 8;
1603 static_assert(kBytesPerHalf != 0, "Lane size <= 16 bits => at least 8 lanes");
1604 ret.m1 = LoadMaskBits(dh, bits + kBytesPerHalf);
1605 return ret;
1606}
1607
1608// ------------------------------ Mask
1609
1610// `bits` points to at least 8 writable bytes.
// Stores the mask bits of a 256-bit mask with 4- or 8-byte lanes. Both
// halves together need at most 8 bits, so everything is packed into bits[0].
// Returns the byte count computed from the total number of mask bits.
1611template <typename T, HWY_IF_LANE_SIZE_ONE_OF(T, 0x110)> // 4 or 8 bytes
1613 uint8_t* bits) {
1614 const Half<decltype(d)> dh;
 // `bits` doubles as scratch space: store the lower half, remember its byte,
 // then overwrite with the upper half and merge the two below.
1615 StoreMaskBits(dh, mask.m0, bits);
1616 const uint8_t lo = bits[0];
1617 StoreMaskBits(dh, mask.m1, bits);
1618 // If size=4, one 128-bit vector has 4 mask bits; otherwise 2 for size=8.
1619 // Both halves fit in one byte's worth of mask bits.
1620 constexpr size_t kBitsPerHalf = 16 / sizeof(T);
1621 bits[0] = static_cast<uint8_t>(lo | (bits[0] << kBitsPerHalf));
1622 return (kBitsPerHalf * 2 + 7) / 8;
1623}
1624
1625template <typename T, HWY_IF_LANE_SIZE_ONE_OF(T, 0x6)> // 1 or 2 bytes
1626HWY_API size_t StoreMaskBits(const Full256<T> d, const Mask256<T> mask,
1627 uint8_t* bits) {
1628 const Half<decltype(d)> dh;
1629 constexpr size_t kLanesPerHalf = 16 / sizeof(T);
1630 constexpr size_t kBytesPerHalf = kLanesPerHalf / 8;
1631 static_assert(kBytesPerHalf != 0, "Lane size <= 16 bits => at least 8 lanes");
1632 StoreMaskBits(dh, mask.m0, bits);
1633 StoreMaskBits(dh, mask.m1, bits + kBytesPerHalf);
1634 return kBytesPerHalf * 2;
1635}
1636
1637template <typename T>
1638HWY_API size_t CountTrue(const Full256<T> d, const Mask256<T> m) {
1639 const Half<decltype(d)> dh;
1640 return CountTrue(dh, m.m0) + CountTrue(dh, m.m1);
1641}
1642
// True only if neither 128-bit half of the mask has a true lane. The upper
// half is not examined when the lower half already contains one.
1643template <typename T>
1645 const Half<decltype(d)> dh;
1646 return AllFalse(dh, m.m0) && AllFalse(dh, m.m1);
1647}
1648
1649template <typename T>
1650HWY_API bool AllTrue(const Full256<T> d, const Mask256<T> m) {
1651 const Half<decltype(d)> dh;
1652 return AllTrue(dh, m.m0) && AllTrue(dh, m.m1);
1653}
1654
// Index of the first true lane. The lower half is searched with
// FindFirstTrue, which reports a negative value when it finds nothing; in
// that case the answer is taken from the upper half via FindKnownFirstTrue,
// offset by the number of lanes in one half.
1655template <typename T>
1657 const Half<decltype(d)> dh;
1658 const intptr_t lo = FindFirstTrue(dh, mask.m0); // not known
1659 constexpr size_t kLanesPerHalf = 16 / sizeof(T);
1660 return lo >= 0 ? static_cast<size_t>(lo)
1661 : kLanesPerHalf + FindKnownFirstTrue(dh, mask.m1);
1662}
1663
1664template <typename T>
1665HWY_API intptr_t FindFirstTrue(const Full256<T> d, const Mask256<T> mask) {
1666 const Half<decltype(d)> dh;
1667 const intptr_t lo = FindFirstTrue(dh, mask.m0);
1668 const intptr_t hi = FindFirstTrue(dh, mask.m1);
1669 if (lo < 0 && hi < 0) return lo;
1670 constexpr int kLanesPerHalf = 16 / sizeof(T);
1671 return lo >= 0 ? lo : hi + kLanesPerHalf;
1672}
1673
1674// ------------------------------ CompressStore
1675template <typename T>
1676HWY_API size_t CompressStore(const Vec256<T> v, const Mask256<T> mask,
1677 Full256<T> d, T* HWY_RESTRICT unaligned) {
1678 const Half<decltype(d)> dh;
1679 const size_t count = CompressStore(v.v0, mask.m0, dh, unaligned);
1680 const size_t count2 = CompressStore(v.v1, mask.m1, dh, unaligned + count);
1681 return count + count2;
1682}
1683
1684// ------------------------------ CompressBlendedStore
// Applies the 128-bit CompressBlendedStore to each half in turn; the upper
// half's output starts `count` lanes after `unaligned`, where `count` is the
// value returned for the lower half. Returns the sum of both counts.
1685template <typename T>
1687 Full256<T> d, T* HWY_RESTRICT unaligned) {
1688 const Half<decltype(d)> dh;
1689 const size_t count = CompressBlendedStore(v.v0, m.m0, dh, unaligned);
1690 const size_t count2 = CompressBlendedStore(v.v1, m.m1, dh, unaligned + count);
1691 return count + count2;
1692}
1693
1694// ------------------------------ CompressBitsStore
1695
// Converts the packed mask bits in `bits` to a mask with LoadMaskBits, then
// forwards to CompressStore and returns its result.
1696template <typename T>
1698 const uint8_t* HWY_RESTRICT bits, Full256<T> d,
1699 T* HWY_RESTRICT unaligned) {
1700 const Mask256<T> m = LoadMaskBits(d, bits);
1701 return CompressStore(v, m, d, unaligned);
1702}
1703
1704// ------------------------------ Compress
1705
// Compress implemented via memory: CompressStore writes the selected lanes
// into a 32-byte-aligned, zero-initialized stack buffer, which is then
// reloaded as a vector. The count returned by CompressStore is ignored.
1706template <typename T>
1708 const Full256<T> d;
1709 alignas(32) T lanes[32 / sizeof(T)] = {};
1710 (void)CompressStore(v, mask, d, lanes);
1711 return Load(d, lanes);
1712}
1713
1714// ------------------------------ CompressNot
// Same as Compress, but with the mask inverted first.
1715template <typename T>
1717 return Compress(v, Not(mask));
1718}
1719
1720// ------------------------------ CompressBlocksNot
// Block-granularity CompressNot for u64 lanes: returns either `v` unchanged
// or `v` with its two 128-bit blocks swapped, depending on the masks of the
// two halves (see the comment inside).
1722 Mask256<uint64_t> mask) {
1723 const Full128<uint64_t> dh;
1724 // Because the non-selected (mask=1) blocks are undefined, we can return the
1725 // input unless mask = 01, in which case we must bring down the upper block.
1726 return AllTrue(dh, AndNot(mask.m1, mask.m0)) ? SwapAdjacentBlocks(v) : v;
1727}
1728
1729// ------------------------------ CompressBits
1730
// Converts the packed mask bits in `bits` to a mask with LoadMaskBits, then
// forwards to Compress.
1731template <typename T>
1733 const Mask256<T> m = LoadMaskBits(Full256<T>(), bits);
1734 return Compress(v, m);
1735}
1736
1737// ------------------------------ LoadInterleaved3/4
1738
1739// Implemented in generic_ops, we just overload LoadTransposedBlocks3/4.
1740
// Block-level loaders for the generic LoadInterleaved3/4: each loads 3 or 4
// consecutive 256-bit vectors from `unaligned` and regroups their 128-bit
// blocks as shown in the diagrams. In the diagrams each row is one vector,
// written "upper-block lower-block", and the numbers are block indices in
// memory order.
1741namespace detail {
1742
1743// Input:
1744// 1 0 (<- first block of unaligned)
1745// 3 2
1746// 5 4
1747// Output:
1748// 3 0
1749// 4 1
1750// 5 2
1751template <typename T>
1753 const T* HWY_RESTRICT unaligned,
1754 Vec256<T>& A, Vec256<T>& B, Vec256<T>& C) {
 // N = lanes per 256-bit vector.
1755 constexpr size_t N = 32 / sizeof(T);
1756 const Vec256<T> v10 = LoadU(d, unaligned + 0 * N); // 1 0
1757 const Vec256<T> v32 = LoadU(d, unaligned + 1 * N);
1758 const Vec256<T> v54 = LoadU(d, unaligned + 2 * N);
1759
 // A = {3, 0}, B = {4, 1}, C = {5, 2}.
1760 A = ConcatUpperLower(d, v32, v10);
1761 B = ConcatLowerUpper(d, v54, v10);
1762 C = ConcatUpperLower(d, v54, v32);
1763}
1764
1765// Input (128-bit blocks):
1766// 1 0 (first block of unaligned)
1767// 3 2
1768// 5 4
1769// 7 6
1770// Output:
1771// 4 0 (LSB of A)
1772// 5 1
1773// 6 2
1774// 7 3
1775template <typename T>
1777 const T* HWY_RESTRICT unaligned,
1778 Vec256<T>& A, Vec256<T>& B, Vec256<T>& C,
1779 Vec256<T>& D) {
 // N = lanes per 256-bit vector.
1780 constexpr size_t N = 32 / sizeof(T);
1781 const Vec256<T> v10 = LoadU(d, unaligned + 0 * N);
1782 const Vec256<T> v32 = LoadU(d, unaligned + 1 * N);
1783 const Vec256<T> v54 = LoadU(d, unaligned + 2 * N);
1784 const Vec256<T> v76 = LoadU(d, unaligned + 3 * N);
1785
 // A = {4, 0}, B = {5, 1}, C = {6, 2}, D = {7, 3}.
1786 A = ConcatLowerLower(d, v54, v10);
1787 B = ConcatUpperUpper(d, v54, v10);
1788 C = ConcatLowerLower(d, v76, v32);
1789 D = ConcatUpperUpper(d, v76, v32);
1790}
1791
1792} // namespace detail
1793
1794// ------------------------------ StoreInterleaved2/3/4 (ConcatUpperLower)
1795
1796// Implemented in generic_ops, we just overload StoreTransposedBlocks2/3/4.
1797
// Block-level writers for the generic StoreInterleaved2/3/4: each regroups
// the 128-bit blocks of its input vectors as shown in the diagrams and stores
// the resulting 256-bit vectors consecutively at `unaligned`. In the diagrams
// each row is one vector, written "upper-block lower-block", and the numbers
// are block indices in output memory order.
1798namespace detail {
1799
1800// Input (128-bit blocks):
1801// 2 0 (LSB of i)
1802// 3 1
1803// Output:
1804// 1 0
1805// 3 2
1806template <typename T>
1808 const Full256<T> d,
1809 T* HWY_RESTRICT unaligned) {
 // N = lanes per 256-bit vector.
1810 constexpr size_t N = 32 / sizeof(T);
 // out0 = {1, 0} (lower blocks of j, i); out1 = {3, 2} (upper blocks).
1811 const auto out0 = ConcatLowerLower(d, j, i);
1812 const auto out1 = ConcatUpperUpper(d, j, i);
1813 StoreU(out0, d, unaligned + 0 * N);
1814 StoreU(out1, d, unaligned + 1 * N);
1815}
1816
1817// Input (128-bit blocks):
1818// 3 0 (LSB of i)
1819// 4 1
1820// 5 2
1821// Output:
1822// 1 0
1823// 3 2
1824// 5 4
1825template <typename T>
1827 const Vec256<T> k, Full256<T> d,
1828 T* HWY_RESTRICT unaligned) {
 // N = lanes per 256-bit vector.
1829 constexpr size_t N = 32 / sizeof(T);
 // out0 = {1, 0}; out1 = {3, 2} (upper of i, lower of k); out2 = {5, 4}.
1830 const auto out0 = ConcatLowerLower(d, j, i);
1831 const auto out1 = ConcatUpperLower(d, i, k);
1832 const auto out2 = ConcatUpperUpper(d, k, j);
1833 StoreU(out0, d, unaligned + 0 * N);
1834 StoreU(out1, d, unaligned + 1 * N);
1835 StoreU(out2, d, unaligned + 2 * N);
1836}
1837
1838// Input (128-bit blocks):
1839// 4 0 (LSB of i)
1840// 5 1
1841// 6 2
1842// 7 3
1843// Output:
1844// 1 0
1845// 3 2
1846// 5 4
1847// 7 6
1848template <typename T>
1850 const Vec256<T> k, const Vec256<T> l,
1851 Full256<T> d, T* HWY_RESTRICT unaligned) {
 // N = lanes per 256-bit vector.
1852 constexpr size_t N = 32 / sizeof(T);
1853 // Write lower halves, then upper.
1854 const auto out0 = ConcatLowerLower(d, j, i);
1855 const auto out1 = ConcatLowerLower(d, l, k);
1856 StoreU(out0, d, unaligned + 0 * N);
1857 StoreU(out1, d, unaligned + 1 * N);
1858 const auto out2 = ConcatUpperUpper(d, j, i);
1859 const auto out3 = ConcatUpperUpper(d, l, k);
1860 StoreU(out2, d, unaligned + 2 * N);
1861 StoreU(out3, d, unaligned + 3 * N);
1862}
1863
1864} // namespace detail
1865
1866// ------------------------------ ReorderWidenMulAccumulate
// Applies the 128-bit ReorderWidenMulAccumulate to each half independently.
// `sum0` is taken by value and returned updated; `sum1` is a reference whose
// halves are passed on to (and may be modified by) the 128-bit calls.
1867template <typename TN, typename TW>
1869 Vec256<TN> b, Vec256<TW> sum0,
1870 Vec256<TW>& sum1) {
1871 const Half<decltype(d)> dh;
1872 sum0.v0 = ReorderWidenMulAccumulate(dh, a.v0, b.v0, sum0.v0, sum1.v0);
1873 sum0.v1 = ReorderWidenMulAccumulate(dh, a.v1, b.v1, sum0.v1, sum1.v1);
1874 return sum0;
1875}
1876
1877// ------------------------------ RearrangeToOddPlusEven
// Applies the 128-bit RearrangeToOddPlusEven to each half of sum0/sum1 and
// returns the results in the corresponding halves of sum0.
1878template <typename TW>
1880 sum0.v0 = RearrangeToOddPlusEven(sum0.v0, sum1.v0);
1881 sum0.v1 = RearrangeToOddPlusEven(sum0.v1, sum1.v1);
1882 return sum0;
1883}
1884
1885// ------------------------------ Reductions
1886
// Adds the two 128-bit halves lane-wise, reduces that single vector with the
// 128-bit SumOfLanes, and returns the reduced vector in both halves.
1887template <typename T>
1889 const Half<decltype(d)> dh;
1890 const Vec128<T> lo = SumOfLanes(dh, Add(v.v0, v.v1));
1891 return Combine(d, lo, lo);
1892}
1893
// Takes the lane-wise minimum of the two 128-bit halves, reduces that single
// vector with the 128-bit MinOfLanes, and returns it in both halves.
1894template <typename T>
1896 const Half<decltype(d)> dh;
1897 const Vec128<T> lo = MinOfLanes(dh, Min(v.v0, v.v1));
1898 return Combine(d, lo, lo);
1899}
1900
// Takes the lane-wise maximum of the two 128-bit halves, reduces that single
// vector with the 128-bit MaxOfLanes, and returns it in both halves.
1901template <typename T>
1903 const Half<decltype(d)> dh;
1904 const Vec128<T> lo = MaxOfLanes(dh, Max(v.v0, v.v1));
1905 return Combine(d, lo, lo);
1906}
1907
1908// ------------------------------ Lt128
1909
// Applies the 128-bit Lt128 to each half of `a` and `b` independently; the
// results form the m0/m1 halves of the returned mask.
1910template <typename T>
1912 const Half<decltype(d)> dh;
1913 Mask256<T> ret;
1914 ret.m0 = Lt128(dh, a.v0, b.v0);
1915 ret.m1 = Lt128(dh, a.v1, b.v1);
1916 return ret;
1917}
1918
// Applies the 128-bit Lt128Upper to each half of `a` and `b` independently;
// the results form the m0/m1 halves of the returned mask.
1919template <typename T>
1921 const Half<decltype(d)> dh;
1922 Mask256<T> ret;
1923 ret.m0 = Lt128Upper(dh, a.v0, b.v0);
1924 ret.m1 = Lt128Upper(dh, a.v1, b.v1);
1925 return ret;
1926}
1927
// Applies the 128-bit Eq128 to each half of `a` and `b` independently; the
// results form the m0/m1 halves of the returned mask.
1928template <typename T>
1930 const Half<decltype(d)> dh;
1931 Mask256<T> ret;
1932 ret.m0 = Eq128(dh, a.v0, b.v0);
1933 ret.m1 = Eq128(dh, a.v1, b.v1);
1934 return ret;
1935}
1936
// Applies the 128-bit Eq128Upper to each half of `a` and `b` independently;
// the results form the m0/m1 halves of the returned mask.
1937template <typename T>
1939 const Half<decltype(d)> dh;
1940 Mask256<T> ret;
1941 ret.m0 = Eq128Upper(dh, a.v0, b.v0);
1942 ret.m1 = Eq128Upper(dh, a.v1, b.v1);
1943 return ret;
1944}
1945
// Applies the 128-bit Ne128 to each half of `a` and `b` independently; the
// results form the m0/m1 halves of the returned mask.
1946template <typename T>
1948 const Half<decltype(d)> dh;
1949 Mask256<T> ret;
1950 ret.m0 = Ne128(dh, a.v0, b.v0);
1951 ret.m1 = Ne128(dh, a.v1, b.v1);
1952 return ret;
1953}
1954
// Applies the 128-bit Ne128Upper to each half of `a` and `b` independently;
// the results form the m0/m1 halves of the returned mask.
1955template <typename T>
1957 const Half<decltype(d)> dh;
1958 Mask256<T> ret;
1959 ret.m0 = Ne128Upper(dh, a.v0, b.v0);
1960 ret.m1 = Ne128Upper(dh, a.v1, b.v1);
1961 return ret;
1962}
1963
// Applies the 128-bit Min128 to each half of `a` and `b` independently; the
// results form the v0/v1 halves of the returned vector.
1964template <typename T>
1966 const Half<decltype(d)> dh;
1967 Vec256<T> ret;
1968 ret.v0 = Min128(dh, a.v0, b.v0);
1969 ret.v1 = Min128(dh, a.v1, b.v1);
1970 return ret;
1971}
1972
// Applies the 128-bit Max128 to each half of `a` and `b` independently; the
// results form the v0/v1 halves of the returned vector.
1973template <typename T>
1975 const Half<decltype(d)> dh;
1976 Vec256<T> ret;
1977 ret.v0 = Max128(dh, a.v0, b.v0);
1978 ret.v1 = Max128(dh, a.v1, b.v1);
1979 return ret;
1980}
1981
// Applies the 128-bit Min128Upper to each half of `a` and `b` independently;
// the results form the v0/v1 halves of the returned vector.
1982template <typename T>
1984 const Half<decltype(d)> dh;
1985 Vec256<T> ret;
1986 ret.v0 = Min128Upper(dh, a.v0, b.v0);
1987 ret.v1 = Min128Upper(dh, a.v1, b.v1);
1988 return ret;
1989}
1990
// Applies the 128-bit Max128Upper to each half of `a` and `b` independently;
// the results form the v0/v1 halves of the returned vector.
1991template <typename T>
1993 const Half<decltype(d)> dh;
1994 Vec256<T> ret;
1995 ret.v0 = Max128Upper(dh, a.v0, b.v0);
1996 ret.v1 = Max128Upper(dh, a.v1, b.v1);
1997 return ret;
1998}
1999
2000// NOLINTNEXTLINE(google-readability-namespace-comments)
2001} // namespace HWY_NAMESPACE
2002} // namespace hwy
#define HWY_RESTRICT
Definition: base.h:64
#define HWY_API
Definition: base.h:129
#define HWY_INLINE
Definition: base.h:70
#define HWY_ASSERT(condition)
Definition: base.h:192
Definition: arm_neon-inl.h:825
Definition: arm_neon-inl.h:778
Raw raw
Definition: arm_neon-inl.h:814
Definition: wasm_256-inl.h:27
HWY_INLINE Vec256 & operator^=(const Vec256 other)
Definition: wasm_256-inl.h:52
HWY_INLINE Vec256 & operator&=(const Vec256 other)
Definition: wasm_256-inl.h:46
HWY_INLINE Vec256 & operator-=(const Vec256 other)
Definition: wasm_256-inl.h:43
HWY_INLINE Vec256 & operator+=(const Vec256 other)
Definition: wasm_256-inl.h:40
Vec128< T > v1
Definition: wasm_256-inl.h:57
HWY_INLINE Vec256 & operator|=(const Vec256 other)
Definition: wasm_256-inl.h:49
HWY_INLINE Vec256 & operator/=(const Vec256 other)
Definition: wasm_256-inl.h:37
static constexpr size_t kPrivateN
Definition: wasm_256-inl.h:30
Vec128< T > v0
Definition: wasm_256-inl.h:56
T PrivateT
Definition: wasm_256-inl.h:29
HWY_INLINE Vec256 & operator*=(const Vec256 other)
Definition: wasm_256-inl.h:34
HWY_API Vec128< T, N > Shuffle2301(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: wasm_128-inl.h:2413
HWY_API void LoadTransposedBlocks3(Simd< T, N, 0 > d, const T *HWY_RESTRICT unaligned, V &A, V &B, V &C)
Definition: generic_ops-inl.h:159
HWY_API Vec128< T, N > Shuffle3012(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: wasm_128-inl.h:2451
HWY_API void StoreTransposedBlocks2(const V A, const V B, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: generic_ops-inl.h:470
HWY_INLINE Vec128< T, N > Add(hwy::NonFloatTag, Vec128< T, N > a, Vec128< T, N > b)
Definition: emu128-inl.h:535
HWY_API void StoreTransposedBlocks4(const V A, const V B, const V C, const V D, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: generic_ops-inl.h:862
HWY_API svfloat32_t PromoteUpperTo(Simd< float, N, kPow2 > df, svfloat16_t v)
Definition: arm_sve-inl.h:1299
HWY_API Vec128< T, N > Shuffle1230(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: wasm_128-inl.h:2432
HWY_INLINE Vec128< T, N > IfThenElse(hwy::SizeTag< 1 >, Mask128< T, N > mask, Vec128< T, N > yes, Vec128< T, N > no)
Definition: x86_128-inl.h:670
HWY_API void StoreTransposedBlocks3(const V A, const V B, const V C, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: generic_ops-inl.h:505
HWY_API Vec128< ToT, N > ConvertTo(hwy::FloatTag, Simd< ToT, N, 0 >, Vec128< FromT, N > from)
Definition: emu128-inl.h:1685
HWY_API void LoadTransposedBlocks4(Simd< T, N, 0 > d, const T *HWY_RESTRICT unaligned, V &A, V &B, V &C, V &D)
Definition: generic_ops-inl.h:340
d
Definition: rvv-inl.h:1998
HWY_API Vec128< T, N > ShiftLeftSame(const Vec128< T, N > v, int bits)
Definition: arm_neon-inl.h:1631
HWY_API Vec128< T, N > AverageRound(Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:619
HWY_API Vec128< T, N > CopySign(const Vec128< T, N > magn, const Vec128< T, N > sign)
Definition: arm_neon-inl.h:2190
HWY_API Vec128< T, N > OddEvenBlocks(Vec128< T, N >, Vec128< T, N > even)
Definition: arm_neon-inl.h:4697
HWY_API Mask128< T, N > operator>(Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:2445
HWY_API Vec128< T, N > operator-(Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:576
HWY_API Mask128< TTo, N > RebindMask(Simd< TTo, N, 0 > dto, Mask128< TFrom, N > m)
Definition: arm_neon-inl.h:2230
HWY_API Vec128< T, N > DupOdd(Vec128< T, N > v)
Definition: arm_neon-inl.h:4662
HWY_API Mask128< T, N > operator==(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:1139
HWY_API VFromD< DW > ZipLower(V a, V b)
Definition: arm_neon-inl.h:4272
HWY_INLINE Mask128< T, N > Ne128Upper(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:6685
HWY_API bool AllTrue(const Full128< T > d, const Mask128< T > m)
Definition: arm_neon-inl.h:5716
HWY_API Vec128< T > Shuffle1032(const Vec128< T > v)
Definition: arm_neon-inl.h:4131
HWY_API Vec128< int16_t > MulHigh(const Vec128< int16_t > a, const Vec128< int16_t > b)
Definition: arm_neon-inl.h:1684
HWY_API Vec128< T > Shuffle2103(const Vec128< T > v)
Definition: arm_neon-inl.h:4147
HWY_API Vec128< float, N > Round(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3436
HWY_API Vec128< T, N > ZeroExtendVector(Simd< T, N, 0 > d, Vec128< T, N/2 > lo)
Definition: arm_neon-inl.h:4448
HWY_API Vec256< T > TableLookupLanesOr0(Vec256< T > v, Indices256< T > idx)
Definition: wasm_256-inl.h:1090
HWY_API Mask128< T, N > IsNaN(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3506
HWY_API intptr_t FindFirstTrue(const Simd< T, N, 0 > d, const Mask128< T, N > mask)
Definition: arm_neon-inl.h:5691
HWY_API V128 CombineShiftRightBytes(Full128< T > d, V128 hi, V128 lo)
Definition: arm_neon-inl.h:3592
HWY_API Vec128< T, N > ShiftLeftLanes(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3695
HWY_API Mask128< T, N > FirstN(const Simd< T, N, 0 > d, size_t num)
Definition: arm_neon-inl.h:2456
HWY_API size_t StoreMaskBits(Simd< T, N, 0 >, const Mask128< T, N > mask, uint8_t *bits)
Definition: arm_neon-inl.h:5701
HWY_API Vec128< float, N > MulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition: arm_neon-inl.h:1799
HWY_API void Stream(const Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2955
HWY_API Vec128< T, N > Xor3(Vec128< T, N > x1, Vec128< T, N > x2, Vec128< T, N > x3)
Definition: arm_neon-inl.h:2025
HWY_INLINE Mask128< T, N > Eq128Upper(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:6668
HWY_API Vec128< T, N > And(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1949
HWY_API Vec128< T, N > SumOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:5334
HWY_API Vec128< T, N > BroadcastSignBit(const Vec128< T, N > v)
Definition: arm_neon-inl.h:2207
HWY_API Vec128< To, 1 > TruncateTo(Simd< To, 1, 0 >, const Vec128< From, 1 > v)
Definition: arm_neon-inl.h:4806
HWY_API Vec128< uint64_t, N > Min(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition: arm_neon-inl.h:2517
HWY_API Vec128< uint64_t, N > Max(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition: arm_neon-inl.h:2555
HWY_API Mask128< T, N > MaskFromVec(const Vec128< T, N > v)
Definition: arm_neon-inl.h:2217
HWY_API Vec128< T, N > ConcatUpperUpper(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4517
HWY_INLINE Mask128< T, N > Ne128(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:6677
Rebind< MakeUnsigned< TFromD< D > >, D > RebindToUnsigned
Definition: ops/shared-inl.h:212
HWY_API Vec128< T, N > SaturatedAdd(Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:597
HWY_API Vec128< T, N > GatherIndex(const Simd< T, N, 0 > d, const T *HWY_RESTRICT base, const Vec128< Index, N > index)
Definition: arm_neon-inl.h:5037
HWY_INLINE Vec128< uint64_t > MulOdd(Vec128< uint64_t > a, Vec128< uint64_t > b)
Definition: arm_neon-inl.h:4912
HWY_INLINE Mask128< T, N > Eq128(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:6660
N ConcatEven(Simd< T, N, 0 >, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4617
HWY_API Vec128< T > Shuffle0321(const Vec128< T > v)
Definition: arm_neon-inl.h:4141
HWY_API Vec128< T > Not(const Vec128< T > v)
Definition: arm_neon-inl.h:1931
HWY_API Mask128< T, N > IsInf(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3511
HWY_API Vec128< T, N > ConcatLowerUpper(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4544
HWY_API Vec128< T, N/2 > LowerHalf(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3540
HWY_API Vec128< T, N > operator&(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:2055
HWY_API Vec128< T, N > operator|(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:2060
HWY_API Vec128< uint64_t > InterleaveLower(const Vec128< uint64_t > a, const Vec128< uint64_t > b)
Definition: arm_neon-inl.h:4181
HWY_API Vec128< int64_t > MulEven(Vec128< int32_t > a, Vec128< int32_t > b)
Definition: arm_neon-inl.h:4872
HWY_API Vec128< bfloat16_t, 2 *N > ReorderDemote2To(Simd< bfloat16_t, 2 *N, 0 > dbf16, Vec128< float, N > a, Vec128< float, N > b)
Definition: arm_neon-inl.h:4719
HWY_API Vec128< T, 1 > CompressNot(Vec128< T, 1 > v, Mask128< T, 1 >)
Definition: arm_neon-inl.h:6198
HWY_API Vec128< T, N > MaskedLoad(Mask128< T, N > m, Simd< T, N, 0 > d, const T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2758
Rebind< MakeSigned< TFromD< D > >, D > RebindToSigned
Definition: ops/shared-inl.h:210
HWY_API Mask128< T, N > operator<(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:1163
HWY_API Vec128< uint64_t > CompressBlocksNot(Vec128< uint64_t > v, Mask128< uint64_t >)
Definition: arm_neon-inl.h:6226
HWY_API Vec128< float, N > ReorderWidenMulAccumulate(Simd< float, N, 0 > df32, Vec128< bfloat16_t, 2 *N > a, Vec128< bfloat16_t, 2 *N > b, const Vec128< float, N > sum0, Vec128< float, N > &sum1)
Definition: arm_neon-inl.h:4288
HWY_API Vec128< T, N > IfVecThenElse(Vec128< T, N > mask, Vec128< T, N > yes, Vec128< T, N > no)
Definition: arm_neon-inl.h:2047
HWY_API Vec128< T, N > operator^(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:2065
HWY_API void BlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT p)
Definition: arm_neon-inl.h:2941
HWY_API size_t CountTrue(Full128< T >, const Mask128< T > mask)
Definition: arm_neon-inl.h:5671
HWY_API Vec128< T, N > VecFromMask(Simd< T, N, 0 > d, const Mask128< T, N > v)
Definition: arm_neon-inl.h:2223
HWY_API Vec128< T, N > DupEven(Vec128< T, N > v)
Definition: arm_neon-inl.h:4646
HWY_API Vec128< T, N > IfThenElseZero(const Mask128< T, N > mask, const Vec128< T, N > yes)
Definition: arm_neon-inl.h:2253
HWY_API Mask128< uint64_t, N > TestBit(Vec128< uint64_t, N > v, Vec128< uint64_t, N > bit)
Definition: arm_neon-inl.h:2477
HWY_API constexpr size_t Lanes(Simd< T, N, kPow2 >)
Definition: arm_sve-inl.h:243
HWY_API Vec128< T, N > Load(Simd< T, N, 0 > d, const T *HWY_RESTRICT p)
Definition: arm_neon-inl.h:2753
HWY_API Vec128< int64_t > Neg(const Vec128< int64_t > v)
Definition: arm_neon-inl.h:1413
HWY_API Vec128< TI > TableLookupBytes(const Vec128< T > bytes, const Vec128< TI > from)
Definition: arm_neon-inl.h:4922
HWY_API Vec128< T, N > IfThenElse(const Mask128< T, N > mask, const Vec128< T, N > yes, const Vec128< T, N > no)
Definition: emu128-inl.h:303
HWY_API Vec128< T, N > TableLookupLanes(Vec128< T, N > v, Indices128< T, N > idx)
Definition: arm_neon-inl.h:4019
HWY_API Vec128< T, N > Xor(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1998
HWY_API Vec128< float, N > Floor(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3467
HWY_API Vec128< float, N > MulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition: arm_neon-inl.h:1853
HWY_API Vec128< T, N > CopySignToAbs(const Vec128< T, N > abs, const Vec128< T, N > sign)
Definition: arm_neon-inl.h:2198
HWY_API void StoreU(const Vec128< uint8_t > v, Full128< uint8_t >, uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:2772
HWY_INLINE VFromD< D > Min128Upper(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:6705
N ConcatOdd(Simd< T, N, 0 >, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4586
HWY_API Vec128< float, N > Ceil(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3453
HWY_API Indices128< T, N > IndicesFromVec(Simd< T, N, 0 > d, Vec128< TI, N > vec)
Definition: arm_neon-inl.h:3973
HWY_API Vec128< T, N > SwapAdjacentBlocks(Vec128< T, N > v)
Definition: arm_neon-inl.h:4704
HWY_API Vec128< T, N > ShiftLeftBytes(Simd< T, N, 0 >, Vec128< T, N > v)
Definition: arm_neon-inl.h:3684
HWY_INLINE VFromD< D > Min128(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:6695
HWY_API Vec128< T, N > Reverse2(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4061
HWY_API Vec64< uint32_t > Shuffle2301(const Vec64< uint32_t > v)
Definition: arm_neon-inl.h:2326
svuint16_t Set(Simd< bfloat16_t, N, kPow2 > d, bfloat16_t arg)
Definition: arm_sve-inl.h:322
HWY_API Vec128< uint8_t > Combine(Full128< uint8_t >, Vec64< uint8_t > hi, Vec64< uint8_t > lo)
Definition: arm_neon-inl.h:4352
HWY_API Vec128< T, N > Reverse8(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4113
HWY_API Vec< D > SignBit(D d)
Definition: generic_ops-inl.h:69
HWY_API Vec128< T, N > MaxOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:5342
Vec128< T, N > Iota(const Simd< T, N, 0 > d, const T2 first)
Definition: arm_neon-inl.h:1049
HWY_API Mask128< T, N > ExclusiveNeither(const Mask128< T, N > a, Mask128< T, N > b)
Definition: arm_neon-inl.h:2314
HWY_INLINE Vec128< T, N > CompressBits(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits)
Definition: arm_neon-inl.h:6234
HWY_API Mask128< T, N > LoadMaskBits(Simd< T, N, 0 > d, const uint8_t *HWY_RESTRICT bits)
Definition: arm_neon-inl.h:5407
HWY_API Vec128< T, N > ZeroIfNegative(Vec128< T, N > v)
Definition: arm_neon-inl.h:2277
HWY_API Vec128< T > Shuffle01(const Vec128< T > v)
Definition: arm_neon-inl.h:4135
HWY_INLINE VFromD< D > Max128Upper(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:6710
HWY_INLINE Mask128< T, N > Lt128(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:6623
HWY_API Vec128< float, N > operator/(const Vec128< float, N > a, const Vec128< float, N > b)
Definition: arm_neon-inl.h:1761
HWY_API Vec64< uint16_t > DemoteTo(Full64< uint16_t >, const Vec128< int32_t > v)
Definition: arm_neon-inl.h:3145
HWY_API Vec128< uint8_t > LoadU(Full128< uint8_t >, const uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:2591
HWY_API Vec128< T, N > OrAnd(Vec128< T, N > o, Vec128< T, N > a1, Vec128< T, N > a2)
Definition: arm_neon-inl.h:2040
HWY_API Vec128< T, N > IfNegativeThenElse(Vec128< T, N > v, Vec128< T, N > yes, Vec128< T, N > no)
Definition: arm_neon-inl.h:2266
HWY_API Vec128< T, N > ConcatUpperLower(Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4570
HWY_API Vec128< uint8_t > operator<<(const Vec128< uint8_t > v, const Vec128< uint8_t > bits)
Definition: arm_neon-inl.h:1462
HWY_API Vec128< uint16_t > operator*(const Vec128< uint16_t > a, const Vec128< uint16_t > b)
Definition: arm_neon-inl.h:1642
HWY_API Vec128< T, N > BitCast(Simd< T, N, 0 > d, Vec128< FromT, N *sizeof(T)/sizeof(FromT)> v)
Definition: arm_neon-inl.h:997
HWY_API bool AllFalse(const Simd< T, N, 0 > d, const Mask128< T, N > m)
Definition: arm_neon-inl.h:5710
HWY_API Vec64< uint8_t > UpperHalf(Full64< uint8_t >, const Vec128< uint8_t > v)
Definition: arm_neon-inl.h:3739
HWY_API T ExtractLane(const Vec128< T, 1 > v, size_t i)
Definition: arm_neon-inl.h:1085
HWY_API void ScatterOffset(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT base, const Vec128< Offset, N > offset)
Definition: arm_neon-inl.h:4984
HWY_API Vec128< T, N > Undefined(Simd< T, N, 0 >)
Definition: arm_neon-inl.h:1040
HWY_API VFromD< DW > ZipUpper(DW dw, V a, V b)
Definition: arm_neon-inl.h:4281
HWY_API Vec128< T, N > ShiftRight(Vec128< T, N > v)
Definition: emu128-inl.h:386
HWY_API Vec128< T, N > ConcatLowerLower(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4456
typename D::template Rebind< T > Rebind
Definition: ops/shared-inl.h:207
HWY_API Vec128< float, N > RearrangeToOddPlusEven(const Vec128< float, N > sum0, const Vec128< float, N > sum1)
Definition: arm_neon-inl.h:4412
HWY_API Vec128< T, N > Zero(Simd< T, N, 0 > d)
Definition: arm_neon-inl.h:1020
HWY_API Mask128< T, N > operator>=(Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:2449
HWY_API Vec128< T, N > ShiftRightSame(const Vec128< T, N > v, int bits)
Definition: arm_neon-inl.h:1635
HWY_API V InterleaveUpper(Simd< T, N, 0 >, V a, V b)
Definition: arm_neon-inl.h:4256
HWY_API Vec128< T, N > GatherOffset(const Simd< T, N, 0 > d, const T *HWY_RESTRICT base, const Vec128< Offset, N > offset)
Definition: arm_neon-inl.h:5020
HWY_API Vec128< T, N > IfThenZeroElse(const Mask128< T, N > mask, const Vec128< T, N > no)
Definition: arm_neon-inl.h:2260
HWY_API Mask128< T, N > operator!=(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:1148
HWY_API Vec128< T, N > Or(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1986
HWY_INLINE VFromD< D > Max128(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:6700
HWY_API Vec128< int32_t, N > NearestInt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3497
HWY_API Vec128< float > ApproximateReciprocal(const Vec128< float > v)
Definition: arm_neon-inl.h:1734
HWY_API Vec32< uint8_t > U8FromU32(const Vec128< uint32_t > v)
Definition: arm_neon-inl.h:3287
HWY_API Indices128< T, N > SetTableIndices(Simd< T, N, 0 > d, const TI *idx)
Definition: arm_neon-inl.h:4013
HWY_API TFromV< V > GetLane(const V v)
Definition: arm_neon-inl.h:1076
HWY_API void ScatterIndex(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT base, const Vec128< Index, N > index)
Definition: arm_neon-inl.h:5002
HWY_API Vec128< float, N > NegMulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition: arm_neon-inl.h:1832
HWY_API Vec128< uint16_t > PromoteTo(Full128< uint16_t >, const Vec64< uint8_t > v)
Definition: arm_neon-inl.h:2965
HWY_API Mask128< T, N > operator<=(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:1180
HWY_API Vec128< T, N > Or3(Vec128< T, N > o1, Vec128< T, N > o2, Vec128< T, N > o3)
Definition: arm_neon-inl.h:2033
decltype(Zero(D())) VFromD
Definition: arm_neon-inl.h:1030
HWY_API Vec128< T, N > LoadDup128(Simd< T, N, 0 > d, const T *const HWY_RESTRICT p)
Definition: arm_neon-inl.h:2765
HWY_API Vec128< T, N > OddEven(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:4678
HWY_API Vec128< int16_t > MulFixedPoint15(Vec128< int16_t > a, Vec128< int16_t > b)
Definition: arm_neon-inl.h:1720
HWY_API Vec128< T > Shuffle0123(const Vec128< T > v)
Definition: arm_neon-inl.h:4153
HWY_API Vec128< float, N > Trunc(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3425
typename D::Half Half
Definition: ops/shared-inl.h:227
HWY_API Vec128< T, N > MinOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:5338
HWY_API Vec128< T, N > ShiftRightBytes(Simd< T, N, 0 >, Vec128< T, N > v)
Definition: arm_neon-inl.h:3707
HWY_API size_t CompressStore(Vec128< T, N > v, const Mask128< T, N > mask, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:6248
typename D::template Repartition< T > Repartition
Definition: ops/shared-inl.h:218
HWY_API Vec128< int8_t > Abs(const Vec128< int8_t > v)
Definition: arm_neon-inl.h:2146
HWY_API Vec128< float > ConvertTo(Full128< float >, const Vec128< int32_t > v)
Definition: arm_neon-inl.h:3327
N
Definition: rvv-inl.h:1998
HWY_API Vec128< float, N > Sqrt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:1913
HWY_API size_t CompressBitsStore(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:6273
HWY_API Vec128< uint32_t, N > RotateRight(const Vec128< uint32_t, N > v)
Definition: arm_neon-inl.h:1444
HWY_API Mask128< T, N > IsFinite(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3521
HWY_API Vec128< T, N > AndNot(const Vec128< T, N > not_mask, const Vec128< T, N > mask)
Definition: arm_neon-inl.h:1964
HWY_API Vec128< uint64_t > SumsOf8(const Vec128< uint8_t > v)
Definition: arm_neon-inl.h:1361
HWY_API Vec128< float > ApproximateReciprocalSqrt(const Vec128< float > v)
Definition: arm_neon-inl.h:1885
HWY_API Vec128< T > ReverseBlocks(Full128< T >, const Vec128< T > v)
Definition: arm_neon-inl.h:4712
HWY_API size_t CompressBlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:6257
HWY_API Vec128< T, N > Reverse4(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4090
HWY_API size_t FindKnownFirstTrue(const Simd< T, N, 0 > d, const Mask128< T, N > mask)
Definition: arm_neon-inl.h:5683
HWY_API Vec128< T, N > operator+(Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:580
HWY_API Vec128< T, 1 > Reverse(Simd< T, 1, 0 >, const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:4030
HWY_API Vec128< uint8_t > operator>>(const Vec128< uint8_t > v, const Vec128< uint8_t > bits)
Definition: arm_neon-inl.h:1542
HWY_API void Store(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2934
HWY_API Vec128< T, 1 > InsertLane(const Vec128< T, 1 > v, size_t i, T t)
Definition: arm_neon-inl.h:1225
HWY_INLINE Mask128< T, N > Lt128Upper(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:6651
HWY_API Vec128< T, N > SaturatedSub(Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:608
HWY_API Vec128< T, N > ShiftLeft(Vec128< T, N > v)
Definition: emu128-inl.h:376
HWY_API Vec128< uint16_t > Broadcast(const Vec128< uint16_t > v)
Definition: arm_neon-inl.h:3885
const vfloat64m1_t v
Definition: rvv-inl.h:1998
HWY_API Vec128< float > AbsDiff(const Vec128< float > a, const Vec128< float > b)
Definition: arm_neon-inl.h:1773
HWY_API Vec128< T, N > ShiftRightLanes(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3713
HWY_API VI TableLookupBytesOr0(const V bytes, const VI from)
Definition: arm_neon-inl.h:4977
HWY_API Vec128< T, 1 > Compress(Vec128< T, 1 > v, Mask128< T, 1 >)
Definition: arm_neon-inl.h:6174
HWY_API Vec128< float, N > NegMulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition: arm_neon-inl.h:1861
Definition: aligned_allocator.h:27
HWY_INLINE constexpr T AddWithWraparound(hwy::FloatTag, T t, size_t n)
Definition: base.h:906
typename detail::Relations< T >::Unsigned MakeUnsigned
Definition: base.h:593
typename detail::Relations< T >::Signed MakeSigned
Definition: base.h:595
#define HWY_NAMESPACE
Definition: set_macros-inl.h:82
Definition: arm_neon-inl.h:3968
Definition: wasm_256-inl.h:1043
__v128_u i0
Definition: wasm_256-inl.h:1044
__v128_u i1
Definition: wasm_256-inl.h:1045
Definition: wasm_256-inl.h:61
Mask128< T > m1
Definition: wasm_256-inl.h:63
Mask128< T > m0
Definition: wasm_256-inl.h:62
Definition: ops/shared-inl.h:52
HWY_AFTER_NAMESPACE()
HWY_BEFORE_NAMESPACE()