#if HWY_IDE && !defined(HWY_HIGHWAY_INCLUDED)

#if HWY_TARGET != HWY_SCALAR && HWY_TARGET != HWY_RVV

template <size_t kLanes, class D, class V = VFromD<D>>

  constexpr size_t kBytes = kLanes * sizeof(LaneType<V>);
  static_assert(kBytes < 16, "Shift count is per-block");
  return CombineShiftRightBytes<kBytes>(d, hi, lo);
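
// Usage sketch, not part of this header: within each 128-bit block,
// CombineShiftRightLanes<k>(d, hi, lo) yields lanes k..end of lo followed by
// the first k lanes of hi. The helper name below is illustrative only and
// assumes the usual HWY_NAMESPACE context.
template <class D, class V = VFromD<D>>
V SlideForwardBy2(D d, V hi, V lo) {
  // Lanes 2..end of lo, then lanes 0..1 of hi, per block.
  return CombineShiftRightLanes<2>(d, hi, lo);
}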
  using TU = TFromD<decltype(du)>;
  const TU max_x2 = static_cast<TU>(MaxExponentTimes2<T>());

template <class D, typename T = TFromD<D>>

#if HWY_MEM_OPS_MIGHT_FAULT

  for (size_t i = 0; i < num; ++i) {

template <class D, typename T = TFromD<D>>

#if HWY_MEM_OPS_MIGHT_FAULT

  for (size_t i = 0; i < num; ++i) {
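
// Usage sketch, not part of this header: SafeFillN and SafeCopyN handle a
// remainder of fewer than Lanes(d) elements without reading or writing past
// the end of the arrays. The helper name is illustrative only and assumes the
// usual HWY_NAMESPACE context.
template <class D, typename T = TFromD<D>>
void ZeroThenCopyRemainder(D d, const T* HWY_RESTRICT from,
                           T* HWY_RESTRICT to, size_t num) {
  SafeFillN(num, T{0}, d, to);  // to[0, num) = 0
  SafeCopyN(num, d, from, to);  // to[0, num) = from[0, num)
}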
#if (defined(HWY_NATIVE_LOAD_STORE_INTERLEAVED) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED
#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED
#else
#define HWY_NATIVE_LOAD_STORE_INTERLEAVED
#endif

template <typename T, size_t N, class V>

  const V A = LoadU(d, unaligned + 0 * N);
  const V B = LoadU(d, unaligned + 1 * N);
template <typename T, class V>

  v0 = LoadU(d, unaligned + 0);
  v1 = LoadU(d, unaligned + 1);
template <typename T, size_t N, class V, HWY_IF_LE128(T, N)>

  A = LoadU(d, unaligned + 0 * N);
  B = LoadU(d, unaligned + 1 * N);
  C = LoadU(d, unaligned + 2 * N);
template <typename T, size_t N, class V, HWY_IF_LANES_PER_BLOCK(T, N, 16)>

                              V& v0, V& v1, V& v2) {

  constexpr uint8_t Z = 0x80;
  alignas(16) constexpr uint8_t kIdx_v0A[16] = {0, 3, 6, 9, 12, 15, Z, Z,
                                                Z, Z, Z, Z, Z, Z, Z, Z};
  alignas(16) constexpr uint8_t kIdx_v0B[16] = {Z, Z, Z, Z, Z, Z, 2, 5,
                                                8, 11, 14, Z, Z, Z, Z, Z};
  alignas(16) constexpr uint8_t kIdx_v0C[16] = {Z, Z, Z, Z, Z, Z, Z, Z,
                                                Z, Z, Z, 1, 4, 7, 10, 13};
  alignas(16) constexpr uint8_t kIdx_v1A[16] = {1, 4, 7, 10, 13, Z, Z, Z,
                                                Z, Z, Z, Z, Z, Z, Z, Z};
  alignas(16) constexpr uint8_t kIdx_v1B[16] = {Z, Z, Z, Z, Z, 0, 3, 6,
                                                9, 12, 15, Z, Z, Z, Z, Z};
  alignas(16) constexpr uint8_t kIdx_v1C[16] = {Z, Z, Z, Z, Z, Z, Z, Z,
                                                Z, Z, Z, 2, 5, 8, 11, 14};
  alignas(16) constexpr uint8_t kIdx_v2A[16] = {2, 5, 8, 11, 14, Z, Z, Z,
                                                Z, Z, Z, Z, Z, Z, Z, Z};
  alignas(16) constexpr uint8_t kIdx_v2B[16] = {Z, Z, Z, Z, Z, 1, 4, 7,
                                                10, 13, Z, Z, Z, Z, Z, Z};
  alignas(16) constexpr uint8_t kIdx_v2C[16] = {Z, Z, Z, Z, Z, Z, Z, Z,
                                                Z, Z, 0, 3, 6, 9, 12, 15};

  v0 = Xor3(v0L, v0M, v0U);
  v1 = Xor3(v1L, v1M, v1U);
  v2 = Xor3(v2L, v2M, v2U);
                              V& v0, V& v1, V& v2) {

  constexpr uint8_t Z = 0x80;
  alignas(16) constexpr uint8_t kIdx_v0A[16] = {0, 3, 6, Z, Z, Z, Z, Z};
  alignas(16) constexpr uint8_t kIdx_v0B[16] = {Z, Z, Z, 1, 4, 7, Z, Z};
  alignas(16) constexpr uint8_t kIdx_v0C[16] = {Z, Z, Z, Z, Z, Z, 2, 5};
  alignas(16) constexpr uint8_t kIdx_v1A[16] = {1, 4, 7, Z, Z, Z, Z, Z};
  alignas(16) constexpr uint8_t kIdx_v1B[16] = {Z, Z, Z, 2, 5, Z, Z, Z};
  alignas(16) constexpr uint8_t kIdx_v1C[16] = {Z, Z, Z, Z, Z, 0, 3, 6};
  alignas(16) constexpr uint8_t kIdx_v2A[16] = {2, 5, Z, Z, Z, Z, Z, Z};
  alignas(16) constexpr uint8_t kIdx_v2B[16] = {Z, Z, 0, 3, 6, Z, Z, Z};
  alignas(16) constexpr uint8_t kIdx_v2C[16] = {Z, Z, Z, Z, Z, 1, 4, 7};

  v0 = Xor3(v0L, v0M, v0U);
  v1 = Xor3(v1L, v1M, v1U);
  v2 = Xor3(v2L, v2M, v2U);
                              V& v0, V& v1, V& v2) {

  constexpr uint16_t Z = 0x8080;
  alignas(16) constexpr uint16_t kIdx_v0A[8] = {0x0100, 0x0706, 0x0D0C, Z,
                                                Z, Z, Z, Z};
  alignas(16) constexpr uint16_t kIdx_v0B[8] = {Z, Z, Z, 0x0302,
                                                0x0908, 0x0F0E, Z, Z};
  alignas(16) constexpr uint16_t kIdx_v0C[8] = {Z, Z, Z, Z,
                                                Z, Z, 0x0504, 0x0B0A};
  alignas(16) constexpr uint16_t kIdx_v1A[8] = {0x0302, 0x0908, 0x0F0E, Z,
                                                Z, Z, Z, Z};
  alignas(16) constexpr uint16_t kIdx_v1B[8] = {Z, Z, Z, 0x0504,
                                                0x0B0A, Z, Z, Z};
  alignas(16) constexpr uint16_t kIdx_v1C[8] = {Z, Z, Z, Z,
                                                Z, 0x0100, 0x0706, 0x0D0C};
  alignas(16) constexpr uint16_t kIdx_v2A[8] = {0x0504, 0x0B0A, Z, Z,
                                                Z, Z, Z, Z};
  alignas(16) constexpr uint16_t kIdx_v2B[8] = {Z, Z, 0x0100, 0x0706,
                                                0x0D0C, Z, Z, Z};
  alignas(16) constexpr uint16_t kIdx_v2C[8] = {Z, Z, Z, Z,
                                                Z, 0x0302, 0x0908, 0x0F0E};

  v0 = Xor3(v0L, v0M, v0U);
  v1 = Xor3(v1L, v1M, v1U);
  v2 = Xor3(v2L, v2M, v2U);
template <typename T, size_t N, class V, HWY_IF_LANES_PER_BLOCK(T, N, 4)>

                              V& v0, V& v1, V& v2) {

  const V vxx_02_03_xx = OddEven(C, B);

  const V vxx_xx_10_11 = OddEven(A, B);
  const V v12_13_xx_xx = OddEven(B, C);

  const V vxx_20_21_xx = OddEven(B, A);
template <typename T, size_t N, class V, HWY_IF_LANES_PER_BLOCK(T, N, 2)>

                              V& v0, V& v1, V& v2) {

  v1 = CombineShiftRightBytes<sizeof(T)>(d, C, A);
template <typename T, class V>

                              V& v0, V& v1, V& v2) {
  v0 = LoadU(d, unaligned + 0);
  v1 = LoadU(d, unaligned + 1);
  v2 = LoadU(d, unaligned + 2);
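
// Usage sketch, not part of this header: de-interleaving packed RGB bytes
// into separate planes with LoadInterleaved3. The function name is
// illustrative only and assumes the usual HWY_NAMESPACE context.
inline void DeinterleaveRGB(const uint8_t* HWY_RESTRICT rgb, size_t num_pixels,
                            uint8_t* HWY_RESTRICT r, uint8_t* HWY_RESTRICT g,
                            uint8_t* HWY_RESTRICT b) {
  using D = ScalableTag<uint8_t>;
  const D d;
  const size_t N = Lanes(d);
  for (size_t i = 0; i + N <= num_pixels; i += N) {
    VFromD<D> vr, vg, vb;
    LoadInterleaved3(d, rgb + 3 * i, vr, vg, vb);
    StoreU(vr, d, r + i);
    StoreU(vg, d, g + i);
    StoreU(vb, d, b + i);
  }
  // A scalar loop would handle the final partial vector, omitted here.
}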
template <typename T, size_t N, class V, HWY_IF_LE128(T, N)>

  A = LoadU(d, unaligned + 0 * N);
  B = LoadU(d, unaligned + 1 * N);
  C = LoadU(d, unaligned + 2 * N);
  D = LoadU(d, unaligned + 3 * N);
template <typename T, size_t N, class V, HWY_IF_LANES_PER_BLOCK(T, N, 16)>

                              V& v0, V& v1, V& v2, V& v3) {

  using V64 = VFromD<decltype(d64)>;
template <typename T, size_t N, class V, HWY_IF_LANES_PER_BLOCK(T, N, 8)>

                              V& v0, V& v1, V& v2, V& v3) {

  using VW = VFromD<decltype(dw)>;
template <typename T, size_t N, class V, HWY_IF_LANES_PER_BLOCK(T, N, 4)>

                              V& v0, V& v1, V& v2, V& v3) {
template <typename T, size_t N, class V, HWY_IF_LANES_PER_BLOCK(T, N, 2)>

                              V& v0, V& v1, V& v2, V& v3) {
template <typename T, class V>

                              V& v0, V& v1, V& v2, V& v3) {
  v0 = LoadU(d, unaligned + 0);
  v1 = LoadU(d, unaligned + 1);
  v2 = LoadU(d, unaligned + 2);
  v3 = LoadU(d, unaligned + 3);
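
// Usage sketch, not part of this header: loading interleaved RGBA pixels and
// keeping only the alpha channel. The function name is illustrative only and
// assumes the usual HWY_NAMESPACE context.
inline void ExtractAlpha(const uint8_t* HWY_RESTRICT rgba, size_t num_pixels,
                         uint8_t* HWY_RESTRICT alpha) {
  using D = ScalableTag<uint8_t>;
  const D d;
  const size_t N = Lanes(d);
  for (size_t i = 0; i + N <= num_pixels; i += N) {
    VFromD<D> r, g, b, a;
    LoadInterleaved4(d, rgba + 4 * i, r, g, b, a);
    StoreU(a, d, alpha + i);
  }
  // A scalar loop would handle the final partial vector, omitted here.
}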
template <typename T, size_t N, class V, HWY_IF_LE128(T, N)>

template <typename T, size_t N, class V, HWY_IF_GE128(T, N)>

template <class V, typename T, size_t N, HWY_IF_LE64(T, N)>

  const Twice<decltype(d)> d2;

  StoreU(v10, d2, unaligned);
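
// Usage sketch, not part of this header: interleaving separate real and
// imaginary planes into packed (re, im) pairs with StoreInterleaved2.
// The function name is illustrative only; assumes the usual HWY_NAMESPACE
// context.
inline void InterleaveComplex(const float* HWY_RESTRICT re,
                              const float* HWY_RESTRICT im, size_t num,
                              float* HWY_RESTRICT out) {
  const ScalableTag<float> d;
  const size_t N = Lanes(d);
  for (size_t i = 0; i + N <= num; i += N) {
    StoreInterleaved2(LoadU(d, re + i), LoadU(d, im + i), d, out + 2 * i);
  }
}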
template <typename T, size_t N, class V, HWY_IF_LE128(T, N)>

  using TU = TFromD<decltype(du)>;
  const auto k5 = Set(du, TU{5});
  const auto k6 = Set(du, TU{6});

  alignas(16) static constexpr uint8_t tbl_v0[16] = {
      0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80,
      3, 0x80, 0x80, 4, 0x80, 0x80, 5};
  alignas(16) static constexpr uint8_t tbl_v1[16] = {
      0x80, 0, 0x80, 0x80, 1, 0x80,
      0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};

  const auto shuf_A2 = CombineShiftRightBytes<15>(du, shuf_A1, shuf_A1);

  const V A = BitCast(d, A0 | A1 | A2);

  const auto shuf_B0 = shuf_A2 + k6;
  const auto shuf_B1 = shuf_A0 + k5;
  const auto shuf_B2 = shuf_A1 + k5;

  const V B = BitCast(d, B0 | B1 | B2);

  const auto shuf_C0 = shuf_B2 + k6;
  const auto shuf_C1 = shuf_B0 + k5;
  const auto shuf_C2 = shuf_B1 + k5;

  const V C = BitCast(d, C0 | C1 | C2);
  const auto k2 = Set(du8, uint8_t{2 * sizeof(T)});
  const auto k3 = Set(du8, uint8_t{3 * sizeof(T)});

  alignas(16) static constexpr uint8_t tbl_v1[16] = {
      0x80, 0x80, 0, 1, 0x80, 0x80, 0x80, 0x80,
      2, 3, 0x80, 0x80, 0x80, 0x80, 4, 5};
  alignas(16) static constexpr uint8_t tbl_v2[16] = {
      0x80, 0x80, 0x80, 0x80, 0, 1, 0x80, 0x80,
      0x80, 0x80, 2, 3, 0x80, 0x80, 0x80, 0x80};

  const auto shuf_A0 = CombineShiftRightBytes<2>(du8, shuf_A1, shuf_A1);

  const V A = BitCast(d, A0 | A1 | A2);

  const auto shuf_B0 = shuf_A1 + k3;
  const auto shuf_B1 = shuf_A2 + k3;
  const auto shuf_B2 = shuf_A0 + k2;

  const V B = BitCast(d, B0 | B1 | B2);

  const auto shuf_C0 = shuf_B1 + k3;
  const auto shuf_C1 = shuf_B2 + k3;
  const auto shuf_C2 = shuf_B0 + k2;

  const V C = BitCast(d, C0 | C1 | C2);
  const V v01_v20 = OddEven(v0, v2);

  const V v1_321 = ShiftRightLanes<1>(d, v1);
  const V v0_32 = ShiftRightLanes<2>(d, v0);
  const V v21_v11 = OddEven(v2, v1_321);
  const V v12_v02 = OddEven(v1_321, v0_32);

  const V v23_v13 = OddEven(v2, v1_321);
  const V v03_v22 = OddEven(v0, v2);
template <class V, typename T, HWY_IF_LANE_SIZE(T, 1)>

  constexpr size_t N = 16 / sizeof(T);

  const auto k5 = Set(du, uint8_t{5});
  const auto k6 = Set(du, uint8_t{6});

  alignas(16) static constexpr uint8_t tbl_v0[16] = {
      0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80,
      3, 0x80, 0x80, 4, 0x80, 0x80, 5};
  alignas(16) static constexpr uint8_t tbl_v1[16] = {
      0x80, 0, 0x80, 0x80, 1, 0x80,
      0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};

  const auto shuf_A0 = Load(du, tbl_v0);
  const auto shuf_A1 = Load(du, tbl_v1);
  const auto shuf_A2 = CombineShiftRightBytes<15>(du, shuf_A1, shuf_A1);

  const auto A = BitCast(d_full, A0 | A1 | A2);
  StoreU(A, d_full, unaligned + 0 * N);

  const auto shuf_B0 = shuf_A2 + k6;
  const auto shuf_B1 = shuf_A0 + k5;
  const auto shuf_B2 = shuf_A1 + k5;

  const V B{(B0 | B1 | B2).raw};
template <typename T, HWY_IF_LANE_SIZE(T, 2)>

  constexpr size_t N = 16 / sizeof(T);
  const auto k2 = Set(du8, uint8_t{2 * sizeof(T)});
  const auto k3 = Set(du8, uint8_t{3 * sizeof(T)});

  alignas(16) static constexpr uint8_t tbl_v1[16] = {
      0x80, 0x80, 0, 1, 0x80, 0x80, 0x80, 0x80,
      2, 3, 0x80, 0x80, 0x80, 0x80, 4, 5};
  alignas(16) static constexpr uint8_t tbl_v2[16] = {
      0x80, 0x80, 0x80, 0x80, 0, 1, 0x80, 0x80,
      0x80, 0x80, 2, 3, 0x80, 0x80, 0x80, 0x80};

  const auto shuf_A1 = Load(du8, tbl_v1);

  const auto shuf_A0 = CombineShiftRightBytes<2>(du8, shuf_A1, shuf_A1);
  const auto shuf_A2 = Load(du8, tbl_v2);

  const auto shuf_B0 = shuf_A1 + k3;
  const auto shuf_B1 = shuf_A2 + k3;
  const auto shuf_B2 = shuf_A0 + k2;
template <typename T, HWY_IF_LANE_SIZE(T, 4)>

                                const Vec64<T> v2, Full64<T> d,

  constexpr size_t N = 2;

  const Vec64<T> v01_v20 = OddEven(v0, v2);

  StoreU(v10_v00, d, unaligned + 0 * N);
  StoreU(v01_v20, d, unaligned + 1 * N);
  StoreU(v21_v11, d, unaligned + 2 * N);
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1), HWY_IF_LE32(T, N)>

  alignas(16) static constexpr uint8_t tbl_v0[16] = {
      0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80,
      0x80, 3, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};

  const auto shuf_A0 = Load(du, tbl_v0);
  const auto shuf_A1 = CombineShiftRightBytes<15>(du, shuf_A0, shuf_A0);
  const auto shuf_A2 = CombineShiftRightBytes<14>(du, shuf_A0, shuf_A0);

  alignas(16) T buf[16 / sizeof(T)];

  CopyBytes<N * 3 * sizeof(T)>(buf, unaligned);
template <typename T, HWY_IF_LANE_SIZE(T, 2)>

  constexpr size_t N = 4 / sizeof(T);

  alignas(16) static constexpr uint8_t tbl_v2[16] = {
      0x80, 0x80, 0x80, 0x80, 0, 1, 0x80, 0x80,
      0x80, 0x80, 2, 3, 0x80, 0x80, 0x80, 0x80};

      CombineShiftRightBytes<2>(du8, shuf_A2, shuf_A2);

      CombineShiftRightBytes<4>(du8, shuf_A2, shuf_A2);

  const auto A = BitCast(d_full, A0 | A1 | A2);
  alignas(16) T buf[16 / sizeof(T)];

  CopyBytes<N * 3 * sizeof(T)>(buf, unaligned);
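
// Usage sketch, not part of this header: re-interleaving planar u8 channels
// back into packed RGB, the inverse of the LoadInterleaved3 example above.
// The function name is illustrative only; assumes the usual HWY_NAMESPACE
// context.
inline void InterleaveRGB(const uint8_t* HWY_RESTRICT r,
                          const uint8_t* HWY_RESTRICT g,
                          const uint8_t* HWY_RESTRICT b, size_t num_pixels,
                          uint8_t* HWY_RESTRICT rgb) {
  const ScalableTag<uint8_t> d;
  const size_t N = Lanes(d);
  for (size_t i = 0; i + N <= num_pixels; i += N) {
    StoreInterleaved3(LoadU(d, r + i), LoadU(d, g + i), LoadU(d, b + i), d,
                      rgb + 3 * i);
  }
}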
template <typename T, size_t N, class V, HWY_IF_LE128(T, N)>

  const auto v10L = ZipLower(dw, v0, v1);
  const auto v32L = ZipLower(dw, v2, v3);
  const auto v10U = ZipUpper(dw, v0, v1);
  const auto v32U = ZipUpper(dw, v2, v3);
template <typename T, HWY_IF_NOT_LANE_SIZE(T, 8)>

  constexpr size_t N = 16 / sizeof(T);

  const auto v10 = ZipLower(dw, v0, v1);
  const auto v32 = ZipLower(dw, v2, v3);

  StoreU(A, d_full, unaligned + 0 * N);
  StoreU(B, d_full, unaligned + 1 * N);
template <typename T, HWY_IF_LANE_SIZE(T, 8)>

                                const Vec64<T> part2, const Vec64<T> part3,

  constexpr size_t N = 16 / sizeof(T);

  const Full128<T> d_full;
  const Vec128<T> v0{part0.raw};
  const Vec128<T> v1{part1.raw};
  const Vec128<T> v2{part2.raw};
  const Vec128<T> v3{part3.raw};

  StoreU(A, d_full, unaligned + 0 * N);
  StoreU(B, d_full, unaligned + 1 * N);
template <typename T, size_t N, HWY_IF_LE32(T, N)>

                                const Vec128<T, N> part1,
                                const Vec128<T, N> part2,
                                const Vec128<T, N> part3, Simd<T, N, 0>,

  const Full128<T> d_full;

  const Vec128<T> v0{part0.raw};
  const Vec128<T> v1{part1.raw};
  const Vec128<T> v2{part2.raw};
  const Vec128<T> v3{part3.raw};
  const auto v10 = ZipLower(dw, v0, v1);
  const auto v32 = ZipLower(dw, v2, v3);

  alignas(16) T buf[16 / sizeof(T)];
  StoreU(v3210, d_full, buf);
  CopyBytes<4 * N * sizeof(T)>(buf, unaligned);
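
// Usage sketch, not part of this header: rewriting the alpha channel of
// packed RGBA pixels in place using the interleaved load/store pair.
// The function name is illustrative only; assumes the usual HWY_NAMESPACE
// context.
inline void ForceOpaque(uint8_t* HWY_RESTRICT rgba, size_t num_pixels) {
  using D = ScalableTag<uint8_t>;
  const D d;
  const size_t N = Lanes(d);
  const auto opaque = Set(d, uint8_t{255});
  for (size_t i = 0; i + N <= num_pixels; i += N) {
    VFromD<D> r, g, b, a;
    LoadInterleaved4(d, rgba + 4 * i, r, g, b, a);
    (void)a;  // alpha is replaced by the constant below
    StoreInterleaved4(r, g, b, opaque, d, rgba + 4 * i);
  }
}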
#if HWY_TARGET != HWY_SCALAR || HWY_IDE

  const auto mask = Set(du, uint8_t{0xF});

  alignas(16) static constexpr uint8_t basisL[16] = {
      0x00, 0x70, 0x2A, 0x5A, 0x98, 0xE8, 0xB2, 0xC2,
      0x08, 0x78, 0x22, 0x52, 0x90, 0xE0, 0xBA, 0xCA};
  alignas(16) static constexpr uint8_t basisU[16] = {
      0x00, 0x4D, 0x7C, 0x31, 0x7D, 0x30, 0x01, 0x4C,
      0x81, 0xCC, 0xFD, 0xB0, 0xFC, 0xB1, 0x80, 0xCD};
  const auto sL = And(state, mask);
  const auto sU = ShiftRight<4>(state);

  state = Xor(gf4L, gf4U);

  alignas(16) static constexpr uint8_t kZetaInv[16] = {
      0x80, 7, 11, 15, 6, 10, 4, 1, 9, 8, 5, 2, 12, 14, 13, 3};
  alignas(16) static constexpr uint8_t kInv[16] = {
      0x80, 1, 8, 13, 15, 6, 5, 14, 2, 12, 11, 10, 9, 3, 7, 4};

  const auto sL = And(state, mask);
  const auto sU = ShiftRight<4>(state);
  const auto sX = Xor(sU, sL);

  alignas(16) static constexpr uint8_t kAffineL[16] = {
      0x00, 0xC7, 0xBD, 0x6F, 0x17, 0x6D, 0xD2, 0xD0,
      0x78, 0xA8, 0x02, 0xC5, 0x7A, 0xBF, 0xAA, 0x15};
  alignas(16) static constexpr uint8_t kAffineU[16] = {
      0x00, 0x6A, 0xBB, 0x5F, 0xA5, 0x74, 0xE4, 0xCF,
      0xFA, 0x35, 0x2B, 0x41, 0xD1, 0x90, 0x1E, 0x8E};

  return Xor(Xor(affL, affU), Set(du, uint8_t{0x63}));
#if (defined(HWY_NATIVE_AES) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_AES
#undef HWY_NATIVE_AES
#else
#define HWY_NATIVE_AES
#endif

#if HWY_TARGET != HWY_SCALAR

HWY_API V ShiftRows(const V state) {

  alignas(16) static constexpr uint8_t kShiftRow[16] = {
      0,  5,  10, 15,  // transposed: the state is column-major
      4,  9,  14, 3,   //
      8,  13, 2,  7,   //
      12, 1,  6,  11};
  const auto shift_row = LoadDup128(du, kShiftRow);
HWY_API V MixColumns(const V state) {

  alignas(16) static constexpr uint8_t k2301[16] = {
      2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13};
  alignas(16) static constexpr uint8_t k1230[16] = {
      1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12};

  const auto msb = Lt(BitCast(di, state), Zero(di));

  const auto d = Xor(Add(state, state), overflow);

  const auto d_s2301 = Xor(d, s2301);
  const auto t_s2301 = Xor(state, d_s2301);

  return Xor(d_s2301, t1230_s3012);
  state = detail::SubBytes(state);
  state = detail::ShiftRows(state);
  state = detail::MixColumns(state);
  state = Xor(state, round_key);

  state = detail::SubBytes(state);
  state = detail::ShiftRows(state);
  state = Xor(state, round_key);
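
// Usage sketch, not part of this header: chaining AESRound / AESLastRound to
// perform a full AES-128 encryption of one (or more) 16-byte blocks held in a
// u8 vector. The round keys come from an ordinary AES key schedule, which is
// outside the scope of this file; the function name is illustrative only and
// assumes full (>= 128-bit) u8 vectors and HWY_TARGET != HWY_SCALAR.
template <class V>
V AesEncryptBlock(V block, const V round_keys[11]) {
  V state = Xor(block, round_keys[0]);  // initial AddRoundKey
  for (int r = 1; r < 10; ++r) {
    state = AESRound(state, round_keys[r]);  // SubBytes+ShiftRows+MixColumns+key
  }
  return AESLastRound(state, round_keys[10]);  // final round omits MixColumns
}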
  static_assert(IsSame<TFromD<decltype(d)>, uint64_t>(), "V must be u64");
  const auto k1 = Set(d, 0x1111111111111111ULL);
  const auto k2 = Set(d, 0x2222222222222222ULL);
  const auto k4 = Set(d, 0x4444444444444444ULL);
  const auto k8 = Set(d, 0x8888888888888888ULL);
  const auto a0 = And(a, k1);
  const auto a1 = And(a, k2);
  const auto a2 = And(a, k4);
  const auto a3 = And(a, k8);
  const auto b0 = And(b, k1);
  const auto b1 = And(b, k2);
  const auto b2 = And(b, k4);
  const auto b3 = And(b, k8);
  static_assert(IsSame<TFromD<decltype(d)>, uint64_t>(), "V must be u64");
  const auto k1 = Set(d, 0x1111111111111111ULL);
  const auto k2 = Set(d, 0x2222222222222222ULL);
  const auto k4 = Set(d, 0x4444444444444444ULL);
  const auto k8 = Set(d, 0x8888888888888888ULL);
  const auto a0 = And(a, k1);
  const auto a1 = And(a, k2);
  const auto a2 = And(a, k4);
  const auto a3 = And(a, k8);
  const auto b0 = And(b, k1);
  const auto b1 = And(b, k2);
  const auto b2 = And(b, k4);
  const auto b3 = And(b, k8);
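
// Usage sketch, not part of this header: carry-less (polynomial) multiply as
// used by GHASH/CRC-style arithmetic. Per 128-bit block, CLMulLower multiplies
// the lower (even-indexed) u64 lanes of a and b, CLMulUpper the upper
// (odd-indexed) lanes; each yields a 128-bit product per block. The helper
// name is illustrative only.
template <class V>  // V: vector of uint64_t
void PolyMul(V a, V b, V& lo_product, V& hi_product) {
  lo_product = CLMulLower(a, b);
  hi_product = CLMulUpper(a, b);
}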
#if (defined(HWY_NATIVE_POPCNT) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_POPCNT
#undef HWY_NATIVE_POPCNT
#else
#define HWY_NATIVE_POPCNT
#endif

#undef HWY_MIN_POW2_FOR_128
#if HWY_TARGET == HWY_RVV
#define HWY_MIN_POW2_FOR_128 1
#else
#define HWY_MIN_POW2_FOR_128 0
#endif

template <typename V, class D = DFromV<V>, HWY_IF_LANE_SIZE_D(D, 1),
          HWY_IF_GE128_D(D), HWY_IF_POW2_GE(D, HWY_MIN_POW2_FOR_128)>

  static_assert(IsSame<TFromD<D>, uint8_t>(), "V must be u8");

  HWY_ALIGN constexpr uint8_t kLookup[16] = {
      0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};

  const auto lo = And(v, Set(d, uint8_t{0xF}));
  const auto hi = ShiftRight<4>(v);
#if HWY_TARGET != HWY_RVV

template <typename V, class D = DFromV<V>, HWY_IF_LANE_SIZE_D(D, 1),

  static_assert(IsSame<TFromD<D>, uint8_t>(), "V must be u8");

  const V k33 = Set(d, uint8_t{0x33});

  return And(Add(v, ShiftRight<4>(v)), Set(d, uint8_t{0x0F}));
template <typename V, class D = DFromV<V>, HWY_IF_LANE_SIZE_D(D, 2)>

  static_assert(IsSame<TFromD<D>, uint16_t>(), "V must be u16");

  return Add(ShiftRight<8>(vals), And(vals, Set(d, uint16_t{0xFF})));
template <typename V, class D = DFromV<V>, HWY_IF_LANE_SIZE_D(D, 4)>

  static_assert(IsSame<TFromD<D>, uint32_t>(), "V must be u32");

  return Add(ShiftRight<16>(vals), And(vals, Set(d, uint32_t{0xFF})));
#if HWY_HAVE_INTEGER64
template <typename V, class D = DFromV<V>, HWY_IF_LANE_SIZE_D(D, 8)>

  static_assert(IsSame<TFromD<D>, uint64_t>(), "V must be u64");

  return Add(ShiftRight<32>(vals), And(vals, Set(d, 0xFFULL)));
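
// Usage sketch, not part of this header: summing the set bits of a u64 array
// using the per-lane PopulationCount plus a horizontal reduction. The function
// name is illustrative only; assumes HWY_HAVE_INTEGER64 and the usual
// HWY_NAMESPACE context.
inline uint64_t CountBits(const uint64_t* HWY_RESTRICT data, size_t num) {
  const ScalableTag<uint64_t> d;
  const size_t N = Lanes(d);
  auto sum = Zero(d);
  size_t i = 0;
  for (; i + N <= num; i += N) {
    sum = Add(sum, PopulationCount(LoadU(d, data + i)));
  }
  uint64_t total = GetLane(SumOfLanes(d, sum));
  for (; i < num; ++i) total += PopCount(data[i]);  // scalar tail
  return total;
}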
template <class V, class D = DFromV<V>, HWY_IF_LANE_SIZE_D(D, 8),
          HWY_IF_LT128_D(D), HWY_IF_FLOAT_D(D)>

template <class V, class D = DFromV<V>, HWY_IF_LANE_SIZE_D(D, 8),
          HWY_IF_LT128_D(D), HWY_IF_NOT_FLOAT_D(D)>

  using T = TFromD<decltype(d)>;

  const TU xu = static_cast<TU>(GetLane(x));
  const TU yu = static_cast<TU>(GetLane(y));
  return Set(d, static_cast<T>(xu * yu));
#if (defined(HWY_NATIVE_I64MULLO) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_I64MULLO
#undef HWY_NATIVE_I64MULLO
#else
#define HWY_NATIVE_I64MULLO
#endif

template <class V, class D64 = DFromV<V>, typename T = LaneType<V>,
          HWY_IF_LANE_SIZE(T, 8), HWY_IF_UNSIGNED(T), HWY_IF_GE128_D(D64)>

  auto hi = BitCast(d32, ShiftLeft<32>(BitCast(D64{}, lohi + hilo)));
  return BitCast(D64{}, lolo + hi);
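
// The 32-bit decomposition above follows the usual identity (sketch, scalar
// form, not part of this header): with a = aH*2^32 + aL and b = bH*2^32 + bL,
//   (a * b) mod 2^64 = aL*bL + ((aL*bH + aH*bL) << 32)   (mod 2^64),
// since the aH*bH term is shifted out entirely. A scalar reference:
static inline uint64_t MulLo64Reference(uint64_t a, uint64_t b) {
  const uint64_t aL = a & 0xFFFFFFFFu, aH = a >> 32;
  const uint64_t bL = b & 0xFFFFFFFFu, bH = b >> 32;
  return aL * bL + ((aL * bH + aH * bL) << 32);  // wraps mod 2^64, as desired
}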
template <class V, class DI64 = DFromV<V>, typename T = LaneType<V>,
          HWY_IF_LANE_SIZE(T, 8), HWY_IF_SIGNED(T), HWY_IF_GE128_D(DI64)>

  RebindToUnsigned<DI64> du64;
#if (defined(HWY_NATIVE_COMPRESS8) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_COMPRESS8
#undef HWY_NATIVE_COMPRESS8
#else
#define HWY_NATIVE_COMPRESS8
#endif

template <class V, class D, typename T, HWY_IF_LANE_SIZE(T, 1)>
  HWY_ALIGN constexpr T table[256 * 8] = {
      0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
      1, 0, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
      2, 0, 1, 3, 4, 5, 6, 7, 0, 2, 1, 3, 4, 5, 6, 7,
      1, 2, 0, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
      3, 0, 1, 2, 4, 5, 6, 7, 0, 3, 1, 2, 4, 5, 6, 7,
      1, 3, 0, 2, 4, 5, 6, 7, 0, 1, 3, 2, 4, 5, 6, 7,
      2, 3, 0, 1, 4, 5, 6, 7, 0, 2, 3, 1, 4, 5, 6, 7,
      1, 2, 3, 0, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
      4, 0, 1, 2, 3, 5, 6, 7, 0, 4, 1, 2, 3, 5, 6, 7,
      1, 4, 0, 2, 3, 5, 6, 7, 0, 1, 4, 2, 3, 5, 6, 7,
      2, 4, 0, 1, 3, 5, 6, 7, 0, 2, 4, 1, 3, 5, 6, 7,
      1, 2, 4, 0, 3, 5, 6, 7, 0, 1, 2, 4, 3, 5, 6, 7,
      3, 4, 0, 1, 2, 5, 6, 7, 0, 3, 4, 1, 2, 5, 6, 7,
      1, 3, 4, 0, 2, 5, 6, 7, 0, 1, 3, 4, 2, 5, 6, 7,
      2, 3, 4, 0, 1, 5, 6, 7, 0, 2, 3, 4, 1, 5, 6, 7,
      1, 2, 3, 4, 0, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
      5, 0, 1, 2, 3, 4, 6, 7, 0, 5, 1, 2, 3, 4, 6, 7,
      1, 5, 0, 2, 3, 4, 6, 7, 0, 1, 5, 2, 3, 4, 6, 7,
      2, 5, 0, 1, 3, 4, 6, 7, 0, 2, 5, 1, 3, 4, 6, 7,
      1, 2, 5, 0, 3, 4, 6, 7, 0, 1, 2, 5, 3, 4, 6, 7,
      3, 5, 0, 1, 2, 4, 6, 7, 0, 3, 5, 1, 2, 4, 6, 7,
      1, 3, 5, 0, 2, 4, 6, 7, 0, 1, 3, 5, 2, 4, 6, 7,
      2, 3, 5, 0, 1, 4, 6, 7, 0, 2, 3, 5, 1, 4, 6, 7,
      1, 2, 3, 5, 0, 4, 6, 7, 0, 1, 2, 3, 5, 4, 6, 7,
      4, 5, 0, 1, 2, 3, 6, 7, 0, 4, 5, 1, 2, 3, 6, 7,
      1, 4, 5, 0, 2, 3, 6, 7, 0, 1, 4, 5, 2, 3, 6, 7,
      2, 4, 5, 0, 1, 3, 6, 7, 0, 2, 4, 5, 1, 3, 6, 7,
      1, 2, 4, 5, 0, 3, 6, 7, 0, 1, 2, 4, 5, 3, 6, 7,
      3, 4, 5, 0, 1, 2, 6, 7, 0, 3, 4, 5, 1, 2, 6, 7,
      1, 3, 4, 5, 0, 2, 6, 7, 0, 1, 3, 4, 5, 2, 6, 7,
      2, 3, 4, 5, 0, 1, 6, 7, 0, 2, 3, 4, 5, 1, 6, 7,
      1, 2, 3, 4, 5, 0, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
      6, 0, 1, 2, 3, 4, 5, 7, 0, 6, 1, 2, 3, 4, 5, 7,
      1, 6, 0, 2, 3, 4, 5, 7, 0, 1, 6, 2, 3, 4, 5, 7,
      2, 6, 0, 1, 3, 4, 5, 7, 0, 2, 6, 1, 3, 4, 5, 7,
      1, 2, 6, 0, 3, 4, 5, 7, 0, 1, 2, 6, 3, 4, 5, 7,
      3, 6, 0, 1, 2, 4, 5, 7, 0, 3, 6, 1, 2, 4, 5, 7,
      1, 3, 6, 0, 2, 4, 5, 7, 0, 1, 3, 6, 2, 4, 5, 7,
      2, 3, 6, 0, 1, 4, 5, 7, 0, 2, 3, 6, 1, 4, 5, 7,
      1, 2, 3, 6, 0, 4, 5, 7, 0, 1, 2, 3, 6, 4, 5, 7,
      4, 6, 0, 1, 2, 3, 5, 7, 0, 4, 6, 1, 2, 3, 5, 7,
      1, 4, 6, 0, 2, 3, 5, 7, 0, 1, 4, 6, 2, 3, 5, 7,
      2, 4, 6, 0, 1, 3, 5, 7, 0, 2, 4, 6, 1, 3, 5, 7,
      1, 2, 4, 6, 0, 3, 5, 7, 0, 1, 2, 4, 6, 3, 5, 7,
      3, 4, 6, 0, 1, 2, 5, 7, 0, 3, 4, 6, 1, 2, 5, 7,
      1, 3, 4, 6, 0, 2, 5, 7, 0, 1, 3, 4, 6, 2, 5, 7,
      2, 3, 4, 6, 0, 1, 5, 7, 0, 2, 3, 4, 6, 1, 5, 7,
      1, 2, 3, 4, 6, 0, 5, 7, 0, 1, 2, 3, 4, 6, 5, 7,
      5, 6, 0, 1, 2, 3, 4, 7, 0, 5, 6, 1, 2, 3, 4, 7,
      1, 5, 6, 0, 2, 3, 4, 7, 0, 1, 5, 6, 2, 3, 4, 7,
      2, 5, 6, 0, 1, 3, 4, 7, 0, 2, 5, 6, 1, 3, 4, 7,
      1, 2, 5, 6, 0, 3, 4, 7, 0, 1, 2, 5, 6, 3, 4, 7,
      3, 5, 6, 0, 1, 2, 4, 7, 0, 3, 5, 6, 1, 2, 4, 7,
      1, 3, 5, 6, 0, 2, 4, 7, 0, 1, 3, 5, 6, 2, 4, 7,
      2, 3, 5, 6, 0, 1, 4, 7, 0, 2, 3, 5, 6, 1, 4, 7,
      1, 2, 3, 5, 6, 0, 4, 7, 0, 1, 2, 3, 5, 6, 4, 7,
      4, 5, 6, 0, 1, 2, 3, 7, 0, 4, 5, 6, 1, 2, 3, 7,
      1, 4, 5, 6, 0, 2, 3, 7, 0, 1, 4, 5, 6, 2, 3, 7,
      2, 4, 5, 6, 0, 1, 3, 7, 0, 2, 4, 5, 6, 1, 3, 7,
      1, 2, 4, 5, 6, 0, 3, 7, 0, 1, 2, 4, 5, 6, 3, 7,
      3, 4, 5, 6, 0, 1, 2, 7, 0, 3, 4, 5, 6, 1, 2, 7,
      1, 3, 4, 5, 6, 0, 2, 7, 0, 1, 3, 4, 5, 6, 2, 7,
      2, 3, 4, 5, 6, 0, 1, 7, 0, 2, 3, 4, 5, 6, 1, 7,
      1, 2, 3, 4, 5, 6, 0, 7, 0, 1, 2, 3, 4, 5, 6, 7,
      7, 0, 1, 2, 3, 4, 5, 6, 0, 7, 1, 2, 3, 4, 5, 6,
      1, 7, 0, 2, 3, 4, 5, 6, 0, 1, 7, 2, 3, 4, 5, 6,
      2, 7, 0, 1, 3, 4, 5, 6, 0, 2, 7, 1, 3, 4, 5, 6,
      1, 2, 7, 0, 3, 4, 5, 6, 0, 1, 2, 7, 3, 4, 5, 6,
      3, 7, 0, 1, 2, 4, 5, 6, 0, 3, 7, 1, 2, 4, 5, 6,
      1, 3, 7, 0, 2, 4, 5, 6, 0, 1, 3, 7, 2, 4, 5, 6,
      2, 3, 7, 0, 1, 4, 5, 6, 0, 2, 3, 7, 1, 4, 5, 6,
      1, 2, 3, 7, 0, 4, 5, 6, 0, 1, 2, 3, 7, 4, 5, 6,
      4, 7, 0, 1, 2, 3, 5, 6, 0, 4, 7, 1, 2, 3, 5, 6,
      1, 4, 7, 0, 2, 3, 5, 6, 0, 1, 4, 7, 2, 3, 5, 6,
      2, 4, 7, 0, 1, 3, 5, 6, 0, 2, 4, 7, 1, 3, 5, 6,
      1, 2, 4, 7, 0, 3, 5, 6, 0, 1, 2, 4, 7, 3, 5, 6,
      3, 4, 7, 0, 1, 2, 5, 6, 0, 3, 4, 7, 1, 2, 5, 6,
      1, 3, 4, 7, 0, 2, 5, 6, 0, 1, 3, 4, 7, 2, 5, 6,
      2, 3, 4, 7, 0, 1, 5, 6, 0, 2, 3, 4, 7, 1, 5, 6,
      1, 2, 3, 4, 7, 0, 5, 6, 0, 1, 2, 3, 4, 7, 5, 6,
      5, 7, 0, 1, 2, 3, 4, 6, 0, 5, 7, 1, 2, 3, 4, 6,
      1, 5, 7, 0, 2, 3, 4, 6, 0, 1, 5, 7, 2, 3, 4, 6,
      2, 5, 7, 0, 1, 3, 4, 6, 0, 2, 5, 7, 1, 3, 4, 6,
      1, 2, 5, 7, 0, 3, 4, 6, 0, 1, 2, 5, 7, 3, 4, 6,
      3, 5, 7, 0, 1, 2, 4, 6, 0, 3, 5, 7, 1, 2, 4, 6,
      1, 3, 5, 7, 0, 2, 4, 6, 0, 1, 3, 5, 7, 2, 4, 6,
      2, 3, 5, 7, 0, 1, 4, 6, 0, 2, 3, 5, 7, 1, 4, 6,
      1, 2, 3, 5, 7, 0, 4, 6, 0, 1, 2, 3, 5, 7, 4, 6,
      4, 5, 7, 0, 1, 2, 3, 6, 0, 4, 5, 7, 1, 2, 3, 6,
      1, 4, 5, 7, 0, 2, 3, 6, 0, 1, 4, 5, 7, 2, 3, 6,
      2, 4, 5, 7, 0, 1, 3, 6, 0, 2, 4, 5, 7, 1, 3, 6,
      1, 2, 4, 5, 7, 0, 3, 6, 0, 1, 2, 4, 5, 7, 3, 6,
      3, 4, 5, 7, 0, 1, 2, 6, 0, 3, 4, 5, 7, 1, 2, 6,
      1, 3, 4, 5, 7, 0, 2, 6, 0, 1, 3, 4, 5, 7, 2, 6,
      2, 3, 4, 5, 7, 0, 1, 6, 0, 2, 3, 4, 5, 7, 1, 6,
      1, 2, 3, 4, 5, 7, 0, 6, 0, 1, 2, 3, 4, 5, 7, 6,
      6, 7, 0, 1, 2, 3, 4, 5, 0, 6, 7, 1, 2, 3, 4, 5,
      1, 6, 7, 0, 2, 3, 4, 5, 0, 1, 6, 7, 2, 3, 4, 5,
      2, 6, 7, 0, 1, 3, 4, 5, 0, 2, 6, 7, 1, 3, 4, 5,
      1, 2, 6, 7, 0, 3, 4, 5, 0, 1, 2, 6, 7, 3, 4, 5,
      3, 6, 7, 0, 1, 2, 4, 5, 0, 3, 6, 7, 1, 2, 4, 5,
      1, 3, 6, 7, 0, 2, 4, 5, 0, 1, 3, 6, 7, 2, 4, 5,
      2, 3, 6, 7, 0, 1, 4, 5, 0, 2, 3, 6, 7, 1, 4, 5,
      1, 2, 3, 6, 7, 0, 4, 5, 0, 1, 2, 3, 6, 7, 4, 5,
      4, 6, 7, 0, 1, 2, 3, 5, 0, 4, 6, 7, 1, 2, 3, 5,
      1, 4, 6, 7, 0, 2, 3, 5, 0, 1, 4, 6, 7, 2, 3, 5,
      2, 4, 6, 7, 0, 1, 3, 5, 0, 2, 4, 6, 7, 1, 3, 5,
      1, 2, 4, 6, 7, 0, 3, 5, 0, 1, 2, 4, 6, 7, 3, 5,
      3, 4, 6, 7, 0, 1, 2, 5, 0, 3, 4, 6, 7, 1, 2, 5,
      1, 3, 4, 6, 7, 0, 2, 5, 0, 1, 3, 4, 6, 7, 2, 5,
      2, 3, 4, 6, 7, 0, 1, 5, 0, 2, 3, 4, 6, 7, 1, 5,
      1, 2, 3, 4, 6, 7, 0, 5, 0, 1, 2, 3, 4, 6, 7, 5,
      5, 6, 7, 0, 1, 2, 3, 4, 0, 5, 6, 7, 1, 2, 3, 4,
      1, 5, 6, 7, 0, 2, 3, 4, 0, 1, 5, 6, 7, 2, 3, 4,
      2, 5, 6, 7, 0, 1, 3, 4, 0, 2, 5, 6, 7, 1, 3, 4,
      1, 2, 5, 6, 7, 0, 3, 4, 0, 1, 2, 5, 6, 7, 3, 4,
      3, 5, 6, 7, 0, 1, 2, 4, 0, 3, 5, 6, 7, 1, 2, 4,
      1, 3, 5, 6, 7, 0, 2, 4, 0, 1, 3, 5, 6, 7, 2, 4,
      2, 3, 5, 6, 7, 0, 1, 4, 0, 2, 3, 5, 6, 7, 1, 4,
      1, 2, 3, 5, 6, 7, 0, 4, 0, 1, 2, 3, 5, 6, 7, 4,
      4, 5, 6, 7, 0, 1, 2, 3, 0, 4, 5, 6, 7, 1, 2, 3,
      1, 4, 5, 6, 7, 0, 2, 3, 0, 1, 4, 5, 6, 7, 2, 3,
      2, 4, 5, 6, 7, 0, 1, 3, 0, 2, 4, 5, 6, 7, 1, 3,
      1, 2, 4, 5, 6, 7, 0, 3, 0, 1, 2, 4, 5, 6, 7, 3,
      3, 4, 5, 6, 7, 0, 1, 2, 0, 3, 4, 5, 6, 7, 1, 2,
      1, 3, 4, 5, 6, 7, 0, 2, 0, 1, 3, 4, 5, 6, 7, 2,
      2, 3, 4, 5, 6, 7, 0, 1, 0, 2, 3, 4, 5, 6, 7, 1,
      1, 2, 3, 4, 5, 6, 7, 0, 0, 1, 2, 3, 4, 5, 6, 7};
  for (size_t i = 0; i < Lanes(d); i += 8) {

    const size_t bits8 = bits[i / 8];
    const auto indices = Load(d8, table + bits8 * 8);

    StoreU(compressed, d8, pos);

  return static_cast<size_t>(pos - unaligned);
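
// Usage sketch, not part of this header: keeping only the non-zero bytes of
// an array. CompressBlendedStore writes the selected lanes contiguously,
// modifies no bytes beyond them, and returns how many were written. The
// function name is illustrative only; assumes the usual HWY_NAMESPACE context.
inline size_t CopyNonZero(const uint8_t* HWY_RESTRICT in, size_t num,
                          uint8_t* HWY_RESTRICT out) {
  const ScalableTag<uint8_t> d;
  const size_t N = Lanes(d);
  size_t written = 0;
  size_t i = 0;
  for (; i + N <= num; i += N) {
    const auto v = LoadU(d, in + i);
    const auto keep = Ne(v, Zero(d));
    written += CompressBlendedStore(v, keep, d, out + written);
  }
  for (; i < num; ++i) {  // scalar tail
    if (in[i] != 0) out[written++] = in[i];
  }
  return written;
}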
template <class V, class M, class D, typename T, HWY_IF_LANE_SIZE(T, 1)>

template <class V, class M, class D, typename T, HWY_IF_LANE_SIZE(T, 1)>

template <class V, class M, typename T = TFromV<V>, HWY_IF_LANE_SIZE(T, 1)>

  return Load(d, lanes);

template <class V, typename T = TFromV<V>, HWY_IF_LANE_SIZE(T, 1)>

  return Load(d, lanes);

template <class V, class M, typename T = TFromV<V>, HWY_IF_LANE_SIZE(T, 1)>
#if HWY_TARGET != HWY_RVV && HWY_TARGET != HWY_SVE &&     \
    HWY_TARGET != HWY_SVE2 && HWY_TARGET != HWY_SVE_256 && \
    HWY_TARGET != HWY_SVE2_128

HWY_API auto Eq(V a, V b) -> decltype(a == b) {

HWY_API auto Ne(V a, V b) -> decltype(a == b) {

HWY_API auto Lt(V a, V b) -> decltype(a == b) {

HWY_API auto Gt(V a, V b) -> decltype(a == b) {

HWY_API auto Ge(V a, V b) -> decltype(a == b) {

HWY_API auto Le(V a, V b) -> decltype(a == b) {
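
// Usage sketch, not part of this header: the named wrappers above let generic
// code compare vectors without relying on operator overloads, e.g. counting
// how many lanes are below a threshold. The helper name is illustrative only.
template <class D, class V = VFromD<D>>
size_t CountBelow(D d, V v, V threshold) {
  return CountTrue(d, Lt(v, threshold));
}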