generic_ops-inl.h
1// Copyright 2021 Google LLC
2// SPDX-License-Identifier: Apache-2.0
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8// http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15
16// Target-independent types/functions defined after target-specific ops.
17
18#include "hwy/base.h"
19
20// Define detail::Shuffle1230 etc, but only when viewing the current header;
21// normally this is included via highway.h, which includes ops/*.h.
22#if HWY_IDE && !defined(HWY_HIGHWAY_INCLUDED)
23#include "hwy/ops/emu128-inl.h"
24#endif // HWY_IDE
25
26// Relies on the external include guard in highway.h.
27HWY_BEFORE_NAMESPACE();
28namespace hwy {
29namespace HWY_NAMESPACE {
30
31// The lane type of a vector type, e.g. float for Vec<ScalableTag<float>>.
32template <class V>
33using LaneType = decltype(GetLane(V()));
34
35// Vector type, e.g. Vec128<float> for CappedTag<float, 4>. Useful as the return
36// type of functions that do not take a vector argument, or as an argument type
37// if the function only has a template argument for D, or for explicit type
38// names instead of auto. This may be a built-in type.
39template <class D>
40using Vec = decltype(Zero(D()));
41
42// Mask type. Useful as the return type of functions that do not take a mask
43// argument, or as an argument type if the function only has a template argument
44// for D, or for explicit type names instead of auto.
45template <class D>
46using Mask = decltype(MaskFromVec(Zero(D())));
47
48// Returns the closest value to v within [lo, hi].
49template <class V>
50HWY_API V Clamp(const V v, const V lo, const V hi) {
51 return Min(Max(lo, v), hi);
52}
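
// Editor's illustrative sketch (not part of the original header): clamping an
// array to [0, 1] with the Clamp above. The function name and the scalar-tail
// handling are hypothetical; only documented Highway ops (ScalableTag, Lanes,
// LoadU, StoreU, Set, Zero) are assumed.
HWY_MAYBE_UNUSED static void ExampleClampToUnitInterval(
    float* HWY_RESTRICT data, size_t size) {
  const ScalableTag<float> df;
  const size_t N = Lanes(df);
  const auto lo = Zero(df);
  const auto hi = Set(df, 1.0f);
  size_t i = 0;
  for (; i + N <= size; i += N) {
    StoreU(Clamp(LoadU(df, data + i), lo, hi), df, data + i);
  }
  for (; i < size; ++i) {  // scalar remainder
    data[i] = HWY_MIN(HWY_MAX(data[i], 0.0f), 1.0f);
  }
}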
53
54// CombineShiftRightBytes (and -Lanes) are not available for the scalar target,
55// and RVV has its own implementation of -Lanes.
56#if HWY_TARGET != HWY_SCALAR && HWY_TARGET != HWY_RVV
57
58template <size_t kLanes, class D, class V = VFromD<D>>
59HWY_API V CombineShiftRightLanes(D d, const V hi, const V lo) {
60 constexpr size_t kBytes = kLanes * sizeof(LaneType<V>);
61 static_assert(kBytes < 16, "Shift count is per-block");
62 return CombineShiftRightBytes<kBytes>(d, hi, lo);
63}
64
65#endif
66
67// Returns lanes with the most significant bit set and all other bits zero.
68template <class D>
69HWY_API Vec<D> SignBit(D d) {
70 const RebindToUnsigned<decltype(d)> du;
71 return BitCast(d, Set(du, SignMask<TFromD<D>>()));
72}
73
74// Returns quiet NaN.
75template <class D>
76HWY_API Vec<D> NaN(D d) {
77 const RebindToSigned<D> di;
78 // LimitsMax sets all exponent and mantissa bits to 1. The exponent plus
79 // mantissa MSB (to indicate quiet) would be sufficient.
80 return BitCast(d, Set(di, LimitsMax<TFromD<decltype(di)>>()));
81}
82
83// Returns positive infinity.
84template <class D>
85HWY_API Vec<D> Inf(D d) {
86 const RebindToUnsigned<D> du;
87 using T = TFromD<D>;
88 using TU = TFromD<decltype(du)>;
89 const TU max_x2 = static_cast<TU>(MaxExponentTimes2<T>());
90 return BitCast(d, Set(du, max_x2 >> 1));
91}
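
// Editor's illustrative sketches (not part of the original header) of how the
// constants above are typically used; the helper names are hypothetical.
template <class D, class V = Vec<D>>
HWY_MAYBE_UNUSED V ExampleNegate(D d, V v) {
  return Xor(v, SignBit(d));  // flipping the sign bit negates every lane
}
template <class V>
HWY_MAYBE_UNUSED V ExampleZeroIfNaN(V v) {
  // NaN never compares equal to itself, so Eq(v, v) is false only for NaN.
  return IfThenElseZero(Eq(v, v), v);
}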
92
93// ------------------------------ SafeFillN
94
95template <class D, typename T = TFromD<D>>
96HWY_API void SafeFillN(const size_t num, const T value, D d,
97 T* HWY_RESTRICT to) {
98#if HWY_MEM_OPS_MIGHT_FAULT
99 (void)d;
100 for (size_t i = 0; i < num; ++i) {
101 to[i] = value;
102 }
103#else
104 BlendedStore(Set(d, value), FirstN(d, num), d, to);
105#endif
106}
107
108// ------------------------------ SafeCopyN
109
110template <class D, typename T = TFromD<D>>
111HWY_API void SafeCopyN(const size_t num, D d, const T* HWY_RESTRICT from,
112 T* HWY_RESTRICT to) {
113#if HWY_MEM_OPS_MIGHT_FAULT
114 (void)d;
115 for (size_t i = 0; i < num; ++i) {
116 to[i] = from[i];
117 }
118#else
119 const Mask<D> mask = FirstN(d, num);
120 BlendedStore(MaskedLoad(mask, d, from), mask, d, to);
121#endif
122}
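
// Editor's illustrative sketch (not part of the original header): a
// strip-mined copy whose remainder is handled by SafeCopyN, which falls back
// to a scalar loop on targets where masked loads/stores might fault. The
// function name is hypothetical.
HWY_MAYBE_UNUSED static void ExampleCopy(const float* HWY_RESTRICT from,
                                         float* HWY_RESTRICT to, size_t size) {
  const ScalableTag<float> d;
  const size_t N = Lanes(d);
  size_t i = 0;
  for (; i + N <= size; i += N) {
    StoreU(LoadU(d, from + i), d, to + i);
  }
  SafeCopyN(size - i, d, from + i, to + i);  // remaining 0..N-1 lanes
}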
123
124// "Include guard": skip if native instructions are available. The generic
125// implementation is currently shared between x86_* and wasm_*, and is too large
126// to duplicate.
127
128#if (defined(HWY_NATIVE_LOAD_STORE_INTERLEAVED) == defined(HWY_TARGET_TOGGLE))
129#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED
130#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED
131#else
132#define HWY_NATIVE_LOAD_STORE_INTERLEAVED
133#endif
134
135// ------------------------------ LoadInterleaved2
136
137template <typename T, size_t N, class V>
138HWY_API void LoadInterleaved2(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
139 V& v0, V& v1) {
140 const V A = LoadU(d, unaligned + 0 * N); // v1[1] v0[1] v1[0] v0[0]
141 const V B = LoadU(d, unaligned + 1 * N);
142 v0 = ConcatEven(d, B, A);
143 v1 = ConcatOdd(d, B, A);
144}
145
146template <typename T, class V>
147HWY_API void LoadInterleaved2(Simd<T, 1, 0> d, const T* HWY_RESTRICT unaligned,
148 V& v0, V& v1) {
149 v0 = LoadU(d, unaligned + 0);
150 v1 = LoadU(d, unaligned + 1);
151}
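
// Editor's illustrative sketch (not part of the original header):
// de-interleaving stereo samples into separate channels. The name and the
// whole-vector length handling are hypothetical.
HWY_MAYBE_UNUSED static void ExampleDeinterleaveStereo(
    const int16_t* HWY_RESTRICT interleaved, size_t num_frames,
    int16_t* HWY_RESTRICT left, int16_t* HWY_RESTRICT right) {
  const ScalableTag<int16_t> d;
  const size_t N = Lanes(d);
  using V = Vec<decltype(d)>;
  for (size_t i = 0; i + N <= num_frames; i += N) {
    V l, r;
    LoadInterleaved2(d, interleaved + 2 * i, l, r);
    StoreU(l, d, left + i);
    StoreU(r, d, right + i);
  }
  // Remaining frames (fewer than N) would be handled by a scalar loop.
}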
152
153// ------------------------------ LoadInterleaved3 (CombineShiftRightBytes)
154
155namespace detail {
156
157// Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload.
158template <typename T, size_t N, class V, HWY_IF_LE128(T, N)>
159HWY_API void LoadTransposedBlocks3(Simd<T, N, 0> d,
160 const T* HWY_RESTRICT unaligned, V& A, V& B,
161 V& C) {
162 A = LoadU(d, unaligned + 0 * N);
163 B = LoadU(d, unaligned + 1 * N);
164 C = LoadU(d, unaligned + 2 * N);
165}
166
167} // namespace detail
168
169template <typename T, size_t N, class V, HWY_IF_LANES_PER_BLOCK(T, N, 16)>
170HWY_API void LoadInterleaved3(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
171 V& v0, V& v1, V& v2) {
172 const RebindToUnsigned<decltype(d)> du;
173 // Compact notation so these fit on one line: 12 := v1[2].
174 V A; // 05 24 14 04 23 13 03 22 12 02 21 11 01 20 10 00
175 V B; // 1a 0a 29 19 09 28 18 08 27 17 07 26 16 06 25 15
176 V C; // 2f 1f 0f 2e 1e 0e 2d 1d 0d 2c 1c 0c 2b 1b 0b 2a
177 detail::LoadTransposedBlocks3(d, unaligned, A, B, C);
178 // Compress all lanes belonging to v0 into consecutive lanes.
179 constexpr uint8_t Z = 0x80;
180 alignas(16) constexpr uint8_t kIdx_v0A[16] = {0, 3, 6, 9, 12, 15, Z, Z,
181 Z, Z, Z, Z, Z, Z, Z, Z};
182 alignas(16) constexpr uint8_t kIdx_v0B[16] = {Z, Z, Z, Z, Z, Z, 2, 5,
183 8, 11, 14, Z, Z, Z, Z, Z};
184 alignas(16) constexpr uint8_t kIdx_v0C[16] = {Z, Z, Z, Z, Z, Z, Z, Z,
185 Z, Z, Z, 1, 4, 7, 10, 13};
186 alignas(16) constexpr uint8_t kIdx_v1A[16] = {1, 4, 7, 10, 13, Z, Z, Z,
187 Z, Z, Z, Z, Z, Z, Z, Z};
188 alignas(16) constexpr uint8_t kIdx_v1B[16] = {Z, Z, Z, Z, Z, 0, 3, 6,
189 9, 12, 15, Z, Z, Z, Z, Z};
190 alignas(16) constexpr uint8_t kIdx_v1C[16] = {Z, Z, Z, Z, Z, Z, Z, Z,
191 Z, Z, Z, 2, 5, 8, 11, 14};
192 alignas(16) constexpr uint8_t kIdx_v2A[16] = {2, 5, 8, 11, 14, Z, Z, Z,
193 Z, Z, Z, Z, Z, Z, Z, Z};
194 alignas(16) constexpr uint8_t kIdx_v2B[16] = {Z, Z, Z, Z, Z, 1, 4, 7,
195 10, 13, Z, Z, Z, Z, Z, Z};
196 alignas(16) constexpr uint8_t kIdx_v2C[16] = {Z, Z, Z, Z, Z, Z, Z, Z,
197 Z, Z, 0, 3, 6, 9, 12, 15};
198 const V v0L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v0A)));
199 const V v0M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v0B)));
200 const V v0U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v0C)));
201 const V v1L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v1A)));
202 const V v1M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v1B)));
203 const V v1U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v1C)));
204 const V v2L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v2A)));
205 const V v2M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v2B)));
206 const V v2U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v2C)));
207 v0 = Xor3(v0L, v0M, v0U);
208 v1 = Xor3(v1L, v1M, v1U);
209 v2 = Xor3(v2L, v2M, v2U);
210}
211
212// 8-bit lanes x8
213template <typename T, size_t N, class V, HWY_IF_LANE_SIZE(T, 1),
214 HWY_IF_LANES_PER_BLOCK(T, N, 8)>
215HWY_API void LoadInterleaved3(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
216 V& v0, V& v1, V& v2) {
217 const RebindToUnsigned<decltype(d)> du;
218 V A; // v1[2] v0[2] v2[1] v1[1] v0[1] v2[0] v1[0] v0[0]
219 V B; // v0[5] v2[4] v1[4] v0[4] v2[3] v1[3] v0[3] v2[2]
220 V C; // v2[7] v1[7] v0[7] v2[6] v1[6] v0[6] v2[5] v1[5]
221 detail::LoadTransposedBlocks3(d, unaligned, A, B, C);
222 // Compress all lanes belonging to v0 into consecutive lanes.
223 constexpr uint8_t Z = 0x80;
224 alignas(16) constexpr uint8_t kIdx_v0A[16] = {0, 3, 6, Z, Z, Z, Z, Z};
225 alignas(16) constexpr uint8_t kIdx_v0B[16] = {Z, Z, Z, 1, 4, 7, Z, Z};
226 alignas(16) constexpr uint8_t kIdx_v0C[16] = {Z, Z, Z, Z, Z, Z, 2, 5};
227 alignas(16) constexpr uint8_t kIdx_v1A[16] = {1, 4, 7, Z, Z, Z, Z, Z};
228 alignas(16) constexpr uint8_t kIdx_v1B[16] = {Z, Z, Z, 2, 5, Z, Z, Z};
229 alignas(16) constexpr uint8_t kIdx_v1C[16] = {Z, Z, Z, Z, Z, 0, 3, 6};
230 alignas(16) constexpr uint8_t kIdx_v2A[16] = {2, 5, Z, Z, Z, Z, Z, Z};
231 alignas(16) constexpr uint8_t kIdx_v2B[16] = {Z, Z, 0, 3, 6, Z, Z, Z};
232 alignas(16) constexpr uint8_t kIdx_v2C[16] = {Z, Z, Z, Z, Z, 1, 4, 7};
233 const V v0L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v0A)));
234 const V v0M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v0B)));
235 const V v0U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v0C)));
236 const V v1L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v1A)));
237 const V v1M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v1B)));
238 const V v1U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v1C)));
239 const V v2L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v2A)));
240 const V v2M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v2B)));
241 const V v2U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v2C)));
242 v0 = Xor3(v0L, v0M, v0U);
243 v1 = Xor3(v1L, v1M, v1U);
244 v2 = Xor3(v2L, v2M, v2U);
245}
246
247// 16-bit lanes x8
248template <typename T, size_t N, class V, HWY_IF_LANE_SIZE(T, 2),
249 HWY_IF_LANES_PER_BLOCK(T, N, 8)>
250HWY_API void LoadInterleaved3(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
251 V& v0, V& v1, V& v2) {
252 const RebindToUnsigned<decltype(d)> du;
253 V A; // v1[2] v0[2] v2[1] v1[1] v0[1] v2[0] v1[0] v0[0]
254 V B; // v0[5] v2[4] v1[4] v0[4] v2[3] v1[3] v0[3] v2[2]
255 V C; // v2[7] v1[7] v0[7] v2[6] v1[6] v0[6] v2[5] v1[5]
256 detail::LoadTransposedBlocks3(d, unaligned, A, B, C);
257 // Compress all lanes belonging to v0 into consecutive lanes. Same as above,
258 // but each element of the array contains two byte indices for a lane.
259 constexpr uint16_t Z = 0x8080;
260 alignas(16) constexpr uint16_t kIdx_v0A[8] = {0x0100, 0x0706, 0x0D0C, Z,
261 Z, Z, Z, Z};
262 alignas(16) constexpr uint16_t kIdx_v0B[8] = {Z, Z, Z, 0x0302,
263 0x0908, 0x0F0E, Z, Z};
264 alignas(16) constexpr uint16_t kIdx_v0C[8] = {Z, Z, Z, Z,
265 Z, Z, 0x0504, 0x0B0A};
266 alignas(16) constexpr uint16_t kIdx_v1A[8] = {0x0302, 0x0908, 0x0F0E, Z,
267 Z, Z, Z, Z};
268 alignas(16) constexpr uint16_t kIdx_v1B[8] = {Z, Z, Z, 0x0504,
269 0x0B0A, Z, Z, Z};
270 alignas(16) constexpr uint16_t kIdx_v1C[8] = {Z, Z, Z, Z,
271 Z, 0x0100, 0x0706, 0x0D0C};
272 alignas(16) constexpr uint16_t kIdx_v2A[8] = {0x0504, 0x0B0A, Z, Z,
273 Z, Z, Z, Z};
274 alignas(16) constexpr uint16_t kIdx_v2B[8] = {Z, Z, 0x0100, 0x0706,
275 0x0D0C, Z, Z, Z};
276 alignas(16) constexpr uint16_t kIdx_v2C[8] = {Z, Z, Z, Z,
277 Z, 0x0302, 0x0908, 0x0F0E};
278 const V v0L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v0A)));
279 const V v0M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v0B)));
280 const V v0U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v0C)));
281 const V v1L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v1A)));
282 const V v1M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v1B)));
283 const V v1U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v1C)));
284 const V v2L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v2A)));
285 const V v2M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v2B)));
286 const V v2U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v2C)));
287 v0 = Xor3(v0L, v0M, v0U);
288 v1 = Xor3(v1L, v1M, v1U);
289 v2 = Xor3(v2L, v2M, v2U);
290}
291
292template <typename T, size_t N, class V, HWY_IF_LANES_PER_BLOCK(T, N, 4)>
293HWY_API void LoadInterleaved3(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
294 V& v0, V& v1, V& v2) {
295 V A; // v0[1] v2[0] v1[0] v0[0]
296 V B; // v1[2] v0[2] v2[1] v1[1]
297 V C; // v2[3] v1[3] v0[3] v2[2]
298 detail::LoadTransposedBlocks3(d, unaligned, A, B, C);
299
300 const V vxx_02_03_xx = OddEven(C, B);
301 v0 = detail::Shuffle1230(A, vxx_02_03_xx);
302
303 // Shuffle2301 takes the upper/lower halves of the output from one input, so
304 // we cannot just combine 13 and 10 with 12 and 11 (similar to v0/v2). Use
305 // OddEven because it may have higher throughput than Shuffle.
306 const V vxx_xx_10_11 = OddEven(A, B);
307 const V v12_13_xx_xx = OddEven(B, C);
308 v1 = detail::Shuffle2301(vxx_xx_10_11, v12_13_xx_xx);
309
310 const V vxx_20_21_xx = OddEven(B, A);
311 v2 = detail::Shuffle3012(vxx_20_21_xx, C);
312}
313
314template <typename T, size_t N, class V, HWY_IF_LANES_PER_BLOCK(T, N, 2)>
315HWY_API void LoadInterleaved3(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
316 V& v0, V& v1, V& v2) {
317 V A; // v1[0] v0[0]
318 V B; // v0[1] v2[0]
319 V C; // v2[1] v1[1]
320 detail::LoadTransposedBlocks3(d, unaligned, A, B, C);
321 v0 = OddEven(B, A);
322 v1 = CombineShiftRightBytes<sizeof(T)>(d, C, A);
323 v2 = OddEven(C, B);
324}
325
326template <typename T, class V>
327HWY_API void LoadInterleaved3(Simd<T, 1, 0> d, const T* HWY_RESTRICT unaligned,
328 V& v0, V& v1, V& v2) {
329 v0 = LoadU(d, unaligned + 0);
330 v1 = LoadU(d, unaligned + 1);
331 v2 = LoadU(d, unaligned + 2);
332}
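
// Editor's illustrative sketch (not part of the original header): splitting
// packed RGB pixels into planar channels with LoadInterleaved3. The function
// name and the multiple-of-N length assumption are hypothetical.
HWY_MAYBE_UNUSED static void ExampleRgbToPlanar(const uint8_t* HWY_RESTRICT rgb,
                                                size_t num_pixels,
                                                uint8_t* HWY_RESTRICT r,
                                                uint8_t* HWY_RESTRICT g,
                                                uint8_t* HWY_RESTRICT b) {
  const ScalableTag<uint8_t> d;
  const size_t N = Lanes(d);
  using V = Vec<decltype(d)>;
  for (size_t i = 0; i + N <= num_pixels; i += N) {
    V vr, vg, vb;
    LoadInterleaved3(d, rgb + 3 * i, vr, vg, vb);
    StoreU(vr, d, r + i);
    StoreU(vg, d, g + i);
    StoreU(vb, d, b + i);
  }
}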
333
334// ------------------------------ LoadInterleaved4
335
336namespace detail {
337
338// Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload.
339template <typename T, size_t N, class V, HWY_IF_LE128(T, N)>
340HWY_API void LoadTransposedBlocks4(Simd<T, N, 0> d,
341 const T* HWY_RESTRICT unaligned, V& A, V& B,
342 V& C, V& D) {
343 A = LoadU(d, unaligned + 0 * N);
344 B = LoadU(d, unaligned + 1 * N);
345 C = LoadU(d, unaligned + 2 * N);
346 D = LoadU(d, unaligned + 3 * N);
347}
348
349} // namespace detail
350
351template <typename T, size_t N, class V, HWY_IF_LANES_PER_BLOCK(T, N, 16)>
352HWY_API void LoadInterleaved4(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
353 V& v0, V& v1, V& v2, V& v3) {
354 const Repartition<uint64_t, decltype(d)> d64;
355 using V64 = VFromD<decltype(d64)>;
356 // 16 lanes per block; the lowest four blocks are at the bottom of A,B,C,D.
357 // Here int[i] means the four interleaved values of the i-th 4-tuple and
358 // int[3..0] indicates four consecutive 4-tuples (0 = least-significant).
359 V A; // int[13..10] int[3..0]
360 V B; // int[17..14] int[7..4]
361 V C; // int[1b..18] int[b..8]
362 V D; // int[1f..1c] int[f..c]
363 detail::LoadTransposedBlocks4(d, unaligned, A, B, C, D);
364
365 // For brevity, the comments only list the lower block (upper = lower + 0x10)
366 const V v5140 = InterleaveLower(d, A, B); // int[5,1,4,0]
367 const V vd9c8 = InterleaveLower(d, C, D); // int[d,9,c,8]
368 const V v7362 = InterleaveUpper(d, A, B); // int[7,3,6,2]
369 const V vfbea = InterleaveUpper(d, C, D); // int[f,b,e,a]
370
371 const V v6420 = InterleaveLower(d, v5140, v7362); // int[6,4,2,0]
372 const V veca8 = InterleaveLower(d, vd9c8, vfbea); // int[e,c,a,8]
373 const V v7531 = InterleaveUpper(d, v5140, v7362); // int[7,5,3,1]
374 const V vfdb9 = InterleaveUpper(d, vd9c8, vfbea); // int[f,d,b,9]
375
376 const V64 v10L = BitCast(d64, InterleaveLower(d, v6420, v7531)); // v10[7..0]
377 const V64 v10U = BitCast(d64, InterleaveLower(d, veca8, vfdb9)); // v10[f..8]
378 const V64 v32L = BitCast(d64, InterleaveUpper(d, v6420, v7531)); // v32[7..0]
379 const V64 v32U = BitCast(d64, InterleaveUpper(d, veca8, vfdb9)); // v32[f..8]
380
381 v0 = BitCast(d, InterleaveLower(d64, v10L, v10U));
382 v1 = BitCast(d, InterleaveUpper(d64, v10L, v10U));
383 v2 = BitCast(d, InterleaveLower(d64, v32L, v32U));
384 v3 = BitCast(d, InterleaveUpper(d64, v32L, v32U));
385}
386
387template <typename T, size_t N, class V, HWY_IF_LANES_PER_BLOCK(T, N, 8)>
388HWY_API void LoadInterleaved4(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
389 V& v0, V& v1, V& v2, V& v3) {
390 // In the last step, we interleave by half of the block size, which is usually
391 // 8 bytes but half that for 8-bit x8 vectors.
392 using TW = hwy::UnsignedFromSize<sizeof(T) * N == 8 ? 4 : 8>;
393 const Repartition<TW, decltype(d)> dw;
394 using VW = VFromD<decltype(dw)>;
395
396 // (Comments are for 256-bit vectors.)
397 // 8 lanes per block; the lowest four blocks are at the bottom of A,B,C,D.
398 V A; // v3210[9]v3210[8] v3210[1]v3210[0]
399 V B; // v3210[b]v3210[a] v3210[3]v3210[2]
400 V C; // v3210[d]v3210[c] v3210[5]v3210[4]
401 V D; // v3210[f]v3210[e] v3210[7]v3210[6]
402 detail::LoadTransposedBlocks4(d, unaligned, A, B, C, D);
403
404 const V va820 = InterleaveLower(d, A, B); // v3210[a,8] v3210[2,0]
405 const V vec64 = InterleaveLower(d, C, D); // v3210[e,c] v3210[6,4]
406 const V vb931 = InterleaveUpper(d, A, B); // v3210[b,9] v3210[3,1]
407 const V vfd75 = InterleaveUpper(d, C, D); // v3210[f,d] v3210[7,5]
408
409 const VW v10_b830 = // v10[b..8] v10[3..0]
410 BitCast(dw, InterleaveLower(d, va820, vb931));
411 const VW v10_fc74 = // v10[f..c] v10[7..4]
412 BitCast(dw, InterleaveLower(d, vec64, vfd75));
413 const VW v32_b830 = // v32[b..8] v32[3..0]
414 BitCast(dw, InterleaveUpper(d, va820, vb931));
415 const VW v32_fc74 = // v32[f..c] v32[7..4]
416 BitCast(dw, InterleaveUpper(d, vec64, vfd75));
417
418 v0 = BitCast(d, InterleaveLower(dw, v10_b830, v10_fc74));
419 v1 = BitCast(d, InterleaveUpper(dw, v10_b830, v10_fc74));
420 v2 = BitCast(d, InterleaveLower(dw, v32_b830, v32_fc74));
421 v3 = BitCast(d, InterleaveUpper(dw, v32_b830, v32_fc74));
422}
423
424template <typename T, size_t N, class V, HWY_IF_LANES_PER_BLOCK(T, N, 4)>
425HWY_API void LoadInterleaved4(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
426 V& v0, V& v1, V& v2, V& v3) {
427 V A; // v3210[4] v3210[0]
428 V B; // v3210[5] v3210[1]
429 V C; // v3210[6] v3210[2]
430 V D; // v3210[7] v3210[3]
431 detail::LoadTransposedBlocks4(d, unaligned, A, B, C, D);
432 const V v10_ev = InterleaveLower(d, A, C); // v1[6,4] v0[6,4] v1[2,0] v0[2,0]
433 const V v10_od = InterleaveLower(d, B, D); // v1[7,5] v0[7,5] v1[3,1] v0[3,1]
434 const V v32_ev = InterleaveUpper(d, A, C); // v3[6,4] v2[6,4] v3[2,0] v2[2,0]
435 const V v32_od = InterleaveUpper(d, B, D); // v3[7,5] v2[7,5] v3[3,1] v2[3,1]
436
437 v0 = InterleaveLower(d, v10_ev, v10_od);
438 v1 = InterleaveUpper(d, v10_ev, v10_od);
439 v2 = InterleaveLower(d, v32_ev, v32_od);
440 v3 = InterleaveUpper(d, v32_ev, v32_od);
441}
442
443template <typename T, size_t N, class V, HWY_IF_LANES_PER_BLOCK(T, N, 2)>
444HWY_API void LoadInterleaved4(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
445 V& v0, V& v1, V& v2, V& v3) {
446 V A, B, C, D;
447 detail::LoadTransposedBlocks4(d, unaligned, A, B, C, D);
448 v0 = InterleaveLower(d, A, C);
449 v1 = InterleaveUpper(d, A, C);
450 v2 = InterleaveLower(d, B, D);
451 v3 = InterleaveUpper(d, B, D);
452}
453
454// Any T x1
455template <typename T, class V>
456HWY_API void LoadInterleaved4(Simd<T, 1, 0> d, const T* HWY_RESTRICT unaligned,
457 V& v0, V& v1, V& v2, V& v3) {
458 v0 = LoadU(d, unaligned + 0);
459 v1 = LoadU(d, unaligned + 1);
460 v2 = LoadU(d, unaligned + 2);
461 v3 = LoadU(d, unaligned + 3);
462}
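
// Editor's illustrative sketch (not part of the original header): extracting
// the alpha channel from packed RGBA pixels via LoadInterleaved4. The function
// name and the multiple-of-N length assumption are hypothetical.
HWY_MAYBE_UNUSED static void ExampleExtractAlpha(
    const uint8_t* HWY_RESTRICT rgba, size_t num_pixels,
    uint8_t* HWY_RESTRICT alpha) {
  const ScalableTag<uint8_t> d;
  const size_t N = Lanes(d);
  using V = Vec<decltype(d)>;
  for (size_t i = 0; i + N <= num_pixels; i += N) {
    V r, g, b, a;
    LoadInterleaved4(d, rgba + 4 * i, r, g, b, a);
    StoreU(a, d, alpha + i);
  }
}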
463
464// ------------------------------ StoreInterleaved2
465
466namespace detail {
467
468// Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload.
469template <typename T, size_t N, class V, HWY_IF_LE128(T, N)>
470HWY_API void StoreTransposedBlocks2(const V A, const V B, Simd<T, N, 0> d,
471 T* HWY_RESTRICT unaligned) {
472 StoreU(A, d, unaligned + 0 * N);
473 StoreU(B, d, unaligned + 1 * N);
474}
475
476} // namespace detail
477
478// >= 128 bit vector
479template <typename T, size_t N, class V, HWY_IF_GE128(T, N)>
480HWY_API void StoreInterleaved2(const V v0, const V v1, Simd<T, N, 0> d,
481 T* HWY_RESTRICT unaligned) {
482 const auto v10L = InterleaveLower(d, v0, v1); // .. v1[0] v0[0]
483 const auto v10U = InterleaveUpper(d, v0, v1); // .. v1[N/2] v0[N/2]
484 detail::StoreTransposedBlocks2(v10L, v10U, d, unaligned);
485}
486
487// <= 64 bits
488template <class V, typename T, size_t N, HWY_IF_LE64(T, N)>
489HWY_API void StoreInterleaved2(const V part0, const V part1, Simd<T, N, 0> d,
490 T* HWY_RESTRICT unaligned) {
491 const Twice<decltype(d)> d2;
492 const auto v0 = ZeroExtendVector(d2, part0);
493 const auto v1 = ZeroExtendVector(d2, part1);
494 const auto v10 = InterleaveLower(d2, v0, v1);
495 StoreU(v10, d2, unaligned);
496}
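
// Editor's illustrative sketch (not part of the original header): interleaving
// two planar channels into packed stereo frames with StoreInterleaved2. The
// function name and the multiple-of-N length assumption are hypothetical.
HWY_MAYBE_UNUSED static void ExampleInterleaveStereo(
    const int16_t* HWY_RESTRICT left, const int16_t* HWY_RESTRICT right,
    size_t num_frames, int16_t* HWY_RESTRICT interleaved) {
  const ScalableTag<int16_t> d;
  const size_t N = Lanes(d);
  for (size_t i = 0; i + N <= num_frames; i += N) {
    StoreInterleaved2(LoadU(d, left + i), LoadU(d, right + i), d,
                      interleaved + 2 * i);
  }
}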
497
498// ------------------------------ StoreInterleaved3 (CombineShiftRightBytes,
499// TableLookupBytes)
500
501namespace detail {
502
503// Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload.
504template <typename T, size_t N, class V, HWY_IF_LE128(T, N)>
505HWY_API void StoreTransposedBlocks3(const V A, const V B, const V C,
506 Simd<T, N, 0> d,
507 T* HWY_RESTRICT unaligned) {
508 StoreU(A, d, unaligned + 0 * N);
509 StoreU(B, d, unaligned + 1 * N);
510 StoreU(C, d, unaligned + 2 * N);
511}
512
513} // namespace detail
514
515// >= 128-bit vector, 8-bit lanes
516template <typename T, size_t N, class V, HWY_IF_LANE_SIZE(T, 1),
517 HWY_IF_GE128(T, N)>
518HWY_API void StoreInterleaved3(const V v0, const V v1, const V v2,
519 Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
520 const RebindToUnsigned<decltype(d)> du;
521 using TU = TFromD<decltype(du)>;
522 const auto k5 = Set(du, TU{5});
523 const auto k6 = Set(du, TU{6});
524
525 // Interleave (v0,v1,v2) to (MSB on left, lane 0 on right):
526 // v0[5], v2[4],v1[4],v0[4] .. v2[0],v1[0],v0[0]. We're expanding v0 lanes
527 // to their place, with 0x80 so lanes to be filled from other vectors are 0
528 // to enable blending by ORing together.
529 alignas(16) static constexpr uint8_t tbl_v0[16] = {
530 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, //
531 3, 0x80, 0x80, 4, 0x80, 0x80, 5};
532 alignas(16) static constexpr uint8_t tbl_v1[16] = {
533 0x80, 0, 0x80, 0x80, 1, 0x80, //
534 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
535 // The interleaved vectors will be named A, B, C; temporaries with suffix
536 // 0..2 indicate which input vector's lanes they hold.
537 const auto shuf_A0 = LoadDup128(du, tbl_v0);
538 const auto shuf_A1 = LoadDup128(du, tbl_v1); // cannot reuse shuf_A0 (has 5)
539 const auto shuf_A2 = CombineShiftRightBytes<15>(du, shuf_A1, shuf_A1);
540 const auto A0 = TableLookupBytesOr0(v0, shuf_A0); // 5..4..3..2..1..0
541 const auto A1 = TableLookupBytesOr0(v1, shuf_A1); // ..4..3..2..1..0.
542 const auto A2 = TableLookupBytesOr0(v2, shuf_A2); // .4..3..2..1..0..
543 const V A = BitCast(d, A0 | A1 | A2);
544
545 // B: v1[10],v0[10], v2[9],v1[9],v0[9] .. , v2[6],v1[6],v0[6], v2[5],v1[5]
546 const auto shuf_B0 = shuf_A2 + k6; // .A..9..8..7..6..
547 const auto shuf_B1 = shuf_A0 + k5; // A..9..8..7..6..5
548 const auto shuf_B2 = shuf_A1 + k5; // ..9..8..7..6..5.
549 const auto B0 = TableLookupBytesOr0(v0, shuf_B0);
550 const auto B1 = TableLookupBytesOr0(v1, shuf_B1);
551 const auto B2 = TableLookupBytesOr0(v2, shuf_B2);
552 const V B = BitCast(d, B0 | B1 | B2);
553
554 // C: v2[15],v1[15],v0[15], v2[11],v1[11],v0[11], v2[10]
555 const auto shuf_C0 = shuf_B2 + k6; // ..F..E..D..C..B.
556 const auto shuf_C1 = shuf_B0 + k5; // .F..E..D..C..B..
557 const auto shuf_C2 = shuf_B1 + k5; // F..E..D..C..B..A
558 const auto C0 = TableLookupBytesOr0(v0, shuf_C0);
559 const auto C1 = TableLookupBytesOr0(v1, shuf_C1);
560 const auto C2 = TableLookupBytesOr0(v2, shuf_C2);
561 const V C = BitCast(d, C0 | C1 | C2);
562
563 detail::StoreTransposedBlocks3(A, B, C, d, unaligned);
564}
565
566// >= 128-bit vector, 16-bit lanes
567template <typename T, size_t N, class V, HWY_IF_LANE_SIZE(T, 2),
568 HWY_IF_GE128(T, N)>
569HWY_API void StoreInterleaved3(const V v0, const V v1, const V v2,
570 Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
571 const Repartition<uint8_t, decltype(d)> du8;
572 const auto k2 = Set(du8, uint8_t{2 * sizeof(T)});
573 const auto k3 = Set(du8, uint8_t{3 * sizeof(T)});
574
575 // Interleave (v0,v1,v2) to (MSB on left, lane 0 on right):
576 // v1[2],v0[2], v2[1],v1[1],v0[1], v2[0],v1[0],v0[0]. 0x80 so lanes to be
577 // filled from other vectors are 0 for blending. Note that these are byte
578 // indices for 16-bit lanes.
579 alignas(16) static constexpr uint8_t tbl_v1[16] = {
580 0x80, 0x80, 0, 1, 0x80, 0x80, 0x80, 0x80,
581 2, 3, 0x80, 0x80, 0x80, 0x80, 4, 5};
582 alignas(16) static constexpr uint8_t tbl_v2[16] = {
583 0x80, 0x80, 0x80, 0x80, 0, 1, 0x80, 0x80,
584 0x80, 0x80, 2, 3, 0x80, 0x80, 0x80, 0x80};
585
586 // The interleaved vectors will be named A, B, C; temporaries with suffix
587 // 0..2 indicate which input vector's lanes they hold.
588 const auto shuf_A1 = LoadDup128(du8, tbl_v1); // 2..1..0.
589 // .2..1..0
590 const auto shuf_A0 = CombineShiftRightBytes<2>(du8, shuf_A1, shuf_A1);
591 const auto shuf_A2 = LoadDup128(du8, tbl_v2); // ..1..0..
592
593 const auto A0 = TableLookupBytesOr0(v0, shuf_A0);
594 const auto A1 = TableLookupBytesOr0(v1, shuf_A1);
595 const auto A2 = TableLookupBytesOr0(v2, shuf_A2);
596 const V A = BitCast(d, A0 | A1 | A2);
597
598 // B: v0[5] v2[4],v1[4],v0[4], v2[3],v1[3],v0[3], v2[2]
599 const auto shuf_B0 = shuf_A1 + k3; // 5..4..3.
600 const auto shuf_B1 = shuf_A2 + k3; // ..4..3..
601 const auto shuf_B2 = shuf_A0 + k2; // .4..3..2
602 const auto B0 = TableLookupBytesOr0(v0, shuf_B0);
603 const auto B1 = TableLookupBytesOr0(v1, shuf_B1);
604 const auto B2 = TableLookupBytesOr0(v2, shuf_B2);
605 const V B = BitCast(d, B0 | B1 | B2);
606
607 // C: v2[7],v1[7],v0[7], v2[6],v1[6],v0[6], v2[5],v1[5]
608 const auto shuf_C0 = shuf_B1 + k3; // ..7..6..
609 const auto shuf_C1 = shuf_B2 + k3; // .7..6..5
610 const auto shuf_C2 = shuf_B0 + k2; // 7..6..5.
611 const auto C0 = TableLookupBytesOr0(v0, shuf_C0);
612 const auto C1 = TableLookupBytesOr0(v1, shuf_C1);
613 const auto C2 = TableLookupBytesOr0(v2, shuf_C2);
614 const V C = BitCast(d, C0 | C1 | C2);
615
616 detail::StoreTransposedBlocks3(A, B, C, d, unaligned);
617}
618
619// >= 128-bit vector, 32-bit lanes
620template <typename T, size_t N, class V, HWY_IF_LANE_SIZE(T, 4),
621 HWY_IF_GE128(T, N)>
622HWY_API void StoreInterleaved3(const V v0, const V v1, const V v2,
623 Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
624 const RepartitionToWide<decltype(d)> dw;
625
626 const V v10_v00 = InterleaveLower(d, v0, v1);
627 const V v01_v20 = OddEven(v0, v2);
628 // A: v0[1], v2[0],v1[0],v0[0] (<- lane 0)
629 const V A = BitCast(
630 d, InterleaveLower(dw, BitCast(dw, v10_v00), BitCast(dw, v01_v20)));
631
632 const V v1_321 = ShiftRightLanes<1>(d, v1);
633 const V v0_32 = ShiftRightLanes<2>(d, v0);
634 const V v21_v11 = OddEven(v2, v1_321);
635 const V v12_v02 = OddEven(v1_321, v0_32);
636 // B: v1[2],v0[2], v2[1],v1[1]
637 const V B = BitCast(
638 d, InterleaveLower(dw, BitCast(dw, v21_v11), BitCast(dw, v12_v02)));
639
640 // Notation refers to the upper 2 lanes of the vector for InterleaveUpper.
641 const V v23_v13 = OddEven(v2, v1_321);
642 const V v03_v22 = OddEven(v0, v2);
643 // C: v2[3],v1[3],v0[3], v2[2]
644 const V C = BitCast(
645 d, InterleaveUpper(dw, BitCast(dw, v03_v22), BitCast(dw, v23_v13)));
646
647 detail::StoreTransposedBlocks3(A, B, C, d, unaligned);
648}
649
650// >= 128-bit vector, 64-bit lanes
651template <typename T, size_t N, class V, HWY_IF_LANE_SIZE(T, 8),
652 HWY_IF_GE128(T, N)>
653HWY_API void StoreInterleaved3(const V v0, const V v1, const V v2,
654 Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
655 const V A = InterleaveLower(d, v0, v1);
656 const V B = OddEven(v0, v2);
657 const V C = InterleaveUpper(d, v1, v2);
658 detail::StoreTransposedBlocks3(A, B, C, d, unaligned);
659}
660
661// 64-bit vector, 8-bit lanes
662template <class V, typename T, HWY_IF_LANE_SIZE(T, 1)>
663HWY_API void StoreInterleaved3(const V part0, const V part1, const V part2,
664 Full64<T> d, T* HWY_RESTRICT unaligned) {
665 constexpr size_t N = 16 / sizeof(T);
666 // Use full vectors for the shuffles and first result.
667 const Full128<uint8_t> du;
668 const Full128<T> d_full;
669 const auto k5 = Set(du, uint8_t{5});
670 const auto k6 = Set(du, uint8_t{6});
671
672 const Vec128<T> v0{part0.raw};
673 const Vec128<T> v1{part1.raw};
674 const Vec128<T> v2{part2.raw};
675
676 // Interleave (v0,v1,v2) to (MSB on left, lane 0 on right):
677 // v1[2],v0[2], v2[1],v1[1],v0[1], v2[0],v1[0],v0[0]. 0x80 so lanes to be
678 // filled from other vectors are 0 for blending.
679 alignas(16) static constexpr uint8_t tbl_v0[16] = {
680 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, //
681 3, 0x80, 0x80, 4, 0x80, 0x80, 5};
682 alignas(16) static constexpr uint8_t tbl_v1[16] = {
683 0x80, 0, 0x80, 0x80, 1, 0x80, //
684 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
685 // The interleaved vectors will be named A, B, C; temporaries with suffix
686 // 0..2 indicate which input vector's lanes they hold.
687 const auto shuf_A0 = Load(du, tbl_v0);
688 const auto shuf_A1 = Load(du, tbl_v1); // cannot reuse shuf_A0 (5 in MSB)
689 const auto shuf_A2 = CombineShiftRightBytes<15>(du, shuf_A1, shuf_A1);
690 const auto A0 = TableLookupBytesOr0(v0, shuf_A0); // 5..4..3..2..1..0
691 const auto A1 = TableLookupBytesOr0(v1, shuf_A1); // ..4..3..2..1..0.
692 const auto A2 = TableLookupBytesOr0(v2, shuf_A2); // .4..3..2..1..0..
693 const auto A = BitCast(d_full, A0 | A1 | A2);
694 StoreU(A, d_full, unaligned + 0 * N);
695
696 // Second (HALF) vector: v2[7],v1[7],v0[7], v2[6],v1[6],v0[6], v2[5],v1[5]
697 const auto shuf_B0 = shuf_A2 + k6; // ..7..6..
698 const auto shuf_B1 = shuf_A0 + k5; // .7..6..5
699 const auto shuf_B2 = shuf_A1 + k5; // 7..6..5.
700 const auto B0 = TableLookupBytesOr0(v0, shuf_B0);
701 const auto B1 = TableLookupBytesOr0(v1, shuf_B1);
702 const auto B2 = TableLookupBytesOr0(v2, shuf_B2);
703 const V B{(B0 | B1 | B2).raw};
704 StoreU(B, d, unaligned + 1 * N);
705}
706
707// 64-bit vector, 16-bit lanes
708template <typename T, HWY_IF_LANE_SIZE(T, 2)>
709HWY_API void StoreInterleaved3(const Vec64<T> part0, const Vec64<T> part1,
710 const Vec64<T> part2, Full64<T> dh,
711 T* HWY_RESTRICT unaligned) {
712 const Full128<T> d;
713 const Full128<uint8_t> du8;
714 constexpr size_t N = 16 / sizeof(T);
715 const auto k2 = Set(du8, uint8_t{2 * sizeof(T)});
716 const auto k3 = Set(du8, uint8_t{3 * sizeof(T)});
717
718 const Vec128<T> v0{part0.raw};
719 const Vec128<T> v1{part1.raw};
720 const Vec128<T> v2{part2.raw};
721
722 // Interleave part (v0,v1,v2) to full (MSB on left, lane 0 on right):
723 // v1[2],v0[2], v2[1],v1[1],v0[1], v2[0],v1[0],v0[0]. We're expanding v0 lanes
724 // to their place, with 0x80 so lanes to be filled from other vectors are 0
725 // to enable blending by ORing together.
726 alignas(16) static constexpr uint8_t tbl_v1[16] = {
727 0x80, 0x80, 0, 1, 0x80, 0x80, 0x80, 0x80,
728 2, 3, 0x80, 0x80, 0x80, 0x80, 4, 5};
729 alignas(16) static constexpr uint8_t tbl_v2[16] = {
730 0x80, 0x80, 0x80, 0x80, 0, 1, 0x80, 0x80,
731 0x80, 0x80, 2, 3, 0x80, 0x80, 0x80, 0x80};
732
733 // The interleaved vectors will be named A, B; temporaries with suffix
734 // 0..2 indicate which input vector's lanes they hold.
735 const auto shuf_A1 = Load(du8, tbl_v1); // 2..1..0.
736 // .2..1..0
737 const auto shuf_A0 = CombineShiftRightBytes<2>(du8, shuf_A1, shuf_A1);
738 const auto shuf_A2 = Load(du8, tbl_v2); // ..1..0..
739
740 const auto A0 = TableLookupBytesOr0(v0, shuf_A0);
741 const auto A1 = TableLookupBytesOr0(v1, shuf_A1);
742 const auto A2 = TableLookupBytesOr0(v2, shuf_A2);
743 const Vec128<T> A = BitCast(d, A0 | A1 | A2);
744 StoreU(A, d, unaligned + 0 * N);
745
746 // Second (HALF) vector: v2[3],v1[3],v0[3], v2[2]
747 const auto shuf_B0 = shuf_A1 + k3; // ..3.
748 const auto shuf_B1 = shuf_A2 + k3; // .3..
749 const auto shuf_B2 = shuf_A0 + k2; // 3..2
750 const auto B0 = TableLookupBytesOr0(v0, shuf_B0);
751 const auto B1 = TableLookupBytesOr0(v1, shuf_B1);
752 const auto B2 = TableLookupBytesOr0(v2, shuf_B2);
753 const Vec128<T> B = BitCast(d, B0 | B1 | B2);
754 StoreU(Vec64<T>{B.raw}, dh, unaligned + 1 * N);
755}
756
757// 64-bit vector, 32-bit lanes
758template <typename T, HWY_IF_LANE_SIZE(T, 4)>
759HWY_API void StoreInterleaved3(const Vec64<T> v0, const Vec64<T> v1,
760 const Vec64<T> v2, Full64<T> d,
761 T* HWY_RESTRICT unaligned) {
762 // (same code as 128-bit vector, 64-bit lanes)
763 constexpr size_t N = 2;
764 const Vec64<T> v10_v00 = InterleaveLower(d, v0, v1);
765 const Vec64<T> v01_v20 = OddEven(v0, v2);
766 const Vec64<T> v21_v11 = InterleaveUpper(d, v1, v2);
767 StoreU(v10_v00, d, unaligned + 0 * N);
768 StoreU(v01_v20, d, unaligned + 1 * N);
769 StoreU(v21_v11, d, unaligned + 2 * N);
770}
771
772// 64-bit lanes are handled by the N=1 case below.
773
774// <= 32-bit vector, 8-bit lanes
775template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1), HWY_IF_LE32(T, N)>
776HWY_API void StoreInterleaved3(const Vec128<T, N> part0,
777 const Vec128<T, N> part1,
778 const Vec128<T, N> part2, Simd<T, N, 0> /*tag*/,
779 T* HWY_RESTRICT unaligned) {
780 // Use full vectors for the shuffles and result.
781 const Full128<uint8_t> du;
782 const Full128<T> d_full;
783
784 const Vec128<T> v0{part0.raw};
785 const Vec128<T> v1{part1.raw};
786 const Vec128<T> v2{part2.raw};
787
788 // Interleave (v0,v1,v2). We're expanding v0 lanes to their place, with 0x80
789 // so lanes to be filled from other vectors are 0 to enable blending by ORing
790 // together.
791 alignas(16) static constexpr uint8_t tbl_v0[16] = {
792 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80,
793 0x80, 3, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
794 // The interleaved vector will be named A; temporaries with suffix
795 // 0..2 indicate which input vector's lanes they hold.
796 const auto shuf_A0 = Load(du, tbl_v0);
797 const auto shuf_A1 = CombineShiftRightBytes<15>(du, shuf_A0, shuf_A0);
798 const auto shuf_A2 = CombineShiftRightBytes<14>(du, shuf_A0, shuf_A0);
799 const auto A0 = TableLookupBytesOr0(v0, shuf_A0); // ......3..2..1..0
800 const auto A1 = TableLookupBytesOr0(v1, shuf_A1); // .....3..2..1..0.
801 const auto A2 = TableLookupBytesOr0(v2, shuf_A2); // ....3..2..1..0..
802 const Vec128<T> A = BitCast(d_full, A0 | A1 | A2);
803 alignas(16) T buf[16 / sizeof(T)];
804 StoreU(A, d_full, buf);
805 CopyBytes<N * 3 * sizeof(T)>(buf, unaligned);
806}
807
808// 32-bit vector, 16-bit lanes
809template <typename T, HWY_IF_LANE_SIZE(T, 2)>
810HWY_API void StoreInterleaved3(const Vec128<T, 2> part0,
811 const Vec128<T, 2> part1,
812 const Vec128<T, 2> part2, Simd<T, 2, 0> /*tag*/,
813 T* HWY_RESTRICT unaligned) {
814 constexpr size_t N = 4 / sizeof(T);
815 // Use full vectors for the shuffles and result.
816 const Full128<uint8_t> du8;
817 const Full128<T> d_full;
818
819 const Vec128<T> v0{part0.raw};
820 const Vec128<T> v1{part1.raw};
821 const Vec128<T> v2{part2.raw};
822
823 // Interleave (v0,v1,v2). We're expanding v0 lanes to their place, with 0x80
824 // so lanes to be filled from other vectors are 0 to enable blending by ORing
825 // together.
826 alignas(16) static constexpr uint8_t tbl_v2[16] = {
827 0x80, 0x80, 0x80, 0x80, 0, 1, 0x80, 0x80,
828 0x80, 0x80, 2, 3, 0x80, 0x80, 0x80, 0x80};
829 // The interleaved vector will be named A; temporaries with suffix
830 // 0..2 indicate which input vector's lanes they hold.
831 const auto shuf_A2 = // ..1..0..
832 Load(du8, tbl_v2);
833 const auto shuf_A1 = // ...1..0.
834 CombineShiftRightBytes<2>(du8, shuf_A2, shuf_A2);
835 const auto shuf_A0 = // ....1..0
836 CombineShiftRightBytes<4>(du8, shuf_A2, shuf_A2);
837 const auto A0 = TableLookupBytesOr0(v0, shuf_A0); // ..1..0
838 const auto A1 = TableLookupBytesOr0(v1, shuf_A1); // .1..0.
839 const auto A2 = TableLookupBytesOr0(v2, shuf_A2); // 1..0..
840 const auto A = BitCast(d_full, A0 | A1 | A2);
841 alignas(16) T buf[16 / sizeof(T)];
842 StoreU(A, d_full, buf);
843 CopyBytes<N * 3 * sizeof(T)>(buf, unaligned);
844}
845
846// Single-element vector, any lane size: just store directly
847template <typename T>
848HWY_API void StoreInterleaved3(const Vec128<T, 1> v0, const Vec128<T, 1> v1,
849 const Vec128<T, 1> v2, Simd<T, 1, 0> d,
850 T* HWY_RESTRICT unaligned) {
851 StoreU(v0, d, unaligned + 0);
852 StoreU(v1, d, unaligned + 1);
853 StoreU(v2, d, unaligned + 2);
854}
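
// Editor's illustrative sketch (not part of the original header): broadcasting
// a grayscale plane into packed RGB (R = G = B) with StoreInterleaved3. The
// function name and the multiple-of-N length assumption are hypothetical.
HWY_MAYBE_UNUSED static void ExampleGrayToRgb(const uint8_t* HWY_RESTRICT gray,
                                              size_t num_pixels,
                                              uint8_t* HWY_RESTRICT rgb) {
  const ScalableTag<uint8_t> d;
  const size_t N = Lanes(d);
  for (size_t i = 0; i + N <= num_pixels; i += N) {
    const auto v = LoadU(d, gray + i);
    StoreInterleaved3(v, v, v, d, rgb + 3 * i);
  }
}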
855
856// ------------------------------ StoreInterleaved4
857
858namespace detail {
859
860// Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload.
861template <typename T, size_t N, class V, HWY_IF_LE128(T, N)>
862HWY_API void StoreTransposedBlocks4(const V A, const V B, const V C, const V D,
863 Simd<T, N, 0> d,
864 T* HWY_RESTRICT unaligned) {
865 StoreU(A, d, unaligned + 0 * N);
866 StoreU(B, d, unaligned + 1 * N);
867 StoreU(C, d, unaligned + 2 * N);
868 StoreU(D, d, unaligned + 3 * N);
869}
870
871} // namespace detail
872
873// >= 128-bit vector, 8..32-bit lanes
874template <typename T, size_t N, class V, HWY_IF_NOT_LANE_SIZE(T, 8),
875 HWY_IF_GE128(T, N)>
876HWY_API void StoreInterleaved4(const V v0, const V v1, const V v2, const V v3,
877 Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
878 const RepartitionToWide<decltype(d)> dw;
879 const auto v10L = ZipLower(dw, v0, v1); // .. v1[0] v0[0]
880 const auto v32L = ZipLower(dw, v2, v3);
881 const auto v10U = ZipUpper(dw, v0, v1);
882 const auto v32U = ZipUpper(dw, v2, v3);
883 // The interleaved vectors are A, B, C, D.
884 const auto A = BitCast(d, InterleaveLower(dw, v10L, v32L)); // 3210
885 const auto B = BitCast(d, InterleaveUpper(dw, v10L, v32L));
886 const auto C = BitCast(d, InterleaveLower(dw, v10U, v32U));
887 const auto D = BitCast(d, InterleaveUpper(dw, v10U, v32U));
888 detail::StoreTransposedBlocks4(A, B, C, D, d, unaligned);
889}
890
891// >= 128-bit vector, 64-bit lanes
892template <typename T, size_t N, class V, HWY_IF_LANE_SIZE(T, 8),
893 HWY_IF_GE128(T, N)>
894HWY_API void StoreInterleaved4(const V v0, const V v1, const V v2, const V v3,
895 Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
896 // The interleaved vectors are A, B, C, D.
897 const auto A = InterleaveLower(d, v0, v1); // v1[0] v0[0]
898 const auto B = InterleaveLower(d, v2, v3);
899 const auto C = InterleaveUpper(d, v0, v1);
900 const auto D = InterleaveUpper(d, v2, v3);
901 detail::StoreTransposedBlocks4(A, B, C, D, d, unaligned);
902}
903
904// 64-bit vector, 8..32-bit lanes
905template <typename T, HWY_IF_NOT_LANE_SIZE(T, 8)>
906HWY_API void StoreInterleaved4(const Vec64<T> part0, const Vec64<T> part1,
907 const Vec64<T> part2, const Vec64<T> part3,
908 Full64<T> /*tag*/, T* HWY_RESTRICT unaligned) {
909 constexpr size_t N = 16 / sizeof(T);
910 // Use full vectors to reduce the number of stores.
911 const Full128<T> d_full;
912 const RepartitionToWide<decltype(d_full)> dw;
913 const Vec128<T> v0{part0.raw};
914 const Vec128<T> v1{part1.raw};
915 const Vec128<T> v2{part2.raw};
916 const Vec128<T> v3{part3.raw};
917 const auto v10 = ZipLower(dw, v0, v1); // v1[0] v0[0]
918 const auto v32 = ZipLower(dw, v2, v3);
919 const auto A = BitCast(d_full, InterleaveLower(dw, v10, v32));
920 const auto B = BitCast(d_full, InterleaveUpper(dw, v10, v32));
921 StoreU(A, d_full, unaligned + 0 * N);
922 StoreU(B, d_full, unaligned + 1 * N);
923}
924
925// 64-bit vector, 64-bit lane
926template <typename T, HWY_IF_LANE_SIZE(T, 8)>
927HWY_API void StoreInterleaved4(const Vec64<T> part0, const Vec64<T> part1,
928 const Vec64<T> part2, const Vec64<T> part3,
929 Full64<T> /*tag*/, T* HWY_RESTRICT unaligned) {
930 constexpr size_t N = 16 / sizeof(T);
931 // Use full vectors to reduce the number of stores.
932 const Full128<T> d_full;
933 const Vec128<T> v0{part0.raw};
934 const Vec128<T> v1{part1.raw};
935 const Vec128<T> v2{part2.raw};
936 const Vec128<T> v3{part3.raw};
937 const auto A = InterleaveLower(d_full, v0, v1); // v1[0] v0[0]
938 const auto B = InterleaveLower(d_full, v2, v3);
939 StoreU(A, d_full, unaligned + 0 * N);
940 StoreU(B, d_full, unaligned + 1 * N);
941}
942
943// <= 32-bit vectors
944template <typename T, size_t N, HWY_IF_LE32(T, N)>
945HWY_API void StoreInterleaved4(const Vec128<T, N> part0,
946 const Vec128<T, N> part1,
947 const Vec128<T, N> part2,
948 const Vec128<T, N> part3, Simd<T, N, 0> /*tag*/,
949 T* HWY_RESTRICT unaligned) {
950 // Use full vectors to reduce the number of stores.
951 const Full128<T> d_full;
952 const RepartitionToWide<decltype(d_full)> dw;
953 const Vec128<T> v0{part0.raw};
954 const Vec128<T> v1{part1.raw};
955 const Vec128<T> v2{part2.raw};
956 const Vec128<T> v3{part3.raw};
957 const auto v10 = ZipLower(dw, v0, v1); // .. v1[0] v0[0]
958 const auto v32 = ZipLower(dw, v2, v3);
959 const auto v3210 = BitCast(d_full, InterleaveLower(dw, v10, v32));
960 alignas(16) T buf[16 / sizeof(T)];
961 StoreU(v3210, d_full, buf);
962 CopyBytes<4 * N * sizeof(T)>(buf, unaligned);
963}
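
// Editor's illustrative sketch (not part of the original header): packing
// planar R/G/B channels plus a constant alpha into RGBA with StoreInterleaved4.
// The function name and the multiple-of-N length assumption are hypothetical.
HWY_MAYBE_UNUSED static void ExamplePlanarToRgba(
    const uint8_t* HWY_RESTRICT r, const uint8_t* HWY_RESTRICT g,
    const uint8_t* HWY_RESTRICT b, size_t num_pixels,
    uint8_t* HWY_RESTRICT rgba) {
  const ScalableTag<uint8_t> d;
  const size_t N = Lanes(d);
  const auto opaque = Set(d, uint8_t{0xFF});  // constant alpha
  for (size_t i = 0; i + N <= num_pixels; i += N) {
    StoreInterleaved4(LoadU(d, r + i), LoadU(d, g + i), LoadU(d, b + i), opaque,
                      d, rgba + 4 * i);
  }
}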
964
965#endif // HWY_NATIVE_LOAD_STORE_INTERLEAVED
966
967// ------------------------------ AESRound
968
969// Cannot implement on scalar: need at least 16 bytes for TableLookupBytes.
970#if HWY_TARGET != HWY_SCALAR || HWY_IDE
971
972// Define for white-box testing, even if native instructions are available.
973namespace detail {
974
975// Constant-time: computes inverse in GF(2^4) based on "Accelerating AES with
976// Vector Permute Instructions" and the accompanying assembly language
977// implementation: https://crypto.stanford.edu/vpaes/vpaes.tgz. See also Botan:
978// https://botan.randombit.net/doxygen/aes__vperm_8cpp_source.html .
979//
980// A brute-force 256 byte table lookup can also be made constant-time, and
981// possibly competitive on NEON, but this is more performance-portable
982// especially for x86 and large vectors.
983template <class V> // u8
984HWY_INLINE V SubBytes(V state) {
985 const DFromV<V> du;
986 const auto mask = Set(du, uint8_t{0xF});
987
988 // Change polynomial basis to GF(2^4)
989 {
990 alignas(16) static constexpr uint8_t basisL[16] = {
991 0x00, 0x70, 0x2A, 0x5A, 0x98, 0xE8, 0xB2, 0xC2,
992 0x08, 0x78, 0x22, 0x52, 0x90, 0xE0, 0xBA, 0xCA};
993 alignas(16) static constexpr uint8_t basisU[16] = {
994 0x00, 0x4D, 0x7C, 0x31, 0x7D, 0x30, 0x01, 0x4C,
995 0x81, 0xCC, 0xFD, 0xB0, 0xFC, 0xB1, 0x80, 0xCD};
996 const auto sL = And(state, mask);
997 const auto sU = ShiftRight<4>(state); // byte shift => upper bits are zero
998 const auto gf4L = TableLookupBytes(LoadDup128(du, basisL), sL);
999 const auto gf4U = TableLookupBytes(LoadDup128(du, basisU), sU);
1000 state = Xor(gf4L, gf4U);
1001 }
1002
1003 // Inversion in GF(2^4). Elements 0 represent "infinity" (division by 0) and
1004 // cause TableLookupBytesOr0 to return 0.
1005 alignas(16) static constexpr uint8_t kZetaInv[16] = {
1006 0x80, 7, 11, 15, 6, 10, 4, 1, 9, 8, 5, 2, 12, 14, 13, 3};
1007 alignas(16) static constexpr uint8_t kInv[16] = {
1008 0x80, 1, 8, 13, 15, 6, 5, 14, 2, 12, 11, 10, 9, 3, 7, 4};
1009 const auto tbl = LoadDup128(du, kInv);
1010 const auto sL = And(state, mask); // L=low nibble, U=upper
1011 const auto sU = ShiftRight<4>(state); // byte shift => upper bits are zero
1012 const auto sX = Xor(sU, sL);
1013 const auto invL = TableLookupBytes(LoadDup128(du, kZetaInv), sL);
1014 const auto invU = TableLookupBytes(tbl, sU);
1015 const auto invX = TableLookupBytes(tbl, sX);
1016 const auto outL = Xor(sX, TableLookupBytesOr0(tbl, Xor(invL, invU)));
1017 const auto outU = Xor(sU, TableLookupBytesOr0(tbl, Xor(invL, invX)));
1018
1019 // Linear skew (cannot bake 0x63 bias into the table because out* indices
1020 // may have the infinity flag set).
1021 alignas(16) static constexpr uint8_t kAffineL[16] = {
1022 0x00, 0xC7, 0xBD, 0x6F, 0x17, 0x6D, 0xD2, 0xD0,
1023 0x78, 0xA8, 0x02, 0xC5, 0x7A, 0xBF, 0xAA, 0x15};
1024 alignas(16) static constexpr uint8_t kAffineU[16] = {
1025 0x00, 0x6A, 0xBB, 0x5F, 0xA5, 0x74, 0xE4, 0xCF,
1026 0xFA, 0x35, 0x2B, 0x41, 0xD1, 0x90, 0x1E, 0x8E};
1027 const auto affL = TableLookupBytesOr0(LoadDup128(du, kAffineL), outL);
1028 const auto affU = TableLookupBytesOr0(LoadDup128(du, kAffineU), outU);
1029 return Xor(Xor(affL, affU), Set(du, uint8_t{0x63}));
1030}
1031
1032} // namespace detail
1033
1034#endif // HWY_TARGET != HWY_SCALAR
1035
1036// "Include guard": skip if native AES instructions are available.
1037#if (defined(HWY_NATIVE_AES) == defined(HWY_TARGET_TOGGLE))
1038#ifdef HWY_NATIVE_AES
1039#undef HWY_NATIVE_AES
1040#else
1041#define HWY_NATIVE_AES
1042#endif
1043
1044// (Must come after HWY_TARGET_TOGGLE, else we don't reset it for scalar)
1045#if HWY_TARGET != HWY_SCALAR
1046
1047namespace detail {
1048
1049template <class V> // u8
1050HWY_API V ShiftRows(const V state) {
1051 const DFromV<V> du;
1052 alignas(16) static constexpr uint8_t kShiftRow[16] = {
1053 0, 5, 10, 15, // transposed: state is column major
1054 4, 9, 14, 3, //
1055 8, 13, 2, 7, //
1056 12, 1, 6, 11};
1057 const auto shift_row = LoadDup128(du, kShiftRow);
1058 return TableLookupBytes(state, shift_row);
1059}
1060
1061template <class V> // u8
1062HWY_API V MixColumns(const V state) {
1063 const DFromV<V> du;
1064 // For each column, the rows are the sum of GF(2^8) matrix multiplication by:
1065 // 2 3 1 1 // Let s := state*1, d := state*2, t := state*3.
1066 // 1 2 3 1 // d are on diagonal, no permutation needed.
1067 // 1 1 2 3 // t1230 indicates column indices of threes for the 4 rows.
1068 // 3 1 1 2 // We also need to compute s2301 and s3012 (=1230 o 2301).
1069 alignas(16) static constexpr uint8_t k2301[16] = {
1070 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13};
1071 alignas(16) static constexpr uint8_t k1230[16] = {
1072 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12};
1073 const RebindToSigned<decltype(du)> di; // can only do signed comparisons
1074 const auto msb = Lt(BitCast(di, state), Zero(di));
1075 const auto overflow = BitCast(du, IfThenElseZero(msb, Set(di, int8_t{0x1B})));
1076 const auto d = Xor(Add(state, state), overflow); // = state*2 in GF(2^8).
1077 const auto s2301 = TableLookupBytes(state, LoadDup128(du, k2301));
1078 const auto d_s2301 = Xor(d, s2301);
1079 const auto t_s2301 = Xor(state, d_s2301); // t(s*3) = XOR-sum {s, d(s*2)}
1080 const auto t1230_s3012 = TableLookupBytes(t_s2301, LoadDup128(du, k1230));
1081 return Xor(d_s2301, t1230_s3012); // XOR-sum of 4 terms
1082}
1083
1084} // namespace detail
1085
1086template <class V> // u8
1087HWY_API V AESRound(V state, const V round_key) {
1088 // Intel docs swap the first two steps, but it does not matter because
1089 // ShiftRows is a permutation and SubBytes is independent of lane index.
1090 state = detail::SubBytes(state);
1091 state = detail::ShiftRows(state);
1092 state = detail::MixColumns(state);
1093 state = Xor(state, round_key); // AddRoundKey
1094 return state;
1095}
1096
1097template <class V> // u8
1098HWY_API V AESLastRound(V state, const V round_key) {
1099 // Like AESRound, but without MixColumns.
1100 state = detail::SubBytes(state);
1101 state = detail::ShiftRows(state);
1102 state = Xor(state, round_key); // AddRoundKey
1103 return state;
1104}
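
// Editor's illustrative sketch (not part of the original header): a full
// AES-128 encryption of each 16-byte block of `state`, given 11 expanded round
// keys stored contiguously (16 bytes each, 16-byte aligned). Key expansion is
// assumed to happen elsewhere; the helper name is hypothetical.
template <class V, class D = DFromV<V>>  // V is a u8 vector
HWY_MAYBE_UNUSED V ExampleAes128Encrypt(
    V state, const uint8_t* HWY_RESTRICT round_keys) {
  const D du;
  state = Xor(state, LoadDup128(du, round_keys));  // initial AddRoundKey
  for (size_t r = 1; r < 10; ++r) {
    // SubBytes + ShiftRows + MixColumns + AddRoundKey
    state = AESRound(state, LoadDup128(du, round_keys + 16 * r));
  }
  return AESLastRound(state, LoadDup128(du, round_keys + 16 * 10));
}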
1105
1106// Constant-time implementation inspired by
1107// https://www.bearssl.org/constanttime.html, but about half the cost because we
1108// use 64x64 multiplies and 128-bit XORs.
1109template <class V>
1110HWY_API V CLMulLower(V a, V b) {
1111 const DFromV<V> d;
1112 static_assert(IsSame<TFromD<decltype(d)>, uint64_t>(), "V must be u64");
1113 const auto k1 = Set(d, 0x1111111111111111ULL);
1114 const auto k2 = Set(d, 0x2222222222222222ULL);
1115 const auto k4 = Set(d, 0x4444444444444444ULL);
1116 const auto k8 = Set(d, 0x8888888888888888ULL);
1117 const auto a0 = And(a, k1);
1118 const auto a1 = And(a, k2);
1119 const auto a2 = And(a, k4);
1120 const auto a3 = And(a, k8);
1121 const auto b0 = And(b, k1);
1122 const auto b1 = And(b, k2);
1123 const auto b2 = And(b, k4);
1124 const auto b3 = And(b, k8);
1125
1126 auto m0 = Xor(MulEven(a0, b0), MulEven(a1, b3));
1127 auto m1 = Xor(MulEven(a0, b1), MulEven(a1, b0));
1128 auto m2 = Xor(MulEven(a0, b2), MulEven(a1, b1));
1129 auto m3 = Xor(MulEven(a0, b3), MulEven(a1, b2));
1130 m0 = Xor(m0, Xor(MulEven(a2, b2), MulEven(a3, b1)));
1131 m1 = Xor(m1, Xor(MulEven(a2, b3), MulEven(a3, b2)));
1132 m2 = Xor(m2, Xor(MulEven(a2, b0), MulEven(a3, b3)));
1133 m3 = Xor(m3, Xor(MulEven(a2, b1), MulEven(a3, b0)));
1134 return Or(Or(And(m0, k1), And(m1, k2)), Or(And(m2, k4), And(m3, k8)));
1135}
1136
1137template <class V>
1138HWY_API V CLMulUpper(V a, V b) {
1139 const DFromV<V> d;
1140 static_assert(IsSame<TFromD<decltype(d)>, uint64_t>(), "V must be u64");
1141 const auto k1 = Set(d, 0x1111111111111111ULL);
1142 const auto k2 = Set(d, 0x2222222222222222ULL);
1143 const auto k4 = Set(d, 0x4444444444444444ULL);
1144 const auto k8 = Set(d, 0x8888888888888888ULL);
1145 const auto a0 = And(a, k1);
1146 const auto a1 = And(a, k2);
1147 const auto a2 = And(a, k4);
1148 const auto a3 = And(a, k8);
1149 const auto b0 = And(b, k1);
1150 const auto b1 = And(b, k2);
1151 const auto b2 = And(b, k4);
1152 const auto b3 = And(b, k8);
1153
1154 auto m0 = Xor(MulOdd(a0, b0), MulOdd(a1, b3));
1155 auto m1 = Xor(MulOdd(a0, b1), MulOdd(a1, b0));
1156 auto m2 = Xor(MulOdd(a0, b2), MulOdd(a1, b1));
1157 auto m3 = Xor(MulOdd(a0, b3), MulOdd(a1, b2));
1158 m0 = Xor(m0, Xor(MulOdd(a2, b2), MulOdd(a3, b1)));
1159 m1 = Xor(m1, Xor(MulOdd(a2, b3), MulOdd(a3, b2)));
1160 m2 = Xor(m2, Xor(MulOdd(a2, b0), MulOdd(a3, b3)));
1161 m3 = Xor(m3, Xor(MulOdd(a2, b1), MulOdd(a3, b0)));
1162 return Or(Or(And(m0, k1), And(m1, k2)), Or(And(m2, k4), And(m3, k8)));
1163}
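
// Editor's illustrative sketch (not part of the original header): computing
// both 128-bit carryless products per block. CLMulLower multiplies the lower
// (even-indexed) u64 lanes of a and b, CLMulUpper the upper (odd-indexed)
// lanes. A GF(2^128) multiply, e.g. for GHASH, would additionally require the
// cross products and a reduction modulo the field polynomial (not shown).
template <class V>
HWY_MAYBE_UNUSED void ExampleClMulBothHalves(V a, V b, V& lo, V& hi) {
  lo = CLMulLower(a, b);  // 128-bit product of the lower halves of each block
  hi = CLMulUpper(a, b);  // 128-bit product of the upper halves of each block
}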
1164
1165#endif // HWY_NATIVE_AES
1166#endif // HWY_TARGET != HWY_SCALAR
1167
1168// "Include guard": skip if native POPCNT-related instructions are available.
1169#if (defined(HWY_NATIVE_POPCNT) == defined(HWY_TARGET_TOGGLE))
1170#ifdef HWY_NATIVE_POPCNT
1171#undef HWY_NATIVE_POPCNT
1172#else
1173#define HWY_NATIVE_POPCNT
1174#endif
1175
1176#undef HWY_MIN_POW2_FOR_128
1177#if HWY_TARGET == HWY_RVV
1178#define HWY_MIN_POW2_FOR_128 1
1179#else
1180// All other targets except HWY_SCALAR (which is excluded by HWY_IF_GE128_D)
1181// guarantee 128 bits anyway.
1182#define HWY_MIN_POW2_FOR_128 0
1183#endif
1184
1185// This algorithm requires vectors to be at least 16 bytes, which is the case
1186// for LMUL >= 2. If not, use the fallback below.
1187template <typename V, class D = DFromV<V>, HWY_IF_LANE_SIZE_D(D, 1),
1188 HWY_IF_GE128_D(D), HWY_IF_POW2_GE(D, HWY_MIN_POW2_FOR_128)>
1189HWY_API V PopulationCount(V v) {
1190 static_assert(IsSame<TFromD<D>, uint8_t>(), "V must be u8");
1191 const D d;
1192 HWY_ALIGN constexpr uint8_t kLookup[16] = {
1193 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
1194 };
1195 const auto lo = And(v, Set(d, uint8_t{0xF}));
1196 const auto hi = ShiftRight<4>(v);
1197 const auto lookup = LoadDup128(d, kLookup);
1198 return Add(TableLookupBytes(lookup, hi), TableLookupBytes(lookup, lo));
1199}
1200
1201// RVV has a specialization that avoids the Set().
1202#if HWY_TARGET != HWY_RVV
1203// Slower fallback for capped vectors.
1204template <typename V, class D = DFromV<V>, HWY_IF_LANE_SIZE_D(D, 1),
1205 HWY_IF_LT128_D(D)>
1206HWY_API V PopulationCount(V v) {
1207 static_assert(IsSame<TFromD<D>, uint8_t>(), "V must be u8");
1208 const D d;
1209 // See https://arxiv.org/pdf/1611.07612.pdf, Figure 3
1210 const V k33 = Set(d, uint8_t{0x33});
1211 v = Sub(v, And(ShiftRight<1>(v), Set(d, uint8_t{0x55})));
1212 v = Add(And(ShiftRight<2>(v), k33), And(v, k33));
1213 return And(Add(v, ShiftRight<4>(v)), Set(d, uint8_t{0x0F}));
1214}
1215#endif // HWY_TARGET != HWY_RVV
1216
1217template <typename V, class D = DFromV<V>, HWY_IF_LANE_SIZE_D(D, 2)>
1218HWY_API V PopulationCount(V v) {
1219 static_assert(IsSame<TFromD<D>, uint16_t>(), "V must be u16");
1220 const D d;
1221 const Repartition<uint8_t, decltype(d)> d8;
1222 const auto vals = BitCast(d, PopulationCount(BitCast(d8, v)));
1223 return Add(ShiftRight<8>(vals), And(vals, Set(d, uint16_t{0xFF})));
1224}
1225
1226template <typename V, class D = DFromV<V>, HWY_IF_LANE_SIZE_D(D, 4)>
1227HWY_API V PopulationCount(V v) {
1228 static_assert(IsSame<TFromD<D>, uint32_t>(), "V must be u32");
1229 const D d;
1230 Repartition<uint16_t, decltype(d)> d16;
1231 auto vals = BitCast(d, PopulationCount(BitCast(d16, v)));
1232 return Add(ShiftRight<16>(vals), And(vals, Set(d, uint32_t{0xFF})));
1233}
1234
1235#if HWY_HAVE_INTEGER64
1236template <typename V, class D = DFromV<V>, HWY_IF_LANE_SIZE_D(D, 8)>
1237HWY_API V PopulationCount(V v) {
1238 static_assert(IsSame<TFromD<D>, uint64_t>(), "V must be u64");
1239 const D d;
1240 Repartition<uint32_t, decltype(d)> d32;
1241 auto vals = BitCast(d, PopulationCount(BitCast(d32, v)));
1242 return Add(ShiftRight<32>(vals), And(vals, Set(d, 0xFFULL)));
1243}
1244#endif
1245
1246#endif // HWY_NATIVE_POPCNT
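
// Editor's illustrative sketch (not part of the original header): counting all
// set bits in a byte buffer with the u8 PopulationCount. Per-vector counts are
// summed via a scalar loop over a stored copy to avoid assuming any particular
// reduction op; hwy::PopCount (from base.h) handles the scalar tail. Assumes
// MaxLanes(d) is usable as an array bound, as elsewhere in this header.
HWY_MAYBE_UNUSED static size_t ExampleCountSetBits(
    const uint8_t* HWY_RESTRICT bytes, size_t size) {
  const ScalableTag<uint8_t> d;
  const size_t N = Lanes(d);
  HWY_ALIGN uint8_t counts[MaxLanes(d)];
  size_t total = 0;
  size_t i = 0;
  for (; i + N <= size; i += N) {
    Store(PopulationCount(LoadU(d, bytes + i)), d, counts);
    for (size_t j = 0; j < N; ++j) total += counts[j];
  }
  for (; i < size; ++i) total += hwy::PopCount(bytes[i]);  // scalar tail
  return total;
}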
1247
1248template <class V, class D = DFromV<V>, HWY_IF_LANE_SIZE_D(D, 8),
1249 HWY_IF_LT128_D(D), HWY_IF_FLOAT_D(D)>
1250HWY_API V operator*(V x, V y) {
1251 return Set(D(), GetLane(x) * GetLane(y));
1252}
1253
1254template <class V, class D = DFromV<V>, HWY_IF_LANE_SIZE_D(D, 8),
1255 HWY_IF_LT128_D(D), HWY_IF_NOT_FLOAT_D(D)>
1256HWY_API V operator*(V x, V y) {
1257 const DFromV<V> d;
1258 using T = TFromD<decltype(d)>;
1259 using TU = MakeUnsigned<T>;
1260 const TU xu = static_cast<TU>(GetLane(x));
1261 const TU yu = static_cast<TU>(GetLane(y));
1262 return Set(d, static_cast<T>(xu * yu));
1263}
1264
1265// "Include guard": skip if native 64-bit mul instructions are available.
1266#if (defined(HWY_NATIVE_I64MULLO) == defined(HWY_TARGET_TOGGLE))
1267#ifdef HWY_NATIVE_I64MULLO
1268#undef HWY_NATIVE_I64MULLO
1269#else
1270#define HWY_NATIVE_I64MULLO
1271#endif
1272
1273template <class V, class D64 = DFromV<V>, typename T = LaneType<V>,
1274 HWY_IF_LANE_SIZE(T, 8), HWY_IF_UNSIGNED(T), HWY_IF_GE128_D(D64)>
1275HWY_API V operator*(V x, V y) {
1276 RepartitionToNarrow<D64> d32;
1277 auto x32 = BitCast(d32, x);
1278 auto y32 = BitCast(d32, y);
1279 auto lolo = BitCast(d32, MulEven(x32, y32));
1280 auto lohi = BitCast(d32, MulEven(x32, BitCast(d32, ShiftRight<32>(y))));
1281 auto hilo = BitCast(d32, MulEven(BitCast(d32, ShiftRight<32>(x)), y32));
1282 auto hi = BitCast(d32, ShiftLeft<32>(BitCast(D64{}, lohi + hilo)));
1283 return BitCast(D64{}, lolo + hi);
1284}
1285template <class V, class DI64 = DFromV<V>, typename T = LaneType<V>,
1286 HWY_IF_LANE_SIZE(T, 8), HWY_IF_SIGNED(T), HWY_IF_GE128_D(DI64)>
1287HWY_API V operator*(V x, V y) {
1288 RebindToUnsigned<DI64> du64;
1289 return BitCast(DI64{}, BitCast(du64, x) * BitCast(du64, y));
1290}
1291
1292#endif // HWY_NATIVE_I64MULLO
1293
1294// "Include guard": skip if native 8-bit compress instructions are available.
1295#if (defined(HWY_NATIVE_COMPRESS8) == defined(HWY_TARGET_TOGGLE))
1296#ifdef HWY_NATIVE_COMPRESS8
1297#undef HWY_NATIVE_COMPRESS8
1298#else
1299#define HWY_NATIVE_COMPRESS8
1300#endif
1301
1302template <class V, class D, typename T, HWY_IF_LANE_SIZE(T, 1)>
1303HWY_API size_t CompressBitsStore(V v, const uint8_t* HWY_RESTRICT bits, D d,
1304 T* unaligned) {
1305 HWY_ALIGN T lanes[MaxLanes(d)];
1306 Store(v, d, lanes);
1307
1308 const Simd<T, HWY_MIN(MaxLanes(d), 8), 0> d8;
1309 T* HWY_RESTRICT pos = unaligned;
1310
1311 HWY_ALIGN constexpr T table[256 * 8] = {
1312 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, //
1313 1, 0, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, //
1314 2, 0, 1, 3, 4, 5, 6, 7, 0, 2, 1, 3, 4, 5, 6, 7, //
1315 1, 2, 0, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, //
1316 3, 0, 1, 2, 4, 5, 6, 7, 0, 3, 1, 2, 4, 5, 6, 7, //
1317 1, 3, 0, 2, 4, 5, 6, 7, 0, 1, 3, 2, 4, 5, 6, 7, //
1318 2, 3, 0, 1, 4, 5, 6, 7, 0, 2, 3, 1, 4, 5, 6, 7, //
1319 1, 2, 3, 0, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, //
1320 4, 0, 1, 2, 3, 5, 6, 7, 0, 4, 1, 2, 3, 5, 6, 7, //
1321 1, 4, 0, 2, 3, 5, 6, 7, 0, 1, 4, 2, 3, 5, 6, 7, //
1322 2, 4, 0, 1, 3, 5, 6, 7, 0, 2, 4, 1, 3, 5, 6, 7, //
1323 1, 2, 4, 0, 3, 5, 6, 7, 0, 1, 2, 4, 3, 5, 6, 7, //
1324 3, 4, 0, 1, 2, 5, 6, 7, 0, 3, 4, 1, 2, 5, 6, 7, //
1325 1, 3, 4, 0, 2, 5, 6, 7, 0, 1, 3, 4, 2, 5, 6, 7, //
1326 2, 3, 4, 0, 1, 5, 6, 7, 0, 2, 3, 4, 1, 5, 6, 7, //
1327 1, 2, 3, 4, 0, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, //
1328 5, 0, 1, 2, 3, 4, 6, 7, 0, 5, 1, 2, 3, 4, 6, 7, //
1329 1, 5, 0, 2, 3, 4, 6, 7, 0, 1, 5, 2, 3, 4, 6, 7, //
1330 2, 5, 0, 1, 3, 4, 6, 7, 0, 2, 5, 1, 3, 4, 6, 7, //
1331 1, 2, 5, 0, 3, 4, 6, 7, 0, 1, 2, 5, 3, 4, 6, 7, //
1332 3, 5, 0, 1, 2, 4, 6, 7, 0, 3, 5, 1, 2, 4, 6, 7, //
1333 1, 3, 5, 0, 2, 4, 6, 7, 0, 1, 3, 5, 2, 4, 6, 7, //
1334 2, 3, 5, 0, 1, 4, 6, 7, 0, 2, 3, 5, 1, 4, 6, 7, //
1335 1, 2, 3, 5, 0, 4, 6, 7, 0, 1, 2, 3, 5, 4, 6, 7, //
1336 4, 5, 0, 1, 2, 3, 6, 7, 0, 4, 5, 1, 2, 3, 6, 7, //
1337 1, 4, 5, 0, 2, 3, 6, 7, 0, 1, 4, 5, 2, 3, 6, 7, //
1338 2, 4, 5, 0, 1, 3, 6, 7, 0, 2, 4, 5, 1, 3, 6, 7, //
1339 1, 2, 4, 5, 0, 3, 6, 7, 0, 1, 2, 4, 5, 3, 6, 7, //
1340 3, 4, 5, 0, 1, 2, 6, 7, 0, 3, 4, 5, 1, 2, 6, 7, //
1341 1, 3, 4, 5, 0, 2, 6, 7, 0, 1, 3, 4, 5, 2, 6, 7, //
1342 2, 3, 4, 5, 0, 1, 6, 7, 0, 2, 3, 4, 5, 1, 6, 7, //
1343 1, 2, 3, 4, 5, 0, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, //
1344 6, 0, 1, 2, 3, 4, 5, 7, 0, 6, 1, 2, 3, 4, 5, 7, //
1345 1, 6, 0, 2, 3, 4, 5, 7, 0, 1, 6, 2, 3, 4, 5, 7, //
1346 2, 6, 0, 1, 3, 4, 5, 7, 0, 2, 6, 1, 3, 4, 5, 7, //
1347 1, 2, 6, 0, 3, 4, 5, 7, 0, 1, 2, 6, 3, 4, 5, 7, //
1348 3, 6, 0, 1, 2, 4, 5, 7, 0, 3, 6, 1, 2, 4, 5, 7, //
1349 1, 3, 6, 0, 2, 4, 5, 7, 0, 1, 3, 6, 2, 4, 5, 7, //
1350 2, 3, 6, 0, 1, 4, 5, 7, 0, 2, 3, 6, 1, 4, 5, 7, //
1351 1, 2, 3, 6, 0, 4, 5, 7, 0, 1, 2, 3, 6, 4, 5, 7, //
1352 4, 6, 0, 1, 2, 3, 5, 7, 0, 4, 6, 1, 2, 3, 5, 7, //
1353 1, 4, 6, 0, 2, 3, 5, 7, 0, 1, 4, 6, 2, 3, 5, 7, //
1354 2, 4, 6, 0, 1, 3, 5, 7, 0, 2, 4, 6, 1, 3, 5, 7, //
1355 1, 2, 4, 6, 0, 3, 5, 7, 0, 1, 2, 4, 6, 3, 5, 7, //
1356 3, 4, 6, 0, 1, 2, 5, 7, 0, 3, 4, 6, 1, 2, 5, 7, //
1357 1, 3, 4, 6, 0, 2, 5, 7, 0, 1, 3, 4, 6, 2, 5, 7, //
1358 2, 3, 4, 6, 0, 1, 5, 7, 0, 2, 3, 4, 6, 1, 5, 7, //
1359 1, 2, 3, 4, 6, 0, 5, 7, 0, 1, 2, 3, 4, 6, 5, 7, //
1360 5, 6, 0, 1, 2, 3, 4, 7, 0, 5, 6, 1, 2, 3, 4, 7, //
1361 1, 5, 6, 0, 2, 3, 4, 7, 0, 1, 5, 6, 2, 3, 4, 7, //
1362 2, 5, 6, 0, 1, 3, 4, 7, 0, 2, 5, 6, 1, 3, 4, 7, //
1363 1, 2, 5, 6, 0, 3, 4, 7, 0, 1, 2, 5, 6, 3, 4, 7, //
1364 3, 5, 6, 0, 1, 2, 4, 7, 0, 3, 5, 6, 1, 2, 4, 7, //
1365 1, 3, 5, 6, 0, 2, 4, 7, 0, 1, 3, 5, 6, 2, 4, 7, //
1366 2, 3, 5, 6, 0, 1, 4, 7, 0, 2, 3, 5, 6, 1, 4, 7, //
1367 1, 2, 3, 5, 6, 0, 4, 7, 0, 1, 2, 3, 5, 6, 4, 7, //
1368 4, 5, 6, 0, 1, 2, 3, 7, 0, 4, 5, 6, 1, 2, 3, 7, //
1369 1, 4, 5, 6, 0, 2, 3, 7, 0, 1, 4, 5, 6, 2, 3, 7, //
1370 2, 4, 5, 6, 0, 1, 3, 7, 0, 2, 4, 5, 6, 1, 3, 7, //
1371 1, 2, 4, 5, 6, 0, 3, 7, 0, 1, 2, 4, 5, 6, 3, 7, //
1372 3, 4, 5, 6, 0, 1, 2, 7, 0, 3, 4, 5, 6, 1, 2, 7, //
1373 1, 3, 4, 5, 6, 0, 2, 7, 0, 1, 3, 4, 5, 6, 2, 7, //
1374 2, 3, 4, 5, 6, 0, 1, 7, 0, 2, 3, 4, 5, 6, 1, 7, //
1375 1, 2, 3, 4, 5, 6, 0, 7, 0, 1, 2, 3, 4, 5, 6, 7, //
1376 7, 0, 1, 2, 3, 4, 5, 6, 0, 7, 1, 2, 3, 4, 5, 6, //
1377 1, 7, 0, 2, 3, 4, 5, 6, 0, 1, 7, 2, 3, 4, 5, 6, //
1378 2, 7, 0, 1, 3, 4, 5, 6, 0, 2, 7, 1, 3, 4, 5, 6, //
1379 1, 2, 7, 0, 3, 4, 5, 6, 0, 1, 2, 7, 3, 4, 5, 6, //
1380 3, 7, 0, 1, 2, 4, 5, 6, 0, 3, 7, 1, 2, 4, 5, 6, //
1381 1, 3, 7, 0, 2, 4, 5, 6, 0, 1, 3, 7, 2, 4, 5, 6, //
1382 2, 3, 7, 0, 1, 4, 5, 6, 0, 2, 3, 7, 1, 4, 5, 6, //
1383 1, 2, 3, 7, 0, 4, 5, 6, 0, 1, 2, 3, 7, 4, 5, 6, //
1384 4, 7, 0, 1, 2, 3, 5, 6, 0, 4, 7, 1, 2, 3, 5, 6, //
1385 1, 4, 7, 0, 2, 3, 5, 6, 0, 1, 4, 7, 2, 3, 5, 6, //
1386 2, 4, 7, 0, 1, 3, 5, 6, 0, 2, 4, 7, 1, 3, 5, 6, //
1387 1, 2, 4, 7, 0, 3, 5, 6, 0, 1, 2, 4, 7, 3, 5, 6, //
1388 3, 4, 7, 0, 1, 2, 5, 6, 0, 3, 4, 7, 1, 2, 5, 6, //
1389 1, 3, 4, 7, 0, 2, 5, 6, 0, 1, 3, 4, 7, 2, 5, 6, //
1390 2, 3, 4, 7, 0, 1, 5, 6, 0, 2, 3, 4, 7, 1, 5, 6, //
1391 1, 2, 3, 4, 7, 0, 5, 6, 0, 1, 2, 3, 4, 7, 5, 6, //
1392 5, 7, 0, 1, 2, 3, 4, 6, 0, 5, 7, 1, 2, 3, 4, 6, //
1393 1, 5, 7, 0, 2, 3, 4, 6, 0, 1, 5, 7, 2, 3, 4, 6, //
1394 2, 5, 7, 0, 1, 3, 4, 6, 0, 2, 5, 7, 1, 3, 4, 6, //
1395 1, 2, 5, 7, 0, 3, 4, 6, 0, 1, 2, 5, 7, 3, 4, 6, //
1396 3, 5, 7, 0, 1, 2, 4, 6, 0, 3, 5, 7, 1, 2, 4, 6, //
1397 1, 3, 5, 7, 0, 2, 4, 6, 0, 1, 3, 5, 7, 2, 4, 6, //
1398 2, 3, 5, 7, 0, 1, 4, 6, 0, 2, 3, 5, 7, 1, 4, 6, //
1399 1, 2, 3, 5, 7, 0, 4, 6, 0, 1, 2, 3, 5, 7, 4, 6, //
1400 4, 5, 7, 0, 1, 2, 3, 6, 0, 4, 5, 7, 1, 2, 3, 6, //
1401 1, 4, 5, 7, 0, 2, 3, 6, 0, 1, 4, 5, 7, 2, 3, 6, //
1402 2, 4, 5, 7, 0, 1, 3, 6, 0, 2, 4, 5, 7, 1, 3, 6, //
1403 1, 2, 4, 5, 7, 0, 3, 6, 0, 1, 2, 4, 5, 7, 3, 6, //
1404 3, 4, 5, 7, 0, 1, 2, 6, 0, 3, 4, 5, 7, 1, 2, 6, //
1405 1, 3, 4, 5, 7, 0, 2, 6, 0, 1, 3, 4, 5, 7, 2, 6, //
1406 2, 3, 4, 5, 7, 0, 1, 6, 0, 2, 3, 4, 5, 7, 1, 6, //
1407 1, 2, 3, 4, 5, 7, 0, 6, 0, 1, 2, 3, 4, 5, 7, 6, //
1408 6, 7, 0, 1, 2, 3, 4, 5, 0, 6, 7, 1, 2, 3, 4, 5, //
1409 1, 6, 7, 0, 2, 3, 4, 5, 0, 1, 6, 7, 2, 3, 4, 5, //
1410 2, 6, 7, 0, 1, 3, 4, 5, 0, 2, 6, 7, 1, 3, 4, 5, //
1411 1, 2, 6, 7, 0, 3, 4, 5, 0, 1, 2, 6, 7, 3, 4, 5, //
1412 3, 6, 7, 0, 1, 2, 4, 5, 0, 3, 6, 7, 1, 2, 4, 5, //
1413 1, 3, 6, 7, 0, 2, 4, 5, 0, 1, 3, 6, 7, 2, 4, 5, //
1414 2, 3, 6, 7, 0, 1, 4, 5, 0, 2, 3, 6, 7, 1, 4, 5, //
1415 1, 2, 3, 6, 7, 0, 4, 5, 0, 1, 2, 3, 6, 7, 4, 5, //
1416 4, 6, 7, 0, 1, 2, 3, 5, 0, 4, 6, 7, 1, 2, 3, 5, //
1417 1, 4, 6, 7, 0, 2, 3, 5, 0, 1, 4, 6, 7, 2, 3, 5, //
1418 2, 4, 6, 7, 0, 1, 3, 5, 0, 2, 4, 6, 7, 1, 3, 5, //
1419 1, 2, 4, 6, 7, 0, 3, 5, 0, 1, 2, 4, 6, 7, 3, 5, //
1420 3, 4, 6, 7, 0, 1, 2, 5, 0, 3, 4, 6, 7, 1, 2, 5, //
1421 1, 3, 4, 6, 7, 0, 2, 5, 0, 1, 3, 4, 6, 7, 2, 5, //
1422 2, 3, 4, 6, 7, 0, 1, 5, 0, 2, 3, 4, 6, 7, 1, 5, //
1423 1, 2, 3, 4, 6, 7, 0, 5, 0, 1, 2, 3, 4, 6, 7, 5, //
1424 5, 6, 7, 0, 1, 2, 3, 4, 0, 5, 6, 7, 1, 2, 3, 4, //
1425 1, 5, 6, 7, 0, 2, 3, 4, 0, 1, 5, 6, 7, 2, 3, 4, //
1426 2, 5, 6, 7, 0, 1, 3, 4, 0, 2, 5, 6, 7, 1, 3, 4, //
1427 1, 2, 5, 6, 7, 0, 3, 4, 0, 1, 2, 5, 6, 7, 3, 4, //
1428 3, 5, 6, 7, 0, 1, 2, 4, 0, 3, 5, 6, 7, 1, 2, 4, //
1429 1, 3, 5, 6, 7, 0, 2, 4, 0, 1, 3, 5, 6, 7, 2, 4, //
1430 2, 3, 5, 6, 7, 0, 1, 4, 0, 2, 3, 5, 6, 7, 1, 4, //
1431 1, 2, 3, 5, 6, 7, 0, 4, 0, 1, 2, 3, 5, 6, 7, 4, //
1432 4, 5, 6, 7, 0, 1, 2, 3, 0, 4, 5, 6, 7, 1, 2, 3, //
1433 1, 4, 5, 6, 7, 0, 2, 3, 0, 1, 4, 5, 6, 7, 2, 3, //
1434 2, 4, 5, 6, 7, 0, 1, 3, 0, 2, 4, 5, 6, 7, 1, 3, //
1435 1, 2, 4, 5, 6, 7, 0, 3, 0, 1, 2, 4, 5, 6, 7, 3, //
1436 3, 4, 5, 6, 7, 0, 1, 2, 0, 3, 4, 5, 6, 7, 1, 2, //
1437 1, 3, 4, 5, 6, 7, 0, 2, 0, 1, 3, 4, 5, 6, 7, 2, //
1438 2, 3, 4, 5, 6, 7, 0, 1, 0, 2, 3, 4, 5, 6, 7, 1, //
1439 1, 2, 3, 4, 5, 6, 7, 0, 0, 1, 2, 3, 4, 5, 6, 7};
1440
1441 for (size_t i = 0; i < Lanes(d); i += 8) {
1442 // Each byte worth of bits is the index of one of 256 8-byte ranges, and its
1443 // population count determines how far to advance the write position.
1444 const size_t bits8 = bits[i / 8];
1445 const auto indices = Load(d8, table + bits8 * 8);
1446 const auto compressed = TableLookupBytes(LoadU(d8, lanes + i), indices);
1447 StoreU(compressed, d8, pos);
1448 pos += PopCount(bits8);
1449 }
1450 return static_cast<size_t>(pos - unaligned);
1451}
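A usage sketch for CompressBitsStore (hypothetical helper, assumed to live inside a HWY_NAMESPACE block with hwy/highway.h included): bits are packed LSB-first, so bit i of bits[i / 8] selects lane i, and the return value is the number of lanes kept. Note that, like CompressStore, the routine may write up to Lanes(d) bytes even when fewer are kept, so the destination must be at least that large.

// Hypothetical example; with D = CappedTag<uint8_t, 16> on a 128-bit target,
// the return value is 3 and out begins with {1, 3, 8}.
template <class D>
size_t DemoCompressBits8(D d, uint8_t* HWY_RESTRICT out) {
  HWY_ALIGN uint8_t in[16] = {0, 1, 2,  3,  4,  5,  6,  7,
                              8, 9, 10, 11, 12, 13, 14, 15};
  const uint8_t bits[2] = {0x0A, 0x01};  // LSB-first: keep lanes 1, 3 and 8
  return CompressBitsStore(Load(d, in), bits, d, out);
}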
1452
1453template <class V, class M, class D, typename T, HWY_IF_LANE_SIZE(T, 1)>
1454HWY_API size_t CompressStore(V v, M mask, D d, T* HWY_RESTRICT unaligned) {
1455 uint8_t bits[HWY_MAX(size_t{8}, MaxLanes(d) / 8)];
1456 (void)StoreMaskBits(d, mask, bits);
1457 return CompressBitsStore(v, bits, d, unaligned);
1458}
1459
1460template <class V, class M, class D, typename T, HWY_IF_LANE_SIZE(T, 1)>
1461HWY_API size_t CompressBlendedStore(V v, M mask, D d,
1462 T* HWY_RESTRICT unaligned) {
1463 HWY_ALIGN T buf[MaxLanes(d)];
1464 const size_t bytes = CompressStore(v, mask, d, buf);
1465 BlendedStore(Load(d, buf), FirstN(d, bytes), d, unaligned);
1466 return bytes;
1467}
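Compared with CompressStore above, the difference is what happens past the kept elements: CompressStore may overwrite up to a full vector's worth of bytes at `unaligned` (lanes beyond the count are unspecified), while this fallback compresses into a stack buffer and then issues a BlendedStore masked by FirstN, so only the first `bytes` destination lanes are touched. A hedged sketch of why that matters when appending to a tightly sized buffer (helper name hypothetical, D assumed to be an 8-bit tag):

// Appends only the kept bytes at out + existing; nothing after
// out + existing + kept is modified, so `out` may be sized exactly.
template <class D, class V, class M>
size_t AppendKeptBytes(D d, V v, M mask, uint8_t* HWY_RESTRICT out,
                       size_t existing) {
  return existing + CompressBlendedStore(v, mask, d, out + existing);
}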
1468
1469// For reasons unknown, HWY_IF_LANE_SIZE_V is a compile error in SVE.
1470template <class V, class M, typename T = TFromV<V>, HWY_IF_LANE_SIZE(T, 1)>
1471HWY_API V Compress(V v, const M mask) {
1472 const DFromV<V> d;
1473 HWY_ALIGN T lanes[MaxLanes(d)];
1474 (void)CompressStore(v, mask, d, lanes);
1475 return Load(d, lanes);
1476}
1477
1478template <class V, typename T = TFromV<V>, HWY_IF_LANE_SIZE(T, 1)>
1479HWY_API V CompressBits(V v, const uint8_t* HWY_RESTRICT bits) {
1480 const DFromV<V> d;
1481 HWY_ALIGN T lanes[MaxLanes(d)];
1482 (void)CompressBitsStore(v, bits, d, lanes);
1483 return Load(d, lanes);
1484}
1485
1486template <class V, class M, typename T = TFromV<V>, HWY_IF_LANE_SIZE(T, 1)>
1487HWY_API V CompressNot(V v, M mask) {
1488 return Compress(v, Not(mask));
1489}
1490
1491#endif // HWY_NATIVE_COMPRESS8
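Compress and CompressBits return the kept lanes packed at the start of the vector, with the remaining lanes unspecified; CompressNot is Compress with the inverted mask, which some targets implement more cheaply than materializing Not(mask). A usage sketch (hypothetical helper inside a HWY_NAMESPACE block) that splits a vector of bytes around a threshold:

// Writes the bytes of v that are < threshold to `lo` and the rest to `hi`;
// each destination must hold Lanes(d) bytes because full vectors are stored
// (lanes past the respective counts are unspecified). Returns the `lo` count.
template <class D, class V>
size_t SplitBytes(D d, V v, uint8_t threshold,
                  uint8_t* HWY_RESTRICT lo, uint8_t* HWY_RESTRICT hi) {
  const auto is_lo = Lt(v, Set(d, threshold));
  StoreU(Compress(v, is_lo), d, lo);     // lanes where is_lo, packed first
  StoreU(CompressNot(v, is_lo), d, hi);  // remaining lanes, packed first
  return CountTrue(d, is_lo);
}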
1492
1493// ================================================== Operator wrapper
1494
1495// These targets currently cannot define operators and have already defined
1496// (only) the corresponding functions such as Add.
1497#if HWY_TARGET != HWY_RVV && HWY_TARGET != HWY_SVE && \
1498 HWY_TARGET != HWY_SVE2 && HWY_TARGET != HWY_SVE_256 && \
1499 HWY_TARGET != HWY_SVE2_128
1500
1501template <class V>
1502HWY_API V Add(V a, V b) {
1503 return a + b;
1504}
1505template <class V>
1506HWY_API V Sub(V a, V b) {
1507 return a - b;
1508}
1509
1510template <class V>
1511HWY_API V Mul(V a, V b) {
1512 return a * b;
1513}
1514template <class V>
1515HWY_API V Div(V a, V b) {
1516 return a / b;
1517}
1518
1519template <class V>
1520V Shl(V a, V b) {
1521 return a << b;
1522}
1523template <class V>
1524V Shr(V a, V b) {
1525 return a >> b;
1526}
1527
1528template <class V>
1529HWY_API auto Eq(V a, V b) -> decltype(a == b) {
1530 return a == b;
1531}
1532template <class V>
1533HWY_API auto Ne(V a, V b) -> decltype(a == b) {
1534 return a != b;
1535}
1536template <class V>
1537HWY_API auto Lt(V a, V b) -> decltype(a == b) {
1538 return a < b;
1539}
1540
1541template <class V>
1542HWY_API auto Gt(V a, V b) -> decltype(a == b) {
1543 return a > b;
1544}
1545template <class V>
1546HWY_API auto Ge(V a, V b) -> decltype(a == b) {
1547 return a >= b;
1548}
1549
1550template <class V>
1551HWY_API auto Le(V a, V b) -> decltype(a == b) {
1552 return a <= b;
1553}
1554
1555#endif // HWY_TARGET for operators
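These wrappers let generic code spell arithmetic and comparisons as named functions, the only form available on SVE and RVV, whose sizeless vector types cannot overload operators. A sketch of a kernel written against the wrappers (function name and operation are illustrative; assumes the usual hwy/highway.h include and HWY_NAMESPACE scope):

// y[i] = a * x[i] + y[i] for `count` floats, using Add/Mul instead of
// operator+/operator* so the same code also builds for SVE/RVV targets.
template <class D>
void Axpy(D d, float a, const float* HWY_RESTRICT x, float* HWY_RESTRICT y,
          size_t count) {
  const auto va = Set(d, a);
  const size_t N = Lanes(d);
  size_t i = 0;
  for (; i + N <= count; i += N) {
    const auto vx = LoadU(d, x + i);
    const auto vy = LoadU(d, y + i);
    StoreU(Add(Mul(va, vx), vy), d, y + i);
  }
  for (; i < count; ++i) {  // scalar remainder
    y[i] = a * x[i] + y[i];
  }
}

(Highway also provides a fused MulAdd; the point here is only the named wrappers.)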
1556
1557// NOLINTNEXTLINE(google-readability-namespace-comments)
1558} // namespace HWY_NAMESPACE
1559} // namespace hwy
1560HWY_AFTER_NAMESPACE();