Grok 10.0.5
base.h
Go to the documentation of this file.
1// Copyright 2020 Google LLC
2// SPDX-License-Identifier: Apache-2.0
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8// http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15
16#ifndef HIGHWAY_HWY_BASE_H_
17#define HIGHWAY_HWY_BASE_H_
18
19// For SIMD module implementations and their callers, target-independent.
20
21#include <stddef.h>
22#include <stdint.h>
23
25#include "hwy/highway_export.h"
26
27#if HWY_COMPILER_MSVC
28#include <string.h> // memcpy
29#endif
30#if HWY_ARCH_X86
31#include <atomic>
32#endif
33
34//------------------------------------------------------------------------------
35// Compiler-specific definitions
36
37#define HWY_STR_IMPL(macro) #macro
38#define HWY_STR(macro) HWY_STR_IMPL(macro)
39
40#if HWY_COMPILER_MSVC
41
42#include <intrin.h>
43
44#define HWY_RESTRICT __restrict
45#define HWY_INLINE __forceinline
46#define HWY_NOINLINE __declspec(noinline)
47#define HWY_FLATTEN
48#define HWY_NORETURN __declspec(noreturn)
49#define HWY_LIKELY(expr) (expr)
50#define HWY_UNLIKELY(expr) (expr)
51#define HWY_PRAGMA(tokens) __pragma(tokens)
52#define HWY_DIAGNOSTICS(tokens) HWY_PRAGMA(warning(tokens))
53#define HWY_DIAGNOSTICS_OFF(msc, gcc) HWY_DIAGNOSTICS(msc)
54#define HWY_MAYBE_UNUSED
55#define HWY_HAS_ASSUME_ALIGNED 0
56#if (_MSC_VER >= 1700)
57#define HWY_MUST_USE_RESULT _Check_return_
58#else
59#define HWY_MUST_USE_RESULT
60#endif
61
62#else
63
64#define HWY_RESTRICT __restrict__
65// force inlining without optimization enabled creates very inefficient code
66// that can cause compiler timeout
67#ifdef __OPTIMIZE__
68#define HWY_INLINE inline __attribute__((always_inline))
69#else
70#define HWY_INLINE inline
71#endif
72#define HWY_NOINLINE __attribute__((noinline))
73#define HWY_FLATTEN __attribute__((flatten))
74#define HWY_NORETURN __attribute__((noreturn))
75#define HWY_LIKELY(expr) __builtin_expect(!!(expr), 1)
76#define HWY_UNLIKELY(expr) __builtin_expect(!!(expr), 0)
77#define HWY_PRAGMA(tokens) _Pragma(#tokens)
78#define HWY_DIAGNOSTICS(tokens) HWY_PRAGMA(GCC diagnostic tokens)
79#define HWY_DIAGNOSTICS_OFF(msc, gcc) HWY_DIAGNOSTICS(gcc)
80// Encountered "attribute list cannot appear here" when using the C++17
81// [[maybe_unused]], so only use the old style attribute for now.
82#define HWY_MAYBE_UNUSED __attribute__((unused))
83#define HWY_MUST_USE_RESULT __attribute__((warn_unused_result))
84
85#endif // !HWY_COMPILER_MSVC
86
87//------------------------------------------------------------------------------
88// Builtin/attributes
89
90// Enables error-checking of format strings.
91#if HWY_HAS_ATTRIBUTE(__format__)
92#define HWY_FORMAT(idx_fmt, idx_arg) \
93 __attribute__((__format__(__printf__, idx_fmt, idx_arg)))
94#else
95#define HWY_FORMAT(idx_fmt, idx_arg)
96#endif
97
98// Returns a void* pointer which the compiler then assumes is N-byte aligned.
99// Example: float* HWY_RESTRICT aligned = (float*)HWY_ASSUME_ALIGNED(in, 32);
100//
101// The assignment semantics are required by GCC/Clang. ICC provides an in-place
102// __assume_aligned, whereas MSVC's __assume appears unsuitable.
103#if HWY_HAS_BUILTIN(__builtin_assume_aligned)
104#define HWY_ASSUME_ALIGNED(ptr, align) __builtin_assume_aligned((ptr), (align))
105#else
106#define HWY_ASSUME_ALIGNED(ptr, align) (ptr) /* not supported */
107#endif
108
109// Clang and GCC require attributes on each function into which SIMD intrinsics
110// are inlined. Support both per-function annotation (HWY_ATTR) for lambdas and
111// automatic annotation via pragmas.
112#if HWY_COMPILER_CLANG
113#define HWY_PUSH_ATTRIBUTES(targets_str) \
114 HWY_PRAGMA(clang attribute push(__attribute__((target(targets_str))), \
115 apply_to = function))
116#define HWY_POP_ATTRIBUTES HWY_PRAGMA(clang attribute pop)
117#elif HWY_COMPILER_GCC
118#define HWY_PUSH_ATTRIBUTES(targets_str) \
119 HWY_PRAGMA(GCC push_options) HWY_PRAGMA(GCC target targets_str)
120#define HWY_POP_ATTRIBUTES HWY_PRAGMA(GCC pop_options)
121#else
122#define HWY_PUSH_ATTRIBUTES(targets_str)
123#define HWY_POP_ATTRIBUTES
124#endif
125
126//------------------------------------------------------------------------------
127// Macros
128
129#define HWY_API static HWY_INLINE HWY_FLATTEN HWY_MAYBE_UNUSED
130
// Token pasting. HWY_CONCAT forwards to HWY_CONCAT_IMPL so that macro
// arguments are expanded before they are joined by ##.
131#define HWY_CONCAT_IMPL(a, b) a##b
132#define HWY_CONCAT(a, b) HWY_CONCAT_IMPL(a, b)
133
// Smaller/larger of two values. Each argument appears twice in the expansion,
// so an argument with side effects (e.g. i++) may be evaluated twice.
134#define HWY_MIN(a, b) ((a) < (b) ? (a) : (b))
135#define HWY_MAX(a, b) ((a) > (b) ? (a) : (b))
136
137#if HWY_COMPILER_GCC_ACTUAL
138// nielskm: GCC does not support '#pragma GCC unroll' without the factor.
139#define HWY_UNROLL(factor) HWY_PRAGMA(GCC unroll factor)
140#define HWY_DEFAULT_UNROLL HWY_UNROLL(4)
141#elif HWY_COMPILER_CLANG || HWY_COMPILER_ICC || HWY_COMPILER_ICX
142#define HWY_UNROLL(factor) HWY_PRAGMA(unroll factor)
143#define HWY_DEFAULT_UNROLL HWY_UNROLL()
144#else
145#define HWY_UNROLL(factor)
146#define HWY_DEFAULT_UNROLL
147#endif
148
149// Tell a compiler that the expression always evaluates to true.
150// The expression should be free from any side effects.
151// Some older compilers may have trouble with complex expressions, therefore
152// it is advisable to split multiple conditions into separate assume statements,
153// and manually check the generated code.
154// OK but could fail:
155// HWY_ASSUME(x == 2 && y == 3);
156// Better:
157// HWY_ASSUME(x == 2);
158// HWY_ASSUME(y == 3);
159#if defined(__has_cpp_attribute) && __has_cpp_attribute(assume)
160#define HWY_ASSUME(expr) [[assume(expr)]]
161#elif HWY_COMPILER_MSVC || HWY_COMPILER_ICC
162#define HWY_ASSUME(expr) __assume(expr)
163// __builtin_assume() was added in clang 3.6.
164#elif HWY_COMPILER_CLANG && HWY_HAS_BUILTIN(__builtin_assume)
165#define HWY_ASSUME(expr) __builtin_assume(expr)
166// __builtin_unreachable() was added in GCC 4.5, but __has_builtin() was added
167// later, so check for the compiler version directly.
168#elif HWY_COMPILER_GCC_ACTUAL >= 405
169#define HWY_ASSUME(expr) \
170 ((expr) ? static_cast<void>(0) : __builtin_unreachable())
171#else
172#define HWY_ASSUME(expr) static_cast<void>(0)
173#endif
174
175// Compile-time fence to prevent undesirable code reordering. On Clang x86, the
176// typical asm volatile("" : : : "memory") has no effect, whereas atomic fence
177// does, without generating code.
178#if HWY_ARCH_X86
179#define HWY_FENCE std::atomic_thread_fence(std::memory_order_acq_rel)
180#else
181// TODO(janwas): investigate alternatives. On ARM, the above generates barriers.
182#define HWY_FENCE
183#endif
184
185// 4 instances of a given literal value, useful as input to LoadDup128.
186#define HWY_REP4(literal) literal, literal, literal, literal
187
// Calls ::hwy::Abort with the current file and line, followed by the given
// format string and any further arguments. `##__VA_ARGS__` lets the argument
// list after `format` be empty.
188#define HWY_ABORT(format, ...) \
189 ::hwy::Abort(__FILE__, __LINE__, format, ##__VA_ARGS__)
190
191// Always enabled.
// Evaluates `condition` once; if it is false, aborts with a message containing
// the stringified condition. Wrapped in do/while(0) so that it behaves as a
// single statement.
192#define HWY_ASSERT(condition) \
193 do { \
194 if (!(condition)) { \
195 HWY_ABORT("Assert %s", #condition); \
196 } \
197 } while (0)
198
199#if HWY_HAS_FEATURE(memory_sanitizer) || defined(MEMORY_SANITIZER)
200#define HWY_IS_MSAN 1
201#else
202#define HWY_IS_MSAN 0
203#endif
204
205#if HWY_HAS_FEATURE(address_sanitizer) || defined(ADDRESS_SANITIZER)
206#define HWY_IS_ASAN 1
207#else
208#define HWY_IS_ASAN 0
209#endif
210
211#if HWY_HAS_FEATURE(thread_sanitizer) || defined(THREAD_SANITIZER)
212#define HWY_IS_TSAN 1
213#else
214#define HWY_IS_TSAN 0
215#endif
216
217// MSAN may cause lengthy build times or false positives e.g. in AVX3 DemoteTo.
218// You can disable MSAN by adding this attribute to the function that fails.
219#if HWY_IS_MSAN
220#define HWY_ATTR_NO_MSAN __attribute__((no_sanitize_memory))
221#else
222#define HWY_ATTR_NO_MSAN
223#endif
224
225// For enabling HWY_DASSERT and shortening tests in slower debug builds
226#if !defined(HWY_IS_DEBUG_BUILD)
227// Clang does not define NDEBUG, but it and GCC define __OPTIMIZE__, and recent
228// MSVC defines NDEBUG (if not, could instead check _DEBUG).
229#if (!defined(__OPTIMIZE__) && !defined(NDEBUG)) || HWY_IS_ASAN || \
230 HWY_IS_MSAN || HWY_IS_TSAN || defined(__clang_analyzer__)
231#define HWY_IS_DEBUG_BUILD 1
232#else
233#define HWY_IS_DEBUG_BUILD 0
234#endif
235#endif // HWY_IS_DEBUG_BUILD
236
// Debug-only assertion: identical to HWY_ASSERT when HWY_IS_DEBUG_BUILD is
// nonzero. Otherwise it expands to an empty statement, so `condition` is not
// evaluated at all and must not be relied upon for side effects.
237#if HWY_IS_DEBUG_BUILD
238#define HWY_DASSERT(condition) HWY_ASSERT(condition)
239#else
240#define HWY_DASSERT(condition) \
241 do { \
242 } while (0)
243#endif
244
245namespace hwy {
246
247//------------------------------------------------------------------------------
248// kMaxVectorSize (undocumented, pending removal)
249
250#if HWY_ARCH_X86
251static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 64; // AVX-512
252#elif HWY_ARCH_RVV && defined(__riscv_vector)
253// Not actually an upper bound on the size.
254static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 4096;
255#else
256static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 16;
257#endif
258
259//------------------------------------------------------------------------------
260// Alignment
261
262// Potentially useful for LoadDup128 and capped vectors. In other cases, arrays
263// should be allocated dynamically via aligned_allocator.h because Lanes() may
264// exceed the stack size.
265#if HWY_ARCH_X86
266#define HWY_ALIGN_MAX alignas(64)
267#elif HWY_ARCH_RVV && defined(__riscv_vector)
268#define HWY_ALIGN_MAX alignas(8) // only elements need be aligned
269#else
270#define HWY_ALIGN_MAX alignas(16)
271#endif
272
273//------------------------------------------------------------------------------
274// Lane types
275
276// Match [u]int##_t naming scheme so rvv-inl.h macros can obtain the type name
277// by concatenating base type and bits.
278
279#pragma pack(push, 1)
280
281// ACLE (https://gcc.gnu.org/onlinedocs/gcc/Half-Precision.html):
282// always supported on aarch64, for v7 only if -mfp16-format is given.
283#if ((HWY_ARCH_ARM_A64 || (__ARM_FP & 2)) && HWY_COMPILER_GCC)
284using float16_t = __fp16;
285// C11 extension ISO/IEC TS 18661-3:2015 but not supported on all targets.
286// Required for Clang RVV if the float16 extension is used.
287#elif HWY_ARCH_RVV && HWY_COMPILER_CLANG && defined(__riscv_zvfh)
288using float16_t = _Float16;
289// Otherwise emulate
290#else
291struct float16_t {
292 uint16_t bits;
293};
294#endif
295
297 uint16_t bits;
298};
299
300#pragma pack(pop)
301
302using float32_t = float;
303using float64_t = double;
304
305#pragma pack(push, 1)
306
307// Aligned 128-bit type. Cannot use __int128 because clang doesn't yet align it:
308// https://reviews.llvm.org/D86310
309struct alignas(16) uint128_t {
310 uint64_t lo; // little-endian layout
311 uint64_t hi;
312};
313
314// 64 bit key plus 64 bit value. Faster than using uint128_t when only the key
315// field is to be compared (Lt128Upper instead of Lt128).
316struct alignas(16) K64V64 {
317 uint64_t value; // little-endian layout
318 uint64_t key;
319};
320
321// 32 bit key plus 32 bit value. Allows vqsort recursions to terminate earlier
322// than when considering both to be a 64-bit key.
323struct alignas(8) K32V32 {
324 uint32_t value; // little-endian layout
325 uint32_t key;
326};
327
328#pragma pack(pop)
329
330static inline HWY_MAYBE_UNUSED bool operator<(const uint128_t& a,
331 const uint128_t& b) {
332 return (a.hi == b.hi) ? a.lo < b.lo : a.hi < b.hi;
333}
334// Required for std::greater.
335static inline HWY_MAYBE_UNUSED bool operator>(const uint128_t& a,
336 const uint128_t& b) {
337 return b < a;
338}
339static inline HWY_MAYBE_UNUSED bool operator==(const uint128_t& a,
340 const uint128_t& b) {
341 return a.lo == b.lo && a.hi == b.hi;
342}
343
344static inline HWY_MAYBE_UNUSED bool operator<(const K64V64& a,
345 const K64V64& b) {
346 return a.key < b.key;
347}
348// Required for std::greater.
349static inline HWY_MAYBE_UNUSED bool operator>(const K64V64& a,
350 const K64V64& b) {
351 return b < a;
352}
353static inline HWY_MAYBE_UNUSED bool operator==(const K64V64& a,
354 const K64V64& b) {
355 return a.key == b.key;
356}
357
358static inline HWY_MAYBE_UNUSED bool operator<(const K32V32& a,
359 const K32V32& b) {
360 return a.key < b.key;
361}
362// Required for std::greater.
363static inline HWY_MAYBE_UNUSED bool operator>(const K32V32& a,
364 const K32V32& b) {
365 return b < a;
366}
367static inline HWY_MAYBE_UNUSED bool operator==(const K32V32& a,
368 const K32V32& b) {
369 return a.key == b.key;
370}
371
372//------------------------------------------------------------------------------
373// Controlling overload resolution (SFINAE)
374
// SFINAE helper. The primary template is empty, so naming its `type` member
// is a substitution failure whenever Condition is false.
template <bool Condition>
struct EnableIfT {};
// Only the `true` specialization provides the nested `type` (void).
template <>
struct EnableIfT<true> {
  typedef void type;
};
381
382template <bool Condition>
384
// Type-equality trait: `value` is 0 for two distinct types ...
template <typename A, typename B>
struct IsSameT {
  enum { value = 0 };
};

// ... and 1 when both template arguments are the same type.
template <typename A>
struct IsSameT<A, A> {
  enum { value = 1 };
};
394
395template <typename T, typename U>
396HWY_API constexpr bool IsSame() {
398}
399
// Insert into template/function arguments to enable this overload only for
// vectors of AT MOST (LE), AT LEAST (GE) or MORE THAN (GT) this many bits.
//
// Note that enabling for exactly 128 bits is unnecessary because a function can
// simply be overloaded with Vec128<T> and/or Full128<T> tag. Enabling for other
// sizes (e.g. 64 bit) can be achieved via Simd<T, 8 / sizeof(T), 0>.
//
// N is parenthesized so that an expression argument (e.g. `A + B`) is treated
// as a unit before being multiplied by sizeof(T).
#define HWY_IF_LE128(T, N) hwy::EnableIf<(N) * sizeof(T) <= 16>* = nullptr
#define HWY_IF_LE64(T, N) hwy::EnableIf<(N) * sizeof(T) <= 8>* = nullptr
#define HWY_IF_LE32(T, N) hwy::EnableIf<(N) * sizeof(T) <= 4>* = nullptr
#define HWY_IF_GE32(T, N) hwy::EnableIf<(N) * sizeof(T) >= 4>* = nullptr
#define HWY_IF_GE64(T, N) hwy::EnableIf<(N) * sizeof(T) >= 8>* = nullptr
#define HWY_IF_GE128(T, N) hwy::EnableIf<(N) * sizeof(T) >= 16>* = nullptr
// The extra outer parentheses keep `>` from closing the template argument list.
#define HWY_IF_GT128(T, N) hwy::EnableIf<((N) * sizeof(T) > 16)>* = nullptr
413
// Enable an overload depending on the category of lane type T. All names are
// qualified with hwy:: so the macros also work when expanded outside of
// namespace hwy (unqualified IsSigned/IsFloat would not be found there).
#define HWY_IF_UNSIGNED(T) hwy::EnableIf<!hwy::IsSigned<T>()>* = nullptr
// Signed integers only: IsSigned is also true for floating-point types.
#define HWY_IF_SIGNED(T) \
  hwy::EnableIf<hwy::IsSigned<T>() && !hwy::IsFloat<T>()>* = nullptr
#define HWY_IF_FLOAT(T) hwy::EnableIf<hwy::IsFloat<T>()>* = nullptr
#define HWY_IF_NOT_FLOAT(T) hwy::EnableIf<!hwy::IsFloat<T>()>* = nullptr
419
420#define HWY_IF_LANE_SIZE(T, bytes) \
421 hwy::EnableIf<sizeof(T) == (bytes)>* = nullptr
422#define HWY_IF_NOT_LANE_SIZE(T, bytes) \
423 hwy::EnableIf<sizeof(T) != (bytes)>* = nullptr
424// bit_array = 0x102 means 1 or 8 bytes. There is no NONE_OF because it sounds
425// too similar. If you want the opposite of this (2 or 4 bytes), ask for those
426// bits explicitly (0x14) instead of attempting to 'negate' 0x102.
427#define HWY_IF_LANE_SIZE_ONE_OF(T, bit_array) \
428 hwy::EnableIf<((size_t{1} << sizeof(T)) & (bit_array)) != 0>* = nullptr
429
// Enables an overload only if the number of lanes of type T per block equals
// LANES, where a block is at most 16 bytes of the N-lane vector. N is
// parenthesized so that expression arguments are multiplied as a unit.
#define HWY_IF_LANES_PER_BLOCK(T, N, LANES) \
  hwy::EnableIf<HWY_MIN(sizeof(T) * (N), 16) / sizeof(T) == (LANES)>* = nullptr
432
433// Empty struct used as a size tag type.
434template <size_t N>
435struct SizeTag {};
436
437template <class T>
439 using type = T;
440};
441template <class T>
442struct RemoveConstT<const T> {
443 using type = T;
444};
445
446template <class T>
448
449//------------------------------------------------------------------------------
450// Type relations
451
452namespace detail {
453
454template <typename T>
456template <>
457struct Relations<uint8_t> {
458 using Unsigned = uint8_t;
459 using Signed = int8_t;
460 using Wide = uint16_t;
461 enum { is_signed = 0, is_float = 0 };
462};
463template <>
464struct Relations<int8_t> {
465 using Unsigned = uint8_t;
466 using Signed = int8_t;
467 using Wide = int16_t;
468 enum { is_signed = 1, is_float = 0 };
469};
470template <>
471struct Relations<uint16_t> {
472 using Unsigned = uint16_t;
473 using Signed = int16_t;
474 using Wide = uint32_t;
475 using Narrow = uint8_t;
476 enum { is_signed = 0, is_float = 0 };
477};
478template <>
479struct Relations<int16_t> {
480 using Unsigned = uint16_t;
481 using Signed = int16_t;
482 using Wide = int32_t;
483 using Narrow = int8_t;
484 enum { is_signed = 1, is_float = 0 };
485};
486template <>
487struct Relations<uint32_t> {
488 using Unsigned = uint32_t;
489 using Signed = int32_t;
490 using Float = float;
491 using Wide = uint64_t;
492 using Narrow = uint16_t;
493 enum { is_signed = 0, is_float = 0 };
494};
495template <>
496struct Relations<int32_t> {
497 using Unsigned = uint32_t;
498 using Signed = int32_t;
499 using Float = float;
500 using Wide = int64_t;
501 using Narrow = int16_t;
502 enum { is_signed = 1, is_float = 0 };
503};
504template <>
505struct Relations<uint64_t> {
506 using Unsigned = uint64_t;
507 using Signed = int64_t;
508 using Float = double;
510 using Narrow = uint32_t;
511 enum { is_signed = 0, is_float = 0 };
512};
513template <>
514struct Relations<int64_t> {
515 using Unsigned = uint64_t;
516 using Signed = int64_t;
517 using Float = double;
518 using Narrow = int32_t;
519 enum { is_signed = 1, is_float = 0 };
520};
521template <>
524 using Narrow = uint64_t;
525 enum { is_signed = 0, is_float = 0 };
526};
527template <>
529 using Unsigned = uint16_t;
530 using Signed = int16_t;
532 using Wide = float;
533 enum { is_signed = 1, is_float = 1 };
534};
535template <>
537 using Unsigned = uint16_t;
538 using Signed = int16_t;
539 using Wide = float;
540 enum { is_signed = 1, is_float = 1 };
541};
542template <>
543struct Relations<float> {
544 using Unsigned = uint32_t;
545 using Signed = int32_t;
546 using Float = float;
547 using Wide = double;
549 enum { is_signed = 1, is_float = 1 };
550};
551template <>
552struct Relations<double> {
553 using Unsigned = uint64_t;
554 using Signed = int64_t;
555 using Float = double;
556 using Narrow = float;
557 enum { is_signed = 1, is_float = 1 };
558};
559
560template <size_t N>
562template <>
563struct TypeFromSize<1> {
564 using Unsigned = uint8_t;
565 using Signed = int8_t;
566};
567template <>
568struct TypeFromSize<2> {
569 using Unsigned = uint16_t;
570 using Signed = int16_t;
571};
572template <>
573struct TypeFromSize<4> {
574 using Unsigned = uint32_t;
575 using Signed = int32_t;
576 using Float = float;
577};
578template <>
579struct TypeFromSize<8> {
580 using Unsigned = uint64_t;
581 using Signed = int64_t;
582 using Float = double;
583};
584template <>
585struct TypeFromSize<16> {
587};
588
589} // namespace detail
590
591// Aliases for types of a different category, but the same size.
592template <typename T>
594template <typename T>
596template <typename T>
598
599// Aliases for types of the same category, but different size.
600template <typename T>
602template <typename T>
604
605// Obtain type from its size [bytes].
606template <size_t N>
608template <size_t N>
610template <size_t N>
612
613// Avoid confusion with SizeTag where the parameter is a lane size.
615using SignedTag = SizeTag<0x100>; // integer
617
618template <typename T, class R = detail::Relations<T>>
619constexpr auto TypeTag() -> hwy::SizeTag<((R::is_signed + R::is_float) << 8)> {
621}
622
623// For when we only want to distinguish FloatTag from everything else.
625
626template <typename T, class R = detail::Relations<T>>
627constexpr auto IsFloatTag() -> hwy::SizeTag<(R::is_float ? 0x200 : 0x400)> {
629}
630
631//------------------------------------------------------------------------------
632// Type traits
633
634template <typename T>
635HWY_API constexpr bool IsFloat() {
636 // Cannot use T(1.25) != T(1) for float16_t, which can only be converted to or
637 // from a float, not compared.
638 return IsSame<T, float>() || IsSame<T, double>();
639}
640
641template <typename T>
642HWY_API constexpr bool IsSigned() {
643 return T(0) > T(-1);
644}
645template <>
646constexpr bool IsSigned<float16_t>() {
647 return true;
648}
649template <>
650constexpr bool IsSigned<bfloat16_t>() {
651 return true;
652}
653
654// Largest/smallest representable integer values.
655template <typename T>
656HWY_API constexpr T LimitsMax() {
657 static_assert(!IsFloat<T>(), "Only for integer types");
658 using TU = MakeUnsigned<T>;
659 return static_cast<T>(IsSigned<T>() ? (static_cast<TU>(~0ull) >> 1)
660 : static_cast<TU>(~0ull));
661}
662template <typename T>
663HWY_API constexpr T LimitsMin() {
664 static_assert(!IsFloat<T>(), "Only for integer types");
665 return IsSigned<T>() ? T(-1) - LimitsMax<T>() : T(0);
666}
667
668// Largest/smallest representable value (integer or float). This naming avoids
669// confusion with numeric_limits<float>::min() (the smallest positive value).
// Generic version: forwards to LimitsMin, which only accepts integer types.
670template <typename T>
671HWY_API constexpr T LowestValue() {
672 return LimitsMin<T>();
673}
// float: the negation of the value returned by HighestValue<float>().
674template <>
675constexpr float LowestValue<float>() {
676 return -3.402823466e+38F;
677}
// double: the negation of the value returned by HighestValue<double>().
678template <>
679constexpr double LowestValue<double>() {
680 return -1.7976931348623158e+308;
681}
682
// Generic version: forwards to LimitsMax, which only accepts integer types.
683template <typename T>
684HWY_API constexpr T HighestValue() {
685 return LimitsMax<T>();
686}
// float: largest finite single-precision value.
687template <>
688constexpr float HighestValue<float>() {
689 return 3.402823466e+38F;
690}
// double: largest finite double-precision value.
691template <>
692constexpr double HighestValue<double>() {
693 return 1.7976931348623158e+308;
694}
695
696// Difference between 1.0 and the next representable value.
// Generic version, used for every type without a specialization below:
// returns 1.
697template <typename T>
698HWY_API constexpr T Epsilon() {
699 return 1;
700}
// float: approximately 2^-23.
701template <>
702constexpr float Epsilon<float>() {
703 return 1.192092896e-7f;
704}
// double: approximately 2^-52.
705template <>
706constexpr double Epsilon<double>() {
707 return 2.2204460492503131e-16;
708}
709
710// Returns width in bits of the mantissa field in IEEE binary32/64.
// Primary template: the static_assert condition is false for any complete
// type, so instantiating it for a type other than float/double is a
// compile-time error.
711template <typename T>
712constexpr int MantissaBits() {
713 static_assert(sizeof(T) == 0, "Only instantiate the specializations");
714 return 0;
715}
// binary32 has 23 explicit mantissa bits.
716template <>
717constexpr int MantissaBits<float>() {
718 return 23;
719}
// binary64 has 52 explicit mantissa bits.
720template <>
721constexpr int MantissaBits<double>() {
722 return 52;
723}
724
725// Returns the (left-shifted by one bit) IEEE binary32/64 representation with
726// the largest possible (biased) exponent field. Used by IsInf.
727template <typename T>
729 return -(MakeSigned<T>{1} << (MantissaBits<T>() + 1));
730}
731
732// Returns bitmask of the sign bit in IEEE binary32/64.
733template <typename T>
735 return MakeUnsigned<T>{1} << (sizeof(T) * 8 - 1);
736}
737
738// Returns bitmask of the exponent field in IEEE binary32/64.
739template <typename T>
741 return (~(MakeUnsigned<T>{1} << MantissaBits<T>()) + 1) & ~SignMask<T>();
742}
743
744// Returns bitmask of the mantissa field in IEEE binary32/64.
745template <typename T>
747 return (MakeUnsigned<T>{1} << MantissaBits<T>()) - 1;
748}
749
750// Returns 1 << mantissa_bits as a floating-point number. All integers whose
751// absolute value are less than this can be represented exactly.
// Primary template: the static_assert condition is false for any complete
// type, so instantiating it for a type other than float/double is a
// compile-time error.
752template <typename T>
753constexpr T MantissaEnd() {
754 static_assert(sizeof(T) == 0, "Only instantiate the specializations");
755 return 0;
756}
// float: 2^23, matching MantissaBits<float>().
757template <>
758constexpr float MantissaEnd<float>() {
759 return 8388608.0f; // 1 << 23
760}
// double: 2^52, matching MantissaBits<double>(). Written in decimal rather
// than as a hexadecimal floating-point literal.
761template <>
762constexpr double MantissaEnd<double>() {
763 // floating point literal with p52 requires C++17.
764 return 4503599627370496.0; // 1 << 52
765}
766
767// Returns width in bits of the exponent field in IEEE binary32/64.
768template <typename T>
769constexpr int ExponentBits() {
770 // Exponent := remaining bits after deducting sign and mantissa.
771 return 8 * sizeof(T) - 1 - MantissaBits<T>();
772}
773
774// Returns largest value of the biased exponent field in IEEE binary32/64,
775// right-shifted so that the LSB is bit zero. Example: 0xFF for float.
776// This is expressed as a signed integer for more efficient comparison.
777template <typename T>
779 return (MakeSigned<T>{1} << ExponentBits<T>()) - 1;
780}
781
782//------------------------------------------------------------------------------
783// Helper functions
784
// Returns a / b rounded up, computed by adding b - 1 before the truncating
// division. The intermediate sum a + b - 1 is not checked for overflow.
template <typename T1, typename T2>
inline constexpr T1 DivCeil(T1 a, T2 b) {
  return (a + b - 1) / b;
}
789
790// Works for any `align`; if a power of two, compiler emits ADD+AND.
791constexpr inline size_t RoundUpTo(size_t what, size_t align) {
792 return DivCeil(what, align) * align;
793}
794
795// Undefined results for x == 0.
796HWY_API size_t Num0BitsBelowLS1Bit_Nonzero32(const uint32_t x) {
797#if HWY_COMPILER_MSVC
798 unsigned long index; // NOLINT
799 _BitScanForward(&index, x);
800 return index;
801#else // HWY_COMPILER_MSVC
802 return static_cast<size_t>(__builtin_ctz(x));
803#endif // HWY_COMPILER_MSVC
804}
805
806HWY_API size_t Num0BitsBelowLS1Bit_Nonzero64(const uint64_t x) {
807#if HWY_COMPILER_MSVC
808#if HWY_ARCH_X86_64
809 unsigned long index; // NOLINT
810 _BitScanForward64(&index, x);
811 return index;
812#else // HWY_ARCH_X86_64
813 // _BitScanForward64 not available
814 uint32_t lsb = static_cast<uint32_t>(x & 0xFFFFFFFF);
815 unsigned long index; // NOLINT
816 if (lsb == 0) {
817 uint32_t msb = static_cast<uint32_t>(x >> 32u);
818 _BitScanForward(&index, msb);
819 return 32 + index;
820 } else {
821 _BitScanForward(&index, lsb);
822 return index;
823 }
824#endif // HWY_ARCH_X86_64
825#else // HWY_COMPILER_MSVC
826 return static_cast<size_t>(__builtin_ctzll(x));
827#endif // HWY_COMPILER_MSVC
828}
829
830// Undefined results for x == 0.
831HWY_API size_t Num0BitsAboveMS1Bit_Nonzero32(const uint32_t x) {
832#if HWY_COMPILER_MSVC
833 unsigned long index; // NOLINT
834 _BitScanReverse(&index, x);
835 return 31 - index;
836#else // HWY_COMPILER_MSVC
837 return static_cast<size_t>(__builtin_clz(x));
838#endif // HWY_COMPILER_MSVC
839}
840
841HWY_API size_t Num0BitsAboveMS1Bit_Nonzero64(const uint64_t x) {
842#if HWY_COMPILER_MSVC
843#if HWY_ARCH_X86_64
844 unsigned long index; // NOLINT
845 _BitScanReverse64(&index, x);
846 return 63 - index;
847#else // HWY_ARCH_X86_64
848 // _BitScanReverse64 not available
849 const uint32_t msb = static_cast<uint32_t>(x >> 32u);
850 unsigned long index; // NOLINT
851 if (msb == 0) {
852 const uint32_t lsb = static_cast<uint32_t>(x & 0xFFFFFFFF);
853 _BitScanReverse(&index, lsb);
854 return 63 - index;
855 } else {
856 _BitScanReverse(&index, msb);
857 return 31 - index;
858 }
859#endif // HWY_ARCH_X86_64
860#else // HWY_COMPILER_MSVC
861 return static_cast<size_t>(__builtin_clzll(x));
862#endif // HWY_COMPILER_MSVC
863}
864
865HWY_API size_t PopCount(uint64_t x) {
866#if HWY_COMPILER_GCC // includes clang
867 return static_cast<size_t>(__builtin_popcountll(x));
868 // This instruction has a separate feature flag, but is often called from
869 // non-SIMD code, so we don't want to require dynamic dispatch. It was first
870 // supported by Intel in Nehalem (SSE4.2), but MSVC only predefines a macro
871 // for AVX, so check for that.
872#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64 && defined(__AVX__)
873 return _mm_popcnt_u64(x);
874#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_32 && defined(__AVX__)
875 return _mm_popcnt_u32(static_cast<uint32_t>(x & 0xFFFFFFFFu)) +
876 _mm_popcnt_u32(static_cast<uint32_t>(x >> 32));
877#else
878 x -= ((x >> 1) & 0x5555555555555555ULL);
879 x = (((x >> 2) & 0x3333333333333333ULL) + (x & 0x3333333333333333ULL));
880 x = (((x >> 4) + x) & 0x0F0F0F0F0F0F0F0FULL);
881 x += (x >> 8);
882 x += (x >> 16);
883 x += (x >> 32);
884 return static_cast<size_t>(x & 0x7Fu);
885#endif
886}
887
888// Skip HWY_API due to GCC "function not considered for inlining". Previously
889// such errors were caused by underlying type mismatches, but it's not clear
890// what is still mismatched despite all the casts.
// Returns floor(log2(x)) by counting how many times x can be halved before it
// reaches 1. Written as a single recursive return so that it remains a valid
// constexpr function. x must be nonzero, otherwise the recursion never ends.
template <typename TI>
/*HWY_API*/ constexpr size_t FloorLog2(TI x) {
  return x != TI{1}
             ? static_cast<size_t>(1 + FloorLog2(static_cast<TI>(x >> 1)))
             : size_t{0};
}
897
// Returns ceil(log2(x)): zero for x == 1, otherwise one more than
// FloorLog2(x - 1). x must be nonzero.
template <typename TI>
/*HWY_API*/ constexpr size_t CeilLog2(TI x) {
  return x != TI{1}
             ? static_cast<size_t>(1 + FloorLog2(static_cast<TI>(x - 1)))
             : size_t{0};
}
904
905template <typename T>
906HWY_INLINE constexpr T AddWithWraparound(hwy::FloatTag /*tag*/, T t, size_t n) {
907 return t + static_cast<T>(n);
908}
909
910template <typename T>
912 size_t n) {
913 using TU = MakeUnsigned<T>;
914 return static_cast<T>(
915 static_cast<TU>(static_cast<TU>(t) + static_cast<TU>(n)) &
916 hwy::LimitsMax<TU>());
917}
918
919#if HWY_COMPILER_MSVC && HWY_ARCH_X86_64
920#pragma intrinsic(_umul128)
921#endif
922
923// 64 x 64 = 128 bit multiplication
924HWY_API uint64_t Mul128(uint64_t a, uint64_t b, uint64_t* HWY_RESTRICT upper) {
925#if defined(__SIZEOF_INT128__)
926 __uint128_t product = (__uint128_t)a * (__uint128_t)b;
927 *upper = (uint64_t)(product >> 64);
928 return (uint64_t)(product & 0xFFFFFFFFFFFFFFFFULL);
929#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64
930 return _umul128(a, b, upper);
931#else
932 constexpr uint64_t kLo32 = 0xFFFFFFFFU;
933 const uint64_t lo_lo = (a & kLo32) * (b & kLo32);
934 const uint64_t hi_lo = (a >> 32) * (b & kLo32);
935 const uint64_t lo_hi = (a & kLo32) * (b >> 32);
936 const uint64_t hi_hi = (a >> 32) * (b >> 32);
937 const uint64_t t = (lo_lo >> 32) + (hi_lo & kLo32) + lo_hi;
938 *upper = (hi_lo >> 32) + (t >> 32) + hi_hi;
939 return (t << 32) | (lo_lo & kLo32);
940#endif
941}
942
943#if HWY_COMPILER_MSVC
944#pragma intrinsic(memcpy)
945#pragma intrinsic(memset)
946#endif
947
948// The source/destination must not overlap/alias.
949template <size_t kBytes, typename From, typename To>
950HWY_API void CopyBytes(const From* from, To* to) {
951#if HWY_COMPILER_MSVC
952 memcpy(to, from, kBytes);
953#else
954 __builtin_memcpy(
955 static_cast<void*>(to), static_cast<const void*>(from), kBytes);
956#endif
957}
958
959// Same as CopyBytes, but for same-sized objects; avoids a size argument.
960template <typename From, typename To>
961HWY_API void CopySameSize(const From* HWY_RESTRICT from, To* HWY_RESTRICT to) {
962 static_assert(sizeof(From) == sizeof(To), "");
963 CopyBytes<sizeof(From)>(from, to);
964}
965
966template <size_t kBytes, typename To>
967HWY_API void ZeroBytes(To* to) {
968#if HWY_COMPILER_MSVC
969 memset(to, 0, kBytes);
970#else
971 __builtin_memset(to, 0, kBytes);
972#endif
973}
974
976 uint32_t bits = bf.bits;
977 bits <<= 16;
978 float f;
979 CopySameSize(&bits, &f);
980 return f;
981}
982
984 uint32_t bits;
985 CopySameSize(&f, &bits);
986 bfloat16_t bf;
987 bf.bits = static_cast<uint16_t>(bits >> 16);
988 return bf;
989}
990
992 Abort(const char* file, int line, const char* format, ...);
993
994} // namespace hwy
995
996#endif // HIGHWAY_HWY_BASE_H_
#define HWY_RESTRICT
Definition: base.h:64
#define HWY_NORETURN
Definition: base.h:74
#define HWY_API
Definition: base.h:129
#define HWY_INLINE
Definition: base.h:70
#define HWY_MAYBE_UNUSED
Definition: base.h:82
#define HWY_DLLEXPORT
Definition: highway_export.h:13
Definition: aligned_allocator.h:27
HWY_API void CopyBytes(const From *from, To *to)
Definition: base.h:950
HWY_INLINE constexpr T AddWithWraparound(hwy::FloatTag, T t, size_t n)
Definition: base.h:906
constexpr T MantissaEnd()
Definition: base.h:753
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero64(const uint64_t x)
Definition: base.h:806
constexpr MakeSigned< T > MaxExponentTimes2()
Definition: base.h:728
constexpr MakeUnsigned< T > MantissaMask()
Definition: base.h:746
HWY_API constexpr T Epsilon()
Definition: base.h:698
HWY_API float F32FromBF16(bfloat16_t bf)
Definition: base.h:975
HWY_API void ZeroBytes(To *to)
Definition: base.h:967
HWY_API uint64_t Mul128(uint64_t a, uint64_t b, uint64_t *HWY_RESTRICT upper)
Definition: base.h:924
HWY_API bfloat16_t BF16FromF32(float f)
Definition: base.h:983
HWY_API constexpr T LimitsMin()
Definition: base.h:663
typename detail::TypeFromSize< N >::Float FloatFromSize
Definition: base.h:611
HWY_API constexpr T HighestValue()
Definition: base.h:684
typename RemoveConstT< T >::type RemoveConst
Definition: base.h:447
typename detail::TypeFromSize< N >::Unsigned UnsignedFromSize
Definition: base.h:607
constexpr float HighestValue< float >()
Definition: base.h:688
typename detail::TypeFromSize< N >::Signed SignedFromSize
Definition: base.h:609
constexpr T1 DivCeil(T1 a, T2 b)
Definition: base.h:786
constexpr float MantissaEnd< float >()
Definition: base.h:758
double float64_t
Definition: base.h:303
static HWY_MAYBE_UNUSED bool operator==(const uint128_t &a, const uint128_t &b)
Definition: base.h:339
HWY_API constexpr bool IsSame()
Definition: base.h:396
constexpr bool IsSigned< bfloat16_t >()
Definition: base.h:650
HWY_API constexpr bool IsSigned()
Definition: base.h:642
HWY_API void CopySameSize(const From *HWY_RESTRICT from, To *HWY_RESTRICT to)
Definition: base.h:961
constexpr size_t FloorLog2(TI x)
Definition: base.h:892
constexpr MakeUnsigned< T > ExponentMask()
Definition: base.h:740
HWY_API size_t Num0BitsAboveMS1Bit_Nonzero32(const uint32_t x)
Definition: base.h:831
constexpr bool IsSigned< float16_t >()
Definition: base.h:646
constexpr double HighestValue< double >()
Definition: base.h:692
constexpr auto IsFloatTag() -> hwy::SizeTag<(R::is_float ? 0x200 :0x400)>
Definition: base.h:627
constexpr int MantissaBits< double >()
Definition: base.h:721
constexpr auto TypeTag() -> hwy::SizeTag<((R::is_signed+R::is_float)<< 8)>
Definition: base.h:619
typename EnableIfT< Condition >::type EnableIf
Definition: base.h:383
static HWY_MAYBE_UNUSED bool operator>(const uint128_t &a, const uint128_t &b)
Definition: base.h:335
float float32_t
Definition: base.h:302
HWY_API size_t PopCount(uint64_t x)
Definition: base.h:865
constexpr double MantissaEnd< double >()
Definition: base.h:762
constexpr int MantissaBits()
Definition: base.h:712
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero32(const uint32_t x)
Definition: base.h:796
constexpr float LowestValue< float >()
Definition: base.h:675
constexpr MakeSigned< T > MaxExponentField()
Definition: base.h:778
constexpr size_t CeilLog2(TI x)
Definition: base.h:899
HWY_API size_t Num0BitsAboveMS1Bit_Nonzero64(const uint64_t x)
Definition: base.h:841
constexpr MakeUnsigned< T > SignMask()
Definition: base.h:734
constexpr double LowestValue< double >()
Definition: base.h:679
static HWY_MAYBE_UNUSED bool operator<(const uint128_t &a, const uint128_t &b)
Definition: base.h:330
HWY_API constexpr T LowestValue()
Definition: base.h:671
constexpr float Epsilon< float >()
Definition: base.h:702
static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize
Definition: base.h:256
typename detail::Relations< T >::Unsigned MakeUnsigned
Definition: base.h:593
typename detail::Relations< T >::Wide MakeWide
Definition: base.h:601
typename detail::Relations< T >::Float MakeFloat
Definition: base.h:597
HWY_API constexpr bool IsFloat()
Definition: base.h:635
typename detail::Relations< T >::Signed MakeSigned
Definition: base.h:595
constexpr int MantissaBits< float >()
Definition: base.h:717
constexpr double Epsilon< double >()
Definition: base.h:706
HWY_DLLEXPORT HWY_NORETURN void HWY_FORMAT(3, 4) Abort(const char *file, int line, const char *format,...)
Definition: base.h:992
HWY_API constexpr T LimitsMax()
Definition: base.h:656
constexpr size_t RoundUpTo(size_t what, size_t align)
Definition: base.h:791
typename detail::Relations< T >::Narrow MakeNarrow
Definition: base.h:603
constexpr int ExponentBits()
Definition: base.h:769
void type
Definition: base.h:379
Definition: base.h:376
Definition: base.h:386
@ value
Definition: base.h:387
Definition: base.h:323
uint32_t value
Definition: base.h:324
uint32_t key
Definition: base.h:325
Definition: base.h:316
uint64_t value
Definition: base.h:317
uint64_t key
Definition: base.h:318
T type
Definition: base.h:443
Definition: base.h:438
T type
Definition: base.h:439
Definition: base.h:435
Definition: base.h:296
uint16_t bits
Definition: base.h:297
int16_t Signed
Definition: base.h:538
float Wide
Definition: base.h:539
uint16_t Unsigned
Definition: base.h:537
double Float
Definition: base.h:555
uint64_t Unsigned
Definition: base.h:553
int64_t Signed
Definition: base.h:554
float Narrow
Definition: base.h:556
int16_t Signed
Definition: base.h:530
float Wide
Definition: base.h:532
uint16_t Unsigned
Definition: base.h:529
uint32_t Unsigned
Definition: base.h:544
double Wide
Definition: base.h:547
float Float
Definition: base.h:546
int32_t Signed
Definition: base.h:545
uint16_t Unsigned
Definition: base.h:480
int16_t Signed
Definition: base.h:481
int32_t Wide
Definition: base.h:482
int8_t Narrow
Definition: base.h:483
uint32_t Unsigned
Definition: base.h:497
int64_t Wide
Definition: base.h:500
float Float
Definition: base.h:499
int16_t Narrow
Definition: base.h:501
int32_t Signed
Definition: base.h:498
int32_t Narrow
Definition: base.h:518
double Float
Definition: base.h:517
uint64_t Unsigned
Definition: base.h:515
int64_t Signed
Definition: base.h:516
int16_t Wide
Definition: base.h:467
int8_t Signed
Definition: base.h:466
uint8_t Unsigned
Definition: base.h:465
uint64_t Narrow
Definition: base.h:524
uint8_t Narrow
Definition: base.h:475
int16_t Signed
Definition: base.h:473
uint32_t Wide
Definition: base.h:474
uint16_t Unsigned
Definition: base.h:472
uint32_t Unsigned
Definition: base.h:488
uint64_t Wide
Definition: base.h:491
uint16_t Narrow
Definition: base.h:492
float Float
Definition: base.h:490
int32_t Signed
Definition: base.h:489
uint32_t Narrow
Definition: base.h:510
int64_t Signed
Definition: base.h:507
uint64_t Unsigned
Definition: base.h:506
double Float
Definition: base.h:508
int8_t Signed
Definition: base.h:459
uint8_t Unsigned
Definition: base.h:458
uint16_t Wide
Definition: base.h:460
Definition: base.h:455
int8_t Signed
Definition: base.h:565
uint8_t Unsigned
Definition: base.h:564
int16_t Signed
Definition: base.h:570
uint16_t Unsigned
Definition: base.h:569
int32_t Signed
Definition: base.h:575
uint32_t Unsigned
Definition: base.h:574
float Float
Definition: base.h:576
double Float
Definition: base.h:582
int64_t Signed
Definition: base.h:581
uint64_t Unsigned
Definition: base.h:580
Definition: base.h:561
Definition: base.h:291
uint16_t bits
Definition: base.h:292
Definition: base.h:309
uint64_t lo
Definition: base.h:310
uint64_t hi
Definition: base.h:311