Grok 10.0.5
emu128-inl.h
// Copyright 2022 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Single-element vectors and operations.
// External include guard in highway.h - see comment there.

#include <stddef.h>
#include <stdint.h>
#include <cmath>  // std::abs, std::isnan

#include "hwy/base.h"
#include "hwy/ops/shared-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

template <typename T>
using Full128 = Simd<T, 16 / sizeof(T), 0>;

// (Wrapper class required for overloading comparison operators.)
template <typename T, size_t N = 16 / sizeof(T)>
struct Vec128 {
  using PrivateT = T;                     // only for DFromV
  static constexpr size_t kPrivateN = N;  // only for DFromV

  HWY_INLINE Vec128() = default;
  Vec128(const Vec128&) = default;
  Vec128& operator=(const Vec128&) = default;

  HWY_INLINE Vec128& operator*=(const Vec128 other) {
    return *this = (*this * other);
  }
  HWY_INLINE Vec128& operator/=(const Vec128 other) {
    return *this = (*this / other);
  }
  HWY_INLINE Vec128& operator+=(const Vec128 other) {
    return *this = (*this + other);
  }
  HWY_INLINE Vec128& operator-=(const Vec128 other) {
    return *this = (*this - other);
  }
  HWY_INLINE Vec128& operator&=(const Vec128 other) {
    return *this = (*this & other);
  }
  HWY_INLINE Vec128& operator|=(const Vec128 other) {
    return *this = (*this | other);
  }
  HWY_INLINE Vec128& operator^=(const Vec128 other) {
    return *this = (*this ^ other);
  }

  // Behave like wasm128 (vectors can always hold 128 bits). generic_ops-inl.h
  // relies on this for LoadInterleaved*. CAVEAT: this method of padding
  // prevents using range for, especially in SumOfLanes, where it would be
  // incorrect. Moving padding to another field would require handling the case
  // where N = 16 / sizeof(T) (i.e. there is no padding), which is also awkward.
  T raw[16 / sizeof(T)] = {};
};
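
// Editor's note - a minimal sketch (not part of the original header) of why
// the padding caveat above matters: `raw` always has 16 / sizeof(T) elements,
// so reductions must loop over the logical lane count N rather than range-for
// over `raw`, whose padding elements are not maintained by the N-lane loops.
template <typename T, size_t N>
T SumOfLanesSketch(const Vec128<T, N> v) {
  T sum{0};
  for (size_t i = 0; i < N; ++i) {  // NOT for (T x : v.raw): visits padding
    sum = static_cast<T>(sum + v.raw[i]);
  }
  return sum;
}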

// 0 or FF..FF, same size as Vec128.
template <typename T, size_t N = 16 / sizeof(T)>
struct Mask128 {
  using Raw = hwy::MakeUnsigned<T>;
  static HWY_INLINE Raw FromBool(bool b) {
    return b ? static_cast<Raw>(~Raw{0}) : 0;
  }

  // Must match the size of Vec128.
  Raw bits[16 / sizeof(T)] = {};
};

template <class V>
using DFromV = Simd<typename V::PrivateT, V::kPrivateN, 0>;

template <class V>
using TFromV = typename V::PrivateT;

// ------------------------------ BitCast

template <typename T, size_t N, typename FromT, size_t FromN>
HWY_API Vec128<T, N> BitCast(Simd<T, N, 0> /* tag */, Vec128<FromT, FromN> v) {
  Vec128<T, N> to;
  CopySameSize(&v, &to);
  return to;
}

// ------------------------------ Set

template <typename T, size_t N>
HWY_API Vec128<T, N> Zero(Simd<T, N, 0> /* tag */) {
  Vec128<T, N> v;
  ZeroBytes<sizeof(T) * N>(v.raw);
  return v;
}

template <class D>
using VFromD = decltype(Zero(D()));

template <typename T, size_t N, typename T2>
HWY_API Vec128<T, N> Set(Simd<T, N, 0> /* tag */, const T2 t) {
  Vec128<T, N> v;
  for (size_t i = 0; i < N; ++i) {
    v.raw[i] = static_cast<T>(t);
  }
  return v;
}

template <typename T, size_t N>
HWY_API Vec128<T, N> Undefined(Simd<T, N, 0> d) {
  return Zero(d);
}

template <typename T, size_t N, typename T2>
HWY_API Vec128<T, N> Iota(const Simd<T, N, 0> /* tag */, T2 first) {
  Vec128<T, N> v;
  for (size_t i = 0; i < N; ++i) {
    v.raw[i] =
        AddWithWraparound(hwy::IsFloatTag<T>(), static_cast<T>(first), i);
  }
  return v;
}

// ================================================== LOGICAL

// ------------------------------ Not
template <typename T, size_t N>
HWY_API Vec128<T, N> Not(const Vec128<T, N> v) {
  const Simd<T, N, 0> d;
  const RebindToUnsigned<decltype(d)> du;
  using TU = TFromD<decltype(du)>;
  VFromD<decltype(du)> vu = BitCast(du, v);
  for (size_t i = 0; i < N; ++i) {
    vu.raw[i] = static_cast<TU>(~vu.raw[i]);
  }
  return BitCast(d, vu);
}

// ------------------------------ And
template <typename T, size_t N>
HWY_API Vec128<T, N> And(const Vec128<T, N> a, const Vec128<T, N> b) {
  const Simd<T, N, 0> d;
  const RebindToUnsigned<decltype(d)> du;
  auto au = BitCast(du, a);
  auto bu = BitCast(du, b);
  for (size_t i = 0; i < N; ++i) {
    au.raw[i] &= bu.raw[i];
  }
  return BitCast(d, au);
}
template <typename T, size_t N>
HWY_API Vec128<T, N> operator&(const Vec128<T, N> a, const Vec128<T, N> b) {
  return And(a, b);
}

// ------------------------------ AndNot
template <typename T, size_t N>
HWY_API Vec128<T, N> AndNot(const Vec128<T, N> a, const Vec128<T, N> b) {
  return And(Not(a), b);
}

// ------------------------------ Or
template <typename T, size_t N>
HWY_API Vec128<T, N> Or(const Vec128<T, N> a, const Vec128<T, N> b) {
  const Simd<T, N, 0> d;
  const RebindToUnsigned<decltype(d)> du;
  auto au = BitCast(du, a);
  auto bu = BitCast(du, b);
  for (size_t i = 0; i < N; ++i) {
    au.raw[i] |= bu.raw[i];
  }
  return BitCast(d, au);
}
template <typename T, size_t N>
HWY_API Vec128<T, N> operator|(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Or(a, b);
}

// ------------------------------ Xor
template <typename T, size_t N>
HWY_API Vec128<T, N> Xor(const Vec128<T, N> a, const Vec128<T, N> b) {
  const Simd<T, N, 0> d;
  const RebindToUnsigned<decltype(d)> du;
  auto au = BitCast(du, a);
  auto bu = BitCast(du, b);
  for (size_t i = 0; i < N; ++i) {
    au.raw[i] ^= bu.raw[i];
  }
  return BitCast(d, au);
}
template <typename T, size_t N>
HWY_API Vec128<T, N> operator^(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Xor(a, b);
}

// ------------------------------ Xor3

template <typename T, size_t N>
HWY_API Vec128<T, N> Xor3(Vec128<T, N> x1, Vec128<T, N> x2, Vec128<T, N> x3) {
  return Xor(x1, Xor(x2, x3));
}

// ------------------------------ Or3

template <typename T, size_t N>
HWY_API Vec128<T, N> Or3(Vec128<T, N> o1, Vec128<T, N> o2, Vec128<T, N> o3) {
  return Or(o1, Or(o2, o3));
}

// ------------------------------ OrAnd
template <typename T, size_t N>
HWY_API Vec128<T, N> OrAnd(const Vec128<T, N> o, const Vec128<T, N> a1,
                           const Vec128<T, N> a2) {
  return Or(o, And(a1, a2));
}

// ------------------------------ IfVecThenElse
template <typename T, size_t N>
HWY_API Vec128<T, N> IfVecThenElse(Vec128<T, N> mask, Vec128<T, N> yes,
                                   Vec128<T, N> no) {
  return Or(And(mask, yes), AndNot(mask, no));
}

// ------------------------------ CopySign
template <typename T, size_t N>
HWY_API Vec128<T, N> CopySign(const Vec128<T, N> magn,
                              const Vec128<T, N> sign) {
  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
  const auto msb = SignBit(Simd<T, N, 0>());
  return Or(AndNot(msb, magn), And(msb, sign));
}

template <typename T, size_t N>
HWY_API Vec128<T, N> CopySignToAbs(const Vec128<T, N> abs,
                                   const Vec128<T, N> sign) {
  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
  return Or(abs, And(SignBit(Simd<T, N, 0>()), sign));
}

// ------------------------------ BroadcastSignBit
template <typename T, size_t N>
HWY_API Vec128<T, N> BroadcastSignBit(Vec128<T, N> v) {
  // This is used inside ShiftRight, so we cannot implement in terms of it.
  for (size_t i = 0; i < N; ++i) {
    v.raw[i] = v.raw[i] < 0 ? T(-1) : T(0);
  }
  return v;
}

// ------------------------------ Mask

template <typename TFrom, typename TTo, size_t N>
HWY_API Mask128<TTo, N> RebindMask(Simd<TTo, N, 0> /*tag*/,
                                   Mask128<TFrom, N> mask) {
  Mask128<TTo, N> to;
  CopySameSize(&mask, &to);
  return to;
}

// v must be 0 or FF..FF.
template <typename T, size_t N>
HWY_API Mask128<T, N> MaskFromVec(const Vec128<T, N> v) {
  Mask128<T, N> mask;
  CopySameSize(&v, &mask);
  return mask;
}

template <typename T, size_t N>
HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> mask) {
  Vec128<T, N> v;
  CopySameSize(&mask, &v);
  return v;
}

template <typename T, size_t N>
HWY_API Vec128<T, N> VecFromMask(Simd<T, N, 0> /* tag */,
                                 const Mask128<T, N> mask) {
  return VecFromMask(mask);
}

template <typename T, size_t N>
HWY_API Mask128<T, N> FirstN(Simd<T, N, 0> /*tag*/, size_t n) {
  Mask128<T, N> m;
  for (size_t i = 0; i < N; ++i) {
    m.bits[i] = Mask128<T, N>::FromBool(i < n);
  }
  return m;
}

// Returns mask ? yes : no.
template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenElse(const Mask128<T, N> mask,
                                const Vec128<T, N> yes,
                                const Vec128<T, N> no) {
  return IfVecThenElse(VecFromMask(mask), yes, no);
}

template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenElseZero(const Mask128<T, N> mask,
                                    const Vec128<T, N> yes) {
  return IfVecThenElse(VecFromMask(mask), yes, Zero(Simd<T, N, 0>()));
}

template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenZeroElse(const Mask128<T, N> mask,
                                    const Vec128<T, N> no) {
  return IfVecThenElse(VecFromMask(mask), Zero(Simd<T, N, 0>()), no);
}

template <typename T, size_t N>
HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
                                        Vec128<T, N> no) {
  for (size_t i = 0; i < N; ++i) {
    v.raw[i] = v.raw[i] < 0 ? yes.raw[i] : no.raw[i];
  }
  return v;
}

template <typename T, size_t N>
HWY_API Vec128<T, N> ZeroIfNegative(const Vec128<T, N> v) {
  return IfNegativeThenElse(v, Zero(Simd<T, N, 0>()), v);
}

// ------------------------------ Mask logical

template <typename T, size_t N>
HWY_API Mask128<T, N> Not(const Mask128<T, N> m) {
  return MaskFromVec(Not(VecFromMask(Simd<T, N, 0>(), m)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> And(const Mask128<T, N> a, Mask128<T, N> b) {
  const Simd<T, N, 0> d;
  return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> AndNot(const Mask128<T, N> a, Mask128<T, N> b) {
  const Simd<T, N, 0> d;
  return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> Or(const Mask128<T, N> a, Mask128<T, N> b) {
  const Simd<T, N, 0> d;
  return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) {
  const Simd<T, N, 0> d;
  return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> ExclusiveNeither(const Mask128<T, N> a, Mask128<T, N> b) {
  const Simd<T, N, 0> d;
  return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b))));
}

// ================================================== SHIFTS

// ------------------------------ ShiftLeft/ShiftRight (BroadcastSignBit)

template <int kBits, typename T, size_t N>
HWY_API Vec128<T, N> ShiftLeft(Vec128<T, N> v) {
  static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
  for (size_t i = 0; i < N; ++i) {
    const auto shifted = static_cast<hwy::MakeUnsigned<T>>(v.raw[i]) << kBits;
    v.raw[i] = static_cast<T>(shifted);
  }
  return v;
}

template <int kBits, typename T, size_t N>
HWY_API Vec128<T, N> ShiftRight(Vec128<T, N> v) {
  static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
#if __cplusplus >= 202002L
  // Signed right shift is now guaranteed to be arithmetic (rounding toward
  // negative infinity, i.e. shifting in the sign bit).
  for (size_t i = 0; i < N; ++i) {
    v.raw[i] = static_cast<T>(v.raw[i] >> kBits);
  }
#else
  if (IsSigned<T>()) {
    // Emulate arithmetic shift using only logical (unsigned) shifts, because
    // signed shifts are still implementation-defined.
    using TU = hwy::MakeUnsigned<T>;
    for (size_t i = 0; i < N; ++i) {
      const TU shifted = static_cast<TU>(static_cast<TU>(v.raw[i]) >> kBits);
      const TU sign = v.raw[i] < 0 ? static_cast<TU>(~TU{0}) : 0;
      const size_t sign_shift =
          static_cast<size_t>(static_cast<int>(sizeof(TU)) * 8 - 1 - kBits);
      const TU upper = static_cast<TU>(sign << sign_shift);
      v.raw[i] = static_cast<T>(shifted | upper);
    }
  } else {  // T is unsigned
    for (size_t i = 0; i < N; ++i) {
      v.raw[i] = static_cast<T>(v.raw[i] >> kBits);
    }
  }
#endif
  return v;
}
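
// Editor's worked example (not in the original source): for T = int8_t,
// v.raw[i] = -6 (0xFA) and kBits = 1, the emulation path computes
//   shifted    = 0xFA >> 1      = 0x7D
//   sign       = 0xFF           (because -6 < 0)
//   sign_shift = 8 - 1 - 1      = 6
//   upper      = 0xFF << 6      = 0xC0
//   result     = 0x7D | 0xC0    = 0xFD = -3,
// which matches the arithmetic shift -6 >> 1 = -3.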

// ------------------------------ RotateRight (ShiftRight)

namespace detail {

// For partial specialization: kBits == 0 results in an invalid shift count
template <int kBits>
struct RotateRight {
  template <typename T, size_t N>
  HWY_INLINE Vec128<T, N> operator()(const Vec128<T, N> v) const {
    return Or(ShiftRight<kBits>(v), ShiftLeft<sizeof(T) * 8 - kBits>(v));
  }
};

template <>
struct RotateRight<0> {
  template <typename T, size_t N>
  HWY_INLINE Vec128<T, N> operator()(const Vec128<T, N> v) const {
    return v;
  }
};

}  // namespace detail

template <int kBits, typename T, size_t N>
HWY_API Vec128<T, N> RotateRight(const Vec128<T, N> v) {
  static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
  return detail::RotateRight<kBits>()(v);
}
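
// Editor's note (not in the original source): the RotateRight<0>
// specialization exists because the generic case would instantiate
// ShiftLeft<sizeof(T) * 8 - kBits>; for kBits == 0 that is a shift by the
// full bit width, which the static_assert in ShiftLeft correctly rejects.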

// ------------------------------ ShiftLeftSame

template <typename T, size_t N>
HWY_API Vec128<T, N> ShiftLeftSame(Vec128<T, N> v, int bits) {
  for (size_t i = 0; i < N; ++i) {
    const auto shifted = static_cast<hwy::MakeUnsigned<T>>(v.raw[i]) << bits;
    v.raw[i] = static_cast<T>(shifted);
  }
  return v;
}

template <typename T, size_t N>
HWY_API Vec128<T, N> ShiftRightSame(Vec128<T, N> v, int bits) {
#if __cplusplus >= 202002L
  // Signed right shift is now guaranteed to be arithmetic (rounding toward
  // negative infinity, i.e. shifting in the sign bit).
  for (size_t i = 0; i < N; ++i) {
    v.raw[i] = static_cast<T>(v.raw[i] >> bits);
  }
#else
  if (IsSigned<T>()) {
    // Emulate arithmetic shift using only logical (unsigned) shifts, because
    // signed shifts are still implementation-defined.
    using TU = hwy::MakeUnsigned<T>;
    for (size_t i = 0; i < N; ++i) {
      const TU shifted = static_cast<TU>(static_cast<TU>(v.raw[i]) >> bits);
      const TU sign = v.raw[i] < 0 ? static_cast<TU>(~TU{0}) : 0;
      const size_t sign_shift =
          static_cast<size_t>(static_cast<int>(sizeof(TU)) * 8 - 1 - bits);
      const TU upper = static_cast<TU>(sign << sign_shift);
      v.raw[i] = static_cast<T>(shifted | upper);
    }
  } else {
    for (size_t i = 0; i < N; ++i) {
      v.raw[i] = static_cast<T>(v.raw[i] >> bits);  // unsigned, logical shift
    }
  }
#endif
  return v;
}

// ------------------------------ Shl

template <typename T, size_t N>
HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, const Vec128<T, N> bits) {
  for (size_t i = 0; i < N; ++i) {
    const auto shifted = static_cast<hwy::MakeUnsigned<T>>(v.raw[i])
                         << bits.raw[i];
    v.raw[i] = static_cast<T>(shifted);
  }
  return v;
}

template <typename T, size_t N>
HWY_API Vec128<T, N> operator>>(Vec128<T, N> v, const Vec128<T, N> bits) {
#if __cplusplus >= 202002L
  // Signed right shift is now guaranteed to be arithmetic (rounding toward
  // negative infinity, i.e. shifting in the sign bit).
  for (size_t i = 0; i < N; ++i) {
    v.raw[i] = static_cast<T>(v.raw[i] >> bits.raw[i]);
  }
#else
  if (IsSigned<T>()) {
    // Emulate arithmetic shift using only logical (unsigned) shifts, because
    // signed shifts are still implementation-defined.
    using TU = hwy::MakeUnsigned<T>;
    for (size_t i = 0; i < N; ++i) {
      const TU shifted =
          static_cast<TU>(static_cast<TU>(v.raw[i]) >> bits.raw[i]);
      const TU sign = v.raw[i] < 0 ? static_cast<TU>(~TU{0}) : 0;
      const size_t sign_shift = static_cast<size_t>(
          static_cast<int>(sizeof(TU)) * 8 - 1 - bits.raw[i]);
      const TU upper = static_cast<TU>(sign << sign_shift);
      v.raw[i] = static_cast<T>(shifted | upper);
    }
  } else {  // T is unsigned
    for (size_t i = 0; i < N; ++i) {
      v.raw[i] = static_cast<T>(v.raw[i] >> bits.raw[i]);
    }
  }
#endif
  return v;
}

// ================================================== ARITHMETIC

// Tag dispatch instead of SFINAE for MSVC 2017 compatibility
namespace detail {

template <typename T, size_t N>
HWY_INLINE Vec128<T, N> Add(hwy::NonFloatTag /*tag*/, Vec128<T, N> a,
                            Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    const uint64_t a64 = static_cast<uint64_t>(a.raw[i]);
    const uint64_t b64 = static_cast<uint64_t>(b.raw[i]);
    a.raw[i] = static_cast<T>((a64 + b64) & static_cast<uint64_t>(~T(0)));
  }
  return a;
}
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> Sub(hwy::NonFloatTag /*tag*/, Vec128<T, N> a,
                            Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    const uint64_t a64 = static_cast<uint64_t>(a.raw[i]);
    const uint64_t b64 = static_cast<uint64_t>(b.raw[i]);
    a.raw[i] = static_cast<T>((a64 - b64) & static_cast<uint64_t>(~T(0)));
  }
  return a;
}

template <typename T, size_t N>
HWY_INLINE Vec128<T, N> Add(hwy::FloatTag /*tag*/, Vec128<T, N> a,
                            const Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] += b.raw[i];
  }
  return a;
}

template <typename T, size_t N>
HWY_INLINE Vec128<T, N> Sub(hwy::FloatTag /*tag*/, Vec128<T, N> a,
                            const Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] -= b.raw[i];
  }
  return a;
}

}  // namespace detail

template <typename T, size_t N>
HWY_API Vec128<T, N> operator-(Vec128<T, N> a, const Vec128<T, N> b) {
  return detail::Sub(hwy::IsFloatTag<T>(), a, b);
}
template <typename T, size_t N>
HWY_API Vec128<T, N> operator+(Vec128<T, N> a, const Vec128<T, N> b) {
  return detail::Add(hwy::IsFloatTag<T>(), a, b);
}

// ------------------------------ SumsOf8

template <size_t N>
HWY_API Vec128<uint64_t, (N + 7) / 8> SumsOf8(const Vec128<uint8_t, N> v) {
  Vec128<uint64_t, (N + 7) / 8> sums;
  for (size_t i = 0; i < N; ++i) {
    sums.raw[i / 8] += v.raw[i];
  }
  return sums;
}
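
// Editor's example (not in the original source): for a full vector of
// N = 16 uint8_t lanes {1,1,1,1,1,1,1,1, 2,2,2,2,2,2,2,2}, SumsOf8 returns
// two uint64_t lanes {8, 16}: lanes [0, 8) accumulate into sums.raw[0] and
// lanes [8, 16) into sums.raw[1] (sums is zero-initialized by Vec128).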

// ------------------------------ SaturatedAdd
template <typename T, size_t N>
HWY_API Vec128<T, N> SaturatedAdd(Vec128<T, N> a, const Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] = static_cast<T>(
        HWY_MIN(HWY_MAX(hwy::LowestValue<T>(), a.raw[i] + b.raw[i]),
                hwy::HighestValue<T>()));
  }
  return a;
}

// ------------------------------ SaturatedSub
template <typename T, size_t N>
HWY_API Vec128<T, N> SaturatedSub(Vec128<T, N> a, const Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] = static_cast<T>(
        HWY_MIN(HWY_MAX(hwy::LowestValue<T>(), a.raw[i] - b.raw[i]),
                hwy::HighestValue<T>()));
  }
  return a;
}

// ------------------------------ AverageRound
template <typename T, size_t N>
HWY_API Vec128<T, N> AverageRound(Vec128<T, N> a, const Vec128<T, N> b) {
  static_assert(!IsSigned<T>(), "Only for unsigned");
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] = static_cast<T>((a.raw[i] + b.raw[i] + 1) / 2);
  }
  return a;
}

// ------------------------------ Abs

// Tag dispatch instead of SFINAE for MSVC 2017 compatibility
namespace detail {

template <typename T, size_t N>
HWY_INLINE Vec128<T, N> Abs(hwy::SignedTag /*tag*/, Vec128<T, N> a) {
  for (size_t i = 0; i < N; ++i) {
    const T s = a.raw[i];
    const T min = hwy::LimitsMin<T>();
    a.raw[i] = static_cast<T>((s >= 0 || s == min) ? a.raw[i] : -s);
  }
  return a;
}

template <typename T, size_t N>
HWY_INLINE Vec128<T, N> Abs(hwy::FloatTag /*tag*/, Vec128<T, N> v) {
  for (size_t i = 0; i < N; ++i) {
    v.raw[i] = std::abs(v.raw[i]);
  }
  return v;
}

}  // namespace detail

template <typename T, size_t N>
HWY_API Vec128<T, N> Abs(Vec128<T, N> a) {
  return detail::Abs(hwy::TypeTag<T>(), a);
}

// ------------------------------ Min/Max

// Tag dispatch instead of SFINAE for MSVC 2017 compatibility
namespace detail {

template <typename T, size_t N>
HWY_INLINE Vec128<T, N> Min(hwy::NonFloatTag /*tag*/, Vec128<T, N> a,
                            const Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] = HWY_MIN(a.raw[i], b.raw[i]);
  }
  return a;
}
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> Max(hwy::NonFloatTag /*tag*/, Vec128<T, N> a,
                            const Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] = HWY_MAX(a.raw[i], b.raw[i]);
  }
  return a;
}

template <typename T, size_t N>
HWY_INLINE Vec128<T, N> Min(hwy::FloatTag /*tag*/, Vec128<T, N> a,
                            const Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    if (std::isnan(a.raw[i])) {
      a.raw[i] = b.raw[i];
    } else if (std::isnan(b.raw[i])) {
      // no change
    } else {
      a.raw[i] = HWY_MIN(a.raw[i], b.raw[i]);
    }
  }
  return a;
}
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> Max(hwy::FloatTag /*tag*/, Vec128<T, N> a,
                            const Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    if (std::isnan(a.raw[i])) {
      a.raw[i] = b.raw[i];
    } else if (std::isnan(b.raw[i])) {
      // no change
    } else {
      a.raw[i] = HWY_MAX(a.raw[i], b.raw[i]);
    }
  }
  return a;
}

}  // namespace detail

template <typename T, size_t N>
HWY_API Vec128<T, N> Min(Vec128<T, N> a, const Vec128<T, N> b) {
  return detail::Min(hwy::IsFloatTag<T>(), a, b);
}

template <typename T, size_t N>
HWY_API Vec128<T, N> Max(Vec128<T, N> a, const Vec128<T, N> b) {
  return detail::Max(hwy::IsFloatTag<T>(), a, b);
}

// ------------------------------ Neg

// Tag dispatch instead of SFINAE for MSVC 2017 compatibility
namespace detail {

template <typename T, size_t N>
HWY_INLINE Vec128<T, N> Neg(hwy::NonFloatTag /*tag*/, Vec128<T, N> v) {
  return Zero(Simd<T, N, 0>()) - v;
}

template <typename T, size_t N>
HWY_INLINE Vec128<T, N> Neg(hwy::FloatTag /*tag*/, Vec128<T, N> v) {
  return Xor(v, SignBit(Simd<T, N, 0>()));
}

}  // namespace detail

template <typename T, size_t N>
HWY_API Vec128<T, N> Neg(Vec128<T, N> v) {
  return detail::Neg(hwy::IsFloatTag<T>(), v);
}

// ------------------------------ Mul/Div

// Tag dispatch instead of SFINAE for MSVC 2017 compatibility
namespace detail {

template <typename T, size_t N>
HWY_INLINE Vec128<T, N> Mul(hwy::FloatTag /*tag*/, Vec128<T, N> a,
                            const Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] *= b.raw[i];
  }
  return a;
}

template <typename T, size_t N>
HWY_INLINE Vec128<T, N> Mul(hwy::SignedTag /*tag*/, Vec128<T, N> a,
                            const Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] = static_cast<T>(static_cast<uint64_t>(a.raw[i]) *
                              static_cast<uint64_t>(b.raw[i]));
  }
  return a;
}

template <typename T, size_t N>
HWY_INLINE Vec128<T, N> Mul(hwy::UnsignedTag /*tag*/, Vec128<T, N> a,
                            const Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] = static_cast<T>(static_cast<uint64_t>(a.raw[i]) *
                              static_cast<uint64_t>(b.raw[i]));
  }
  return a;
}

}  // namespace detail

template <typename T, size_t N>
HWY_API Vec128<T, N> operator*(Vec128<T, N> a, const Vec128<T, N> b) {
  return detail::Mul(hwy::TypeTag<T>(), a, b);
}

template <typename T, size_t N>
HWY_API Vec128<T, N> operator/(Vec128<T, N> a, const Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] /= b.raw[i];
  }
  return a;
}

// Returns the upper 16 bits of a * b in each lane.
template <size_t N>
HWY_API Vec128<int16_t, N> MulHigh(Vec128<int16_t, N> a,
                                   const Vec128<int16_t, N> b) {
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] = static_cast<int16_t>((int32_t{a.raw[i]} * b.raw[i]) >> 16);
  }
  return a;
}
template <size_t N>
HWY_API Vec128<uint16_t, N> MulHigh(Vec128<uint16_t, N> a,
                                    const Vec128<uint16_t, N> b) {
  for (size_t i = 0; i < N; ++i) {
    // Cast to uint32_t first to prevent overflow. Otherwise the result of
    // uint16_t * uint16_t is in "int" which may overflow. In practice the
    // result is the same but this way it is also defined.
    a.raw[i] = static_cast<uint16_t>(
        (static_cast<uint32_t>(a.raw[i]) * static_cast<uint32_t>(b.raw[i])) >>
        16);
  }
  return a;
}

template <size_t N>
HWY_API Vec128<int16_t, N> MulFixedPoint15(Vec128<int16_t, N> a,
                                           Vec128<int16_t, N> b) {
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] = static_cast<int16_t>((2 * a.raw[i] * b.raw[i] + 32768) >> 16);
  }
  return a;
}
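
// Editor's worked example (not in the original source): MulFixedPoint15
// treats int16_t as Q15 fixed point. For a = b = 16384 (0.5 in Q15):
//   (2 * 16384 * 16384 + 32768) >> 16 = (2^29 + 2^15) >> 16 = 8192,
// i.e. 0.25 in Q15; the added 32768 rounds the product to nearest.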

// Multiplies even lanes (0, 2 ..) and returns the double-wide result.
template <size_t N>
HWY_API Vec128<int64_t, (N + 1) / 2> MulEven(const Vec128<int32_t, N> a,
                                             const Vec128<int32_t, N> b) {
  Vec128<int64_t, (N + 1) / 2> mul;
  for (size_t i = 0; i < N; i += 2) {
    const int64_t a64 = a.raw[i];
    mul.raw[i / 2] = a64 * b.raw[i];
  }
  return mul;
}
template <size_t N>
HWY_API Vec128<uint64_t, (N + 1) / 2> MulEven(Vec128<uint32_t, N> a,
                                              const Vec128<uint32_t, N> b) {
  Vec128<uint64_t, (N + 1) / 2> mul;
  for (size_t i = 0; i < N; i += 2) {
    const uint64_t a64 = a.raw[i];
    mul.raw[i / 2] = a64 * b.raw[i];
  }
  return mul;
}

template <size_t N>
HWY_API Vec128<int64_t, (N + 1) / 2> MulOdd(const Vec128<int32_t, N> a,
                                            const Vec128<int32_t, N> b) {
  Vec128<int64_t, (N + 1) / 2> mul;
  for (size_t i = 0; i < N; i += 2) {
    const int64_t a64 = a.raw[i + 1];
    mul.raw[i / 2] = a64 * b.raw[i + 1];
  }
  return mul;
}
template <size_t N>
HWY_API Vec128<uint64_t, (N + 1) / 2> MulOdd(Vec128<uint32_t, N> a,
                                             const Vec128<uint32_t, N> b) {
  Vec128<uint64_t, (N + 1) / 2> mul;
  for (size_t i = 0; i < N; i += 2) {
    const uint64_t a64 = a.raw[i + 1];
    mul.raw[i / 2] = a64 * b.raw[i + 1];
  }
  return mul;
}

template <size_t N>
HWY_API Vec128<float, N> ApproximateReciprocal(Vec128<float, N> v) {
  for (size_t i = 0; i < N; ++i) {
    // Zero inputs are allowed, but callers are responsible for replacing the
    // return value with something else (typically using IfThenElse). This
    // check avoids a ubsan error. The result is arbitrary.
    v.raw[i] = (std::abs(v.raw[i]) == 0.0f) ? 0.0f : 1.0f / v.raw[i];
  }
  return v;
}

template <size_t N>
HWY_API Vec128<float, N> AbsDiff(Vec128<float, N> a, Vec128<float, N> b) {
  return Abs(a - b);
}

// ------------------------------ Floating-point multiply-add variants

template <typename T, size_t N>
HWY_API Vec128<T, N> MulAdd(const Vec128<T, N> mul, const Vec128<T, N> x,
                            const Vec128<T, N> add) {
  return mul * x + add;
}

template <typename T, size_t N>
HWY_API Vec128<T, N> NegMulAdd(const Vec128<T, N> mul, const Vec128<T, N> x,
                               const Vec128<T, N> add) {
  return add - mul * x;
}

template <typename T, size_t N>
HWY_API Vec128<T, N> MulSub(const Vec128<T, N> mul, const Vec128<T, N> x,
                            const Vec128<T, N> sub) {
  return mul * x - sub;
}

template <typename T, size_t N>
HWY_API Vec128<T, N> NegMulSub(const Vec128<T, N> mul, const Vec128<T, N> x,
                               const Vec128<T, N> sub) {
  return Neg(mul) * x - sub;
}

// ------------------------------ Floating-point square root

template <size_t N>
HWY_API Vec128<float, N> ApproximateReciprocalSqrt(Vec128<float, N> v) {
  for (size_t i = 0; i < N; ++i) {
    const float half = v.raw[i] * 0.5f;
    uint32_t bits;
    CopySameSize(&v.raw[i], &bits);
    // Initial guess based on log2(f)
    bits = 0x5F3759DF - (bits >> 1);
    CopySameSize(&bits, &v.raw[i]);
    // One Newton-Raphson iteration
    v.raw[i] = v.raw[i] * (1.5f - (half * v.raw[i] * v.raw[i]));
  }
  return v;
}
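
// Editor's note (not in the original source): this is the classic "fast
// inverse square root" bit trick. For v = 4.0f (bits 0x40800000), the initial
// guess is 0x5F3759DF - 0x20400000 = 0x3EF759DF, roughly 0.483; the single
// Newton-Raphson step then refines it to about 0.4992, within ~0.2% of the
// exact 1/sqrt(4) = 0.5.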

template <typename T, size_t N>
HWY_API Vec128<T, N> Sqrt(Vec128<T, N> v) {
  for (size_t i = 0; i < N; ++i) {
    v.raw[i] = std::sqrt(v.raw[i]);
  }
  return v;
}

// ------------------------------ Floating-point rounding

template <typename T, size_t N>
HWY_API Vec128<T, N> Round(Vec128<T, N> v) {
  using TI = MakeSigned<T>;
  const Vec128<T, N> a = Abs(v);
  for (size_t i = 0; i < N; ++i) {
    if (!(a.raw[i] < MantissaEnd<T>())) {  // Huge or NaN
      continue;
    }
    const T bias = v.raw[i] < T(0.0) ? T(-0.5) : T(0.5);
    const TI rounded = static_cast<TI>(v.raw[i] + bias);
    if (rounded == 0) {
      v.raw[i] = v.raw[i] < 0 ? -T{0} : T{0};  // preserve the sign of zero
      continue;
    }
    const T rounded_f = static_cast<T>(rounded);
    // Round to even
    if ((rounded & 1) && std::abs(rounded_f - v.raw[i]) == T(0.5)) {
      v.raw[i] = static_cast<T>(rounded - (v.raw[i] < T(0) ? -1 : 1));
      continue;
    }
    v.raw[i] = rounded_f;
  }
  return v;
}
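
// Editor's worked example (not in the original source): Round(2.5f) computes
// rounded = static_cast<TI>(2.5 + 0.5) = 3. Since 3 is odd and
// |3.0 - 2.5| == 0.5 exactly, the round-to-even branch adjusts the result to
// 3 - 1 = 2, matching the IEEE round-to-nearest-even convention.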

// Round-to-nearest even.
template <size_t N>
HWY_API Vec128<int32_t, N> NearestInt(const Vec128<float, N> v) {
  using T = float;
  using TI = int32_t;

  const Vec128<float, N> abs = Abs(v);
  Vec128<int32_t, N> ret;
  for (size_t i = 0; i < N; ++i) {
    const bool signbit = std::signbit(v.raw[i]);

    if (!(abs.raw[i] < MantissaEnd<T>())) {  // Huge or NaN
      // Check if too large to cast or NaN
      if (!(abs.raw[i] <= static_cast<T>(LimitsMax<TI>()))) {
        ret.raw[i] = signbit ? LimitsMin<TI>() : LimitsMax<TI>();
        continue;
      }
      ret.raw[i] = static_cast<TI>(v.raw[i]);
      continue;
    }
    const T bias = v.raw[i] < T(0.0) ? T(-0.5) : T(0.5);
    const TI rounded = static_cast<TI>(v.raw[i] + bias);
    if (rounded == 0) {
      ret.raw[i] = 0;
      continue;
    }
    const T rounded_f = static_cast<T>(rounded);
    // Round to even
    if ((rounded & 1) && std::abs(rounded_f - v.raw[i]) == T(0.5)) {
      ret.raw[i] = rounded - (signbit ? -1 : 1);
      continue;
    }
    ret.raw[i] = rounded;
  }
  return ret;
}

template <typename T, size_t N>
HWY_API Vec128<T, N> Trunc(Vec128<T, N> v) {
  using TI = MakeSigned<T>;
  const Vec128<T, N> abs = Abs(v);
  for (size_t i = 0; i < N; ++i) {
    if (!(abs.raw[i] <= MantissaEnd<T>())) {  // Huge or NaN
      continue;
    }
    const TI truncated = static_cast<TI>(v.raw[i]);
    if (truncated == 0) {
      v.raw[i] = v.raw[i] < 0 ? -T{0} : T{0};
      continue;
    }
    v.raw[i] = static_cast<T>(truncated);
  }
  return v;
}

// Toward +infinity, aka ceiling
template <typename Float, size_t N>
HWY_API Vec128<Float, N> Ceil(Vec128<Float, N> v) {
  constexpr int kMantissaBits = MantissaBits<Float>();
  using Bits = MakeUnsigned<Float>;
  const Bits kExponentMask = MaxExponentField<Float>();
  const Bits kMantissaMask = MantissaMask<Float>();
  const Bits kBias = kExponentMask / 2;

  for (size_t i = 0; i < N; ++i) {
    const bool positive = v.raw[i] > Float(0.0);

    Bits bits;
    CopySameSize(&v.raw[i], &bits);

    const int exponent =
        static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
    // Already an integer.
    if (exponent >= kMantissaBits) continue;
    // |v| <= 1 => 0 or 1.
    if (exponent < 0) {
      v.raw[i] = positive ? Float{1} : Float{-0.0};
      continue;
    }

    const Bits mantissa_mask = kMantissaMask >> exponent;
    // Already an integer
    if ((bits & mantissa_mask) == 0) continue;

    // Clear fractional bits and round up
    if (positive) bits += (kMantissaMask + 1) >> exponent;
    bits &= ~mantissa_mask;

    CopySameSize(&bits, &v.raw[i]);
  }
  return v;
}

// Toward -infinity, aka floor
template <typename Float, size_t N>
HWY_API Vec128<Float, N> Floor(Vec128<Float, N> v) {
  constexpr int kMantissaBits = MantissaBits<Float>();
  using Bits = MakeUnsigned<Float>;
  const Bits kExponentMask = MaxExponentField<Float>();
  const Bits kMantissaMask = MantissaMask<Float>();
  const Bits kBias = kExponentMask / 2;

  for (size_t i = 0; i < N; ++i) {
    const bool negative = v.raw[i] < Float(0.0);

    Bits bits;
    CopySameSize(&v.raw[i], &bits);

    const int exponent =
        static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
    // Already an integer.
    if (exponent >= kMantissaBits) continue;
    // |v| <= 1 => -1 or 0.
    if (exponent < 0) {
      v.raw[i] = negative ? Float(-1.0) : Float(0.0);
      continue;
    }

    const Bits mantissa_mask = kMantissaMask >> exponent;
    // Already an integer
    if ((bits & mantissa_mask) == 0) continue;

    // Clear fractional bits and round down
    if (negative) bits += (kMantissaMask + 1) >> exponent;
    bits &= ~mantissa_mask;

    CopySameSize(&bits, &v.raw[i]);
  }
  return v;
}

// ------------------------------ Floating-point classification

template <typename T, size_t N>
HWY_API Mask128<T, N> IsNaN(const Vec128<T, N> v) {
  Mask128<T, N> ret;
  for (size_t i = 0; i < N; ++i) {
    // std::isnan returns false for 0x7F..FF in clang AVX3 builds, so DIY.
    MakeUnsigned<T> bits;
    CopySameSize(&v.raw[i], &bits);
    bits += bits;
    bits >>= 1;  // clear sign bit
    // NaN if all exponent bits are set and the mantissa is not zero.
    ret.bits[i] = Mask128<T, N>::FromBool(bits > ExponentMask<T>());
  }
  return ret;
}

template <typename T, size_t N>
HWY_API Mask128<T, N> IsInf(const Vec128<T, N> v) {
  static_assert(IsFloat<T>(), "Only for float");
  const Simd<T, N, 0> d;
  const RebindToSigned<decltype(d)> di;
  const VFromD<decltype(di)> vi = BitCast(di, v);
  // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
  return RebindMask(d, Eq(Add(vi, vi), Set(di, hwy::MaxExponentTimes2<T>())));
}

// Returns whether normal/subnormal/zero.
template <typename T, size_t N>
HWY_API Mask128<T, N> IsFinite(const Vec128<T, N> v) {
  static_assert(IsFloat<T>(), "Only for float");
  const Simd<T, N, 0> d;
  const RebindToUnsigned<decltype(d)> du;
  const RebindToSigned<decltype(d)> di;  // cheaper than unsigned comparison
  using VI = VFromD<decltype(di)>;
  using VU = VFromD<decltype(du)>;
  const VU vu = BitCast(du, v);
  // 'Shift left' to clear the sign bit, then right so we can compare with the
  // max exponent (cannot compare with MaxExponentTimes2 directly because it is
  // negative and non-negative floats would be greater).
  const VI exp =
      BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(Add(vu, vu)));
  return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField<T>())));
}

// ================================================== COMPARE

template <typename T, size_t N>
HWY_API Mask128<T, N> operator==(const Vec128<T, N> a, const Vec128<T, N> b) {
  Mask128<T, N> m;
  for (size_t i = 0; i < N; ++i) {
    m.bits[i] = Mask128<T, N>::FromBool(a.raw[i] == b.raw[i]);
  }
  return m;
}

template <typename T, size_t N>
HWY_API Mask128<T, N> operator!=(const Vec128<T, N> a, const Vec128<T, N> b) {
  Mask128<T, N> m;
  for (size_t i = 0; i < N; ++i) {
    m.bits[i] = Mask128<T, N>::FromBool(a.raw[i] != b.raw[i]);
  }
  return m;
}

template <typename T, size_t N>
HWY_API Mask128<T, N> TestBit(const Vec128<T, N> v, const Vec128<T, N> bit) {
  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
  return (v & bit) == bit;
}

template <typename T, size_t N>
HWY_API Mask128<T, N> operator<(const Vec128<T, N> a, const Vec128<T, N> b) {
  Mask128<T, N> m;
  for (size_t i = 0; i < N; ++i) {
    m.bits[i] = Mask128<T, N>::FromBool(a.raw[i] < b.raw[i]);
  }
  return m;
}
template <typename T, size_t N>
HWY_API Mask128<T, N> operator>(const Vec128<T, N> a, const Vec128<T, N> b) {
  Mask128<T, N> m;
  for (size_t i = 0; i < N; ++i) {
    m.bits[i] = Mask128<T, N>::FromBool(a.raw[i] > b.raw[i]);
  }
  return m;
}

template <typename T, size_t N>
HWY_API Mask128<T, N> operator<=(const Vec128<T, N> a, const Vec128<T, N> b) {
  Mask128<T, N> m;
  for (size_t i = 0; i < N; ++i) {
    m.bits[i] = Mask128<T, N>::FromBool(a.raw[i] <= b.raw[i]);
  }
  return m;
}
template <typename T, size_t N>
HWY_API Mask128<T, N> operator>=(const Vec128<T, N> a, const Vec128<T, N> b) {
  Mask128<T, N> m;
  for (size_t i = 0; i < N; ++i) {
    m.bits[i] = Mask128<T, N>::FromBool(a.raw[i] >= b.raw[i]);
  }
  return m;
}

// ------------------------------ Lt128

// Only makes sense for full vectors of u64.
HWY_API Mask128<uint64_t> Lt128(Simd<uint64_t, 2, 0> /* tag */,
                                Vec128<uint64_t> a, const Vec128<uint64_t> b) {
  const bool lt =
      (a.raw[1] < b.raw[1]) || (a.raw[1] == b.raw[1] && a.raw[0] < b.raw[0]);
  Mask128<uint64_t> ret;
  ret.bits[0] = ret.bits[1] = Mask128<uint64_t>::FromBool(lt);
  return ret;
}

HWY_API Mask128<uint64_t> Lt128Upper(Simd<uint64_t, 2, 0> /* tag */,
                                     Vec128<uint64_t> a,
                                     const Vec128<uint64_t> b) {
  const bool lt = a.raw[1] < b.raw[1];
  Mask128<uint64_t> ret;
  ret.bits[0] = ret.bits[1] = Mask128<uint64_t>::FromBool(lt);
  return ret;
}
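
// Editor's note (not in the original source): the two u64 lanes represent a
// single 128-bit value raw[1] * 2^64 + raw[0]. Lt128 therefore compares the
// upper lanes first and only consults the lower lanes on a tie: for
// a = {raw[0] = 9, raw[1] = 1} and b = {raw[0] = 0, raw[1] = 2}, a < b holds
// because 1 < 2, regardless of 9 > 0. Both mask lanes receive the result.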

// ------------------------------ Eq128

// Only makes sense for full vectors of u64.
HWY_API Mask128<uint64_t> Eq128(Simd<uint64_t, 2, 0> /* tag */,
                                Vec128<uint64_t> a, const Vec128<uint64_t> b) {
  const bool eq = a.raw[1] == b.raw[1] && a.raw[0] == b.raw[0];
  Mask128<uint64_t> ret;
  ret.bits[0] = ret.bits[1] = Mask128<uint64_t>::FromBool(eq);
  return ret;
}

HWY_API Mask128<uint64_t> Ne128(Simd<uint64_t, 2, 0> /* tag */,
                                Vec128<uint64_t> a, const Vec128<uint64_t> b) {
  const bool ne = a.raw[1] != b.raw[1] || a.raw[0] != b.raw[0];
  Mask128<uint64_t> ret;
  ret.bits[0] = ret.bits[1] = Mask128<uint64_t>::FromBool(ne);
  return ret;
}

HWY_API Mask128<uint64_t> Eq128Upper(Simd<uint64_t, 2, 0> /* tag */,
                                     Vec128<uint64_t> a,
                                     const Vec128<uint64_t> b) {
  const bool eq = a.raw[1] == b.raw[1];
  Mask128<uint64_t> ret;
  ret.bits[0] = ret.bits[1] = Mask128<uint64_t>::FromBool(eq);
  return ret;
}

HWY_API Mask128<uint64_t> Ne128Upper(Simd<uint64_t, 2, 0> /* tag */,
                                     Vec128<uint64_t> a,
                                     const Vec128<uint64_t> b) {
  const bool ne = a.raw[1] != b.raw[1];
  Mask128<uint64_t> ret;
  ret.bits[0] = ret.bits[1] = Mask128<uint64_t>::FromBool(ne);
  return ret;
}

// ------------------------------ Min128, Max128 (Lt128)

template <class D, class V = VFromD<D>>
HWY_API V Min128(D d, const V a, const V b) {
  return IfThenElse(Lt128(d, a, b), a, b);
}

template <class D, class V = VFromD<D>>
HWY_API V Max128(D d, const V a, const V b) {
  return IfThenElse(Lt128(d, b, a), a, b);
}

template <class D, class V = VFromD<D>>
HWY_API V Min128Upper(D d, const V a, const V b) {
  return IfThenElse(Lt128Upper(d, a, b), a, b);
}

template <class D, class V = VFromD<D>>
HWY_API V Max128Upper(D d, const V a, const V b) {
  return IfThenElse(Lt128Upper(d, b, a), a, b);
}

// ================================================== MEMORY

// ------------------------------ Load

template <typename T, size_t N>
HWY_API Vec128<T, N> Load(Simd<T, N, 0> /* tag */,
                          const T* HWY_RESTRICT aligned) {
  Vec128<T, N> v;
  CopyBytes<sizeof(T) * N>(aligned, v.raw);  // copy from array
  return v;
}

template <typename T, size_t N>
HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N, 0> d,
                                const T* HWY_RESTRICT aligned) {
  return IfThenElseZero(m, Load(d, aligned));
}

template <typename T, size_t N>
HWY_API Vec128<T, N> LoadU(Simd<T, N, 0> d, const T* HWY_RESTRICT p) {
  return Load(d, p);
}

// In some use cases, "load single lane" is sufficient; otherwise avoid this.
template <typename T, size_t N>
HWY_API Vec128<T, N> LoadDup128(Simd<T, N, 0> d,
                                const T* HWY_RESTRICT aligned) {
  return Load(d, aligned);
}

// ------------------------------ Store

template <typename T, size_t N>
HWY_API void Store(const Vec128<T, N> v, Simd<T, N, 0> /* tag */,
                   T* HWY_RESTRICT aligned) {
  CopyBytes<sizeof(T) * N>(v.raw, aligned);  // copy to array
}

template <typename T, size_t N>
HWY_API void StoreU(const Vec128<T, N> v, Simd<T, N, 0> d, T* HWY_RESTRICT p) {
  Store(v, d, p);
}

template <typename T, size_t N>
HWY_API void BlendedStore(const Vec128<T, N> v, Mask128<T, N> m,
                          Simd<T, N, 0> /* tag */, T* HWY_RESTRICT p) {
  for (size_t i = 0; i < N; ++i) {
    if (m.bits[i]) p[i] = v.raw[i];
  }
}

// ------------------------------ LoadInterleaved2/3/4

// Per-target flag to prevent generic_ops-inl.h from defining LoadInterleaved2.
// We implement those here because scalar code is likely faster than emulation
// via shuffles.
#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED
#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED
#else
#define HWY_NATIVE_LOAD_STORE_INTERLEAVED
#endif

template <typename T, size_t N>
HWY_API void LoadInterleaved2(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
                              Vec128<T, N>& v0, Vec128<T, N>& v1) {
  alignas(16) T buf0[N];
  alignas(16) T buf1[N];
  for (size_t i = 0; i < N; ++i) {
    buf0[i] = *unaligned++;
    buf1[i] = *unaligned++;
  }
  v0 = Load(d, buf0);
  v1 = Load(d, buf1);
}

template <typename T, size_t N>
HWY_API void LoadInterleaved3(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
                              Vec128<T, N>& v0, Vec128<T, N>& v1,
                              Vec128<T, N>& v2) {
  alignas(16) T buf0[N];
  alignas(16) T buf1[N];
  alignas(16) T buf2[N];
  for (size_t i = 0; i < N; ++i) {
    buf0[i] = *unaligned++;
    buf1[i] = *unaligned++;
    buf2[i] = *unaligned++;
  }
  v0 = Load(d, buf0);
  v1 = Load(d, buf1);
  v2 = Load(d, buf2);
}

template <typename T, size_t N>
HWY_API void LoadInterleaved4(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
                              Vec128<T, N>& v0, Vec128<T, N>& v1,
                              Vec128<T, N>& v2, Vec128<T, N>& v3) {
  alignas(16) T buf0[N];
  alignas(16) T buf1[N];
  alignas(16) T buf2[N];
  alignas(16) T buf3[N];
  for (size_t i = 0; i < N; ++i) {
    buf0[i] = *unaligned++;
    buf1[i] = *unaligned++;
    buf2[i] = *unaligned++;
    buf3[i] = *unaligned++;
  }
  v0 = Load(d, buf0);
  v1 = Load(d, buf1);
  v2 = Load(d, buf2);
  v3 = Load(d, buf3);
}

// ------------------------------ StoreInterleaved2/3/4

template <typename T, size_t N>
HWY_API void StoreInterleaved2(const Vec128<T, N> v0, const Vec128<T, N> v1,
                               Simd<T, N, 0> /* tag */,
                               T* HWY_RESTRICT unaligned) {
  for (size_t i = 0; i < N; ++i) {
    *unaligned++ = v0.raw[i];
    *unaligned++ = v1.raw[i];
  }
}

template <typename T, size_t N>
HWY_API void StoreInterleaved3(const Vec128<T, N> v0, const Vec128<T, N> v1,
                               const Vec128<T, N> v2, Simd<T, N, 0> /* tag */,
                               T* HWY_RESTRICT unaligned) {
  for (size_t i = 0; i < N; ++i) {
    *unaligned++ = v0.raw[i];
    *unaligned++ = v1.raw[i];
    *unaligned++ = v2.raw[i];
  }
}

template <typename T, size_t N>
HWY_API void StoreInterleaved4(const Vec128<T, N> v0, const Vec128<T, N> v1,
                               const Vec128<T, N> v2, const Vec128<T, N> v3,
                               Simd<T, N, 0> /* tag */,
                               T* HWY_RESTRICT unaligned) {
  for (size_t i = 0; i < N; ++i) {
    *unaligned++ = v0.raw[i];
    *unaligned++ = v1.raw[i];
    *unaligned++ = v2.raw[i];
    *unaligned++ = v3.raw[i];
  }
}

// ------------------------------ Stream

template <typename T, size_t N>
HWY_API void Stream(const Vec128<T, N> v, Simd<T, N, 0> d,
                    T* HWY_RESTRICT aligned) {
  Store(v, d, aligned);
}

// ------------------------------ Scatter

template <typename T, size_t N, typename Offset>
HWY_API void ScatterOffset(Vec128<T, N> v, Simd<T, N, 0> /* tag */, T* base,
                           const Vec128<Offset, N> offset) {
  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
  for (size_t i = 0; i < N; ++i) {
    uint8_t* const base8 = reinterpret_cast<uint8_t*>(base) + offset.raw[i];
    CopyBytes<sizeof(T)>(&v.raw[i], base8);  // copy to bytes
  }
}

template <typename T, size_t N, typename Index>
HWY_API void ScatterIndex(Vec128<T, N> v, Simd<T, N, 0> /* tag */,
                          T* HWY_RESTRICT base, const Vec128<Index, N> index) {
  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
  for (size_t i = 0; i < N; ++i) {
    base[index.raw[i]] = v.raw[i];
  }
}

// ------------------------------ Gather

template <typename T, size_t N, typename Offset>
HWY_API Vec128<T, N> GatherOffset(Simd<T, N, 0> /* tag */, const T* base,
                                  const Vec128<Offset, N> offset) {
  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
  Vec128<T, N> v;
  for (size_t i = 0; i < N; ++i) {
    const uint8_t* base8 =
        reinterpret_cast<const uint8_t*>(base) + offset.raw[i];
    CopyBytes<sizeof(T)>(base8, &v.raw[i]);  // copy from bytes
  }
  return v;
}

template <typename T, size_t N, typename Index>
HWY_API Vec128<T, N> GatherIndex(Simd<T, N, 0> /* tag */,
                                 const T* HWY_RESTRICT base,
                                 const Vec128<Index, N> index) {
  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
  Vec128<T, N> v;
  for (size_t i = 0; i < N; ++i) {
    v.raw[i] = base[index.raw[i]];
  }
  return v;
}

// ================================================== CONVERT

// ConvertTo and DemoteTo with floating-point input and integer output truncate
// (rounding toward zero).

template <typename FromT, typename ToT, size_t N>
HWY_API Vec128<ToT, N> PromoteTo(Simd<ToT, N, 0> /* tag */,
                                 Vec128<FromT, N> from) {
  static_assert(sizeof(ToT) > sizeof(FromT), "Not promoting");
  Vec128<ToT, N> ret;
  for (size_t i = 0; i < N; ++i) {
    // For bits Y > X, floatX->floatY and intX->intY are always representable.
    ret.raw[i] = static_cast<ToT>(from.raw[i]);
  }
  return ret;
}

// MSVC 19.10 cannot deduce the argument type if HWY_IF_FLOAT(FromT) is here,
// so we overload for FromT=double and ToT={float,int32_t}.
template <size_t N>
HWY_API Vec128<float, N> DemoteTo(Simd<float, N, 0> /* tag */,
                                  Vec128<double, N> from) {
  Vec128<float, N> ret;
  for (size_t i = 0; i < N; ++i) {
    // Prevent ubsan errors when converting to a narrower float
    if (std::isinf(from.raw[i]) ||
        std::fabs(from.raw[i]) > static_cast<double>(HighestValue<float>())) {
      ret.raw[i] = std::signbit(from.raw[i]) ? LowestValue<float>()
                                             : HighestValue<float>();
      continue;
    }
    ret.raw[i] = static_cast<float>(from.raw[i]);
  }
  return ret;
}
template <size_t N>
HWY_API Vec128<int32_t, N> DemoteTo(Simd<int32_t, N, 0> /* tag */,
                                    Vec128<double, N> from) {
  Vec128<int32_t, N> ret;
  for (size_t i = 0; i < N; ++i) {
    // Prevent ubsan errors when converting double to a narrower integer
    if (std::isinf(from.raw[i]) ||
        std::fabs(from.raw[i]) > static_cast<double>(HighestValue<int32_t>())) {
      ret.raw[i] = std::signbit(from.raw[i]) ? LowestValue<int32_t>()
                                             : HighestValue<int32_t>();
      continue;
    }
    ret.raw[i] = static_cast<int32_t>(from.raw[i]);
  }
  return ret;
}

template <typename FromT, typename ToT, size_t N>
HWY_API Vec128<ToT, N> DemoteTo(Simd<ToT, N, 0> /* tag */,
                                Vec128<FromT, N> from) {
  static_assert(!IsFloat<FromT>(), "FromT=double is handled above");
  static_assert(sizeof(ToT) < sizeof(FromT), "Not demoting");

  Vec128<ToT, N> ret;
  for (size_t i = 0; i < N; ++i) {
    // Int to int: choose closest value in ToT to `from` (avoids UB)
    from.raw[i] =
        HWY_MIN(HWY_MAX(LimitsMin<ToT>(), from.raw[i]), LimitsMax<ToT>());
    ret.raw[i] = static_cast<ToT>(from.raw[i]);
  }
  return ret;
}

template <size_t N>
HWY_API Vec128<bfloat16_t, 2 * N> ReorderDemote2To(
    Simd<bfloat16_t, 2 * N, 0> dbf16, Vec128<float, N> a, Vec128<float, N> b) {
  const Repartition<uint32_t, decltype(dbf16)> du32;
  const Vec128<uint32_t, N> b_in_lower = ShiftRight<16>(BitCast(du32, b));
  // Avoid OddEven - we want the upper half of `a` even on big-endian systems.
  const Vec128<uint32_t, N> a_mask = Set(du32, 0xFFFF0000);
  return BitCast(dbf16, IfVecThenElse(a_mask, BitCast(du32, a), b_in_lower));
}

template <size_t N>
HWY_API Vec128<int16_t, 2 * N> ReorderDemote2To(
    Simd<int16_t, 2 * N, 0> /* tag */, Vec128<int32_t, N> a,
    Vec128<int32_t, N> b) {
  const int16_t min = LimitsMin<int16_t>();
  const int16_t max = LimitsMax<int16_t>();
  Vec128<int16_t, 2 * N> ret;
  for (size_t i = 0; i < N; ++i) {
    ret.raw[i] = static_cast<int16_t>(HWY_MIN(HWY_MAX(min, a.raw[i]), max));
  }
  for (size_t i = 0; i < N; ++i) {
    ret.raw[N + i] = static_cast<int16_t>(HWY_MIN(HWY_MAX(min, b.raw[i]), max));
  }
  return ret;
}

namespace detail {

HWY_INLINE void StoreU16ToF16(const uint16_t val,
                              float16_t* HWY_RESTRICT to) {
  CopySameSize(&val, to);
}

HWY_INLINE uint16_t U16FromF16(const float16_t* HWY_RESTRICT from) {
  uint16_t bits16;
  CopySameSize(from, &bits16);
  return bits16;
}

}  // namespace detail

template <size_t N>
HWY_API Vec128<float, N> PromoteTo(Simd<float, N, 0> /* tag */,
                                   const Vec128<float16_t, N> v) {
  Vec128<float, N> ret;
  for (size_t i = 0; i < N; ++i) {
    const uint16_t bits16 = detail::U16FromF16(&v.raw[i]);
    const uint32_t sign = static_cast<uint32_t>(bits16 >> 15);
    const uint32_t biased_exp = (bits16 >> 10) & 0x1F;
    const uint32_t mantissa = bits16 & 0x3FF;

    // Subnormal or zero
    if (biased_exp == 0) {
      const float subnormal =
          (1.0f / 16384) * (static_cast<float>(mantissa) * (1.0f / 1024));
      ret.raw[i] = sign ? -subnormal : subnormal;
      continue;
    }

    // Normalized: convert the representation directly (faster than
    // ldexp/tables).
    const uint32_t biased_exp32 = biased_exp + (127 - 15);
    const uint32_t mantissa32 = mantissa << (23 - 10);
    const uint32_t bits32 = (sign << 31) | (biased_exp32 << 23) | mantissa32;
    CopySameSize(&bits32, &ret.raw[i]);
  }
  return ret;
}
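
// Editor's note (not in the original source): in the subnormal branch above,
// (1.0f / 16384) * (mantissa * (1.0f / 1024)) equals mantissa * 2^-24, the
// value of a float16 subnormal with that mantissa; e.g. bits16 = 0x0001
// yields 2^-24. The normalized branch instead adjusts the exponent bias by
// 127 - 15 and widens the mantissa from 10 to 23 bits.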

template <size_t N>
HWY_API Vec128<float, N> PromoteTo(Simd<float, N, 0> /* tag */,
                                   const Vec128<bfloat16_t, N> v) {
  Vec128<float, N> ret;
  for (size_t i = 0; i < N; ++i) {
    ret.raw[i] = F32FromBF16(v.raw[i]);
  }
  return ret;
}

template <size_t N>
HWY_API Vec128<float16_t, N> DemoteTo(Simd<float16_t, N, 0> /* tag */,
                                      const Vec128<float, N> v) {
  Vec128<float16_t, N> ret;
  for (size_t i = 0; i < N; ++i) {
    uint32_t bits32;
    CopySameSize(&v.raw[i], &bits32);
    const uint32_t sign = bits32 >> 31;
    const uint32_t biased_exp32 = (bits32 >> 23) & 0xFF;
    const uint32_t mantissa32 = bits32 & 0x7FFFFF;

    const int32_t exp = HWY_MIN(static_cast<int32_t>(biased_exp32) - 127, 15);

    // Tiny or zero => zero.
    if (exp < -24) {
      ZeroBytes<sizeof(uint16_t)>(&ret.raw[i]);
      continue;
    }

    uint32_t biased_exp16, mantissa16;

    // exp = [-24, -15] => subnormal
    if (exp < -14) {
      biased_exp16 = 0;
      const uint32_t sub_exp = static_cast<uint32_t>(-14 - exp);
      HWY_DASSERT(1 <= sub_exp && sub_exp < 11);
      mantissa16 = static_cast<uint32_t>((1u << (10 - sub_exp)) +
                                         (mantissa32 >> (13 + sub_exp)));
    } else {
      // exp = [-14, 15]
      biased_exp16 = static_cast<uint32_t>(exp + 15);
      HWY_DASSERT(1 <= biased_exp16 && biased_exp16 < 31);
      mantissa16 = mantissa32 >> 13;
    }

    HWY_DASSERT(mantissa16 < 1024);
    const uint32_t bits16 = (sign << 15) | (biased_exp16 << 10) | mantissa16;
    HWY_DASSERT(bits16 < 0x10000);
    const uint16_t narrowed = static_cast<uint16_t>(bits16);  // big-endian safe
    detail::StoreU16ToF16(narrowed, &ret.raw[i]);
  }
  return ret;
}
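
// Editor's worked example (not in the original source): demoting 1.5f
// (bits32 = 0x3FC00000) gives sign = 0, biased_exp32 = 127 so exp = 0, and
// mantissa32 = 0x400000. The normal path yields biased_exp16 = 15 and
// mantissa16 = 0x400000 >> 13 = 0x200, so bits16 = (15 << 10) | 0x200 =
// 0x3E00, the float16 encoding of 1.5.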

template <size_t N>
HWY_API Vec128<bfloat16_t, N> DemoteTo(Simd<bfloat16_t, N, 0> /* tag */,
                                       const Vec128<float, N> v) {
  Vec128<bfloat16_t, N> ret;
  for (size_t i = 0; i < N; ++i) {
    ret.raw[i] = BF16FromF32(v.raw[i]);
  }
  return ret;
}

// Tag dispatch instead of SFINAE for MSVC 2017 compatibility
namespace detail {

template <typename FromT, typename ToT, size_t N>
HWY_INLINE Vec128<ToT, N> ConvertTo(hwy::FloatTag /*tag*/,
                                    Simd<ToT, N, 0> /* tag */,
                                    Vec128<FromT, N> from) {
  static_assert(sizeof(ToT) == sizeof(FromT), "Should have same size");
  Vec128<ToT, N> ret;
  for (size_t i = 0; i < N; ++i) {
    // float## -> int##: return closest representable value. We cannot exactly
    // represent LimitsMax<ToT> in FromT, so use double.
    const double f = static_cast<double>(from.raw[i]);
    if (std::isinf(from.raw[i]) ||
        std::fabs(f) > static_cast<double>(LimitsMax<ToT>())) {
      ret.raw[i] =
          std::signbit(from.raw[i]) ? LimitsMin<ToT>() : LimitsMax<ToT>();
      continue;
    }
    ret.raw[i] = static_cast<ToT>(from.raw[i]);
  }
  return ret;
}

template <typename FromT, typename ToT, size_t N>
HWY_INLINE Vec128<ToT, N> ConvertTo(hwy::NonFloatTag /*tag*/,
                                    Simd<ToT, N, 0> /* tag */,
                                    Vec128<FromT, N> from) {
  static_assert(sizeof(ToT) == sizeof(FromT), "Should have same size");
  Vec128<ToT, N> ret;
  for (size_t i = 0; i < N; ++i) {
    // int## -> float##: no check needed
    ret.raw[i] = static_cast<ToT>(from.raw[i]);
  }
  return ret;
}

}  // namespace detail

template <typename FromT, typename ToT, size_t N>
HWY_API Vec128<ToT, N> ConvertTo(Simd<ToT, N, 0> d, Vec128<FromT, N> from) {
  return detail::ConvertTo(hwy::IsFloatTag<FromT>(), d, from);
}

template <size_t N>
HWY_API Vec128<uint8_t, N> U8FromU32(const Vec128<uint32_t, N> v) {
  return DemoteTo(Simd<uint8_t, N, 0>(), v);
}

// ------------------------------ Truncations

template <size_t N>
HWY_API Vec128<uint8_t, N> TruncateTo(Simd<uint8_t, N, 0> /* tag */,
                                      const Vec128<uint64_t, N> v) {
  Vec128<uint8_t, N> ret;
  for (size_t i = 0; i < N; ++i) {
    ret.raw[i] = static_cast<uint8_t>(v.raw[i] & 0xFF);
  }
  return ret;
}

template <size_t N>
HWY_API Vec128<uint16_t, N> TruncateTo(Simd<uint16_t, N, 0> /* tag */,
                                       const Vec128<uint64_t, N> v) {
  Vec128<uint16_t, N> ret;
  for (size_t i = 0; i < N; ++i) {
    ret.raw[i] = static_cast<uint16_t>(v.raw[i] & 0xFFFF);
  }
  return ret;
}

template <size_t N>
HWY_API Vec128<uint32_t, N> TruncateTo(Simd<uint32_t, N, 0> /* tag */,
                                       const Vec128<uint64_t, N> v) {
  Vec128<uint32_t, N> ret;
  for (size_t i = 0; i < N; ++i) {
    ret.raw[i] = static_cast<uint32_t>(v.raw[i] & 0xFFFFFFFFu);
  }
  return ret;
}

template <size_t N>
HWY_API Vec128<uint8_t, N> TruncateTo(Simd<uint8_t, N, 0> /* tag */,
                                      const Vec128<uint32_t, N> v) {
  Vec128<uint8_t, N> ret;
  for (size_t i = 0; i < N; ++i) {
    ret.raw[i] = static_cast<uint8_t>(v.raw[i] & 0xFF);
  }
  return ret;
}

template <size_t N>
HWY_API Vec128<uint16_t, N> TruncateTo(Simd<uint16_t, N, 0> /* tag */,
                                       const Vec128<uint32_t, N> v) {
  Vec128<uint16_t, N> ret;
  for (size_t i = 0; i < N; ++i) {
    ret.raw[i] = static_cast<uint16_t>(v.raw[i] & 0xFFFF);
  }
  return ret;
}

template <size_t N>
HWY_API Vec128<uint8_t, N> TruncateTo(Simd<uint8_t, N, 0> /* tag */,
                                      const Vec128<uint16_t, N> v) {
  Vec128<uint8_t, N> ret;
  for (size_t i = 0; i < N; ++i) {
    ret.raw[i] = static_cast<uint8_t>(v.raw[i] & 0xFF);
  }
  return ret;
}

// ================================================== COMBINE

template <typename T, size_t N>
HWY_API Vec128<T, N / 2> LowerHalf(Vec128<T, N> v) {
  Vec128<T, N / 2> ret;
  CopyBytes<N / 2 * sizeof(T)>(v.raw, ret.raw);
  return ret;
}

template <typename T, size_t N>
HWY_API Vec128<T, N / 2> LowerHalf(Simd<T, N / 2, 0> /* tag */,
                                   Vec128<T, N> v) {
  return LowerHalf(v);
}

template <typename T, size_t N>
HWY_API Vec128<T, N / 2> UpperHalf(Simd<T, N / 2, 0> /* tag */,
                                   Vec128<T, N> v) {
  Vec128<T, N / 2> ret;
  CopyBytes<N / 2 * sizeof(T)>(&v.raw[N / 2], ret.raw);
  return ret;
}

template <typename T, size_t N>
HWY_API Vec128<T, N> ZeroExtendVector(Simd<T, N, 0> /* tag */,
                                      Vec128<T, N / 2> v) {
  Vec128<T, N> ret;
  CopyBytes<N / 2 * sizeof(T)>(v.raw, ret.raw);
  return ret;
}

template <typename T, size_t N>
HWY_API Vec128<T, N> Combine(Simd<T, N, 0> /* tag */, Vec128<T, N / 2> hi_half,
                             Vec128<T, N / 2> lo_half) {
  Vec128<T, N> ret;
  CopyBytes<N / 2 * sizeof(T)>(lo_half.raw, &ret.raw[0]);
  CopyBytes<N / 2 * sizeof(T)>(hi_half.raw, &ret.raw[N / 2]);
  return ret;
}

template <typename T, size_t N>
HWY_API Vec128<T, N> ConcatLowerLower(Simd<T, N, 0> /* tag */, Vec128<T, N> hi,
                                      Vec128<T, N> lo) {
  Vec128<T, N> ret;
  CopyBytes<N / 2 * sizeof(T)>(lo.raw, &ret.raw[0]);
  CopyBytes<N / 2 * sizeof(T)>(hi.raw, &ret.raw[N / 2]);
  return ret;
}

template <typename T, size_t N>
HWY_API Vec128<T, N> ConcatUpperUpper(Simd<T, N, 0> /* tag */, Vec128<T, N> hi,
                                      Vec128<T, N> lo) {
  Vec128<T, N> ret;
  CopyBytes<N / 2 * sizeof(T)>(&lo.raw[N / 2], &ret.raw[0]);
  CopyBytes<N / 2 * sizeof(T)>(&hi.raw[N / 2], &ret.raw[N / 2]);
  return ret;
}

template <typename T, size_t N>
HWY_API Vec128<T, N> ConcatLowerUpper(Simd<T, N, 0> /* tag */,
                                      const Vec128<T, N> hi,
                                      const Vec128<T, N> lo) {
  Vec128<T, N> ret;
  CopyBytes<N / 2 * sizeof(T)>(&lo.raw[N / 2], &ret.raw[0]);
  CopyBytes<N / 2 * sizeof(T)>(hi.raw, &ret.raw[N / 2]);
  return ret;
}

template <typename T, size_t N>
HWY_API Vec128<T, N> ConcatUpperLower(Simd<T, N, 0> /* tag */, Vec128<T, N> hi,
                                      Vec128<T, N> lo) {
  Vec128<T, N> ret;
  CopyBytes<N / 2 * sizeof(T)>(lo.raw, &ret.raw[0]);
  CopyBytes<N / 2 * sizeof(T)>(&hi.raw[N / 2], &ret.raw[N / 2]);
  return ret;
}

template <typename T, size_t N>
HWY_API Vec128<T, N> ConcatEven(Simd<T, N, 0> /* tag */, Vec128<T, N> hi,
                                Vec128<T, N> lo) {
  Vec128<T, N> ret;
  for (size_t i = 0; i < N / 2; ++i) {
    ret.raw[i] = lo.raw[2 * i];
  }
  for (size_t i = 0; i < N / 2; ++i) {
    ret.raw[N / 2 + i] = hi.raw[2 * i];
  }
  return ret;
}

template <typename T, size_t N>
HWY_API Vec128<T, N> ConcatOdd(Simd<T, N, 0> /* tag */, Vec128<T, N> hi,
                               Vec128<T, N> lo) {
  Vec128<T, N> ret;
  for (size_t i = 0; i < N / 2; ++i) {
    ret.raw[i] = lo.raw[2 * i + 1];
  }
  for (size_t i = 0; i < N / 2; ++i) {
    ret.raw[N / 2 + i] = hi.raw[2 * i + 1];
  }
  return ret;
}

// ------------------------------ CombineShiftRightBytes

template <int kBytes, typename T, size_t N, class V = Vec128<T, N>>
HWY_API V CombineShiftRightBytes(Simd<T, N, 0> /* tag */, V hi, V lo) {
  V ret;
  const uint8_t* HWY_RESTRICT lo8 =
      reinterpret_cast<const uint8_t * HWY_RESTRICT>(lo.raw);
  uint8_t* HWY_RESTRICT ret8 =
      reinterpret_cast<uint8_t * HWY_RESTRICT>(ret.raw);
  CopyBytes<sizeof(T) * N - kBytes>(lo8 + kBytes, ret8);
  CopyBytes<kBytes>(hi.raw, ret8 + sizeof(T) * N - kBytes);
  return ret;
}
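
// Editor's example (not in the original source): for full u8 vectors and
// kBytes = 4, the result is bytes [4, 16) of lo followed by bytes [0, 4) of
// hi, i.e. the 32-byte concatenation hi:lo shifted right by 4 bytes and
// truncated to the lower 16 bytes.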

// ------------------------------ ShiftLeftBytes

template <int kBytes, typename T, size_t N>
HWY_API Vec128<T, N> ShiftLeftBytes(Simd<T, N, 0> /* tag */, Vec128<T, N> v) {
  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
  Vec128<T, N> ret;
  uint8_t* HWY_RESTRICT ret8 =
      reinterpret_cast<uint8_t * HWY_RESTRICT>(ret.raw);
  ZeroBytes<kBytes>(ret8);
  CopyBytes<sizeof(T) * N - kBytes>(v.raw, ret8 + kBytes);
  return ret;
}

template <int kBytes, typename T, size_t N>
HWY_API Vec128<T, N> ShiftLeftBytes(const Vec128<T, N> v) {
  return ShiftLeftBytes<kBytes>(DFromV<decltype(v)>(), v);
}

// ------------------------------ ShiftLeftLanes

template <int kLanes, typename T, size_t N>
HWY_API Vec128<T, N> ShiftLeftLanes(Simd<T, N, 0> d, const Vec128<T, N> v) {
  const Repartition<uint8_t, decltype(d)> d8;
  return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
}

template <int kLanes, typename T, size_t N>
HWY_API Vec128<T, N> ShiftLeftLanes(const Vec128<T, N> v) {
  return ShiftLeftLanes<kLanes>(DFromV<decltype(v)>(), v);
}

// ------------------------------ ShiftRightBytes
template <int kBytes, typename T, size_t N>
HWY_API Vec128<T, N> ShiftRightBytes(Simd<T, N, 0> /* tag */, Vec128<T, N> v) {
  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
  Vec128<T, N> ret;
  const uint8_t* HWY_RESTRICT v8 =
      reinterpret_cast<const uint8_t * HWY_RESTRICT>(v.raw);
  uint8_t* HWY_RESTRICT ret8 =
      reinterpret_cast<uint8_t * HWY_RESTRICT>(ret.raw);
  CopyBytes<sizeof(T) * N - kBytes>(v8 + kBytes, ret8);
  ZeroBytes<kBytes>(ret8 + sizeof(T) * N - kBytes);
  return ret;
}

// ------------------------------ ShiftRightLanes
template <int kLanes, typename T, size_t N>
HWY_API Vec128<T, N> ShiftRightLanes(Simd<T, N, 0> d, const Vec128<T, N> v) {
  const Repartition<uint8_t, decltype(d)> d8;
  return BitCast(d, ShiftRightBytes<kLanes * sizeof(T)>(d8, BitCast(d8, v)));
}

// ================================================== SWIZZLE

template <typename T, size_t N>
HWY_API T GetLane(const Vec128<T, N> v) {
  return v.raw[0];
}

template <typename T, size_t N>
HWY_API Vec128<T, N> InsertLane(Vec128<T, N> v, size_t i, T t) {
  v.raw[i] = t;
  return v;
}

template <typename T, size_t N>
HWY_API T ExtractLane(const Vec128<T, N> v, size_t i) {
  return v.raw[i];
}

template <typename T, size_t N>
HWY_API Vec128<T, N> DupEven(Vec128<T, N> v) {
  for (size_t i = 0; i < N; i += 2) {
    v.raw[i + 1] = v.raw[i];
  }
  return v;
}

template <typename T, size_t N>
HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) {
  for (size_t i = 0; i < N; i += 2) {
    v.raw[i] = v.raw[i + 1];
  }
  return v;
}

template <typename T, size_t N>
HWY_API Vec128<T, N> OddEven(Vec128<T, N> odd, Vec128<T, N> even) {
  for (size_t i = 0; i < N; i += 2) {
    odd.raw[i] = even.raw[i];
  }
  return odd;
}

template <typename T, size_t N>
HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) {
  return even;
}

// ------------------------------ SwapAdjacentBlocks

template <typename T, size_t N>
HWY_API Vec128<T, N> SwapAdjacentBlocks(Vec128<T, N> v) {
  return v;
}

2015// ------------------------------ TableLookupLanes
2016
2017// Returned by SetTableIndices for use by TableLookupLanes.
2018template <typename T, size_t N>
2019struct Indices128 {
2021};
2022
2023template <typename T, size_t N, typename TI>
2025 static_assert(sizeof(T) == sizeof(TI), "Index size must match lane size");
2026 Indices128<T, N> ret;
2027 CopyBytes<N * sizeof(T)>(vec.raw, ret.raw);
2028 return ret;
2029}
2030
2031template <typename T, size_t N, typename TI>
2033 return IndicesFromVec(d, LoadU(Simd<TI, N, 0>(), idx));
2034}
2035
2036template <typename T, size_t N>
2037HWY_API Vec128<T, N> TableLookupLanes(const Vec128<T, N> v,
2038 const Indices128<T, N> idx) {
2039 Vec128<T, N> ret;
2040 for (size_t i = 0; i < N; ++i) {
2041 ret.raw[i] = v.raw[idx.raw[i]];
2042 }
2043 return ret;
2044}
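
Sketch of a lane permutation via SetTableIndices + TableLookupLanes (hypothetical helper; the index array is illustrative):

void TableLookupLanesExample() {
  const Simd<int32_t, 4, 0> d;
  const auto v = Iota(d, 0);            // {0, 1, 2, 3}
  const int32_t idx[4] = {3, 2, 1, 0};  // a reversing permutation
  const auto r = TableLookupLanes(v, SetTableIndices(d, idx));  // {3, 2, 1, 0}
  (void)r;
}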
2045
2046// ------------------------------ ReverseBlocks
2047
2048// Single block: no change
2049template <typename T, size_t N>
2050HWY_API Vec128<T, N> ReverseBlocks(Simd<T, N, 0> /* tag */,
2051 const Vec128<T, N> v) {
2052 return v;
2053}
2054
2055// ------------------------------ Reverse
2056
2057template <typename T, size_t N>
2058HWY_API Vec128<T, N> Reverse(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
2059 Vec128<T, N> ret;
2060 for (size_t i = 0; i < N; ++i) {
2061 ret.raw[i] = v.raw[N - 1 - i];
2062 }
2063 return ret;
2064}
2065
2066template <typename T, size_t N>
2067HWY_API Vec128<T, N> Reverse2(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
2068 Vec128<T, N> ret;
2069 for (size_t i = 0; i < N; i += 2) {
2070 ret.raw[i + 0] = v.raw[i + 1];
2071 ret.raw[i + 1] = v.raw[i + 0];
2072 }
2073 return ret;
2074}
2075
2076template <typename T, size_t N>
2077HWY_API Vec128<T, N> Reverse4(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
2078 Vec128<T, N> ret;
2079 for (size_t i = 0; i < N; i += 4) {
2080 ret.raw[i + 0] = v.raw[i + 3];
2081 ret.raw[i + 1] = v.raw[i + 2];
2082 ret.raw[i + 2] = v.raw[i + 1];
2083 ret.raw[i + 3] = v.raw[i + 0];
2084 }
2085 return ret;
2086}
2087
2088template <typename T, size_t N>
2089HWY_API Vec128<T, N> Reverse8(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
2090 Vec128<T, N> ret;
2091 for (size_t i = 0; i < N; i += 8) {
2092 ret.raw[i + 0] = v.raw[i + 7];
2093 ret.raw[i + 1] = v.raw[i + 6];
2094 ret.raw[i + 2] = v.raw[i + 5];
2095 ret.raw[i + 3] = v.raw[i + 4];
2096 ret.raw[i + 4] = v.raw[i + 3];
2097 ret.raw[i + 5] = v.raw[i + 2];
2098 ret.raw[i + 6] = v.raw[i + 1];
2099 ret.raw[i + 7] = v.raw[i + 0];
2100 }
2101 return ret;
2102}
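
Sketch contrasting the Reverse variants (hypothetical helper, illustrative values): Reverse flips all lanes, while Reverse2/Reverse4 flip within groups of 2 and 4 lanes:

void ReverseExample() {
  const Simd<int16_t, 8, 0> d;
  const auto v = Iota(d, 0);          // {0, 1, 2, 3, 4, 5, 6, 7}
  const auto all = Reverse(d, v);     // {7, 6, 5, 4, 3, 2, 1, 0}
  const auto pairs = Reverse2(d, v);  // {1, 0, 3, 2, 5, 4, 7, 6}
  const auto quads = Reverse4(d, v);  // {3, 2, 1, 0, 7, 6, 5, 4}
  (void)all;
  (void)pairs;
  (void)quads;
}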
2103
2104// ================================================== BLOCKWISE
2105
2106// ------------------------------ Shuffle*
2107
2108// Swap 32-bit halves in 64-bit halves.
2109template <typename T, size_t N>
2110HWY_API Vec128<T, N> Shuffle2301(const Vec128<T, N> v) {
2111 static_assert(sizeof(T) == 4, "Only for 32-bit");
2112 static_assert(N == 2 || N == 4, "Does not make sense for N=1");
2113 return Reverse2(DFromV<decltype(v)>(), v);
2114}
2115
2116// Swap 64-bit halves
2117template <typename T>
2118HWY_API Vec128<T> Shuffle1032(const Vec128<T> v) {
2119 static_assert(sizeof(T) == 4, "Only for 32-bit");
2120 Vec128<T> ret;
2121 ret.raw[3] = v.raw[1];
2122 ret.raw[2] = v.raw[0];
2123 ret.raw[1] = v.raw[3];
2124 ret.raw[0] = v.raw[2];
2125 return ret;
2126}
2127template <typename T>
2128HWY_API Vec128<T> Shuffle01(const Vec128<T> v) {
2129 static_assert(sizeof(T) == 8, "Only for 64-bit");
2130 return Reverse2(DFromV<decltype(v)>(), v);
2131}
2132
2133// Rotate right 32 bits
2134template <typename T>
2135HWY_API Vec128<T> Shuffle0321(const Vec128<T> v) {
2136 Vec128<T> ret;
2137 ret.raw[3] = v.raw[0];
2138 ret.raw[2] = v.raw[3];
2139 ret.raw[1] = v.raw[2];
2140 ret.raw[0] = v.raw[1];
2141 return ret;
2142}
2143
2144// Rotate left 32 bits
2145template <typename T>
2146HWY_API Vec128<T> Shuffle2103(const Vec128<T> v) {
2147 Vec128<T> ret;
2148 ret.raw[3] = v.raw[2];
2149 ret.raw[2] = v.raw[1];
2150 ret.raw[1] = v.raw[0];
2151 ret.raw[0] = v.raw[3];
2152 return ret;
2153}
2154
2155template <typename T>
2156HWY_API Vec128<T> Shuffle0123(const Vec128<T> v) {
2157 return Reverse4(DFromV<decltype(v)>(), v);
2158}
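
Sketch of one of the shuffles (hypothetical helper, illustrative values); the digits in the name give the source lane for each result lane, from high to low:

void ShuffleExample() {
  const Full128<uint32_t> d;
  const auto v = Iota(d, 0);      // {0, 1, 2, 3}
  const auto r = Shuffle0321(v);  // {1, 2, 3, 0}: rotate right by one lane
  (void)r;
}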
2159
2160// ------------------------------ Broadcast/splat any lane
2161
2162template <int kLane, typename T, size_t N>
2163HWY_API Vec128<T, N> Broadcast(Vec128<T, N> v) {
2164 for (size_t i = 0; i < N; ++i) {
2165 v.raw[i] = v.raw[kLane];
2166 }
2167 return v;
2168}
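
Sketch of Broadcast (hypothetical helper, illustrative values), which splats the compile-time lane kLane across the vector:

void BroadcastExample() {
  const Simd<int32_t, 4, 0> d;
  const auto v = Iota(d, 10);      // {10, 11, 12, 13}
  const auto b = Broadcast<2>(v);  // {12, 12, 12, 12}
  (void)b;
}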
2169
2170// ------------------------------ TableLookupBytes, TableLookupBytesOr0
2171
2172template <typename T, size_t N, typename TI, size_t NI>
2173HWY_API Vec128<TI, NI> TableLookupBytes(const Vec128<T, N> v,
2174 const Vec128<TI, NI> indices) {
2175 const uint8_t* HWY_RESTRICT v_bytes =
2176 reinterpret_cast<const uint8_t * HWY_RESTRICT>(v.raw);
2177 const uint8_t* HWY_RESTRICT idx_bytes =
2178 reinterpret_cast<const uint8_t*>(indices.raw);
2179 Vec128<TI, NI> ret;
2180 uint8_t* HWY_RESTRICT ret_bytes =
2181 reinterpret_cast<uint8_t * HWY_RESTRICT>(ret.raw);
2182 for (size_t i = 0; i < NI * sizeof(TI); ++i) {
2183 const size_t idx = idx_bytes[i];
2184 // Avoid out of bounds reads.
2185 ret_bytes[i] = idx < sizeof(T) * N ? v_bytes[idx] : 0;
2186 }
2187 return ret;
2188}
2189
2190template <typename T, size_t N, typename TI, size_t NI>
2191HWY_API Vec128<TI, NI> TableLookupBytesOr0(const Vec128<T, N> v,
2192 const Vec128<TI, NI> indices) {
2193 // Same as TableLookupBytes, which already returns 0 if out of bounds.
2194 return TableLookupBytes(v, indices);
2195}
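
Sketch of a byte-level shuffle (hypothetical helper, illustrative values); out-of-range indices yield 0 per the bounds check above:

void TableLookupBytesExample() {
  const Simd<uint8_t, 8, 0> d;
  const auto bytes = Iota(d, 100);  // {100, 101, ..., 107}
  alignas(16) const uint8_t idx[8] = {7, 6, 5, 4, 3, 2, 1, 0};
  const auto r = TableLookupBytes(bytes, Load(d, idx));  // {107, 106, ..., 100}
  (void)r;
}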
2196
2197// ------------------------------ InterleaveLower/InterleaveUpper
2198
2199template <typename T, size_t N>
2200HWY_API Vec128<T, N> InterleaveLower(const Vec128<T, N> a,
2201 const Vec128<T, N> b) {
2202 Vec128<T, N> ret;
2203 for (size_t i = 0; i < N / 2; ++i) {
2204 ret.raw[2 * i + 0] = a.raw[i];
2205 ret.raw[2 * i + 1] = b.raw[i];
2206 }
2207 return ret;
2208}
2209
2210// Additional overload for the optional tag (also for 256/512).
2211template <class V>
2212HWY_API V InterleaveLower(DFromV<V> /* tag */, V a, V b) {
2213 return InterleaveLower(a, b);
2214}
2215
2216template <typename T, size_t N>
2217HWY_API Vec128<T, N> InterleaveUpper(Simd<T, N, 0> /* tag */,
2218 const Vec128<T, N> a,
2219 const Vec128<T, N> b) {
2220 Vec128<T, N> ret;
2221 for (size_t i = 0; i < N / 2; ++i) {
2222 ret.raw[2 * i + 0] = a.raw[N / 2 + i];
2223 ret.raw[2 * i + 1] = b.raw[N / 2 + i];
2224 }
2225 return ret;
2226}
2227
2228// ------------------------------ ZipLower/ZipUpper (InterleaveLower)
2229
2230// Same as Interleave*, except that the return lanes are double-width integers;
2231// this is necessary because the single-lane scalar cannot return two values.
2232template <class V, class DW = RepartitionToWide<DFromV<V>>>
2233HWY_API VFromD<DW> ZipLower(V a, V b) {
2234 return BitCast(DW(), InterleaveLower(a, b));
2235}
2236template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
2237HWY_API VFromD<DW> ZipLower(DW dw, V a, V b) {
2238 return BitCast(dw, InterleaveLower(D(), a, b));
2239}
2240
2241template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
2242HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) {
2243 return BitCast(dw, InterleaveUpper(D(), a, b));
2244}
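
Sketch of interleaving versus zipping (hypothetical helper, illustrative values): ZipLower produces the same bytes as InterleaveLower but typed as double-width lanes:

void InterleaveZipExample() {
  const Simd<uint8_t, 8, 0> d8;
  const auto a = Iota(d8, 0);             // {0, 1, 2, ..., 7}
  const auto b = Iota(d8, 100);           // {100, 101, ..., 107}
  const auto il = InterleaveLower(a, b);  // {0, 100, 1, 101, 2, 102, 3, 103}
  const auto z = ZipLower(a, b);          // same bytes as il, as 4 uint16_t lanes
  (void)il;
  (void)z;
}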
2245
2246// ================================================== MASK
2247
2248template <typename T, size_t N>
2249HWY_API bool AllFalse(Simd<T, N, 0> /* tag */, const Mask128<T, N> mask) {
2250 typename Mask128<T, N>::Raw or_sum = 0;
2251 for (size_t i = 0; i < N; ++i) {
2252 or_sum |= mask.bits[i];
2253 }
2254 return or_sum == 0;
2255}
2256
2257template <typename T, size_t N>
2258HWY_API bool AllTrue(Simd<T, N, 0> /* tag */, const Mask128<T, N> mask) {
2259 constexpr uint64_t kAll = LimitsMax<typename Mask128<T, N>::Raw>();
2260 uint64_t and_sum = kAll;
2261 for (size_t i = 0; i < N; ++i) {
2262 and_sum &= mask.bits[i];
2263 }
2264 return and_sum == kAll;
2265}
2266
2267// `bits` points to at least 8 readable bytes, not all of which need be valid.
2268template <typename T, size_t N>
2269HWY_API Mask128<T, N> LoadMaskBits(Simd<T, N, 0> /* tag */,
2270 const uint8_t* HWY_RESTRICT bits) {
2271 Mask128<T, N> m;
2272 for (size_t i = 0; i < N; ++i) {
2273 const size_t bit = size_t{1} << (i & 7);
2274 const size_t idx_byte = i >> 3;
2275 m.bits[i] = Mask128<T, N>::FromBool((bits[idx_byte] & bit) != 0);
2276 }
2277 return m;
2278}
2279
2280// `bits` points to at least 8 writable bytes.
2281template <typename T, size_t N>
2282HWY_API size_t StoreMaskBits(Simd<T, N, 0> /* tag */, const Mask128<T, N> mask,
2283 uint8_t* bits) {
2284 bits[0] = 0;
2285 if (N > 8) bits[1] = 0; // N <= 16, so max two bytes
2286 for (size_t i = 0; i < N; ++i) {
2287 const size_t bit = size_t{1} << (i & 7);
2288 const size_t idx_byte = i >> 3;
2289 if (mask.bits[i]) {
2290 bits[idx_byte] = static_cast<uint8_t>(bits[idx_byte] | bit);
2291 }
2292 }
2293 return N > 8 ? 2 : 1;
2294}
2295
2296template <typename T, size_t N>
2297HWY_API size_t CountTrue(Simd<T, N, 0> /* tag */, const Mask128<T, N> mask) {
2298 size_t count = 0;
2299 for (size_t i = 0; i < N; ++i) {
2300 count += mask.bits[i] != 0;
2301 }
2302 return count;
2303}
2304
2305template <typename T, size_t N>
2306HWY_API size_t FindKnownFirstTrue(Simd<T, N, 0> /* tag */,
2307 const Mask128<T, N> mask) {
2308 for (size_t i = 0; i < N; ++i) {
2309 if (mask.bits[i] != 0) return i;
2310 }
2311 HWY_DASSERT(false);
2312 return 0;
2313}
2314
2315template <typename T, size_t N>
2316HWY_API intptr_t FindFirstTrue(Simd<T, N, 0> /* tag */,
2317 const Mask128<T, N> mask) {
2318 for (size_t i = 0; i < N; ++i) {
2319 if (mask.bits[i] != 0) return static_cast<intptr_t>(i);
2320 }
2321 return intptr_t{-1};
2322}
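
Sketch of the mask queries (hypothetical helper, illustrative values):

void MaskQueryExample() {
  const Simd<int32_t, 4, 0> d;
  const auto v = Iota(d, 0);                   // {0, 1, 2, 3}
  const auto m = Set(d, 2) <= v;               // false, false, true, true
  const size_t n = CountTrue(d, m);            // 2
  const intptr_t first = FindFirstTrue(d, m);  // 2
  (void)n;
  (void)first;
}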
2323
2324// ------------------------------ Compress
2325
2326template <typename T>
2327struct CompressIsPartition {
2328 enum { value = (sizeof(T) != 1) };
2329};
2330
2331template <typename T, size_t N>
2332HWY_API Vec128<T, N> Compress(Vec128<T, N> v, const Mask128<T, N> mask) {
2333 size_t count = 0;
2334 Vec128<T, N> ret;
2335 for (size_t i = 0; i < N; ++i) {
2336 if (mask.bits[i]) {
2337 ret.raw[count++] = v.raw[i];
2338 }
2339 }
2340 for (size_t i = 0; i < N; ++i) {
2341 if (!mask.bits[i]) {
2342 ret.raw[count++] = v.raw[i];
2343 }
2344 }
2345 HWY_DASSERT(count == N);
2346 return ret;
2347}
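
Sketch of Compress (hypothetical helper, illustrative values): selected lanes move to the front, and because CompressIsPartition holds for lane sizes above one byte, the rejected lanes follow:

void CompressExample() {
  const Simd<int32_t, 4, 0> d;
  const auto v = Iota(d, 0);      // {0, 1, 2, 3}
  const auto m = Set(d, 2) <= v;  // select lanes 2 and 3
  const auto c = Compress(v, m);  // {2, 3, 0, 1}
  (void)c;
}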
2348
2349// ------------------------------ CompressNot
2350template <typename T, size_t N>
2351HWY_API Vec128<T, N> CompressNot(Vec128<T, N> v, const Mask128<T, N> mask) {
2352 size_t count = 0;
2353 Vec128<T, N> ret;
2354 for (size_t i = 0; i < N; ++i) {
2355 if (!mask.bits[i]) {
2356 ret.raw[count++] = v.raw[i];
2357 }
2358 }
2359 for (size_t i = 0; i < N; ++i) {
2360 if (mask.bits[i]) {
2361 ret.raw[count++] = v.raw[i];
2362 }
2363 }
2364 HWY_DASSERT(count == N);
2365 return ret;
2366}
2367
2368// ------------------------------ CompressBlocksNot
2369HWY_API Vec128<uint64_t> CompressBlocksNot(Vec128<uint64_t> v,
2370 Mask128<uint64_t> /* m */) {
2371 return v;
2372}
2373
2374// ------------------------------ CompressBits
2375template <typename T, size_t N>
2376HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v,
2377 const uint8_t* HWY_RESTRICT bits) {
2378 return Compress(v, LoadMaskBits(Simd<T, N, 0>(), bits));
2379}
2380
2381// ------------------------------ CompressStore
2382template <typename T, size_t N>
2383HWY_API size_t CompressStore(Vec128<T, N> v, const Mask128<T, N> mask,
2384 Simd<T, N, 0> /* tag */,
2385 T* HWY_RESTRICT unaligned) {
2386 size_t count = 0;
2387 for (size_t i = 0; i < N; ++i) {
2388 if (mask.bits[i]) {
2389 unaligned[count++] = v.raw[i];
2390 }
2391 }
2392 return count;
2393}
2394
2395// ------------------------------ CompressBlendedStore
2396template <typename T, size_t N>
2397HWY_API size_t CompressBlendedStore(Vec128<T, N> v, const Mask128<T, N> mask,
2398 Simd<T, N, 0> d,
2399 T* HWY_RESTRICT unaligned) {
2400 return CompressStore(v, mask, d, unaligned);
2401}
2402
2403// ------------------------------ CompressBitsStore
2404template <typename T, size_t N>
2405HWY_API size_t CompressBitsStore(Vec128<T, N> v,
2406 const uint8_t* HWY_RESTRICT bits,
2407 Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
2408 const Mask128<T, N> mask = LoadMaskBits(d, bits);
2409 StoreU(Compress(v, mask), d, unaligned);
2410 return CountTrue(d, mask);
2411}
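
Sketch of CompressStore (hypothetical helper, illustrative values): only the selected lanes are written, and the count is returned so callers can advance their output pointer:

void CompressStoreExample() {
  const Simd<int32_t, 4, 0> d;
  const auto v = Iota(d, 0);      // {0, 1, 2, 3}
  const auto m = Set(d, 2) <= v;  // select lanes 2 and 3
  int32_t out[4] = {};
  const size_t written = CompressStore(v, m, d, out);  // out = {2, 3, ...}; 2
  (void)written;
}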
2412
2413// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
2414
2415template <size_t N>
2416HWY_API Vec128<float, N> ReorderWidenMulAccumulate(Simd<float, N, 0> df32,
2417 Vec128<bfloat16_t, 2 * N> a,
2418 Vec128<bfloat16_t, 2 * N> b,
2419 const Vec128<float, N> sum0,
2420 Vec128<float, N>& sum1) {
2421 const Rebind<uint32_t, decltype(df32)> du32;
2422 using VU32 = VFromD<decltype(du32)>;
2423 const VU32 odd = Set(du32, 0xFFFF0000u); // bfloat16 is the upper half of f32
2424 // Avoid ZipLower/Upper so this also works on big-endian systems.
2425 const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
2426 const VU32 ao = And(BitCast(du32, a), odd);
2427 const VU32 be = ShiftLeft<16>(BitCast(du32, b));
2428 const VU32 bo = And(BitCast(du32, b), odd);
2429 sum1 = MulAdd(BitCast(df32, ao), BitCast(df32, bo), sum1);
2430 return MulAdd(BitCast(df32, ae), BitCast(df32, be), sum0);
2431}
2432
2433template <size_t N>
2434HWY_API Vec128<int32_t, N> ReorderWidenMulAccumulate(
2435 Simd<int32_t, N, 0> d32, Vec128<int16_t, 2 * N> a, Vec128<int16_t, 2 * N> b,
2436 const Vec128<int32_t, N> sum0, Vec128<int32_t, N>& sum1) {
2437 using VI32 = VFromD<decltype(d32)>;
2438 // Manual sign extension requires two shifts for even lanes.
2439 const VI32 ae = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, a)));
2440 const VI32 be = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, b)));
2441 const VI32 ao = ShiftRight<16>(BitCast(d32, a));
2442 const VI32 bo = ShiftRight<16>(BitCast(d32, b));
2443 sum1 = Add(Mul(ao, bo), sum1);
2444 return Add(Mul(ae, be), sum0);
2445}
2446
2447// ------------------------------ RearrangeToOddPlusEven
2448template <class VW>
2449HWY_API VW RearrangeToOddPlusEven(const VW sum0, const VW sum1) {
2450 return Add(sum0, sum1);
2451}
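
Sketch of the int16 overload together with RearrangeToOddPlusEven (hypothetical helper, illustrative values). Each int32 lane accumulates one even-indexed and one odd-indexed product, and the final Add combines them into a dot product:

void WidenMulAccumulateExample() {
  const Simd<int32_t, 2, 0> d32;
  const Simd<int16_t, 4, 0> d16;
  const auto a = Set(d16, int16_t{3});
  const auto b = Set(d16, int16_t{4});
  auto sum1 = Zero(d32);
  const auto sum0 = ReorderWidenMulAccumulate(d32, a, b, Zero(d32), sum1);
  const auto dot = RearrangeToOddPlusEven(sum0, sum1);  // each lane: 3*4 + 3*4
  (void)dot;
}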
2452
2453// ================================================== REDUCTIONS
2454
2455template <typename T, size_t N>
2456HWY_API Vec128<T, N> SumOfLanes(Simd<T, N, 0> d, const Vec128<T, N> v) {
2457 T sum = T{0};
2458 for (size_t i = 0; i < N; ++i) {
2459 sum += v.raw[i];
2460 }
2461 return Set(d, sum);
2462}
2463template <typename T, size_t N>
2464HWY_API Vec128<T, N> MinOfLanes(Simd<T, N, 0> d, const Vec128<T, N> v) {
2465 T min = HighestValue<T>();
2466 for (size_t i = 0; i < N; ++i) {
2467 min = HWY_MIN(min, v.raw[i]);
2468 }
2469 return Set(d, min);
2470}
2471template <typename T, size_t N>
2472HWY_API Vec128<T, N> MaxOfLanes(Simd<T, N, 0> d, const Vec128<T, N> v) {
2473 T max = LowestValue<T>();
2474 for (size_t i = 0; i < N; ++i) {
2475 max = HWY_MAX(max, v.raw[i]);
2476 }
2477 return Set(d, max);
2478}
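
Sketch of the reductions (hypothetical helper, illustrative values); each broadcasts its scalar result to all lanes, so GetLane extracts it:

void ReductionExample() {
  const Simd<int32_t, 4, 0> d;
  const auto v = Iota(d, 1);                      // {1, 2, 3, 4}
  const int32_t sum = GetLane(SumOfLanes(d, v));  // 10
  const int32_t mn = GetLane(MinOfLanes(d, v));   // 1
  const int32_t mx = GetLane(MaxOfLanes(d, v));   // 4
  (void)sum;
  (void)mn;
  (void)mx;
}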
2479
2480// ================================================== OPS WITH DEPENDENCIES
2481
2482// ------------------------------ MulEven/Odd 64x64 (UpperHalf)
2483
2484HWY_INLINE Vec128<uint64_t> MulEven(const Vec128<uint64_t> a,
2485 const Vec128<uint64_t> b) {
2486 alignas(16) uint64_t mul[2];
2487 mul[0] = Mul128(GetLane(a), GetLane(b), &mul[1]);
2488 return Load(Full128<uint64_t>(), mul);
2489}
2490
2491HWY_INLINE Vec128<uint64_t> MulOdd(const Vec128<uint64_t> a,
2492 const Vec128<uint64_t> b) {
2493 alignas(16) uint64_t mul[2];
2494 const Half<Full128<uint64_t>> d2;
2495 mul[0] =
2496 Mul128(GetLane(UpperHalf(d2, a)), GetLane(UpperHalf(d2, b)), &mul[1]);
2497 return Load(Full128<uint64_t>(), mul);
2498}
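
Sketch of the 64x64 full multiply (hypothetical helper, illustrative values): MulEven multiplies lane 0 of each input via Mul128, placing the low 64 bits of the product in lane 0 and the high 64 bits in lane 1:

void MulEvenU64Example() {
  const Full128<uint64_t> d;
  const auto a = Set(d, uint64_t{1} << 32);
  const auto b = Set(d, uint64_t{1} << 33);
  const auto p = MulEven(a, b);  // (2^32)*(2^33) = 2^65: lanes {0, 2}
  (void)p;
}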
2499
2500// NOLINTNEXTLINE(google-readability-namespace-comments)
2501} // namespace HWY_NAMESPACE
2502} // namespace hwy