From 861a6e85e106850583a98f2254def0c9510de091 Mon Sep 17 00:00:00 2001
From: Dean Li <deantvv@gmail.com>
Date: Sun, 28 Nov 2021 15:19:01 +0800
Subject: [PATCH 01/11] Add spectral_norm example from packed_simd

---
 crates/core_simd/examples/spectral_norm.rs | 77 ++++++++++++++++++++++
 1 file changed, 77 insertions(+)
 create mode 100644 crates/core_simd/examples/spectral_norm.rs
diff --git a/crates/core_simd/examples/spectral_norm.rs b/crates/core_simd/examples/spectral_norm.rs
new file mode 100644
index 00000000000..c515dad4dea
--- /dev/null
+++ b/crates/core_simd/examples/spectral_norm.rs
@@ -0,0 +1,77 @@
+#![feature(portable_simd)]
+
+use core_simd::simd::*;
+
+fn a(i: usize, j: usize) -> f64 {
+    ((i + j) * (i + j + 1) / 2 + i + 1) as f64
+}
+
+fn mult_av(v: &[f64], out: &mut [f64]) {
+    assert!(v.len() == out.len());
+    assert!(v.len() % 2 == 0);
+
+    for (i, out) in out.iter_mut().enumerate() {
+        let mut sum = f64x2::splat(0.0);
+
+        let mut j = 0;
+        while j < v.len() {
+            let b = f64x2::from_slice(&v[j..]);
+            let a = f64x2::from_array([a(i, j), a(i, j + 1)]);
+            sum += b / a;
+            j += 2
+        }
+        *out = sum.horizontal_sum();
+    }
+}
+
+fn mult_atv(v: &[f64], out: &mut [f64]) {
+    assert!(v.len() == out.len());
+    assert!(v.len() % 2 == 0);
+
+    for (i, out) in out.iter_mut().enumerate() {
+        let mut sum = f64x2::splat(0.0);
+
+        let mut j = 0;
+        while j < v.len() {
+            let b = f64x2::from_slice(&v[j..]);
+            let a = f64x2::from_array([a(j, i), a(j + 1, i)]);
+            sum += b / a;
+            j += 2
+        }
+        *out = sum.horizontal_sum();
+    }
+}
+
+fn mult_atav(v: &[f64], out: &mut [f64], tmp: &mut [f64]) {
+    mult_av(v, tmp);
+    mult_atv(tmp, out);
+}
+
+pub fn spectral_norm(n: usize) -> f64 {
+    assert!(n % 2 == 0, "only even lengths are accepted");
+
+    let mut u = vec![1.0; n];
+    let mut v = u.clone();
+    let mut tmp = u.clone();
+
+    for _ in 0..10 {
+        mult_atav(&u, &mut v, &mut tmp);
+        mult_atav(&v, &mut u, &mut tmp);
+    }
+    (dot(&u, &v) / dot(&v, &v)).sqrt()
+}
+
+fn dot(x: &[f64], y: &[f64]) -> f64 {
+    // This is auto-vectorized:
+    x.iter().zip(y).map(|(&x, &y)| x * y).sum()
+}
+
+#[cfg(test)]
+#[test]
+fn test() {
+    assert_eq!(&format!("{:.9}", spectral_norm(100)), "1.274219991");
+}
+
+fn main() {
+    // Empty main to make cargo happy
+}

From ebf65de2ce331f428a174f4ade13f60c8654472f Mon Sep 17 00:00:00 2001
From: Jubilee Young <workingjubilee@gmail.com>
Date: Wed, 2 Feb 2022 18:14:16 -0800
Subject: [PATCH 02/11] Delete outmoded fn round_from_int

---
 crates/core_simd/src/round.rs   | 7 -------
 crates/core_simd/tests/round.rs | 8 --------
 2 files changed, 15 deletions(-)

diff --git a/crates/core_simd/src/round.rs b/crates/core_simd/src/round.rs
index 06ccab3ec49..08339593600 100644
--- a/crates/core_simd/src/round.rs
+++ b/crates/core_simd/src/round.rs
@@ -22,13 +22,6 @@ macro_rules! implement {
             pub unsafe fn to_int_unchecked(self) -> Simd<$int_type, LANES> {
                 unsafe { intrinsics::simd_cast(self) }
             }
-
-            /// Creates a floating-point vector from an integer vector.  Rounds values that are
-            /// not exactly representable.
-            #[inline]
-            pub fn round_from_int(value: Simd<$int_type, LANES>) -> Self {
-                unsafe { intrinsics::simd_cast(value) }
-            }
         }
     }
 }
diff --git a/crates/core_simd/tests/round.rs b/crates/core_simd/tests/round.rs
index 1a1bc9ebca7..90547d5782c 100644
--- a/crates/core_simd/tests/round.rs
+++ b/crates/core_simd/tests/round.rs
@@ -53,14 +53,6 @@ macro_rules! float_rounding_test {
             }
 
             test_helpers::test_lanes! {
-                fn from_int<const LANES: usize>() {
-                    test_helpers::test_unary_elementwise(
-                        &Vector::<LANES>::round_from_int,
-                        &|x| x as Scalar,
-                        &|_| true,
-                    )
-                }
-
                 fn to_int_unchecked<const LANES: usize>() {
                     // The maximum integer that can be represented by the equivalently sized float has
                     // all of the mantissa digits set to 1, pushed up to the MSB.

From 4910274686bcd144228a04d8d4d5dece4c7f5e3d Mon Sep 17 00:00:00 2001
From: Jubilee Young <workingjubilee@gmail.com>
Date: Wed, 2 Feb 2022 18:21:12 -0800
Subject: [PATCH 03/11] Genericize to_int_unchecked

---
 crates/core_simd/src/lib.rs     |  1 +
 crates/core_simd/src/round.rs   | 15 ++++++++++-----
 crates/core_simd/tests/round.rs |  6 +++---
 3 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/crates/core_simd/src/lib.rs b/crates/core_simd/src/lib.rs
index 960a6640083..41f64e972d9 100644
--- a/crates/core_simd/src/lib.rs
+++ b/crates/core_simd/src/lib.rs
@@ -1,6 +1,7 @@
 #![cfg_attr(not(feature = "std"), no_std)]
 #![feature(
     const_fn_trait_bound,
+    convert_float_to_int,
     decl_macro,
     platform_intrinsics,
     repr_simd,
diff --git a/crates/core_simd/src/round.rs b/crates/core_simd/src/round.rs
index 08339593600..f1724cbc263 100644
--- a/crates/core_simd/src/round.rs
+++ b/crates/core_simd/src/round.rs
@@ -1,9 +1,10 @@
 use crate::simd::intrinsics;
-use crate::simd::{LaneCount, Simd, SupportedLaneCount};
+use crate::simd::{LaneCount, Simd, SimdElement, SupportedLaneCount};
+use core::convert::FloatToInt;
 
 macro_rules! implement {
     {
-        $type:ty, $int_type:ty
+        $type:ty
     } => {
         impl<const LANES: usize> Simd<$type, LANES>
         where
@@ -19,12 +20,16 @@ macro_rules! implement {
             /// * Not be infinite
             /// * Be representable in the return type, after truncating off its fractional part
             #[inline]
-            pub unsafe fn to_int_unchecked(self) -> Simd<$int_type, LANES> {
+            pub unsafe fn to_int_unchecked<I>(self) -> Simd<I, LANES>
+            where
+                $type: FloatToInt<I>,
+                I: SimdElement,
+            {
                 unsafe { intrinsics::simd_cast(self) }
             }
         }
     }
 }
 
-implement! { f32, i32 }
-implement! { f64, i64 }
+implement! { f32 }
+implement! { f64 }
diff --git a/crates/core_simd/tests/round.rs b/crates/core_simd/tests/round.rs
index 90547d5782c..53732329237 100644
--- a/crates/core_simd/tests/round.rs
+++ b/crates/core_simd/tests/round.rs
@@ -64,11 +64,11 @@ macro_rules! float_rounding_test {
                     runner.run(
                         &test_helpers::array::UniformArrayStrategy::new(-MAX_REPRESENTABLE_VALUE..MAX_REPRESENTABLE_VALUE),
                         |x| {
-                            let result_1 = unsafe { Vector::from_array(x).to_int_unchecked().to_array() };
+                            let result_1 = unsafe { Vector::from_array(x).to_int_unchecked::<IntScalar>().to_array() };
                             let result_2 = {
-                                let mut result = [0; LANES];
+                                let mut result: [IntScalar; LANES] = [0; LANES];
                                 for (i, o) in x.iter().zip(result.iter_mut()) {
-                                    *o = unsafe { i.to_int_unchecked() };
+                                    *o = unsafe { i.to_int_unchecked::<IntScalar>() };
                                 }
                                 result
                             };

From 672bfebfd89b7d7ebdac3dbcf714c6010430d5fc Mon Sep 17 00:00:00 2001
From: Jubilee Young <workingjubilee@gmail.com>
Date: Mon, 7 Feb 2022 21:24:21 -0800
Subject: [PATCH 04/11] Remove overflow panic from divrem

Includes some remarks in intrinsics.rs,
generated while auditing the interface for remaining UB.
---
 crates/core_simd/src/intrinsics.rs   |  9 +++++++
 crates/core_simd/src/ops.rs          | 39 +++++++++++++++++-----------
 crates/core_simd/tests/ops_macros.rs | 24 ++++++++---------
 3 files changed, 45 insertions(+), 27 deletions(-)

diff --git a/crates/core_simd/src/intrinsics.rs b/crates/core_simd/src/intrinsics.rs
index 233657202f7..b5d0df7548f 100644
--- a/crates/core_simd/src/intrinsics.rs
+++ b/crates/core_simd/src/intrinsics.rs
@@ -17,9 +17,15 @@ extern "platform-intrinsic" {
     pub(crate) fn simd_mul<T>(x: T, y: T) -> T;
 
     /// udiv/sdiv/fdiv
+    /// ints and uints: {s,u}div incur UB if division by zero occurs.
+    /// ints: sdiv is UB for int::MIN / -1.
+    /// floats: fdiv is never UB, but may create NaNs or infinities.
     pub(crate) fn simd_div<T>(x: T, y: T) -> T;
 
     /// urem/srem/frem
+    /// ints and uints: {s,u}rem incur UB if division by zero occurs.
+    /// ints: srem is UB for int::MIN / -1.
+    /// floats: frem is equivalent to libm::fmod in the "default" floating point environment, sans errno.
     pub(crate) fn simd_rem<T>(x: T, y: T) -> T;
 
     /// shl
@@ -45,6 +51,9 @@ extern "platform-intrinsic" {
     pub(crate) fn simd_as<T, U>(x: T) -> U;
 
     /// neg/fneg
+    /// ints: ultimately becomes a call to cg_ssa's BuilderMethods::neg. cg_llvm equates this to `simd_sub(Simd::splat(0), x)`.
+    /// floats: LLVM's fneg, which changes the floating point sign bit. Some arches have instructions for it.
+    /// Rust panics for Neg::neg(int::MIN) due to overflow, but it is not UB in LLVM without `nsw`.
     pub(crate) fn simd_neg<T>(x: T) -> T;
 
     /// fabs
diff --git a/crates/core_simd/src/ops.rs b/crates/core_simd/src/ops.rs
index b65038933bf..1b35b3e717a 100644
--- a/crates/core_simd/src/ops.rs
+++ b/crates/core_simd/src/ops.rs
@@ -57,29 +57,40 @@ macro_rules! wrap_bitshift {
     };
 }
 
-// Division by zero is poison, according to LLVM.
-// So is dividing the MIN value of a signed integer by -1,
-// since that would return MAX + 1.
-// FIXME: Rust allows <SInt>::MIN / -1,
-// so we should probably figure out how to make that safe.
+/// SAFETY: This macro must only be used to impl Div or Rem and given the matching intrinsic.
+/// It guards against LLVM's UB conditions for integer div or rem using masks and selects,
+/// thus guaranteeing a Rust value returns instead.
+///
+/// |                  | LLVM | Rust
+/// | :--------------: | :--- | :----------
+/// | N {/,%} 0        | UB   | panic!()
+/// | <$int>::MIN / -1 | UB   | <$int>::MIN
+/// | <$int>::MIN % -1 | UB   | 0
+///
 macro_rules! int_divrem_guard {
     (   $lhs:ident,
         $rhs:ident,
         {   const PANIC_ZERO: &'static str = $zero:literal;
-            const PANIC_OVERFLOW: &'static str = $overflow:literal;
             $simd_call:ident
         },
         $int:ident ) => {
         if $rhs.lanes_eq(Simd::splat(0)).any() {
             panic!($zero);
-        } else if <$int>::MIN != 0
-            && ($lhs.lanes_eq(Simd::splat(<$int>::MIN))
-                // type inference can break here, so cut an SInt to size
-                & $rhs.lanes_eq(Simd::splat(-1i64 as _))).any()
-        {
-            panic!($overflow);
         } else {
-            unsafe { $crate::simd::intrinsics::$simd_call($lhs, $rhs) }
+            // Prevent otherwise-UB overflow on the MIN / -1 case.
+            let rhs = if <$int>::MIN != 0 {
+                // This should, at worst, optimize to a few branchless logical ops
+                // Ideally, this entire conditional should evaporate
+                // Fire LLVM and implement those manually if it doesn't get the hint
+                ($lhs.lanes_eq(Simd::splat(<$int>::MIN))
+                // type inference can break here, so cut an SInt to size
+                & $rhs.lanes_eq(Simd::splat(-1i64 as _)))
+                .select(Simd::splat(1), $rhs)
+            } else {
+                // Nice base case to make it easy to const-fold away the other branch.
+                $rhs
+            };
+            unsafe { $crate::simd::intrinsics::$simd_call($lhs, rhs) }
         }
     };
 }
@@ -183,7 +194,6 @@ for_base_ops! {
     impl Div::div {
         int_divrem_guard {
             const PANIC_ZERO: &'static str = "attempt to divide by zero";
-            const PANIC_OVERFLOW: &'static str = "attempt to divide with overflow";
             simd_div
         }
     }
@@ -191,7 +201,6 @@ for_base_ops! {
     impl Rem::rem {
         int_divrem_guard {
             const PANIC_ZERO: &'static str = "attempt to calculate the remainder with a divisor of zero";
-            const PANIC_OVERFLOW: &'static str = "attempt to calculate the remainder with overflow";
             simd_rem
         }
     }
diff --git a/crates/core_simd/tests/ops_macros.rs b/crates/core_simd/tests/ops_macros.rs
index 4fb9de198ee..9ba66fb8dd9 100644
--- a/crates/core_simd/tests/ops_macros.rs
+++ b/crates/core_simd/tests/ops_macros.rs
@@ -210,15 +210,21 @@ macro_rules! impl_signed_tests {
                     )
                 }
 
+                fn div_min_may_overflow<const LANES: usize>() {
+                    let a = Vector::<LANES>::splat(Scalar::MIN);
+                    let b = Vector::<LANES>::splat(-1);
+                    assert_eq!(a / b, a / (b * b));
+                }
+
+                fn rem_min_may_overflow<const LANES: usize>() {
+                    let a = Vector::<LANES>::splat(Scalar::MIN);
+                    let b = Vector::<LANES>::splat(-1);
+                    assert_eq!(a % b, a % (b * b));
+                }
+
             }
 
             test_helpers::test_lanes_panic! {
-                fn div_min_overflow_panics<const LANES: usize>() {
-                    let a = Vector::<LANES>::splat(Scalar::MIN);
-                    let b = Vector::<LANES>::splat(-1);
-                    let _ = a / b;
-                }
-
                 fn div_by_all_zeros_panics<const LANES: usize>() {
                     let a = Vector::<LANES>::splat(42);
                     let b = Vector::<LANES>::splat(0);
@@ -232,12 +238,6 @@ macro_rules! impl_signed_tests {
                     let _ = a / b;
                 }
 
-                fn rem_min_overflow_panic<const LANES: usize>() {
-                    let a = Vector::<LANES>::splat(Scalar::MIN);
-                    let b = Vector::<LANES>::splat(-1);
-                    let _ = a % b;
-                }
-
                 fn rem_zero_panic<const LANES: usize>() {
                     let a = Vector::<LANES>::splat(42);
                     let b = Vector::<LANES>::splat(0);

From e628a2991c47a771340cc5f8f06826c918f79609 Mon Sep 17 00:00:00 2001
From: Jubilee Young <workingjubilee@gmail.com>
Date: Tue, 8 Feb 2022 14:15:45 -0800
Subject: [PATCH 05/11] Document Simd is Simd<Wrapping<T>, N>

and other quirks like panicking
and the equivalence to zipping and mapping binary ops
---
 crates/core_simd/src/vector.rs | 34 +++++++++++++++++++++++++++++++++-
 1 file changed, 33 insertions(+), 1 deletion(-)

diff --git a/crates/core_simd/src/vector.rs b/crates/core_simd/src/vector.rs
index b7ef7a56c73..0a2f681f66b 100644
--- a/crates/core_simd/src/vector.rs
+++ b/crates/core_simd/src/vector.rs
@@ -12,7 +12,39 @@ pub(crate) mod ptr;
 use crate::simd::intrinsics;
 use crate::simd::{LaneCount, Mask, MaskElement, SupportedLaneCount};
 
-/// A SIMD vector of `LANES` elements of type `T`.
+/// A SIMD vector of `LANES` elements of type `T`. `Simd<T, N>` has the same shape as [`[T; N]`](array), but operates like `T`.
+/// This type is commonly known by names like `f32x4` or `Vec4` in many programming languages.
+///
+/// Two vectors of the same type and length will, by convention, support the binary operations (+, *, etc.) that `T` does.
+/// These take the lanes at each index on the left-hand side and right-hand side, perform the binary operation,
+/// and return the result in the same lane in a vector of equal size. For a given operator, this is equivalent to zipping
+/// the two arrays together and mapping the operator over each lane.
+///
+/// ```rust
+/// # #![feature(array_zip, portable_simd)]
+/// # use core::simd::{Simd};
+/// let a0: [i32; 4] = [-2, 0, 2, 4];
+/// let a1 = [10, 9, 8, 7];
+/// let zm_add = a0.zip(a1).map(|(lhs, rhs)| lhs + rhs);
+/// let zm_mul = a0.zip(a1).map(|(lhs, rhs)| lhs * rhs);
+///
+/// // `Simd<T, N>` implements `From<[T; N]>
+/// let [v0, v1] = [a0, a1].map(|a| Simd::from(a));
+/// // Which means arrays implement `Into<Simd<T, N>>`.
+/// assert_eq!(v0 + v1, zm_add.into());
+/// assert_eq!(v0 * v1, zm_mul.into());
+/// ```
+///
+/// `Simd` with integers has the quirk that these operations are also inherently wrapping, as if `T` was [`Wrapping<T>`].
+/// Thus, `Simd` does not implement `wrapping_add`, because that is the behavior of the normal operation.
+/// This means there is no warning on overflows, even in "debug" builds.
+/// For most applications where `Simd` is appropriate, it is "not a bug" to wrap,
+/// and even "debug builds" are unlikely to tolerate the loss of performance.
+/// You may want to consider using explicitly checked arithmetic if such is required.
+/// Division by zero still causes a panic, so you may want to consider using floating point numbers if that is unacceptable.
+///
+/// [`Wrapping<T>`]: core::num::Wrapping
+///
 #[repr(simd)]
 pub struct Simd<T, const LANES: usize>([T; LANES])
 where

From 5d52455c65bf5a5eb258ed11591b8ebfa61ea5c7 Mon Sep 17 00:00:00 2001
From: Jubilee <46493976+workingjubilee@users.noreply.github.com>
Date: Tue, 8 Feb 2022 17:38:21 -0800
Subject: [PATCH 06/11] Review for clarity and concision

Co-authored-by: Caleb Zulawski <caleb.zulawski@gmail.com>
---
 crates/core_simd/src/vector.rs       | 9 ++++-----
 crates/core_simd/tests/ops_macros.rs | 4 ++--
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/crates/core_simd/src/vector.rs b/crates/core_simd/src/vector.rs
index 0a2f681f66b..5bd8ed69535 100644
--- a/crates/core_simd/src/vector.rs
+++ b/crates/core_simd/src/vector.rs
@@ -13,10 +13,9 @@ use crate::simd::intrinsics;
 use crate::simd::{LaneCount, Mask, MaskElement, SupportedLaneCount};
 
 /// A SIMD vector of `LANES` elements of type `T`. `Simd<T, N>` has the same shape as [`[T; N]`](array), but operates like `T`.
-/// This type is commonly known by names like `f32x4` or `Vec4` in many programming languages.
 ///
-/// Two vectors of the same type and length will, by convention, support the binary operations (+, *, etc.) that `T` does.
-/// These take the lanes at each index on the left-hand side and right-hand side, perform the binary operation,
+/// Two vectors of the same type and length will, by convention, support the operators (+, *, etc.) that `T` does.
+/// These take the lanes at each index on the left-hand side and right-hand side, perform the operation,
 /// and return the result in the same lane in a vector of equal size. For a given operator, this is equivalent to zipping
 /// the two arrays together and mapping the operator over each lane.
 ///
@@ -29,14 +28,14 @@ use crate::simd::{LaneCount, Mask, MaskElement, SupportedLaneCount};
 /// let zm_mul = a0.zip(a1).map(|(lhs, rhs)| lhs * rhs);
 ///
 /// // `Simd<T, N>` implements `From<[T; N]>
-/// let [v0, v1] = [a0, a1].map(|a| Simd::from(a));
+/// let (v0, v1) = (Simd::from(a0), Simd::from(a1));
 /// // Which means arrays implement `Into<Simd<T, N>>`.
 /// assert_eq!(v0 + v1, zm_add.into());
 /// assert_eq!(v0 * v1, zm_mul.into());
 /// ```
 ///
 /// `Simd` with integers has the quirk that these operations are also inherently wrapping, as if `T` was [`Wrapping<T>`].
-/// Thus, `Simd` does not implement `wrapping_add`, because that is the behavior of the normal operation.
+/// Thus, `Simd` does not implement `wrapping_add`, because that is the default behavior.
 /// This means there is no warning on overflows, even in "debug" builds.
 /// For most applications where `Simd` is appropriate, it is "not a bug" to wrap,
 /// and even "debug builds" are unlikely to tolerate the loss of performance.
diff --git a/crates/core_simd/tests/ops_macros.rs b/crates/core_simd/tests/ops_macros.rs
index 9ba66fb8dd9..50f7a4ca170 100644
--- a/crates/core_simd/tests/ops_macros.rs
+++ b/crates/core_simd/tests/ops_macros.rs
@@ -213,13 +213,13 @@ macro_rules! impl_signed_tests {
                 fn div_min_may_overflow<const LANES: usize>() {
                     let a = Vector::<LANES>::splat(Scalar::MIN);
                     let b = Vector::<LANES>::splat(-1);
-                    assert_eq!(a / b, a / (b * b));
+                    assert_eq!(a / b, a);
                 }
 
                 fn rem_min_may_overflow<const LANES: usize>() {
                     let a = Vector::<LANES>::splat(Scalar::MIN);
                     let b = Vector::<LANES>::splat(-1);
-                    assert_eq!(a % b, a % (b * b));
+                    assert_eq!(a % b, Vector::<LANES>::splat(0));
                 }
 
             }

From dddfffcfb3e49c752d6a28039735ada2552d0307 Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Fri, 5 Nov 2021 01:42:29 +0000
Subject: [PATCH 07/11] Add some safety comments

---
 crates/core_simd/src/comparisons.rs      | 12 ++++++++++++
 crates/core_simd/src/masks.rs            |  9 +++++++++
 crates/core_simd/src/masks/bitmask.rs    |  1 +
 crates/core_simd/src/masks/full_masks.rs |  6 ++++++
 crates/core_simd/src/math.rs             |  4 ++++
 crates/core_simd/src/reduction.rs        |  8 ++++++++
 crates/core_simd/src/swizzle.rs          |  2 ++
 crates/core_simd/src/to_bytes.rs         |  2 ++
 crates/core_simd/src/vector.rs           | 12 +++++++-----
 crates/core_simd/src/vector/ptr.rs       |  4 ++++
 crates/core_simd/src/vendor.rs           |  2 ++
 11 files changed, 57 insertions(+), 5 deletions(-)

diff --git a/crates/core_simd/src/comparisons.rs b/crates/core_simd/src/comparisons.rs
index edef5af3687..d024cf4ddbe 100644
--- a/crates/core_simd/src/comparisons.rs
+++ b/crates/core_simd/src/comparisons.rs
@@ -10,6 +10,8 @@ where
     #[inline]
     #[must_use = "method returns a new mask and does not mutate the original value"]
     pub fn lanes_eq(self, other: Self) -> Mask<T::Mask, LANES> {
+        // Safety: `self` is a vector, and the result of the comparison
+        // is always a valid mask.
         unsafe { Mask::from_int_unchecked(intrinsics::simd_eq(self, other)) }
     }
 
@@ -17,6 +19,8 @@ where
     #[inline]
     #[must_use = "method returns a new mask and does not mutate the original value"]
     pub fn lanes_ne(self, other: Self) -> Mask<T::Mask, LANES> {
+        // Safety: `self` is a vector, and the result of the comparison
+        // is always a valid mask.
         unsafe { Mask::from_int_unchecked(intrinsics::simd_ne(self, other)) }
     }
 }
@@ -30,6 +34,8 @@ where
     #[inline]
     #[must_use = "method returns a new mask and does not mutate the original value"]
     pub fn lanes_lt(self, other: Self) -> Mask<T::Mask, LANES> {
+        // Safety: `self` is a vector, and the result of the comparison
+        // is always a valid mask.
         unsafe { Mask::from_int_unchecked(intrinsics::simd_lt(self, other)) }
     }
 
@@ -37,6 +43,8 @@ where
     #[inline]
     #[must_use = "method returns a new mask and does not mutate the original value"]
     pub fn lanes_gt(self, other: Self) -> Mask<T::Mask, LANES> {
+        // Safety: `self` is a vector, and the result of the comparison
+        // is always a valid mask.
         unsafe { Mask::from_int_unchecked(intrinsics::simd_gt(self, other)) }
     }
 
@@ -44,6 +52,8 @@ where
     #[inline]
     #[must_use = "method returns a new mask and does not mutate the original value"]
     pub fn lanes_le(self, other: Self) -> Mask<T::Mask, LANES> {
+        // Safety: `self` is a vector, and the result of the comparison
+        // is always a valid mask.
         unsafe { Mask::from_int_unchecked(intrinsics::simd_le(self, other)) }
     }
 
@@ -51,6 +61,8 @@ where
     #[inline]
     #[must_use = "method returns a new mask and does not mutate the original value"]
     pub fn lanes_ge(self, other: Self) -> Mask<T::Mask, LANES> {
+        // Safety: `self` is a vector, and the result of the comparison
+        // is always a valid mask.
         unsafe { Mask::from_int_unchecked(intrinsics::simd_ge(self, other)) }
     }
 }
diff --git a/crates/core_simd/src/masks.rs b/crates/core_simd/src/masks.rs
index ae1fef53da8..b1f98d9eb4e 100644
--- a/crates/core_simd/src/masks.rs
+++ b/crates/core_simd/src/masks.rs
@@ -42,6 +42,9 @@ mod sealed {
 use sealed::Sealed;
 
 /// Marker trait for types that may be used as SIMD mask elements.
+///
+/// # Safety
+/// Type must be a signed integer.
 pub unsafe trait MaskElement: SimdElement + Sealed {}
 
 macro_rules! impl_element {
@@ -149,6 +152,7 @@ where
     #[inline]
     #[must_use = "method returns a new mask and does not mutate the original value"]
     pub unsafe fn from_int_unchecked(value: Simd<T, LANES>) -> Self {
+        // Safety: the caller must confirm this invariant
         unsafe { Self(mask_impl::Mask::from_int_unchecked(value)) }
     }
 
@@ -161,6 +165,7 @@ where
     #[must_use = "method returns a new mask and does not mutate the original value"]
     pub fn from_int(value: Simd<T, LANES>) -> Self {
         assert!(T::valid(value), "all values must be either 0 or -1",);
+        // Safety: the validity has been checked
         unsafe { Self::from_int_unchecked(value) }
     }
 
@@ -179,6 +184,7 @@ where
     #[inline]
     #[must_use = "method returns a new bool and does not mutate the original value"]
     pub unsafe fn test_unchecked(&self, lane: usize) -> bool {
+        // Safety: the caller must confirm this invariant
         unsafe { self.0.test_unchecked(lane) }
     }
 
@@ -190,6 +196,7 @@ where
     #[must_use = "method returns a new bool and does not mutate the original value"]
     pub fn test(&self, lane: usize) -> bool {
         assert!(lane < LANES, "lane index out of range");
+        // Safety: the lane index has been checked
         unsafe { self.test_unchecked(lane) }
     }
 
@@ -199,6 +206,7 @@ where
     /// `lane` must be less than `LANES`.
     #[inline]
     pub unsafe fn set_unchecked(&mut self, lane: usize, value: bool) {
+        // Safety: the caller must confirm this invariant
         unsafe {
             self.0.set_unchecked(lane, value);
         }
@@ -211,6 +219,7 @@ where
     #[inline]
     pub fn set(&mut self, lane: usize, value: bool) {
         assert!(lane < LANES, "lane index out of range");
+        // Safety: the lane index has been checked
         unsafe {
             self.set_unchecked(lane, value);
         }
diff --git a/crates/core_simd/src/masks/bitmask.rs b/crates/core_simd/src/masks/bitmask.rs
index b4217dc87ba..b7f8b2c236e 100644
--- a/crates/core_simd/src/masks/bitmask.rs
+++ b/crates/core_simd/src/masks/bitmask.rs
@@ -137,6 +137,7 @@ where
     where
         U: MaskElement,
     {
+        // Safety: bitmask layout does not depend on the element width
         unsafe { core::mem::transmute_copy(&self) }
     }
 
diff --git a/crates/core_simd/src/masks/full_masks.rs b/crates/core_simd/src/masks/full_masks.rs
index e5bb784bb91..02b5593c8f4 100644
--- a/crates/core_simd/src/masks/full_masks.rs
+++ b/crates/core_simd/src/masks/full_masks.rs
@@ -106,6 +106,7 @@ where
     where
         U: MaskElement,
     {
+        // Safety: masks are simply integer vectors of 0 and -1, and we can cast the element type.
         unsafe { Mask(intrinsics::simd_cast(self.0)) }
     }
 
@@ -155,12 +156,14 @@ where
     #[inline]
     #[must_use = "method returns a new bool and does not mutate the original value"]
     pub fn any(self) -> bool {
+        // Safety: use `self` as an integer vector
         unsafe { intrinsics::simd_reduce_any(self.to_int()) }
     }
 
     #[inline]
     #[must_use = "method returns a new vector and does not mutate the original value"]
     pub fn all(self) -> bool {
+        // Safety: use `self` as an integer vector
         unsafe { intrinsics::simd_reduce_all(self.to_int()) }
     }
 }
@@ -184,6 +187,7 @@ where
     #[inline]
     #[must_use = "method returns a new mask and does not mutate the original value"]
     fn bitand(self, rhs: Self) -> Self {
+        // Safety: `self` is an integer vector
         unsafe { Self(intrinsics::simd_and(self.0, rhs.0)) }
     }
 }
@@ -197,6 +201,7 @@ where
     #[inline]
     #[must_use = "method returns a new mask and does not mutate the original value"]
     fn bitor(self, rhs: Self) -> Self {
+        // Safety: `self` is an integer vector
         unsafe { Self(intrinsics::simd_or(self.0, rhs.0)) }
     }
 }
@@ -210,6 +215,7 @@ where
     #[inline]
     #[must_use = "method returns a new mask and does not mutate the original value"]
     fn bitxor(self, rhs: Self) -> Self {
+        // Safety: `self` is an integer vector
         unsafe { Self(intrinsics::simd_xor(self.0, rhs.0)) }
     }
 }
diff --git a/crates/core_simd/src/math.rs b/crates/core_simd/src/math.rs
index 7435b6df918..0b4e40983af 100644
--- a/crates/core_simd/src/math.rs
+++ b/crates/core_simd/src/math.rs
@@ -22,6 +22,7 @@ macro_rules! impl_uint_arith {
             /// ```
             #[inline]
             pub fn saturating_add(self, second: Self) -> Self {
+                // Safety: `self` is a vector
                 unsafe { simd_saturating_add(self, second) }
             }
 
@@ -41,6 +42,7 @@ macro_rules! impl_uint_arith {
             /// assert_eq!(sat, Simd::splat(0));
             #[inline]
             pub fn saturating_sub(self, second: Self) -> Self {
+                // Safety: `self` is a vector
                 unsafe { simd_saturating_sub(self, second) }
             }
         })+
@@ -68,6 +70,7 @@ macro_rules! impl_int_arith {
             /// ```
             #[inline]
             pub fn saturating_add(self, second: Self) -> Self {
+                // Safety: `self` is a vector
                 unsafe { simd_saturating_add(self, second) }
             }
 
@@ -87,6 +90,7 @@ macro_rules! impl_int_arith {
             /// assert_eq!(sat, Simd::from_array([MIN, MIN, MIN, 0]));
             #[inline]
             pub fn saturating_sub(self, second: Self) -> Self {
+                // Safety: `self` is a vector
                 unsafe { simd_saturating_sub(self, second) }
             }
 
diff --git a/crates/core_simd/src/reduction.rs b/crates/core_simd/src/reduction.rs
index e79a185816b..e1cd743e442 100644
--- a/crates/core_simd/src/reduction.rs
+++ b/crates/core_simd/src/reduction.rs
@@ -14,24 +14,28 @@ macro_rules! impl_integer_reductions {
             /// Horizontal wrapping add.  Returns the sum of the lanes of the vector, with wrapping addition.
             #[inline]
             pub fn horizontal_sum(self) -> $scalar {
+                // Safety: `self` is an integer vector
                 unsafe { simd_reduce_add_ordered(self, 0) }
             }
 
             /// Horizontal wrapping multiply.  Returns the product of the lanes of the vector, with wrapping multiplication.
             #[inline]
             pub fn horizontal_product(self) -> $scalar {
+                // Safety: `self` is an integer vector
                 unsafe { simd_reduce_mul_ordered(self, 1) }
             }
 
             /// Horizontal maximum.  Returns the maximum lane in the vector.
             #[inline]
             pub fn horizontal_max(self) -> $scalar {
+                // Safety: `self` is an integer vector
                 unsafe { simd_reduce_max(self) }
             }
 
             /// Horizontal minimum.  Returns the minimum lane in the vector.
             #[inline]
             pub fn horizontal_min(self) -> $scalar {
+                // Safety: `self` is an integer vector
                 unsafe { simd_reduce_min(self) }
             }
         }
@@ -63,6 +67,7 @@ macro_rules! impl_float_reductions {
                 if cfg!(all(target_arch = "x86", not(target_feature = "sse2"))) {
                     self.as_array().iter().sum()
                 } else {
+                    // Safety: `self` is a float vector
                     unsafe { simd_reduce_add_ordered(self, 0.) }
                 }
             }
@@ -74,6 +79,7 @@ macro_rules! impl_float_reductions {
                 if cfg!(all(target_arch = "x86", not(target_feature = "sse2"))) {
                     self.as_array().iter().product()
                 } else {
+                    // Safety: `self` is a float vector
                     unsafe { simd_reduce_mul_ordered(self, 1.) }
                 }
             }
@@ -84,6 +90,7 @@ macro_rules! impl_float_reductions {
             /// return either.  This function will not return `NaN` unless all lanes are `NaN`.
             #[inline]
             pub fn horizontal_max(self) -> $scalar {
+                // Safety: `self` is a float vector
                 unsafe { simd_reduce_max(self) }
             }
 
@@ -93,6 +100,7 @@ macro_rules! impl_float_reductions {
             /// return either.  This function will not return `NaN` unless all lanes are `NaN`.
             #[inline]
             pub fn horizontal_min(self) -> $scalar {
+                // Safety: `self` is a float vector
                 unsafe { simd_reduce_min(self) }
             }
         }
diff --git a/crates/core_simd/src/swizzle.rs b/crates/core_simd/src/swizzle.rs
index bdc489774a5..08b2add1166 100644
--- a/crates/core_simd/src/swizzle.rs
+++ b/crates/core_simd/src/swizzle.rs
@@ -95,6 +95,7 @@ pub trait Swizzle<const INPUT_LANES: usize, const OUTPUT_LANES: usize> {
         LaneCount<INPUT_LANES>: SupportedLaneCount,
         LaneCount<OUTPUT_LANES>: SupportedLaneCount,
     {
+        // Safety: `vector` is a vector, and `INDEX_IMPL` is a const array of u32.
         unsafe { intrinsics::simd_shuffle(vector, vector, Self::INDEX_IMPL) }
     }
 }
@@ -119,6 +120,7 @@ pub trait Swizzle2<const INPUT_LANES: usize, const OUTPUT_LANES: usize> {
         LaneCount<INPUT_LANES>: SupportedLaneCount,
         LaneCount<OUTPUT_LANES>: SupportedLaneCount,
     {
+        // Safety: `first` and `second` are vectors, and `INDEX_IMPL` is a const array of u32.
         unsafe { intrinsics::simd_shuffle(first, second, Self::INDEX_IMPL) }
     }
 }
diff --git a/crates/core_simd/src/to_bytes.rs b/crates/core_simd/src/to_bytes.rs
index 8d9b3e8ff85..b36b1a347b2 100644
--- a/crates/core_simd/src/to_bytes.rs
+++ b/crates/core_simd/src/to_bytes.rs
@@ -8,12 +8,14 @@ macro_rules! impl_to_bytes {
             /// Return the memory representation of this integer as a byte array in native byte
             /// order.
             pub fn to_ne_bytes(self) -> crate::simd::Simd<u8, {{ $size * LANES }}> {
+                // Safety: transmuting between vectors is safe
                 unsafe { core::mem::transmute_copy(&self) }
             }
 
             /// Create a native endian integer value from its memory representation as a byte array
             /// in native endianness.
             pub fn from_ne_bytes(bytes: crate::simd::Simd<u8, {{ $size * LANES }}>) -> Self {
+                // Safety: transmuting between vectors is safe
                 unsafe { core::mem::transmute_copy(&bytes) }
             }
         }
diff --git a/crates/core_simd/src/vector.rs b/crates/core_simd/src/vector.rs
index 5bd8ed69535..e452fa8bfc8 100644
--- a/crates/core_simd/src/vector.rs
+++ b/crates/core_simd/src/vector.rs
@@ -206,7 +206,7 @@ where
         or: Self,
     ) -> Self {
         let enable: Mask<isize, LANES> = enable & idxs.lanes_lt(Simd::splat(slice.len()));
-        // SAFETY: We have masked-off out-of-bounds lanes.
+        // Safety: We have masked-off out-of-bounds lanes.
         unsafe { Self::gather_select_unchecked(slice, enable, idxs, or) }
     }
 
@@ -247,7 +247,7 @@ where
         let base_ptr = crate::simd::ptr::SimdConstPtr::splat(slice.as_ptr());
         // Ferris forgive me, I have done pointer arithmetic here.
         let ptrs = base_ptr.wrapping_add(idxs);
-        // SAFETY: The ptrs have been bounds-masked to prevent memory-unsafe reads insha'allah
+        // Safety: The ptrs have been bounds-masked to prevent memory-unsafe reads insha'allah
         unsafe { intrinsics::simd_gather(or, ptrs, enable.to_int()) }
     }
 
@@ -299,7 +299,7 @@ where
         idxs: Simd<usize, LANES>,
     ) {
         let enable: Mask<isize, LANES> = enable & idxs.lanes_lt(Simd::splat(slice.len()));
-        // SAFETY: We have masked-off out-of-bounds lanes.
+        // Safety: We have masked-off out-of-bounds lanes.
         unsafe { self.scatter_select_unchecked(slice, enable, idxs) }
     }
 
@@ -338,7 +338,7 @@ where
         enable: Mask<isize, LANES>,
         idxs: Simd<usize, LANES>,
     ) {
-        // SAFETY: This block works with *mut T derived from &mut 'a [T],
+        // Safety: This block works with *mut T derived from &mut 'a [T],
         // which means it is delicate in Rust's borrowing model, circa 2021:
         // &mut 'a [T] asserts uniqueness, so deriving &'a [T] invalidates live *mut Ts!
         // Even though this block is largely safe methods, it must be exactly this way
@@ -518,7 +518,9 @@ mod sealed {
 use sealed::Sealed;
 
 /// Marker trait for types that may be used as SIMD vector elements.
-/// SAFETY: This trait, when implemented, asserts the compiler can monomorphize
+///
+/// # Safety
+/// This trait, when implemented, asserts the compiler can monomorphize
 /// `#[repr(simd)]` structs with the marked type as an element.
 /// Strictly, it is valid to impl if the vector will not be miscompiled.
 /// Practically, it is user-unfriendly to impl it if the vector won't compile,
diff --git a/crates/core_simd/src/vector/ptr.rs b/crates/core_simd/src/vector/ptr.rs
index c668d9a6eae..417d255c28d 100644
--- a/crates/core_simd/src/vector/ptr.rs
+++ b/crates/core_simd/src/vector/ptr.rs
@@ -21,6 +21,8 @@ where
     #[inline]
     #[must_use]
     pub fn wrapping_add(self, addend: Simd<usize, LANES>) -> Self {
+        // Safety: converting pointers to usize and vice-versa is safe
+        // (even if using that pointer is not)
         unsafe {
             let x: Simd<usize, LANES> = mem::transmute_copy(&self);
             mem::transmute_copy(&{ x + (addend * Simd::splat(mem::size_of::<T>())) })
@@ -47,6 +49,8 @@ where
     #[inline]
     #[must_use]
     pub fn wrapping_add(self, addend: Simd<usize, LANES>) -> Self {
+        // Safety: converting pointers to usize and vice-versa is safe
+        // (even if using that pointer is not)
         unsafe {
             let x: Simd<usize, LANES> = mem::transmute_copy(&self);
             mem::transmute_copy(&{ x + (addend * Simd::splat(mem::size_of::<T>())) })
diff --git a/crates/core_simd/src/vendor.rs b/crates/core_simd/src/vendor.rs
index e8ce7176b4f..9fb70218c95 100644
--- a/crates/core_simd/src/vendor.rs
+++ b/crates/core_simd/src/vendor.rs
@@ -9,6 +9,8 @@ macro_rules! from_transmute {
         impl core::convert::From<$from> for $to {
             #[inline]
             fn from(value: $from) -> $to {
+                // Safety: transmuting between vectors is safe, but the caller of this macro
+                // checks the invariants
                 unsafe { core::mem::transmute(value) }
             }
         }

From 78a18c3433c13bbd6259242185667c385eafe859 Mon Sep 17 00:00:00 2001
From: Jubilee <46493976+workingjubilee@users.noreply.github.com>
Date: Thu, 10 Feb 2022 09:32:44 -0800
Subject: [PATCH 08/11] rust-lang/portable-simd#245: Explain unsafe contracts
 of core::simd

* Explain unsafe contracts of core::simd

This permeates the module with remarks on safety for pub methods,
layout of the Simd type, correct use of intrinsics, et cetera.
This is mostly to help others curious about how core::simd works,
including other Rust contributors, `unsafe` library authors,
and eventually ourselves.
---
 crates/core_simd/src/intrinsics.rs | 59 ++++++++++++++++++++++++------
 crates/core_simd/src/lib.rs        |  1 +
 crates/core_simd/src/round.rs      |  5 +++
 crates/core_simd/src/select.rs     |  4 ++
 crates/core_simd/src/vector.rs     | 42 +++++++++++++++++++++
 5 files changed, 100 insertions(+), 11 deletions(-)

diff --git a/crates/core_simd/src/intrinsics.rs b/crates/core_simd/src/intrinsics.rs
index b5d0df7548f..e150946c705 100644
--- a/crates/core_simd/src/intrinsics.rs
+++ b/crates/core_simd/src/intrinsics.rs
@@ -2,16 +2,31 @@
 //! crate.
 //!
 //! The LLVM assembly language is documented here: <https://llvm.org/docs/LangRef.html>
+//!
+//! A quick glossary of jargon that may appear in this module, mostly paraphrasing LLVM's LangRef:
+//! - poison: "undefined behavior as a value". specifically, it is like uninit memory (such as padding bytes). it is "safe" to create poison, BUT
+//!   poison MUST NOT be observed from safe code, as operations on poison return poison, like NaN. unlike NaN, which has defined comparisons,
+//!   poison is neither true nor false, and LLVM may also convert it to undef (at which point it is both). so, it can't be conditioned on, either.
+//! - undef: "a value that is every value". functionally like poison, insofar as Rust is concerned. poison may become this. note:
+//!   this means that division by poison or undef is like division by zero, which means it inflicts...
+//! - "UB": poison and undef cover most of what people call "UB". "UB" means this operation immediately invalidates the program:
+//!   LLVM is allowed to lower it to `ud2` or other opcodes that may cause an illegal instruction exception, and this is the "good end".
+//!   The "bad end" is that LLVM may reverse time to the moment control flow diverged on a path towards undefined behavior,
+//!   and destroy the other branch, potentially deleting safe code and violating Rust's `unsafe` contract.
+//!
+//! Note that according to LLVM, vectors are not arrays, but they are equivalent when stored to and loaded from memory.
+//!
+//! Unless stated otherwise, all intrinsics for binary operations require SIMD vectors of equal types and lengths.
 
 /// These intrinsics aren't linked directly from LLVM and are mostly undocumented, however they are
-/// simply lowered to the matching LLVM instructions by the compiler.  The associated instruction
-/// is documented alongside each intrinsic.
+/// mostly lowered to the matching LLVM instructions by the compiler in a fairly straightforward manner.
+/// The associated LLVM instruction or intrinsic is documented alongside each Rust intrinsic function.
 extern "platform-intrinsic" {
     /// add/fadd
     pub(crate) fn simd_add<T>(x: T, y: T) -> T;
 
     /// sub/fsub
-    pub(crate) fn simd_sub<T>(x: T, y: T) -> T;
+    pub(crate) fn simd_sub<T>(lhs: T, rhs: T) -> T;
 
     /// mul/fmul
     pub(crate) fn simd_mul<T>(x: T, y: T) -> T;
@@ -20,19 +35,22 @@ extern "platform-intrinsic" {
     /// ints and uints: {s,u}div incur UB if division by zero occurs.
     /// ints: sdiv is UB for int::MIN / -1.
     /// floats: fdiv is never UB, but may create NaNs or infinities.
-    pub(crate) fn simd_div<T>(x: T, y: T) -> T;
+    pub(crate) fn simd_div<T>(lhs: T, rhs: T) -> T;
 
     /// urem/srem/frem
     /// ints and uints: {s,u}rem incur UB if division by zero occurs.
     /// ints: srem is UB for int::MIN / -1.
     /// floats: frem is equivalent to libm::fmod in the "default" floating point environment, sans errno.
-    pub(crate) fn simd_rem<T>(x: T, y: T) -> T;
+    pub(crate) fn simd_rem<T>(lhs: T, rhs: T) -> T;
 
     /// shl
-    pub(crate) fn simd_shl<T>(x: T, y: T) -> T;
+    /// for (u)ints. poison if rhs >= lhs::BITS
+    pub(crate) fn simd_shl<T>(lhs: T, rhs: T) -> T;
 
-    /// lshr/ashr
-    pub(crate) fn simd_shr<T>(x: T, y: T) -> T;
+    /// ints: ashr
+    /// uints: lshr
+    /// poison if rhs >= lhs::BITS
+    pub(crate) fn simd_shr<T>(lhs: T, rhs: T) -> T;
 
     /// and
     pub(crate) fn simd_and<T>(x: T, y: T) -> T;
@@ -44,6 +62,9 @@ extern "platform-intrinsic" {
     pub(crate) fn simd_xor<T>(x: T, y: T) -> T;
 
     /// fptoui/fptosi/uitofp/sitofp
+    /// casting floats to integers is truncating, so it is safe to convert values like e.g. 1.5
+    /// but the truncated value must fit in the target type or the result is poison.
+    /// use `simd_as` instead for a cast that performs a saturating conversion.
     pub(crate) fn simd_cast<T, U>(x: T) -> U;
     /// follows Rust's `T as U` semantics, including saturating float casts
     /// which amounts to the same as `simd_cast` for many cases
@@ -63,6 +84,7 @@ extern "platform-intrinsic" {
     pub(crate) fn simd_fmin<T>(x: T, y: T) -> T;
     pub(crate) fn simd_fmax<T>(x: T, y: T) -> T;
 
+    // these return Simd<int, N> with the same BITS size as the inputs
     pub(crate) fn simd_eq<T, U>(x: T, y: T) -> U;
     pub(crate) fn simd_ne<T, U>(x: T, y: T) -> U;
     pub(crate) fn simd_lt<T, U>(x: T, y: T) -> U;
@@ -71,19 +93,31 @@ extern "platform-intrinsic" {
     pub(crate) fn simd_ge<T, U>(x: T, y: T) -> U;
 
     // shufflevector
+    // idx: LLVM calls it a "shuffle mask vector constant", a vector of i32s
     pub(crate) fn simd_shuffle<T, U, V>(x: T, y: T, idx: U) -> V;
 
+    /// llvm.masked.gather
+    /// like a loop of pointer reads
+    /// val: vector of values to select if a lane is masked
+    /// ptr: vector of pointers to read from
+    /// mask: a "wide" mask of integers, selects as if simd_select(mask, read(ptr), val)
+    /// note, the LLVM intrinsic accepts a mask vector of <N x i1>
+    /// FIXME: review this if/when we fix up our mask story in general?
     pub(crate) fn simd_gather<T, U, V>(val: T, ptr: U, mask: V) -> T;
+    /// llvm.masked.scatter
+    /// like gather, but more spicy, as it writes instead of reads
     pub(crate) fn simd_scatter<T, U, V>(val: T, ptr: U, mask: V);
 
     // {s,u}add.sat
     pub(crate) fn simd_saturating_add<T>(x: T, y: T) -> T;
 
     // {s,u}sub.sat
-    pub(crate) fn simd_saturating_sub<T>(x: T, y: T) -> T;
+    pub(crate) fn simd_saturating_sub<T>(lhs: T, rhs: T) -> T;
 
     // reductions
+    // llvm.vector.reduce.{add,fadd}
     pub(crate) fn simd_reduce_add_ordered<T, U>(x: T, y: U) -> U;
+    // llvm.vector.reduce.{mul,fmul}
     pub(crate) fn simd_reduce_mul_ordered<T, U>(x: T, y: U) -> U;
     #[allow(unused)]
     pub(crate) fn simd_reduce_all<T>(x: T) -> bool;
@@ -100,7 +134,10 @@ extern "platform-intrinsic" {
     pub(crate) fn simd_bitmask<T, U>(x: T) -> U;
 
     // select
-    pub(crate) fn simd_select<M, T>(m: M, a: T, b: T) -> T;
+    // first argument is a vector of integers, -1 (all bits 1) is "true"
+    // logically equivalent to (yes & m) | (no & (m^-1),
+    // but you can use it on floats.
+    pub(crate) fn simd_select<M, T>(m: M, yes: T, no: T) -> T;
     #[allow(unused)]
-    pub(crate) fn simd_select_bitmask<M, T>(m: M, a: T, b: T) -> T;
+    pub(crate) fn simd_select_bitmask<M, T>(m: M, yes: T, no: T) -> T;
 }
diff --git a/crates/core_simd/src/lib.rs b/crates/core_simd/src/lib.rs
index 41f64e972d9..91ae34c05e0 100644
--- a/crates/core_simd/src/lib.rs
+++ b/crates/core_simd/src/lib.rs
@@ -3,6 +3,7 @@
     const_fn_trait_bound,
     convert_float_to_int,
     decl_macro,
+    intra_doc_pointers,
     platform_intrinsics,
     repr_simd,
     simd_ffi,
diff --git a/crates/core_simd/src/round.rs b/crates/core_simd/src/round.rs
index f1724cbc263..556bc2cc1fe 100644
--- a/crates/core_simd/src/round.rs
+++ b/crates/core_simd/src/round.rs
@@ -19,6 +19,11 @@ macro_rules! implement {
             /// * Not be NaN
             /// * Not be infinite
             /// * Be representable in the return type, after truncating off its fractional part
+            ///
+            /// If these requirements are infeasible or costly, consider using the safe function [cast],
+            /// which saturates on conversion.
+            ///
+            /// [cast]: Simd::cast
             #[inline]
             pub unsafe fn to_int_unchecked<I>(self) -> Simd<I, LANES>
             where
diff --git a/crates/core_simd/src/select.rs b/crates/core_simd/src/select.rs
index 8d521057fbd..3acf07260e1 100644
--- a/crates/core_simd/src/select.rs
+++ b/crates/core_simd/src/select.rs
@@ -11,6 +11,7 @@ where
     /// For each lane in the mask, choose the corresponding lane from `true_values` if
     /// that lane mask is true, and `false_values` if that lane mask is false.
     ///
+    /// # Examples
     /// ```
     /// # #![feature(portable_simd)]
     /// # #[cfg(feature = "std")] use core_simd::{Simd, Mask};
@@ -31,6 +32,8 @@ where
     where
         U: SimdElement<Mask = T>,
     {
+        // Safety: The mask has been cast to a vector of integers,
+        // and the operands to select between are vectors of the same type and length.
         unsafe { intrinsics::simd_select(self.to_int(), true_values, false_values) }
     }
 
@@ -39,6 +42,7 @@ where
     /// For each lane in the mask, choose the corresponding lane from `true_values` if
     /// that lane mask is true, and `false_values` if that lane mask is false.
     ///
+    /// # Examples
     /// ```
     /// # #![feature(portable_simd)]
     /// # #[cfg(feature = "std")] use core_simd::Mask;
diff --git a/crates/core_simd/src/vector.rs b/crates/core_simd/src/vector.rs
index e452fa8bfc8..ff1b2c756ad 100644
--- a/crates/core_simd/src/vector.rs
+++ b/crates/core_simd/src/vector.rs
@@ -44,6 +44,47 @@ use crate::simd::{LaneCount, Mask, MaskElement, SupportedLaneCount};
 ///
 /// [`Wrapping<T>`]: core::num::Wrapping
 ///
+/// # Layout
+/// `Simd<T, N>` has a layout similar to `[T; N]` (identical "shapes"), but with a greater alignment.
+/// `[T; N]` is aligned to `T`, but `Simd<T, N>` will have an alignment based on both `T` and `N`.
+/// It is thus sound to [`transmute`] `Simd<T, N>` to `[T; N]`, and will typically optimize to zero cost,
+/// but the reverse transmutation is more likely to require a copy the compiler cannot simply elide.
+///
+/// # ABI "Features"
+/// Due to Rust's safety guarantees, `Simd<T, N>` is currently passed to and from functions via memory, not SIMD registers,
+/// except as an optimization. `#[inline]` hints are recommended on functions that accept `Simd<T, N>` or return it.
+/// The need for this may be corrected in the future.
+///
+/// # Safe SIMD with Unsafe Rust
+///
+/// Operations with `Simd` are typically safe, but there are many reasons to want to combine SIMD with `unsafe` code.
+/// Care must be taken to respect differences between `Simd` and other types it may be transformed into or derived from.
+/// In particular, the layout of `Simd<T, N>` may be similar to `[T; N]`, and may allow some transmutations,
+/// but references to `[T; N]` are not interchangeable with those to `Simd<T, N>`.
+/// Thus, when using `unsafe` Rust to read and write `Simd<T, N>` through [raw pointers], it is a good idea to first try with
+/// [`read_unaligned`] and [`write_unaligned`]. This is because:
+/// - [`read`] and [`write`] require full alignment (in this case, `Simd<T, N>`'s alignment)
+/// - the likely source for reading or destination for writing `Simd<T, N>` is [`[T]`](slice) and similar types, aligned to `T`
+/// - combining these actions would violate the `unsafe` contract and explode the program into a puff of **undefined behavior**
+/// - the compiler can implicitly adjust layouts to make unaligned reads or writes fully aligned if it sees the optimization
+/// - most contemporary processors suffer no performance penalty for "unaligned" reads and writes that are aligned at runtime
+///
+/// By imposing less obligations, unaligned functions are less likely to make the program unsound,
+/// and may be just as fast as stricter alternatives.
+/// When trying to guarantee alignment, [`[T]::as_simd`][as_simd] is an option for converting `[T]` to `[Simd<T, N>]`,
+/// and allows soundly operating on an aligned SIMD body, but it may cost more time when handling the scalar head and tail.
+/// If these are not sufficient, then it is most ideal to design data structures to be already aligned
+/// to the `Simd<T, N>` you wish to use before using `unsafe` Rust to read or write.
+/// More conventional ways to compensate for these facts, like materializing `Simd` to or from an array first,
+/// are handled by safe methods like [`Simd::from_array`] and [`Simd::from_slice`].
+///
+/// [`transmute`]: core::mem::transmute
+/// [raw pointers]: pointer
+/// [`read_unaligned`]: pointer::read_unaligned
+/// [`write_unaligned`]: pointer::write_unaligned
+/// [`read`]: pointer::read
+/// [`write`]: pointer::write
+/// [as_simd]: slice::as_simd
 #[repr(simd)]
 pub struct Simd<T, const LANES: usize>([T; LANES])
 where
@@ -133,6 +174,7 @@ where
     #[inline]
     #[cfg(not(bootstrap))]
     pub fn cast<U: SimdElement>(self) -> Simd<U, LANES> {
+        // Safety: The input argument is a vector of a known SIMD type.
         unsafe { intrinsics::simd_as(self) }
     }
 

From 842ac87747c4a6f8002ada6bab04d97320d206fc Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Thu, 13 Jan 2022 21:20:17 -0500
Subject: [PATCH 09/11] Use bitmask trait

---
 crates/core_simd/src/masks.rs            | 22 ++-----
 crates/core_simd/src/masks/bitmask.rs    | 12 +---
 crates/core_simd/src/masks/full_masks.rs | 35 ++---------
 crates/core_simd/src/masks/to_bitmask.rs | 78 ++++++++++++++++++++++++
 crates/core_simd/tests/masks.rs          |  6 +-
 5 files changed, 93 insertions(+), 60 deletions(-)
 create mode 100644 crates/core_simd/src/masks/to_bitmask.rs

diff --git a/crates/core_simd/src/masks.rs b/crates/core_simd/src/masks.rs
index ae1fef53da8..22514728ffa 100644
--- a/crates/core_simd/src/masks.rs
+++ b/crates/core_simd/src/masks.rs
@@ -12,8 +12,10 @@
 )]
 mod mask_impl;
 
-use crate::simd::intrinsics;
-use crate::simd::{LaneCount, Simd, SimdElement, SupportedLaneCount};
+mod to_bitmask;
+pub use to_bitmask::ToBitMask;
+
+use crate::simd::{intrinsics, LaneCount, Simd, SimdElement, SupportedLaneCount};
 use core::cmp::Ordering;
 use core::{fmt, mem};
 
@@ -216,22 +218,6 @@ where
         }
     }
 
-    /// Convert this mask to a bitmask, with one bit set per lane.
-    #[cfg(feature = "generic_const_exprs")]
-    #[inline]
-    #[must_use = "method returns a new array and does not mutate the original value"]
-    pub fn to_bitmask(self) -> [u8; LaneCount::<LANES>::BITMASK_LEN] {
-        self.0.to_bitmask()
-    }
-
-    /// Convert a bitmask to a mask.
-    #[cfg(feature = "generic_const_exprs")]
-    #[inline]
-    #[must_use = "method returns a new mask and does not mutate the original value"]
-    pub fn from_bitmask(bitmask: [u8; LaneCount::<LANES>::BITMASK_LEN]) -> Self {
-        Self(mask_impl::Mask::from_bitmask(bitmask))
-    }
-
     /// Returns true if any lane is set, or false otherwise.
     #[inline]
     #[must_use = "method returns a new bool and does not mutate the original value"]
diff --git a/crates/core_simd/src/masks/bitmask.rs b/crates/core_simd/src/masks/bitmask.rs
index b4217dc87ba..f20f83ecb38 100644
--- a/crates/core_simd/src/masks/bitmask.rs
+++ b/crates/core_simd/src/masks/bitmask.rs
@@ -115,20 +115,14 @@ where
         unsafe { Self(intrinsics::simd_bitmask(value), PhantomData) }
     }
 
-    #[cfg(feature = "generic_const_exprs")]
     #[inline]
-    #[must_use = "method returns a new array and does not mutate the original value"]
-    pub fn to_bitmask(self) -> [u8; LaneCount::<LANES>::BITMASK_LEN] {
-        // Safety: these are the same type and we are laundering the generic
+    pub unsafe fn to_bitmask_intrinsic<U>(self) -> U {
         unsafe { core::mem::transmute_copy(&self.0) }
     }
 
-    #[cfg(feature = "generic_const_exprs")]
     #[inline]
-    #[must_use = "method returns a new mask and does not mutate the original value"]
-    pub fn from_bitmask(bitmask: [u8; LaneCount::<LANES>::BITMASK_LEN]) -> Self {
-        // Safety: these are the same type and we are laundering the generic
-        Self(unsafe { core::mem::transmute_copy(&bitmask) }, PhantomData)
+    pub unsafe fn from_bitmask_intrinsic<U>(bitmask: U) -> Self {
+        unsafe { Self(core::mem::transmute_copy(&bitmask), PhantomData) }
     }
 
     #[inline]
diff --git a/crates/core_simd/src/masks/full_masks.rs b/crates/core_simd/src/masks/full_masks.rs
index e5bb784bb91..b20b0a4b708 100644
--- a/crates/core_simd/src/masks/full_masks.rs
+++ b/crates/core_simd/src/masks/full_masks.rs
@@ -109,41 +109,16 @@ where
         unsafe { Mask(intrinsics::simd_cast(self.0)) }
     }
 
-    #[cfg(feature = "generic_const_exprs")]
     #[inline]
-    #[must_use = "method returns a new array and does not mutate the original value"]
-    pub fn to_bitmask(self) -> [u8; LaneCount::<LANES>::BITMASK_LEN] {
-        unsafe {
-            let mut bitmask: [u8; LaneCount::<LANES>::BITMASK_LEN] =
-                intrinsics::simd_bitmask(self.0);
-
-            // There is a bug where LLVM appears to implement this operation with the wrong
-            // bit order.
-            // TODO fix this in a better way
-            if cfg!(target_endian = "big") {
-                for x in bitmask.as_mut() {
-                    *x = x.reverse_bits();
-                }
-            }
-
-            bitmask
-        }
+    pub unsafe fn to_bitmask_intrinsic<U>(self) -> U {
+        // Safety: caller must only return bitmask types
+        unsafe { intrinsics::simd_bitmask(self.0) }
     }
 
-    #[cfg(feature = "generic_const_exprs")]
     #[inline]
-    #[must_use = "method returns a new mask and does not mutate the original value"]
-    pub fn from_bitmask(mut bitmask: [u8; LaneCount::<LANES>::BITMASK_LEN]) -> Self {
+    pub unsafe fn from_bitmask_intrinsic<U>(bitmask: U) -> Self {
+        // Safety: caller must only pass bitmask types
         unsafe {
-            // There is a bug where LLVM appears to implement this operation with the wrong
-            // bit order.
-            // TODO fix this in a better way
-            if cfg!(target_endian = "big") {
-                for x in bitmask.as_mut() {
-                    *x = x.reverse_bits();
-                }
-            }
-
             Self::from_int_unchecked(intrinsics::simd_select_bitmask(
                 bitmask,
                 Self::splat(true).to_int(),
diff --git a/crates/core_simd/src/masks/to_bitmask.rs b/crates/core_simd/src/masks/to_bitmask.rs
new file mode 100644
index 00000000000..3a9f89f19eb
--- /dev/null
+++ b/crates/core_simd/src/masks/to_bitmask.rs
@@ -0,0 +1,78 @@
+use super::{mask_impl, Mask, MaskElement};
+
+/// Converts masks to and from bitmasks.
+///
+/// In a bitmask, each bit represents if the corresponding lane in the mask is set.
+pub trait ToBitMask<BitMask> {
+    /// Converts a mask to a bitmask.
+    fn to_bitmask(self) -> BitMask;
+
+    /// Converts a bitmask to a mask.
+    fn from_bitmask(bitmask: BitMask) -> Self;
+}
+
+macro_rules! impl_integer_intrinsic {
+    { $(unsafe impl ToBitMask<$int:ty> for Mask<_, $lanes:literal>)* } => {
+        $(
+        impl<T: MaskElement> ToBitMask<$int> for Mask<T, $lanes> {
+            fn to_bitmask(self) -> $int {
+                unsafe { self.0.to_bitmask_intrinsic() }
+            }
+
+            fn from_bitmask(bitmask: $int) -> Self {
+                unsafe { Self(mask_impl::Mask::from_bitmask_intrinsic(bitmask)) }
+            }
+        }
+        )*
+    }
+}
+
+impl_integer_intrinsic! {
+    unsafe impl ToBitMask<u8> for Mask<_, 8>
+    unsafe impl ToBitMask<u16> for Mask<_, 16>
+    unsafe impl ToBitMask<u32> for Mask<_, 32>
+    unsafe impl ToBitMask<u64> for Mask<_, 64>
+}
+
+macro_rules! impl_integer_via {
+    { $(impl ToBitMask<$int:ty, via $via:ty> for Mask<_, $lanes:literal>)* } => {
+        $(
+        impl<T: MaskElement> ToBitMask<$int> for Mask<T, $lanes> {
+            fn to_bitmask(self) -> $int {
+                let bitmask: $via = self.to_bitmask();
+                bitmask as _
+            }
+
+            fn from_bitmask(bitmask: $int) -> Self {
+                Self::from_bitmask(bitmask as $via)
+            }
+        }
+        )*
+    }
+}
+
+impl_integer_via! {
+    impl ToBitMask<u16, via u8> for Mask<_, 8>
+    impl ToBitMask<u32, via u8> for Mask<_, 8>
+    impl ToBitMask<u64, via u8> for Mask<_, 8>
+
+    impl ToBitMask<u32, via u16> for Mask<_, 16>
+    impl ToBitMask<u64, via u16> for Mask<_, 16>
+
+    impl ToBitMask<u64, via u32> for Mask<_, 32>
+}
+
+#[cfg(target_pointer_width = "32")]
+impl_integer_via! {
+    impl ToBitMask<usize, via u8> for Mask<_, 8>
+    impl ToBitMask<usize, via u16> for Mask<_, 16>
+    impl ToBitMask<usize, via u32> for Mask<_, 32>
+}
+
+#[cfg(target_pointer_width = "64")]
+impl_integer_via! {
+    impl ToBitMask<usize, via u8> for Mask<_, 8>
+    impl ToBitMask<usize, via u16> for Mask<_, 16>
+    impl ToBitMask<usize, via u32> for Mask<_, 32>
+    impl ToBitMask<usize, via u64> for Mask<_, 64>
+}
diff --git a/crates/core_simd/tests/masks.rs b/crates/core_simd/tests/masks.rs
index 6a8ecd33a73..965c0fa2635 100644
--- a/crates/core_simd/tests/masks.rs
+++ b/crates/core_simd/tests/masks.rs
@@ -68,16 +68,16 @@ macro_rules! test_mask_api {
                 assert_eq!(core_simd::Mask::<$type, 8>::from_int(int), mask);
             }
 
-            #[cfg(feature = "generic_const_exprs")]
             #[test]
             fn roundtrip_bitmask_conversion() {
+                use core_simd::ToBitMask;
                 let values = [
                     true, false, false, true, false, false, true, false,
                     true, true, false, false, false, false, false, true,
                 ];
                 let mask = core_simd::Mask::<$type, 16>::from_array(values);
-                let bitmask = mask.to_bitmask();
-                assert_eq!(bitmask, [0b01001001, 0b10000011]);
+                let bitmask: u16 = mask.to_bitmask();
+                assert_eq!(bitmask, 0b1000001101001001);
                 assert_eq!(core_simd::Mask::<$type, 16>::from_bitmask(bitmask), mask);
             }
         }

From 11c3eefa3594055192612d0d6f844e764dcbda15 Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Sun, 6 Feb 2022 03:25:27 +0000
Subject: [PATCH 10/11] Manually implement for supported lanes

---
 crates/core_simd/src/masks/bitmask.rs    |  5 +-
 crates/core_simd/src/masks/full_masks.rs |  6 +-
 crates/core_simd/src/masks/to_bitmask.rs | 88 +++++++++---------------
 crates/core_simd/tests/masks.rs          |  2 +-
 4 files changed, 40 insertions(+), 61 deletions(-)

diff --git a/crates/core_simd/src/masks/bitmask.rs b/crates/core_simd/src/masks/bitmask.rs
index f20f83ecb38..e27b2689606 100644
--- a/crates/core_simd/src/masks/bitmask.rs
+++ b/crates/core_simd/src/masks/bitmask.rs
@@ -116,12 +116,13 @@ where
     }
 
     #[inline]
-    pub unsafe fn to_bitmask_intrinsic<U>(self) -> U {
+    pub unsafe fn to_bitmask_integer<U>(self) -> U {
         unsafe { core::mem::transmute_copy(&self.0) }
     }
 
+    // Safety: U must be the integer with the exact number of bits required to hold the bitmask for
     #[inline]
-    pub unsafe fn from_bitmask_intrinsic<U>(bitmask: U) -> Self {
+    pub unsafe fn from_bitmask_integer<U>(bitmask: U) -> Self {
         unsafe { Self(core::mem::transmute_copy(&bitmask), PhantomData) }
     }
 
diff --git a/crates/core_simd/src/masks/full_masks.rs b/crates/core_simd/src/masks/full_masks.rs
index b20b0a4b708..90af486a887 100644
--- a/crates/core_simd/src/masks/full_masks.rs
+++ b/crates/core_simd/src/masks/full_masks.rs
@@ -110,13 +110,15 @@ where
     }
 
     #[inline]
-    pub unsafe fn to_bitmask_intrinsic<U>(self) -> U {
+    pub unsafe fn to_bitmask_integer<U>(self) -> U {
         // Safety: caller must only return bitmask types
         unsafe { intrinsics::simd_bitmask(self.0) }
     }
 
+    // Safety: U must be the integer with the exact number of bits required to hold the bitmask for
+    // this mask
     #[inline]
-    pub unsafe fn from_bitmask_intrinsic<U>(bitmask: U) -> Self {
+    pub unsafe fn from_bitmask_integer<U>(bitmask: U) -> Self {
         // Safety: caller must only pass bitmask types
         unsafe {
             Self::from_int_unchecked(intrinsics::simd_select_bitmask(
diff --git a/crates/core_simd/src/masks/to_bitmask.rs b/crates/core_simd/src/masks/to_bitmask.rs
index 3a9f89f19eb..86143f2331f 100644
--- a/crates/core_simd/src/masks/to_bitmask.rs
+++ b/crates/core_simd/src/masks/to_bitmask.rs
@@ -1,26 +1,45 @@
 use super::{mask_impl, Mask, MaskElement};
 
-/// Converts masks to and from bitmasks.
+/// Converts masks to and from integer bitmasks.
 ///
-/// In a bitmask, each bit represents if the corresponding lane in the mask is set.
-pub trait ToBitMask<BitMask> {
+/// Each bit of the bitmask corresponds to a mask lane, starting with the LSB.
+pub trait ToBitMask {
+    /// The integer bitmask type.
+    type BitMask;
+
     /// Converts a mask to a bitmask.
-    fn to_bitmask(self) -> BitMask;
+    fn to_bitmask(self) -> Self::BitMask;
 
     /// Converts a bitmask to a mask.
-    fn from_bitmask(bitmask: BitMask) -> Self;
+    fn from_bitmask(bitmask: Self::BitMask) -> Self;
+}
+
+/// Converts masks to and from byte array bitmasks.
+///
+/// Each bit of the bitmask corresponds to a mask lane, starting with the LSB of the first byte.
+pub trait ToBitMaskArray {
+    /// The length of the bitmask array.
+    const BYTES: usize;
+
+    /// Converts a mask to a bitmask.
+    fn to_bitmask_array(self) -> [u8; Self::BYTES];
+
+    /// Converts a bitmask to a mask.
+    fn from_bitmask_array(bitmask: [u8; Self::BYTES]) -> Self;
 }
 
 macro_rules! impl_integer_intrinsic {
-    { $(unsafe impl ToBitMask<$int:ty> for Mask<_, $lanes:literal>)* } => {
+    { $(unsafe impl ToBitMask<BitMask=$int:ty> for Mask<_, $lanes:literal>)* } => {
         $(
-        impl<T: MaskElement> ToBitMask<$int> for Mask<T, $lanes> {
+        impl<T: MaskElement> ToBitMask for Mask<T, $lanes> {
+            type BitMask = $int;
+
             fn to_bitmask(self) -> $int {
-                unsafe { self.0.to_bitmask_intrinsic() }
+                unsafe { self.0.to_bitmask_integer() }
             }
 
             fn from_bitmask(bitmask: $int) -> Self {
-                unsafe { Self(mask_impl::Mask::from_bitmask_intrinsic(bitmask)) }
+                unsafe { Self(mask_impl::Mask::from_bitmask_integer(bitmask)) }
             }
         }
         )*
@@ -28,51 +47,8 @@ macro_rules! impl_integer_intrinsic {
 }
 
 impl_integer_intrinsic! {
-    unsafe impl ToBitMask<u8> for Mask<_, 8>
-    unsafe impl ToBitMask<u16> for Mask<_, 16>
-    unsafe impl ToBitMask<u32> for Mask<_, 32>
-    unsafe impl ToBitMask<u64> for Mask<_, 64>
-}
-
-macro_rules! impl_integer_via {
-    { $(impl ToBitMask<$int:ty, via $via:ty> for Mask<_, $lanes:literal>)* } => {
-        $(
-        impl<T: MaskElement> ToBitMask<$int> for Mask<T, $lanes> {
-            fn to_bitmask(self) -> $int {
-                let bitmask: $via = self.to_bitmask();
-                bitmask as _
-            }
-
-            fn from_bitmask(bitmask: $int) -> Self {
-                Self::from_bitmask(bitmask as $via)
-            }
-        }
-        )*
-    }
-}
-
-impl_integer_via! {
-    impl ToBitMask<u16, via u8> for Mask<_, 8>
-    impl ToBitMask<u32, via u8> for Mask<_, 8>
-    impl ToBitMask<u64, via u8> for Mask<_, 8>
-
-    impl ToBitMask<u32, via u16> for Mask<_, 16>
-    impl ToBitMask<u64, via u16> for Mask<_, 16>
-
-    impl ToBitMask<u64, via u32> for Mask<_, 32>
-}
-
-#[cfg(target_pointer_width = "32")]
-impl_integer_via! {
-    impl ToBitMask<usize, via u8> for Mask<_, 8>
-    impl ToBitMask<usize, via u16> for Mask<_, 16>
-    impl ToBitMask<usize, via u32> for Mask<_, 32>
-}
-
-#[cfg(target_pointer_width = "64")]
-impl_integer_via! {
-    impl ToBitMask<usize, via u8> for Mask<_, 8>
-    impl ToBitMask<usize, via u16> for Mask<_, 16>
-    impl ToBitMask<usize, via u32> for Mask<_, 32>
-    impl ToBitMask<usize, via u64> for Mask<_, 64>
+    unsafe impl ToBitMask<BitMask=u8> for Mask<_, 8>
+    unsafe impl ToBitMask<BitMask=u16> for Mask<_, 16>
+    unsafe impl ToBitMask<BitMask=u32> for Mask<_, 32>
+    unsafe impl ToBitMask<BitMask=u64> for Mask<_, 64>
 }
diff --git a/crates/core_simd/tests/masks.rs b/crates/core_simd/tests/masks.rs
index 965c0fa2635..3aec36ca7b7 100644
--- a/crates/core_simd/tests/masks.rs
+++ b/crates/core_simd/tests/masks.rs
@@ -76,7 +76,7 @@ macro_rules! test_mask_api {
                     true, true, false, false, false, false, false, true,
                 ];
                 let mask = core_simd::Mask::<$type, 16>::from_array(values);
-                let bitmask: u16 = mask.to_bitmask();
+                let bitmask = mask.to_bitmask();
                 assert_eq!(bitmask, 0b1000001101001001);
                 assert_eq!(core_simd::Mask::<$type, 16>::from_bitmask(bitmask), mask);
             }

From 20fa4b76235afb6a2ad543781a10a14e8013b143 Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Wed, 9 Feb 2022 04:54:05 +0000
Subject: [PATCH 11/11] Make internal mask implementation safe

---
 crates/core_simd/src/masks/bitmask.rs    | 15 +++++--
 crates/core_simd/src/masks/full_masks.rs | 51 ++++++++++++++++++++----
 crates/core_simd/src/masks/to_bitmask.rs | 39 +++++++++---------
 3 files changed, 75 insertions(+), 30 deletions(-)

diff --git a/crates/core_simd/src/masks/bitmask.rs b/crates/core_simd/src/masks/bitmask.rs
index e27b2689606..7bf2add2036 100644
--- a/crates/core_simd/src/masks/bitmask.rs
+++ b/crates/core_simd/src/masks/bitmask.rs
@@ -1,7 +1,7 @@
 #![allow(unused_imports)]
 use super::MaskElement;
 use crate::simd::intrinsics;
-use crate::simd::{LaneCount, Simd, SupportedLaneCount};
+use crate::simd::{LaneCount, Simd, SupportedLaneCount, ToBitMask};
 use core::marker::PhantomData;
 
 /// A mask where each lane is represented by a single bit.
@@ -116,13 +116,20 @@ where
     }
 
     #[inline]
-    pub unsafe fn to_bitmask_integer<U>(self) -> U {
+    pub fn to_bitmask_integer<U>(self) -> U
+    where
+        super::Mask<T, LANES>: ToBitMask<BitMask = U>,
+    {
+        // Safety: these are the same types
         unsafe { core::mem::transmute_copy(&self.0) }
     }
 
-    // Safety: U must be the integer with the exact number of bits required to hold the bitmask for
     #[inline]
-    pub unsafe fn from_bitmask_integer<U>(bitmask: U) -> Self {
+    pub fn from_bitmask_integer<U>(bitmask: U) -> Self
+    where
+        super::Mask<T, LANES>: ToBitMask<BitMask = U>,
+    {
+        // Safety: these are the same types
         unsafe { Self(core::mem::transmute_copy(&bitmask), PhantomData) }
     }
 
diff --git a/crates/core_simd/src/masks/full_masks.rs b/crates/core_simd/src/masks/full_masks.rs
index 90af486a887..848997a0792 100644
--- a/crates/core_simd/src/masks/full_masks.rs
+++ b/crates/core_simd/src/masks/full_masks.rs
@@ -2,7 +2,7 @@
 
 use super::MaskElement;
 use crate::simd::intrinsics;
-use crate::simd::{LaneCount, Simd, SupportedLaneCount};
+use crate::simd::{LaneCount, Simd, SupportedLaneCount, ToBitMask};
 
 #[repr(transparent)]
 pub struct Mask<T, const LANES: usize>(Simd<T, LANES>)
@@ -66,6 +66,23 @@ where
     }
 }
 
+// Used for bitmask bit order workaround
+pub(crate) trait ReverseBits {
+    fn reverse_bits(self) -> Self;
+}
+
+macro_rules! impl_reverse_bits {
+    { $($int:ty),* } => {
+        $(
+        impl ReverseBits for $int {
+            fn reverse_bits(self) -> Self { <$int>::reverse_bits(self) }
+        }
+        )*
+    }
+}
+
+impl_reverse_bits! { u8, u16, u32, u64 }
+
 impl<T, const LANES: usize> Mask<T, LANES>
 where
     T: MaskElement,
@@ -110,16 +127,34 @@ where
     }
 
     #[inline]
-    pub unsafe fn to_bitmask_integer<U>(self) -> U {
-        // Safety: caller must only return bitmask types
-        unsafe { intrinsics::simd_bitmask(self.0) }
+    pub(crate) fn to_bitmask_integer<U: ReverseBits>(self) -> U
+    where
+        super::Mask<T, LANES>: ToBitMask<BitMask = U>,
+    {
+        // Safety: U is required to be the appropriate bitmask type
+        let bitmask: U = unsafe { intrinsics::simd_bitmask(self.0) };
+
+        // LLVM assumes bit order should match endianness
+        if cfg!(target_endian = "big") {
+            bitmask.reverse_bits()
+        } else {
+            bitmask
+        }
     }
 
-    // Safety: U must be the integer with the exact number of bits required to hold the bitmask for
-    // this mask
     #[inline]
-    pub unsafe fn from_bitmask_integer<U>(bitmask: U) -> Self {
-        // Safety: caller must only pass bitmask types
+    pub(crate) fn from_bitmask_integer<U: ReverseBits>(bitmask: U) -> Self
+    where
+        super::Mask<T, LANES>: ToBitMask<BitMask = U>,
+    {
+        // LLVM assumes bit order should match endianness
+        let bitmask = if cfg!(target_endian = "big") {
+            bitmask.reverse_bits()
+        } else {
+            bitmask
+        };
+
+        // Safety: U is required to be the appropriate bitmask type
         unsafe {
             Self::from_int_unchecked(intrinsics::simd_select_bitmask(
                 bitmask,
diff --git a/crates/core_simd/src/masks/to_bitmask.rs b/crates/core_simd/src/masks/to_bitmask.rs
index 86143f2331f..1c2037764c1 100644
--- a/crates/core_simd/src/masks/to_bitmask.rs
+++ b/crates/core_simd/src/masks/to_bitmask.rs
@@ -1,9 +1,26 @@
 use super::{mask_impl, Mask, MaskElement};
+use crate::simd::{LaneCount, SupportedLaneCount};
+
+mod sealed {
+    pub trait Sealed {}
+}
+pub use sealed::Sealed;
+
+impl<T, const LANES: usize> Sealed for Mask<T, LANES>
+where
+    T: MaskElement,
+    LaneCount<LANES>: SupportedLaneCount,
+{
+}
 
 /// Converts masks to and from integer bitmasks.
 ///
 /// Each bit of the bitmask corresponds to a mask lane, starting with the LSB.
-pub trait ToBitMask {
+///
+/// # Safety
+/// This trait is `unsafe` and sealed, since the `BitMask` type must match the number of lanes in
+/// the mask.
+pub unsafe trait ToBitMask: Sealed {
     /// The integer bitmask type.
     type BitMask;
 
@@ -14,32 +31,18 @@ pub trait ToBitMask {
     fn from_bitmask(bitmask: Self::BitMask) -> Self;
 }
 
-/// Converts masks to and from byte array bitmasks.
-///
-/// Each bit of the bitmask corresponds to a mask lane, starting with the LSB of the first byte.
-pub trait ToBitMaskArray {
-    /// The length of the bitmask array.
-    const BYTES: usize;
-
-    /// Converts a mask to a bitmask.
-    fn to_bitmask_array(self) -> [u8; Self::BYTES];
-
-    /// Converts a bitmask to a mask.
-    fn from_bitmask_array(bitmask: [u8; Self::BYTES]) -> Self;
-}
-
 macro_rules! impl_integer_intrinsic {
     { $(unsafe impl ToBitMask<BitMask=$int:ty> for Mask<_, $lanes:literal>)* } => {
         $(
-        impl<T: MaskElement> ToBitMask for Mask<T, $lanes> {
+        unsafe impl<T: MaskElement> ToBitMask for Mask<T, $lanes> {
             type BitMask = $int;
 
             fn to_bitmask(self) -> $int {
-                unsafe { self.0.to_bitmask_integer() }
+                self.0.to_bitmask_integer()
             }
 
             fn from_bitmask(bitmask: $int) -> Self {
-                unsafe { Self(mask_impl::Mask::from_bitmask_integer(bitmask)) }
+                Self(mask_impl::Mask::from_bitmask_integer(bitmask))
             }
         }
         )*