Allow `transmute`s to produce `OperandValue`s instead of always using `alloca`s

LLVM can usually optimize these away, but especially for things like transmutes of newtypes it's silly to generate the `alloca`+`store`+`load` at all when it's actually a nop at the LLVM level.
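For example (an illustrative case of mine, not a file from this diff), a newtype transmute like the following used to go through a stack slot even though nothing about the value changes:

#[repr(transparent)]
pub struct Wrapper(u64);

// Before this change: alloca + store + load. After it: the value flows
// through as a single SSA value, which is a nop for LLVM.
pub unsafe fn wrap(x: u64) -> Wrapper {
    std::mem::transmute(x)
}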
Scott McMurray 2023-04-01 01:46:36 -07:00
parent 480068c235
commit 9aa9a846b6
6 changed files with 381 additions and 74 deletions


@ -23,10 +23,26 @@ pub enum OperandValue<V> {
/// to be valid for the operand's lifetime.
/// The second value, if any, is the extra data (vtable or length)
/// which indicates that it refers to an unsized rvalue.
///
/// An `OperandValue` has this variant for types which are neither
/// `Immediate` nor `Pair`s. The backend value in this variant must be a
/// pointer to the *non*-immediate backend type. That pointee type is the
/// one returned by [`LayoutTypeMethods::backend_type`].
Ref(V, Option<V>, Align),
/// A single LLVM immediate value.
///
/// An `OperandValue` *must* be this variant for any type for which
/// [`LayoutTypeMethods::is_backend_immediate`] returns `true`.
/// The backend value in this variant must be the *immediate* backend type,
/// as returned by [`LayoutTypeMethods::immediate_backend_type`].
Immediate(V),
/// A pair of immediate LLVM values. Used by fat pointers too.
///
/// An `OperandValue` *must* be this variant for any type for which
/// [`LayoutTypeMethods::is_backend_scalar_pair`] returns `true`.
/// The backend values in this variant must be the *immediate* backend types,
/// as returned by [`LayoutTypeMethods::scalar_pair_element_backend_type`]
/// with `immediate: true`.
Pair(V, V),
}
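As a rough gloss (my annotation, not part of the commit), typical Rust types land in these variants as follows:

// Sketch: which `OperandValue` variant typical types use.
fn variant_examples(
    a: u32,      // Abi::Scalar       -> OperandValue::Immediate
    b: &[u8],    // Abi::ScalarPair   -> OperandValue::Pair (data pointer, length)
    c: [u8; 32], // no immediate form -> OperandValue::Ref (pointer + align)
) {
    let _ = (a, b, c);
}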


@ -10,7 +10,7 @@ use crate::MemFlags;
use rustc_middle::mir;
use rustc_middle::mir::Operand;
use rustc_middle::ty::cast::{CastTy, IntTy};
use rustc_middle::ty::layout::{HasTyCtxt, LayoutOf, TyAndLayout};
use rustc_middle::ty::{self, adjustment::PointerCast, Instance, Ty, TyCtxt};
use rustc_span::source_map::{Span, DUMMY_SP};
use rustc_target::abi::{self, FIRST_VARIANT};
@ -159,8 +159,8 @@ impl<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>> FunctionCx<'a, 'tcx, Bx> {
debug_assert!(dst.layout.is_sized());
if src.layout.size != dst.layout.size
|| src.layout.abi.is_uninhabited()
|| dst.layout.abi.is_uninhabited()
{
// In all of these cases it's UB to run this transmute, but that's
// known statically so might as well trap for it, rather than just
@ -169,22 +169,20 @@ impl<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>> FunctionCx<'a, 'tcx, Bx> {
return;
}
if let Some(val) = self.codegen_transmute_operand(bx, src, dst.layout) {
val.store(bx, dst);
return;
}
match src.val {
OperandValue::Ref(..) => {
span_bug!(
self.mir.span,
"Operand path should have handled transmute \
from `Ref` {src:?} to place {dst:?}"
);
}
OperandValue::Immediate(..) | OperandValue::Pair(..) => {
// When we have immediate(s), the alignment of the source is irrelevant,
// so we can store them using the destination's alignment.
let llty = bx.backend_type(src.layout);
@ -194,6 +192,94 @@ impl<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>> FunctionCx<'a, 'tcx, Bx> {
}
}
/// Attempts to transmute an `OperandValue` to another `OperandValue`.
///
/// Returns `None` for cases that can't work in that framework, such as for
/// `Immediate`->`Ref` that needs an `alloca` to get the location.
fn codegen_transmute_operand(
&mut self,
bx: &mut Bx,
operand: OperandRef<'tcx, Bx::Value>,
cast: TyAndLayout<'tcx>,
) -> Option<OperandValue<Bx::Value>> {
// Callers already checked that the layout sizes match
debug_assert_eq!(operand.layout.size, cast.size);
let operand_kind = self.value_kind(operand.layout);
let cast_kind = self.value_kind(cast);
match operand.val {
OperandValue::Ref(ptr, meta, align) => {
debug_assert_eq!(meta, None);
debug_assert!(matches!(operand_kind, OperandValueKind::Ref));
let cast_bty = bx.backend_type(cast);
let cast_ptr = bx.pointercast(ptr, bx.type_ptr_to(cast_bty));
let fake_place = PlaceRef::new_sized_aligned(cast_ptr, cast, align);
Some(bx.load_operand(fake_place).val)
}
OperandValue::Immediate(imm) => {
let OperandValueKind::Immediate(in_scalar) = operand_kind else {
bug!("Found {operand_kind:?} for operand {operand:?}");
};
if let OperandValueKind::Immediate(out_scalar) = cast_kind {
let cast_bty = bx.backend_type(cast);
Some(OperandValue::Immediate(Self::transmute_immediate(
bx, imm, in_scalar, out_scalar, cast_bty,
)))
} else {
None
}
}
OperandValue::Pair(imm_a, imm_b) => {
let OperandValueKind::Pair(in_a, in_b) = operand_kind else {
bug!("Found {operand_kind:?} for operand {operand:?}");
};
if let OperandValueKind::Pair(out_a, out_b) = cast_kind {
let out_a_ibty = bx.scalar_pair_element_backend_type(cast, 0, false);
let out_b_ibty = bx.scalar_pair_element_backend_type(cast, 1, false);
Some(OperandValue::Pair(
Self::transmute_immediate(bx, imm_a, in_a, out_a, out_a_ibty),
Self::transmute_immediate(bx, imm_b, in_b, out_b, out_b_ibty),
))
} else {
None
}
}
}
}
/// Transmutes one of the immediates from an [`OperandValue::Immediate`]
/// or an [`OperandValue::Pair`] to an immediate of the target type.
///
/// `to_backend_ty` must be the *non*-immediate backend type (so it will be
/// `i8`, not `i1`, for `bool`-like types).
fn transmute_immediate(
bx: &mut Bx,
mut imm: Bx::Value,
from_scalar: abi::Scalar,
to_scalar: abi::Scalar,
to_backend_ty: Bx::Type,
) -> Bx::Value {
use abi::Primitive::*;
imm = bx.from_immediate(imm);
imm = match (from_scalar.primitive(), to_scalar.primitive()) {
(Int(..) | F32 | F64, Int(..) | F32 | F64) => bx.bitcast(imm, to_backend_ty),
(Pointer(..), Pointer(..)) => bx.pointercast(imm, to_backend_ty),
(Int(..), Pointer(..)) => bx.inttoptr(imm, to_backend_ty),
(Pointer(..), Int(..)) => bx.ptrtoint(imm, to_backend_ty),
(F32 | F64, Pointer(..)) => {
let int_imm = bx.bitcast(imm, bx.cx().type_isize());
bx.inttoptr(int_imm, to_backend_ty)
}
(Pointer(..), F32 | F64) => {
let int_imm = bx.ptrtoint(imm, bx.cx().type_isize());
bx.bitcast(int_imm, to_backend_ty)
}
};
imm = bx.to_immediate_scalar(imm, to_scalar);
imm
}
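To make the conversion arms above concrete, here is an illustrative set of register-to-register transmutes (my example, not from the commit) with the instruction each one lowers to:

use std::mem::transmute;

pub unsafe fn immediate_examples(f: f32, p: *const u8, b: bool) -> (u32, usize, u8) {
    let bits: u32 = transmute(f);   // (F32, Int)     -> a single `bitcast`
    let addr: usize = transmute(p); // (Pointer, Int) -> a single `ptrtoint`
    let byte: u8 = transmute(b);    // `bool` is `i1` in a register, so
                                    // `from_immediate` first emits `zext i1 -> i8`
    (bits, addr, byte)
}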
pub fn codegen_rvalue_unsized(
&mut self,
bx: &mut Bx,
@ -396,7 +482,9 @@ impl<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>> FunctionCx<'a, 'tcx, Bx> {
OperandValue::Immediate(newval)
}
mir::CastKind::Transmute => {
bug!("Transmute operand {:?} in `codegen_rvalue_operand`", operand);
self.codegen_transmute_operand(bx, operand, cast).unwrap_or_else(|| {
bug!("Unsupported transmute-as-operand of {operand:?} to {cast:?}");
})
}
};
OperandRef { val, layout: cast }
@ -739,10 +827,36 @@ impl<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>> FunctionCx<'a, 'tcx, Bx> {
impl<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>> FunctionCx<'a, 'tcx, Bx> {
pub fn rvalue_creates_operand(&self, rvalue: &mir::Rvalue<'tcx>, span: Span) -> bool {
match *rvalue {
mir::Rvalue::Cast(mir::CastKind::Transmute, ref operand, cast_ty) => {
let operand_ty = operand.ty(self.mir, self.cx.tcx());
let cast_layout = self.cx.layout_of(self.monomorphize(cast_ty));
let operand_layout = self.cx.layout_of(self.monomorphize(operand_ty));
if operand_layout.size != cast_layout.size
|| operand_layout.abi.is_uninhabited()
|| cast_layout.abi.is_uninhabited()
{
// Send UB cases to the full form so the operand version can
// `bitcast` without worrying about malformed IR.
return false;
}
match (self.value_kind(operand_layout), self.value_kind(cast_layout)) {
// Can always load from a pointer as needed
(OperandValueKind::Ref, _) => true,
// Need to generate an `alloca` to get a pointer from an immediate
(OperandValueKind::Immediate(..) | OperandValueKind::Pair(..), OperandValueKind::Ref) => false,
// When we have scalar immediates, we can convert them as needed
(OperandValueKind::Immediate(..), OperandValueKind::Immediate(..)) |
(OperandValueKind::Pair(..), OperandValueKind::Pair(..)) => true,
// Send mixings between scalars and pairs through the memory route
// FIXME: Maybe this could use insertvalue/extractvalue instead?
(OperandValueKind::Immediate(..), OperandValueKind::Pair(..)) |
(OperandValueKind::Pair(..), OperandValueKind::Immediate(..)) => false,
}
}
mir::Rvalue::Ref(..) |
mir::Rvalue::CopyForDeref(..) |
mir::Rvalue::AddressOf(..) |
@ -767,4 +881,30 @@ impl<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>> FunctionCx<'a, 'tcx, Bx> {
// (*) this is only true if the type is suitable
}
/// Gets which variant of [`OperandValue`] is expected for a particular type.
fn value_kind(&self, layout: TyAndLayout<'tcx>) -> OperandValueKind {
if self.cx.is_backend_immediate(layout) {
debug_assert!(!self.cx.is_backend_scalar_pair(layout));
OperandValueKind::Immediate(match layout.abi {
abi::Abi::Scalar(s) => s,
abi::Abi::Vector { element, .. } => element,
x => bug!("Couldn't translate {x:?} as backend immediate"),
})
} else if self.cx.is_backend_scalar_pair(layout) {
let abi::Abi::ScalarPair(s1, s2) = layout.abi else {
bug!("Couldn't translate {:?} as backend scalar pair", layout.abi)
};
OperandValueKind::Pair(s1, s2)
} else {
OperandValueKind::Ref
}
}
}
#[derive(Debug, Copy, Clone)]
enum OperandValueKind {
Ref,
Immediate(abi::Scalar),
Pair(abi::Scalar, abi::Scalar),
}
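An illustrative sampling (my example, not part of the diff) of what that decision logic means for concrete transmutes:

use std::mem::transmute;

pub unsafe fn operand_examples(a: [u8; 4], b: u32, c: (u8, bool)) {
    let _: u32 = transmute(a);        // Ref -> Immediate: a load suffices, so `true`
    let _: [u8; 4] = transmute(b);    // Immediate -> Ref: would need an `alloca`, so `false`
    let _: f32 = transmute(b);        // Immediate -> Immediate: `true`
    let _: (bool, i8) = transmute(c); // Pair -> Pair: `true`
}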


@ -100,11 +100,22 @@ pub trait DerivedTypeMethods<'tcx>: BaseTypeMethods<'tcx> + MiscMethods<'tcx> {
impl<'tcx, T> DerivedTypeMethods<'tcx> for T where Self: BaseTypeMethods<'tcx> + MiscMethods<'tcx> {}
pub trait LayoutTypeMethods<'tcx>: Backend<'tcx> {
/// The backend type used for a rust type when it's in memory,
/// such as when it's stack-allocated or when it's being loaded or stored.
fn backend_type(&self, layout: TyAndLayout<'tcx>) -> Self::Type;
fn cast_backend_type(&self, ty: &CastTarget) -> Self::Type;
fn fn_decl_backend_type(&self, fn_abi: &FnAbi<'tcx, Ty<'tcx>>) -> Self::Type;
fn fn_ptr_backend_type(&self, fn_abi: &FnAbi<'tcx, Ty<'tcx>>) -> Self::Type;
fn reg_backend_type(&self, ty: &Reg) -> Self::Type;
/// The backend type used for a rust type when it's in an SSA register.
///
/// For nearly all types this is the same as [`Self::backend_type`]; however,
/// `bool` (and other `0`-or-`1` values) are kept as [`BaseTypeMethods::type_i1`]
/// in registers but as [`BaseTypeMethods::type_i8`] in memory.
///
/// Converting values between the two different backend types is done using
/// [`from_immediate`](super::BuilderMethods::from_immediate) and
/// [`to_immediate_scalar`](super::BuilderMethods::to_immediate_scalar).
fn immediate_backend_type(&self, layout: TyAndLayout<'tcx>) -> Self::Type;
fn is_backend_immediate(&self, layout: TyAndLayout<'tcx>) -> bool;
fn is_backend_scalar_pair(&self, layout: TyAndLayout<'tcx>) -> bool;
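As a sketch of the `bool` split these docs describe (a hypothetical illustration of mine, not compiler code):

// In memory `bool` is `i8`; in SSA registers it is `i1`.
pub fn bool_roundtrip(b: bool) -> bool {
    let in_memory: u8 = b as u8; // store side: `zext i1 -> i8` (`from_immediate`)
    (in_memory & 1) != 0         // load side: `trunc i8 -> i1` (`to_immediate_scalar`)
}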


@ -6,6 +6,7 @@
#![feature(core_intrinsics)]
#![feature(custom_mir)]
#![feature(inline_const)]
#![allow(unreachable_code)]
use std::mem::transmute;
@ -24,6 +25,9 @@ pub struct Scalar64(i64);
#[repr(C, align(4))]
pub struct Aggregate64(u16, u8, i8, f32);
#[repr(C)]
pub struct Aggregate8(u8);
// CHECK-LABEL: @check_bigger_size(
#[no_mangle]
#[custom_mir(dialect = "runtime", phase = "initial")]
@ -76,23 +80,80 @@ pub unsafe fn check_from_uninhabited(x: BigNever) -> u16 {
}
}
// CHECK-LABEL: @check_intermediate_passthrough(
#[no_mangle]
pub unsafe fn check_intermediate_passthrough(x: u32) -> i32 {
// CHECK: start
// CHECK: %[[TMP:.+]] = add i32 1, %x
// CHECK: %[[RET:.+]] = add i32 %[[TMP]], 1
// CHECK: ret i32 %[[RET]]
unsafe {
transmute::<u32, i32>(1 + x) + 1
}
}
// CHECK-LABEL: @check_nop_pair(
#[no_mangle]
pub unsafe fn check_nop_pair(x: (u8, i8)) -> (i8, u8) {
// CHECK-NOT: alloca
// CHECK: %0 = insertvalue { i8, i8 } poison, i8 %x.0, 0
// CHECK: %1 = insertvalue { i8, i8 } %0, i8 %x.1, 1
// CHECK: ret { i8, i8 } %1
unsafe {
transmute(x)
}
}
// CHECK-LABEL: @check_to_newtype(
#[no_mangle]
pub unsafe fn check_to_newtype(x: u64) -> Scalar64 {
// CHECK-NOT: alloca
// CHECK: ret i64 %x
transmute(x)
}
// CHECK-LABEL: @check_from_newtype(
#[no_mangle]
pub unsafe fn check_from_newtype(x: Scalar64) -> u64 {
// CHECK-NOT: alloca
// CHECK: ret i64 %x
transmute(x)
}
// CHECK-LABEL: @check_aggregate_to_bool(
#[no_mangle]
pub unsafe fn check_aggregate_to_bool(x: Aggregate8) -> bool {
// CHECK: %x = alloca %Aggregate8, align 1
// CHECK: %[[BYTE:.+]] = load i8, ptr %x, align 1
// CHECK: %[[BOOL:.+]] = trunc i8 %[[BYTE]] to i1
// CHECK: ret i1 %[[BOOL]]
transmute(x)
}
// CHECK-LABEL: @check_aggregate_from_bool(
#[no_mangle]
pub unsafe fn check_aggregate_from_bool(x: bool) -> Aggregate8 {
// CHECK: %0 = alloca %Aggregate8, align 1
// CHECK: %[[BYTE:.+]] = zext i1 %x to i8
// CHECK: store i8 %[[BYTE]], ptr %0, align 1
transmute(x)
}
// CHECK-LABEL: @check_byte_to_bool(
#[no_mangle]
pub unsafe fn check_byte_to_bool(x: u8) -> bool {
// CHECK-NOT: alloca
// CHECK: %0 = trunc i8 %x to i1
// CHECK: ret i1 %0
transmute(x)
}
// CHECK-LABEL: @check_byte_from_bool(
#[no_mangle]
pub unsafe fn check_byte_from_bool(x: bool) -> u8 {
// CHECK-NOT: alloca
// CHECK: %0 = zext i1 %x to i8
// CHECK: ret i8 %0
transmute(x)
}
@ -122,20 +183,18 @@ pub unsafe fn check_from_pair(x: Option<i32>) -> u64 {
// CHECK-LABEL: @check_to_float(
#[no_mangle]
pub unsafe fn check_to_float(x: u32) -> f32 {
// CHECK-NOT: alloca
// CHECK: %0 = bitcast i32 %x to float
// CHECK: ret float %0
transmute(x)
}
// CHECK-LABEL: @check_from_float(
#[no_mangle]
pub unsafe fn check_from_float(x: f32) -> u32 {
// CHECK-NOT: alloca
// CHECK: %0 = bitcast float %x to i32
// CHECK: ret i32 %0
transmute(x)
}
@ -144,19 +203,15 @@ pub unsafe fn check_from_float(x: f32) -> u32 {
pub unsafe fn check_to_bytes(x: u32) -> [u8; 4] {
// CHECK: %0 = alloca [4 x i8], align 1
// CHECK: store i32 %x, ptr %0, align 1
transmute(x)
}
// CHECK-LABEL: @check_from_bytes(
#[no_mangle]
pub unsafe fn check_from_bytes(x: [u8; 4]) -> u32 {
// CHECK: %x = alloca [4 x i8], align 1
// CHECK: %[[VAL:.+]] = load i32, ptr %x, align 1
// CHECK: ret i32 %[[VAL]]
transmute(x)
}
@ -173,7 +228,9 @@ pub unsafe fn check_to_aggregate(x: u64) -> Aggregate64 {
// CHECK-LABEL: @check_from_aggregate(
#[no_mangle]
pub unsafe fn check_from_aggregate(x: Aggregate64) -> u64 {
// CHECK: %x = alloca %Aggregate64, align 4
// CHECK: %[[VAL:.+]] = load i64, ptr %x, align 4
// CHECK: ret i64 %[[VAL]]
transmute(x)
}
@ -194,3 +251,53 @@ pub unsafe fn check_long_array_more_aligned(x: [u8; 100]) -> [u32; 25] {
// CHECK-NEXT: ret void
transmute(x)
}
// CHECK-LABEL: @check_pair_with_bool(
#[no_mangle]
pub unsafe fn check_pair_with_bool(x: (u8, bool)) -> (bool, i8) {
// CHECK-NOT: alloca
// CHECK: trunc i8 %x.0 to i1
// CHECK: zext i1 %x.1 to i8
transmute(x)
}
// CHECK-LABEL: @check_float_to_pointer(
#[no_mangle]
pub unsafe fn check_float_to_pointer(x: f64) -> *const () {
// CHECK-NOT: alloca
// CHECK: %0 = bitcast double %x to i64
// CHECK: %1 = inttoptr i64 %0 to ptr
// CHECK: ret ptr %1
transmute(x)
}
// CHECK-LABEL: @check_float_from_pointer(
#[no_mangle]
pub unsafe fn check_float_from_pointer(x: *const ()) -> f64 {
// CHECK-NOT: alloca
// CHECK: %0 = ptrtoint ptr %x to i64
// CHECK: %1 = bitcast i64 %0 to double
// CHECK: ret double %1
transmute(x)
}
// CHECK-LABEL: @check_array_to_pair(
#[no_mangle]
pub unsafe fn check_array_to_pair(x: [u8; 16]) -> (i64, u64) {
// CHECK-NOT: alloca
// CHECK: %[[FST:.+]] = load i64, ptr %{{.+}}, align 1, !noundef !
// CHECK: %[[SND:.+]] = load i64, ptr %{{.+}}, align 1, !noundef !
// CHECK: %[[PAIR0:.+]] = insertvalue { i64, i64 } poison, i64 %[[FST]], 0
// CHECK: %[[PAIR01:.+]] = insertvalue { i64, i64 } %[[PAIR0]], i64 %[[SND]], 1
// CHECK: ret { i64, i64 } %[[PAIR01]]
transmute(x)
}
// CHECK-LABEL: @check_pair_to_array(
#[no_mangle]
pub unsafe fn check_pair_to_array(x: (i64, u64)) -> [u8; 16] {
// CHECK-NOT: alloca
// CHECK: store i64 %x.0, ptr %{{.+}}, align 1
// CHECK: store i64 %x.1, ptr %{{.+}}, align 1
transmute(x)
}


@ -4,7 +4,8 @@
#![crate_type = "lib"]
#![allow(non_camel_case_types)]
#![feature(repr_simd, platform_intrinsics)]
#![feature(inline_const)]
#[repr(simd)]
#[derive(Copy, Clone)]
@ -18,23 +19,65 @@ pub struct T([f32; 4]);
#[derive(Copy, Clone)]
pub struct U(f32, f32, f32, f32);
// CHECK-LABEL: @array_align(
#[no_mangle]
pub fn array_align() -> usize {
// CHECK: ret [[USIZE:i[0-9]+]] [[ARRAY_ALIGN:[0-9]+]]
const { std::mem::align_of::<f32>() }
}
// CHECK-LABEL: @vector_align(
#[no_mangle]
pub fn vector_align() -> usize {
// CHECK: ret [[USIZE]] [[VECTOR_ALIGN:[0-9]+]]
const { std::mem::align_of::<U>() }
}
// CHECK-LABEL: @build_array_s
#[no_mangle]
pub fn build_array_s(x: [f32; 4]) -> S<4> {
// CHECK: call void @llvm.memcpy.{{.+}}({{.*}} align [[VECTOR_ALIGN]] {{.*}} align [[ARRAY_ALIGN]] {{.*}}, [[USIZE]] 16, i1 false)
S::<4>(x)
}
// CHECK-LABEL: @build_array_transmute_s
#[no_mangle]
pub fn build_array_transmute_s(x: [f32; 4]) -> S<4> {
// CHECK: %[[VAL:.+]] = load <4 x float>, {{ptr %x|.+>\* %.+}}, align [[ARRAY_ALIGN]]
// CHECK: store <4 x float> %[[VAL]], {{ptr %0|.+>\* %.+}}, align [[VECTOR_ALIGN]]
unsafe { std::mem::transmute(x) }
}
// CHECK-LABEL: @build_array_t
#[no_mangle]
pub fn build_array_t(x: [f32; 4]) -> T {
// CHECK: call void @llvm.memcpy.{{.+}}({{.*}} align [[VECTOR_ALIGN]] {{.*}} align [[ARRAY_ALIGN]] {{.*}}, [[USIZE]] 16, i1 false)
T(x)
}
// CHECK-LABEL: @build_array_transmute_t
#[no_mangle]
pub fn build_array_transmute_t(x: [f32; 4]) -> T {
// CHECK: %[[VAL:.+]] = load <4 x float>, {{ptr %x|.+>\* %.+}}, align [[ARRAY_ALIGN]]
// CHECK: store <4 x float> %[[VAL]], {{ptr %0|.+>\* %.+}}, align [[VECTOR_ALIGN]]
unsafe { std::mem::transmute(x) }
}
// CHECK-LABEL: @build_array_u
#[no_mangle]
pub fn build_array_u(x: [f32; 4]) -> U {
// CHECK: store float %a, {{.+}}, align [[VECTOR_ALIGN]]
// CHECK: store float %b, {{.+}}, align [[ARRAY_ALIGN]]
// CHECK: store float %c, {{.+}}, align
// CHECK: store float %d, {{.+}}, align [[ARRAY_ALIGN]]
let [a, b, c, d] = x;
U(a, b, c, d)
}
// CHECK-LABEL: @build_array_transmute_u
#[no_mangle]
pub fn build_array_transmute_u(x: [f32; 4]) -> U {
// CHECK: %[[VAL:.+]] = load <4 x float>, {{ptr %x|.+>\* %.+}}, align [[ARRAY_ALIGN]]
// CHECK: store <4 x float> %[[VAL]], {{ptr %0|.+>\* %.+}}, align [[VECTOR_ALIGN]]
unsafe { std::mem::transmute(x) }
}


@ -5,63 +5,53 @@
// With opaque ptrs in LLVM, `transmute` can load/store any `alloca` as any type,
// without needing to pointercast, and SRoA will turn that into a `bitcast`.
// Thus memory-to-memory transmutes don't need to generate them ourselves.
// However, `bitcast`s and `ptrtoint`s and `inttoptr`s are still worth doing when
// that allows us to avoid the `alloca`s entirely; see `rvalue_creates_operand`.
// CHECK-LABEL: define{{.*}}i32 @f32_to_bits(float noundef %x)
// CHECK: %0 = bitcast float %x to i32
// CHECK-NEXT: ret i32 %0
#[no_mangle]
pub fn f32_to_bits(x: f32) -> u32 {
unsafe { std::mem::transmute(x) }
}
// CHECK-LABEL: define{{.*}}i8 @bool_to_byte(i1 noundef zeroext %b)
// CHECK: %0 = zext i1 %b to i8
// CHECK-NEXT: ret i8 %0
#[no_mangle]
pub fn bool_to_byte(b: bool) -> u8 {
unsafe { std::mem::transmute(b) }
}
// CHECK-LABEL: define{{.*}}noundef zeroext i1 @byte_to_bool(i8 noundef %byte)
// CHECK: %0 = trunc i8 %byte to i1
// CHECK-NEXT: ret i1 %0
#[no_mangle]
pub unsafe fn byte_to_bool(byte: u8) -> bool {
std::mem::transmute(byte)
}
// CHECK-LABEL: define{{.*}}ptr @ptr_to_ptr(ptr noundef %p)
// CHECK: ret ptr %p
#[no_mangle]
pub fn ptr_to_ptr(p: *mut u16) -> *mut u8 {
unsafe { std::mem::transmute(p) }
}
// CHECK: define{{.*}}[[USIZE:i[0-9]+]] @ptr_to_int(ptr noundef %p)
// CHECK: %0 = ptrtoint ptr %p to [[USIZE]]
// CHECK-NEXT: ret [[USIZE]] %0
#[no_mangle]
pub fn ptr_to_int(p: *mut u16) -> usize {
unsafe { std::mem::transmute(p) }
}
// CHECK: define{{.*}}ptr @int_to_ptr([[USIZE]] noundef %i)
// CHECK: %0 = inttoptr [[USIZE]] %i to ptr
// CHECK-NEXT: ret ptr %0
#[no_mangle]
pub fn int_to_ptr(i: usize) -> *mut u16 {
unsafe { std::mem::transmute(i) }