From b5376ba6017fa04a13afda6ac52587f06a6c0bd8 Mon Sep 17 00:00:00 2001
From: Scott McMurray <scottmcm@users.noreply.github.com>
Date: Fri, 29 Mar 2024 00:00:24 -0700
Subject: [PATCH] Remove my `scalar_copy_backend_type` optimization attempt

I added this back in 111999, but I no longer think it's a good idea
- It had to get scaled back to only power-of-two things to not break a bunch of targets
- LLVM seems to be getting better at memcpy removal anyway
- Introducing vector instructions has seemed to sometimes (115515) make autovectorization worse

So this removes it from the codegen crates entirely, and instead just tries to use <https://doc.rust-lang.org/nightly/nightly-rustc/rustc_codegen_ssa/traits/builder/trait.BuilderMethods.html#method.typed_place_copy> instead of direct `memcpy` so things will still use load/store for immediates.
---
 compiler/rustc_codegen_llvm/src/type_.rs      |  3 --
 compiler/rustc_codegen_llvm/src/type_of.rs    | 42 ------------------
 compiler/rustc_codegen_ssa/src/base.rs        | 38 ++--------------
 compiler/rustc_codegen_ssa/src/mir/block.rs   | 13 ++----
 compiler/rustc_codegen_ssa/src/mir/operand.rs | 16 +++----
 .../rustc_codegen_ssa/src/traits/builder.rs   | 22 ++++++++--
 .../rustc_codegen_ssa/src/traits/type_.rs     | 22 ----------
 tests/codegen/array-codegen.rs                | 44 +++++++++++--------
 tests/codegen/array-optimized.rs              |  8 ++--
 tests/codegen/mem-replace-simple-type.rs      |  6 +--
 10 files changed, 60 insertions(+), 154 deletions(-)

diff --git a/compiler/rustc_codegen_llvm/src/type_.rs b/compiler/rustc_codegen_llvm/src/type_.rs
index af1bbda4d08..a00f09dc40d 100644
--- a/compiler/rustc_codegen_llvm/src/type_.rs
+++ b/compiler/rustc_codegen_llvm/src/type_.rs
@@ -281,9 +281,6 @@ impl<'ll, 'tcx> LayoutTypeMethods<'tcx> for CodegenCx<'ll, 'tcx> {
     fn reg_backend_type(&self, ty: &Reg) -> &'ll Type {
         ty.llvm_type(self)
     }
-    fn scalar_copy_backend_type(&self, layout: TyAndLayout<'tcx>) -> Option<Self::Type> {
-        layout.scalar_copy_llvm_type(self)
-    }
 }
 
 impl<'ll, 'tcx> TypeMembershipMethods<'tcx> for CodegenCx<'ll, 'tcx> {
diff --git a/compiler/rustc_codegen_llvm/src/type_of.rs b/compiler/rustc_codegen_llvm/src/type_of.rs
index d10a083765b..40ed6baa610 100644
--- a/compiler/rustc_codegen_llvm/src/type_of.rs
+++ b/compiler/rustc_codegen_llvm/src/type_of.rs
@@ -5,7 +5,6 @@ use rustc_middle::bug;
 use rustc_middle::ty::layout::{LayoutOf, TyAndLayout};
 use rustc_middle::ty::print::{with_no_trimmed_paths, with_no_visible_paths};
 use rustc_middle::ty::{self, Ty, TypeVisitableExt};
-use rustc_target::abi::HasDataLayout;
 use rustc_target::abi::{Abi, Align, FieldsShape};
 use rustc_target::abi::{Int, Pointer, F128, F16, F32, F64};
 use rustc_target::abi::{Scalar, Size, Variants};
@@ -166,7 +165,6 @@ pub trait LayoutLlvmExt<'tcx> {
         index: usize,
         immediate: bool,
     ) -> &'a Type;
-    fn scalar_copy_llvm_type<'a>(&self, cx: &CodegenCx<'a, 'tcx>) -> Option<&'a Type>;
 }
 
 impl<'tcx> LayoutLlvmExt<'tcx> for TyAndLayout<'tcx> {
@@ -308,44 +306,4 @@ impl<'tcx> LayoutLlvmExt<'tcx> for TyAndLayout<'tcx> {
 
         self.scalar_llvm_type_at(cx, scalar)
     }
-
-    fn scalar_copy_llvm_type<'a>(&self, cx: &CodegenCx<'a, 'tcx>) -> Option<&'a Type> {
-        debug_assert!(self.is_sized());
-
-        // FIXME: this is a fairly arbitrary choice, but 128 bits on WASM
-        // (matching the 128-bit SIMD types proposal) and 256 bits on x64
-        // (like AVX2 registers) seems at least like a tolerable starting point.
-        let threshold = cx.data_layout().pointer_size * 4;
-        if self.layout.size() > threshold {
-            return None;
-        }
-
-        // Vectors, even for non-power-of-two sizes, have the same layout as
-        // arrays but don't count as aggregate types
-        // While LLVM theoretically supports non-power-of-two sizes, and they
-        // often work fine, sometimes x86-isel deals with them horribly
-        // (see #115212) so for now only use power-of-two ones.
-        if let FieldsShape::Array { count, .. } = self.layout.fields()
-            && count.is_power_of_two()
-            && let element = self.field(cx, 0)
-            && element.ty.is_integral()
-        {
-            // `cx.type_ix(bits)` is tempting here, but while that works great
-            // for things that *stay* as memory-to-memory copies, it also ends
-            // up suppressing vectorization as it introduces shifts when it
-            // extracts all the individual values.
-
-            let ety = element.llvm_type(cx);
-            if *count == 1 {
-                // Emitting `<1 x T>` would be silly; just use the scalar.
-                return Some(ety);
-            } else {
-                return Some(cx.type_vector(ety, *count));
-            }
-        }
-
-        // FIXME: The above only handled integer arrays; surely more things
-        // would also be possible. Be careful about provenance, though!
-        None
-    }
 }
diff --git a/compiler/rustc_codegen_ssa/src/base.rs b/compiler/rustc_codegen_ssa/src/base.rs
index c4c16ee7311..930b9b8c0db 100644
--- a/compiler/rustc_codegen_ssa/src/base.rs
+++ b/compiler/rustc_codegen_ssa/src/base.rs
@@ -12,7 +12,7 @@ use crate::mir;
 use crate::mir::operand::OperandValue;
 use crate::mir::place::PlaceRef;
 use crate::traits::*;
-use crate::{CachedModuleCodegen, CompiledModule, CrateInfo, MemFlags, ModuleCodegen, ModuleKind};
+use crate::{CachedModuleCodegen, CompiledModule, CrateInfo, ModuleCodegen, ModuleKind};
 
 use rustc_ast::expand::allocator::{global_fn_name, AllocatorKind, ALLOCATOR_METHODS};
 use rustc_attr as attr;
@@ -37,7 +37,7 @@ use rustc_session::config::{self, CrateType, EntryFnType, OptLevel, OutputType};
 use rustc_session::Session;
 use rustc_span::symbol::sym;
 use rustc_span::Symbol;
-use rustc_target::abi::{Align, FIRST_VARIANT};
+use rustc_target::abi::FIRST_VARIANT;
 
 use std::cmp;
 use std::collections::BTreeSet;
@@ -282,15 +282,7 @@ pub fn coerce_unsized_into<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>>(
                 }
 
                 if src_f.layout.ty == dst_f.layout.ty {
-                    memcpy_ty(
-                        bx,
-                        dst_f.llval,
-                        dst_f.align,
-                        src_f.llval,
-                        src_f.align,
-                        src_f.layout,
-                        MemFlags::empty(),
-                    );
+                    bx.typed_place_copy(dst_f, src_f);
                 } else {
                     coerce_unsized_into(bx, src_f, dst_f);
                 }
@@ -382,30 +374,6 @@ pub fn wants_new_eh_instructions(sess: &Session) -> bool {
     wants_wasm_eh(sess) || wants_msvc_seh(sess)
 }
 
-pub fn memcpy_ty<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>>(
-    bx: &mut Bx,
-    dst: Bx::Value,
-    dst_align: Align,
-    src: Bx::Value,
-    src_align: Align,
-    layout: TyAndLayout<'tcx>,
-    flags: MemFlags,
-) {
-    let size = layout.size.bytes();
-    if size == 0 {
-        return;
-    }
-
-    if flags == MemFlags::empty()
-        && let Some(bty) = bx.cx().scalar_copy_backend_type(layout)
-    {
-        let temp = bx.load(bty, src, src_align);
-        bx.store(temp, dst, dst_align);
-    } else {
-        bx.memcpy(dst, dst_align, src, src_align, bx.cx().const_usize(size), flags);
-    }
-}
-
 pub fn codegen_instance<'a, 'tcx: 'a, Bx: BuilderMethods<'a, 'tcx>>(
     cx: &'a Bx::CodegenCx,
     instance: Instance<'tcx>,
diff --git a/compiler/rustc_codegen_ssa/src/mir/block.rs b/compiler/rustc_codegen_ssa/src/mir/block.rs
index 1aa52a985ef..c3137f0628e 100644
--- a/compiler/rustc_codegen_ssa/src/mir/block.rs
+++ b/compiler/rustc_codegen_ssa/src/mir/block.rs
@@ -1459,7 +1459,7 @@ impl<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>> FunctionCx<'a, 'tcx, Bx> {
                 }
                 _ => (op.immediate_or_packed_pair(bx), arg.layout.align.abi, false),
             },
-            Ref(llval, _, align) => match arg.mode {
+            Ref(llval, llextra, align) => match arg.mode {
                 PassMode::Indirect { attrs, .. } => {
                     let required_align = match attrs.pointee_align {
                         Some(pointee_align) => cmp::max(pointee_align, arg.layout.align.abi),
@@ -1470,15 +1470,8 @@ impl<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>> FunctionCx<'a, 'tcx, Bx> {
                         // alignment requirements may be higher than the type's alignment, so copy
                         // to a higher-aligned alloca.
                         let scratch = PlaceRef::alloca_aligned(bx, arg.layout, required_align);
-                        base::memcpy_ty(
-                            bx,
-                            scratch.llval,
-                            scratch.align,
-                            llval,
-                            align,
-                            op.layout,
-                            MemFlags::empty(),
-                        );
+                        let op_place = PlaceRef { llval, llextra, layout: op.layout, align };
+                        bx.typed_place_copy(scratch, op_place);
                         (scratch.llval, scratch.align, true)
                     } else {
                         (llval, align, true)
diff --git a/compiler/rustc_codegen_ssa/src/mir/operand.rs b/compiler/rustc_codegen_ssa/src/mir/operand.rs
index 932926976b5..ac38b91c5e0 100644
--- a/compiler/rustc_codegen_ssa/src/mir/operand.rs
+++ b/compiler/rustc_codegen_ssa/src/mir/operand.rs
@@ -1,7 +1,6 @@
 use super::place::PlaceRef;
 use super::{FunctionCx, LocalRef};
 
-use crate::base;
 use crate::size_of_val;
 use crate::traits::*;
 use crate::MemFlags;
@@ -398,7 +397,7 @@ impl<'a, 'tcx, V: CodegenObject> OperandValue<V> {
         self.store_with_flags(bx, dest, MemFlags::NONTEMPORAL);
     }
 
-    fn store_with_flags<Bx: BuilderMethods<'a, 'tcx, Value = V>>(
+    pub(crate) fn store_with_flags<Bx: BuilderMethods<'a, 'tcx, Value = V>>(
         self,
         bx: &mut Bx,
         dest: PlaceRef<'tcx, V>,
@@ -410,16 +409,11 @@ impl<'a, 'tcx, V: CodegenObject> OperandValue<V> {
                 // Avoid generating stores of zero-sized values, because the only way to have a zero-sized
                 // value is through `undef`/`poison`, and the store itself is useless.
             }
-            OperandValue::Ref(r, None, source_align) => {
+            OperandValue::Ref(llval, llextra @ None, source_align) => {
                 assert!(dest.layout.is_sized(), "cannot directly store unsized values");
-                if flags.contains(MemFlags::NONTEMPORAL) {
-                    // HACK(nox): This is inefficient but there is no nontemporal memcpy.
-                    let ty = bx.backend_type(dest.layout);
-                    let val = bx.load(ty, r, source_align);
-                    bx.store_with_flags(val, dest.llval, dest.align, flags);
-                    return;
-                }
-                base::memcpy_ty(bx, dest.llval, dest.align, r, source_align, dest.layout, flags)
+                let source_place =
+                    PlaceRef { llval, llextra, align: source_align, layout: dest.layout };
+                bx.typed_place_copy_with_flags(dest, source_place, flags);
             }
             OperandValue::Ref(_, Some(_), _) => {
                 bug!("cannot directly store unsized values");
diff --git a/compiler/rustc_codegen_ssa/src/traits/builder.rs b/compiler/rustc_codegen_ssa/src/traits/builder.rs
index 6c8dcc5b690..c9d40b7c724 100644
--- a/compiler/rustc_codegen_ssa/src/traits/builder.rs
+++ b/compiler/rustc_codegen_ssa/src/traits/builder.rs
@@ -281,17 +281,31 @@ pub trait BuilderMethods<'a, 'tcx>:
         dst: PlaceRef<'tcx, Self::Value>,
         src: PlaceRef<'tcx, Self::Value>,
     ) {
-        debug_assert!(src.llextra.is_none());
-        debug_assert!(dst.llextra.is_none());
+        self.typed_place_copy_with_flags(dst, src, MemFlags::empty());
+    }
+
+    fn typed_place_copy_with_flags(
+        &mut self,
+        dst: PlaceRef<'tcx, Self::Value>,
+        src: PlaceRef<'tcx, Self::Value>,
+        flags: MemFlags,
+    ) {
+        debug_assert!(src.llextra.is_none(), "cannot directly copy from unsized values");
+        debug_assert!(dst.llextra.is_none(), "cannot directly copy into unsized values");
         debug_assert_eq!(dst.layout.size, src.layout.size);
         if self.sess().opts.optimize == OptLevel::No && self.is_backend_immediate(dst.layout) {
             // If we're not optimizing, the aliasing information from `memcpy`
             // isn't useful, so just load-store the value for smaller code.
             let temp = self.load_operand(src);
-            temp.val.store(self, dst);
+            temp.val.store_with_flags(self, dst, flags);
+        } else if flags.contains(MemFlags::NONTEMPORAL) {
+            // HACK(nox): This is inefficient but there is no nontemporal memcpy.
+            let ty = self.backend_type(dst.layout);
+            let val = self.load(ty, src.llval, src.align);
+            self.store_with_flags(val, dst.llval, dst.align, flags);
         } else if !dst.layout.is_zst() {
             let bytes = self.const_usize(dst.layout.size.bytes());
-            self.memcpy(dst.llval, dst.align, src.llval, src.align, bytes, MemFlags::empty());
+            self.memcpy(dst.llval, dst.align, src.llval, src.align, bytes, flags);
         }
     }
 
diff --git a/compiler/rustc_codegen_ssa/src/traits/type_.rs b/compiler/rustc_codegen_ssa/src/traits/type_.rs
index 555833759eb..34d9e75036f 100644
--- a/compiler/rustc_codegen_ssa/src/traits/type_.rs
+++ b/compiler/rustc_codegen_ssa/src/traits/type_.rs
@@ -133,28 +133,6 @@ pub trait LayoutTypeMethods<'tcx>: Backend<'tcx> {
             || self.is_backend_immediate(layout)
             || self.is_backend_scalar_pair(layout))
     }
-
-    /// A type that can be used in a [`super::BuilderMethods::load`] +
-    /// [`super::BuilderMethods::store`] pair to implement a *typed* copy,
-    /// such as a MIR `*_0 = *_1`.
-    ///
-    /// It's always legal to return `None` here, as the provided impl does,
-    /// in which case callers should use [`super::BuilderMethods::memcpy`]
-    /// instead of the `load`+`store` pair.
-    ///
-    /// This can be helpful for things like arrays, where the LLVM backend type
-    /// `[3 x i16]` optimizes to three separate loads and stores, but it can
-    /// instead be copied via an `i48` that stays as the single `load`+`store`.
-    /// (As of 2023-05 LLVM cannot necessarily optimize away a `memcpy` in these
-    /// cases, due to `poison` handling, but in codegen we have more information
-    /// about the type invariants, so can emit something better instead.)
-    ///
-    /// This *should* return `None` for particularly-large types, where leaving
-    /// the `memcpy` may well be important to avoid code size explosion.
-    fn scalar_copy_backend_type(&self, layout: TyAndLayout<'tcx>) -> Option<Self::Type> {
-        let _ = layout;
-        None
-    }
 }
 
 // For backends that support CFI using type membership (i.e., testing whether a given pointer is
diff --git a/tests/codegen/array-codegen.rs b/tests/codegen/array-codegen.rs
index bb4bd5444db..1310e61c41d 100644
--- a/tests/codegen/array-codegen.rs
+++ b/tests/codegen/array-codegen.rs
@@ -5,52 +5,58 @@
 // CHECK-LABEL: @array_load
 #[no_mangle]
 pub fn array_load(a: &[u8; 4]) -> [u8; 4] {
-    // CHECK: %_0 = alloca [4 x i8], align 1
-    // CHECK: %[[TEMP1:.+]] = load <4 x i8>, ptr %a, align 1
-    // CHECK: store <4 x i8> %[[TEMP1]], ptr %_0, align 1
-    // CHECK: %[[TEMP2:.+]] = load i32, ptr %_0, align 1
-    // CHECK: ret i32 %[[TEMP2]]
+    // CHECK-NOT: alloca
+    // CHECK: %[[ALLOCA:.+]] = alloca [4 x i8], align 1
+    // CHECK-NOT: alloca
+    // CHECK: call void @llvm.memcpy.{{.+}}(ptr align 1 %[[ALLOCA]], ptr align 1 %a, {{.+}} 4, i1 false)
+    // CHECK: %[[TEMP:.+]] = load i32, ptr %[[ALLOCA]], align 1
+    // CHECK: ret i32 %[[TEMP]]
     *a
 }
 
 // CHECK-LABEL: @array_store
 #[no_mangle]
 pub fn array_store(a: [u8; 4], p: &mut [u8; 4]) {
+    // CHECK-NOT: alloca
+    // CHECK: %[[TEMP:.+]] = alloca i32, [[TEMPALIGN:align [0-9]+]]
+    // CHECK-NOT: alloca
     // CHECK: %a = alloca [4 x i8]
-    // CHECK: %[[TEMP:.+]] = load <4 x i8>, ptr %a, align 1
-    // CHECK-NEXT: store <4 x i8> %[[TEMP]], ptr %p, align 1
+    // CHECK-NOT: alloca
+    // store i32 %0, ptr %[[TEMP]]
+    // CHECK: call void @llvm.memcpy.{{.+}}(ptr align 1 %a, ptr [[TEMPALIGN]] %[[TEMP]], {{.+}} 4, i1 false)
+    // CHECK: call void @llvm.memcpy.{{.+}}(ptr align 1 %p, ptr align 1 %a, {{.+}} 4, i1 false)
     *p = a;
 }
 
 // CHECK-LABEL: @array_copy
 #[no_mangle]
 pub fn array_copy(a: &[u8; 4], p: &mut [u8; 4]) {
+    // CHECK-NOT: alloca
     // CHECK: %[[LOCAL:.+]] = alloca [4 x i8], align 1
-    // CHECK: %[[TEMP1:.+]] = load <4 x i8>, ptr %a, align 1
-    // CHECK: store <4 x i8> %[[TEMP1]], ptr %[[LOCAL]], align 1
-    // CHECK: %[[TEMP2:.+]] = load <4 x i8>, ptr %[[LOCAL]], align 1
-    // CHECK: store <4 x i8> %[[TEMP2]], ptr %p, align 1
+    // CHECK-NOT: alloca
+    // CHECK: call void @llvm.memcpy.{{.+}}(ptr align 1 %[[LOCAL]], ptr align 1 %a, {{.+}} 4, i1 false)
+    // CHECK: call void @llvm.memcpy.{{.+}}(ptr align 1 %p, ptr align 1 %[[LOCAL]], {{.+}} 4, i1 false)
     *p = *a;
 }
 
 // CHECK-LABEL: @array_copy_1_element
 #[no_mangle]
 pub fn array_copy_1_element(a: &[u8; 1], p: &mut [u8; 1]) {
+    // CHECK-NOT: alloca
     // CHECK: %[[LOCAL:.+]] = alloca [1 x i8], align 1
-    // CHECK: %[[TEMP1:.+]] = load i8, ptr %a, align 1
-    // CHECK: store i8 %[[TEMP1]], ptr %[[LOCAL]], align 1
-    // CHECK: %[[TEMP2:.+]] = load i8, ptr %[[LOCAL]], align 1
-    // CHECK: store i8 %[[TEMP2]], ptr %p, align 1
+    // CHECK-NOT: alloca
+    // CHECK: call void @llvm.memcpy.{{.+}}(ptr align 1 %[[LOCAL]], ptr align 1 %a, {{.+}} 1, i1 false)
+    // CHECK: call void @llvm.memcpy.{{.+}}(ptr align 1 %p, ptr align 1 %[[LOCAL]], {{.+}} 1, i1 false)
     *p = *a;
 }
 
 // CHECK-LABEL: @array_copy_2_elements
 #[no_mangle]
 pub fn array_copy_2_elements(a: &[u8; 2], p: &mut [u8; 2]) {
+    // CHECK-NOT: alloca
     // CHECK: %[[LOCAL:.+]] = alloca [2 x i8], align 1
-    // CHECK: %[[TEMP1:.+]] = load <2 x i8>, ptr %a, align 1
-    // CHECK: store <2 x i8> %[[TEMP1]], ptr %[[LOCAL]], align 1
-    // CHECK: %[[TEMP2:.+]] = load <2 x i8>, ptr %[[LOCAL]], align 1
-    // CHECK: store <2 x i8> %[[TEMP2]], ptr %p, align 1
+    // CHECK-NOT: alloca
+    // CHECK: call void @llvm.memcpy.{{.+}}(ptr align 1 %[[LOCAL]], ptr align 1 %a, {{.+}} 2, i1 false)
+    // CHECK: call void @llvm.memcpy.{{.+}}(ptr align 1 %p, ptr align 1 %[[LOCAL]], {{.+}} 2, i1 false)
     *p = *a;
 }
diff --git a/tests/codegen/array-optimized.rs b/tests/codegen/array-optimized.rs
index 4cf16f1fb30..42fdbd39b7e 100644
--- a/tests/codegen/array-optimized.rs
+++ b/tests/codegen/array-optimized.rs
@@ -16,8 +16,8 @@ pub fn array_copy_1_element(a: &[u8; 1], p: &mut [u8; 1]) {
 #[no_mangle]
 pub fn array_copy_2_elements(a: &[u8; 2], p: &mut [u8; 2]) {
     // CHECK-NOT: alloca
-    // CHECK: %[[TEMP:.+]] = load <2 x i8>, ptr %a, align 1
-    // CHECK: store <2 x i8> %[[TEMP]], ptr %p, align 1
+    // CHECK: %[[TEMP:.+]] = load i16, ptr %a, align 1
+    // CHECK: store i16 %[[TEMP]], ptr %p, align 1
     // CHECK: ret
     *p = *a;
 }
@@ -26,8 +26,8 @@ pub fn array_copy_2_elements(a: &[u8; 2], p: &mut [u8; 2]) {
 #[no_mangle]
 pub fn array_copy_4_elements(a: &[u8; 4], p: &mut [u8; 4]) {
     // CHECK-NOT: alloca
-    // CHECK: %[[TEMP:.+]] = load <4 x i8>, ptr %a, align 1
-    // CHECK: store <4 x i8> %[[TEMP]], ptr %p, align 1
+    // CHECK: %[[TEMP:.+]] = load i32, ptr %a, align 1
+    // CHECK: store i32 %[[TEMP]], ptr %p, align 1
     // CHECK: ret
     *p = *a;
 }
diff --git a/tests/codegen/mem-replace-simple-type.rs b/tests/codegen/mem-replace-simple-type.rs
index b00fbad05d9..50b43f5854a 100644
--- a/tests/codegen/mem-replace-simple-type.rs
+++ b/tests/codegen/mem-replace-simple-type.rs
@@ -45,9 +45,7 @@ pub fn replace_short_array_3(r: &mut [u32; 3], v: [u32; 3]) -> [u32; 3] {
 // CHECK-LABEL: @replace_short_array_4(
 pub fn replace_short_array_4(r: &mut [u32; 4], v: [u32; 4]) -> [u32; 4] {
     // CHECK-NOT: alloca
-    // CHECK: %[[R:.+]] = load <4 x i32>, ptr %r, align 4
-    // CHECK: store <4 x i32> %[[R]], ptr %result
-    // CHECK: %[[V:.+]] = load <4 x i32>, ptr %v, align 4
-    // CHECK: store <4 x i32> %[[V]], ptr %r
+    // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 4 %result, ptr align 4 %r, i64 16, i1 false)
+    // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 4 %r, ptr align 4 %v, i64 16, i1 false)
     std::mem::replace(r, v)
 }