From d49fd9f8771c5e4de53e7f53431344c40b94ca55 Mon Sep 17 00:00:00 2001 From: bjorn3 <17426603+bjorn3@users.noreply.github.com> Date: Fri, 10 Nov 2023 11:30:51 +0000 Subject: [PATCH 01/14] Merge commit 'c84d1871dc4456539b7b578830268ab3539915d0' into sync_cg_clif-2023-11-10 --- Readme.md | 2 - build_system/tests.rs | 1 + config.txt | 1 + example/neon.rs | 234 ++++++++++++++++++ patches/stdlib-lock.toml | 9 +- rust-toolchain | 2 +- scripts/test_rustc_tests.sh | 5 + src/inline_asm.rs | 101 +++++++- src/intrinsics/llvm.rs | 15 ++ src/intrinsics/llvm_aarch64.rs | 111 +++++++-- src/intrinsics/llvm_x86.rs | 432 ++++++++++++++++++++++++++++++--- src/intrinsics/mod.rs | 59 +++++ src/value_and_place.rs | 56 +++++ 13 files changed, 961 insertions(+), 67 deletions(-) create mode 100644 example/neon.rs diff --git a/Readme.md b/Readme.md index 5664cbe7d4f..1a2b2bbc588 100644 --- a/Readme.md +++ b/Readme.md @@ -76,8 +76,6 @@ configuration options. ## Not yet supported -* Inline assembly ([no cranelift support](https://github.com/bytecodealliance/wasmtime/issues/1041)) - * On UNIX there is support for invoking an external assembler for `global_asm!` and `asm!`. * SIMD ([tracked here](https://github.com/rust-lang/rustc_codegen_cranelift/issues/171), `std::simd` fully works, `std::arch` is partially supported) * Unwinding on panics ([no cranelift support](https://github.com/bytecodealliance/wasmtime/issues/1677), `-Cpanic=abort` is enabled by default) diff --git a/build_system/tests.rs b/build_system/tests.rs index 1e24d1b113f..10736ff9a55 100644 --- a/build_system/tests.rs +++ b/build_system/tests.rs @@ -99,6 +99,7 @@ const BASE_SYSROOT_SUITE: &[TestCase] = &[ TestCase::build_bin_and_run("aot.mod_bench", "example/mod_bench.rs", &[]), TestCase::build_bin_and_run("aot.issue-72793", "example/issue-72793.rs", &[]), TestCase::build_bin("aot.issue-59326", "example/issue-59326.rs"), + TestCase::build_bin_and_run("aot.neon", "example/neon.rs", &[]), ]; pub(crate) static RAND_REPO: GitRepo = GitRepo::github( diff --git a/config.txt b/config.txt index 7ff805e58d9..2ccdc7d7874 100644 --- a/config.txt +++ b/config.txt @@ -42,6 +42,7 @@ aot.float-minmax-pass aot.mod_bench aot.issue-72793 aot.issue-59326 +aot.neon testsuite.extended_sysroot test.rust-random/rand diff --git a/example/neon.rs b/example/neon.rs new file mode 100644 index 00000000000..bad26947967 --- /dev/null +++ b/example/neon.rs @@ -0,0 +1,234 @@ +// Most of these tests are copied from https://github.com/japaric/stdsimd/blob/0f4413d01c4f0c3ffbc5a69e9a37fbc7235b31a9/coresimd/arm/neon.rs + +#![feature(portable_simd)] + +#[cfg(target_arch = "aarch64")] +use std::arch::aarch64::*; +use std::mem::transmute; +use std::simd::*; + +#[cfg(target_arch = "aarch64")] +unsafe fn test_vpmin_s8() { + let a = i8x8::from([1, -2, 3, -4, 5, 6, 7, 8]); + let b = i8x8::from([0, 3, 2, 5, 4, 7, 6, 9]); + let e = i8x8::from([-2, -4, 5, 7, 0, 2, 4, 6]); + let r: i8x8 = transmute(vpmin_s8(transmute(a), transmute(b))); + assert_eq!(r, e); +} + +#[cfg(target_arch = "aarch64")] +unsafe fn test_vpmin_s16() { + let a = i16x4::from([1, 2, 3, -4]); + let b = i16x4::from([0, 3, 2, 5]); + let e = i16x4::from([1, -4, 0, 2]); + let r: i16x4 = transmute(vpmin_s16(transmute(a), transmute(b))); + assert_eq!(r, e); +} + +#[cfg(target_arch = "aarch64")] +unsafe fn test_vpmin_s32() { + let a = i32x2::from([1, -2]); + let b = i32x2::from([0, 3]); + let e = i32x2::from([-2, 0]); + let r: i32x2 = transmute(vpmin_s32(transmute(a), transmute(b))); + assert_eq!(r, e); +} + +#[cfg(target_arch = 
"aarch64")] +unsafe fn test_vpmin_u8() { + let a = u8x8::from([1, 2, 3, 4, 5, 6, 7, 8]); + let b = u8x8::from([0, 3, 2, 5, 4, 7, 6, 9]); + let e = u8x8::from([1, 3, 5, 7, 0, 2, 4, 6]); + let r: u8x8 = transmute(vpmin_u8(transmute(a), transmute(b))); + assert_eq!(r, e); +} + +#[cfg(target_arch = "aarch64")] +unsafe fn test_vpmin_u16() { + let a = u16x4::from([1, 2, 3, 4]); + let b = u16x4::from([0, 3, 2, 5]); + let e = u16x4::from([1, 3, 0, 2]); + let r: u16x4 = transmute(vpmin_u16(transmute(a), transmute(b))); + assert_eq!(r, e); +} + +#[cfg(target_arch = "aarch64")] +unsafe fn test_vpmin_u32() { + let a = u32x2::from([1, 2]); + let b = u32x2::from([0, 3]); + let e = u32x2::from([1, 0]); + let r: u32x2 = transmute(vpmin_u32(transmute(a), transmute(b))); + assert_eq!(r, e); +} + +#[cfg(target_arch = "aarch64")] +unsafe fn test_vpmin_f32() { + let a = f32x2::from([1., -2.]); + let b = f32x2::from([0., 3.]); + let e = f32x2::from([-2., 0.]); + let r: f32x2 = transmute(vpmin_f32(transmute(a), transmute(b))); + assert_eq!(r, e); +} + +#[cfg(target_arch = "aarch64")] +unsafe fn test_vpmax_s8() { + let a = i8x8::from([1, -2, 3, -4, 5, 6, 7, 8]); + let b = i8x8::from([0, 3, 2, 5, 4, 7, 6, 9]); + let e = i8x8::from([1, 3, 6, 8, 3, 5, 7, 9]); + let r: i8x8 = transmute(vpmax_s8(transmute(a), transmute(b))); + assert_eq!(r, e); +} + +#[cfg(target_arch = "aarch64")] +unsafe fn test_vpmax_s16() { + let a = i16x4::from([1, 2, 3, -4]); + let b = i16x4::from([0, 3, 2, 5]); + let e = i16x4::from([2, 3, 3, 5]); + let r: i16x4 = transmute(vpmax_s16(transmute(a), transmute(b))); + assert_eq!(r, e); +} + +#[cfg(target_arch = "aarch64")] +unsafe fn test_vpmax_s32() { + let a = i32x2::from([1, -2]); + let b = i32x2::from([0, 3]); + let e = i32x2::from([1, 3]); + let r: i32x2 = transmute(vpmax_s32(transmute(a), transmute(b))); + assert_eq!(r, e); +} + +#[cfg(target_arch = "aarch64")] +unsafe fn test_vpmax_u8() { + let a = u8x8::from([1, 2, 3, 4, 5, 6, 7, 8]); + let b = u8x8::from([0, 3, 2, 5, 4, 7, 6, 9]); + let e = u8x8::from([2, 4, 6, 8, 3, 5, 7, 9]); + let r: u8x8 = transmute(vpmax_u8(transmute(a), transmute(b))); + assert_eq!(r, e); +} + +#[cfg(target_arch = "aarch64")] +unsafe fn test_vpmax_u16() { + let a = u16x4::from([1, 2, 3, 4]); + let b = u16x4::from([0, 3, 2, 5]); + let e = u16x4::from([2, 4, 3, 5]); + let r: u16x4 = transmute(vpmax_u16(transmute(a), transmute(b))); + assert_eq!(r, e); +} + +#[cfg(target_arch = "aarch64")] +unsafe fn test_vpmax_u32() { + let a = u32x2::from([1, 2]); + let b = u32x2::from([0, 3]); + let e = u32x2::from([2, 3]); + let r: u32x2 = transmute(vpmax_u32(transmute(a), transmute(b))); + assert_eq!(r, e); +} + +#[cfg(target_arch = "aarch64")] +unsafe fn test_vpmax_f32() { + let a = f32x2::from([1., -2.]); + let b = f32x2::from([0., 3.]); + let e = f32x2::from([1., 3.]); + let r: f32x2 = transmute(vpmax_f32(transmute(a), transmute(b))); + assert_eq!(r, e); +} + +#[cfg(target_arch = "aarch64")] +unsafe fn test_vpadd_s16() { + let a = i16x4::from([1, 2, 3, 4]); + let b = i16x4::from([0, -1, -2, -3]); + let r: i16x4 = transmute(vpadd_s16(transmute(a), transmute(b))); + let e = i16x4::from([3, 7, -1, -5]); + assert_eq!(r, e); +} +#[cfg(target_arch = "aarch64")] +unsafe fn test_vpadd_s32() { + let a = i32x2::from([1, 2]); + let b = i32x2::from([0, -1]); + let r: i32x2 = transmute(vpadd_s32(transmute(a), transmute(b))); + let e = i32x2::from([3, -1]); + assert_eq!(r, e); +} +#[cfg(target_arch = "aarch64")] +unsafe fn test_vpadd_s8() { + let a = i8x8::from([1, 2, 3, 4, 5, 6, 7, 8]); + 
let b = i8x8::from([0, -1, -2, -3, -4, -5, -6, -7]); + let r: i8x8 = transmute(vpadd_s8(transmute(a), transmute(b))); + let e = i8x8::from([3, 7, 11, 15, -1, -5, -9, -13]); + assert_eq!(r, e); +} +#[cfg(target_arch = "aarch64")] +unsafe fn test_vpadd_u16() { + let a = u16x4::from([1, 2, 3, 4]); + let b = u16x4::from([30, 31, 32, 33]); + let r: u16x4 = transmute(vpadd_u16(transmute(a), transmute(b))); + let e = u16x4::from([3, 7, 61, 65]); + assert_eq!(r, e); +} +#[cfg(target_arch = "aarch64")] +unsafe fn test_vpadd_u32() { + let a = u32x2::from([1, 2]); + let b = u32x2::from([30, 31]); + let r: u32x2 = transmute(vpadd_u32(transmute(a), transmute(b))); + let e = u32x2::from([3, 61]); + assert_eq!(r, e); +} +#[cfg(target_arch = "aarch64")] +unsafe fn test_vpadd_u8() { + let a = u8x8::from([1, 2, 3, 4, 5, 6, 7, 8]); + let b = u8x8::from([30, 31, 32, 33, 34, 35, 36, 37]); + let r: u8x8 = transmute(vpadd_u8(transmute(a), transmute(b))); + let e = u8x8::from([3, 7, 11, 15, 61, 65, 69, 73]); + assert_eq!(r, e); +} + +#[cfg(target_arch = "aarch64")] +unsafe fn test_vqsub_u8() { + let a = u8x8::from([1, 2, 3, 4, 5, 6, 7, 0xff]); + let b = u8x8::from([30, 1, 1, 1, 34, 0xff, 36, 37]); + let r: u8x8 = transmute(vqsub_u8(transmute(a), transmute(b))); + let e = u8x8::from([0, 1, 2, 3, 0, 0, 0, 218]); + assert_eq!(r, e); +} + +#[cfg(target_arch = "aarch64")] +unsafe fn test_vqadd_u8() { + let a = u8x8::from([1, 2, 3, 4, 5, 6, 7, 0xff]); + let b = u8x8::from([30, 1, 1, 1, 34, 0xff, 36, 37]); + let r: u8x8 = transmute(vqadd_u8(transmute(a), transmute(b))); + let e = u8x8::from([31, 3, 4, 5, 39, 0xff, 43, 0xff]); + assert_eq!(r, e); +} + +#[cfg(target_arch = "aarch64")] +fn main() { + unsafe { + test_vpmin_s8(); + test_vpmin_s16(); + test_vpmin_s32(); + test_vpmin_u8(); + test_vpmin_u16(); + test_vpmin_u32(); + test_vpmin_f32(); + test_vpmax_s8(); + test_vpmax_s16(); + test_vpmax_s32(); + test_vpmax_u8(); + test_vpmax_u16(); + test_vpmax_u32(); + test_vpmax_f32(); + + test_vpadd_s16(); + test_vpadd_s32(); + test_vpadd_s8(); + test_vpadd_u16(); + test_vpadd_u32(); + test_vpadd_u8(); + + test_vqsub_u8(); + test_vqadd_u8(); + } +} + +#[cfg(not(target_arch = "aarch64"))] +fn main() {} diff --git a/patches/stdlib-lock.toml b/patches/stdlib-lock.toml index 9902bca8eab..8a690bada0d 100644 --- a/patches/stdlib-lock.toml +++ b/patches/stdlib-lock.toml @@ -58,9 +58,9 @@ dependencies = [ [[package]] name = "compiler_builtins" -version = "0.1.100" +version = "0.1.103" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d6c0f24437059853f0fa64afc51f338f93647a3de4cf3358ba1bb4171a199775" +checksum = "a3b73c3443a5fd2438d7ba4853c64e4c8efc2404a9e28a9234cc2d5eebc6c242" dependencies = [ "cc", "rustc-std-workspace-core", @@ -158,9 +158,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.149" +version = "0.2.150" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a08173bc88b7955d1b3145aa561539096c421ac8debde8cbc3612ec635fee29b" +checksum = "89d92a4743f9a61002fae18374ed11e7973f530cb3a3255fb354818118b2203c" dependencies = [ "rustc-std-workspace-core", ] @@ -415,7 +415,6 @@ dependencies = [ name = "unwind" version = "0.0.0" dependencies = [ - "cc", "cfg-if", "compiler_builtins", "core", diff --git a/rust-toolchain b/rust-toolchain index 7e3eaacf8ef..b832b06e0ff 100644 --- a/rust-toolchain +++ b/rust-toolchain @@ -1,3 +1,3 @@ [toolchain] -channel = "nightly-2023-10-29" +channel = "nightly-2023-11-10" components = ["rust-src", "rustc-dev", "llvm-tools"] diff --git 
a/scripts/test_rustc_tests.sh b/scripts/test_rustc_tests.sh index a299b6de6b1..cdc78adcf85 100755 --- a/scripts/test_rustc_tests.sh +++ b/scripts/test_rustc_tests.sh @@ -146,6 +146,11 @@ rm tests/ui/process/nofile-limit.rs # TODO some AArch64 linking issue rm tests/ui/stdio-is-blocking.rs # really slow with unoptimized libstd +# rustc bugs +# ========== +# https://github.com/rust-lang/rust/pull/116447#issuecomment-1790451463 +rm tests/ui/coroutine/gen_block_*.rs + cp ../dist/bin/rustdoc-clif ../dist/bin/rustdoc # some tests expect bin/rustdoc to exist # prevent $(RUSTDOC) from picking up the sysroot built by x.py. It conflicts with the one used by diff --git a/src/inline_asm.rs b/src/inline_asm.rs index 331649b2ec2..ce0eecca8a8 100644 --- a/src/inline_asm.rs +++ b/src/inline_asm.rs @@ -13,7 +13,7 @@ use crate::prelude::*; enum CInlineAsmOperand<'tcx> { In { reg: InlineAsmRegOrRegClass, - value: CValue<'tcx>, + value: Value, }, Out { reg: InlineAsmRegOrRegClass, @@ -23,7 +23,7 @@ enum CInlineAsmOperand<'tcx> { InOut { reg: InlineAsmRegOrRegClass, _late: bool, - in_value: CValue<'tcx>, + in_value: Value, out_place: Option<CPlace<'tcx>>, }, Const { @@ -47,7 +47,9 @@ pub(crate) fn codegen_inline_asm<'tcx>( // Used by panic_abort on Windows, but uses a syntax which only happens to work with // asm!() by accident and breaks with the GNU assembler as well as global_asm!() for // the LLVM backend. - if template[0] == InlineAsmTemplatePiece::String("int $$0x29".to_string()) { + if template.len() == 1 + && template[0] == InlineAsmTemplatePiece::String("int $$0x29".to_string()) + { fx.bcx.ins().trap(TrapCode::User(1)); return; } @@ -55,9 +57,10 @@ pub(crate) fn codegen_inline_asm<'tcx>( let operands = operands .into_iter() .map(|operand| match *operand { - InlineAsmOperand::In { reg, ref value } => { - CInlineAsmOperand::In { reg, value: crate::base::codegen_operand(fx, value) } - } + InlineAsmOperand::In { reg, ref value } => CInlineAsmOperand::In { + reg, + value: crate::base::codegen_operand(fx, value).load_scalar(fx), + }, InlineAsmOperand::Out { reg, late, ref place } => CInlineAsmOperand::Out { reg, late, @@ -67,7 +70,7 @@ CInlineAsmOperand::InOut { reg, _late: late, - in_value: crate::base::codegen_operand(fx, in_value), + in_value: crate::base::codegen_operand(fx, in_value).load_scalar(fx), out_place: out_place.map(|place| crate::base::codegen_place(fx, place)), } } @@ -165,7 +168,7 @@ pub(crate) fn codegen_inline_asm<'tcx>( for (i, operand) in operands.iter().enumerate() { match operand { CInlineAsmOperand::In { reg: _, value } => { - inputs.push((asm_gen.stack_slots_input[i].unwrap(), value.load_scalar(fx))); + inputs.push((asm_gen.stack_slots_input[i].unwrap(), *value)); } CInlineAsmOperand::Out { reg: _, late: _, place } => { if let Some(place) = place { @@ -173,7 +176,7 @@ } } CInlineAsmOperand::InOut { reg: _, _late: _, in_value, out_place } => { - inputs.push((asm_gen.stack_slots_input[i].unwrap(), in_value.load_scalar(fx))); + inputs.push((asm_gen.stack_slots_input[i].unwrap(), *in_value)); if let Some(out_place) = out_place { outputs.push((asm_gen.stack_slots_output[i].unwrap(), *out_place)); } @@ -726,3 +729,83 @@ fn call_inline_asm<'tcx>( place.write_cvalue(fx, CValue::by_val(value, place.layout())); } } + +pub(crate) fn codegen_xgetbv<'tcx>( + fx: &mut FunctionCx<'_, '_, 'tcx>, + xcr_no: Value, + ret: CPlace<'tcx>, +) { + // FIXME add .eh_frame unwind info directives + + let operands = vec![ + CInlineAsmOperand::In 
{ + reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::cx)), + value: xcr_no, + }, + CInlineAsmOperand::Out { + reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::ax)), + late: true, + place: Some(ret), + }, + CInlineAsmOperand::Out { + reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::dx)), + late: true, + place: None, + }, + ]; + let options = InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM; + + let mut inputs = Vec::new(); + let mut outputs = Vec::new(); + + let mut asm_gen = InlineAssemblyGenerator { + tcx: fx.tcx, + arch: fx.tcx.sess.asm_arch.unwrap(), + enclosing_def_id: fx.instance.def_id(), + template: &[InlineAsmTemplatePiece::String( + " + xgetbv + // out = rdx << 32 | rax + shl rdx, 32 + or rax, rdx + " + .to_string(), + )], + operands: &operands, + options, + registers: Vec::new(), + stack_slots_clobber: Vec::new(), + stack_slots_input: Vec::new(), + stack_slots_output: Vec::new(), + stack_slot_size: Size::from_bytes(0), + }; + asm_gen.allocate_registers(); + asm_gen.allocate_stack_slots(); + + let inline_asm_index = fx.cx.inline_asm_index.get(); + fx.cx.inline_asm_index.set(inline_asm_index + 1); + let asm_name = format!( + "__inline_asm_{}_n{}", + fx.cx.cgu_name.as_str().replace('.', "__").replace('-', "_"), + inline_asm_index + ); + + let generated_asm = asm_gen.generate_asm_wrapper(&asm_name); + fx.cx.global_asm.push_str(&generated_asm); + + for (i, operand) in operands.iter().enumerate() { + match operand { + CInlineAsmOperand::In { reg: _, value } => { + inputs.push((asm_gen.stack_slots_input[i].unwrap(), *value)); + } + CInlineAsmOperand::Out { reg: _, late: _, place } => { + if let Some(place) = place { + outputs.push((asm_gen.stack_slots_output[i].unwrap(), *place)); + } + } + _ => unreachable!(), + } + } + + call_inline_asm(fx, &asm_name, asm_gen.stack_slot_size, inputs, outputs); +} diff --git a/src/intrinsics/llvm.rs b/src/intrinsics/llvm.rs index c1694760998..e9b7daf1492 100644 --- a/src/intrinsics/llvm.rs +++ b/src/intrinsics/llvm.rs @@ -51,6 +51,21 @@ pub(crate) fn codegen_llvm_intrinsic_call<'tcx>( }); } + _ if intrinsic.starts_with("llvm.fma.v") => { + intrinsic_args!(fx, args => (x,y,z); intrinsic); + + simd_trio_for_each_lane( + fx, + x, + y, + z, + ret, + &|fx, _lane_ty, _res_lane_ty, lane_x, lane_y, lane_z| { + fx.bcx.ins().fma(lane_x, lane_y, lane_z) + }, + ); + } + _ => { fx.tcx .sess diff --git a/src/intrinsics/llvm_aarch64.rs b/src/intrinsics/llvm_aarch64.rs index 0c211a06dc4..ee098be1fce 100644 --- a/src/intrinsics/llvm_aarch64.rs +++ b/src/intrinsics/llvm_aarch64.rs @@ -44,7 +44,9 @@ pub(crate) fn codegen_aarch64_llvm_intrinsic_call<'tcx>( }); } - _ if intrinsic.starts_with("llvm.aarch64.neon.sqadd.v") => { + _ if intrinsic.starts_with("llvm.aarch64.neon.sqadd.v") + || intrinsic.starts_with("llvm.aarch64.neon.uqadd.v") => + { intrinsic_args!(fx, args => (x, y); intrinsic); simd_pair_for_each_lane_typed(fx, x, y, ret, &|fx, x_lane, y_lane| { @@ -52,7 +54,9 @@ pub(crate) fn codegen_aarch64_llvm_intrinsic_call<'tcx>( }); } - _ if intrinsic.starts_with("llvm.aarch64.neon.sqsub.v") => { + _ if intrinsic.starts_with("llvm.aarch64.neon.sqsub.v") + || intrinsic.starts_with("llvm.aarch64.neon.uqsub.v") => + { intrinsic_args!(fx, args => (x, y); intrinsic); simd_pair_for_each_lane_typed(fx, x, y, ret, &|fx, x_lane, y_lane| { @@ -156,6 +160,90 @@ pub(crate) fn codegen_aarch64_llvm_intrinsic_call<'tcx>( }); } + _ if intrinsic.starts_with("llvm.aarch64.neon.umaxp.v") => { + 
intrinsic_args!(fx, args => (x, y); intrinsic); + + simd_horizontal_pair_for_each_lane( + fx, + x, + y, + ret, + &|fx, _lane_ty, _res_lane_ty, x_lane, y_lane| fx.bcx.ins().umax(x_lane, y_lane), + ); + } + + _ if intrinsic.starts_with("llvm.aarch64.neon.smaxp.v") => { + intrinsic_args!(fx, args => (x, y); intrinsic); + + simd_horizontal_pair_for_each_lane( + fx, + x, + y, + ret, + &|fx, _lane_ty, _res_lane_ty, x_lane, y_lane| fx.bcx.ins().smax(x_lane, y_lane), + ); + } + + _ if intrinsic.starts_with("llvm.aarch64.neon.uminp.v") => { + intrinsic_args!(fx, args => (x, y); intrinsic); + + simd_horizontal_pair_for_each_lane( + fx, + x, + y, + ret, + &|fx, _lane_ty, _res_lane_ty, x_lane, y_lane| fx.bcx.ins().umin(x_lane, y_lane), + ); + } + + _ if intrinsic.starts_with("llvm.aarch64.neon.sminp.v") => { + intrinsic_args!(fx, args => (x, y); intrinsic); + + simd_horizontal_pair_for_each_lane( + fx, + x, + y, + ret, + &|fx, _lane_ty, _res_lane_ty, x_lane, y_lane| fx.bcx.ins().smin(x_lane, y_lane), + ); + } + + _ if intrinsic.starts_with("llvm.aarch64.neon.fminp.v") => { + intrinsic_args!(fx, args => (x, y); intrinsic); + + simd_horizontal_pair_for_each_lane( + fx, + x, + y, + ret, + &|fx, _lane_ty, _res_lane_ty, x_lane, y_lane| fx.bcx.ins().fmin(x_lane, y_lane), + ); + } + + _ if intrinsic.starts_with("llvm.aarch64.neon.fmaxp.v") => { + intrinsic_args!(fx, args => (x, y); intrinsic); + + simd_horizontal_pair_for_each_lane( + fx, + x, + y, + ret, + &|fx, _lane_ty, _res_lane_ty, x_lane, y_lane| fx.bcx.ins().fmax(x_lane, y_lane), + ); + } + + _ if intrinsic.starts_with("llvm.aarch64.neon.addp.v") => { + intrinsic_args!(fx, args => (x, y); intrinsic); + + simd_horizontal_pair_for_each_lane( + fx, + x, + y, + ret, + &|fx, _lane_ty, _res_lane_ty, x_lane, y_lane| fx.bcx.ins().iadd(x_lane, y_lane), + ); + } + // FIXME generalize vector types "llvm.aarch64.neon.tbl1.v16i8" => { intrinsic_args!(fx, args => (t, idx); intrinsic); @@ -172,25 +260,6 @@ pub(crate) fn codegen_aarch64_llvm_intrinsic_call<'tcx>( } } - // FIXME generalize vector types - "llvm.aarch64.neon.umaxp.v16i8" => { - intrinsic_args!(fx, args => (a, b); intrinsic); - - // FIXME add helper for horizontal pairwise operations - for i in 0..8 { - let lane1 = a.value_lane(fx, i * 2).load_scalar(fx); - let lane2 = a.value_lane(fx, i * 2 + 1).load_scalar(fx); - let res = fx.bcx.ins().umax(lane1, lane2); - ret.place_lane(fx, i).to_ptr().store(fx, res, MemFlags::trusted()); - } - for i in 0..8 { - let lane1 = b.value_lane(fx, i * 2).load_scalar(fx); - let lane2 = b.value_lane(fx, i * 2 + 1).load_scalar(fx); - let res = fx.bcx.ins().umax(lane1, lane2); - ret.place_lane(fx, 8 + i).to_ptr().store(fx, res, MemFlags::trusted()); - } - } - /* _ if intrinsic.starts_with("llvm.aarch64.neon.sshl.v") || intrinsic.starts_with("llvm.aarch64.neon.sqshl.v") diff --git a/src/intrinsics/llvm_x86.rs b/src/intrinsics/llvm_x86.rs index ea5997a14bb..4c536048626 100644 --- a/src/intrinsics/llvm_x86.rs +++ b/src/intrinsics/llvm_x86.rs @@ -20,16 +20,21 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>( // Used by is_x86_feature_detected!(); "llvm.x86.xgetbv" => { - // FIXME use the actual xgetbv instruction - intrinsic_args!(fx, args => (v); intrinsic); + intrinsic_args!(fx, args => (xcr_no); intrinsic); - let v = v.load_scalar(fx); + let xcr_no = xcr_no.load_scalar(fx); - // As of writing on XCR0 exists - fx.bcx.ins().trapnz(v, TrapCode::UnreachableCodeReached); + crate::inline_asm::codegen_xgetbv(fx, xcr_no, ret); + } - let res = fx.bcx.ins().iconst(types::I64, 1 /* 
bit 0 must be set */); - ret.write_cvalue(fx, CValue::by_val(res, fx.layout_of(fx.tcx.types.i64))); + "llvm.x86.sse3.ldu.dq" | "llvm.x86.avx.ldu.dq.256" => { + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lddqu_si128&ig_expand=4009 + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_lddqu_si256&ig_expand=4010 + intrinsic_args!(fx, args => (ptr); intrinsic); + + // FIXME correctly handle unalignedness + let val = CValue::by_ref(Pointer::new(ptr.load_scalar(fx)), ret.layout()); + ret.write_cvalue(fx, val); } "llvm.x86.sse.cmp.ps" | "llvm.x86.sse2.cmp.pd" => { @@ -177,8 +182,12 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>( } } } - "llvm.x86.avx2.vperm2i128" => { + "llvm.x86.avx2.vperm2i128" + | "llvm.x86.avx.vperm2f128.ps.256" + | "llvm.x86.avx.vperm2f128.pd.256" => { // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2x128_si256 + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_ps + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_pd let (a, b, imm8) = match args { [a, b, imm8] => (a, b, imm8), _ => bug!("wrong number of args for intrinsic {intrinsic}"), @@ -187,19 +196,11 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>( let b = codegen_operand(fx, b); let imm8 = codegen_operand(fx, imm8).load_scalar(fx); - let a_0 = a.value_lane(fx, 0).load_scalar(fx); - let a_1 = a.value_lane(fx, 1).load_scalar(fx); - let a_low = fx.bcx.ins().iconcat(a_0, a_1); - let a_2 = a.value_lane(fx, 2).load_scalar(fx); - let a_3 = a.value_lane(fx, 3).load_scalar(fx); - let a_high = fx.bcx.ins().iconcat(a_2, a_3); + let a_low = a.value_typed_lane(fx, fx.tcx.types.u128, 0).load_scalar(fx); + let a_high = a.value_typed_lane(fx, fx.tcx.types.u128, 1).load_scalar(fx); - let b_0 = b.value_lane(fx, 0).load_scalar(fx); - let b_1 = b.value_lane(fx, 1).load_scalar(fx); - let b_low = fx.bcx.ins().iconcat(b_0, b_1); - let b_2 = b.value_lane(fx, 2).load_scalar(fx); - let b_3 = b.value_lane(fx, 3).load_scalar(fx); - let b_high = fx.bcx.ins().iconcat(b_2, b_3); + let b_low = b.value_typed_lane(fx, fx.tcx.types.u128, 0).load_scalar(fx); + let b_high = b.value_typed_lane(fx, fx.tcx.types.u128, 1).load_scalar(fx); fn select4( fx: &mut FunctionCx<'_, '_, '_>, @@ -224,16 +225,20 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>( let control0 = imm8; let res_low = select4(fx, a_high, a_low, b_high, b_low, control0); - let (res_0, res_1) = fx.bcx.ins().isplit(res_low); let control1 = fx.bcx.ins().ushr_imm(imm8, 4); let res_high = select4(fx, a_high, a_low, b_high, b_low, control1); - let (res_2, res_3) = fx.bcx.ins().isplit(res_high); - ret.place_lane(fx, 0).to_ptr().store(fx, res_0, MemFlags::trusted()); - ret.place_lane(fx, 1).to_ptr().store(fx, res_1, MemFlags::trusted()); - ret.place_lane(fx, 2).to_ptr().store(fx, res_2, MemFlags::trusted()); - ret.place_lane(fx, 3).to_ptr().store(fx, res_3, MemFlags::trusted()); + ret.place_typed_lane(fx, fx.tcx.types.u128, 0).to_ptr().store( + fx, + res_low, + MemFlags::trusted(), + ); + ret.place_typed_lane(fx, fx.tcx.types.u128, 1).to_ptr().store( + fx, + res_high, + MemFlags::trusted(), + ); } "llvm.x86.ssse3.pabs.b.128" | "llvm.x86.ssse3.pabs.w.128" | "llvm.x86.ssse3.pabs.d.128" => { let a = match args { @@ -309,7 +314,9 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>( fx.bcx.ins().sshr(a_lane, saturated_count) }); } - "llvm.x86.sse2.psad.bw" 
=> { + "llvm.x86.sse2.psad.bw" | "llvm.x86.avx2.psad.bw" => { + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_epu8&ig_expand=5770 + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sad_epu8&ig_expand=5771 intrinsic_args!(fx, args => (a, b); intrinsic); assert_eq!(a.layout(), b.layout()); @@ -340,7 +347,9 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>( ret.place_lane(fx, out_lane_idx).write_cvalue(fx, res_lane); } } - "llvm.x86.ssse3.pmadd.ub.sw.128" => { + "llvm.x86.ssse3.pmadd.ub.sw.128" | "llvm.x86.avx2.pmadd.ub.sw" => { + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_epi16&ig_expand=4267 + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maddubs_epi16&ig_expand=4270 intrinsic_args!(fx, args => (a, b); intrinsic); let (lane_count, lane_ty) = a.layout().ty.simd_size_and_type(fx.tcx); @@ -379,7 +388,9 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>( ret.place_lane(fx, out_lane_idx).write_cvalue(fx, res_lane); } } - "llvm.x86.sse2.pmadd.wd" => { + "llvm.x86.sse2.pmadd.wd" | "llvm.x86.avx2.pmadd.wd" => { + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd_epi16&ig_expand=4231 + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_madd_epi16&ig_expand=4234 intrinsic_args!(fx, args => (a, b); intrinsic); assert_eq!(a.layout(), b.layout()); @@ -412,6 +423,369 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>( ret.place_lane(fx, out_lane_idx).write_cvalue(fx, res_lane); } } + + "llvm.x86.ssse3.pmul.hr.sw.128" => { + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_epi16&ig_expand=4782 + intrinsic_args!(fx, args => (a, b); intrinsic); + + assert_eq!(a.layout(), b.layout()); + let layout = a.layout(); + + let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx); + let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx); + assert_eq!(lane_ty, fx.tcx.types.i16); + assert_eq!(ret_lane_ty, fx.tcx.types.i16); + assert_eq!(lane_count, ret_lane_count); + + let ret_lane_layout = fx.layout_of(fx.tcx.types.i16); + for out_lane_idx in 0..lane_count { + let a_lane = a.value_lane(fx, out_lane_idx).load_scalar(fx); + let a_lane = fx.bcx.ins().sextend(types::I32, a_lane); + let b_lane = b.value_lane(fx, out_lane_idx).load_scalar(fx); + let b_lane = fx.bcx.ins().sextend(types::I32, b_lane); + + let mul: Value = fx.bcx.ins().imul(a_lane, b_lane); + let shifted = fx.bcx.ins().ushr_imm(mul, 14); + let incremented = fx.bcx.ins().iadd_imm(shifted, 1); + let shifted_again = fx.bcx.ins().ushr_imm(incremented, 1); + + let res_lane = fx.bcx.ins().ireduce(types::I16, shifted_again); + let res_lane = CValue::by_val(res_lane, ret_lane_layout); + + ret.place_lane(fx, out_lane_idx).write_cvalue(fx, res_lane); + } + } + + "llvm.x86.sse2.packuswb.128" => { + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi16&ig_expand=4903 + intrinsic_args!(fx, args => (a, b); intrinsic); + + assert_eq!(a.layout(), b.layout()); + let layout = a.layout(); + + let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx); + let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx); + assert_eq!(lane_ty, fx.tcx.types.i16); + assert_eq!(ret_lane_ty, fx.tcx.types.u8); + assert_eq!(lane_count * 2, ret_lane_count); + + let zero = 
fx.bcx.ins().iconst(types::I16, 0); + let max_u8 = fx.bcx.ins().iconst(types::I16, 255); + let ret_lane_layout = fx.layout_of(fx.tcx.types.u8); + + for idx in 0..lane_count { + let lane = a.value_lane(fx, idx).load_scalar(fx); + let sat = fx.bcx.ins().smax(lane, zero); + let sat = fx.bcx.ins().umin(sat, max_u8); + let res = fx.bcx.ins().ireduce(types::I8, sat); + + let res_lane = CValue::by_val(res, ret_lane_layout); + ret.place_lane(fx, idx).write_cvalue(fx, res_lane); + } + + for idx in 0..lane_count { + let lane = b.value_lane(fx, idx).load_scalar(fx); + let sat = fx.bcx.ins().smax(lane, zero); + let sat = fx.bcx.ins().umin(sat, max_u8); + let res = fx.bcx.ins().ireduce(types::I8, sat); + + let res_lane = CValue::by_val(res, ret_lane_layout); + ret.place_lane(fx, lane_count + idx).write_cvalue(fx, res_lane); + } + } + + "llvm.x86.avx2.packuswb" => { + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packus_epi16&ig_expand=4906 + intrinsic_args!(fx, args => (a, b); intrinsic); + + assert_eq!(a.layout(), b.layout()); + let layout = a.layout(); + + let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx); + let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx); + assert_eq!(lane_ty, fx.tcx.types.i16); + assert_eq!(ret_lane_ty, fx.tcx.types.u8); + assert_eq!(lane_count * 2, ret_lane_count); + + let zero = fx.bcx.ins().iconst(types::I16, 0); + let max_u8 = fx.bcx.ins().iconst(types::I16, 255); + let ret_lane_layout = fx.layout_of(fx.tcx.types.u8); + + for idx in 0..lane_count / 2 { + let lane = a.value_lane(fx, idx).load_scalar(fx); + let sat = fx.bcx.ins().smax(lane, zero); + let sat = fx.bcx.ins().umin(sat, max_u8); + let res = fx.bcx.ins().ireduce(types::I8, sat); + + let res_lane = CValue::by_val(res, ret_lane_layout); + ret.place_lane(fx, idx).write_cvalue(fx, res_lane); + } + + for idx in 0..lane_count / 2 { + let lane = b.value_lane(fx, idx).load_scalar(fx); + let sat = fx.bcx.ins().smax(lane, zero); + let sat = fx.bcx.ins().umin(sat, max_u8); + let res = fx.bcx.ins().ireduce(types::I8, sat); + + let res_lane = CValue::by_val(res, ret_lane_layout); + ret.place_lane(fx, lane_count / 2 + idx).write_cvalue(fx, res_lane); + } + + for idx in 0..lane_count / 2 { + let lane = a.value_lane(fx, lane_count / 2 + idx).load_scalar(fx); + let sat = fx.bcx.ins().smax(lane, zero); + let sat = fx.bcx.ins().umin(sat, max_u8); + let res = fx.bcx.ins().ireduce(types::I8, sat); + + let res_lane = CValue::by_val(res, ret_lane_layout); + ret.place_lane(fx, lane_count / 2 * 2 + idx).write_cvalue(fx, res_lane); + } + + for idx in 0..lane_count / 2 { + let lane = b.value_lane(fx, lane_count / 2 + idx).load_scalar(fx); + let sat = fx.bcx.ins().smax(lane, zero); + let sat = fx.bcx.ins().umin(sat, max_u8); + let res = fx.bcx.ins().ireduce(types::I8, sat); + + let res_lane = CValue::by_val(res, ret_lane_layout); + ret.place_lane(fx, lane_count / 2 * 3 + idx).write_cvalue(fx, res_lane); + } + } + + "llvm.x86.sse2.packssdw.128" => { + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi32&ig_expand=4889 + intrinsic_args!(fx, args => (a, b); intrinsic); + + assert_eq!(a.layout(), b.layout()); + let layout = a.layout(); + + let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx); + let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx); + assert_eq!(lane_ty, fx.tcx.types.i32); + assert_eq!(ret_lane_ty, fx.tcx.types.i16); + assert_eq!(lane_count * 2, ret_lane_count); + + let min_i16 = 
fx.bcx.ins().iconst(types::I32, i64::from(i16::MIN)); + let max_i16 = fx.bcx.ins().iconst(types::I32, i64::from(i16::MAX)); + let ret_lane_layout = fx.layout_of(fx.tcx.types.i16); + + for idx in 0..lane_count { + let lane = a.value_lane(fx, idx).load_scalar(fx); + let sat = fx.bcx.ins().smax(lane, min_i16); + let sat = fx.bcx.ins().smin(sat, max_i16); + let res = fx.bcx.ins().ireduce(types::I16, sat); + + let res_lane = CValue::by_val(res, ret_lane_layout); + ret.place_lane(fx, idx).write_cvalue(fx, res_lane); + } + + for idx in 0..lane_count { + let lane = b.value_lane(fx, idx).load_scalar(fx); + let sat = fx.bcx.ins().smax(lane, min_i16); + let sat = fx.bcx.ins().smin(sat, max_i16); + let res = fx.bcx.ins().ireduce(types::I16, sat); + + let res_lane = CValue::by_val(res, ret_lane_layout); + ret.place_lane(fx, lane_count + idx).write_cvalue(fx, res_lane); + } + } + + "llvm.x86.sse41.packusdw" => { + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi32&ig_expand=4912 + intrinsic_args!(fx, args => (a, b); intrinsic); + + assert_eq!(a.layout(), b.layout()); + let layout = a.layout(); + + let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx); + let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx); + assert_eq!(lane_ty, fx.tcx.types.i32); + assert_eq!(ret_lane_ty, fx.tcx.types.u16); + assert_eq!(lane_count * 2, ret_lane_count); + + let min_u16 = fx.bcx.ins().iconst(types::I32, i64::from(u16::MIN)); + let max_u16 = fx.bcx.ins().iconst(types::I32, i64::from(u16::MAX)); + let ret_lane_layout = fx.layout_of(fx.tcx.types.u16); + + for idx in 0..lane_count { + let lane = a.value_lane(fx, idx).load_scalar(fx); + let sat = fx.bcx.ins().smax(lane, min_u16); + let sat = fx.bcx.ins().umin(sat, max_u16); + let res = fx.bcx.ins().ireduce(types::I16, sat); + + let res_lane = CValue::by_val(res, ret_lane_layout); + ret.place_lane(fx, idx).write_cvalue(fx, res_lane); + } + + for idx in 0..lane_count { + let lane = b.value_lane(fx, idx).load_scalar(fx); + let sat = fx.bcx.ins().smax(lane, min_u16); + let sat = fx.bcx.ins().umin(sat, max_u16); + let res = fx.bcx.ins().ireduce(types::I16, sat); + + let res_lane = CValue::by_val(res, ret_lane_layout); + ret.place_lane(fx, lane_count + idx).write_cvalue(fx, res_lane); + } + } + + "llvm.x86.avx2.packssdw" => { + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi32&ig_expand=4892 + intrinsic_args!(fx, args => (a, b); intrinsic); + + assert_eq!(a.layout(), b.layout()); + let layout = a.layout(); + + let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx); + let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx); + assert_eq!(lane_ty, fx.tcx.types.i32); + assert_eq!(ret_lane_ty, fx.tcx.types.i16); + assert_eq!(lane_count * 2, ret_lane_count); + + let min_i16 = fx.bcx.ins().iconst(types::I32, i64::from(i16::MIN)); + let max_i16 = fx.bcx.ins().iconst(types::I32, i64::from(i16::MAX)); + let ret_lane_layout = fx.layout_of(fx.tcx.types.i16); + + for idx in 0..lane_count / 2 { + let lane = a.value_lane(fx, idx).load_scalar(fx); + let sat = fx.bcx.ins().smax(lane, min_i16); + let sat = fx.bcx.ins().smin(sat, max_i16); + let res = fx.bcx.ins().ireduce(types::I16, sat); + + let res_lane = CValue::by_val(res, ret_lane_layout); + ret.place_lane(fx, idx).write_cvalue(fx, res_lane); + } + + for idx in 0..lane_count / 2 { + let lane = b.value_lane(fx, idx).load_scalar(fx); + let sat = 
fx.bcx.ins().smax(lane, min_i16); + let sat = fx.bcx.ins().smin(sat, max_i16); + let res = fx.bcx.ins().ireduce(types::I16, sat); + + let res_lane = CValue::by_val(res, ret_lane_layout); + ret.place_lane(fx, lane_count / 2 + idx).write_cvalue(fx, res_lane); + } + + for idx in 0..lane_count / 2 { + let lane = a.value_lane(fx, lane_count / 2 + idx).load_scalar(fx); + let sat = fx.bcx.ins().smax(lane, min_i16); + let sat = fx.bcx.ins().smin(sat, max_i16); + let res = fx.bcx.ins().ireduce(types::I16, sat); + + let res_lane = CValue::by_val(res, ret_lane_layout); + ret.place_lane(fx, lane_count / 2 * 2 + idx).write_cvalue(fx, res_lane); + } + + for idx in 0..lane_count / 2 { + let lane = b.value_lane(fx, lane_count / 2 + idx).load_scalar(fx); + let sat = fx.bcx.ins().smax(lane, min_i16); + let sat = fx.bcx.ins().smin(sat, max_i16); + let res = fx.bcx.ins().ireduce(types::I16, sat); + + let res_lane = CValue::by_val(res, ret_lane_layout); + ret.place_lane(fx, lane_count / 2 * 3 + idx).write_cvalue(fx, res_lane); + } + } + + "llvm.x86.pclmulqdq" => { + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clmulepi64_si128&ig_expand=772 + intrinsic_args!(fx, args => (a, b, imm8); intrinsic); + + assert_eq!(a.layout(), b.layout()); + let layout = a.layout(); + + let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx); + let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx); + assert_eq!(lane_ty, fx.tcx.types.i64); + assert_eq!(ret_lane_ty, fx.tcx.types.i64); + assert_eq!(lane_count, 2); + assert_eq!(ret_lane_count, 2); + + let imm8 = imm8.load_scalar(fx); + + let control0 = fx.bcx.ins().band_imm(imm8, 0b0000_0001); + let a_lane0 = a.value_lane(fx, 0).load_scalar(fx); + let a_lane1 = a.value_lane(fx, 1).load_scalar(fx); + let temp1 = fx.bcx.ins().select(control0, a_lane1, a_lane0); + + let control4 = fx.bcx.ins().band_imm(imm8, 0b0001_0000); + let b_lane0 = b.value_lane(fx, 0).load_scalar(fx); + let b_lane1 = b.value_lane(fx, 1).load_scalar(fx); + let temp2 = fx.bcx.ins().select(control4, b_lane1, b_lane0); + + fn extract_bit(fx: &mut FunctionCx<'_, '_, '_>, val: Value, bit: i64) -> Value { + let tmp = fx.bcx.ins().ushr_imm(val, bit); + fx.bcx.ins().band_imm(tmp, 1) + } + + let mut res1 = fx.bcx.ins().iconst(types::I64, 0); + for i in 0..=63 { + let x = extract_bit(fx, temp1, 0); + let y = extract_bit(fx, temp2, i); + let mut temp = fx.bcx.ins().band(x, y); + for j in 1..=i { + let x = extract_bit(fx, temp1, j); + let y = extract_bit(fx, temp2, i - j); + let z = fx.bcx.ins().band(x, y); + temp = fx.bcx.ins().bxor(temp, z); + } + let temp = fx.bcx.ins().ishl_imm(temp, i); + res1 = fx.bcx.ins().bor(res1, temp); + } + ret.place_lane(fx, 0).to_ptr().store(fx, res1, MemFlags::trusted()); + + let mut res2 = fx.bcx.ins().iconst(types::I64, 0); + for i in 64..=127 { + let mut temp = fx.bcx.ins().iconst(types::I64, 0); + for j in i - 63..=63 { + let x = extract_bit(fx, temp1, j); + let y = extract_bit(fx, temp2, i - j); + let z = fx.bcx.ins().band(x, y); + temp = fx.bcx.ins().bxor(temp, z); + } + let temp = fx.bcx.ins().ishl_imm(temp, i); + res2 = fx.bcx.ins().bor(res2, temp); + } + ret.place_lane(fx, 1).to_ptr().store(fx, res2, MemFlags::trusted()); + } + + "llvm.x86.avx.ptestz.256" => { + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_si256&ig_expand=6945 + intrinsic_args!(fx, args => (a, b); intrinsic); + + assert_eq!(a.layout(), b.layout()); + let layout = a.layout(); + + let (lane_count, lane_ty) = 
layout.ty.simd_size_and_type(fx.tcx); + assert_eq!(lane_ty, fx.tcx.types.i64); + assert_eq!(ret.layout().ty, fx.tcx.types.i32); + assert_eq!(lane_count, 4); + + let a_lane0 = a.value_lane(fx, 0).load_scalar(fx); + let a_lane1 = a.value_lane(fx, 1).load_scalar(fx); + let a_lane2 = a.value_lane(fx, 2).load_scalar(fx); + let a_lane3 = a.value_lane(fx, 3).load_scalar(fx); + let b_lane0 = b.value_lane(fx, 0).load_scalar(fx); + let b_lane1 = b.value_lane(fx, 1).load_scalar(fx); + let b_lane2 = b.value_lane(fx, 2).load_scalar(fx); + let b_lane3 = b.value_lane(fx, 3).load_scalar(fx); + + let zero0 = fx.bcx.ins().band(a_lane0, b_lane0); + let zero1 = fx.bcx.ins().band(a_lane1, b_lane1); + let zero2 = fx.bcx.ins().band(a_lane2, b_lane2); + let zero3 = fx.bcx.ins().band(a_lane3, b_lane3); + + let all_zero0 = fx.bcx.ins().bor(zero0, zero1); + let all_zero1 = fx.bcx.ins().bor(zero2, zero3); + let all_zero = fx.bcx.ins().bor(all_zero0, all_zero1); + + let res = fx.bcx.ins().icmp_imm(IntCC::Equal, all_zero, 0); + let res = CValue::by_val( + fx.bcx.ins().uextend(types::I32, res), + fx.layout_of(fx.tcx.types.i32), + ); + ret.write_cvalue(fx, res); + } + _ => { fx.tcx .sess diff --git a/src/intrinsics/mod.rs b/src/intrinsics/mod.rs index 83d5d53624e..bfeeb117ff5 100644 --- a/src/intrinsics/mod.rs +++ b/src/intrinsics/mod.rs @@ -132,6 +132,65 @@ fn simd_pair_for_each_lane<'tcx>( } } +fn simd_horizontal_pair_for_each_lane<'tcx>( + fx: &mut FunctionCx<'_, '_, 'tcx>, + x: CValue<'tcx>, + y: CValue<'tcx>, + ret: CPlace<'tcx>, + f: &dyn Fn(&mut FunctionCx<'_, '_, 'tcx>, Ty<'tcx>, Ty<'tcx>, Value, Value) -> Value, +) { + assert_eq!(x.layout(), y.layout()); + let layout = x.layout(); + + let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx); + let lane_layout = fx.layout_of(lane_ty); + let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx); + let ret_lane_layout = fx.layout_of(ret_lane_ty); + assert_eq!(lane_count, ret_lane_count); + + for lane_idx in 0..lane_count { + let src = if lane_idx < (lane_count / 2) { x } else { y }; + let src_idx = lane_idx % (lane_count / 2); + + let lhs_lane = src.value_lane(fx, src_idx * 2).load_scalar(fx); + let rhs_lane = src.value_lane(fx, src_idx * 2 + 1).load_scalar(fx); + + let res_lane = f(fx, lane_layout.ty, ret_lane_layout.ty, lhs_lane, rhs_lane); + let res_lane = CValue::by_val(res_lane, ret_lane_layout); + + ret.place_lane(fx, lane_idx).write_cvalue(fx, res_lane); + } +} + +fn simd_trio_for_each_lane<'tcx>( + fx: &mut FunctionCx<'_, '_, 'tcx>, + x: CValue<'tcx>, + y: CValue<'tcx>, + z: CValue<'tcx>, + ret: CPlace<'tcx>, + f: &dyn Fn(&mut FunctionCx<'_, '_, 'tcx>, Ty<'tcx>, Ty<'tcx>, Value, Value, Value) -> Value, +) { + assert_eq!(x.layout(), y.layout()); + let layout = x.layout(); + + let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx); + let lane_layout = fx.layout_of(lane_ty); + let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx); + let ret_lane_layout = fx.layout_of(ret_lane_ty); + assert_eq!(lane_count, ret_lane_count); + + for lane_idx in 0..lane_count { + let x_lane = x.value_lane(fx, lane_idx).load_scalar(fx); + let y_lane = y.value_lane(fx, lane_idx).load_scalar(fx); + let z_lane = z.value_lane(fx, lane_idx).load_scalar(fx); + + let res_lane = f(fx, lane_layout.ty, ret_lane_layout.ty, x_lane, y_lane, z_lane); + let res_lane = CValue::by_val(res_lane, ret_lane_layout); + + ret.place_lane(fx, lane_idx).write_cvalue(fx, res_lane); + } +} + fn simd_reduce<'tcx>( fx: &mut FunctionCx<'_, '_, 
'tcx>, val: CValue<'tcx>, diff --git a/src/value_and_place.rs b/src/value_and_place.rs index 5f0aa6c5581..21ad2a835fc 100644 --- a/src/value_and_place.rs +++ b/src/value_and_place.rs @@ -243,6 +243,34 @@ impl<'tcx> CValue<'tcx> { let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx); let lane_layout = fx.layout_of(lane_ty); assert!(lane_idx < lane_count); + + match self.0 { + CValueInner::ByVal(_) | CValueInner::ByValPair(_, _) => unreachable!(), + CValueInner::ByRef(ptr, None) => { + let field_offset = lane_layout.size * lane_idx; + let field_ptr = ptr.offset_i64(fx, i64::try_from(field_offset.bytes()).unwrap()); + CValue::by_ref(field_ptr, lane_layout) + } + CValueInner::ByRef(_, Some(_)) => unreachable!(), + } + } + + /// Like [`CValue::value_lane`] except using the passed type as lane type instead of the one + /// specified by the vector type. + pub(crate) fn value_typed_lane( + self, + fx: &mut FunctionCx<'_, '_, 'tcx>, + lane_ty: Ty<'tcx>, + lane_idx: u64, + ) -> CValue<'tcx> { + let layout = self.1; + assert!(layout.ty.is_simd()); + let (orig_lane_count, orig_lane_ty) = layout.ty.simd_size_and_type(fx.tcx); + let lane_layout = fx.layout_of(lane_ty); + assert!( + (lane_idx + 1) * lane_layout.size <= orig_lane_count * fx.layout_of(orig_lane_ty).size + ); + match self.0 { CValueInner::ByVal(_) | CValueInner::ByValPair(_, _) => unreachable!(), CValueInner::ByRef(ptr, None) => { @@ -734,6 +762,34 @@ impl<'tcx> CPlace<'tcx> { } } + /// Like [`CPlace::place_lane`] except using the passed type as lane type instead of the one + /// specified by the vector type. + pub(crate) fn place_typed_lane( + self, + fx: &mut FunctionCx<'_, '_, 'tcx>, + lane_ty: Ty<'tcx>, + lane_idx: u64, + ) -> CPlace<'tcx> { + let layout = self.layout(); + assert!(layout.ty.is_simd()); + let (orig_lane_count, orig_lane_ty) = layout.ty.simd_size_and_type(fx.tcx); + let lane_layout = fx.layout_of(lane_ty); + assert!( + (lane_idx + 1) * lane_layout.size <= orig_lane_count * fx.layout_of(orig_lane_ty).size + ); + + match self.inner { + CPlaceInner::Var(_, _) => unreachable!(), + CPlaceInner::VarPair(_, _, _) => unreachable!(), + CPlaceInner::Addr(ptr, None) => { + let field_offset = lane_layout.size * lane_idx; + let field_ptr = ptr.offset_i64(fx, i64::try_from(field_offset.bytes()).unwrap()); + CPlace::for_ptr(field_ptr, lane_layout) + } + CPlaceInner::Addr(_, Some(_)) => unreachable!(), + } + } + pub(crate) fn place_index( self, fx: &mut FunctionCx<'_, '_, 'tcx>, From cdae1c939fcf9fadfbd79225c8710ca87a9e6e53 Mon Sep 17 00:00:00 2001 From: bjorn3 <17426603+bjorn3@users.noreply.github.com> Date: Sat, 11 Nov 2023 13:19:49 +0000 Subject: [PATCH 02/14] Set CG_CLIF_FORCE_GNU_AS for all rustc tests --- scripts/setup_rust_fork.sh | 8 +++++--- scripts/test_bootstrap.sh | 4 +--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/scripts/setup_rust_fork.sh b/scripts/setup_rust_fork.sh index bbb8a010d96..197f2d28fdd 100644 --- a/scripts/setup_rust_fork.sh +++ b/scripts/setup_rust_fork.sh @@ -1,12 +1,14 @@ #!/usr/bin/env bash set -e +# CG_CLIF_FORCE_GNU_AS will force usage of as instead of the LLVM backend of rustc, as +# the LLVM backend isn't compiled in here. +export CG_CLIF_FORCE_GNU_AS=1 + # Compiletest expects all standard library paths to start with /rustc/FAKE_PREFIX. # CG_CLIF_STDLIB_REMAP_PATH_PREFIX will cause cg_clif's build system to pass # --remap-path-prefix to handle this. 
-# CG_CLIF_FORCE_GNU_AS will force usage of as instead of the LLVM backend of rustc as we -# the LLVM backend isn't compiled in here. -CG_CLIF_FORCE_GNU_AS=1 CG_CLIF_STDLIB_REMAP_PATH_PREFIX=/rustc/FAKE_PREFIX ./y.sh build +CG_CLIF_STDLIB_REMAP_PATH_PREFIX=/rustc/FAKE_PREFIX ./y.sh build echo "[SETUP] Rust fork" git clone https://github.com/rust-lang/rust.git || true diff --git a/scripts/test_bootstrap.sh b/scripts/test_bootstrap.sh index a8f6d7a2024..791d457993d 100755 --- a/scripts/test_bootstrap.sh +++ b/scripts/test_bootstrap.sh @@ -11,7 +11,5 @@ rm -r compiler/rustc_codegen_cranelift/{Cargo.*,src} cp ../Cargo.* compiler/rustc_codegen_cranelift/ cp -r ../src compiler/rustc_codegen_cranelift/src -# CG_CLIF_FORCE_GNU_AS will force usage of as instead of the LLVM backend of rustc as we -# the LLVM backend isn't compiled in here. -CG_CLIF_FORCE_GNU_AS=1 ./x.py build --stage 1 library/std +./x.py build --stage 1 library/std popd From d8445b1cf2e9666d0f7f72ce95f29e0d5549b6c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Ber=C3=A1nek?= Date: Thu, 9 Nov 2023 16:46:53 +0100 Subject: [PATCH 03/14] Add Rustup installation instructions to README Now that cranelift is distributed with Rustup, I find myself constantly searching for the https://github.com/rust-lang/rust/pull/81746 PR. I think that it would be useful to also describe the Rustup installation/usage instructions here in README. --- Readme.md | 49 ++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 40 insertions(+), 9 deletions(-) diff --git a/Readme.md b/Readme.md index 1a2b2bbc588..ca6ecdf1d0e 100644 --- a/Readme.md +++ b/Readme.md @@ -5,8 +5,48 @@ This has the potential to improve compilation times in debug mode. If your project doesn't use any of the things listed under "Not yet supported", it should work fine. If not please open an issue. +## Download using Rustup + +The Cranelift codegen backend is distributed in nightly builds on Linux and x86_64 macOS. If you want to +install it using Rustup, you can do that by running: + +```bash +$ rustup component add rustc-codegen-cranelift-preview --toolchain nightly +``` + +Once it is installed, you can enable it with one of the following approaches: +- `CARGO_PROFILE_DEV_CODEGEN_BACKEND=cranelift cargo +nightly build -Zcodegen-backend` +- `RUSTFLAGS="-Zcodegen-backend=cranelift" cargo +nightly build` +- Add the following to `.cargo/config.toml`: + ```toml + [unstable] + codegen-backend = true + + [profile.dev] + codegen-backend = "cranelift" + ``` +- Add the following to `Cargo.toml`: + ```toml + # This line needs to come before anything else in Cargo.toml + cargo-features = ["codegen-backend"] + + [profile.dev] + codegen-backend = "cranelift" + ``` + +## Precompiled builds + +You can also download a pre-built version from the [releases] page. +Extract the `dist` directory in the archive anywhere you want. +If you want to use `cargo clif build` instead of having to specify the full path to the `cargo-clif` executable, you can add the `bin` subdirectory of the extracted `dist` directory to your `PATH`. +(tutorial [for Windows](https://stackoverflow.com/a/44272417), and [for Linux/MacOS](https://unix.stackexchange.com/questions/26047/how-to-correctly-add-a-path-to-path/26059#26059)). 
+ +[releases]: https://github.com/rust-lang/rustc_codegen_cranelift/releases/tag/dev + ## Building and testing +If you want to build the backend manually, you can download it from GitHub and build it yourself: + ```bash $ git clone https://github.com/rust-lang/rustc_codegen_cranelift $ cd rustc_codegen_cranelift @@ -22,15 +62,6 @@ $ ./test.sh For more docs on how to build and test see [build_system/usage.txt](build_system/usage.txt) or the help message of `./y.sh`. -## Precompiled builds - -Alternatively you can download a pre built version from the [releases] page. -Extract the `dist` directory in the archive anywhere you want. -If you want to use `cargo clif build` instead of having to specify the full path to the `cargo-clif` executable, you can add the `bin` subdirectory of the extracted `dist` directory to your `PATH`. -(tutorial [for Windows](https://stackoverflow.com/a/44272417), and [for Linux/MacOS](https://unix.stackexchange.com/questions/26047/how-to-correctly-add-a-path-to-path/26059#26059)). - -[releases]: https://github.com/rust-lang/rustc_codegen_cranelift/releases/tag/dev - ## Usage rustc_codegen_cranelift can be used as a near-drop-in replacement for `cargo build` or `cargo run` for existing projects. From ede3269ed80ebf60e8c996396ce3231e6ba4ebbd Mon Sep 17 00:00:00 2001 From: bjorn3 <17426603+bjorn3@users.noreply.github.com> Date: Sun, 12 Nov 2023 16:59:15 +0000 Subject: [PATCH 04/14] Print /proc/cpuinfo for the CI runner where relevant --- .github/workflows/abi-cafe.yml | 4 ++++ .github/workflows/main.yml | 10 ++++++++++ .github/workflows/rustc.yml | 6 ++++++ 3 files changed, 20 insertions(+) diff --git a/.github/workflows/abi-cafe.yml b/.github/workflows/abi-cafe.yml index 12aa69d3c79..bd3b051185b 100644 --- a/.github/workflows/abi-cafe.yml +++ b/.github/workflows/abi-cafe.yml @@ -35,6 +35,10 @@ jobs: steps: - uses: actions/checkout@v3 + - name: CPU features + if: matrix.os == 'ubuntu-latest' + run: cat /proc/cpuinfo + - name: Cache cargo target dir uses: actions/cache@v3 with: diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 47d9a3b93f7..05dc28d0745 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -66,6 +66,10 @@ jobs: steps: - uses: actions/checkout@v3 + - name: CPU features + if: matrix.os == 'ubuntu-latest' + run: cat /proc/cpuinfo + - name: Cache cargo target dir uses: actions/cache@v3 with: @@ -136,6 +140,9 @@ jobs: steps: - uses: actions/checkout@v3 + - name: CPU features + run: cat /proc/cpuinfo + - name: Prepare dependencies run: ./y.sh prepare @@ -159,6 +166,9 @@ jobs: steps: - uses: actions/checkout@v3 + - name: CPU features + run: cat /proc/cpuinfo + - name: Cache cargo target dir uses: actions/cache@v3 with: diff --git a/.github/workflows/rustc.yml b/.github/workflows/rustc.yml index b49dc3aff7a..cb5dd51fee3 100644 --- a/.github/workflows/rustc.yml +++ b/.github/workflows/rustc.yml @@ -11,6 +11,9 @@ jobs: steps: - uses: actions/checkout@v3 + - name: CPU features + run: cat /proc/cpuinfo + - name: Cache cargo target dir uses: actions/cache@v3 with: @@ -31,6 +34,9 @@ jobs: steps: - uses: actions/checkout@v3 + - name: CPU features + run: cat /proc/cpuinfo + - name: Cache cargo target dir uses: actions/cache@v3 with: From 794707078987ceb9e72cb22820e1c818fd6c212e Mon Sep 17 00:00:00 2001 From: bjorn3 <17426603+bjorn3@users.noreply.github.com> Date: Sat, 11 Nov 2023 19:43:42 +0000 Subject: [PATCH 05/14] Minor changes to codegen_inline_asm_terminator --- src/base.rs | 2 +- src/inline_asm.rs | 7 +++---- 2 files 
changed, 4 insertions(+), 5 deletions(-) diff --git a/src/base.rs b/src/base.rs index 91b1547cb6e..71557d49ef2 100644 --- a/src/base.rs +++ b/src/base.rs @@ -456,7 +456,7 @@ fn codegen_fn_body(fx: &mut FunctionCx<'_, '_, '_>, start_block: Block) { ); } - crate::inline_asm::codegen_inline_asm( + crate::inline_asm::codegen_inline_asm_terminator( fx, source_info.span, template, diff --git a/src/inline_asm.rs b/src/inline_asm.rs index ce0eecca8a8..0159ec5db9e 100644 --- a/src/inline_asm.rs +++ b/src/inline_asm.rs @@ -34,7 +34,7 @@ enum CInlineAsmOperand<'tcx> { }, } -pub(crate) fn codegen_inline_asm<'tcx>( +pub(crate) fn codegen_inline_asm_terminator<'tcx>( fx: &mut FunctionCx<'_, '_, 'tcx>, span: Span, template: &[InlineAsmTemplatePiece], @@ -135,9 +135,6 @@ pub(crate) fn codegen_inline_asm_terminator<'tcx>( }) .collect::<Vec<_>>(); - let mut inputs = Vec::new(); - let mut outputs = Vec::new(); - let mut asm_gen = InlineAssemblyGenerator { tcx: fx.tcx, arch: fx.tcx.sess.asm_arch.unwrap(), @@ -165,6 +162,8 @@ pub(crate) fn codegen_inline_asm_terminator<'tcx>( let generated_asm = asm_gen.generate_asm_wrapper(&asm_name); fx.cx.global_asm.push_str(&generated_asm); + let mut inputs = Vec::new(); + let mut outputs = Vec::new(); for (i, operand) in operands.iter().enumerate() { match operand { CInlineAsmOperand::In { reg: _, value } => { From 9205667c0d68f9cdafc2cecb6cdeefb651cefb78 Mon Sep 17 00:00:00 2001 From: bjorn3 <17426603+bjorn3@users.noreply.github.com> Date: Sat, 11 Nov 2023 19:48:30 +0000 Subject: [PATCH 06/14] Extract codegen_inline_asm_inner function and use in codegen_xgetbv --- src/inline_asm.rs | 82 +++++++++++++++-------------------------------- 1 file changed, 26 insertions(+), 56 deletions(-) diff --git a/src/inline_asm.rs b/src/inline_asm.rs index 0159ec5db9e..90a63174fe3 100644 --- a/src/inline_asm.rs +++ b/src/inline_asm.rs @@ -42,8 +42,6 @@ pub(crate) fn codegen_inline_asm_terminator<'tcx>( options: InlineAsmOptions, destination: Option<mir::BasicBlock>, ) { - // FIXME add .eh_frame unwind info directives - // Used by panic_abort on Windows, but uses a syntax which only happens to work with // asm!() by accident and breaks with the GNU assembler as well as global_asm!() for // the LLVM backend. 
@@ -135,12 +133,33 @@ pub(crate) fn codegen_inline_asm_terminator<'tcx>( }) .collect::<Vec<_>>(); + codegen_inline_asm_inner(fx, template, &operands, options); + + match destination { + Some(destination) => { + let destination_block = fx.get_block(destination); + fx.bcx.ins().jump(destination_block, &[]); + } + None => { + fx.bcx.ins().trap(TrapCode::UnreachableCodeReached); + } + } +} + +fn codegen_inline_asm_inner<'tcx>( + fx: &mut FunctionCx<'_, '_, 'tcx>, + template: &[InlineAsmTemplatePiece], + operands: &[CInlineAsmOperand<'tcx>], + options: InlineAsmOptions, +) { + // FIXME add .eh_frame unwind info directives + let mut asm_gen = InlineAssemblyGenerator { tcx: fx.tcx, arch: fx.tcx.sess.asm_arch.unwrap(), enclosing_def_id: fx.instance.def_id(), template, - operands: &operands, + operands, options, registers: Vec::new(), stack_slots_clobber: Vec::new(), @@ -185,16 +204,6 @@ pub(crate) fn codegen_inline_asm_terminator<'tcx>( } call_inline_asm(fx, &asm_name, asm_gen.stack_slot_size, inputs, outputs); - - match destination { - Some(destination) => { - let destination_block = fx.get_block(destination); - fx.bcx.ins().jump(destination_block, &[]); - } - None => { - fx.bcx.ins().trap(TrapCode::UnreachableCodeReached); - } - } } struct InlineAssemblyGenerator<'a, 'tcx> { @@ -754,14 +763,9 @@ pub(crate) fn codegen_xgetbv<'tcx>( ]; let options = InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM; - let mut inputs = Vec::new(); - let mut outputs = Vec::new(); - - let mut asm_gen = InlineAssemblyGenerator { - tcx: fx.tcx, - arch: fx.tcx.sess.asm_arch.unwrap(), - enclosing_def_id: fx.instance.def_id(), - template: &[InlineAsmTemplatePiece::String( + codegen_inline_asm_inner( + fx, + &[InlineAsmTemplatePiece::String( " xgetbv // out = rdx << 32 | rax shl rdx, 32 or rax, rdx " .to_string(), )], - operands: &operands, + &operands, options, - registers: Vec::new(), - stack_slots_clobber: Vec::new(), - stack_slots_input: Vec::new(), - stack_slots_output: Vec::new(), - stack_slot_size: Size::from_bytes(0), - }; - asm_gen.allocate_registers(); - asm_gen.allocate_stack_slots(); - - let inline_asm_index = fx.cx.inline_asm_index.get(); - fx.cx.inline_asm_index.set(inline_asm_index + 1); - let asm_name = format!( - "__inline_asm_{}_n{}", - fx.cx.cgu_name.as_str().replace('.', "__").replace('-', "_"), - inline_asm_index ); - - let generated_asm = asm_gen.generate_asm_wrapper(&asm_name); - fx.cx.global_asm.push_str(&generated_asm); - - for (i, operand) in operands.iter().enumerate() { - match operand { - CInlineAsmOperand::In { reg: _, value } => { - inputs.push((asm_gen.stack_slots_input[i].unwrap(), *value)); - } - CInlineAsmOperand::Out { reg: _, late: _, place } => { - if let Some(place) = place { - outputs.push((asm_gen.stack_slots_output[i].unwrap(), *place)); - } - } - _ => unreachable!(), - } - } - - call_inline_asm(fx, &asm_name, asm_gen.stack_slot_size, inputs, outputs); } From 2769ac03ba19e052a440cdf1222c22fe8e8020b6 Mon Sep 17 00:00:00 2001 From: bjorn3 <17426603+bjorn3@users.noreply.github.com> Date: Sat, 11 Nov 2023 19:59:42 +0000 Subject: [PATCH 07/14] Inline codegen_xgetbv into llvm_x86.rs --- src/inline_asm.rs | 45 ++------------------------------------ src/intrinsics/llvm_x86.rs | 33 +++++++++++++++++++++++++++- 2 files changed, 34 insertions(+), 44 deletions(-) diff --git a/src/inline_asm.rs b/src/inline_asm.rs index 90a63174fe3..759ee8844fa 100644 --- a/src/inline_asm.rs +++ b/src/inline_asm.rs @@ -10,7 +10,7 @@ use 
 
 use crate::prelude::*;
 
-enum CInlineAsmOperand<'tcx> {
+pub(crate) enum CInlineAsmOperand<'tcx> {
     In {
         reg: InlineAsmRegOrRegClass,
         value: Value,
@@ -146,7 +146,7 @@ pub(crate) fn codegen_inline_asm_terminator<'tcx>(
     }
 }
 
-fn codegen_inline_asm_inner<'tcx>(
+pub(crate) fn codegen_inline_asm_inner<'tcx>(
     fx: &mut FunctionCx<'_, '_, 'tcx>,
     template: &[InlineAsmTemplatePiece],
     operands: &[CInlineAsmOperand<'tcx>],
@@ -737,44 +737,3 @@ fn call_inline_asm<'tcx>(
         place.write_cvalue(fx, CValue::by_val(value, place.layout()));
     }
 }
-
-pub(crate) fn codegen_xgetbv<'tcx>(
-    fx: &mut FunctionCx<'_, '_, 'tcx>,
-    xcr_no: Value,
-    ret: CPlace<'tcx>,
-) {
-    // FIXME add .eh_frame unwind info directives
-
-    let operands = vec![
-        CInlineAsmOperand::In {
-            reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::cx)),
-            value: xcr_no,
-        },
-        CInlineAsmOperand::Out {
-            reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::ax)),
-            late: true,
-            place: Some(ret),
-        },
-        CInlineAsmOperand::Out {
-            reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::dx)),
-            late: true,
-            place: None,
-        },
-    ];
-    let options = InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM;
-
-    codegen_inline_asm_inner(
-        fx,
-        &[InlineAsmTemplatePiece::String(
-            "
-    xgetbv
-    // out = rdx << 32 | rax
-    shl rdx, 32
-    or rax, rdx
-    "
-            .to_string(),
-        )],
-        &operands,
-        options,
-    );
-}
diff --git a/src/intrinsics/llvm_x86.rs b/src/intrinsics/llvm_x86.rs
index 4c536048626..75e4850d290 100644
--- a/src/intrinsics/llvm_x86.rs
+++ b/src/intrinsics/llvm_x86.rs
@@ -1,7 +1,10 @@
 //! Emulate x86 LLVM intrinsics
 
+use rustc_ast::ast::{InlineAsmOptions, InlineAsmTemplatePiece};
 use rustc_middle::ty::GenericArgsRef;
+use rustc_target::asm::*;
 
+use crate::inline_asm::{codegen_inline_asm_inner, CInlineAsmOperand};
 use crate::intrinsics::*;
 use crate::prelude::*;
 
@@ -24,7 +27,35 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
 
             let xcr_no = xcr_no.load_scalar(fx);
 
-            crate::inline_asm::codegen_xgetbv(fx, xcr_no, ret);
+            codegen_inline_asm_inner(
+                fx,
+                &[InlineAsmTemplatePiece::String(
+                    "
+    xgetbv
+    // out = rdx << 32 | rax
+    shl rdx, 32
+    or rax, rdx
+    "
+                    .to_string(),
+                )],
+                &[
+                    CInlineAsmOperand::In {
+                        reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::cx)),
+                        value: xcr_no,
+                    },
+                    CInlineAsmOperand::Out {
+                        reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::ax)),
+                        late: true,
+                        place: Some(ret),
+                    },
+                    CInlineAsmOperand::Out {
+                        reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::dx)),
+                        late: true,
+                        place: None,
+                    },
+                ],
+                InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM,
+            );
         }
 
         "llvm.x86.sse3.ldu.dq" | "llvm.x86.avx.ldu.dq.256" => {
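For reference, the `xgetbv` sequence lowered above packs `rdx:rax` into a single 64-bit result, which is what the `core::arch` wrapper exposes. A small usage sketch (a hypothetical test, not part of this patch series; assumes an x86-64 CPU with XSAVE support):

    #[cfg(target_arch = "x86_64")]
    fn main() {
        // `_xgetbv(0)` reads XCR0; bit 1 = SSE state, bit 2 = AVX state.
        if std::is_x86_feature_detected!("xsave") {
            let xcr0 = unsafe { std::arch::x86_64::_xgetbv(0) };
            println!("XCR0 = {:#x}, AVX state enabled: {}", xcr0, xcr0 & 0b110 == 0b110);
        }
    }

    #[cfg(not(target_arch = "x86_64"))]
    fn main() {}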

From 6ef877c8c27dfaaab6b5f1c6f12758c74eeb84bb Mon Sep 17 00:00:00 2001
From: bjorn3 <17426603+bjorn3@users.noreply.github.com>
Date: Sat, 11 Nov 2023 20:58:05 +0000
Subject: [PATCH 08/14] Support narrowing casts in mir_operand_get_const_val

---
 src/constant.rs        | 43 ++++++++++++++++++++++++++++------------
 src/intrinsics/simd.rs | 14 +++++++-------
 2 files changed, 38 insertions(+), 19 deletions(-)

diff --git a/src/constant.rs b/src/constant.rs
index b0853d30e03..cf68a3857c5 100644
--- a/src/constant.rs
+++ b/src/constant.rs
@@ -1,10 +1,13 @@
 //! Handling of `static`s, `const`s and promoted allocations
 
+use std::cmp::Ordering;
+
 use cranelift_module::*;
 use rustc_data_structures::fx::{FxHashMap, FxHashSet};
 use rustc_middle::middle::codegen_fn_attrs::CodegenFnAttrFlags;
 use rustc_middle::mir::interpret::{read_target_uint, AllocId, GlobalAlloc, Scalar};
 use rustc_middle::mir::ConstValue;
+use rustc_middle::ty::ScalarInt;
 
 use crate::prelude::*;
 
@@ -430,9 +433,9 @@ fn define_all_allocs(tcx: TyCtxt<'_>, module: &mut dyn Module, cx: &mut Constant
 pub(crate) fn mir_operand_get_const_val<'tcx>(
     fx: &FunctionCx<'_, '_, 'tcx>,
     operand: &Operand<'tcx>,
-) -> Option<ConstValue<'tcx>> {
+) -> Option<ScalarInt> {
     match operand {
-        Operand::Constant(const_) => Some(eval_mir_constant(fx, const_).0),
+        Operand::Constant(const_) => eval_mir_constant(fx, const_).0.try_to_scalar_int(),
         // FIXME(rust-lang/rust#85105): Casts like `IMM8 as u32` result in the const being stored
        // inside a temporary before being passed to the intrinsic requiring the const argument.
         // This code tries to find a single constant defining definition of the referenced local.
@@ -440,7 +443,7 @@ pub(crate) fn mir_operand_get_const_val<'tcx>(
             if !place.projection.is_empty() {
                 return None;
             }
-            let mut computed_const_val = None;
+            let mut computed_scalar_int = None;
             for bb_data in fx.mir.basic_blocks.iter() {
                 for stmt in &bb_data.statements {
                     match &stmt.kind {
@@ -456,22 +459,38 @@ pub(crate) fn mir_operand_get_const_val<'tcx>(
                             operand,
                             ty,
                         ) => {
-                            if computed_const_val.is_some() {
+                            if computed_scalar_int.is_some() {
                                 return None; // local assigned twice
                             }
                             if !matches!(ty.kind(), ty::Uint(_) | ty::Int(_)) {
                                 return None;
                             }
-                            let const_val = mir_operand_get_const_val(fx, operand)?;
-                            if fx.layout_of(*ty).size
-                                != const_val.try_to_scalar_int()?.size()
+                            let scalar_int = mir_operand_get_const_val(fx, operand)?;
+                            let scalar_int = match fx
+                                .layout_of(*ty)
+                                .size
+                                .cmp(&scalar_int.size())
                             {
-                                return None;
-                            }
-                            computed_const_val = Some(const_val);
+                                Ordering::Equal => scalar_int,
+                                Ordering::Less => match ty.kind() {
+                                    ty::Uint(_) => ScalarInt::try_from_uint(
+                                        scalar_int.try_to_uint(scalar_int.size()).unwrap(),
+                                        fx.layout_of(*ty).size,
+                                    )
+                                    .unwrap(),
+                                    ty::Int(_) => ScalarInt::try_from_int(
+                                        scalar_int.try_to_int(scalar_int.size()).unwrap(),
+                                        fx.layout_of(*ty).size,
+                                    )
+                                    .unwrap(),
+                                    _ => unreachable!(),
+                                },
+                                Ordering::Greater => return None,
+                            };
+                            computed_scalar_int = Some(scalar_int);
                         }
                         Rvalue::Use(operand) => {
-                            computed_const_val = mir_operand_get_const_val(fx, operand)
+                            computed_scalar_int = mir_operand_get_const_val(fx, operand)
                         }
                         _ => return None,
                     }
@@ -522,7 +541,7 @@ pub(crate) fn mir_operand_get_const_val<'tcx>(
                     TerminatorKind::Call { .. } => {}
                 }
             }
-            computed_const_val
+            computed_scalar_int
         }
    }
}
diff --git a/src/intrinsics/simd.rs b/src/intrinsics/simd.rs
index ea137c4ca1e..0bd211fd614 100644
--- a/src/intrinsics/simd.rs
+++ b/src/intrinsics/simd.rs
@@ -282,11 +282,11 @@ pub(super) fn codegen_simd_intrinsic_call<'tcx>(
             fx.tcx.sess.span_fatal(span, "Index argument for `simd_insert` is not a constant");
         };
 
-        let idx = idx_const
-            .try_to_bits(Size::from_bytes(4 /* u32*/))
-            .unwrap_or_else(|| panic!("kind not scalar: {:?}", idx_const));
+        let idx: u32 = idx_const
+            .try_to_u32()
+            .unwrap_or_else(|_| panic!("kind not scalar: {:?}", idx_const));
         let (lane_count, _lane_ty) = base.layout().ty.simd_size_and_type(fx.tcx);
-        if idx >= lane_count.into() {
+        if u64::from(idx) >= lane_count {
             fx.tcx.sess.span_fatal(
                 fx.mir.span,
                 format!("[simd_insert] idx {} >= lane_count {}", idx, lane_count),
@@ -331,10 +331,10 @@ pub(super) fn codegen_simd_intrinsic_call<'tcx>(
         };
 
         let idx = idx_const
-            .try_to_bits(Size::from_bytes(4 /* u32*/))
-            .unwrap_or_else(|| panic!("kind not scalar: {:?}", idx_const));
+            .try_to_u32()
+            .unwrap_or_else(|_| panic!("kind not scalar: {:?}", idx_const));
         let (lane_count, _lane_ty) = v.layout().ty.simd_size_and_type(fx.tcx);
-        if idx >= lane_count.into() {
+        if u64::from(idx) >= lane_count {
             fx.tcx.sess.span_fatal(
                 fx.mir.span,
                 format!("[simd_extract] idx {} >= lane_count {}", idx, lane_count),
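The `Ordering::Less` arm above performs the same narrowing as an ordinary Rust integer cast. A plain-Rust sketch of that rule, using hypothetical helper names rather than rustc's `ScalarInt` API:

    // Truncate an unsigned constant to `size_bytes` bytes, like `0x123u16 as u8`.
    fn narrow_uint(value: u128, size_bytes: u32) -> u128 {
        let bits = size_bytes * 8;
        if bits == 128 { value } else { value & ((1u128 << bits) - 1) }
    }

    // Truncate and sign-extend a signed constant, like `255i16 as i8 == -1`.
    fn narrow_int(value: i128, size_bytes: u32) -> i128 {
        // Shift out the high bits, then arithmetic-shift back to sign-extend.
        let shift = 128 - size_bytes * 8;
        (value << shift) >> shift
    }

    fn main() {
        assert_eq!(narrow_uint(0x123, 1), 0x23);
        assert_eq!(narrow_int(255, 1), -1);
        assert_eq!(narrow_int(-1, 1), -1);
    }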

From dc6033477778bd16462d630a2812961b0856461d Mon Sep 17 00:00:00 2001
From: bjorn3 <17426603+bjorn3@users.noreply.github.com>
Date: Sat, 11 Nov 2023 20:59:15 +0000
Subject: [PATCH 09/14] Implement AES-NI intrinsics using inline asm

---
 src/abi/mod.rs             |   1 +
 src/inline_asm.rs          |  44 +++++++++--
 src/intrinsics/llvm.rs     |   2 +
 src/intrinsics/llvm_x86.rs | 156 +++++++++++++++++++++++++++++++++++++
 4 files changed, 198 insertions(+), 5 deletions(-)

diff --git a/src/abi/mod.rs b/src/abi/mod.rs
index c4572e03525..0ff1473da43 100644
--- a/src/abi/mod.rs
+++ b/src/abi/mod.rs
@@ -383,6 +383,7 @@ pub(crate) fn codegen_terminator_call<'tcx>(
                 args,
                 ret_place,
                 target,
+                source_info.span,
             );
             return;
         }
diff --git a/src/inline_asm.rs b/src/inline_asm.rs
index 759ee8844fa..25d14319f57 100644
--- a/src/inline_asm.rs
+++ b/src/inline_asm.rs
@@ -645,8 +645,21 @@ impl<'tcx> InlineAssemblyGenerator<'_, 'tcx> {
     ) {
         match arch {
             InlineAsmArch::X86_64 => {
-                write!(generated_asm, "    mov [rbx+0x{:x}], ", offset.bytes()).unwrap();
-                reg.emit(generated_asm, InlineAsmArch::X86_64, None).unwrap();
+                match reg {
+                    InlineAsmReg::X86(reg)
+                        if reg as u32 >= X86InlineAsmReg::xmm0 as u32
+                            && reg as u32 <= X86InlineAsmReg::xmm15 as u32 =>
+                    {
+                        // rustc emits x0 rather than xmm0
+                        write!(generated_asm, "    movups [rbx+0x{:x}], ", offset.bytes()).unwrap();
+                        write!(generated_asm, "xmm{}", reg as u32 - X86InlineAsmReg::xmm0 as u32)
+                            .unwrap();
+                    }
+                    _ => {
+                        write!(generated_asm, "    mov [rbx+0x{:x}], ", offset.bytes()).unwrap();
+                        reg.emit(generated_asm, InlineAsmArch::X86_64, None).unwrap();
+                    }
+                }
                 generated_asm.push('\n');
             }
             InlineAsmArch::AArch64 => {
@@ -671,8 +684,24 @@ impl<'tcx> InlineAssemblyGenerator<'_, 'tcx> {
     ) {
         match arch {
             InlineAsmArch::X86_64 => {
-                generated_asm.push_str("    mov ");
-                reg.emit(generated_asm, InlineAsmArch::X86_64, None).unwrap();
+                match reg {
+                    InlineAsmReg::X86(reg)
+                        if reg as u32 >= X86InlineAsmReg::xmm0 as u32
+                            && reg as u32 <= X86InlineAsmReg::xmm15 as u32 =>
+                    {
+                        // rustc emits x0 rather than xmm0
+                        write!(
+                            generated_asm,
+                            "    movups xmm{}",
+                            reg as u32 - X86InlineAsmReg::xmm0 as u32
+                        )
+                        .unwrap();
+                    }
+                    _ => {
generated_asm.push_str(" mov "); + reg.emit(generated_asm, InlineAsmArch::X86_64, None).unwrap() + } + } writeln!(generated_asm, ", [rbx+0x{:x}]", offset.bytes()).unwrap(); } InlineAsmArch::AArch64 => { @@ -728,7 +757,12 @@ fn call_inline_asm<'tcx>( fx.bcx.ins().call(inline_asm_func, &[stack_slot_addr]); for (offset, place) in outputs { - let ty = fx.clif_type(place.layout().ty).unwrap(); + let ty = if place.layout().ty.is_simd() { + let (lane_count, lane_type) = place.layout().ty.simd_size_and_type(fx.tcx); + fx.clif_type(lane_type).unwrap().by(lane_count.try_into().unwrap()).unwrap() + } else { + fx.clif_type(place.layout().ty).unwrap() + }; let value = stack_slot.offset(fx, i32::try_from(offset.bytes()).unwrap().into()).load( fx, ty, diff --git a/src/intrinsics/llvm.rs b/src/intrinsics/llvm.rs index e9b7daf1492..659e6c133ef 100644 --- a/src/intrinsics/llvm.rs +++ b/src/intrinsics/llvm.rs @@ -12,6 +12,7 @@ pub(crate) fn codegen_llvm_intrinsic_call<'tcx>( args: &[mir::Operand<'tcx>], ret: CPlace<'tcx>, target: Option, + span: Span, ) { if intrinsic.starts_with("llvm.aarch64") { return llvm_aarch64::codegen_aarch64_llvm_intrinsic_call( @@ -31,6 +32,7 @@ pub(crate) fn codegen_llvm_intrinsic_call<'tcx>( args, ret, target, + span, ); } diff --git a/src/intrinsics/llvm_x86.rs b/src/intrinsics/llvm_x86.rs index 75e4850d290..85a2db43fde 100644 --- a/src/intrinsics/llvm_x86.rs +++ b/src/intrinsics/llvm_x86.rs @@ -15,6 +15,7 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>( args: &[mir::Operand<'tcx>], ret: CPlace<'tcx>, target: Option, + span: Span, ) { match intrinsic { "llvm.x86.sse2.pause" | "llvm.aarch64.isb" => { @@ -718,6 +719,7 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>( } "llvm.x86.pclmulqdq" => { + // FIXME use inline asm // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clmulepi64_si128&ig_expand=772 intrinsic_args!(fx, args => (a, b, imm8); intrinsic); @@ -779,6 +781,160 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>( ret.place_lane(fx, 1).to_ptr().store(fx, res2, MemFlags::trusted()); } + "llvm.x86.aesni.aeskeygenassist" => { + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128&ig_expand=261 + intrinsic_args!(fx, args => (a, _imm8); intrinsic); + + let a = a.load_scalar(fx); + + let imm8 = if let Some(imm8) = crate::constant::mir_operand_get_const_val(fx, &args[1]) + { + imm8 + } else { + fx.tcx.sess.span_fatal( + span, + "Index argument for `_mm_aeskeygenassist_si128` is not a constant", + ); + }; + + let imm8 = imm8.try_to_u8().unwrap_or_else(|_| panic!("kind not scalar: {:?}", imm8)); + + codegen_inline_asm_inner( + fx, + &[InlineAsmTemplatePiece::String(format!("aeskeygenassist xmm0, xmm0, {imm8}"))], + &[CInlineAsmOperand::InOut { + reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm0)), + _late: true, + in_value: a, + out_place: Some(ret), + }], + InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM, + ); + } + + "llvm.x86.aesni.aesimc" => { + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesimc_si128&ig_expand=260 + intrinsic_args!(fx, args => (a); intrinsic); + + let a = a.load_scalar(fx); + + codegen_inline_asm_inner( + fx, + &[InlineAsmTemplatePiece::String("aesimc xmm0, xmm0".to_string())], + &[CInlineAsmOperand::InOut { + reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm0)), + _late: true, + in_value: a, + out_place: Some(ret), + }], + 
+                InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM,
+            );
+        }
+
+        "llvm.x86.aesni.aesenc" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenc_si128&ig_expand=252
+            intrinsic_args!(fx, args => (a, round_key); intrinsic);
+
+            let a = a.load_scalar(fx);
+            let round_key = round_key.load_scalar(fx);
+
+            codegen_inline_asm_inner(
+                fx,
+                &[InlineAsmTemplatePiece::String("aesenc xmm0, xmm1".to_string())],
+                &[
+                    CInlineAsmOperand::InOut {
+                        reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm0)),
+                        _late: true,
+                        in_value: a,
+                        out_place: Some(ret),
+                    },
+                    CInlineAsmOperand::In {
+                        reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm1)),
+                        value: round_key,
+                    },
+                ],
+                InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM,
+            );
+        }
+
+        "llvm.x86.aesni.aesenclast" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenclast_si128&ig_expand=257
+            intrinsic_args!(fx, args => (a, round_key); intrinsic);
+
+            let a = a.load_scalar(fx);
+            let round_key = round_key.load_scalar(fx);
+
+            codegen_inline_asm_inner(
+                fx,
+                &[InlineAsmTemplatePiece::String("aesenclast xmm0, xmm1".to_string())],
+                &[
+                    CInlineAsmOperand::InOut {
+                        reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm0)),
+                        _late: true,
+                        in_value: a,
+                        out_place: Some(ret),
+                    },
+                    CInlineAsmOperand::In {
+                        reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm1)),
+                        value: round_key,
+                    },
+                ],
+                InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM,
+            );
+        }
+
+        "llvm.x86.aesni.aesdec" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdec_si128&ig_expand=242
+            intrinsic_args!(fx, args => (a, round_key); intrinsic);
+
+            let a = a.load_scalar(fx);
+            let round_key = round_key.load_scalar(fx);
+
+            codegen_inline_asm_inner(
+                fx,
+                &[InlineAsmTemplatePiece::String("aesdec xmm0, xmm1".to_string())],
+                &[
+                    CInlineAsmOperand::InOut {
+                        reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm0)),
+                        _late: true,
+                        in_value: a,
+                        out_place: Some(ret),
+                    },
+                    CInlineAsmOperand::In {
+                        reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm1)),
+                        value: round_key,
+                    },
+                ],
+                InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM,
+            );
+        }
+
+        "llvm.x86.aesni.aesdeclast" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdeclast_si128&ig_expand=247
+            intrinsic_args!(fx, args => (a, round_key); intrinsic);
+
+            let a = a.load_scalar(fx);
+            let round_key = round_key.load_scalar(fx);
+
+            codegen_inline_asm_inner(
+                fx,
+                &[InlineAsmTemplatePiece::String("aesdeclast xmm0, xmm1".to_string())],
+                &[
+                    CInlineAsmOperand::InOut {
+                        reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm0)),
+                        _late: true,
+                        in_value: a,
+                        out_place: Some(ret),
+                    },
+                    CInlineAsmOperand::In {
+                        reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm1)),
+                        value: round_key,
+                    },
+                ],
+                InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM,
+            );
+        }
+
         "llvm.x86.avx.ptestz.256" => {
             // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_si256&ig_expand=6945
             intrinsic_args!(fx, args => (a, b); intrinsic);
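With these intrinsics lowered to inline asm, `core::arch` AES code runs under cg_clif. A minimal runtime check in the style of example/neon.rs (a hypothetical test, not part of this series; requires a CPU with the `aes` feature):

    #[cfg(target_arch = "x86_64")]
    unsafe fn test_aesenc() {
        use std::arch::x86_64::*;
        // One AES round on an all-zero state with an all-zero round key:
        // SubBytes maps 0x00 to 0x63, and ShiftRows/MixColumns/AddRoundKey
        // keep a uniform state uniform, so every output byte is 0x63.
        let out = _mm_aesenc_si128(_mm_setzero_si128(), _mm_setzero_si128());
        let cmp = _mm_cmpeq_epi8(out, _mm_set1_epi8(0x63));
        assert_eq!(_mm_movemask_epi8(cmp), 0xFFFF);
    }

    fn main() {
        #[cfg(target_arch = "x86_64")]
        {
            if std::is_x86_feature_detected!("aes") {
                unsafe { test_aesenc() };
            }
        }
    }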

From 813f8b4fd06fedf4952bd446b437d37137306103 Mon Sep 17 00:00:00 2001
From: bjorn3 <17426603+bjorn3@users.noreply.github.com>
Date: Sat, 11 Nov 2023 21:04:39 +0000
Subject: [PATCH 10/14] Use inline asm for _mm_clmulepi64_si128

This is a lot more compact and significantly faster
---
 src/intrinsics/llvm_x86.rs | 85 ++++++++++++++------------------------
 1 file changed, 30 insertions(+), 55 deletions(-)

diff --git a/src/intrinsics/llvm_x86.rs b/src/intrinsics/llvm_x86.rs
index 85a2db43fde..ef0bb97bd7d 100644
--- a/src/intrinsics/llvm_x86.rs
+++ b/src/intrinsics/llvm_x86.rs
@@ -719,66 +719,41 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
         }
 
         "llvm.x86.pclmulqdq" => {
-            // FIXME use inline asm
             // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clmulepi64_si128&ig_expand=772
-            intrinsic_args!(fx, args => (a, b, imm8); intrinsic);
+            intrinsic_args!(fx, args => (a, b, _imm8); intrinsic);
 
-            assert_eq!(a.layout(), b.layout());
-            let layout = a.layout();
+            let a = a.load_scalar(fx);
+            let b = b.load_scalar(fx);
 
-            let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
-            let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
-            assert_eq!(lane_ty, fx.tcx.types.i64);
-            assert_eq!(ret_lane_ty, fx.tcx.types.i64);
-            assert_eq!(lane_count, 2);
-            assert_eq!(ret_lane_count, 2);
+            let imm8 = if let Some(imm8) = crate::constant::mir_operand_get_const_val(fx, &args[2])
+            {
+                imm8
+            } else {
+                fx.tcx.sess.span_fatal(
+                    span,
+                    "Index argument for `_mm_clmulepi64_si128` is not a constant",
+                );
+            };
 
-            let imm8 = imm8.load_scalar(fx);
+            let imm8 = imm8.try_to_u8().unwrap_or_else(|_| panic!("kind not scalar: {:?}", imm8));
 
-            let control0 = fx.bcx.ins().band_imm(imm8, 0b0000_0001);
-            let a_lane0 = a.value_lane(fx, 0).load_scalar(fx);
-            let a_lane1 = a.value_lane(fx, 1).load_scalar(fx);
-            let temp1 = fx.bcx.ins().select(control0, a_lane1, a_lane0);
-
-            let control4 = fx.bcx.ins().band_imm(imm8, 0b0001_0000);
-            let b_lane0 = b.value_lane(fx, 0).load_scalar(fx);
-            let b_lane1 = b.value_lane(fx, 1).load_scalar(fx);
-            let temp2 = fx.bcx.ins().select(control4, b_lane1, b_lane0);
-
-            fn extract_bit(fx: &mut FunctionCx<'_, '_, '_>, val: Value, bit: i64) -> Value {
-                let tmp = fx.bcx.ins().ushr_imm(val, bit);
-                fx.bcx.ins().band_imm(tmp, 1)
-            }
-
-            let mut res1 = fx.bcx.ins().iconst(types::I64, 0);
-            for i in 0..=63 {
-                let x = extract_bit(fx, temp1, 0);
-                let y = extract_bit(fx, temp2, i);
-                let mut temp = fx.bcx.ins().band(x, y);
-                for j in 1..=i {
-                    let x = extract_bit(fx, temp1, j);
-                    let y = extract_bit(fx, temp2, i - j);
-                    let z = fx.bcx.ins().band(x, y);
-                    temp = fx.bcx.ins().bxor(temp, z);
-                }
-                let temp = fx.bcx.ins().ishl_imm(temp, i);
-                res1 = fx.bcx.ins().bor(res1, temp);
-            }
-            ret.place_lane(fx, 0).to_ptr().store(fx, res1, MemFlags::trusted());
-
-            let mut res2 = fx.bcx.ins().iconst(types::I64, 0);
-            for i in 64..=127 {
-                let mut temp = fx.bcx.ins().iconst(types::I64, 0);
-                for j in i - 63..=63 {
-                    let x = extract_bit(fx, temp1, j);
-                    let y = extract_bit(fx, temp2, i - j);
-                    let z = fx.bcx.ins().band(x, y);
-                    temp = fx.bcx.ins().bxor(temp, z);
-                }
-                let temp = fx.bcx.ins().ishl_imm(temp, i);
-                res2 = fx.bcx.ins().bor(res2, temp);
-            }
-            ret.place_lane(fx, 1).to_ptr().store(fx, res2, MemFlags::trusted());
+            codegen_inline_asm_inner(
+                fx,
+                &[InlineAsmTemplatePiece::String(format!("pclmulqdq xmm0, xmm1, {imm8}"))],
+                &[
+                    CInlineAsmOperand::InOut {
+                        reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm0)),
+                        _late: true,
+                        in_value: a,
+                        out_place: Some(ret),
+                    },
+                    CInlineAsmOperand::In {
+                        reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm1)),
+                        value: b,
+                    },
+                ],
+                InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM,
+            );
         }
 
         "llvm.x86.aesni.aeskeygenassist" => {
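The bit loops deleted above emulated carry-less multiplication; `pclmulqdq` now provides it directly. The semantics boil down to this sketch (plain Rust, for illustration only):

    // Carry-less (polynomial) multiply over GF(2): XOR replaces addition,
    // so no carries propagate between bit positions.
    fn clmul(a: u64, b: u64) -> u128 {
        let mut acc = 0u128;
        for i in 0..64 {
            if (b >> i) & 1 == 1 {
                acc ^= (a as u128) << i;
            }
        }
        acc
    }

    fn main() {
        assert_eq!(clmul(0b10, 0b10), 0b100); // x * x = x^2, no carry
        assert_eq!(clmul(0b11, 0b11), 0b101); // (x + 1)^2 = x^2 + 1 over GF(2)
    }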

From ca85cc3c7b7edbd11752828edbaa07a524176db4 Mon Sep 17 00:00:00 2001
From: bjorn3 <17426603+bjorn3@users.noreply.github.com>
Date: Sun, 12 Nov 2023 17:17:57 +0000
Subject: [PATCH 11/14] Implement SHA256 intrinsics using inline asm

---
 src/intrinsics/llvm_x86.rs | 84 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 84 insertions(+)

diff --git a/src/intrinsics/llvm_x86.rs b/src/intrinsics/llvm_x86.rs
index ef0bb97bd7d..8dd2b6ed014 100644
--- a/src/intrinsics/llvm_x86.rs
+++ b/src/intrinsics/llvm_x86.rs
@@ -910,6 +910,90 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
             );
         }
 
+        "llvm.x86.sha256rnds2" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sha256rnds2_epu32&ig_expand=5977
+            intrinsic_args!(fx, args => (a, b, k); intrinsic);
+
+            let a = a.load_scalar(fx);
+            let b = b.load_scalar(fx);
+            let k = k.load_scalar(fx);
+
+            codegen_inline_asm_inner(
+                fx,
+                &[InlineAsmTemplatePiece::String("sha256rnds2 xmm1, xmm2".to_string())],
+                &[
+                    CInlineAsmOperand::InOut {
+                        reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm1)),
+                        _late: true,
+                        in_value: a,
+                        out_place: Some(ret),
+                    },
+                    CInlineAsmOperand::In {
+                        reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm2)),
+                        value: b,
+                    },
+                    // Implicit argument to the sha256rnds2 instruction
+                    CInlineAsmOperand::In {
+                        reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm0)),
+                        value: k,
+                    },
+                ],
+                InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM,
+            );
+        }
+
+        "llvm.x86.sha256msg1" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sha256msg1_epu32&ig_expand=5975
+            intrinsic_args!(fx, args => (a, b); intrinsic);
+
+            let a = a.load_scalar(fx);
+            let b = b.load_scalar(fx);
+
+            codegen_inline_asm_inner(
+                fx,
+                &[InlineAsmTemplatePiece::String("sha256msg1 xmm1, xmm2".to_string())],
+                &[
+                    CInlineAsmOperand::InOut {
+                        reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm1)),
+                        _late: true,
+                        in_value: a,
+                        out_place: Some(ret),
+                    },
+                    CInlineAsmOperand::In {
+                        reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm2)),
+                        value: b,
+                    },
+                ],
+                InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM,
+            );
+        }
+
+        "llvm.x86.sha256msg2" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sha256msg2_epu32&ig_expand=5976
+            intrinsic_args!(fx, args => (a, b); intrinsic);
+
+            let a = a.load_scalar(fx);
+            let b = b.load_scalar(fx);
+
+            codegen_inline_asm_inner(
+                fx,
+                &[InlineAsmTemplatePiece::String("sha256msg2 xmm1, xmm2".to_string())],
+                &[
+                    CInlineAsmOperand::InOut {
+                        reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm1)),
+                        _late: true,
+                        in_value: a,
+                        out_place: Some(ret),
+                    },
+                    CInlineAsmOperand::In {
+                        reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm2)),
+                        value: b,
+                    },
+                ],
+                InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM,
+            );
+        }
+
         "llvm.x86.avx.ptestz.256" => {
             // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_si256&ig_expand=6945
             intrinsic_args!(fx, args => (a, b); intrinsic);
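As with the AES block, a small smoke test exercises the new lowering (hypothetical, not part of this series; requires the `sha` CPU feature). `sha256msg1` computes W(i) + sigma0(W(i+1)), and sigma0 of zero is zero, so an all-zero schedule must stay zero:

    #[cfg(target_arch = "x86_64")]
    unsafe fn test_sha256msg1() {
        use std::arch::x86_64::*;
        let zero = _mm_setzero_si128();
        // Both the message words and sigma0 of the shifted words are zero here.
        let out = _mm_sha256msg1_epu32(zero, zero);
        let cmp = _mm_cmpeq_epi32(out, zero);
        assert_eq!(_mm_movemask_epi8(cmp), 0xFFFF);
    }

    fn main() {
        #[cfg(target_arch = "x86_64")]
        {
            if std::is_x86_feature_detected!("sha") {
                unsafe { test_sha256msg1() };
            }
        }
    }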

From cc59a427c92452c54199c917990f5fd4b60633e2 Mon Sep 17 00:00:00 2001
From: bjorn3 <17426603+bjorn3@users.noreply.github.com>
Date: Sun, 12 Nov 2023 17:35:58 +0000
Subject: [PATCH 12/14] Use git clone --filter=tree:0 to speed up rust clones
 in CI

---
 scripts/setup_rust_fork.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/setup_rust_fork.sh b/scripts/setup_rust_fork.sh
index 197f2d28fdd..731828caae2 100644
--- a/scripts/setup_rust_fork.sh
+++ b/scripts/setup_rust_fork.sh
@@ -11,7 +11,7 @@ export CG_CLIF_FORCE_GNU_AS=1
 CG_CLIF_STDLIB_REMAP_PATH_PREFIX=/rustc/FAKE_PREFIX ./y.sh build
 
 echo "[SETUP] Rust fork"
-git clone https://github.com/rust-lang/rust.git || true
+git clone https://github.com/rust-lang/rust.git --filter=tree:0 || true
 pushd rust
 git fetch
 git checkout -- .

From d9122c7565ff7757d126beb8e24dca07dc730640 Mon Sep 17 00:00:00 2001
From: bjorn3 <17426603+bjorn3@users.noreply.github.com>
Date: Thu, 16 Nov 2023 20:55:49 +0000
Subject: [PATCH 13/14] Update reference to bjorn3/rustc_codegen_cranelift

---
 scripts/rustup.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/rustup.sh b/scripts/rustup.sh
index e62788f2e50..355282911c2 100755
--- a/scripts/rustup.sh
+++ b/scripts/rustup.sh
@@ -46,7 +46,7 @@ case $1 in
         git pull origin master
         branch=sync_cg_clif-$(date +%Y-%m-%d)
         git checkout -b "$branch"
-        "$cg_clif/git-fixed-subtree.sh" pull --prefix=compiler/rustc_codegen_cranelift/ https://github.com/bjorn3/rustc_codegen_cranelift.git master
+        "$cg_clif/git-fixed-subtree.sh" pull --prefix=compiler/rustc_codegen_cranelift/ https://github.com/rust-lang/rustc_codegen_cranelift.git master
         git push -u my "$branch"
 
         # immediately merge the merge commit into cg_clif to prevent merge conflicts when syncing

From def04540a4e2541b995195c752c751295606a388 Mon Sep 17 00:00:00 2001
From: bjorn3 <17426603+bjorn3@users.noreply.github.com>
Date: Thu, 16 Nov 2023 21:06:08 +0000
Subject: [PATCH 14/14] Rustup to rustc 1.76.0-nightly (6b771f6b5 2023-11-15)

---
 rust-toolchain | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rust-toolchain b/rust-toolchain
index b832b06e0ff..80ef1e49f23 100644
--- a/rust-toolchain
+++ b/rust-toolchain
@@ -1,3 +1,3 @@
 [toolchain]
-channel = "nightly-2023-11-10"
+channel = "nightly-2023-11-16"
 components = ["rust-src", "rustc-dev", "llvm-tools"]