Auto merge of #3640 - folkertdev:add-pclmulqdq, r=RalfJung

add support for `pclmulqdq` intrinsic

This instruction is required in fast implementations of the crc32 checksum algorithm, and used in the https://crates.io/crates/crc32fast and https://crates.io/crates/zlib-rs crates.

Some questions from my side

- is my method for decomposing a `__m128i` into two separate `i64` values all right?
This commit is contained in:
bors 2024-06-08 16:34:07 +00:00
commit 4d5fd1129a
2 changed files with 117 additions and 0 deletions

View File

@ -105,6 +105,13 @@ pub(super) trait EvalContextExt<'tcx>: crate::MiriInterpCxExt<'tcx> {
}
}
"pclmulqdq" => {
// Carry-less multiplication intrinsic: takes exactly three arguments
// (two vector operands and an immediate) and delegates to the `pclmulqdq` helper,
// which writes the product to `dest`.
let [left, right, imm] =
this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
pclmulqdq(this, left, right, imm, dest)?;
}
name if name.starts_with("sse.") => {
return sse::EvalContextExt::emulate_x86_sse_intrinsic(
this, link_name, abi, args, dest,
@ -1133,6 +1140,68 @@ fn pmulhrsw<'tcx>(
Ok(())
}
/// Emulates `pclmulqdq`: the carry-less multiplication of two 64-bit integers, yielding a
/// 128-bit product that is written to `dest`.
///
/// `left` and `right` are each viewed as a vector of two 64-bit elements. Bit 0 of `imm8`
/// picks the element of `left` and bit 4 picks the element of `right` (bit clear selects
/// index 0, bit set selects index 1). All other bits of `imm8` are ignored.
///
/// <https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clmulepi64_si128>
fn pclmulqdq<'tcx>(
    this: &mut MiriInterpCx<'tcx>,
    left: &OpTy<'tcx>,
    right: &OpTy<'tcx>,
    imm8: &OpTy<'tcx>,
    dest: &MPlaceTy<'tcx>,
) -> InterpResult<'tcx, ()> {
    assert_eq!(left.layout, right.layout);
    assert_eq!(left.layout.size, dest.layout.size);

    // Reinterpret both operands and the destination as `[u64; 2]` so that each 64-bit
    // element can be addressed by index.
    let lanes_layout = this.layout_of(Ty::new_array(this.tcx.tcx, this.tcx.types.u64, 2))?;
    let left = left.transmute(lanes_layout, this)?;
    let right = right.transmute(lanes_layout, this)?;
    let dest = dest.transmute(lanes_layout, this)?;

    let selector = this.read_scalar(imm8)?.to_u8()?;

    // Bit 0 of the immediate chooses which element of `left` is multiplied.
    let left_lane = u64::from(selector & 0x01 != 0);
    let multiplicand = this.read_scalar(&this.project_index(&left, left_lane)?)?.to_u64()?;

    // Bit 4 of the immediate chooses which element of `right` is multiplied.
    let right_lane = u64::from(selector & 0x10 != 0);
    let multiplier = this.read_scalar(&this.project_index(&right, right_lane)?)?.to_u64()?;

    // Carry-less multiplication is schoolbook long multiplication in which the partial
    // products are combined with xor instead of addition, so no carries ever propagate:
    // for every set bit `bit` of the multiplier, xor in the multiplicand shifted left by `bit`.
    //
    // Worked example: https://en.wikipedia.org/wiki/Carry-less_product#Example
    let product: u128 = (0..64u32)
        .filter(|&bit| (multiplier >> bit) & 1 != 0)
        .fold(0, |acc, bit| acc ^ (u128::from(multiplicand) << bit));

    // Split the 128-bit product into its two 64-bit halves; index 0 receives the low half
    // and index 1 the high half.
    let halves = [(product & 0xFFFF_FFFF_FFFF_FFFF) as u64, (product >> 64) as u64];
    for (lane, half) in (0u64..).zip(halves) {
        let place = this.project_index(&dest, lane)?;
        this.write_scalar(Scalar::from_u64(half), &place)?;
    }

    Ok(())
}
/// Packs two N-bit integer vectors into a single N/2-bit integer vector.
///
/// The conversion from N-bit to N/2-bit should be provided by `f`.

View File

@ -0,0 +1,48 @@
// Ignore everything except x86 and x86_64
// Any new targets that are added to CI should be ignored here.
// (We cannot use `cfg`-based tricks here since the `target-feature` flags below only work on x86.)
//@ignore-target-aarch64
//@ignore-target-arm
//@ignore-target-avr
//@ignore-target-s390x
//@ignore-target-thumbv7em
//@ignore-target-wasm32
//@compile-flags: -C target-feature=+pclmulqdq
#[cfg(target_arch = "x86")]
use std::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;
/// Checks `_mm_clmulepi64_si128` (via the `clmulepi64_si128` wrapper) against known products
/// for all four combinations of the two element-selection bits of the immediate.
fn main() {
    assert!(is_x86_feature_detected!("pclmulqdq"));

    // Each operand is given as (element 0, element 1) of the 128-bit vector.
    let lhs: (u64, u64) = (0x7fffffffffffffff, 0x4317e40ab4ddcf05);
    let rhs: (u64, u64) = (0xdd358416f52ecd34, 0x633d11cc638ca16b);

    unsafe {
        // Immediate 0x00: element 0 of `lhs` times element 0 of `rhs`.
        let product = clmulepi64_si128::<0x00>(lhs, rhs);
        assert_eq!(product, (13036940098130298092, 2704901987789626761));

        // Immediate 0x01: element 1 of `lhs` times element 0 of `rhs`.
        let product = clmulepi64_si128::<0x01>(lhs, rhs);
        assert_eq!(product, (6707488474444649956, 3901733953304450635));

        // Immediate 0x10: element 0 of `lhs` times element 1 of `rhs`.
        let product = clmulepi64_si128::<0x10>(lhs, rhs);
        assert_eq!(product, (11607166829323378905, 1191897396234301548));

        // Immediate 0x11: element 1 of `lhs` times element 1 of `rhs`.
        let product = clmulepi64_si128::<0x11>(lhs, rhs);
        assert_eq!(product, (7731954893213347271, 1760130762532070957));
    }
}
/// Thin wrapper around `_mm_clmulepi64_si128` that works on plain `u64` pairs.
///
/// Each operand is passed as `(element 0, element 1)` of the 128-bit vector, and the
/// 128-bit product is returned in the same `(element 0, element 1)` form.
///
/// # Safety
///
/// The `pclmulqdq` target feature must be available on the executing CPU.
#[target_feature(enable = "pclmulqdq")]
unsafe fn clmulepi64_si128<const IMM8: i32>(
    (a_lo, a_hi): (u64, u64),
    (b_lo, b_hi): (u64, u64),
) -> (u64, u64) {
    // SAFETY: `[u64; 2]` and `__m128i` have the same size and every bit pattern is valid for
    // both, so the transmutes are sound. The intrinsic itself only needs the `pclmulqdq`
    // feature, which this function enables via `#[target_feature]`.
    unsafe {
        let lhs: __m128i = core::mem::transmute([a_lo, a_hi]);
        let rhs: __m128i = core::mem::transmute([b_lo, b_hi]);
        let product = _mm_clmulepi64_si128::<IMM8>(lhs, rhs);
        let [lo, hi]: [u64; 2] = core::mem::transmute(product);
        (lo, hi)
    }
}