Auto merge of #111850 - the8472:external-step-by, r=scottmcm

Specialize `StepBy<Range<{integer}>>`

OLD

    iter::bench_range_step_by_fold_u16      700.00ns/iter +/- 10.00ns
    iter::bench_range_step_by_fold_usize    519.00ns/iter  +/- 6.00ns
    iter::bench_range_step_by_loop_u32      555.00ns/iter  +/- 7.00ns
    iter::bench_range_step_by_sum_reducible  37.00ns/iter  +/- 0.00ns

NEW

    iter::bench_range_step_by_fold_u16       49.00ns/iter +/- 0.00ns
    iter::bench_range_step_by_fold_usize    194.00ns/iter +/- 1.00ns
    iter::bench_range_step_by_loop_u32       98.00ns/iter +/- 0.00ns
    iter::bench_range_step_by_sum_reducible   1.00ns/iter +/- 0.00ns

NEW + `-Ctarget-cpu=x86-64-v3`

    iter::bench_range_step_by_fold_u16      22.00ns/iter +/- 0.00ns
    iter::bench_range_step_by_fold_usize    80.00ns/iter +/- 1.00ns
    iter::bench_range_step_by_loop_u32      41.00ns/iter +/- 0.00ns
    iter::bench_range_step_by_sum_reducible  1.00ns/iter +/- 0.00ns

I have only optimized for the wall time of those methods; I haven't tested whether it also eliminates bounds checks when indexing into slices via things like `(0..slice.len()).step_by(16)`.
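As an illustrative sketch (function name mine, not from the PR), these are the shapes of code the specialization targets — stepped integer ranges driven either by an external `for` loop or by internal iteration such as `fold`/`sum`, as in the benchmarks above:

    // A stepped range consumed by external iteration; internal
    // iteration like (0u32..1024).step_by(8).sum::<u32>() is the
    // other shape covered by the benchmarks.
    fn sum_multiples_of_8() -> u32 {
        let mut sum = 0u32;
        for i in (0u32..1024).step_by(8) {
            sum += i;
        }
        sum
    }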
bors 2023-06-26 00:28:30 +00:00
commit ae8ffa663c
3 changed files with 484 additions and 38 deletions

@@ -2,6 +2,7 @@ use core::borrow::Borrow;
use core::iter::*;
use core::mem;
use core::num::Wrapping;
use core::ops::Range;
use test::{black_box, Bencher};
#[bench]
@@ -69,6 +70,57 @@ fn bench_max(b: &mut Bencher) {
})
}
#[bench]
fn bench_range_step_by_sum_reducible(b: &mut Bencher) {
let r = 0u32..1024;
b.iter(|| {
let r = black_box(r.clone()).step_by(8);
let mut sum: u32 = 0;
for i in r {
sum += i;
}
sum
})
}
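The "reducible" in the name refers to this sum being strength-reducible to a closed form once the step and trip count are transparent to the optimizer, which is consistent with the ~1ns result above. A rough sketch of that reduction (my arithmetic, not part of the change):

    // Sum of 0, 8, 16, ..., 1016 — the 128 values yielded by
    // (0u32..1024).step_by(8) — written as a closed-form expression.
    fn closed_form() -> u32 {
        let n = 128u32; // yield count: 1024 divided by 8, rounded up
        8 * (n * (n - 1) / 2) // sum of 8*k for k in 0..n == 65024
    }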
#[bench]
fn bench_range_step_by_loop_u32(b: &mut Bencher) {
let r = 0..(u16::MAX as u32);
b.iter(|| {
let r = black_box(r.clone()).step_by(64);
let mut sum: u32 = 0;
for i in r {
let i = i ^ i.wrapping_sub(1);
sum = sum.wrapping_add(i);
}
sum
})
}
#[bench]
fn bench_range_step_by_fold_usize(b: &mut Bencher) {
let r: Range<usize> = 0..(u16::MAX as usize);
b.iter(|| {
let r = black_box(r.clone());
r.step_by(64)
.map(|x: usize| x ^ (x.wrapping_sub(1)))
.fold(0usize, |acc, i| acc.wrapping_add(i))
})
}
#[bench]
fn bench_range_step_by_fold_u16(b: &mut Bencher) {
let r: Range<u16> = 0..u16::MAX;
b.iter(|| {
let r = black_box(r.clone());
r.step_by(64).map(|x: u16| x ^ (x.wrapping_sub(1))).fold(0u16, |acc, i| acc.wrapping_add(i))
})
}
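Presumably the `x ^ x.wrapping_sub(1)` mapping in the loop/fold benches exists to defeat exactly that kind of closed-form reduction: it is data-dependent bit manipulation with no obvious algebraic shortcut. A quick check of what it computes:

    // x ^ (x - 1) sets the lowest set bit of x and every bit below it.
    fn xor_scramble_examples() {
        assert_eq!(8u32 ^ 8u32.wrapping_sub(1), 15); // 0b1000 ^ 0b0111
        assert_eq!(12u32 ^ 12u32.wrapping_sub(1), 7); // 0b1100 ^ 0b1011
        assert_eq!(0u32 ^ 0u32.wrapping_sub(1), u32::MAX); // wraps at zero
    }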
pub fn copy_zip(xs: &[u8], ys: &mut [u8]) {
for (a, b) in ys.iter_mut().zip(xs) {
*a = *b;

@@ -1,4 +1,9 @@
-use crate::{intrinsics, iter::from_fn, ops::Try};
+use crate::convert::TryFrom;
+use crate::{
+intrinsics,
+iter::{from_fn, TrustedLen},
+ops::{Range, Try},
+};
/// An iterator for stepping iterators by a custom amount.
///
@@ -11,14 +16,22 @@ use crate::{intrinsics, iter::from_fn, ops::Try};
#[stable(feature = "iterator_step_by", since = "1.28.0")]
#[derive(Clone, Debug)]
pub struct StepBy<I> {
/// This field is guaranteed to have been preprocessed by the specialized `SpecRangeSetup::setup`
/// in the constructor.
/// For most iterators that processing is a no-op, but for `Range<{integer}>` types it is lossy,
/// which means the inner iterator cannot be returned to user code.
/// Additionally, this type-dependent preprocessing means the specialized implementations
/// cannot be used interchangeably.
iter: I,
step: usize,
first_take: bool,
}
impl<I> StepBy<I> {
#[inline]
pub(in crate::iter) fn new(iter: I, step: usize) -> StepBy<I> {
assert!(step != 0);
let iter = <I as SpecRangeSetup<I>>::setup(iter, step);
StepBy { iter, step: step - 1, first_take: true }
}
}
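Note that the constructor stores `step - 1`, not `step`: `Iterator::nth(k)` skips `k` elements and returns the next one, so after the first element every later element is `nth(step - 1)` of the inner iterator. A worked example of the resulting semantics (numbers mine):

    // (0..10).step_by(3) stores step - 1 == 2 internally.
    fn step_minus_one_convention() {
        let mut it = (0..10).step_by(3);
        assert_eq!(it.next(), Some(0)); // first_take: plain next()
        assert_eq!(it.next(), Some(3)); // nth(2): skips 1 and 2
        assert_eq!(it.next(), Some(6));
        assert_eq!(it.next(), Some(9));
        assert_eq!(it.next(), None);
    }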
@@ -32,16 +45,174 @@ where
#[inline]
fn next(&mut self) -> Option<Self::Item> {
-if self.first_take {
-self.first_take = false;
-self.iter.next()
-} else {
-self.iter.nth(self.step)
-}
+self.spec_next()
}
#[inline]
fn size_hint(&self) -> (usize, Option<usize>) {
self.spec_size_hint()
}
#[inline]
fn nth(&mut self, n: usize) -> Option<Self::Item> {
self.spec_nth(n)
}
fn try_fold<Acc, F, R>(&mut self, acc: Acc, f: F) -> R
where
F: FnMut(Acc, Self::Item) -> R,
R: Try<Output = Acc>,
{
self.spec_try_fold(acc, f)
}
#[inline]
fn fold<Acc, F>(self, acc: Acc, f: F) -> Acc
where
F: FnMut(Acc, Self::Item) -> Acc,
{
self.spec_fold(acc, f)
}
}
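Every public method now forwards to a `spec_*` helper because only trait methods marked `default` can be overridden by a more specific impl. A minimal standalone sketch of that dispatch pattern (nightly-only; the trait and names are hypothetical, not from this PR):

    #![feature(min_specialization)]

    trait SpecLabel {
        fn spec_label(&self) -> &'static str;
    }

    // Blanket impl; `default` allows narrower impls to override it.
    impl<T> SpecLabel for T {
        default fn spec_label(&self) -> &'static str {
            "generic path"
        }
    }

    // A concrete type takes precedence over the blanket impl.
    impl SpecLabel for u32 {
        fn spec_label(&self) -> &'static str {
            "specialized path"
        }
    }

    fn main() {
        assert_eq!(0u8.spec_label(), "generic path");
        assert_eq!(0u32.spec_label(), "specialized path");
    }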
impl<I> StepBy<I>
where
I: ExactSizeIterator,
{
// The zero-based index starting from the end of the iterator of the
// last element. Used in the `DoubleEndedIterator` implementation.
fn next_back_index(&self) -> usize {
let rem = self.iter.len() % (self.step + 1);
if self.first_take {
if rem == 0 { self.step } else { rem - 1 }
} else {
rem
}
}
}
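A worked example of `next_back_index` on this generic path (numbers mine): for `(0..10).step_by(3)`, which yields 0, 3, 6, 9, the inner length is 10 and `step + 1` is 3, so `rem == 1` and, with `first_take` still set, the index is `rem - 1 == 0`:

    fn next_back_index_walkthrough() {
        let mut it = (0..10).step_by(3);
        assert_eq!(it.next_back(), Some(9)); // len 10, rem 1, nth_back(0)
        assert_eq!(it.next_back(), Some(6)); // len 9, rem 0, nth_back(2)
        assert_eq!(it.next_back(), Some(3)); // len 6, rem 0, nth_back(2)
        assert_eq!(it.next_back(), Some(0));
        assert_eq!(it.next_back(), None);
    }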
#[stable(feature = "double_ended_step_by_iterator", since = "1.38.0")]
impl<I> DoubleEndedIterator for StepBy<I>
where
I: DoubleEndedIterator + ExactSizeIterator,
{
#[inline]
fn next_back(&mut self) -> Option<Self::Item> {
self.spec_next_back()
}
#[inline]
fn nth_back(&mut self, n: usize) -> Option<Self::Item> {
self.spec_nth_back(n)
}
fn try_rfold<Acc, F, R>(&mut self, init: Acc, f: F) -> R
where
F: FnMut(Acc, Self::Item) -> R,
R: Try<Output = Acc>,
{
self.spec_try_rfold(init, f)
}
#[inline]
fn rfold<Acc, F>(self, init: Acc, f: F) -> Acc
where
Self: Sized,
F: FnMut(Acc, Self::Item) -> Acc,
{
self.spec_rfold(init, f)
}
}
// StepBy can only make the iterator shorter, so the len will still fit.
#[stable(feature = "iterator_step_by", since = "1.28.0")]
impl<I> ExactSizeIterator for StepBy<I> where I: ExactSizeIterator {}
trait SpecRangeSetup<T> {
fn setup(inner: T, step: usize) -> T;
}
impl<T> SpecRangeSetup<T> for T {
#[inline]
default fn setup(inner: T, _step: usize) -> T {
inner
}
}
/// Specialization trait to optimize `StepBy<Range<{integer}>>` iteration.
///
/// # Safety
///
/// Technically this is safe to implement (look ma, no unsafe!), but in reality
/// a lot of unsafe code relies on ranges over integers being correct.
///
/// For correctness *all* public StepBy methods must be specialized
/// because `setup` drastically alters the meaning of the struct fields so that mixing
/// different implementations would lead to incorrect results.
unsafe trait StepByImpl<I> {
type Item;
fn spec_next(&mut self) -> Option<Self::Item>;
fn spec_size_hint(&self) -> (usize, Option<usize>);
fn spec_nth(&mut self, n: usize) -> Option<Self::Item>;
fn spec_try_fold<Acc, F, R>(&mut self, acc: Acc, f: F) -> R
where
F: FnMut(Acc, Self::Item) -> R,
R: Try<Output = Acc>;
fn spec_fold<Acc, F>(self, acc: Acc, f: F) -> Acc
where
F: FnMut(Acc, Self::Item) -> Acc;
}
/// Specialization trait for double-ended iteration.
///
/// See also: `StepByImpl`
///
/// # Safety
///
/// The specializations must be implemented together with `StepByImpl`
/// where applicable. I.e. if `StepBy` supports backwards iteration
/// for a given iterator and forward iteration is specialized, then
/// backwards iteration must be specialized too.
unsafe trait StepByBackImpl<I> {
type Item;
fn spec_next_back(&mut self) -> Option<Self::Item>
where
I: DoubleEndedIterator + ExactSizeIterator;
fn spec_nth_back(&mut self, n: usize) -> Option<Self::Item>
where
I: DoubleEndedIterator + ExactSizeIterator;
fn spec_try_rfold<Acc, F, R>(&mut self, init: Acc, f: F) -> R
where
I: DoubleEndedIterator + ExactSizeIterator,
F: FnMut(Acc, Self::Item) -> R,
R: Try<Output = Acc>;
fn spec_rfold<Acc, F>(self, init: Acc, f: F) -> Acc
where
I: DoubleEndedIterator + ExactSizeIterator,
F: FnMut(Acc, Self::Item) -> Acc;
}
unsafe impl<I: Iterator> StepByImpl<I> for StepBy<I> {
type Item = I::Item;
#[inline]
default fn spec_next(&mut self) -> Option<I::Item> {
let step_size = if self.first_take { 0 } else { self.step };
self.first_take = false;
self.iter.nth(step_size)
}
#[inline]
default fn spec_size_hint(&self) -> (usize, Option<usize>) {
#[inline]
fn first_size(step: usize) -> impl Fn(usize) -> usize {
move |n| if n == 0 { 0 } else { 1 + (n - 1) / (step + 1) }
@@ -64,7 +235,7 @@ where
}
#[inline]
-fn nth(&mut self, mut n: usize) -> Option<Self::Item> {
+default fn spec_nth(&mut self, mut n: usize) -> Option<I::Item> {
if self.first_take {
self.first_take = false;
let first = self.iter.next();
@@ -108,7 +279,7 @@ where
}
}
-fn try_fold<Acc, F, R>(&mut self, mut acc: Acc, mut f: F) -> R
+default fn spec_try_fold<Acc, F, R>(&mut self, mut acc: Acc, mut f: F) -> R
where
F: FnMut(Acc, Self::Item) -> R,
R: Try<Output = Acc>,
@@ -128,7 +299,7 @@ where
from_fn(nth(&mut self.iter, self.step)).try_fold(acc, f)
}
-fn fold<Acc, F>(mut self, mut acc: Acc, mut f: F) -> Acc
+default fn spec_fold<Acc, F>(mut self, mut acc: Acc, mut f: F) -> Acc
where
F: FnMut(Acc, Self::Item) -> Acc,
{
@@ -148,34 +319,16 @@ where
}
}
-impl<I> StepBy<I>
-where
-I: ExactSizeIterator,
-{
-// The zero-based index starting from the end of the iterator of the
-// last element. Used in the `DoubleEndedIterator` implementation.
-fn next_back_index(&self) -> usize {
-let rem = self.iter.len() % (self.step + 1);
-if self.first_take {
-if rem == 0 { self.step } else { rem - 1 }
-} else {
-rem
-}
-}
-}
+unsafe impl<I: DoubleEndedIterator + ExactSizeIterator> StepByBackImpl<I> for StepBy<I> {
+type Item = I::Item;
-#[stable(feature = "double_ended_step_by_iterator", since = "1.38.0")]
-impl<I> DoubleEndedIterator for StepBy<I>
-where
-I: DoubleEndedIterator + ExactSizeIterator,
-{
#[inline]
-fn next_back(&mut self) -> Option<Self::Item> {
+default fn spec_next_back(&mut self) -> Option<Self::Item> {
self.iter.nth_back(self.next_back_index())
}
#[inline]
-fn nth_back(&mut self, n: usize) -> Option<Self::Item> {
+default fn spec_nth_back(&mut self, n: usize) -> Option<I::Item> {
// `self.iter.nth_back(usize::MAX)` does the right thing here when `n`
// is out of bounds because the length of `self.iter` does not exceed
// `usize::MAX` (because `I: ExactSizeIterator`) and `nth_back` is
@@ -184,7 +337,7 @@ where
self.iter.nth_back(n)
}
-fn try_rfold<Acc, F, R>(&mut self, init: Acc, mut f: F) -> R
+default fn spec_try_rfold<Acc, F, R>(&mut self, init: Acc, mut f: F) -> R
where
F: FnMut(Acc, Self::Item) -> R,
R: Try<Output = Acc>,
@@ -207,10 +360,10 @@ where
}
#[inline]
-fn rfold<Acc, F>(mut self, init: Acc, mut f: F) -> Acc
+default fn spec_rfold<Acc, F>(mut self, init: Acc, mut f: F) -> Acc
where
Self: Sized,
-F: FnMut(Acc, Self::Item) -> Acc,
+F: FnMut(Acc, I::Item) -> Acc,
{
#[inline]
fn nth_back<I: DoubleEndedIterator>(
@@ -230,6 +383,192 @@ where
}
}
-// StepBy can only make the iterator shorter, so the len will still fit.
-#[stable(feature = "iterator_step_by", since = "1.28.0")]
-impl<I> ExactSizeIterator for StepBy<I> where I: ExactSizeIterator {}
/// For these implementations, `SpecRangeSetup` calculates the number
/// of iterations that will be needed and stores that in `iter.end`.
///
/// The various iterator implementations then rely on that to not need
/// overflow checking, letting loops just be counted instead.
///
/// These only work for unsigned types, and will need to be reworked
/// if you want to use it to specialize on signed types.
///
/// Currently these are only implemented for integers up to `usize` due to
/// correctness issues around `ExactSizeIterator` impls on 16-bit platforms.
/// And since `ExactSizeIterator` is a prerequisite for backwards iteration,
/// and backwards and forwards iteration must be specialized consistently,
/// the situation is complicated enough that larger types are not covered
/// for now.
macro_rules! spec_int_ranges {
($($t:ty)*) => ($(
const _: () = assert!(usize::BITS >= <$t>::BITS);
impl SpecRangeSetup<Range<$t>> for Range<$t> {
#[inline]
fn setup(mut r: Range<$t>, step: usize) -> Range<$t> {
let inner_len = r.size_hint().0;
// If step exceeds $t::MAX, then the count will be at most 1 and
// thus always fit into $t.
let yield_count = inner_len.div_ceil(step);
// Turn the range end into an iteration counter
r.end = yield_count as $t;
r
}
}
unsafe impl StepByImpl<Range<$t>> for StepBy<Range<$t>> {
#[inline]
fn spec_next(&mut self) -> Option<$t> {
// If a step size larger than the type has been specified, fall back to
// $t::MAX, in which case remaining will be at most 1.
// The `+ 1` can't overflow since the constructor subtracted 1 from the original value.
let step = <$t>::try_from(self.step + 1).unwrap_or(<$t>::MAX);
let remaining = self.iter.end;
if remaining > 0 {
let val = self.iter.start;
// this can only overflow during the last step, after which the value
// will not be used
self.iter.start = val.wrapping_add(step);
self.iter.end = remaining - 1;
Some(val)
} else {
None
}
}
#[inline]
fn spec_size_hint(&self) -> (usize, Option<usize>) {
let remaining = self.iter.end as usize;
(remaining, Some(remaining))
}
// The methods below are all copied from the Iterator trait default impls.
// We have to repeat them here so that the specialization overrides the StepByImpl defaults
#[inline]
fn spec_nth(&mut self, n: usize) -> Option<Self::Item> {
self.advance_by(n).ok()?;
self.next()
}
#[inline]
fn spec_try_fold<Acc, F, R>(&mut self, init: Acc, mut f: F) -> R
where
F: FnMut(Acc, Self::Item) -> R,
R: Try<Output = Acc>
{
let mut accum = init;
while let Some(x) = self.next() {
accum = f(accum, x)?;
}
try { accum }
}
#[inline]
fn spec_fold<Acc, F>(self, init: Acc, mut f: F) -> Acc
where
F: FnMut(Acc, Self::Item) -> Acc
{
// If a step size larger than the type has been specified, fall back to
// $t::MAX, in which case remaining will be at most 1.
let step = <$t>::try_from(self.step + 1).unwrap_or(<$t>::MAX);
let remaining = self.iter.end;
let mut acc = init;
let mut val = self.iter.start;
for _ in 0..remaining {
acc = f(acc, val);
// this can only overflow during the last step, after which the value
// will no longer be used
val = val.wrapping_add(step);
}
acc
}
}
/// Safety: This macro is only applied to ranges over types <= usize
/// which means the inner length is guaranteed to fit into a usize and so
/// the outer length calculation won't encounter clamped values
#[unstable(feature = "trusted_len", issue = "37572")]
unsafe impl TrustedLen for StepBy<Range<$t>> {}
)*)
}
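To make the `setup` rewrite concrete, a worked example (numbers mine): for `0u8..100` stepped by 16 the inner length is 100, so `yield_count == 100.div_ceil(16) == 7` and `end` is rewritten from 100 to 7; from then on `end` is a pure down-counter while `start` advances by the step. Externally nothing changes:

    fn setup_rewrite_example() {
        let mut it = (0u8..100).step_by(16);
        assert_eq!(it.size_hint(), (7, Some(7))); // reads the counter
        assert_eq!(it.next(), Some(0));
        assert_eq!(it.last(), Some(96)); // 0, 16, ..., 96: seven values
    }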
macro_rules! spec_int_ranges_r {
($($t:ty)*) => ($(
const _: () = assert!(usize::BITS >= <$t>::BITS);
unsafe impl StepByBackImpl<Range<$t>> for StepBy<Range<$t>> {
#[inline]
fn spec_next_back(&mut self) -> Option<Self::Item>
where Range<$t>: DoubleEndedIterator + ExactSizeIterator,
{
let step = (self.step + 1) as $t;
let remaining = self.iter.end;
if remaining > 0 {
let start = self.iter.start;
self.iter.end = remaining - 1;
Some(start + step * (remaining - 1))
} else {
None
}
}
// The methods below are all copied from the Iterator trait default impls.
// We have to repeat them here so that the specialization overrides the StepByBackImpl defaults
#[inline]
fn spec_nth_back(&mut self, n: usize) -> Option<Self::Item>
where Self: DoubleEndedIterator,
{
if self.advance_back_by(n).is_err() {
return None;
}
self.next_back()
}
#[inline]
fn spec_try_rfold<Acc, F, R>(&mut self, init: Acc, mut f: F) -> R
where
Self: DoubleEndedIterator,
F: FnMut(Acc, Self::Item) -> R,
R: Try<Output = Acc>
{
let mut accum = init;
while let Some(x) = self.next_back() {
accum = f(accum, x)?;
}
try { accum }
}
#[inline]
fn spec_rfold<Acc, F>(mut self, init: Acc, mut f: F) -> Acc
where
Self: DoubleEndedIterator,
F: FnMut(Acc, Self::Item) -> Acc
{
let mut accum = init;
while let Some(x) = self.next_back() {
accum = f(accum, x);
}
accum
}
}
)*)
}
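A worked check of the `start + step * (remaining - 1)` formula in `spec_next_back` (numbers mine): for `0u8..100` stepped by 16, `setup` leaves `start == 0` and `end == 7`, so the first back element is `0 + 16 * 6 == 96` and the counter then ticks down:

    fn spec_next_back_walkthrough() {
        let mut it = (0u8..100).step_by(16);
        assert_eq!(it.next_back(), Some(96)); // 0 + 16 * (7 - 1)
        assert_eq!(it.next_back(), Some(80)); // 0 + 16 * (6 - 1)
        assert_eq!(it.next(), Some(0)); // forward iteration still agrees
    }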
#[cfg(target_pointer_width = "64")]
spec_int_ranges!(u8 u16 u32 u64 usize);
// DoubleEndedIterator requires ExactSizeIterator, which isn't implemented for Range<u64>
#[cfg(target_pointer_width = "64")]
spec_int_ranges_r!(u8 u16 u32 usize);
#[cfg(target_pointer_width = "32")]
spec_int_ranges!(u8 u16 u32 usize);
#[cfg(target_pointer_width = "32")]
spec_int_ranges_r!(u8 u16 u32 usize);
#[cfg(target_pointer_width = "16")]
spec_int_ranges!(u8 u16 usize);
#[cfg(target_pointer_width = "16")]
spec_int_ranges_r!(u8 u16 usize);
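The asymmetry between the two macro invocations on 64-bit targets comes from `ExactSizeIterator`: `Range<u64>` gets the forward specialization, but backwards iteration on `StepBy` requires `ExactSizeIterator`, which `Range<u64>` does not implement. A sketch of the constraint (the commented-out lines would not compile):

    fn exact_size_gating() {
        let _ = (0u32..10).len(); // Range<u32> is ExactSizeIterator
        // let _ = (0u64..10).len(); // no ExactSizeIterator for Range<u64>
        let _ = (0u64..10).rev(); // DoubleEndedIterator alone is fine...
        // let _ = (0u64..10).step_by(2).next_back(); // ...but StepBy's
        // DoubleEndedIterator impl also needs ExactSizeIterator.
    }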

@@ -244,3 +244,58 @@ fn test_step_by_skip() {
assert_eq!((0..=50).step_by(10).nth(3), Some(30));
assert_eq!((200..=255u8).step_by(10).nth(3), Some(230));
}
struct DeOpt<I: Iterator>(I);
impl<I: Iterator> Iterator for DeOpt<I> {
type Item = I::Item;
fn next(&mut self) -> core::option::Option<Self::Item> {
self.0.next()
}
}
impl<I: DoubleEndedIterator> DoubleEndedIterator for DeOpt<I> {
fn next_back(&mut self) -> core::option::Option<Self::Item> {
self.0.next_back()
}
}
#[test]
fn test_step_by_fold_range_specialization() {
macro_rules! t {
($range:expr, $var: ident, $body:tt) => {
{
// run the same tests for the non-optimized version
let mut $var = DeOpt($range);
$body
}
{
let mut $var = $range;
$body
}
}
}
t!((1usize..5).step_by(1), r, {
assert_eq!(r.next_back(), Some(4));
assert_eq!(r.sum::<usize>(), 6);
});
t!((0usize..4).step_by(2), r, {
assert_eq!(r.next(), Some(0));
assert_eq!(r.sum::<usize>(), 2);
});
t!((0usize..5).step_by(2), r, {
assert_eq!(r.next(), Some(0));
assert_eq!(r.sum::<usize>(), 6);
});
t!((usize::MAX - 6 .. usize::MAX).step_by(5), r, {
assert_eq!(r.next(), Some(usize::MAX - 6));
assert_eq!(r.sum::<usize>(), usize::MAX - 1);
});
}
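In the same spirit, a hypothetical extra cross-check (not part of the PR) that the specialized and generic paths agree on `nth`:

    #[test]
    fn step_by_nth_agrees_with_deopt() {
        let mut spec = (0u8..100).step_by(16); // specialized path
        let mut generic = DeOpt(0u8..100).step_by(16); // generic path
        assert_eq!(spec.nth(3), generic.nth(3)); // both Some(48)
        assert_eq!(spec.nth(2), generic.nth(2)); // both Some(96)
    }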