From 77b053a2dd20cc8b1dc1401385a0ef71c7cb25b4 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Mon, 17 Jul 2023 13:04:07 +1000 Subject: [PATCH 1/4] Add `MonoItemData::inlined`. --- compiler/rustc_middle/src/mir/mono.rs | 6 ++++ .../rustc_monomorphize/src/partitioning.rs | 30 ++++++++----------- 2 files changed, 19 insertions(+), 17 deletions(-) diff --git a/compiler/rustc_middle/src/mir/mono.rs b/compiler/rustc_middle/src/mir/mono.rs index 16addc2dc1e..d217e1c5f8f 100644 --- a/compiler/rustc_middle/src/mir/mono.rs +++ b/compiler/rustc_middle/src/mir/mono.rs @@ -248,8 +248,14 @@ pub struct CodegenUnit<'tcx> { /// Auxiliary info about a `MonoItem`. #[derive(Copy, Clone, PartialEq, Debug, HashStable)] pub struct MonoItemData { + /// A cached copy of the result of `MonoItem::instantiation_mode`, where + /// `GloballyShared` maps to `false` and `LocalCopy` maps to `true`. + pub inlined: bool, + pub linkage: Linkage, pub visibility: Visibility, + + /// A cached copy of the result of `MonoItem::size_estimate`. pub size_estimate: usize, } diff --git a/compiler/rustc_monomorphize/src/partitioning.rs b/compiler/rustc_monomorphize/src/partitioning.rs index 54096abb2e0..f6a71e5cf54 100644 --- a/compiler/rustc_monomorphize/src/partitioning.rs +++ b/compiler/rustc_monomorphize/src/partitioning.rs @@ -248,7 +248,8 @@ where } let size_estimate = mono_item.size_estimate(cx.tcx); - cgu.items_mut().insert(mono_item, MonoItemData { linkage, visibility, size_estimate }); + cgu.items_mut() + .insert(mono_item, MonoItemData { inlined: false, linkage, visibility, size_estimate }); // Get all inlined items that are reachable from `mono_item` without // going via another root item. This includes drop-glue, functions from @@ -263,6 +264,7 @@ where for inlined_item in reachable_inlined_items { // This is a CGU-private copy. cgu.items_mut().entry(inlined_item).or_insert_with(|| MonoItemData { + inlined: true, linkage: Linkage::Internal, visibility: Visibility::Default, size_estimate: inlined_item.size_estimate(cx.tcx), @@ -870,19 +872,16 @@ fn debug_dump<'a, 'tcx: 'a>(tcx: TyCtxt<'tcx>, label: &str, cgus: &[CodegenUnit< all_cgu_sizes.push(cgu.size_estimate()); for (item, data) in cgu.items() { - match item.instantiation_mode(tcx) { - InstantiationMode::GloballyShared { .. } => { - root_items += 1; - root_size += data.size_estimate; - } - InstantiationMode::LocalCopy => { - if inlined_items.insert(item) { - unique_inlined_items += 1; - unique_inlined_size += data.size_estimate; - } - placed_inlined_items += 1; - placed_inlined_size += data.size_estimate; + if !data.inlined { + root_items += 1; + root_size += data.size_estimate; + } else { + if inlined_items.insert(item) { + unique_inlined_items += 1; + unique_inlined_size += data.size_estimate; } + placed_inlined_items += 1; + placed_inlined_size += data.size_estimate; } } } @@ -937,10 +936,7 @@ fn debug_dump<'a, 'tcx: 'a>(tcx: TyCtxt<'tcx>, label: &str, cgus: &[CodegenUnit< let symbol_name = item.symbol_name(tcx).name; let symbol_hash_start = symbol_name.rfind('h'); let symbol_hash = symbol_hash_start.map_or("", |i| &symbol_name[i..]); - let kind = match item.instantiation_mode(tcx) { - InstantiationMode::GloballyShared { .. } => "root", - InstantiationMode::LocalCopy => "inlined", - }; + let kind = if !data.inlined { "root" } else { "inlined" }; let size = data.size_estimate; let _ = with_no_trimmed_paths!(writeln!( s, From b2c3948892558ca1ede7d3c9b96e1970d3b688fc Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Fri, 14 Jul 2023 09:12:27 +1000 Subject: [PATCH 2/4] Split the CGU merging loop. It has two conditions. This commit splits it in two, one per condition. The next commit will change the first loop. --- .../rustc_monomorphize/src/partitioning.rs | 66 +++++++++++-------- 1 file changed, 38 insertions(+), 28 deletions(-) diff --git a/compiler/rustc_monomorphize/src/partitioning.rs b/compiler/rustc_monomorphize/src/partitioning.rs index f6a71e5cf54..54bb9de4f0c 100644 --- a/compiler/rustc_monomorphize/src/partitioning.rs +++ b/compiler/rustc_monomorphize/src/partitioning.rs @@ -318,29 +318,9 @@ fn merge_codegen_units<'tcx>( let mut cgu_contents: FxHashMap> = codegen_units.iter().map(|cgu| (cgu.name(), vec![cgu.name()])).collect(); - // Having multiple CGUs can drastically speed up compilation. But for - // non-incremental builds, tiny CGUs slow down compilation *and* result in - // worse generated code. So we don't allow CGUs smaller than this (unless - // there is just one CGU, of course). Note that CGU sizes of 100,000+ are - // common in larger programs, so this isn't all that large. - const NON_INCR_MIN_CGU_SIZE: usize = 1800; - - // Repeatedly merge the two smallest codegen units as long as: - // - we have more CGUs than the upper limit, or - // - (Non-incremental builds only) the user didn't specify a CGU count, and - // there are multiple CGUs, and some are below the minimum size. - // - // The "didn't specify a CGU count" condition is because when an explicit - // count is requested we observe it as closely as possible. For example, - // the `compiler_builtins` crate sets `codegen-units = 10000` and it's - // critical they aren't merged. Also, some tests use explicit small values - // and likewise won't work if small CGUs are merged. - while codegen_units.len() > cx.tcx.sess.codegen_units().as_usize() - || (cx.tcx.sess.opts.incremental.is_none() - && matches!(cx.tcx.sess.codegen_units(), CodegenUnits::Default(_)) - && codegen_units.len() > 1 - && codegen_units.iter().any(|cgu| cgu.size_estimate() < NON_INCR_MIN_CGU_SIZE)) - { + // Repeatedly merge the two smallest codegen units as long as we have more + // CGUs than the upper limit. + while codegen_units.len() > cx.tcx.sess.codegen_units().as_usize() { // Sort small cgus to the back. codegen_units.sort_by_key(|cgu| cmp::Reverse(cgu.size_estimate())); @@ -357,12 +337,42 @@ fn merge_codegen_units<'tcx>( // in `smallest` before. let mut consumed_cgu_names = cgu_contents.remove(&smallest.name()).unwrap(); cgu_contents.get_mut(&second_smallest.name()).unwrap().append(&mut consumed_cgu_names); + } - debug!( - "CodegenUnit {} merged into CodegenUnit {}", - smallest.name(), - second_smallest.name() - ); + // Having multiple CGUs can drastically speed up compilation. But for + // non-incremental builds, tiny CGUs slow down compilation *and* result in + // worse generated code. So we don't allow CGUs smaller than this (unless + // there is just one CGU, of course). Note that CGU sizes of 100,000+ are + // common in larger programs, so this isn't all that large. + const NON_INCR_MIN_CGU_SIZE: usize = 1800; + + // Repeatedly merge the two smallest codegen units as long as: it's a + // non-incremental build, and the user didn't specify a CGU count, and + // there are multiple CGUs, and some are below the minimum size. + // + // The "didn't specify a CGU count" condition is because when an explicit + // count is requested we observe it as closely as possible. For example, + // the `compiler_builtins` crate sets `codegen-units = 10000` and it's + // critical they aren't merged. Also, some tests use explicit small values + // and likewise won't work if small CGUs are merged. + while cx.tcx.sess.opts.incremental.is_none() + && matches!(cx.tcx.sess.codegen_units(), CodegenUnits::Default(_)) + && codegen_units.len() > 1 + && codegen_units.iter().any(|cgu| cgu.size_estimate() < NON_INCR_MIN_CGU_SIZE) + { + // Sort small cgus to the back. + codegen_units.sort_by_cached_key(|cgu| cmp::Reverse(cgu.size_estimate())); + + let mut smallest = codegen_units.pop().unwrap(); + let second_smallest = codegen_units.last_mut().unwrap(); + + // Move the items from `smallest` to `second_smallest`. Some of them + // may be duplicate inlined items, in which case the destination CGU is + // unaffected. Recalculate size estimates afterwards. + second_smallest.items_mut().extend(smallest.items_mut().drain()); + second_smallest.compute_size_estimate(); + + // Don't update `cgu_contents`, that's only for incremental builds. } let cgu_name_builder = &mut CodegenUnitNameBuilder::new(cx.tcx); From 017c0b5a015e79625034ff47f199095cdf0af6b6 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Tue, 18 Jul 2023 09:45:21 +1000 Subject: [PATCH 3/4] Add a useful comment. --- compiler/rustc_middle/src/mir/mono.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/compiler/rustc_middle/src/mir/mono.rs b/compiler/rustc_middle/src/mir/mono.rs index d217e1c5f8f..f4133dfbc95 100644 --- a/compiler/rustc_middle/src/mir/mono.rs +++ b/compiler/rustc_middle/src/mir/mono.rs @@ -56,6 +56,8 @@ impl<'tcx> MonoItem<'tcx> { } } + // Note: if you change how item size estimates work, you might need to + // change NON_INCR_MIN_CGU_SIZE as well. pub fn size_estimate(&self, tcx: TyCtxt<'tcx>) -> usize { match *self { MonoItem::Fn(instance) => { From 05de5d6f64e493afbf8e2bd15333f67268ddcc89 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Fri, 14 Jul 2023 09:45:55 +1000 Subject: [PATCH 4/4] Change the primary CGU merging algorithm. Instead of repeatedly merging the two smallest CGUs, we now use a merging algorithm that aims to minimize the duplication of inlined functions. `exa-0.10.1` was one benchmark that saw particularly good results. The old CGU stats: ``` INTERNALIZE - unique items: 2774 (1216 root + 1558 inlined), unique size: 122065 (77219 root + 44846 inlined) - placed items: 3834 (1216 root + 2618 inlined), placed size: 154552 (77219 root + 77333 inlined) - placed/unique items ratio: 1.38, placed/unique size ratio: 1.27 - CGUs: 16, mean size: 9659.5, sizes: [11791, 11634, 11173, 10987, 10939, 10507, 9992, 9813, 9593, 9580, 9030, 8447, 7975, 7961, 7876, 7254] ``` The new CGU stats: ``` INTERNALIZE - unique items: 2774 (1216 root + 1558 inlined), unique size: 122065 (77219 root + 44846 inlined) - placed items: 3626 (1216 root + 2410 inlined), placed size: 147201 (77219 root + 69982 inlined) - placed/unique items ratio: 1.31, placed/unique size ratio: 1.21 - CGUs: 16, mean size: 9200.1, sizes: [11634, 10939, 10227, 9555, 9178, 9167, 8879, 8804, 8604, 8603 (x3), 8602 (x2), 8601, 8600] ``` The difference is in the number of inlined items. There are 1558 unique inlined items. With the old algorithm these were placed 2618 times, resulting in 1060 duplicates. With the new algorithm these were placed 2410 times, resulting in 852 duplicates. Also, the mean CGU size dropped from 9659.5 to 9200.1, and the CGU size distribution tightened, with the biggest one a little smaller and the smallest ones a little bigger. --- .../rustc_monomorphize/src/partitioning.rs | 80 +++++++++++++++---- 1 file changed, 66 insertions(+), 14 deletions(-) diff --git a/compiler/rustc_monomorphize/src/partitioning.rs b/compiler/rustc_monomorphize/src/partitioning.rs index 54bb9de4f0c..71aef53192f 100644 --- a/compiler/rustc_monomorphize/src/partitioning.rs +++ b/compiler/rustc_monomorphize/src/partitioning.rs @@ -318,25 +318,58 @@ fn merge_codegen_units<'tcx>( let mut cgu_contents: FxHashMap> = codegen_units.iter().map(|cgu| (cgu.name(), vec![cgu.name()])).collect(); - // Repeatedly merge the two smallest codegen units as long as we have more - // CGUs than the upper limit. - while codegen_units.len() > cx.tcx.sess.codegen_units().as_usize() { - // Sort small cgus to the back. + // If N is the maximum number of CGUs, and the CGUs are sorted from largest + // to smallest, we repeatedly find which CGU in codegen_units[N..] has the + // greatest overlap of inlined items with codegen_units[N-1], merge that + // CGU into codegen_units[N-1], then re-sort by size and repeat. + // + // We use inlined item overlap to guide this merging because it minimizes + // duplication of inlined items, which makes LLVM be faster and generate + // better and smaller machine code. + // + // Why merge into codegen_units[N-1]? We want CGUs to have similar sizes, + // which means we don't want codegen_units[0..N] (the already big ones) + // getting any bigger, if we can avoid it. When we have more than N CGUs + // then at least one of the biggest N will have to grow. codegen_units[N-1] + // is the smallest of those, and so has the most room to grow. + let max_codegen_units = cx.tcx.sess.codegen_units().as_usize(); + while codegen_units.len() > max_codegen_units { + // Sort small CGUs to the back. codegen_units.sort_by_key(|cgu| cmp::Reverse(cgu.size_estimate())); - let mut smallest = codegen_units.pop().unwrap(); - let second_smallest = codegen_units.last_mut().unwrap(); + let cgu_dst = &codegen_units[max_codegen_units - 1]; - // Move the items from `smallest` to `second_smallest`. Some of them - // may be duplicate inlined items, in which case the destination CGU is + // Find the CGU that overlaps the most with `cgu_dst`. In the case of a + // tie, favour the earlier (bigger) CGU. + let mut max_overlap = 0; + let mut max_overlap_i = max_codegen_units; + for (i, cgu_src) in codegen_units.iter().enumerate().skip(max_codegen_units) { + if cgu_src.size_estimate() <= max_overlap { + // None of the remaining overlaps can exceed `max_overlap`, so + // stop looking. + break; + } + + let overlap = compute_inlined_overlap(cgu_dst, cgu_src); + if overlap > max_overlap { + max_overlap = overlap; + max_overlap_i = i; + } + } + + let mut cgu_src = codegen_units.swap_remove(max_overlap_i); + let cgu_dst = &mut codegen_units[max_codegen_units - 1]; + + // Move the items from `cgu_src` to `cgu_dst`. Some of them may be + // duplicate inlined items, in which case the destination CGU is // unaffected. Recalculate size estimates afterwards. - second_smallest.items_mut().extend(smallest.items_mut().drain()); - second_smallest.compute_size_estimate(); + cgu_dst.items_mut().extend(cgu_src.items_mut().drain()); + cgu_dst.compute_size_estimate(); - // Record that `second_smallest` now contains all the stuff that was - // in `smallest` before. - let mut consumed_cgu_names = cgu_contents.remove(&smallest.name()).unwrap(); - cgu_contents.get_mut(&second_smallest.name()).unwrap().append(&mut consumed_cgu_names); + // Record that `cgu_dst` now contains all the stuff that was in + // `cgu_src` before. + let mut consumed_cgu_names = cgu_contents.remove(&cgu_src.name()).unwrap(); + cgu_contents.get_mut(&cgu_dst.name()).unwrap().append(&mut consumed_cgu_names); } // Having multiple CGUs can drastically speed up compilation. But for @@ -451,6 +484,25 @@ fn merge_codegen_units<'tcx>( } } +/// Compute the combined size of all inlined items that appear in both `cgu1` +/// and `cgu2`. +fn compute_inlined_overlap<'tcx>(cgu1: &CodegenUnit<'tcx>, cgu2: &CodegenUnit<'tcx>) -> usize { + // Either order works. We pick the one that involves iterating over fewer + // items. + let (src_cgu, dst_cgu) = + if cgu1.items().len() <= cgu2.items().len() { (cgu1, cgu2) } else { (cgu2, cgu1) }; + + let mut overlap = 0; + for (item, data) in src_cgu.items().iter() { + if data.inlined { + if dst_cgu.items().contains_key(item) { + overlap += data.size_estimate; + } + } + } + overlap +} + fn internalize_symbols<'tcx>( cx: &PartitioningCx<'_, 'tcx>, codegen_units: &mut [CodegenUnit<'tcx>],