diff --git a/Cargo.lock b/Cargo.lock
index fb401ed4cd0..939c1ce58c2 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3626,6 +3626,7 @@ version = "0.0.0"
 dependencies = [
  "bitflags",
  "cc",
+ "itertools 0.9.0",
  "jobserver",
  "libc",
  "memmap",
diff --git a/compiler/rustc_codegen_ssa/Cargo.toml b/compiler/rustc_codegen_ssa/Cargo.toml
index e5df0f60941..835f9062399 100644
--- a/compiler/rustc_codegen_ssa/Cargo.toml
+++ b/compiler/rustc_codegen_ssa/Cargo.toml
@@ -10,6 +10,7 @@ test = false
 [dependencies]
 bitflags = "1.2.1"
 cc = "1.0.1"
+itertools = "0.9"
 num_cpus = "1.0"
 memmap = "0.7"
 tracing = "0.1"
diff --git a/compiler/rustc_codegen_ssa/src/base.rs b/compiler/rustc_codegen_ssa/src/base.rs
index 0fc11c286f8..658ad3c375d 100644
--- a/compiler/rustc_codegen_ssa/src/base.rs
+++ b/compiler/rustc_codegen_ssa/src/base.rs
@@ -32,10 +32,11 @@ use rustc_session::config::{self, EntryFnType};
 use rustc_session::Session;
 use rustc_target::abi::{Align, LayoutOf, VariantIdx};
 
-use std::cmp;
 use std::ops::{Deref, DerefMut};
 use std::time::{Duration, Instant};
 
+use itertools::Itertools;
+
 pub fn bin_op_to_icmp_predicate(op: hir::BinOpKind, signed: bool) -> IntPredicate {
     match op {
         hir::BinOpKind::Eq => IntPredicate::IntEQ,
@@ -546,12 +547,23 @@ pub fn codegen_crate(
         ongoing_codegen.submit_pre_codegened_module_to_llvm(tcx, metadata_module);
     }
 
-    // We sort the codegen units by size. This way we can schedule work for LLVM
-    // a bit more efficiently.
-    let codegen_units = {
-        let mut codegen_units = codegen_units.iter().collect::<Vec<_>>();
-        codegen_units.sort_by_cached_key(|cgu| cmp::Reverse(cgu.size_estimate()));
-        codegen_units
+    // For better throughput during parallel processing by LLVM, we used to sort
+    // CGUs largest to smallest. This would lead to better thread utilization
+    // by, for example, preventing a large CGU from being processed last and
+    // having only one LLVM thread working while the rest remained idle.
+    //
+    // However, this strategy would lead to high memory usage, as it meant the
+    // LLVM-IR for all of the largest CGUs would be resident in memory at once.
+    //
+    // Instead, we can compromise by ordering CGUs such that the largest and
+    // smallest are first, second largest and smallest are next, etc. If there
+    // are large size variations, this can reduce memory usage significantly.
+    let codegen_units: Vec<_> = {
+        let mut sorted_cgus = codegen_units.iter().collect::<Vec<_>>();
+        sorted_cgus.sort_by_cached_key(|cgu| cgu.size_estimate());
+
+        let (first_half, second_half) = sorted_cgus.split_at(sorted_cgus.len() / 2);
+        second_half.iter().rev().interleave(first_half).copied().collect()
    };
 
    // The non-parallel compiler can only translate codegen units to LLVM IR
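
For illustration, here is a minimal standalone sketch of the ordering the new code produces, built around the same sort/split/`interleave` pipeline as the patch. The eight size estimates and the `main` wrapper are made up for the example and are not part of the diff.

use itertools::Itertools;

fn main() {
    // Hypothetical size estimates for eight codegen units.
    let mut sizes = vec![40, 5, 100, 22, 7, 63, 15, 88];

    // Sort ascending, mirroring `sort_by_cached_key(|cgu| cgu.size_estimate())`.
    sizes.sort_unstable();

    // Split into the smaller half and the larger half, then alternate:
    // largest, smallest, second largest, second smallest, ...
    let (first_half, second_half) = sizes.split_at(sizes.len() / 2);
    let order: Vec<_> = second_half.iter().rev().interleave(first_half).copied().collect();

    assert_eq!(order, vec![100, 5, 88, 7, 63, 15, 40, 22]);
}

With this ordering, each of the largest CGUs is followed by one of the smallest, so the biggest LLVM-IR blobs are never all in flight at the same time, while a large CGU is still dispatched early enough to avoid the tail-latency problem the old largest-first sort was solving.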