Auto merge of #123246 - Kobzol:tarball-reproducible, r=Mark-Simulacrum

Make source tarball generation more reproducible

This PR makes several changes to source tarball generation (`x dist rustc-src`) to make it more reproducible (in light of the recent "xz backdoor"...). I want to follow up by creating a separate CI workflow for generating the tarball.

After this PR, running the following commands locally produces identical checksums:
```bash
$ ./x dist rustc-src
$ sha256sum build/dist/rustc-1.79.0-src.tar.gz

$ ./x dist rustc-src
$ sha256sum build/dist/rustc-1.79.0-src.tar.gz
```

r? `@Mark-Simulacrum`
This commit is contained in:
bors 2024-03-31 12:36:23 +00:00
commit a8cfc83801
3 changed files with 58 additions and 29 deletions

View File

@ -995,9 +995,9 @@ impl Step for PlainSourceTarball {
if builder.rust_info().is_managed_git_subrepository() if builder.rust_info().is_managed_git_subrepository()
|| builder.rust_info().is_from_tarball() || builder.rust_info().is_from_tarball()
{ {
if builder.rust_info().is_managed_git_subrepository() { // Ensure we have all submodules from src and other directories checked out.
// Ensure we have the submodules checked out. for submodule in builder.get_all_submodules() {
builder.update_submodule(Path::new("src/tools/cargo")); builder.update_submodule(Path::new(submodule));
} }
// Vendor all Cargo dependencies // Vendor all Cargo dependencies
@ -1028,6 +1028,20 @@ impl Step for PlainSourceTarball {
builder.create(&cargo_config_dir.join("config.toml"), &config); builder.create(&cargo_config_dir.join("config.toml"), &config);
} }
// Delete extraneous directories
// FIXME: if we're managed by git, we should probably instead ask git if the given path
// is managed by it?
for entry in walkdir::WalkDir::new(tarball.image_dir())
.follow_links(true)
.into_iter()
.filter_map(|e| e.ok())
{
if entry.path().is_dir() && entry.path().file_name() == Some(OsStr::new("__pycache__"))
{
t!(fs::remove_dir_all(entry.path()));
}
}
tarball.bare() tarball.bare()
} }
} }

View File

@ -554,29 +554,7 @@ impl<'a> ShouldRun<'a> {
/// ///
/// [`path`]: ShouldRun::path /// [`path`]: ShouldRun::path
pub fn paths(mut self, paths: &[&str]) -> Self { pub fn paths(mut self, paths: &[&str]) -> Self {
static SUBMODULES_PATHS: OnceLock<Vec<String>> = OnceLock::new(); let submodules_paths = self.builder.get_all_submodules();
let init_submodules_paths = |src: &PathBuf| {
let file = File::open(src.join(".gitmodules")).unwrap();
let mut submodules_paths = vec![];
for line in BufReader::new(file).lines() {
if let Ok(line) = line {
let line = line.trim();
if line.starts_with("path") {
let actual_path =
line.split(' ').last().expect("Couldn't get value of path");
submodules_paths.push(actual_path.to_owned());
}
}
}
submodules_paths
};
let submodules_paths =
SUBMODULES_PATHS.get_or_init(|| init_submodules_paths(&self.builder.src));
self.paths.insert(PathSet::Set( self.paths.insert(PathSet::Set(
paths paths
@ -2151,6 +2129,37 @@ impl<'a> Builder<'a> {
out out
} }
/// Return paths of all submodules managed by git.
/// If the current checkout is not managed by git, returns an empty slice.
pub fn get_all_submodules(&self) -> &[String] {
if !self.rust_info().is_managed_git_subrepository() {
return &[];
}
static SUBMODULES_PATHS: OnceLock<Vec<String>> = OnceLock::new();
let init_submodules_paths = |src: &PathBuf| {
let file = File::open(src.join(".gitmodules")).unwrap();
let mut submodules_paths = vec![];
for line in BufReader::new(file).lines() {
if let Ok(line) = line {
let line = line.trim();
if line.starts_with("path") {
let actual_path =
line.split(' ').last().expect("Couldn't get value of path");
submodules_paths.push(actual_path.to_owned());
}
}
}
submodules_paths
};
&SUBMODULES_PATHS.get_or_init(|| init_submodules_paths(&self.src))
}
/// Ensure that a given step is built *only if it's supposed to be built by default*, returning /// Ensure that a given step is built *only if it's supposed to be built by default*, returning
/// its output. This will cache the step, so it's safe (and good!) to call this as often as /// its output. This will cache the step, so it's safe (and good!) to call this as often as
/// needed to ensure that all dependencies are built. /// needed to ensure that all dependencies are built.

View File

@ -2,7 +2,7 @@ use anyhow::{bail, Context, Result};
use std::fs::{read_link, symlink_metadata}; use std::fs::{read_link, symlink_metadata};
use std::io::{BufWriter, Write}; use std::io::{BufWriter, Write};
use std::path::Path; use std::path::Path;
use tar::{Builder, Header}; use tar::{Builder, Header, HeaderMode};
use walkdir::WalkDir; use walkdir::WalkDir;
use crate::{ use crate::{
@ -53,14 +53,19 @@ impl Tarballer {
// Sort files by their suffix, to group files with the same name from // Sort files by their suffix, to group files with the same name from
// different locations (likely identical) and files with the same // different locations (likely identical) and files with the same
// extension (likely containing similar data). // extension (likely containing similar data).
let (dirs, mut files) = get_recursive_paths(&self.work_dir, &self.input) // Sorting of file and directory paths also helps with the reproducibility
// of the resulting archive.
let (mut dirs, mut files) = get_recursive_paths(&self.work_dir, &self.input)
.context("failed to collect file paths")?; .context("failed to collect file paths")?;
dirs.sort();
files.sort_by(|a, b| a.bytes().rev().cmp(b.bytes().rev())); files.sort_by(|a, b| a.bytes().rev().cmp(b.bytes().rev()));
// Write the tar into both encoded files. We write all directories // Write the tar into both encoded files. We write all directories
// first, so files may be directly created. (See rust-lang/rustup.rs#1092.) // first, so files may be directly created. (See rust-lang/rustup.rs#1092.)
let buf = BufWriter::with_capacity(1024 * 1024, encoder); let buf = BufWriter::with_capacity(1024 * 1024, encoder);
let mut builder = Builder::new(buf); let mut builder = Builder::new(buf);
// Make uid, gid and mtime deterministic to improve reproducibility
builder.mode(HeaderMode::Deterministic);
let pool = rayon::ThreadPoolBuilder::new().num_threads(2).build().unwrap(); let pool = rayon::ThreadPoolBuilder::new().num_threads(2).build().unwrap();
pool.install(move || { pool.install(move || {
@ -91,7 +96,8 @@ impl Tarballer {
fn append_path<W: Write>(builder: &mut Builder<W>, src: &Path, path: &String) -> Result<()> { fn append_path<W: Write>(builder: &mut Builder<W>, src: &Path, path: &String) -> Result<()> {
let stat = symlink_metadata(src)?; let stat = symlink_metadata(src)?;
let mut header = Header::new_gnu(); let mut header = Header::new_gnu();
header.set_metadata(&stat); header.set_metadata_in_mode(&stat, HeaderMode::Deterministic);
if stat.file_type().is_symlink() { if stat.file_type().is_symlink() {
let link = read_link(src)?; let link = read_link(src)?;
builder.append_link(&mut header, path, &link)?; builder.append_link(&mut header, path, &link)?;