From 28db4ccda76ffd2ce4c36912a194979a7ce2ef8d Mon Sep 17 00:00:00 2001 From: Michael Howell Date: Thu, 21 Mar 2024 17:19:39 -0700 Subject: [PATCH] rustdoc-search: compressed bitmap to sort, then load desc This adds a bit more data than "pure sharding" by including information about which items have no description at all. This way, it can sort the results, then truncate, then finally download the description. With the "e" bitmap: 2380KiB Without the "e" bitmap: 2364KiB --- Cargo.lock | 2 + src/librustdoc/Cargo.toml | 2 + src/librustdoc/html/render/search_index.rs | 233 ++++++++++++++++++++- src/librustdoc/html/static/js/search.js | 190 ++++++++++++++--- src/tools/rustdoc-js/tester.js | 5 - 5 files changed, 395 insertions(+), 37 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 16aed3dc49c..1e7ef0cca9b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4741,6 +4741,8 @@ version = "0.0.0" dependencies = [ "arrayvec", "askama", + "base64", + "byteorder", "expect-test", "indexmap", "itertools 0.12.1", diff --git a/src/librustdoc/Cargo.toml b/src/librustdoc/Cargo.toml index bd0fbef998b..44ba2a8c015 100644 --- a/src/librustdoc/Cargo.toml +++ b/src/librustdoc/Cargo.toml @@ -9,6 +9,8 @@ path = "lib.rs" [dependencies] arrayvec = { version = "0.7", default-features = false } askama = { version = "0.12", default-features = false, features = ["config"] } +base64 = "0.21.7" +byteorder = "1.5" itertools = "0.12" indexmap = "2" minifier = "0.3.0" diff --git a/src/librustdoc/html/render/search_index.rs b/src/librustdoc/html/render/search_index.rs index 34a4a89aa7b..2ec22df0b43 100644 --- a/src/librustdoc/html/render/search_index.rs +++ b/src/librustdoc/html/render/search_index.rs @@ -1,6 +1,7 @@ use std::collections::hash_map::Entry; use std::collections::{BTreeMap, VecDeque}; +use base64::prelude::*; use rustc_data_structures::fx::{FxHashMap, FxIndexMap}; use rustc_middle::ty::TyCtxt; use rustc_span::def_id::DefId; @@ -21,14 +22,14 @@ use crate::html::render::{self, IndexItem, IndexItemFunctionType, RenderType, Re /// /// The `index` is a JSON-encoded list of names and other information. /// -/// The desc has newlined descriptions, split up by size into 1MiB shards. +/// The desc has newlined descriptions, split up by size into 128KiB shards. /// For example, `(4, "foo\nbar\nbaz\nquux")`. pub(crate) struct SerializedSearchIndex { pub(crate) index: String, pub(crate) desc: Vec<(usize, String)>, } -const DESC_INDEX_SHARD_LEN: usize = 1024 * 1024; +const DESC_INDEX_SHARD_LEN: usize = 128 * 1024; /// Builds the search index from the collected metadata pub(crate) fn build_index<'tcx>( @@ -342,6 +343,8 @@ pub(crate) fn build_index<'tcx>( // A list of shard lengths encoded as vlqhex. See the comment in write_vlqhex_to_string // for information on the format. descindex: String, + // A list of items with no description. This is eventually turned into a bitmap. + emptydesc: Vec, } struct Paths { @@ -456,7 +459,8 @@ pub(crate) fn build_index<'tcx>( } if item.deprecation.is_some() { - deprecated.push(index); + // bitmasks always use 1-indexing for items, with 0 as the crate itself + deprecated.push(u32::try_from(index + 1).unwrap()); } } @@ -473,9 +477,18 @@ pub(crate) fn build_index<'tcx>( crate_data.serialize_field("i", &parents)?; crate_data.serialize_field("f", &functions)?; crate_data.serialize_field("D", &self.descindex)?; - crate_data.serialize_field("c", &deprecated)?; crate_data.serialize_field("p", &paths)?; crate_data.serialize_field("b", &self.associated_item_disambiguators)?; + let mut buf = Vec::new(); + let mut strbuf = String::new(); + write_bitmap_to_bytes(&deprecated, &mut buf).unwrap(); + BASE64_STANDARD.encode_string(&buf, &mut strbuf); + crate_data.serialize_field("c", &strbuf)?; + strbuf.clear(); + buf.clear(); + write_bitmap_to_bytes(&self.emptydesc, &mut buf).unwrap(); + BASE64_STANDARD.encode_string(&buf, &mut strbuf); + crate_data.serialize_field("e", &strbuf)?; if has_aliases { crate_data.serialize_field("a", &self.aliases)?; } @@ -483,11 +496,18 @@ pub(crate) fn build_index<'tcx>( } } - let desc = { + let (emptydesc, desc) = { + let mut emptydesc = Vec::new(); let mut result = Vec::new(); let mut set = String::new(); let mut len: usize = 0; + let mut itemindex: u32 = 0; for desc in std::iter::once(&crate_doc).chain(crate_items.iter().map(|item| &item.desc)) { + if desc == "" { + emptydesc.push(itemindex); + itemindex += 1; + continue; + } if set.len() >= DESC_INDEX_SHARD_LEN { result.push((len, std::mem::replace(&mut set, String::new()))); len = 0; @@ -496,9 +516,10 @@ pub(crate) fn build_index<'tcx>( } set.push_str(&desc); len += 1; + itemindex += 1; } result.push((len, std::mem::replace(&mut set, String::new()))); - result + (emptydesc, result) }; let descindex = { @@ -509,7 +530,10 @@ pub(crate) fn build_index<'tcx>( descindex }; - assert_eq!(crate_items.len() + 1, desc.iter().map(|(len, _)| *len).sum::()); + assert_eq!( + crate_items.len() + 1, + desc.iter().map(|(len, _)| *len).sum::() + emptydesc.len() + ); // The index, which is actually used to search, is JSON // It uses `JSON.parse(..)` to actually load, since JSON @@ -523,6 +547,7 @@ pub(crate) fn build_index<'tcx>( aliases: &aliases, associated_item_disambiguators: &associated_item_disambiguators, descindex, + emptydesc, }) .expect("failed serde conversion") // All these `replace` calls are because we have to go through JS string for JSON content. @@ -571,6 +596,200 @@ pub(crate) fn write_vlqhex_to_string(n: i32, string: &mut String) { } } +// checked against roaring-rs in +// https://gitlab.com/notriddle/roaring-test +pub fn write_bitmap_to_bytes(domain: &[u32], mut out: impl std::io::Write) -> std::io::Result<()> { + // https://arxiv.org/pdf/1603.06549.pdf + let mut keys = Vec::::new(); + let mut containers = Vec::::new(); + enum Container { + /// number of ones, bits + Bits(Box<[u64; 1024]>), + /// list of entries + Array(Vec), + /// list of (start, len-1) + Run(Vec<(u16, u16)>), + } + impl Container { + fn popcount(&self) -> u32 { + match self { + Container::Bits(bits) => bits.iter().copied().map(|x| x.count_ones()).sum(), + Container::Array(array) => { + array.len().try_into().expect("array can't be bigger than 2**32") + } + Container::Run(runs) => { + runs.iter().copied().map(|(_, lenm1)| u32::from(lenm1) + 1).sum() + } + } + } + fn push(&mut self, value: u16) { + match self { + Container::Bits(bits) => bits[value as usize >> 6] |= 1 << (value & 0x3F), + Container::Array(array) => { + array.push(value); + if array.len() >= 4096 { + let array = std::mem::replace(array, Vec::new()); + *self = Container::Bits(Box::new([0; 1024])); + for value in array { + self.push(value); + } + } + } + Container::Run(runs) => { + if let Some(r) = runs.last_mut() + && r.0 + r.1 + 1 == value + { + r.1 += 1; + } else { + runs.push((value, 0)); + } + } + } + } + fn try_make_run(&mut self) -> bool { + match self { + Container::Bits(bits) => { + let mut r: u64 = 0; + for (i, chunk) in bits.iter().copied().enumerate() { + let next_chunk = + i.checked_add(1).and_then(|i| bits.get(i)).copied().unwrap_or(0); + r += !chunk & u64::from((chunk << 1).count_ones()); + r += !next_chunk & u64::from((chunk >> 63).count_ones()); + } + if (2 + 4 * r) < 8192 { + let bits = std::mem::replace(bits, Box::new([0; 1024])); + *self = Container::Run(Vec::new()); + for (i, bits) in bits.iter().copied().enumerate() { + if bits == 0 { + continue; + } + for j in 0..64 { + let value = (u16::try_from(i).unwrap() << 6) | j; + if bits & (1 << j) != 0 { + self.push(value); + } + } + } + true + } else { + false + } + } + Container::Array(array) if array.len() <= 5 => false, + Container::Array(array) => { + let mut r = 0; + let mut prev = None; + for value in array.iter().copied() { + if value.checked_sub(1) != prev { + r += 1; + } + prev = Some(value); + } + if 2 + 4 * r < 2 * array.len() + 2 { + let array = std::mem::replace(array, Vec::new()); + *self = Container::Run(Vec::new()); + for value in array { + self.push(value); + } + true + } else { + false + } + } + Container::Run(_) => true, + } + } + } + let mut key: u16; + let mut domain_iter = domain.into_iter().copied().peekable(); + let mut has_run = false; + while let Some(entry) = domain_iter.next() { + key = (entry >> 16).try_into().expect("shifted off the top 16 bits, so it should fit"); + let value: u16 = (entry & 0x00_00_FF_FF).try_into().expect("AND 16 bits, so it should fit"); + let mut container = Container::Array(vec![value]); + while let Some(entry) = domain_iter.peek().copied() { + let entry_key: u16 = + (entry >> 16).try_into().expect("shifted off the top 16 bits, so it should fit"); + if entry_key != key { + break; + } + domain_iter.next().expect("peeking just succeeded"); + container + .push((entry & 0x00_00_FF_FF).try_into().expect("AND 16 bits, so it should fit")); + } + keys.push(key); + has_run = container.try_make_run() || has_run; + containers.push(container); + } + // https://github.com/RoaringBitmap/RoaringFormatSpec + use byteorder::{WriteBytesExt, LE}; + const SERIAL_COOKIE_NO_RUNCONTAINER: u32 = 12346; + const SERIAL_COOKIE: u32 = 12347; + const NO_OFFSET_THRESHOLD: u32 = 4; + let size: u32 = containers.len().try_into().unwrap(); + let start_offset = if has_run { + out.write_u32::(SERIAL_COOKIE | ((size - 1) << 16))?; + for set in containers.chunks(8) { + let mut b = 0; + for (i, container) in set.iter().enumerate() { + if matches!(container, &Container::Run(..)) { + b |= 1 << i; + } + } + out.write_u8(b)?; + } + if size < NO_OFFSET_THRESHOLD { + 4 + 4 * size + ((size + 7) / 8) + } else { + 4 + 8 * size + ((size + 7) / 8) + } + } else { + out.write_u32::(SERIAL_COOKIE_NO_RUNCONTAINER)?; + out.write_u32::(containers.len().try_into().unwrap())?; + 4 + 4 + 4 * size + 4 * size + }; + for (&key, container) in keys.iter().zip(&containers) { + // descriptive header + let key: u32 = key.into(); + let count: u32 = container.popcount() - 1; + out.write_u32::((count << 16) | key)?; + } + if !has_run || size >= NO_OFFSET_THRESHOLD { + // offset header + let mut starting_offset = start_offset; + for container in &containers { + out.write_u32::(starting_offset)?; + starting_offset += match container { + Container::Bits(_) => 8192u32, + Container::Array(array) => u32::try_from(array.len()).unwrap() * 2, + Container::Run(runs) => 2 + u32::try_from(runs.len()).unwrap() * 4, + }; + } + } + for container in &containers { + match container { + Container::Bits(bits) => { + for chunk in bits.iter() { + out.write_u64::(*chunk)?; + } + } + Container::Array(array) => { + for value in array.iter() { + out.write_u16::(*value)?; + } + } + Container::Run(runs) => { + out.write_u16::((runs.len()).try_into().unwrap())?; + for (start, lenm1) in runs.iter().copied() { + out.write_u16::(start)?; + out.write_u16::(lenm1)?; + } + } + } + } + Ok(()) +} + pub(crate) fn get_function_type_for_search<'tcx>( item: &clean::Item, tcx: TyCtxt<'tcx>, diff --git a/src/librustdoc/html/static/js/search.js b/src/librustdoc/html/static/js/search.js index 15da5bf96b2..e70c5bfd734 100644 --- a/src/librustdoc/html/static/js/search.js +++ b/src/librustdoc/html/static/js/search.js @@ -242,6 +242,14 @@ function initSearch(rawSearchIndex) { * @type {Array} */ let searchIndex; + /** + * @type {Map} + */ + let searchIndexDeprecated; + /** + * @type {Map} + */ + let searchIndexEmptyDesc; /** * @type {Uint32Array} */ @@ -1326,7 +1334,6 @@ function initSearch(rawSearchIndex) { duplicates.add(obj.fullPath); obj.href = res[1]; - obj.desc = result.desc; out.push(obj); if (out.length >= MAX_RESULTS) { break; @@ -1353,12 +1360,6 @@ function initSearch(rawSearchIndex) { result.word = searchIndex[result.id].word; result_list.push(result); } - for (const result of result_list) { - result.desc = searchState.loadDesc(result.item); - } - for (const result of result_list) { - result.desc = await result.desc; - } result_list.sort((aaa, bbb) => { let a, b; @@ -1401,8 +1402,8 @@ function initSearch(rawSearchIndex) { } // sort deprecated items later - a = aaa.item.deprecated; - b = bbb.item.deprecated; + a = searchIndexDeprecated.get(aaa.item.crate).contains(aaa.item.bitIndex); + b = searchIndexDeprecated.get(bbb.item.crate).contains(bbb.item.bitIndex); if (a !== b) { return a - b; } @@ -1429,8 +1430,8 @@ function initSearch(rawSearchIndex) { } // sort by description (no description goes later) - a = (aaa.desc === ""); - b = (bbb.desc === ""); + a = searchIndexEmptyDesc.get(aaa.item.crate).contains(aaa.item.bitIndex); + b = searchIndexEmptyDesc.get(bbb.item.crate).contains(bbb.item.bitIndex); if (a !== b) { return a - b; } @@ -1453,7 +1454,16 @@ function initSearch(rawSearchIndex) { return 0; }); - return transformResults(result_list); + const transformed = transformResults(result_list); + for (const result of transformed) { + result.desc = searchIndexEmptyDesc.get(result.crate).contains(result.bitIndex) ? + "" : + searchState.loadDesc(result); + } + for (const result of transformed) { + result.desc = await result.desc; + } + return transformed; } /** @@ -2079,7 +2089,7 @@ function initSearch(rawSearchIndex) { parent: item.parent, type: item.type, is_alias: true, - deprecated: item.deprecated, + bitIndex: item.bitIndex, implDisambiguator: item.implDisambiguator, }; } @@ -2712,9 +2722,11 @@ ${item.displayPath}${name}\ currentResults = results.query.userQuery; - const ret_others = await addTab(results.others, results.query, true); - const ret_in_args = await addTab(results.in_args, results.query, false); - const ret_returned = await addTab(results.returned, results.query, false); + const [ret_others, ret_in_args, ret_returned] = await Promise.all([ + addTab(results.others, results.query, true), + addTab(results.in_args, results.query, false), + addTab(results.returned, results.query, false), + ]); // Navigate to the relevant tab if the current tab is empty, like in case users search // for "-> String". If they had selected another tab previously, they have to click on @@ -3267,6 +3279,123 @@ ${item.displayPath}${name}\ return result; } } + class RoaringBitmap { + constructor(str) { + const strdecoded = atob(str); + const u8array = new Uint8Array(strdecoded.length); + for (let j = 0; j < strdecoded.length; ++j) { + u8array[j] = strdecoded.charCodeAt(j); + } + const has_runs = u8array[0] === 0x3b; + const size = has_runs ? + ((u8array[2] | (u8array[3] << 8)) + 1) : + ((u8array[4] | (u8array[5] << 8) | (u8array[6] << 16) | (u8array[7] << 24))); + let i = has_runs ? 4 : 8; + let is_run; + if (has_runs) { + const is_run_len = Math.floor((size + 7) / 8); + is_run = u8array.slice(i, i + is_run_len); + i += is_run_len; + } else { + is_run = new Uint8Array(); + } + this.keys = []; + this.cardinalities = []; + for (let j = 0; j < size; ++j) { + this.keys.push(u8array[i] | (u8array[i + 1] << 8)); + i += 2; + this.cardinalities.push((u8array[i] | (u8array[i + 1] << 8)) + 1); + i += 2; + } + this.containers = []; + let offsets = null; + if (!has_runs || this.keys.length >= 4) { + offsets = []; + for (let j = 0; j < size; ++j) { + offsets.push(u8array[i] | (u8array[i + 1] << 8) | (u8array[i + 2] << 16) | + (u8array[i + 3] << 24)); + i += 4; + } + } + for (let j = 0; j < size; ++j) { + if (offsets && offsets[j] !== i) { + console.log(this.containers); + throw new Error(`corrupt bitmap ${j}: ${i} / ${offsets[j]}`); + } + if (is_run[j >> 3] & (1 << (j & 0x7))) { + const runcount = (u8array[i] | (u8array[i + 1] << 8)); + i += 2; + this.containers.push(new RoaringBitmapRun( + runcount, + u8array.slice(i, i + (runcount * 4)), + )); + i += runcount * 4; + } else if (this.cardinalities[j] >= 4096) { + this.containers.push(new RoaringBitmapBits(u8array.slice(i, i + 8192))); + i += 8192; + } else { + const end = this.cardinalities[j] * 2; + this.containers.push(new RoaringBitmapArray( + this.cardinalities[j], + u8array.slice(i, i + end), + )); + i += end; + } + } + } + contains(keyvalue) { + const key = keyvalue >> 16; + const value = keyvalue & 0xFFFF; + for (let i = 0; i < this.keys.length; ++i) { + if (this.keys[i] === key) { + return this.containers[i].contains(value); + } + } + return false; + } + } + + class RoaringBitmapRun { + constructor(runcount, array) { + this.runcount = runcount; + this.array = array; + } + contains(value) { + const l = this.runcount * 4; + for (let i = 0; i < l; i += 4) { + const start = this.array[i] | (this.array[i + 1] << 8); + const lenm1 = this.array[i + 2] | (this.array[i + 3] << 8); + if (value >= start && value <= (start + lenm1)) { + return true; + } + } + return false; + } + } + class RoaringBitmapArray { + constructor(cardinality, array) { + this.cardinality = cardinality; + this.array = array; + } + contains(value) { + const l = this.cardinality * 2; + for (let i = 0; i < l; i += 2) { + const start = this.array[i] | (this.array[i + 1] << 8); + if (value === start) { + return true; + } + } + return false; + } + } + class RoaringBitmapBits { + constructor(array) { + this.array = array; + } + contains(value) { + return !!(this.array[value >> 3] & (1 << (value & 7))); + } + } /** * Convert raw search index into in-memory search index. @@ -3275,6 +3404,8 @@ ${item.displayPath}${name}\ */ function buildIndex(rawSearchIndex) { searchIndex = []; + searchIndexDeprecated = new Map(); + searchIndexEmptyDesc = new Map(); const charA = "A".charCodeAt(0); let currentIndex = 0; let id = 0; @@ -3307,6 +3438,11 @@ ${item.displayPath}${name}\ }; const descShardList = [ descShard ]; + // Deprecated items and items with no description + searchIndexDeprecated.set(crate, new RoaringBitmap(crateCorpus.c)); + searchIndexEmptyDesc.set(crate, new RoaringBitmap(crateCorpus.e)); + let descIndex = 0; + // This object should have exactly the same set of fields as the "row" // object defined below. Your JavaScript runtime will thank you. // https://mathiasbynens.be/notes/shapes-ics @@ -3316,18 +3452,21 @@ ${item.displayPath}${name}\ name: crate, path: "", descShard, - descIndex: 0, + descIndex, parent: undefined, type: null, id, word: crate, normalizedName: crate.indexOf("_") === -1 ? crate : crate.replace(/_/g, ""), - deprecated: null, + bitIndex: 0, implDisambiguator: null, }; id += 1; searchIndex.push(crateRow); currentIndex += 1; + if (!searchIndexEmptyDesc.get(crate).contains(0)) { + descIndex += 1; + } // a String of one character item type codes const itemTypes = crateCorpus.t; @@ -3341,9 +3480,7 @@ ${item.displayPath}${name}\ const itemPaths = new Map(crateCorpus.q); // an array of (Number) the parent path index + 1 to `paths`, or 0 if none const itemParentIdxs = crateCorpus.i; - // an array of (Number) indices for the deprecated items - const deprecatedItems = new Set(crateCorpus.c); - // an array of (Number) indices for the deprecated items + // a map Number, string for impl disambiguators const implDisambiguator = new Map(crateCorpus.b); // an array of [(Number) item type, // (String) name] @@ -3388,9 +3525,10 @@ ${item.displayPath}${name}\ // faster analysis operations lastPath = ""; len = itemTypes.length; - let descIndex = 1; for (let i = 0; i < len; ++i) { - if (descIndex >= descShard.len) { + const bitIndex = i + 1; + if (descIndex >= descShard.len && + !searchIndexEmptyDesc.get(crate).contains(bitIndex)) { descShard = { crate, shard: descShard.shard + 1, @@ -3439,13 +3577,15 @@ ${item.displayPath}${name}\ id, word, normalizedName: word.indexOf("_") === -1 ? word : word.replace(/_/g, ""), - deprecated: deprecatedItems.has(i), + bitIndex, implDisambiguator: implDisambiguator.has(i) ? implDisambiguator.get(i) : null, }; id += 1; searchIndex.push(row); lastPath = row.path; - descIndex += 1; + if (!searchIndexEmptyDesc.get(crate).contains(bitIndex)) { + descIndex += 1; + } } if (aliases) { diff --git a/src/tools/rustdoc-js/tester.js b/src/tools/rustdoc-js/tester.js index 1af2f44c230..43a22f358c3 100644 --- a/src/tools/rustdoc-js/tester.js +++ b/src/tools/rustdoc-js/tester.js @@ -559,8 +559,3 @@ process.on("beforeExit", () => { console.log("process did not complete"); process.exit(1); }); - -/*process.on("uncaughtException", (err) => { - console.log(`Uncaught Exception: ${err.message}`); - process.exit(1); -});*/