406 lines
11 KiB
406 lines
11 KiB
// This code is a Qiskit project.
// (C) Copyright IBM 2024.
// This code is licensed under the Apache License, Version 2.0. You may
// obtain a copy of this license in the LICENSE file in the root directory
// of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
// Any modifications or derivative works of this code must retain this
// copyright notice, and modified files need to carry a notice indicating
// that they have been altered from the originals.
import { CheerioAPI, Cheerio, load, Element } from "cheerio";
import { Image } from "./HtmlToMdResult";
import { Metadata, ApiType } from "./Metadata";
import { processMdxComponent } from "./generateApiComponents";
/**
 * Result returned by `processHtml`.
 */
export type ProcessedHtml = {
// Inner HTML of the `[role='main']` element after all transformations.
html: string;
// API metadata (such as `apiType` and `apiName`) collected while processing.
meta: Metadata;
// Images collected by `loadImages`.
images: Image[];
// True when the source file name ends with "release_notes.html".
isReleaseNotes: boolean;
/**
 * Processes one HTML page: parses it with cheerio, applies a fixed sequence
 * of DOM transformations to its `[role='main']` element, and collects
 * metadata and images along the way.
 *
 * @param options.html - Raw HTML to parse.
 * @param options.fileName - Source file name; a name ending in
 *   "release_notes.html" marks the page as release notes.
 * @param options.imageDestination - Passed to `loadImages` as the prefix for
 *   rewritten image `src` attributes.
 * @param options.determineGithubUrl - Passed to
 *   `replaceViewcodeLinksWithGitHub` to map a source file name to a URL.
 * @param options.releaseNotesTitle - Passed to `renameAllH1s` for release
 *   notes pages.
 * @returns The inner HTML of the main element together with the collected
 *   metadata, images, and the release-notes flag.
 */
export async function processHtml(options: {
html: string;
fileName: string;
imageDestination: string;
determineGithubUrl: (fileName: string) => string;
releaseNotesTitle: string;
}): Promise<ProcessedHtml> {
const {
} = options;
const $ = load(html);
// Every transformation below is scoped to the element with role="main".
const $main = $(`[role='main']`);
// Release notes are detected purely from the file name suffix.
const isReleaseNotes = fileName.endsWith("release_notes.html");
const images = loadImages($, $main, imageDestination, isReleaseNotes);
if (isReleaseNotes) {
renameAllH1s($, releaseNotesTitle);
// Warning: the sequence of operations often matters.
removeHtmlExtensionsInRelativeLinks($, $main);
handleSphinxDesignCards($, $main);
addLanguageClassToCodeBlocks($, $main);
replaceViewcodeLinksWithGitHub($, $main, determineGithubUrl);
convertRubricsToHeaders($, $main);
processSimpleFieldLists($, $main);
preserveMathBlockWhitespace($, $main);
// Metadata starts empty; the two calls below may set `apiType` / `apiName`.
const meta: Metadata = {};
await processMembersAndSetMeta($, $main, meta);
maybeSetModuleMetadata($, $main, meta);
// Module pages get an extra pass over their headings.
if (meta.apiType === "module") {
updateModuleHeadings($, $main, meta);
// `html()` is asserted non-null here.
return { html: $main.html()!, meta, images, isReleaseNotes };
/**
 * Collects the images referenced by the page and rewrites their `src`
 * attributes to point into `imageDestination`.
 *
 * @param $ - Cheerio handle used to wrap individual elements.
 * @param $main - Root selection the images are taken from.
 *   NOTE(review): the selector applied to `$main` is not visible in this
 *   excerpt; presumably `img` elements — confirm.
 * @param imageDestination - Prefix joined with each image's file name.
 * @param isReleaseNotes - Enables the release-notes path rewrite below.
 * @returns One `{ fileName, dest }` entry per image.
 */
export function loadImages(
$: CheerioAPI,
$main: Cheerio<any>,
imageDestination: string,
isReleaseNotes: boolean,
): Image[] {
return $main
// Keep only elements that actually carry a `src` attribute.
.filter((img) => $(img).attr("src"))
.map((img) => {
const $img = $(img);
// The file name is the last `/`-separated segment of the original `src`.
const fileName = $img.attr("src")!.split("/").pop()!;
let dest = `${imageDestination}/${fileName}`;
if (isReleaseNotes) {
// Release notes links should point to the current version
// The regex is greedy: it removes everything from the first digit in
// `dest` through the last `/` that follows it.
dest = dest.replace(/[0-9].*\//, "");
$img.attr("src", dest);
return { fileName, dest };
/**
 * Strips ".html" from the `href` of every relative link under `$main`.
 *
 * A link counts as relative when its `href` is non-empty and does not start
 * with "http".
 *
 * @param $ - Cheerio handle used to wrap each anchor.
 * @param $main - Root selection whose `a` descendants are rewritten in place.
 */
export function removeHtmlExtensionsInRelativeLinks(
$: CheerioAPI,
$main: Cheerio<any>,
): void {
$main.find("a").each((_, link) => {
const $link = $(link);
const href = $link.attr("href");
if (href && !href.startsWith("http")) {
// `replaceAll` removes every ".html" occurrence, not only a trailing
// extension, so fragments and query strings are affected too.
$link.attr("href", href.replaceAll(".html", ""));
// Called by `processHtml` only for release notes pages, with the configured
// release notes title.
// NOTE(review): the body is not visible in this excerpt; judging by the name
// it sets every <h1> to `releaseNotesTitle` — confirm against the full file.
export function renameAllH1s($: CheerioAPI, releaseNotesTitle: string): void {
/**
 * Removes permalink anchors from `$main`.
 *
 * An anchor is removed when its `title` attribute is exactly
 * "<prefix> to this <suffix>" for one of the prefix/suffix pairs listed below.
 *
 * @param $main - Root selection that is mutated in place.
 */
export function removePermalinks($main: Cheerio<any>): void {
// Each pair expands to one title, e.g. "Permalink to this headline".
for (const [prefix, suffix] of [
["Permalink", "headline"],
["Permalink", "heading"],
["Permalink", "definition"],
["Link", "heading"],
["Link", "definition"],
]) {
$main.find(`a[title="${prefix} to this ${suffix}"]`).remove();
/**
 * Removes paragraphs that directly contain a download link.
 *
 * @param $main - Root selection that is mutated in place.
 */
export function removeDownloadSourceCode($main: Cheerio<any>): void {
// Match `a.reference.download.internal` anchors that are direct children of
// a <p>, then remove the enclosing <p> rather than only the anchor.
$main.find("p > a.reference.download.internal").closest("p").remove();
// Targets figure captions that contain a download link.
// NOTE(review): the receiver of the chain and the action applied to the
// matched captions are not visible in this excerpt; presumably the captions
// are removed from `$main` — confirm against the full file.
export function removeMatplotlibFigCaptions($main: Cheerio<any>): void {
// Candidate captions: <figcaption> elements and `p.caption` inside `div.figure`.
.find("figcaption, div.figure p.caption")
// Keep only captions with a download link inside `span.caption-text`.
.has("span.caption-text a.download.internal.reference")
/**
 * Flattens out sphinx-design cards, which are normally collapsible.
 * Sets the card summary as a header and removes the blockquote from the body.
 *
 * This is only used by the historical API docs for qiskit-ibm-runtime. We disabled sphinx-design
 * for every project moving forward.
 */
export function handleSphinxDesignCards(
$: CheerioAPI,
$main: Cheerio<any>,
): void {
// First pass: every card summary title (`.sd-summary-title`). The callback
// parameter is named `quote` although it holds a summary title element.
$main.find(".sd-summary-title").each((_, quote) => {
const $quote = $(quote);
// Second pass: every <blockquote> inside a card body (`.sd-card-body`).
$main.find(".sd-card-body blockquote").each((_, quote) => {
const $quote = $(quote);
/**
 * Rebuilds each `<pre>` under `$main` so its contents sit inside a
 * `<code class="language-python">` element.
 *
 * The language class is hard-coded: every code block is labelled as Python.
 *
 * @param $ - Cheerio handle used to wrap each `<pre>`.
 * @param $main - Root selection whose `<pre>` descendants are visited.
 */
export function addLanguageClassToCodeBlocks(
$: CheerioAPI,
$main: Cheerio<any>,
): void {
$main.find("pre").each((_, pre) => {
const $pre = $(pre);
// The original inner HTML is embedded as-is in the new markup.
// NOTE(review): the call that consumes this template is not visible in this
// excerpt; presumably it replaces the original <pre> — confirm.
`<pre><code class="language-python">${$pre.html()}</code></pre>`,
/**
 * Redirect URLs from `sphinx.ext.viewcode` to instead go to GitHub.
 *
 * These URLs will only go to the overall source code file, not the specific lines
 * of code. This function only changes the URLs; the DOM still needs to be modified
 * to replace the original `[source]` anchor element from Sphinx with our own `GitHub`
 * anchor element in the correct location.
 *
 * This does not impact links from `sphinx.ext.linkcode`.
 */
export function replaceViewcodeLinksWithGitHub(
$: CheerioAPI,
$main: Cheerio<any>,
determineGithubUrl: (fileName: string) => string,
): void {
$main.find("a").each((_, a) => {
const $a = $(a);
const href = $a.attr("href");
// Guard on the href: anchors without one, or with an `http:` URL, are
// tested here.
// NOTE(review): the rest of this condition and the guard's body are not
// visible in this excerpt; presumably such anchors are skipped — confirm.
if (
href === undefined ||
href.startsWith("http:") ||
) {
// E.g. `qiskit_ibm_runtime/ibm_backend`
// Captures the path after `_modules/`, stopping before a `#` fragment or at
// the end of the string. The non-null assertion means an href without
// `_modules/` would throw here at runtime.
const fullFileName = href.match(/_modules\/(.*?)(#|$)/)![1];
$a.attr("href", determineGithubUrl(fullFileName));
/**
 * Chooses a replacement tag for each `.rubric` element under `$main` from
 * its inner HTML.
 *
 * NOTE(review): the statement that applies the chosen tag is not visible in
 * this excerpt; presumably the rubric is replaced by an element of that tag.
 *
 * @param $ - Cheerio handle used to wrap each rubric element.
 * @param $main - Root selection whose `.rubric` descendants are visited.
 */
export function convertRubricsToHeaders(
$: CheerioAPI,
$main: Cheerio<any>,
): void {
// A rubric is "a paragraph heading that is not used to create a table
// of contents node". Depending on the heading, this should be either <h2> or
// <strong>
function appropriateHtmlTag(html: string | null) {
// `String(null)` yields "null", which matches none of the headings below
// and therefore maps to "strong".
html = String(html);
// Only these three exact headings become <h2>; the comparison is against
// the raw inner HTML, so any extra markup or whitespace gives "strong".
return html == "Methods" ||
html == "Methods Defined Here" ||
html == "Attributes"
? "h2"
: "strong";
$main.find(".rubric").each((_, el) => {
const $el = $(el);
const tag = appropriateHtmlTag($el.html());
// Processes simple field lists under `$main`.
// NOTE(review): the selectors and the transformation applied to each element
// are not visible in this excerpt; only the iteration skeleton remains.
export function processSimpleFieldLists(
$: CheerioAPI,
$main: Cheerio<any>,
): void {
// TODO(#479): Have a better understanding of what dl.field-list.simple corresponds to
// and confirm this behavior makes sense.
// Outer iteration: one callback per matched list (named `dl`).
.map((dl) => {
const $dl = $(dl);
// Inner iteration over elements named `dt`.
.forEach((dt) => {
const $dt = $(dt);
// Inner iteration over elements named `dd`.
.forEach((dd) => {
const $dd = $(dd);
// Operates on `$main` only; no `$` handle is passed in.
// NOTE(review): the body is not visible in this excerpt; judging by the name
// it strips colon `<span>` elements — confirm against the full file.
export function removeColonSpans($main: Cheerio<any>): void {
/**
 * Processes Python API member definitions (`dl.py.*`) under `$main` one at a
 * time and records the page's API type in `meta`.
 *
 * NOTE(review): several statements of this function are not visible in this
 * excerpt (how the next `dl` is selected, how signatures and body elements
 * are collected, and how the member is rewritten); the comments below cover
 * only what is shown.
 *
 * @param $ - Cheerio handle used to wrap elements.
 * @param $main - Root selection searched for member definitions.
 * @param meta - Metadata object mutated in place.
 */
export async function processMembersAndSetMeta(
$: CheerioAPI,
$main: Cheerio<any>,
meta: Metadata,
): Promise<void> {
// Loop flag: cleared once no matching member definition is left.
let continueMapMembers = true;
while (continueMapMembers) {
// members can be recursive, so we need to pick elements one by one
const dl = $main
"dl.py.class, dl.py.property, dl.py.method, dl.py.attribute, dl.py.function, dl.py.exception",
if (!dl) {
continueMapMembers = false;
const $dl = $(dl);
// The member id comes from the `id` attribute of its <dt>; it falls back to
// the empty string when that attribute is missing or empty.
const id = $dl.find("dt").attr("id") || "";
const apiType = getApiType($dl);
// When no API type has been recorded yet, this member's type is stored in
// `meta` (presumably `apiName` is set under the same guard — confirm).
const priorApiType = meta.apiType;
if (!priorApiType) {
meta.apiType = apiType;
meta.apiName = id;
const bodyElements: string[] = [];
const signatures: Cheerio<Element>[] = [];
// Walk the direct children of the member's <dl>.
for (const child of $dl.children().toArray()) {
const $child = $(child);
// Branch for children that are not <dt>, or when the API type is unknown.
if (child.name !== "dt" || !apiType) {
// Distinguishes members for which no signature was collected.
if (signatures.length == 0) {
} else {
// The MDX component helper returns an opening and a closing tag.
const [openTag, closeTag] = await processMdxComponent(
/**
 * Marks the page as a module page when `$main` contains an element whose id
 * starts with "module-".
 *
 * Only `span`, `section` and `div.section` elements are considered, and the
 * first matching id wins. When a match is found, `meta.apiType` and
 * `meta.apiName` are overwritten unconditionally, replacing anything set
 * earlier.
 *
 * @param $ - Cheerio handle used to wrap elements.
 * @param $main - Root selection searched for the module id.
 * @param meta - Metadata object mutated in place when a module id is found.
 */
export function maybeSetModuleMetadata(
$: CheerioAPI,
$main: Cheerio<any>,
meta: Metadata,
): void {
const modulePrefix = "module-";
const moduleIdWithPrefix = $main
.find("span, section, div.section")
.map((el) => $(el).attr("id"))
// Elements without an id yield `undefined`, hence the optional chaining.
.find((id) => id?.startsWith(modulePrefix));
if (moduleIdWithPrefix) {
meta.apiType = "module";
// The API name is the id with the "module-" prefix stripped.
meta.apiName = moduleIdWithPrefix.slice(modulePrefix.length);
// Re-wraps each matched element's inner HTML in `<pre class="math">`.
// NOTE(review): the selector that picks the elements is not visible in this
// excerpt; presumably it targets math blocks under `$main` — confirm.
export function preserveMathBlockWhitespace(
$: CheerioAPI,
$main: Cheerio<any>,
): void {
.map((el) => {
const $el = $(el);
// The element itself is replaced; only its inner HTML is carried over.
$el.replaceWith(`<pre class="math">${$el.html()}</pre>`);
// Builds replacement markup for headings on module pages; `processHtml` calls
// this only when `meta.apiType` is "module".
// NOTE(review): the selector that picks the headings and the statement that
// applies `replacement` are not visible in this excerpt.
export function updateModuleHeadings(
$: CheerioAPI,
$main: Cheerio<any>,
meta: Metadata,
): void {
.forEach((el) => {
const $el = $(el);
// The signature is the text of the anchor(s) inside the heading.
const $a = $($el.find("a"));
const signature = $a.text();
let title = $el.text();
// A string pattern replaces only the first "()" occurrence.
title = title.replace("()", "");
// Rebuild the heading with the same tag name but plain text content.
let replacement = `<${el.tagName}>${title}</${el.tagName}>`;
if (signature.trim().length > 0) {
// Append a `module-<apiName>` target span plus the signature as inline code.
replacement += `<span class="target" id="module-${meta.apiName}" /><p><code>${signature}</code></p>`;
/**
 * Determines the API type of a member definition list.
 *
 * @param $dl - Selection wrapping the member's `<dl>` element.
 * @returns "property" when the signature carries the property modifier;
 *   otherwise the first candidate class name present on `$dl`; `undefined`
 *   when nothing matches.
 */
function getApiType($dl: Cheerio<any>): ApiType | undefined {
// Historical versions were generating properties incorrectly as methods.
// We can fix this by looking at the modifier before the signature.
// See https://github.com/Qiskit/documentation/issues/1352 for more information.
if (hasPropertyModifier($dl)) {
return "property";
// NOTE(review): the list of candidate class names is not visible in this
// excerpt.
for (const className of [
]) {
if ($dl.hasClass(className)) {
// The CSS class name doubles as the `ApiType` value.
return className as ApiType;
return undefined;
/**
 * Reports whether a member's signature is marked with the `property` modifier.
 *
 * @param $dl - Selection wrapping the member's `<dl>` element.
 * @returns True when the trimmed text of the `em.property` element(s) inside
 *   the member's `<dt>` is exactly "property".
 */
function hasPropertyModifier($dl: Cheerio<any>): boolean {
const rawModifiers = $dl.find("dt").find("em.property");
// Surrounding whitespace is ignored; any other modifier text does not match.
const modifiers = rawModifiers.text().trim();
return modifiers == "property";