Show diff between 2 CI runs on Slack reports (#22798)
fix Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
This commit is contained in:
parent
5f97bbc124
commit
648bd5a8aa
|
@ -487,12 +487,23 @@ jobs:
|
|||
CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
|
||||
ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
|
||||
CI_EVENT: scheduled
|
||||
CI_SHA: ${{ github.sha }}
|
||||
CI_WORKFLOW_REF: ${{ github.workflow_ref }}
|
||||
RUNNER_STATUS: ${{ needs.check_runner_status.result }}
|
||||
RUNNER_ENV_STATUS: ${{ needs.check_runners.result }}
|
||||
SETUP_STATUS: ${{ needs.setup.result }}
|
||||
# We pass `needs.setup.outputs.matrix` as the argument. Some processing in `notification_service.py`
# is required to change `models/bert` to `models_bert`, as the artifact names use `_` instead of `/`.
|
||||
run: |
|
||||
sudo apt-get install -y curl
|
||||
pip install slack_sdk
|
||||
pip show slack_sdk
|
||||
python utils/notification_service.py "${{ needs.setup.outputs.matrix }}"
|
||||
|
||||
# Upload complete failure tables, as they might be big and only truncated versions could be sent to Slack.
|
||||
- name: Failure table artifacts
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v3
|
||||
with:
|
||||
name: test_failure_tables
|
||||
path: test_failure_tables
|
||||
|
|
|
@ -2,7 +2,6 @@ import argparse
|
|||
import json
|
||||
import math
|
||||
import os
|
||||
import subprocess
|
||||
import time
|
||||
import traceback
|
||||
import zipfile
|
||||
|
@ -70,19 +69,16 @@ def download_artifact(artifact_name, artifact_url, output_dir, token):
|
|||
but it can't be used to download directly. We need to get a redirect URL first.
|
||||
See https://docs.github.com/en/rest/actions/artifacts#download-an-artifact
|
||||
"""
|
||||
# Get the redirect URL first
|
||||
cmd = f'curl -v -H "Accept: application/vnd.github+json" -H "Authorization: Bearer {token}" {artifact_url}'
|
||||
output = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
|
||||
o = output.stdout.decode("utf-8")
|
||||
lines = o.splitlines()
|
||||
headers = None
|
||||
if token is not None:
|
||||
headers = {"Accept": "application/vnd.github+json", "Authorization": f"Bearer {token}"}
|
||||
|
||||
for line in lines:
|
||||
if line.startswith("< Location: "):
|
||||
redirect_url = line[len("< Location: ") :]
|
||||
r = requests.get(redirect_url, allow_redirects=True)
|
||||
p = os.path.join(output_dir, f"{artifact_name}.zip")
|
||||
open(p, "wb").write(r.content)
|
||||
break
|
||||
result = requests.get(artifact_url, headers=headers, allow_redirects=False)
|
||||
download_url = result.headers["Location"]
|
||||
response = requests.get(download_url, allow_redirects=True)
|
||||
file_path = os.path.join(output_dir, f"{artifact_name}.zip")
|
||||
with open(file_path, "wb") as fp:
|
||||
fp.write(response.content)
|
||||
|
||||
|
||||
def get_errors_from_single_artifact(artifact_zip_path, job_links=None):
|
||||
|
|
|
@ -0,0 +1,70 @@
|
|||
import os
|
||||
import zipfile
|
||||
|
||||
import requests
|
||||
from get_ci_error_statistics import download_artifact, get_artifacts_links
|
||||
|
||||
|
||||
def get_daily_ci_runs(token, num_runs=7):
    """Fetch the most recent workflow runs of the scheduled (daily) CI.

    Only runs triggered by the `schedule` event on the `main` branch are returned.
    """
    request_headers = None
    if token is not None:
        request_headers = {"Accept": "application/vnd.github+json", "Authorization": f"Bearer {token}"}

    # Identifier of the workflow itself (not of an individual workflow run).
    workflow_id = "636036"

    # Restrict to `main` branch, `schedule` events, no PRs, and at most `num_runs` results.
    url = (
        f"https://api.github.com/repos/huggingface/transformers/actions/workflows/{workflow_id}/runs"
        f"?branch=main&event=schedule&exclude_pull_requests=true&per_page={num_runs}"
    )

    return requests.get(url, headers=request_headers).json()["workflow_runs"]
|
||||
|
||||
|
||||
def get_last_daily_ci_runs(token):
    """Return the id of the most recent completed scheduled (daily) CI run, or None if none completed."""
    completed_ids = (run["id"] for run in get_daily_ci_runs(token) if run["status"] == "completed")
    return next(completed_ids, None)
|
||||
|
||||
|
||||
def get_last_daily_ci_artifacts(artifact_names, output_dir, token):
    """Download the requested artifacts of the last completed scheduled (daily) CI run into `output_dir`."""
    run_id = get_last_daily_ci_runs(token)
    if run_id is None:
        return

    # NOTE(review): `worflow_run_id` (sic) mirrors the parameter name declared by
    # `get_artifacts_links` — do not "fix" the spelling here without fixing the callee.
    links = get_artifacts_links(worflow_run_id=run_id, token=token)
    for name in artifact_names:
        # Silently skip artifacts that the run did not produce.
        if name in links:
            download_artifact(artifact_name=name, artifact_url=links[name], output_dir=output_dir, token=token)
|
||||
|
||||
|
||||
def get_last_daily_ci_reports(artifact_names, output_dir, token):
    """Download and read the contents of the last completed daily CI's artifacts.

    Returns a mapping {artifact name -> {member filename -> decoded UTF-8 text}};
    artifacts whose zip file is absent map to an empty dict.
    """
    get_last_daily_ci_artifacts(artifact_names, output_dir, token)

    reports = {name: {} for name in artifact_names}
    for name in artifact_names:
        zip_path = os.path.join(output_dir, f"{name}.zip")
        if not os.path.isfile(zip_path):
            continue
        with zipfile.ZipFile(zip_path) as archive:
            for member in archive.namelist():
                # Skip directory entries; read each regular member as UTF-8 text.
                if not os.path.isdir(member):
                    with archive.open(member) as fh:
                        reports[name][member] = fh.read().decode("UTF-8")

    return reports
|
|
@ -25,6 +25,7 @@ from typing import Dict, List, Optional, Union
|
|||
|
||||
import requests
|
||||
from get_ci_error_statistics import get_job_links
|
||||
from get_previous_daily_ci import get_last_daily_ci_reports
|
||||
from slack_sdk import WebClient
|
||||
|
||||
|
||||
|
@ -274,6 +275,43 @@ class Message:
|
|||
|
||||
return {"type": "section", "text": {"type": "mrkdwn", "text": category_failures_report}}
|
||||
|
||||
def compute_diff_for_failure_reports(self, curr_failure_report, prev_failure_report):  # noqa
    """Compute per-model failure-count differences between two failure report tables.

    Each report is a text table with 3 header lines, one `count | ... | name` row
    per model, and 2 trailing lines. Returns formatted rows (sorted by model name)
    for models that are new in `curr_failure_report` or whose failure counts
    increased relative to `prev_failure_report`.
    """
    # Remove the leading and trailing parts that don't contain failure count information.
    model_failures = curr_failure_report.split("\n")[3:-2]
    prev_model_failures = prev_failure_report.split("\n")[3:-2]
    # Rows present now that were not (identically) in the previous report.
    entries_changed = set(model_failures).difference(prev_model_failures)

    # Map model name -> list of failure counts, for every previous row.
    prev_map = {}
    for f in prev_model_failures:
        items = [x.strip() for x in f.split("| ")]
        prev_map[items[-1]] = [int(x) for x in items[:-1]]

    # Same mapping, but only for the current rows that changed.
    curr_map = {}
    for f in entries_changed:
        items = [x.strip() for x in f.split("| ")]
        curr_map[items[-1]] = [int(x) for x in items[:-1]]

    # Keep new models as-is; for known models keep only those with some increase.
    diff_map = {}
    for k, v in curr_map.items():
        if k not in prev_map:
            diff_map[k] = v
        else:
            diff = [x - y for x, y in zip(v, prev_map[k])]
            if max(diff) > 0:
                diff_map[k] = diff

    # Format each row like the source tables: right-aligned signed counts, then the model name.
    entries_changed = []
    for model_name, diff_values in diff_map.items():
        diff = [str(x) for x in diff_values]
        diff = [f"+{x}" if (x != "0" and not x.startswith("-")) else x for x in diff]
        diff = [x.rjust(9) for x in diff]
        device_report = " | ".join(diff) + " | "
        report = f"{device_report}{model_name}"
        entries_changed.append(report)
    entries_changed = sorted(entries_changed, key=lambda s: s.split("| ")[-1])

    return entries_changed
|
||||
|
||||
@property
|
||||
def model_failures(self) -> Dict:
|
||||
# Obtain per-model failures
|
||||
|
@ -331,44 +369,86 @@ class Message:
|
|||
|
||||
model_reports.append(report)
|
||||
|
||||
# (Possibly truncated) reports for the current workflow run - to be sent to Slack channels
|
||||
model_header = "Single PT | Multi PT | Single TF | Multi TF | Other | Category\n"
|
||||
sorted_model_reports = sorted(model_reports, key=lambda s: s.split("] ")[-1])
|
||||
sorted_model_reports = sorted(model_reports, key=lambda s: s.split("| ")[-1])
|
||||
model_failures_report = prepare_reports(
|
||||
title="These following model modules had failures", header=model_header, reports=sorted_model_reports
|
||||
)
|
||||
|
||||
module_header = "Single | Multi | Category\n"
|
||||
sorted_module_reports = sorted(other_module_reports, key=lambda s: s.split("] ")[-1])
|
||||
sorted_module_reports = sorted(other_module_reports, key=lambda s: s.split("| ")[-1])
|
||||
module_failures_report = prepare_reports(
|
||||
title="The following non-model modules had failures", header=module_header, reports=sorted_module_reports
|
||||
)
|
||||
|
||||
# To be sent to Slack channels
|
||||
model_failure_sections = [
|
||||
{"type": "section", "text": {"type": "mrkdwn", "text": model_failures_report}},
|
||||
{"type": "section", "text": {"type": "mrkdwn", "text": module_failures_report}},
|
||||
]
|
||||
|
||||
# Save complete tables (for past CI) - to be uploaded as artifacts
|
||||
if ci_event.startswith("Past CI"):
|
||||
model_failures_report = prepare_reports(
|
||||
title="These following model modules had failures",
|
||||
header=model_header,
|
||||
reports=sorted_model_reports,
|
||||
to_truncate=False,
|
||||
)
|
||||
file_path = os.path.join(os.getcwd(), "test_failure_tables/model_failures_report.txt")
|
||||
with open(file_path, "w", encoding="UTF-8") as fp:
|
||||
fp.write(model_failures_report)
|
||||
# Save the complete (i.e. no truncation) failure tables (of the current workflow run)
|
||||
# (to be uploaded as artifacts)
|
||||
if not os.path.isdir(os.path.join(os.getcwd(), "test_failure_tables")):
|
||||
os.makedirs(os.path.join(os.getcwd(), "test_failure_tables"))
|
||||
|
||||
module_failures_report = prepare_reports(
|
||||
title="The following non-model modules had failures",
|
||||
header=module_header,
|
||||
reports=sorted_module_reports,
|
||||
to_truncate=False,
|
||||
model_failures_report = prepare_reports(
|
||||
title="These following model modules had failures",
|
||||
header=model_header,
|
||||
reports=sorted_model_reports,
|
||||
to_truncate=False,
|
||||
)
|
||||
file_path = os.path.join(os.getcwd(), "test_failure_tables/model_failures_report.txt")
|
||||
with open(file_path, "w", encoding="UTF-8") as fp:
|
||||
fp.write(model_failures_report)
|
||||
|
||||
module_failures_report = prepare_reports(
|
||||
title="The following non-model modules had failures",
|
||||
header=module_header,
|
||||
reports=sorted_module_reports,
|
||||
to_truncate=False,
|
||||
)
|
||||
file_path = os.path.join(os.getcwd(), "test_failure_tables/module_failures_report.txt")
|
||||
with open(file_path, "w", encoding="UTF-8") as fp:
|
||||
fp.write(module_failures_report)
|
||||
|
||||
target_workflow = "huggingface/transformers/.github/workflows/self-scheduled.yml@refs/heads/main"
|
||||
if os.environ.get("CI_WORKFLOW_REF") == target_workflow:
|
||||
# Get the last previously completed CI's failure tables
|
||||
artifact_names = ["test_failure_tables"]
|
||||
output_dir = os.path.join(os.getcwd(), "previous_reports")
|
||||
os.makedirs(output_dir, exist_ok=True)
|
||||
prev_tables = get_last_daily_ci_reports(
|
||||
artifact_names=artifact_names, output_dir=output_dir, token=os.environ["ACCESS_REPO_INFO_TOKEN"]
|
||||
)
|
||||
file_path = os.path.join(os.getcwd(), "test_failure_tables/module_failures_report.txt")
|
||||
with open(file_path, "w", encoding="UTF-8") as fp:
|
||||
fp.write(module_failures_report)
|
||||
|
||||
# The last run may not produce `test_failure_tables` (due to some issue, or because it had no model failure at all)
|
||||
if len(prev_tables) > 0:
|
||||
# Compute the difference of the previous/current (model failure) table
|
||||
prev_model_failures = prev_tables["test_failure_tables"]["model_failures_report.txt"]
|
||||
entries_changed = self.compute_diff_for_failure_reports(model_failures_report, prev_model_failures)
|
||||
if len(entries_changed) > 0:
|
||||
# Save the complete difference
|
||||
diff_report = prepare_reports(
|
||||
title="Changed model modules failures",
|
||||
header=model_header,
|
||||
reports=entries_changed,
|
||||
to_truncate=False,
|
||||
)
|
||||
file_path = os.path.join(os.getcwd(), "test_failure_tables/changed_model_failures_report.txt")
|
||||
with open(file_path, "w", encoding="UTF-8") as fp:
|
||||
fp.write(diff_report)
|
||||
|
||||
# To be sent to Slack channels
|
||||
diff_report = prepare_reports(
|
||||
title="*Changed model modules failures*",
|
||||
header=model_header,
|
||||
reports=entries_changed,
|
||||
)
|
||||
model_failure_sections.append(
|
||||
{"type": "section", "text": {"type": "mrkdwn", "text": diff_report}},
|
||||
)
|
||||
|
||||
return model_failure_sections
|
||||
|
||||
|
@ -487,14 +567,15 @@ class Message:
|
|||
)
|
||||
|
||||
def post(self):
|
||||
payload = self.payload
|
||||
print("Sending the following payload")
|
||||
print(json.dumps({"blocks": json.loads(self.payload)}))
|
||||
print(json.dumps({"blocks": json.loads(payload)}))
|
||||
|
||||
text = f"{self.n_failures} failures out of {self.n_tests} tests," if self.n_failures else "All tests passed."
|
||||
|
||||
self.thread_ts = client.chat_postMessage(
|
||||
channel=os.environ["CI_SLACK_REPORT_CHANNEL_ID"],
|
||||
blocks=self.payload,
|
||||
blocks=payload,
|
||||
text=text,
|
||||
)
|
||||
|
||||
|
@ -748,6 +829,9 @@ if __name__ == "__main__":
|
|||
else:
|
||||
ci_title = f"<{ci_url}|{ci_title}>\nAuthor: {ci_author} | Merged by: {merged_by}"
|
||||
|
||||
elif ci_sha:
|
||||
ci_title = f"<{ci_url}|commit: {ci_sha}>"
|
||||
|
||||
else:
|
||||
ci_title = ""
|
||||
|
||||
|
|
Loading…
Reference in New Issue