Show diff between 2 CI runs on Slack reports (#22798)

fix

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
This commit is contained in:
Yih-Dar 2023-04-19 19:27:37 +02:00 committed by GitHub
parent 5f97bbc124
commit 648bd5a8aa
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 197 additions and 36 deletions

View File

@ -487,12 +487,23 @@ jobs:
CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
CI_EVENT: scheduled
CI_SHA: ${{ github.sha }}
CI_WORKFLOW_REF: ${{ github.workflow_ref }}
RUNNER_STATUS: ${{ needs.check_runner_status.result }}
RUNNER_ENV_STATUS: ${{ needs.check_runners.result }}
SETUP_STATUS: ${{ needs.setup.result }}
# We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change
# `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
run: |
sudo apt-get install -y curl
pip install slack_sdk
pip show slack_sdk
python utils/notification_service.py "${{ needs.setup.outputs.matrix }}"
# Upload complete failure tables, as they might be big and only truncated versions could be sent to Slack.
- name: Failure table artifacts
if: ${{ always() }}
uses: actions/upload-artifact@v3
with:
name: test_failure_tables
path: test_failure_tables

View File

@ -2,7 +2,6 @@ import argparse
import json
import math
import os
import subprocess
import time
import traceback
import zipfile
@ -70,19 +69,16 @@ def download_artifact(artifact_name, artifact_url, output_dir, token):
but it can't be used to download directly. We need to get a redirect URL first.
See https://docs.github.com/en/rest/actions/artifacts#download-an-artifact
"""
# Get the redirect URL first
cmd = f'curl -v -H "Accept: application/vnd.github+json" -H "Authorization: Bearer {token}" {artifact_url}'
output = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
o = output.stdout.decode("utf-8")
lines = o.splitlines()
headers = None
if token is not None:
headers = {"Accept": "application/vnd.github+json", "Authorization": f"Bearer {token}"}
for line in lines:
if line.startswith("< Location: "):
redirect_url = line[len("< Location: ") :]
r = requests.get(redirect_url, allow_redirects=True)
p = os.path.join(output_dir, f"{artifact_name}.zip")
open(p, "wb").write(r.content)
break
result = requests.get(artifact_url, headers=headers, allow_redirects=False)
download_url = result.headers["Location"]
response = requests.get(download_url, allow_redirects=True)
file_path = os.path.join(output_dir, f"{artifact_name}.zip")
with open(file_path, "wb") as fp:
fp.write(response.content)
def get_errors_from_single_artifact(artifact_zip_path, job_links=None):

View File

@ -0,0 +1,70 @@
import os
import zipfile
import requests
from get_ci_error_statistics import download_artifact, get_artifacts_links
def get_daily_ci_runs(token, num_runs=7):
"""Get the workflow runs of the scheduled (daily) CI.
This only selects the runs triggered by the `schedule` event on the `main` branch.
"""
headers = None
if token is not None:
headers = {"Accept": "application/vnd.github+json", "Authorization": f"Bearer {token}"}
# The id of a workflow (not of a workflow run)
workflow_id = "636036"
url = f"https://api.github.com/repos/huggingface/transformers/actions/workflows/{workflow_id}/runs"
# On `main` branch + event being `schedule` + not returning PRs + only `num_runs` results
url += f"?branch=main&event=schedule&exclude_pull_requests=true&per_page={num_runs}"
result = requests.get(url, headers=headers).json()
return result["workflow_runs"]
def get_last_daily_ci_runs(token):
"""Get the last completed workflow run id of the scheduled (daily) CI."""
workflow_runs = get_daily_ci_runs(token)
workflow_run_id = None
for workflow_run in workflow_runs:
if workflow_run["status"] == "completed":
workflow_run_id = workflow_run["id"]
break
return workflow_run_id
def get_last_daily_ci_artifacts(artifact_names, output_dir, token):
"""Get the artifacts of last completed workflow run id of the scheduled (daily) CI."""
workflow_run_id = get_last_daily_ci_runs(token)
if workflow_run_id is not None:
artifacts_links = get_artifacts_links(worflow_run_id=workflow_run_id, token=token)
for artifact_name in artifact_names:
if artifact_name in artifacts_links:
artifact_url = artifacts_links[artifact_name]
download_artifact(
artifact_name=artifact_name, artifact_url=artifact_url, output_dir=output_dir, token=token
)
def get_last_daily_ci_reports(artifact_names, output_dir, token):
"""Get the artifacts' content of the last completed workflow run id of the scheduled (daily) CI."""
get_last_daily_ci_artifacts(artifact_names, output_dir, token)
results = {}
for artifact_name in artifact_names:
results[artifact_name] = {}
artifact_zip_path = os.path.join(output_dir, f"{artifact_name}.zip")
if os.path.isfile(artifact_zip_path):
with zipfile.ZipFile(artifact_zip_path) as z:
for filename in z.namelist():
if not os.path.isdir(filename):
# read the file
with z.open(filename) as f:
results[artifact_name][filename] = f.read().decode("UTF-8")
return results

View File

@ -25,6 +25,7 @@ from typing import Dict, List, Optional, Union
import requests
from get_ci_error_statistics import get_job_links
from get_previous_daily_ci import get_last_daily_ci_reports
from slack_sdk import WebClient
@ -274,6 +275,43 @@ class Message:
return {"type": "section", "text": {"type": "mrkdwn", "text": category_failures_report}}
def compute_diff_for_failure_reports(self, curr_failure_report, prev_failure_report): # noqa
# Remove the leading and training parts that don't contain failure count information.
model_failures = curr_failure_report.split("\n")[3:-2]
prev_model_failures = prev_failure_report.split("\n")[3:-2]
entries_changed = set(model_failures).difference(prev_model_failures)
prev_map = {}
for f in prev_model_failures:
items = [x.strip() for x in f.split("| ")]
prev_map[items[-1]] = [int(x) for x in items[:-1]]
curr_map = {}
for f in entries_changed:
items = [x.strip() for x in f.split("| ")]
curr_map[items[-1]] = [int(x) for x in items[:-1]]
diff_map = {}
for k, v in curr_map.items():
if k not in prev_map:
diff_map[k] = v
else:
diff = [x - y for x, y in zip(v, prev_map[k])]
if max(diff) > 0:
diff_map[k] = diff
entries_changed = []
for model_name, diff_values in diff_map.items():
diff = [str(x) for x in diff_values]
diff = [f"+{x}" if (x != "0" and not x.startswith("-")) else x for x in diff]
diff = [x.rjust(9) for x in diff]
device_report = " | ".join(diff) + " | "
report = f"{device_report}{model_name}"
entries_changed.append(report)
entries_changed = sorted(entries_changed, key=lambda s: s.split("| ")[-1])
return entries_changed
@property
def model_failures(self) -> Dict:
# Obtain per-model failures
@ -331,44 +369,86 @@ class Message:
model_reports.append(report)
# (Possibly truncated) reports for the current workflow run - to be sent to Slack channels
model_header = "Single PT | Multi PT | Single TF | Multi TF | Other | Category\n"
sorted_model_reports = sorted(model_reports, key=lambda s: s.split("] ")[-1])
sorted_model_reports = sorted(model_reports, key=lambda s: s.split("| ")[-1])
model_failures_report = prepare_reports(
title="These following model modules had failures", header=model_header, reports=sorted_model_reports
)
module_header = "Single | Multi | Category\n"
sorted_module_reports = sorted(other_module_reports, key=lambda s: s.split("] ")[-1])
sorted_module_reports = sorted(other_module_reports, key=lambda s: s.split("| ")[-1])
module_failures_report = prepare_reports(
title="The following non-model modules had failures", header=module_header, reports=sorted_module_reports
)
# To be sent to Slack channels
model_failure_sections = [
{"type": "section", "text": {"type": "mrkdwn", "text": model_failures_report}},
{"type": "section", "text": {"type": "mrkdwn", "text": module_failures_report}},
]
# Save complete tables (for past CI) - to be uploaded as artifacts
if ci_event.startswith("Past CI"):
model_failures_report = prepare_reports(
title="These following model modules had failures",
header=model_header,
reports=sorted_model_reports,
to_truncate=False,
)
file_path = os.path.join(os.getcwd(), "test_failure_tables/model_failures_report.txt")
with open(file_path, "w", encoding="UTF-8") as fp:
fp.write(model_failures_report)
# Save the complete (i.e. no truncation) failure tables (of the current workflow run)
# (to be uploaded as artifacts)
if not os.path.isdir(os.path.join(os.getcwd(), "test_failure_tables")):
os.makedirs(os.path.join(os.getcwd(), "test_failure_tables"))
module_failures_report = prepare_reports(
title="The following non-model modules had failures",
header=module_header,
reports=sorted_module_reports,
to_truncate=False,
model_failures_report = prepare_reports(
title="These following model modules had failures",
header=model_header,
reports=sorted_model_reports,
to_truncate=False,
)
file_path = os.path.join(os.getcwd(), "test_failure_tables/model_failures_report.txt")
with open(file_path, "w", encoding="UTF-8") as fp:
fp.write(model_failures_report)
module_failures_report = prepare_reports(
title="The following non-model modules had failures",
header=module_header,
reports=sorted_module_reports,
to_truncate=False,
)
file_path = os.path.join(os.getcwd(), "test_failure_tables/module_failures_report.txt")
with open(file_path, "w", encoding="UTF-8") as fp:
fp.write(module_failures_report)
target_workflow = "huggingface/transformers/.github/workflows/self-scheduled.yml@refs/heads/main"
if os.environ.get("CI_WORKFLOW_REF") == target_workflow:
# Get the last previously completed CI's failure tables
artifact_names = ["test_failure_tables"]
output_dir = os.path.join(os.getcwd(), "previous_reports")
os.makedirs(output_dir, exist_ok=True)
prev_tables = get_last_daily_ci_reports(
artifact_names=artifact_names, output_dir=output_dir, token=os.environ["ACCESS_REPO_INFO_TOKEN"]
)
file_path = os.path.join(os.getcwd(), "test_failure_tables/module_failures_report.txt")
with open(file_path, "w", encoding="UTF-8") as fp:
fp.write(module_failures_report)
# The last run doesn't produce `test_failure_tables` (by some issues or have no model failure at all)
if len(prev_tables) > 0:
# Compute the difference of the previous/current (model failure) table
prev_model_failures = prev_tables["test_failure_tables"]["model_failures_report.txt"]
entries_changed = self.compute_diff_for_failure_reports(model_failures_report, prev_model_failures)
if len(entries_changed) > 0:
# Save the complete difference
diff_report = prepare_reports(
title="Changed model modules failures",
header=model_header,
reports=entries_changed,
to_truncate=False,
)
file_path = os.path.join(os.getcwd(), "test_failure_tables/changed_model_failures_report.txt")
with open(file_path, "w", encoding="UTF-8") as fp:
fp.write(diff_report)
# To be sent to Slack channels
diff_report = prepare_reports(
title="*Changed model modules failures*",
header=model_header,
reports=entries_changed,
)
model_failure_sections.append(
{"type": "section", "text": {"type": "mrkdwn", "text": diff_report}},
)
return model_failure_sections
@ -487,14 +567,15 @@ class Message:
)
def post(self):
payload = self.payload
print("Sending the following payload")
print(json.dumps({"blocks": json.loads(self.payload)}))
print(json.dumps({"blocks": json.loads(payload)}))
text = f"{self.n_failures} failures out of {self.n_tests} tests," if self.n_failures else "All tests passed."
self.thread_ts = client.chat_postMessage(
channel=os.environ["CI_SLACK_REPORT_CHANNEL_ID"],
blocks=self.payload,
blocks=payload,
text=text,
)
@ -748,6 +829,9 @@ if __name__ == "__main__":
else:
ci_title = f"<{ci_url}|{ci_title}>\nAuthor: {ci_author} | Merged by: {merged_by}"
elif ci_sha:
ci_title = f"<{ci_url}|commit: {ci_sha}>"
else:
ci_title = ""