Save other CI jobs' result (torch/tf pipeline, example, deepspeed etc) (#30699)

* update

* update

* update

* update

* update

* update

* update

* update

* Update utils/notification_service.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

---------

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
This commit is contained in:
Yih-Dar 2024-05-13 17:27:44 +02:00 committed by GitHub
parent 2e27291ce4
commit 82c1625ec3
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 46 additions and 17 deletions

View File

@ -60,12 +60,10 @@ jobs:
# Upload complete failure tables, as they might be big and only truncated versions could be sent to Slack. # Upload complete failure tables, as they might be big and only truncated versions could be sent to Slack.
- name: Failure table artifacts - name: Failure table artifacts
# Only the model testing job is concerned with this step
if: ${{ inputs.job == 'run_models_gpu' }}
uses: actions/upload-artifact@v4 uses: actions/upload-artifact@v4
with: with:
name: ci_results name: ci_results_${{ inputs.job }}
path: ci_results path: ci_results_${{ inputs.job }}
- uses: actions/checkout@v4 - uses: actions/checkout@v4
- uses: actions/download-artifact@v4 - uses: actions/download-artifact@v4
@ -77,6 +75,7 @@ jobs:
SLACK_REPORT_CHANNEL: ${{ inputs.slack_report_channel }} SLACK_REPORT_CHANNEL: ${{ inputs.slack_report_channel }}
CI_EVENT: scheduled CI_EVENT: scheduled
CI_SHA: ${{ github.sha }} CI_SHA: ${{ github.sha }}
CI_TEST_JOB: ${{ inputs.job }}
SETUP_STATUS: ${{ inputs.setup_status }} SETUP_STATUS: ${{ inputs.setup_status }}
# We pass `needs.setup.outputs.quantization_matrix` as the argument. A processing in `notification_service_quantization.py` to change # We pass `needs.setup.outputs.quantization_matrix` as the argument. A processing in `notification_service_quantization.py` to change
# `quantization/bnb` to `quantization_bnb` is required, as the artifact names use `_` instead of `/`. # `quantization/bnb` to `quantization_bnb` is required, as the artifact names use `_` instead of `/`.
@ -85,3 +84,11 @@ jobs:
pip install slack_sdk pip install slack_sdk
pip show slack_sdk pip show slack_sdk
python utils/notification_service_quantization.py "${{ inputs.quantization_matrix }}" python utils/notification_service_quantization.py "${{ inputs.quantization_matrix }}"
# Upload complete failure tables, as they might be big and only truncated versions could be sent to Slack.
- name: Failure table artifacts
if: ${{ inputs.job == 'run_quantization_torch_gpu' }}
uses: actions/upload-artifact@v4
with:
name: ci_results_${{ inputs.job }}
path: ci_results_${{ inputs.job }}

View File

@ -416,7 +416,7 @@ class Message:
reports=sorted_model_reports, reports=sorted_model_reports,
to_truncate=False, to_truncate=False,
) )
file_path = os.path.join(os.getcwd(), "ci_results/model_failures_report.txt") file_path = os.path.join(os.getcwd(), f"ci_results_{job_name}/model_failures_report.txt")
with open(file_path, "w", encoding="UTF-8") as fp: with open(file_path, "w", encoding="UTF-8") as fp:
fp.write(model_failures_report) fp.write(model_failures_report)
@ -426,18 +426,18 @@ class Message:
reports=sorted_module_reports, reports=sorted_module_reports,
to_truncate=False, to_truncate=False,
) )
file_path = os.path.join(os.getcwd(), "ci_results/module_failures_report.txt") file_path = os.path.join(os.getcwd(), f"ci_results_{job_name}/module_failures_report.txt")
with open(file_path, "w", encoding="UTF-8") as fp: with open(file_path, "w", encoding="UTF-8") as fp:
fp.write(module_failures_report) fp.write(module_failures_report)
if self.prev_ci_artifacts is not None: if self.prev_ci_artifacts is not None:
# if the last run produces artifact named `ci_results` # if the last run produces artifact named `ci_results_{job_name}`
if ( if (
"ci_results" in self.prev_ci_artifacts f"ci_results_{job_name}" in self.prev_ci_artifacts
and "model_failures_report.txt" in self.prev_ci_artifacts["ci_results"] and "model_failures_report.txt" in self.prev_ci_artifacts[f"ci_results_{job_name}"]
): ):
# Compute the difference of the previous/current (model failure) table # Compute the difference of the previous/current (model failure) table
prev_model_failures = self.prev_ci_artifacts["ci_results"]["model_failures_report.txt"] prev_model_failures = self.prev_ci_artifacts[f"ci_results_{job_name}"]["model_failures_report.txt"]
entries_changed = self.compute_diff_for_failure_reports(model_failures_report, prev_model_failures) entries_changed = self.compute_diff_for_failure_reports(model_failures_report, prev_model_failures)
if len(entries_changed) > 0: if len(entries_changed) > 0:
# Save the complete difference # Save the complete difference
@ -447,7 +447,7 @@ class Message:
reports=entries_changed, reports=entries_changed,
to_truncate=False, to_truncate=False,
) )
file_path = os.path.join(os.getcwd(), "ci_results/changed_model_failures_report.txt") file_path = os.path.join(os.getcwd(), f"ci_results_{job_name}/changed_model_failures_report.txt")
with open(file_path, "w", encoding="UTF-8") as fp: with open(file_path, "w", encoding="UTF-8") as fp:
fp.write(diff_report) fp.write(diff_report)
@ -643,8 +643,11 @@ class Message:
sorted_dict = sorted(self.model_results.items(), key=lambda t: t[0]) sorted_dict = sorted(self.model_results.items(), key=lambda t: t[0])
prev_model_results = {} prev_model_results = {}
if "ci_results" in self.prev_ci_artifacts and "model_results.json" in self.prev_ci_artifacts["ci_results"]: if (
prev_model_results = json.loads(self.prev_ci_artifacts["ci_results"]["model_results.json"]) f"ci_results_{job_name}" in self.prev_ci_artifacts
and "model_results.json" in self.prev_ci_artifacts[f"ci_results_{job_name}"]
):
prev_model_results = json.loads(self.prev_ci_artifacts[f"ci_results_{job_name}"]["model_results.json"])
all_failure_lines = {} all_failure_lines = {}
for job, job_result in sorted_dict: for job, job_result in sorted_dict:
@ -1139,20 +1142,32 @@ if __name__ == "__main__":
with open(os.path.join(directory, "selected_warnings.json")) as fp: with open(os.path.join(directory, "selected_warnings.json")) as fp:
selected_warnings = json.load(fp) selected_warnings = json.load(fp)
if not os.path.isdir(os.path.join(os.getcwd(), "ci_results")): if not os.path.isdir(os.path.join(os.getcwd(), f"ci_results_{job_name}")):
os.makedirs(os.path.join(os.getcwd(), "ci_results")) os.makedirs(os.path.join(os.getcwd(), f"ci_results_{job_name}"))
# Only the model testing job is concerned: this condition is to prevent other jobs from uploading the empty list as # Only the model testing job is concerned: this condition is to prevent other jobs from uploading the empty list as
# results. # results.
if job_name == "run_models_gpu": if job_name == "run_models_gpu":
with open("ci_results/model_results.json", "w", encoding="UTF-8") as fp: with open(f"ci_results_{job_name}/model_results.json", "w", encoding="UTF-8") as fp:
json.dump(model_results, fp, indent=4, ensure_ascii=False) json.dump(model_results, fp, indent=4, ensure_ascii=False)
# Must have the same keys as in `additional_results`.
# The values are used as the file names where to save the corresponding CI job results.
test_to_result_name = {
"PyTorch pipelines": "torch_pipeline",
"TensorFlow pipelines": "tf_pipeline",
"Examples directory": "example",
"Torch CUDA extension tests": "deepspeed",
}
for job, job_result in additional_results.items():
with open(f"ci_results_{job_name}/{test_to_result_name[job]}_results.json", "w", encoding="UTF-8") as fp:
json.dump(job_result, fp, indent=4, ensure_ascii=False)
prev_ci_artifacts = None prev_ci_artifacts = None
target_workflow = "huggingface/transformers/.github/workflows/self-scheduled.yml@refs/heads/main" target_workflow = "huggingface/transformers/.github/workflows/self-scheduled.yml@refs/heads/main"
if os.environ.get("CI_WORKFLOW_REF") == target_workflow: if os.environ.get("CI_WORKFLOW_REF") == target_workflow:
# Get the last previously completed CI's failure tables # Get the last previously completed CI's failure tables
artifact_names = ["ci_results"] artifact_names = [f"ci_results_{job_name}"]
output_dir = os.path.join(os.getcwd(), "previous_reports") output_dir = os.path.join(os.getcwd(), "previous_reports")
os.makedirs(output_dir, exist_ok=True) os.makedirs(output_dir, exist_ok=True)
prev_ci_artifacts = get_last_daily_ci_reports( prev_ci_artifacts = get_last_daily_ci_reports(

View File

@ -242,6 +242,13 @@ if __name__ == "__main__":
{"line": line, "trace": stacktraces.pop(0)} {"line": line, "trace": stacktraces.pop(0)}
) )
job_name = os.getenv("CI_TEST_JOB")
if not os.path.isdir(os.path.join(os.getcwd(), f"ci_results_{job_name}")):
os.makedirs(os.path.join(os.getcwd(), f"ci_results_{job_name}"))
with open(f"ci_results_{job_name}/quantization_results.json", "w", encoding="UTF-8") as fp:
json.dump(quantization_results, fp, indent=4, ensure_ascii=False)
message = QuantizationMessage( message = QuantizationMessage(
title, title,
results=quantization_results, results=quantization_results,