Compare commits

...

4 Commits

Author SHA1 Message Date
ydshieh 22d9ac23f5 trigger 2024-01-24 18:30:14 +01:00
ydshieh 80efaa69a7 fix 2024-01-24 17:07:17 +01:00
ydshieh 629febe9b2 fix 2024-01-24 16:42:54 +01:00
ydshieh 2b9b539429 avoid using job name 2024-01-24 15:32:45 +01:00
3 changed files with 48 additions and 202 deletions

View File

@ -12,7 +12,7 @@ on:
- cron: "17 2 * * *"
push:
branches:
- run_scheduled_ci*
- run_no_job_name
env:
HF_HOME: /mnt/cache
@ -61,7 +61,7 @@ jobs:
name: Identify models to test
working-directory: /transformers/tests
run: |
echo "matrix=$(python3 -c 'import os; tests = os.getcwd(); model_tests = os.listdir(os.path.join(tests, "models")); d1 = sorted(list(filter(os.path.isdir, os.listdir(tests)))); d2 = sorted(list(filter(os.path.isdir, [f"models/{x}" for x in model_tests]))); d1.remove("models"); d = d2 + d1; print(d)')" >> $GITHUB_OUTPUT
echo "matrix=$(python3 -c 'import os; tests = os.getcwd(); model_tests = os.listdir(os.path.join(tests, "models")); d1 = sorted(list(filter(os.path.isdir, os.listdir(tests)))); d2 = sorted(list(filter(os.path.isdir, [f"models/{x}" for x in model_tests]))); d1.remove("models"); d = d2[:2] + d1[:2]; print(d)')" >> $GITHUB_OUTPUT
- name: NVIDIA-SMI
run: |
@ -121,7 +121,7 @@ jobs:
continue-on-error: true
run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt
- name: Test suite reports artifacts
- name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports"
if: ${{ always() }}
uses: actions/upload-artifact@v3
with:
@ -182,7 +182,7 @@ jobs:
continue-on-error: true
run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt
- name: Test suite reports artifacts
- name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports"
if: ${{ always() }}
uses: actions/upload-artifact@v3
with:
@ -233,174 +233,13 @@ jobs:
continue-on-error: true
run: cat /transformers/reports/${{ matrix.machine_type }}_examples_gpu/failures_short.txt
- name: Test suite reports artifacts
- name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_examples_gpu"
if: ${{ always() }}
uses: actions/upload-artifact@v3
with:
name: ${{ matrix.machine_type }}_run_examples_gpu
path: /transformers/reports/${{ matrix.machine_type }}_examples_gpu
run_pipelines_torch_gpu:
  # Runs `tests/pipelines` inside the PyTorch GPU image, once per machine type.
  name: PyTorch pipelines
  needs: setup
  strategy:
    fail-fast: false
    matrix:
      machine_type:
        - single-gpu
        - multi-gpu
  runs-on:
    - "${{ matrix.machine_type }}"
    - nvidia-gpu
    - t4
    - daily-ci
  container:
    image: huggingface/transformers-pytorch-gpu
    options: '--gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/'
  steps:
    # Move the clone at /transformers to the commit that triggered this run.
    - name: Update clone
      working-directory: /transformers
      run: 'git fetch && git checkout ${{ github.sha }}'

    - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
      working-directory: /transformers
      run: 'python3 -m pip uninstall -y transformers && python3 -m pip install -e .'

    # The next three steps only print diagnostics (GPU state, environment, package versions).
    - name: NVIDIA-SMI
      run: |
        nvidia-smi

    - name: Environment
      working-directory: /transformers
      run: |
        python3 utils/print_env.py

    - name: Show installed libraries and their versions
      working-directory: /transformers
      run: 'pip freeze'

    # Reports are written under the id given to `--make-reports`; the steps below read that folder.
    - name: Run all pipeline tests on GPU
      working-directory: /transformers
      run: |
        python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_tests_torch_pipeline_gpu tests/pipelines

    # Only runs after a failure, and is itself allowed to fail.
    - name: Failure short reports
      if: ${{ failure() }}
      continue-on-error: true
      run: 'cat /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu/failures_short.txt'

    # Uploaded regardless of the outcome of the earlier steps.
    - name: Test suite reports artifacts
      if: ${{ always() }}
      uses: actions/upload-artifact@v3
      with:
        name: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu
        path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu
run_pipelines_tf_gpu:
  # Runs `tests/pipelines` inside the TensorFlow GPU image, once per machine type.
  name: TensorFlow pipelines
  strategy:
    fail-fast: false
    matrix:
      machine_type: [single-gpu, multi-gpu]
  runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci]
  container:
    image: huggingface/transformers-tensorflow-gpu
    options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
  needs: setup
  steps:
    # Move the clone at /transformers to the commit that triggered this run.
    - name: Update clone
      working-directory: /transformers
      run: |
        git fetch && git checkout ${{ github.sha }}

    - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
      working-directory: /transformers
      run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .

    # The next three steps only print diagnostics (GPU state, environment, package versions).
    - name: NVIDIA-SMI
      run: |
        nvidia-smi

    - name: Environment
      working-directory: /transformers
      run: |
        python3 utils/print_env.py

    - name: Show installed libraries and their versions
      working-directory: /transformers
      run: pip freeze

    # Reports are written under the id given to `--make-reports`; the steps below read that folder.
    - name: Run all pipeline tests on GPU
      working-directory: /transformers
      run: |
        python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_tests_tf_pipeline_gpu tests/pipelines

    # Print the short failure report only when an earlier step failed, and never let
    # this purely informational step fail the job (e.g. if the report file is missing).
    # This matches the equivalent step in the other GPU test jobs of this workflow.
    - name: Failure short reports
      if: ${{ failure() }}
      continue-on-error: true
      run: |
        cat /transformers/reports/${{ matrix.machine_type }}_tests_tf_pipeline_gpu/failures_short.txt

    # Uploaded regardless of the outcome of the earlier steps.
    - name: Test suite reports artifacts
      if: ${{ always() }}
      uses: actions/upload-artifact@v3
      with:
        name: ${{ matrix.machine_type }}_run_tests_tf_pipeline_gpu
        path: /transformers/reports/${{ matrix.machine_type }}_tests_tf_pipeline_gpu
run_all_tests_torch_cuda_extensions_gpu:
  # Runs `tests/deepspeed` and `tests/extended` inside the DeepSpeed image, once per machine type.
  name: Torch CUDA extension tests
  needs: setup
  strategy:
    fail-fast: false
    matrix:
      machine_type:
        - single-gpu
        - multi-gpu
  runs-on:
    - "${{ matrix.machine_type }}"
    - nvidia-gpu
    - t4
    - daily-ci
  container:
    image: huggingface/transformers-pytorch-deepspeed-latest-gpu
    options: '--gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/'
  steps:
    # In this image the clone is at /workspace/transformers.
    - name: Update clone
      working-directory: /workspace/transformers
      run: 'git fetch && git checkout ${{ github.sha }}'

    - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
      working-directory: /workspace/transformers
      run: 'python3 -m pip uninstall -y transformers && python3 -m pip install -e .'

    - name: Remove cached torch extensions
      run: 'rm -rf /github/home/.cache/torch_extensions/'

    # To avoid unknown test failures
    - name: Pre build DeepSpeed *again*
      working-directory: /workspace
      run: |
        python3 -m pip uninstall -y deepspeed
        DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check

    # The next three steps only print diagnostics (GPU state, environment, package versions).
    - name: NVIDIA-SMI
      run: |
        nvidia-smi

    - name: Environment
      working-directory: /workspace/transformers
      run: |
        python utils/print_env.py

    - name: Show installed libraries and their versions
      working-directory: /workspace/transformers
      run: 'pip freeze'

    # Reports are written under the id given to `--make-reports`; the steps below read that folder.
    - name: Run all tests on GPU
      working-directory: /workspace/transformers
      run: |
        python -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended

    # Only runs after a failure, and is itself allowed to fail.
    - name: Failure short reports
      if: ${{ failure() }}
      continue-on-error: true
      run: 'cat /workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu/failures_short.txt'

    # Uploaded regardless of the outcome of the earlier steps.
    - name: Test suite reports artifacts
      if: ${{ always() }}
      uses: actions/upload-artifact@v3
      with:
        name: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports
        path: /workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu
run_extract_warnings:
name: Extract warnings in CI artifacts
runs-on: ubuntu-22.04
@ -410,9 +249,6 @@ jobs:
run_tests_single_gpu,
run_tests_multi_gpu,
run_examples_gpu,
run_pipelines_tf_gpu,
run_pipelines_torch_gpu,
run_all_tests_torch_cuda_extensions_gpu
]
steps:
- name: Checkout transformers
@ -458,9 +294,6 @@ jobs:
run_tests_single_gpu,
run_tests_multi_gpu,
run_examples_gpu,
run_pipelines_tf_gpu,
run_pipelines_torch_gpu,
run_all_tests_torch_cuda_extensions_gpu,
run_extract_warnings
]
steps:
@ -478,7 +311,7 @@ jobs:
CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
CI_EVENT: scheduled
CI_SHA: ${{ github.sha }}

View File

@ -10,6 +10,32 @@ from collections import Counter
import requests
def get_jobs(workflow_run_id, token=None):
    """Extract jobs in a GitHub Actions workflow run

    Args:
        workflow_run_id: Identifier of the workflow run; interpolated into the GitHub REST API URL.
        token: Optional token. When given, it is sent as a `Bearer` value in the `Authorization` header;
            otherwise the requests are sent without extra headers.

    Returns:
        A list containing the entries of the `jobs` field of every page that was fetched, or an empty
        list if reading the first response or fetching a later page raised an exception.
    """
    # Without a timeout, `requests` waits indefinitely on a stalled connection.
    request_timeout = 60

    headers = None
    if token is not None:
        headers = {"Accept": "application/vnd.github+json", "Authorization": f"Bearer {token}"}

    url = f"https://api.github.com/repos/huggingface/transformers/actions/runs/{workflow_run_id}/jobs?per_page=100"
    result = requests.get(url, headers=headers, timeout=request_timeout).json()
    jobs = []

    try:
        jobs.extend(result["jobs"])
        # The first request asked for up to 100 jobs (`per_page=100`); compute how many more pages remain.
        # A non-positive value yields an empty `range` below.
        pages_to_iterate_over = math.ceil((result["total_count"] - 100) / 100)

        for i in range(pages_to_iterate_over):
            # Page numbering continues from 2, since the first request already returned page 1.
            result = requests.get(url + f"&page={i + 2}", headers=headers, timeout=request_timeout).json()
            jobs.extend(result["jobs"])

        return jobs
    except Exception:
        # Best effort: report the problem and fall through to the empty result.
        print(f"Unknown error, could not fetch links:\n{traceback.format_exc()}")

    return []
def get_job_links(workflow_run_id, token=None):
"""Extract job names and their job links in a GitHub Actions workflow run"""

View File

@ -24,7 +24,7 @@ import time
from typing import Dict, List, Optional, Union
import requests
from get_ci_error_statistics import get_job_links
from get_ci_error_statistics import get_jobs
from get_previous_daily_ci import get_last_daily_ci_reports
from slack_sdk import WebClient
@ -938,9 +938,19 @@ if __name__ == "__main__":
Message.error_out(title, ci_title)
raise ValueError("Errored out.")
github_actions_job_links = get_job_links(
github_actions_jobs = get_jobs(
workflow_run_id=os.environ["GITHUB_RUN_ID"], token=os.environ["ACCESS_REPO_INFO_TOKEN"]
)
github_actions_job_links = {job["name"]: job["html_url"] for job in github_actions_jobs}
artifact_name_to_job_map = {}
for job in github_actions_jobs:
for step in job["steps"]:
if step["name"].startswith("Test suite reports artifacts: "):
artifact_name = step["name"][len("Test suite reports artifacts: ") :]
artifact_name_to_job_map[artifact_name] = job
break
available_artifacts = retrieve_available_artifacts()
modeling_categories = [
@ -974,32 +984,13 @@ if __name__ == "__main__":
unclassified_model_failures = []
# This prefix is used to get job links below. For past CI, we use `workflow_call`, which changes the job names from
# `Model tests (...)` to `PyTorch 1.5 / Model tests (...)` for example.
job_name_prefix = ""
if ci_event.startswith("Past CI - "):
framework, version = ci_event.replace("Past CI - ", "").split("-")
framework = "PyTorch" if framework == "pytorch" else "TensorFlow"
job_name_prefix = f"{framework} {version}"
elif ci_event.startswith("Nightly CI"):
job_name_prefix = "Nightly CI"
elif ci_event.startswith("Push CI (AMD) - "):
flavor = ci_event.replace("Push CI (AMD) - ", "")
job_name_prefix = f"AMD {flavor}"
elif ci_event.startswith("Scheduled CI (AMD) - "):
flavor = ci_event.replace("Scheduled CI (AMD) - ", "")
job_name_prefix = f"AMD {flavor}"
for model in model_results.keys():
for artifact_path in available_artifacts[f"run_all_tests_gpu_{model}_test_reports"].paths:
artifact = retrieve_artifact(artifact_path["path"], artifact_path["gpu"])
if "stats" in artifact:
# Link to the GitHub Action job
# The job names use `matrix.folder` which contain things like `models/bert` instead of `models_bert`
job_name = f"Model tests ({model.replace('models_', 'models/')}, {artifact_path['gpu']}-gpu)"
if job_name_prefix:
job_name = f"{job_name_prefix} / {job_name}"
model_results[model]["job_link"][artifact_path["gpu"]] = github_actions_job_links.get(job_name)
job = artifact_name_to_job_map[artifact_path["path"]]
model_results[model]["job_link"][artifact_path["gpu"]] = job["html_url"]
failed, success, time_spent = handle_test_results(artifact["stats"])
model_results[model]["success"] += success
model_results[model]["time_spent"] += time_spent[1:-1] + ", "
@ -1084,12 +1075,8 @@ if __name__ == "__main__":
for artifact_path in available_artifacts[additional_files[key]].paths:
# Link to the GitHub Action job
job_name = key
if artifact_path["gpu"] is not None:
job_name = f"{key} ({artifact_path['gpu']}-gpu)"
if job_name_prefix:
job_name = f"{job_name_prefix} / {job_name}"
additional_results[key]["job_link"][artifact_path["gpu"]] = github_actions_job_links.get(job_name)
job = artifact_name_to_job_map[artifact_path["path"]]
additional_results[key]["job_link"][artifact_path["gpu"]] = job["html_url"]
artifact = retrieve_artifact(artifact_path["path"], artifact_path["gpu"])
stacktraces = handle_stacktraces(artifact["failures_line"])