Compare commits
4 Commits
main
...
run_no_job
Author | SHA1 | Date |
---|---|---|
ydshieh | 22d9ac23f5 | |
ydshieh | 80efaa69a7 | |
ydshieh | 629febe9b2 | |
ydshieh | 2b9b539429 |
|
@ -12,7 +12,7 @@ on:
|
|||
- cron: "17 2 * * *"
|
||||
push:
|
||||
branches:
|
||||
- run_scheduled_ci*
|
||||
- run_no_job_name
|
||||
|
||||
env:
|
||||
HF_HOME: /mnt/cache
|
||||
|
@ -61,7 +61,7 @@ jobs:
|
|||
name: Identify models to test
|
||||
working-directory: /transformers/tests
|
||||
run: |
|
||||
echo "matrix=$(python3 -c 'import os; tests = os.getcwd(); model_tests = os.listdir(os.path.join(tests, "models")); d1 = sorted(list(filter(os.path.isdir, os.listdir(tests)))); d2 = sorted(list(filter(os.path.isdir, [f"models/{x}" for x in model_tests]))); d1.remove("models"); d = d2 + d1; print(d)')" >> $GITHUB_OUTPUT
|
||||
echo "matrix=$(python3 -c 'import os; tests = os.getcwd(); model_tests = os.listdir(os.path.join(tests, "models")); d1 = sorted(list(filter(os.path.isdir, os.listdir(tests)))); d2 = sorted(list(filter(os.path.isdir, [f"models/{x}" for x in model_tests]))); d1.remove("models"); d = d2[:2] + d1[:2]; print(d)')" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: NVIDIA-SMI
|
||||
run: |
|
||||
|
@ -121,7 +121,7 @@ jobs:
|
|||
continue-on-error: true
|
||||
run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt
|
||||
|
||||
- name: Test suite reports artifacts
|
||||
- name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports"
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v3
|
||||
with:
|
||||
|
@ -182,7 +182,7 @@ jobs:
|
|||
continue-on-error: true
|
||||
run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt
|
||||
|
||||
- name: Test suite reports artifacts
|
||||
- name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports"
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v3
|
||||
with:
|
||||
|
@ -233,174 +233,13 @@ jobs:
|
|||
continue-on-error: true
|
||||
run: cat /transformers/reports/${{ matrix.machine_type }}_examples_gpu/failures_short.txt
|
||||
|
||||
- name: Test suite reports artifacts
|
||||
- name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_examples_gpu"
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v3
|
||||
with:
|
||||
name: ${{ matrix.machine_type }}_run_examples_gpu
|
||||
path: /transformers/reports/${{ matrix.machine_type }}_examples_gpu
|
||||
|
||||
run_pipelines_torch_gpu:
|
||||
name: PyTorch pipelines
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
machine_type: [single-gpu, multi-gpu]
|
||||
runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci]
|
||||
container:
|
||||
image: huggingface/transformers-pytorch-gpu
|
||||
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
needs: setup
|
||||
steps:
|
||||
- name: Update clone
|
||||
working-directory: /transformers
|
||||
run: git fetch && git checkout ${{ github.sha }}
|
||||
|
||||
- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
|
||||
working-directory: /transformers
|
||||
run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
|
||||
|
||||
- name: NVIDIA-SMI
|
||||
run: |
|
||||
nvidia-smi
|
||||
|
||||
- name: Environment
|
||||
working-directory: /transformers
|
||||
run: |
|
||||
python3 utils/print_env.py
|
||||
|
||||
- name: Show installed libraries and their versions
|
||||
working-directory: /transformers
|
||||
run: pip freeze
|
||||
|
||||
- name: Run all pipeline tests on GPU
|
||||
working-directory: /transformers
|
||||
run: |
|
||||
python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_tests_torch_pipeline_gpu tests/pipelines
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
run: cat /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu/failures_short.txt
|
||||
|
||||
- name: Test suite reports artifacts
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v3
|
||||
with:
|
||||
name: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu
|
||||
path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu
|
||||
|
||||
run_pipelines_tf_gpu:
|
||||
name: TensorFlow pipelines
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
machine_type: [single-gpu, multi-gpu]
|
||||
runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci]
|
||||
container:
|
||||
image: huggingface/transformers-tensorflow-gpu
|
||||
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
needs: setup
|
||||
steps:
|
||||
- name: Update clone
|
||||
working-directory: /transformers
|
||||
run: |
|
||||
git fetch && git checkout ${{ github.sha }}
|
||||
|
||||
- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
|
||||
working-directory: /transformers
|
||||
run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
|
||||
|
||||
- name: NVIDIA-SMI
|
||||
run: |
|
||||
nvidia-smi
|
||||
|
||||
- name: Environment
|
||||
working-directory: /transformers
|
||||
run: |
|
||||
python3 utils/print_env.py
|
||||
|
||||
- name: Show installed libraries and their versions
|
||||
working-directory: /transformers
|
||||
run: pip freeze
|
||||
|
||||
- name: Run all pipeline tests on GPU
|
||||
working-directory: /transformers
|
||||
run: |
|
||||
python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_tests_tf_pipeline_gpu tests/pipelines
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ always() }}
|
||||
run: |
|
||||
cat /transformers/reports/${{ matrix.machine_type }}_tests_tf_pipeline_gpu/failures_short.txt
|
||||
|
||||
- name: Test suite reports artifacts
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v3
|
||||
with:
|
||||
name: ${{ matrix.machine_type }}_run_tests_tf_pipeline_gpu
|
||||
path: /transformers/reports/${{ matrix.machine_type }}_tests_tf_pipeline_gpu
|
||||
|
||||
run_all_tests_torch_cuda_extensions_gpu:
|
||||
name: Torch CUDA extension tests
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
machine_type: [single-gpu, multi-gpu]
|
||||
runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci]
|
||||
needs: setup
|
||||
container:
|
||||
image: huggingface/transformers-pytorch-deepspeed-latest-gpu
|
||||
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
steps:
|
||||
- name: Update clone
|
||||
working-directory: /workspace/transformers
|
||||
run: git fetch && git checkout ${{ github.sha }}
|
||||
|
||||
- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
|
||||
working-directory: /workspace/transformers
|
||||
run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
|
||||
|
||||
- name: Remove cached torch extensions
|
||||
run: rm -rf /github/home/.cache/torch_extensions/
|
||||
|
||||
# To avoid unknown test failures
|
||||
- name: Pre build DeepSpeed *again*
|
||||
working-directory: /workspace
|
||||
run: |
|
||||
python3 -m pip uninstall -y deepspeed
|
||||
DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
|
||||
|
||||
- name: NVIDIA-SMI
|
||||
run: |
|
||||
nvidia-smi
|
||||
|
||||
- name: Environment
|
||||
working-directory: /workspace/transformers
|
||||
run: |
|
||||
python utils/print_env.py
|
||||
|
||||
- name: Show installed libraries and their versions
|
||||
working-directory: /workspace/transformers
|
||||
run: pip freeze
|
||||
|
||||
- name: Run all tests on GPU
|
||||
working-directory: /workspace/transformers
|
||||
run: |
|
||||
python -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
run: cat /workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu/failures_short.txt
|
||||
|
||||
- name: Test suite reports artifacts
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v3
|
||||
with:
|
||||
name: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports
|
||||
path: /workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu
|
||||
|
||||
run_extract_warnings:
|
||||
name: Extract warnings in CI artifacts
|
||||
runs-on: ubuntu-22.04
|
||||
|
@ -410,9 +249,6 @@ jobs:
|
|||
run_tests_single_gpu,
|
||||
run_tests_multi_gpu,
|
||||
run_examples_gpu,
|
||||
run_pipelines_tf_gpu,
|
||||
run_pipelines_torch_gpu,
|
||||
run_all_tests_torch_cuda_extensions_gpu
|
||||
]
|
||||
steps:
|
||||
- name: Checkout transformers
|
||||
|
@ -458,9 +294,6 @@ jobs:
|
|||
run_tests_single_gpu,
|
||||
run_tests_multi_gpu,
|
||||
run_examples_gpu,
|
||||
run_pipelines_tf_gpu,
|
||||
run_pipelines_torch_gpu,
|
||||
run_all_tests_torch_cuda_extensions_gpu,
|
||||
run_extract_warnings
|
||||
]
|
||||
steps:
|
||||
|
@ -478,7 +311,7 @@ jobs:
|
|||
CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
|
||||
CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
|
||||
CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
|
||||
CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
|
||||
CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
|
||||
ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
|
||||
CI_EVENT: scheduled
|
||||
CI_SHA: ${{ github.sha }}
|
||||
|
|
|
@ -10,6 +10,32 @@ from collections import Counter
|
|||
import requests
|
||||
|
||||
|
||||
def get_jobs(workflow_run_id, token=None):
|
||||
"""Extract jobs in a GitHub Actions workflow run"""
|
||||
|
||||
headers = None
|
||||
if token is not None:
|
||||
headers = {"Accept": "application/vnd.github+json", "Authorization": f"Bearer {token}"}
|
||||
|
||||
url = f"https://api.github.com/repos/huggingface/transformers/actions/runs/{workflow_run_id}/jobs?per_page=100"
|
||||
result = requests.get(url, headers=headers).json()
|
||||
jobs = []
|
||||
|
||||
try:
|
||||
jobs.extend(result["jobs"])
|
||||
pages_to_iterate_over = math.ceil((result["total_count"] - 100) / 100)
|
||||
|
||||
for i in range(pages_to_iterate_over):
|
||||
result = requests.get(url + f"&page={i + 2}", headers=headers).json()
|
||||
jobs.extend(result["jobs"])
|
||||
|
||||
return jobs
|
||||
except Exception:
|
||||
print(f"Unknown error, could not fetch links:\n{traceback.format_exc()}")
|
||||
|
||||
return []
|
||||
|
||||
|
||||
def get_job_links(workflow_run_id, token=None):
|
||||
"""Extract job names and their job links in a GitHub Actions workflow run"""
|
||||
|
||||
|
|
|
@ -24,7 +24,7 @@ import time
|
|||
from typing import Dict, List, Optional, Union
|
||||
|
||||
import requests
|
||||
from get_ci_error_statistics import get_job_links
|
||||
from get_ci_error_statistics import get_jobs
|
||||
from get_previous_daily_ci import get_last_daily_ci_reports
|
||||
from slack_sdk import WebClient
|
||||
|
||||
|
@ -938,9 +938,19 @@ if __name__ == "__main__":
|
|||
Message.error_out(title, ci_title)
|
||||
raise ValueError("Errored out.")
|
||||
|
||||
github_actions_job_links = get_job_links(
|
||||
github_actions_jobs = get_jobs(
|
||||
workflow_run_id=os.environ["GITHUB_RUN_ID"], token=os.environ["ACCESS_REPO_INFO_TOKEN"]
|
||||
)
|
||||
github_actions_job_links = {job["name"]: job["html_url"] for job in github_actions_jobs}
|
||||
|
||||
artifact_name_to_job_map = {}
|
||||
for job in github_actions_jobs:
|
||||
for step in job["steps"]:
|
||||
if step["name"].startswith("Test suite reports artifacts: "):
|
||||
artifact_name = step["name"][len("Test suite reports artifacts: ") :]
|
||||
artifact_name_to_job_map[artifact_name] = job
|
||||
break
|
||||
|
||||
available_artifacts = retrieve_available_artifacts()
|
||||
|
||||
modeling_categories = [
|
||||
|
@ -974,32 +984,13 @@ if __name__ == "__main__":
|
|||
|
||||
unclassified_model_failures = []
|
||||
|
||||
# This prefix is used to get job links below. For past CI, we use `workflow_call`, which changes the job names from
|
||||
# `Model tests (...)` to `PyTorch 1.5 / Model tests (...)` for example.
|
||||
job_name_prefix = ""
|
||||
if ci_event.startswith("Past CI - "):
|
||||
framework, version = ci_event.replace("Past CI - ", "").split("-")
|
||||
framework = "PyTorch" if framework == "pytorch" else "TensorFlow"
|
||||
job_name_prefix = f"{framework} {version}"
|
||||
elif ci_event.startswith("Nightly CI"):
|
||||
job_name_prefix = "Nightly CI"
|
||||
elif ci_event.startswith("Push CI (AMD) - "):
|
||||
flavor = ci_event.replace("Push CI (AMD) - ", "")
|
||||
job_name_prefix = f"AMD {flavor}"
|
||||
elif ci_event.startswith("Scheduled CI (AMD) - "):
|
||||
flavor = ci_event.replace("Scheduled CI (AMD) - ", "")
|
||||
job_name_prefix = f"AMD {flavor}"
|
||||
|
||||
for model in model_results.keys():
|
||||
for artifact_path in available_artifacts[f"run_all_tests_gpu_{model}_test_reports"].paths:
|
||||
artifact = retrieve_artifact(artifact_path["path"], artifact_path["gpu"])
|
||||
if "stats" in artifact:
|
||||
# Link to the GitHub Action job
|
||||
# The job names use `matrix.folder` which contain things like `models/bert` instead of `models_bert`
|
||||
job_name = f"Model tests ({model.replace('models_', 'models/')}, {artifact_path['gpu']}-gpu)"
|
||||
if job_name_prefix:
|
||||
job_name = f"{job_name_prefix} / {job_name}"
|
||||
model_results[model]["job_link"][artifact_path["gpu"]] = github_actions_job_links.get(job_name)
|
||||
job = artifact_name_to_job_map[artifact_path["path"]]
|
||||
model_results[model]["job_link"][artifact_path["gpu"]] = job["html_url"]
|
||||
failed, success, time_spent = handle_test_results(artifact["stats"])
|
||||
model_results[model]["success"] += success
|
||||
model_results[model]["time_spent"] += time_spent[1:-1] + ", "
|
||||
|
@ -1084,12 +1075,8 @@ if __name__ == "__main__":
|
|||
|
||||
for artifact_path in available_artifacts[additional_files[key]].paths:
|
||||
# Link to the GitHub Action job
|
||||
job_name = key
|
||||
if artifact_path["gpu"] is not None:
|
||||
job_name = f"{key} ({artifact_path['gpu']}-gpu)"
|
||||
if job_name_prefix:
|
||||
job_name = f"{job_name_prefix} / {job_name}"
|
||||
additional_results[key]["job_link"][artifact_path["gpu"]] = github_actions_job_links.get(job_name)
|
||||
job = artifact_name_to_job_map[artifact_path["path"]]
|
||||
additional_results[key]["job_link"][artifact_path["gpu"]] = job["html_url"]
|
||||
|
||||
artifact = retrieve_artifact(artifact_path["path"], artifact_path["gpu"])
|
||||
stacktraces = handle_stacktraces(artifact["failures_line"])
|
||||
|
|
Loading…
Reference in New Issue