diff --git a/.github/workflows/build-docker-images.yml b/.github/workflows/build-docker-images.yml index 6627812a66..a7df11e8fb 100644 --- a/.github/workflows/build-docker-images.yml +++ b/.github/workflows/build-docker-images.yml @@ -3,7 +3,7 @@ name: Build docker images (scheduled) on: push: branches: - - docker-image* + - build_ci_docker_image* repository_dispatch: workflow_call: inputs: @@ -67,35 +67,6 @@ jobs: push: true tags: huggingface/transformers-all-latest-gpu-push-ci - latest-with-torch-nightly-docker: - name: "Nightly PyTorch + Stable TensorFlow" - # Push CI doesn't need this image - if: inputs.image_postfix != '-push-ci' - runs-on: ubuntu-latest - steps: - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v2 - - - name: Check out code - uses: actions/checkout@v3 - - - name: Login to DockerHub - uses: docker/login-action@v2 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_PASSWORD }} - - - name: Build and push - uses: docker/build-push-action@v3 - with: - context: ./docker/transformers-all-latest-gpu - build-args: | - REF=main - PYTORCH=pre - push: true - tags: huggingface/transformers-all-latest-torch-nightly-gpu - latest-torch-deepspeed-docker: name: "Latest PyTorch + DeepSpeed" runs-on: ubuntu-latest @@ -153,34 +124,6 @@ jobs: push: true tags: huggingface/transformers-pytorch-deepspeed-latest-gpu-push-ci - nightly-torch-deepspeed-docker: - name: "Nightly PyTorch + DeepSpeed" - # Push CI doesn't need this image - if: inputs.image_postfix != '-push-ci' - runs-on: ubuntu-latest - steps: - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v2 - - - name: Check out code - uses: actions/checkout@v3 - - - name: Login to DockerHub - uses: docker/login-action@v2 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_PASSWORD }} - - - name: Build and push - uses: docker/build-push-action@v3 - with: - context: ./docker/transformers-pytorch-deepspeed-nightly-gpu - build-args: | - REF=main - push: true - tags: huggingface/transformers-pytorch-deepspeed-nightly-gpu - doc-builder: name: "Doc builder" # Push CI doesn't need this image diff --git a/.github/workflows/build-nightly-ci-docker-images.yml b/.github/workflows/build-nightly-ci-docker-images.yml new file mode 100644 index 0000000000..f13dda7daa --- /dev/null +++ b/.github/workflows/build-nightly-ci-docker-images.yml @@ -0,0 +1,75 @@ +name: Build docker images (Nightly CI) + +on: + workflow_call: + push: + branches: + - build_nightly_ci_docker_image* + +concurrency: + group: docker-images-builds + cancel-in-progress: false + +jobs: + latest-with-torch-nightly-docker: + name: "Nightly PyTorch + Stable TensorFlow" + runs-on: ubuntu-latest + steps: + - name: Cleanup disk + run: | + sudo ls -l /usr/local/lib/ + sudo ls -l /usr/share/ + sudo du -sh /usr/local/lib/ + sudo du -sh /usr/share/ + sudo rm -rf /usr/local/lib/android + sudo rm -rf /usr/share/dotnet + sudo du -sh /usr/local/lib/ + sudo du -sh /usr/share/ + - + name: Set up Docker Buildx + uses: docker/setup-buildx-action@v2 + - + name: Check out code + uses: actions/checkout@v3 + - + name: Login to DockerHub + uses: docker/login-action@v2 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_PASSWORD }} + - + name: Build and push + uses: docker/build-push-action@v3 + with: + context: ./docker/transformers-all-latest-gpu + build-args: | + REF=main + PYTORCH=pre + push: true + tags: huggingface/transformers-all-latest-torch-nightly-gpu + + 
nightly-torch-deepspeed-docker: + name: "Nightly PyTorch + DeepSpeed" + runs-on: ubuntu-latest + steps: + - + name: Set up Docker Buildx + uses: docker/setup-buildx-action@v2 + - + name: Check out code + uses: actions/checkout@v3 + - + name: Login to DockerHub + uses: docker/login-action@v2 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_PASSWORD }} + - + name: Build and push + uses: docker/build-push-action@v3 + with: + context: ./docker/transformers-pytorch-deepspeed-nightly-gpu + build-args: | + REF=main + push: true + tags: huggingface/transformers-pytorch-deepspeed-nightly-gpu \ No newline at end of file diff --git a/.github/workflows/build-past-ci-docker-images.yml b/.github/workflows/build-past-ci-docker-images.yml index 3a0e161245..18d88f2d52 100644 --- a/.github/workflows/build-past-ci-docker-images.yml +++ b/.github/workflows/build-past-ci-docker-images.yml @@ -3,7 +3,7 @@ name: Build docker images (Past CI) on: push: branches: - - past-ci-docker-image* + - build_past_ci_docker_image* concurrency: group: docker-images-builds @@ -15,7 +15,7 @@ jobs: strategy: fail-fast: false matrix: - version: ["1.11", "1.10", "1.9", "1.8", "1.7", "1.6", "1.5", "1.4"] + version: ["1.13", "1.12", "1.11", "1.10", "1.9"] runs-on: ubuntu-latest steps: - @@ -24,6 +24,17 @@ jobs: - name: Check out code uses: actions/checkout@v3 + - + id: get-base-image + name: Get Base Image + env: + framework_version: ${{ matrix.version }} + run: | + echo "base_image=$(python3 -c 'import os; from utils.past_ci_versions import past_versions_testing; base_image = past_versions_testing["pytorch"][os.environ["framework_version"]]["base_image"]; print(base_image)')" >> $GITHUB_OUTPUT + - + name: Print Base Image + run: | + echo ${{ steps.get-base-image.outputs.base_image }} - name: Login to DockerHub uses: docker/login-action@v2 @@ -37,6 +48,7 @@ jobs: context: ./docker/transformers-past-gpu build-args: | REF=main + BASE_DOCKER_IMAGE=${{ steps.get-base-image.outputs.base_image }} FRAMEWORK=pytorch VERSION=${{ matrix.version }} push: true @@ -47,7 +59,7 @@ jobs: strategy: fail-fast: false matrix: - version: ["2.8", "2.7", "2.6", "2.5"] + version: ["2.11", "2.10", "2.9", "2.8", "2.7", "2.6", "2.5"] runs-on: ubuntu-latest steps: - @@ -56,6 +68,17 @@ jobs: - name: Check out code uses: actions/checkout@v3 + - + id: get-base-image + name: Get Base Image + env: + framework_version: ${{ matrix.version }} + run: | + echo "base_image=$(python3 -c 'import os; from utils.past_ci_versions import past_versions_testing; base_image = past_versions_testing["tensorflow"][os.environ["framework_version"]]["base_image"]; print(base_image)')" >> $GITHUB_OUTPUT + - + name: Print Base Image + run: | + echo ${{ steps.get-base-image.outputs.base_image }} - name: Login to DockerHub uses: docker/login-action@v2 @@ -69,40 +92,8 @@ jobs: context: ./docker/transformers-past-gpu build-args: | REF=main + BASE_DOCKER_IMAGE=${{ steps.get-base-image.outputs.base_image }} FRAMEWORK=tensorflow VERSION=${{ matrix.version }} push: true tags: huggingface/transformers-tensorflow-past-${{ matrix.version }}-gpu - - past-tensorflow-docker-2-4: - name: "Past TensorFlow Docker" - strategy: - fail-fast: false - matrix: - version: ["2.4"] - runs-on: ubuntu-latest - steps: - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v2 - - - name: Check out code - uses: actions/checkout@v3 - - - name: Login to DockerHub - uses: docker/login-action@v2 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ 
secrets.DOCKERHUB_PASSWORD }} - - - name: Build and push - uses: docker/build-push-action@v3 - with: - context: ./docker/transformers-past-gpu - build-args: | - REF=main - BASE_DOCKER_IMAGE=nvidia/cuda:11.0.3-cudnn8-devel-ubuntu20.04 - FRAMEWORK=tensorflow - VERSION=${{ matrix.version }} - push: true - tags: huggingface/transformers-tensorflow-past-${{ matrix.version }}-gpu \ No newline at end of file diff --git a/.github/workflows/self-nightly-past-ci-caller.yml b/.github/workflows/self-nightly-past-ci-caller.yml new file mode 100644 index 0000000000..e86e6a1666 --- /dev/null +++ b/.github/workflows/self-nightly-past-ci-caller.yml @@ -0,0 +1,143 @@ +name: Self-hosted runner (nightly-past-ci-caller) + +on: + schedule: + # 2 am on each Sunday and Thursday + - cron: "0 2 * * 0,4" + push: + branches: + - run_nightly_ci* + - run_past_ci* + +jobs: + build_nightly_ci_images: + name: Build Nightly CI Docker Images + if: (github.event_name == 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_nightly_ci')) + uses: ./.github/workflows/build-nightly-ci-docker-images.yml + secrets: inherit + + run_nightly_ci: + name: Nightly CI + needs: [build_nightly_ci_images] + uses: ./.github/workflows/self-nightly-scheduled.yml + secrets: inherit + + run_past_ci_pytorch_1-13: + name: PyTorch 1.13 + if: (cancelled() != true) && ((github.event_name == 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci'))) + needs: [run_nightly_ci] + uses: ./.github/workflows/self-past.yml + with: + framework: pytorch + version: "1.13" + secrets: inherit + + run_past_ci_pytorch_1-12: + name: PyTorch 1.12 + if: (cancelled() != true) && ((github.event_name == 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci'))) + needs: [run_past_ci_pytorch_1-13] + uses: ./.github/workflows/self-past.yml + with: + framework: pytorch + version: "1.12" + secrets: inherit + + run_past_ci_pytorch_1-11: + name: PyTorch 1.11 + if: (cancelled() != true) && ((github.event_name == 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci'))) + needs: [run_past_ci_pytorch_1-12] + uses: ./.github/workflows/self-past.yml + with: + framework: pytorch + version: "1.11" + secrets: inherit + + run_past_ci_pytorch_1-10: + name: PyTorch 1.10 + if: (cancelled() != true) && ((github.event_name == 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci'))) + needs: [run_past_ci_pytorch_1-11] + uses: ./.github/workflows/self-past.yml + with: + framework: pytorch + version: "1.10" + secrets: inherit + + run_past_ci_pytorch_1-9: + name: PyTorch 1.9 + if: (cancelled() != true) && ((github.event_name == 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci'))) + needs: [run_past_ci_pytorch_1-10] + uses: ./.github/workflows/self-past.yml + with: + framework: pytorch + version: "1.9" + secrets: inherit + + run_past_ci_tensorflow_2-11: + name: TensorFlow 2.11 + if: (cancelled() != true) && ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci')) + needs: [run_past_ci_pytorch_1-9] + uses: ./.github/workflows/self-past.yml + with: + framework: tensorflow + version: "2.11" + secrets: inherit + + run_past_ci_tensorflow_2-10: + name: TensorFlow 2.10 + if: (cancelled() != true) && ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci')) + needs: [run_past_ci_tensorflow_2-11] + uses: ./.github/workflows/self-past.yml + with: + 
framework: tensorflow + version: "2.10" + secrets: inherit + + run_past_ci_tensorflow_2-9: + name: TensorFlow 2.9 + if: (cancelled() != true) && ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci')) + needs: [run_past_ci_tensorflow_2-10] + uses: ./.github/workflows/self-past.yml + with: + framework: tensorflow + version: "2.9" + secrets: inherit + + run_past_ci_tensorflow_2-8: + name: TensorFlow 2.8 + if: (cancelled() != true) && ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci')) + needs: [run_past_ci_tensorflow_2-9] + uses: ./.github/workflows/self-past.yml + with: + framework: tensorflow + version: "2.8" + secrets: inherit + + run_past_ci_tensorflow_2-7: + name: TensorFlow 2.7 + if: (cancelled() != true) && ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci')) + needs: [run_past_ci_tensorflow_2-8] + uses: ./.github/workflows/self-past.yml + with: + framework: tensorflow + version: "2.7" + secrets: inherit + + run_past_ci_tensorflow_2-6: + name: TensorFlow 2.6 + if: (cancelled() != true) && ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci')) + needs: [run_past_ci_tensorflow_2-7] + uses: ./.github/workflows/self-past.yml + with: + framework: tensorflow + version: "2.6" + secrets: inherit + + run_past_ci_tensorflow_2-5: + name: TensorFlow 2.5 + if: (cancelled() != true) && ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci')) + needs: [run_past_ci_tensorflow_2-6] + uses: ./.github/workflows/self-past.yml + with: + framework: tensorflow + version: "2.5" + secrets: inherit diff --git a/.github/workflows/self-nightly-scheduled.yml b/.github/workflows/self-nightly-scheduled.yml index ca5186e736..b3e13cbb1b 100644 --- a/.github/workflows/self-nightly-scheduled.yml +++ b/.github/workflows/self-nightly-scheduled.yml @@ -1,4 +1,4 @@ -name: Self-hosted runner (nightly) +name: Self-hosted runner (nightly-ci) # Note that each job's dependencies go into a corresponding docker file. # @@ -8,9 +8,7 @@ name: Self-hosted runner (nightly) on: repository_dispatch: -# Disable temporarily until the test suite can be run under 12 hours. 
-# schedule: -# - cron: "0 16 * * *" + workflow_call: env: HF_HOME: /mnt/cache @@ -33,7 +31,7 @@ jobs: fetch-depth: 2 - name: Check Runner Status - run: python utils/check_self_hosted_runner.py --target_runners single-gpu-scheduled-ci-runner-docker,multi-gpu-scheduled-ci-runner-docker --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }} + run: python utils/check_self_hosted_runner.py --target_runners single-gpu-past-ci-runner-docker,multi-gpu-past-ci-runner-docker --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }} check_runners: name: Check Runners @@ -41,7 +39,7 @@ jobs: strategy: matrix: machine_type: [single-gpu, multi-gpu] - runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }} + runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }} container: image: huggingface/transformers-all-latest-torch-nightly-gpu options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -56,7 +54,7 @@ jobs: strategy: matrix: machine_type: [single-gpu, multi-gpu] - runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }} + runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }} container: image: huggingface/transformers-all-latest-torch-nightly-gpu options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -96,7 +94,7 @@ jobs: matrix: folders: ${{ fromJson(needs.setup.outputs.matrix) }} machine_type: [single-gpu] - runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }} + runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }} container: image: huggingface/transformers-all-latest-torch-nightly-gpu options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -143,7 +141,7 @@ jobs: if: ${{ always() }} uses: actions/upload-artifact@v3 with: - name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports + name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports_postfix_nightly path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} run_tests_multi_gpu: @@ -153,7 +151,7 @@ jobs: matrix: folders: ${{ fromJson(needs.setup.outputs.matrix) }} machine_type: [multi-gpu] - runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }} + runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }} container: image: huggingface/transformers-all-latest-torch-nightly-gpu options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -200,7 +198,7 @@ jobs: if: ${{ always() }} uses: actions/upload-artifact@v3 with: - name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports + name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports_postfix_nightly path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} run_all_tests_torch_cuda_extensions_gpu: @@ -209,7 +207,7 @@ jobs: fail-fast: false matrix: machine_type: [single-gpu, multi-gpu] - runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }} + runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }} needs: setup container: image: huggingface/transformers-pytorch-deepspeed-nightly-gpu @@ -258,7 +256,7 @@ jobs: if: ${{ always() }} uses: actions/upload-artifact@v3 with: - name: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports + name: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports_postfix_nightly path: 
/workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu send_results: @@ -292,7 +290,7 @@ jobs: CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }} CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_PAST_FUTURE }} ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }} - CI_EVENT: nightly-build + CI_EVENT: Nightly CI RUNNER_STATUS: ${{ needs.check_runner_status.result }} RUNNER_ENV_STATUS: ${{ needs.check_runners.result }} SETUP_STATUS: ${{ needs.setup.result }} @@ -302,3 +300,11 @@ jobs: pip install slack_sdk pip show slack_sdk python utils/notification_service.py "${{ needs.setup.outputs.matrix }}" + + + # delete-artifact + - uses: geekyeggo/delete-artifact@v2 + with: + name: | + single-* + multi-* \ No newline at end of file diff --git a/.github/workflows/self-past-caller.yml b/.github/workflows/self-past-caller.yml deleted file mode 100644 index 2cc81dac8c..0000000000 --- a/.github/workflows/self-past-caller.yml +++ /dev/null @@ -1,136 +0,0 @@ -name: Self-hosted runner (past-ci-caller) - -on: - push: - branches: - - run-past-ci* - -jobs: - run_past_ci_pytorch_1-11: - name: PyTorch 1.11 - if: always() - uses: ./.github/workflows/self-past.yml - with: - framework: pytorch - version: "1.11" - secrets: inherit - - run_past_ci_pytorch_1-10: - name: PyTorch 1.10 - if: always() - needs: [run_past_ci_pytorch_1-11] - uses: ./.github/workflows/self-past.yml - with: - framework: pytorch - version: "1.10" - secrets: inherit - - run_past_ci_pytorch_1-9: - name: PyTorch 1.9 - if: always() - needs: [run_past_ci_pytorch_1-10] - uses: ./.github/workflows/self-past.yml - with: - framework: pytorch - version: "1.9" - secrets: inherit - - run_past_ci_pytorch_1-8: - name: PyTorch 1.8 - if: always() - needs: [run_past_ci_pytorch_1-9] - uses: ./.github/workflows/self-past.yml - with: - framework: pytorch - version: "1.8" - secrets: inherit - - run_past_ci_pytorch_1-7: - name: PyTorch 1.7 - if: always() - needs: [run_past_ci_pytorch_1-8] - uses: ./.github/workflows/self-past.yml - with: - framework: pytorch - version: "1.7" - secrets: inherit - - run_past_ci_pytorch_1-6: - name: PyTorch 1.6 - if: always() - needs: [run_past_ci_pytorch_1-7] - uses: ./.github/workflows/self-past.yml - with: - framework: pytorch - version: "1.6" - secrets: inherit - - run_past_ci_pytorch_1-5: - name: PyTorch 1.5 - if: always() - needs: [run_past_ci_pytorch_1-6] - uses: ./.github/workflows/self-past.yml - with: - framework: pytorch - version: "1.5" - secrets: inherit - - run_past_ci_pytorch_1-4: - name: PyTorch 1.4 - if: always() - needs: [run_past_ci_pytorch_1-5] - uses: ./.github/workflows/self-past.yml - with: - framework: pytorch - version: "1.4" - secrets: inherit - - run_past_ci_tensorflow_2-8: - name: TensorFlow 2.8 - if: always() - needs: [run_past_ci_pytorch_1-4] - uses: ./.github/workflows/self-past.yml - with: - framework: tensorflow - version: "2.8" - secrets: inherit - - run_past_ci_tensorflow_2-7: - name: TensorFlow 2.7 - if: always() - needs: [run_past_ci_tensorflow_2-8] - uses: ./.github/workflows/self-past.yml - with: - framework: tensorflow - version: "2.7" - secrets: inherit - - run_past_ci_tensorflow_2-6: - name: TensorFlow 2.6 - if: always() - needs: [run_past_ci_tensorflow_2-7] - uses: ./.github/workflows/self-past.yml - with: - framework: tensorflow - version: "2.6" - secrets: inherit - - run_past_ci_tensorflow_2-5: - name: TensorFlow 2.5 - if: always() - needs: [run_past_ci_tensorflow_2-6] - uses: ./.github/workflows/self-past.yml - 
with: - framework: tensorflow - version: "2.5" - secrets: inherit - - run_past_ci_tensorflow_2-4: - name: TensorFlow 2.4 - if: always() - needs: [run_past_ci_tensorflow_2-5] - uses: ./.github/workflows/self-past.yml - with: - framework: tensorflow - version: "2.4" - secrets: inherit \ No newline at end of file diff --git a/.github/workflows/self-past.yml b/.github/workflows/self-past.yml index 12ddcc6658..bcb6639a79 100644 --- a/.github/workflows/self-past.yml +++ b/.github/workflows/self-past.yml @@ -1,4 +1,4 @@ -name: Self-hosted runner (past) +name: Self-hosted runner (past-ci) # Note that each job's dependencies go into a corresponding docker file. # @@ -157,7 +157,7 @@ jobs: if: ${{ always() }} uses: actions/upload-artifact@v3 with: - name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports + name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports_postfix_${{ inputs.framework }}-${{ inputs.version }} path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} run_tests_multi_gpu: @@ -223,14 +223,80 @@ jobs: if: ${{ always() }} uses: actions/upload-artifact@v3 with: - name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports + name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports_postfix_${{ inputs.framework }}-${{ inputs.version }} path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} + run_all_tests_torch_cuda_extensions_gpu: + name: Torch CUDA extension tests + if: inputs.framework == 'pytorch' + strategy: + fail-fast: false + matrix: + machine_type: [single-gpu, multi-gpu] + runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }} + needs: setup + container: + image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu + options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + steps: + - name: Update clone + working-directory: /transformers + run: git fetch && git checkout ${{ github.sha }} + + - name: Remove cached torch extensions + run: rm -rf /github/home/.cache/torch_extensions/ + + # To avoid unknown test failures + - name: Pre build DeepSpeed *again* + working-directory: / + run: | + python3 -m pip uninstall -y deepspeed + rm -rf DeepSpeed + git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build + DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install . 
--global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check + + - name: NVIDIA-SMI + run: | + nvidia-smi + + - name: Environment + working-directory: /transformers + run: | + python3 utils/print_env.py + + - name: Show installed libraries and their versions + working-directory: /transformers + run: pip freeze + + - name: Run all tests on GPU + working-directory: /transformers + run: | + python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended + + - name: Failure short reports + if: ${{ failure() }} + continue-on-error: true + run: cat /transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu/failures_short.txt + + - name: Test suite reports artifacts + if: ${{ always() }} + uses: actions/upload-artifact@v3 + with: + name: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports_postfix_${{ inputs.framework }}-${{ inputs.version }} + path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu + send_results: name: Send results to webhook runs-on: ubuntu-latest if: always() - needs: [check_runner_status, check_runners, setup, run_tests_single_gpu, run_tests_multi_gpu] + needs: [ + check_runner_status, + check_runners, + setup, + run_tests_single_gpu, + run_tests_multi_gpu, + run_all_tests_torch_cuda_extensions_gpu + ] steps: - name: Preliminary job status shell: bash @@ -272,4 +338,11 @@ jobs: uses: actions/upload-artifact@v3 with: name: test_failure_tables_${{ inputs.framework }}-${{ inputs.version }} - path: test_failure_tables \ No newline at end of file + path: test_failure_tables + + # delete-artifact + - uses: geekyeggo/delete-artifact@v2 + with: + name: | + single-* + multi-* \ No newline at end of file diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml index f535efba27..3ebf38062c 100644 --- a/.github/workflows/self-scheduled.yml +++ b/.github/workflows/self-scheduled.yml @@ -10,6 +10,9 @@ on: repository_dispatch: schedule: - cron: "0 2 * * *" + push: + branches: + - run_scheduled_ci* env: HF_HOME: /mnt/cache diff --git a/docker/transformers-past-gpu/Dockerfile b/docker/transformers-past-gpu/Dockerfile index 99fb550c6a..8ecc83c339 100644 --- a/docker/transformers-past-gpu/Dockerfile +++ b/docker/transformers-past-gpu/Dockerfile @@ -1,4 +1,4 @@ -ARG BASE_DOCKER_IMAGE="nvidia/cuda:11.2.2-cudnn8-devel-ubuntu20.04" +ARG BASE_DOCKER_IMAGE FROM $BASE_DOCKER_IMAGE LABEL maintainer="Hugging Face" @@ -8,7 +8,7 @@ ARG DEBIAN_FRONTEND=noninteractive SHELL ["sh", "-lc"] RUN apt update -RUN apt install -y git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-pip ffmpeg git-lfs +RUN apt install -y git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-pip ffmpeg git-lfs libaio-dev RUN git lfs install RUN python3 -m pip install --no-cache-dir --upgrade pip @@ -23,6 +23,9 @@ RUN cd transformers && python3 setup.py develop ARG FRAMEWORK ARG VERSION +# Control `setuptools` version to avoid some issues +RUN [ "$VERSION" != "1.9" -a "$VERSION" != "1.10" ] && python3 -m pip install -U setuptools || python3 -m pip install -U "setuptools<=59.5" + # Remove all frameworks # (`accelerate` requires `torch`, and this causes import issues for TF-only testing) RUN python3 -m pip uninstall -y torch torchvision torchaudio accelerate tensorflow jax flax @@ -34,4 +37,20 @@ RUN python3 ./transformers/utils/past_ci_versions.py --framework $FRAMEWORK --ve RUN echo "INSTALL_CMD = $INSTALL_CMD" RUN 
$INSTALL_CMD +RUN [ "$FRAMEWORK" != "pytorch" ] && echo "`deepspeed-testing` installation is skipped" || python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing] + +# Uninstall `torch-tensorrt` and `apex` shipped with the base image +RUN python3 -m pip uninstall -y torch-tensorrt apex + +# Pre-build **nightly** release of DeepSpeed, so it would be ready for testing (otherwise, the 1st deepspeed test will timeout) +RUN python3 -m pip uninstall -y deepspeed +# This has to be run inside the GPU VMs running the tests. (So far, it fails here due to GPU checks during compilation.) +# Issue: https://github.com/microsoft/DeepSpeed/issues/2010 +# RUN git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build && \ +# DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1 + RUN python3 -m pip install -U "itsdangerous<2.1.0" + +# When installing in editable mode, `transformers` is not recognized as a package. +# this line must be added in order for python to be aware of transformers. +RUN cd transformers && python3 setup.py develop diff --git a/docker/transformers-pytorch-deepspeed-nightly-gpu/Dockerfile b/docker/transformers-pytorch-deepspeed-nightly-gpu/Dockerfile index 573e09c22a..fcb599ddc2 100644 --- a/docker/transformers-pytorch-deepspeed-nightly-gpu/Dockerfile +++ b/docker/transformers-pytorch-deepspeed-nightly-gpu/Dockerfile @@ -1,10 +1,11 @@ -FROM nvcr.io/nvidia/pytorch:21.03-py3 +# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel_22-08.html#rel_22-08 +FROM nvcr.io/nvidia/pytorch:22.08-py3 LABEL maintainer="Hugging Face" ARG DEBIAN_FRONTEND=noninteractive # Example: `cu102`, `cu113`, etc. -ARG CUDA='cu113' +ARG CUDA='cu117' RUN apt -y update RUN apt install -y libaio-dev @@ -20,6 +21,9 @@ RUN python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio RUN python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing] +# Uninstall `torch-tensorrt` and `apex` shipped with the base image +RUN python3 -m pip uninstall -y torch-tensorrt apex + # Pre-build **nightly** release of DeepSpeed, so it would be ready for testing (otherwise, the 1st deepspeed test will timeout) RUN python3 -m pip uninstall -y deepspeed # This has to be run inside the GPU VMs running the tests. (So far, it fails here due to GPU checks during compilation.) @@ -27,23 +31,23 @@ RUN python3 -m pip uninstall -y deepspeed # RUN git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build && \ # DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install . 
--global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1 -# For `torchdynamo` tests -# (see https://github.com/huggingface/transformers/pull/17765) -RUN git clone https://github.com/pytorch/functorch -RUN python3 -m pip install --no-cache-dir ./functorch[aot] -RUN cd functorch && python3 setup.py develop - -RUN git clone https://github.com/pytorch/torchdynamo -RUN python3 -m pip install -r ./torchdynamo/requirements.txt -RUN cd torchdynamo && python3 setup.py develop - -# install TensorRT -RUN python3 -m pip install --no-cache-dir -U nvidia-pyindex -RUN python3 -m pip install --no-cache-dir -U nvidia-tensorrt==8.2.4.2 - -# install torch_tensorrt (fx path) -RUN git clone https://github.com/pytorch/TensorRT.git -RUN cd TensorRT/py && python3 setup.py install --fx-only +## For `torchdynamo` tests +## (see https://github.com/huggingface/transformers/pull/17765) +#RUN git clone https://github.com/pytorch/functorch +#RUN python3 -m pip install --no-cache-dir ./functorch[aot] +#RUN cd functorch && python3 setup.py develop +# +#RUN git clone https://github.com/pytorch/torchdynamo +#RUN python3 -m pip install -r ./torchdynamo/requirements.txt +#RUN cd torchdynamo && python3 setup.py develop +# +## install TensorRT +#RUN python3 -m pip install --no-cache-dir -U nvidia-pyindex +#RUN python3 -m pip install --no-cache-dir -U nvidia-tensorrt==8.2.4.2 +# +## install torch_tensorrt (fx path) +#RUN git clone https://github.com/pytorch/TensorRT.git +#RUN cd TensorRT/py && python3 setup.py install --fx-only # When installing in editable mode, `transformers` is not recognized as a package. # this line must be added in order for python to be aware of transformers. diff --git a/utils/get_ci_error_statistics.py b/utils/get_ci_error_statistics.py index 09dc4d7dd2..e9dc52b5bb 100644 --- a/utils/get_ci_error_statistics.py +++ b/utils/get_ci_error_statistics.py @@ -66,12 +66,12 @@ def get_artifacts_links(worflow_run_id, token=None): def download_artifact(artifact_name, artifact_url, output_dir, token): """Download a GitHub Action artifact from a URL. - The URL is of the from `https://api.github.com/repos/huggingface/transformers/actions/artifacts/{ARTIFACT_ID}/zip`, + The URL is of the form `https://api.github.com/repos/huggingface/transformers/actions/artifacts/{ARTIFACT_ID}/zip`, but it can't be used to download directly. We need to get a redirect URL first. See https://docs.github.com/en/rest/actions/artifacts#download-an-artifact """ # Get the redirect URL first - cmd = f'curl -v -H "Accept: application/vnd.github+json" -H "Authorization: token {token}" {artifact_url}' + cmd = f'curl -v -H "Accept: application/vnd.github+json" -H "Authorization: Bearer {token}" {artifact_url}' output = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) o = output.stdout.decode("utf-8") lines = o.splitlines() diff --git a/utils/notification_service.py b/utils/notification_service.py index 0aefd5844d..7251b4d400 100644 --- a/utils/notification_service.py +++ b/utils/notification_service.py @@ -590,23 +590,20 @@ class Message: time.sleep(1) -def retrieve_artifact(name: str, gpu: Optional[str]): +def retrieve_artifact(artifact_path: str, gpu: Optional[str]): if gpu not in [None, "single", "multi"]: raise ValueError(f"Invalid GPU for artifact. 
Passed GPU: `{gpu}`.") - if gpu is not None: - name = f"{gpu}-gpu_{name}" - _artifact = {} - if os.path.exists(name): - files = os.listdir(name) + if os.path.exists(artifact_path): + files = os.listdir(artifact_path) for file in files: try: - with open(os.path.join(name, file)) as f: + with open(os.path.join(artifact_path, file)) as f: _artifact[file.split(".")[0]] = f.read() except UnicodeDecodeError as e: - raise ValueError(f"Could not open {os.path.join(name, file)}.") from e + raise ValueError(f"Could not open {os.path.join(artifact_path, file)}.") from e return _artifact @@ -629,8 +626,14 @@ def retrieve_available_artifacts(): directories = filter(os.path.isdir, os.listdir()) for directory in directories: - if directory.startswith("single-gpu"): - artifact_name = directory[len("single-gpu") + 1 :] + artifact_name = directory + + name_parts = artifact_name.split("_postfix_") + if len(name_parts) > 1: + artifact_name = name_parts[0] + + if artifact_name.startswith("single-gpu"): + artifact_name = artifact_name[len("single-gpu") + 1 :] if artifact_name in _available_artifacts: _available_artifacts[artifact_name].single_gpu = True @@ -639,7 +642,7 @@ def retrieve_available_artifacts(): _available_artifacts[artifact_name].add_path(directory, gpu="single") - elif directory.startswith("multi-gpu"): + elif artifact_name.startswith("multi-gpu"): artifact_name = directory[len("multi-gpu") + 1 :] if artifact_name in _available_artifacts: @@ -649,7 +652,6 @@ def retrieve_available_artifacts(): _available_artifacts[artifact_name].add_path(directory, gpu="multi") else: - artifact_name = directory if artifact_name not in _available_artifacts: _available_artifacts[artifact_name] = Artifact(artifact_name) @@ -805,10 +807,12 @@ if __name__ == "__main__": framework, version = ci_event.replace("Past CI - ", "").split("-") framework = "PyTorch" if framework == "pytorch" else "TensorFlow" job_name_prefix = f"{framework} {version}" + elif ci_event.startswith("Nightly CI"): + job_name_prefix = "Nightly CI" for model in model_results.keys(): for artifact_path in available_artifacts[f"run_all_tests_gpu_{model}_test_reports"].paths: - artifact = retrieve_artifact(artifact_path["name"], artifact_path["gpu"]) + artifact = retrieve_artifact(artifact_path["path"], artifact_path["gpu"]) if "stats" in artifact: # Link to the GitHub Action job # The job names use `matrix.folder` which contain things like `models/bert` instead of `models_bert` @@ -901,7 +905,7 @@ if __name__ == "__main__": else: additional_results[key]["job_link"][artifact_path["gpu"]] = github_actions_job_links.get(key) - artifact = retrieve_artifact(artifact_path["name"], artifact_path["gpu"]) + artifact = retrieve_artifact(artifact_path["path"], artifact_path["gpu"]) stacktraces = handle_stacktraces(artifact["failures_line"]) failed, success, time_spent = handle_test_results(artifact["stats"]) diff --git a/utils/past_ci_versions.py b/utils/past_ci_versions.py index c50bbb9b14..61495ab2a4 100644 --- a/utils/past_ci_versions.py +++ b/utils/past_ci_versions.py @@ -4,6 +4,18 @@ import os past_versions_testing = { "pytorch": { + "1.13": { + "torch": "1.13.1", + "torchvision": "0.14.1", + "torchaudio": "0.13.1", + "python": 3.9, + "cuda": "cu116", + "install": ( + "python3 -m pip install --no-cache-dir -U torch==1.13.1 torchvision==0.14.1 torchaudio==0.13.1" + " --extra-index-url https://download.pytorch.org/whl/cu116" + ), + "base_image": "nvidia/cuda:11.6.2-cudnn8-devel-ubuntu20.04", + }, "1.12": { "torch": "1.12.1", "torchvision": "0.13.1", @@ -14,6 
+26,7 @@ past_versions_testing = { "python3 -m pip install --no-cache-dir -U torch==1.12.1 torchvision==0.13.1 torchaudio==0.12.1" " --extra-index-url https://download.pytorch.org/whl/cu113" ), + "base_image": "nvidia/cuda:11.2.2-cudnn8-devel-ubuntu20.04", }, "1.11": { "torch": "1.11.0", @@ -25,6 +38,7 @@ past_versions_testing = { "python3 -m pip install --no-cache-dir -U torch==1.11.0 torchvision==0.12.0 torchaudio==0.11.0" " --extra-index-url https://download.pytorch.org/whl/cu113" ), + "base_image": "nvidia/cuda:11.2.2-cudnn8-devel-ubuntu20.04", }, "1.10": { "torch": "1.10.2", @@ -36,6 +50,7 @@ past_versions_testing = { "python3 -m pip install --no-cache-dir -U torch==1.10.2 torchvision==0.11.3 torchaudio==0.10.2" " --extra-index-url https://download.pytorch.org/whl/cu113" ), + "base_image": "nvidia/cuda:11.2.2-cudnn8-devel-ubuntu20.04", }, # torchaudio < 0.10 has no CUDA-enabled binary distributions "1.9": { @@ -48,87 +63,44 @@ past_versions_testing = { "python3 -m pip install --no-cache-dir -U torch==1.9.1 torchvision==0.10.1 torchaudio==0.9.1" " --extra-index-url https://download.pytorch.org/whl/cu111" ), - }, - "1.8": { - "torch": "1.8.1", - "torchvision": "0.9.1", - "torchaudio": "0.8.1", - "python": 3.9, - "cuda": "cu111", - "install": ( - "python3 -m pip install --no-cache-dir -U torch==1.8.1 torchvision==0.9.1 torchaudio==0.8.1" - " --extra-index-url https://download.pytorch.org/whl/cu111" - ), - }, - "1.7": { - "torch": "1.7.1", - "torchvision": "0.8.2", - "torchaudio": "0.7.2", - "python": 3.9, - "cuda": "cu110", - "install": ( - "python3 -m pip install --no-cache-dir -U torch==1.7.1 torchvision==0.8.2 torchaudio==0.7.2" - " --extra-index-url https://download.pytorch.org/whl/cu110" - ), - }, - "1.6": { - "torch": "1.6.0", - "torchvision": "0.7.0", - "torchaudio": "0.6.0", - "python": 3.8, - "cuda": "cu101", - "install": ( - "python3 -m pip install --no-cache-dir -U torch==1.6.0 torchvision==0.7.0 torchaudio==0.6.0" - " --extra-index-url https://download.pytorch.org/whl/cu101" - ), - }, - "1.5": { - "torch": "1.5.1", - "torchvision": "0.6.1", - "torchaudio": "0.5.1", - "python": 3.8, - "cuda": "cu101", - "install": ( - "python3 -m pip install --no-cache-dir -U torch==1.5.1 torchvision==0.6.1 torchaudio==0.5.1" - " --extra-index-url https://download.pytorch.org/whl/cu101" - ), - }, - "1.4": { - "torch": "1.4.0", - "torchvision": "0.5.0", - "torchaudio": "0.4.0", - "python": 3.8, - "cuda": "cu100", - "install": ( - "python3 -m pip install --no-cache-dir -U torch==1.4.0 torchvision==0.5.0 torchaudio==0.4.0" - " --extra-index-url https://download.pytorch.org/whl/cu100" - ), + "base_image": "nvidia/cuda:11.2.2-cudnn8-devel-ubuntu20.04", }, }, "tensorflow": { + "2.11": { + "tensorflow": "2.11.1", + "install": "python3 -m pip install --no-cache-dir -U tensorflow==2.11.1", + "base_image": "nvidia/cuda:11.2.2-cudnn8-devel-ubuntu20.04", + }, + "2.10": { + "tensorflow": "2.10.1", + "install": "python3 -m pip install --no-cache-dir -U tensorflow==2.10.1", + "base_image": "nvidia/cuda:11.2.2-cudnn8-devel-ubuntu20.04", + }, + "2.9": { + "tensorflow": "2.9.3", + "install": "python3 -m pip install --no-cache-dir -U tensorflow==2.9.3", + "base_image": "nvidia/cuda:11.2.2-cudnn8-devel-ubuntu20.04", + }, "2.8": { "tensorflow": "2.8.2", "install": "python3 -m pip install --no-cache-dir -U tensorflow==2.8.2", + "base_image": "nvidia/cuda:11.2.2-cudnn8-devel-ubuntu20.04", }, "2.7": { "tensorflow": "2.7.3", "install": "python3 -m pip install --no-cache-dir -U tensorflow==2.7.3", + "base_image": 
"nvidia/cuda:11.2.2-cudnn8-devel-ubuntu20.04", }, "2.6": { "tensorflow": "2.6.5", "install": "python3 -m pip install --no-cache-dir -U tensorflow==2.6.5", + "base_image": "nvidia/cuda:11.2.2-cudnn8-devel-ubuntu20.04", }, "2.5": { "tensorflow": "2.5.3", "install": "python3 -m pip install --no-cache-dir -U tensorflow==2.5.3", - }, - # need another `nvidia:cuda` docker image, otherwise GPU not working - "2.4": { - "tensorflow": "2.4.4", - "install": "python3 -m pip install --no-cache-dir -U tensorflow==2.4.4", - # This should be specified as a docker build argument. - # We keep the information here for reference only. - "base_docker": "nvidia/cuda:11.0.3-cudnn8-devel-ubuntu20.04", + "base_image": "nvidia/cuda:11.2.2-cudnn8-devel-ubuntu20.04", }, }, }