From 659b27fd26dee80f7ecd313089c3fa2e457ea90f Mon Sep 17 00:00:00 2001
From: Yih-Dar <2521628+ydshieh@users.noreply.github.com>
Date: Thu, 2 Jun 2022 10:24:16 +0200
Subject: [PATCH] Print more library versions in CI (#17384)

* print more lib. versions and just before test runs

* update print_env_pt.py

* rename to print_env

* Disable warning + better job name

* print python version

Co-authored-by: ydshieh
---
 .github/workflows/doctests.yml               |  4 +-
 .github/workflows/self-nightly-scheduled.yml |  8 +--
 .github/workflows/self-push.yml              | 64 +++++++++----------
 .github/workflows/self-scheduled.yml         | 62 ++++++++++++++++---
 utils/print_env.py                           | 57 +++++++++++++++++
 utils/print_env_pt.py                        | 28 ---------
 6 files changed, 146 insertions(+), 77 deletions(-)
 create mode 100644 utils/print_env.py
 delete mode 100755 utils/print_env_pt.py

diff --git a/.github/workflows/doctests.yml b/.github/workflows/doctests.yml
index 917945a063..9fc74e2e6c 100644
--- a/.github/workflows/doctests.yml
+++ b/.github/workflows/doctests.yml
@@ -32,9 +32,7 @@ jobs:
 
       - name: GPU visibility
         run: |
-          utils/print_env_pt.py
-          TF_CPP_MIN_LOG_LEVEL=3 python3 -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
-          TF_CPP_MIN_LOG_LEVEL=3 python3 -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"
+          python3 utils/print_env.py
 
       - name: Prepare files for doctests
         run: |
diff --git a/.github/workflows/self-nightly-scheduled.yml b/.github/workflows/self-nightly-scheduled.yml
index a2ac3398f6..4b4c0aca9d 100644
--- a/.github/workflows/self-nightly-scheduled.yml
+++ b/.github/workflows/self-nightly-scheduled.yml
@@ -41,7 +41,7 @@ jobs:
 
       - name: Are GPUs recognized by our DL frameworks
         run: |
-          utils/print_env_pt.py
+          utils/print_env.py
 
       - name: Run all tests on GPU
         run: |
@@ -109,7 +109,7 @@ jobs:
 
       - name: Are GPUs recognized by our DL frameworks
         run: |
-          utils/print_env_pt.py
+          utils/print_env.py
 
       - name: Run all tests on GPU
         env:
@@ -163,7 +163,7 @@ jobs:
 
       - name: Are GPUs recognized by our DL frameworks
         run: |
-          utils/print_env_pt.py
+          utils/print_env.py
 
       - name: Run all tests on GPU
         run: |
@@ -206,7 +206,7 @@ jobs:
 
       - name: Are GPUs recognized by our DL frameworks
         run: |
-          utils/print_env_pt.py
+          utils/print_env.py
 
       - name: Run all tests on GPU
         run: |
diff --git a/.github/workflows/self-push.yml b/.github/workflows/self-push.yml
index 4ab736618c..165af80673 100644
--- a/.github/workflows/self-push.yml
+++ b/.github/workflows/self-push.yml
@@ -87,17 +87,6 @@ jobs:
       image: huggingface/transformers-all-latest-gpu
       options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
     steps:
-      - name: NVIDIA-SMI
-        run: |
-          nvidia-smi
-
-      - name: Are GPUs recognized by our DL frameworks
-        working-directory: /transformers
-        run: |
-          utils/print_env_pt.py
-          TF_CPP_MIN_LOG_LEVEL=3 python3 -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
-          TF_CPP_MIN_LOG_LEVEL=3 python3 -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"
-
       - name: Echo folder ${{ matrix.folders }}
         shell: bash
         # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
@@ -114,6 +103,15 @@ jobs:
         working-directory: /transformers
         run: git fetch && git checkout ${{ github.sha }}
 
+      - name: NVIDIA-SMI
+        run: |
+          nvidia-smi
+
+      - name: Environment
+        working-directory: /transformers
+        run: |
+          python3 utils/print_env.py
+
       - name: Run all non-slow selected tests on GPU
         working-directory: /transformers
         run: |
@@ -146,17 +144,6 @@ jobs:
       image: huggingface/transformers-all-latest-gpu
       options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
     steps:
-      - name: NVIDIA-SMI
-        run: |
-          nvidia-smi
-
-      - name: Are GPUs recognized by our DL frameworks
-        working-directory: /transformers
-        run: |
-          utils/print_env_pt.py
-          TF_CPP_MIN_LOG_LEVEL=3 python3 -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
-          TF_CPP_MIN_LOG_LEVEL=3 python3 -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"
-
       - name: Echo folder ${{ matrix.folders }}
         shell: bash
         # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
@@ -173,6 +160,15 @@ jobs:
         working-directory: /transformers
         run: git fetch && git checkout ${{ github.sha }}
 
+      - name: NVIDIA-SMI
+        run: |
+          nvidia-smi
+
+      - name: Environment
+        working-directory: /transformers
+        run: |
+          python3 utils/print_env.py
+
       - name: Run all non-slow selected tests on GPU
         env:
           MKL_SERVICE_FORCE_INTEL: 1
@@ -210,19 +206,19 @@ jobs:
         with:
           fetch-depth: 2
 
-      - name: NVIDIA-SMI
-        run: |
-          nvidia-smi
-
       - name: Install dependencies
         run: |
           apt -y update && apt install -y libaio-dev
           pip install --upgrade pip
           pip install .[deepspeed-testing]
 
-      - name: Are GPUs recognized by our DL frameworks
+      - name: NVIDIA-SMI
         run: |
-          utils/print_env_pt.py
+          nvidia-smi
+
+      - name: Environment
+        run: |
+          python utils/print_env.py
 
       - name: Run all non-slow selected tests on GPU
         # TODO: Here we pass all tests in the 2 folders for simplicity. It's better to pass only the identified tests.
@@ -259,10 +255,6 @@ jobs:
         with:
           fetch-depth: 2
 
-      - name: NVIDIA-SMI
-        run: |
-          nvidia-smi
-
       - name: Install dependencies
         run: |
           apt -y update && apt install -y libaio-dev
@@ -269,10 +261,14 @@ jobs:
           pip install --upgrade pip
           rm -rf ~/.cache/torch_extensions/ # shared between conflicting builds
           pip install .[testing,deepspeed,fairscale]
 
-      - name: Are GPUs recognized by our DL frameworks
+      - name: NVIDIA-SMI
         run: |
-          utils/print_env_pt.py
+          nvidia-smi
+
+      - name: Environment
+        run: |
+          python utils/print_env.py
 
       - name: Run all non-slow selected tests on GPU
         # TODO: Here we pass all tests in the 2 folders for simplicity. It's better to pass only the identified tests.
diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml
index 89634fdd62..8f378a6618 100644
--- a/.github/workflows/self-scheduled.yml
+++ b/.github/workflows/self-scheduled.yml
@@ -56,13 +56,6 @@ jobs:
         run: |
           nvidia-smi
 
-      - name: GPU visibility
-        working-directory: /transformers
-        run: |
-          utils/print_env_pt.py
-          TF_CPP_MIN_LOG_LEVEL=3 python3 -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
-          TF_CPP_MIN_LOG_LEVEL=3 python3 -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"
-
   run_tests_single_gpu:
     name: Model tests
     strategy:
@@ -91,6 +84,15 @@ jobs:
         working-directory: /transformers
         run: git fetch && git checkout ${{ github.sha }}
 
+      - name: NVIDIA-SMI
+        run: |
+          nvidia-smi
+
+      - name: Environment
+        working-directory: /transformers
+        run: |
+          python3 utils/print_env.py
+
       - name: Run all tests on GPU
         working-directory: /transformers
         run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }}
@@ -135,6 +137,15 @@ jobs:
         working-directory: /transformers
         run: git fetch && git checkout ${{ github.sha }}
 
+      - name: NVIDIA-SMI
+        run: |
+          nvidia-smi
+
+      - name: Environment
+        working-directory: /transformers
+        run: |
+          python3 utils/print_env.py
+
       - name: Run all tests on GPU
         working-directory: /transformers
         run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }}
@@ -163,6 +174,15 @@ jobs:
         working-directory: /transformers
         run: git fetch && git checkout ${{ github.sha }}
 
+      - name: NVIDIA-SMI
+        run: |
+          nvidia-smi
+
+      - name: Environment
+        working-directory: /transformers
+        run: |
+          python3 utils/print_env.py
+
       - name: Run examples tests on GPU
         working-directory: /transformers
         run: |
@@ -197,6 +217,15 @@ jobs:
         working-directory: /transformers
         run: git fetch && git checkout ${{ github.sha }}
 
+      - name: NVIDIA-SMI
+        run: |
+          nvidia-smi
+
+      - name: Environment
+        working-directory: /transformers
+        run: |
+          python3 utils/print_env.py
+
       - name: Run all pipeline tests on GPU
         working-directory: /transformers
         env:
@@ -233,6 +262,15 @@ jobs:
         run: |
           git fetch && git checkout ${{ github.sha }}
 
+      - name: NVIDIA-SMI
+        run: |
+          nvidia-smi
+
+      - name: Environment
+        working-directory: /transformers
+        run: |
+          python3 utils/print_env.py
+
       - name: Run all pipeline tests on GPU
         working-directory: /transformers
         env:
@@ -276,6 +314,15 @@ jobs:
           git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build
           DS_BUILD_CPU_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install -e . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
 
+      - name: NVIDIA-SMI
+        run: |
+          nvidia-smi
+
+      - name: Environment
+        working-directory: /workspace/transformers
+        run: |
+          python utils/print_env.py
+
       - name: Run all tests on GPU
         working-directory: /workspace/transformers
         run: |
@@ -293,7 +340,6 @@ jobs:
           name: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports
           path: /workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu
 
-
   send_results:
     name: Send results to webhook
     runs-on: ubuntu-latest
diff --git a/utils/print_env.py b/utils/print_env.py
new file mode 100644
index 0000000000..443ed6eab6
--- /dev/null
+++ b/utils/print_env.py
@@ -0,0 +1,57 @@
+#!/usr/bin/env python3
+
+# coding=utf-8
+# Copyright 2020 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# this script dumps information about the environment
+
+import os
+import sys
+
+import transformers
+
+
+os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
+
+print("Python version:", sys.version)
+print("transformers version:", transformers.__version__)
+
+try:
+    import torch
+
+    print("Torch version:", torch.__version__)
+    print("Cuda available:", torch.cuda.is_available())
+    print("Cuda version:", torch.version.cuda)
+    print("CuDNN version:", torch.backends.cudnn.version())
+    print("Number of GPUs available:", torch.cuda.device_count())
+    print("NCCL version:", torch.cuda.nccl.version())
+except ImportError:
+    print("Torch version:", None)
+
+try:
+    import deepspeed
+
+    print("DeepSpeed version:", deepspeed.__version__)
+except ImportError:
+    print("DeepSpeed version:", None)
+
+try:
+    import tensorflow as tf
+
+    print("TensorFlow version:", tf.__version__)
+    print("TF GPUs available:", bool(tf.config.list_physical_devices("GPU")))
+    print("Number of TF GPUs available:", len(tf.config.list_physical_devices("GPU")))
+except ImportError:
+    print("TensorFlow version:", None)
diff --git a/utils/print_env_pt.py b/utils/print_env_pt.py
deleted file mode 100755
index 94451541f6..0000000000
--- a/utils/print_env_pt.py
+++ /dev/null
@@ -1,28 +0,0 @@
-#!/usr/bin/env python3
-
-# coding=utf-8
-# Copyright 2020 The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# this script dumps information about the environment
-
-import torch
-
-
-print("Torch version:", torch.__version__)
-print("Cuda available:", torch.cuda.is_available())
-print("Cuda version:", torch.version.cuda)
-print("CuDNN version:", torch.backends.cudnn.version())
-print("Number of GPUs available:", torch.cuda.device_count())
-print("NCCL version:", torch.cuda.nccl.version())