# CircleCI pipeline: code-quality checks, test suites, docs build/deploy, and
# TPU example runs on GKE (the TPU workflow at the bottom is commented out).
#
# NOTE(review): several values below look truncated (no URL after `git clone`,
# no package after `go get`, empty `checksum ""` file names, incomplete script
# paths). They are kept exactly as found and flagged inline — confirm and
# restore the intended values before relying on this config.
version: 2.1

orbs:
  gcp-gke: circleci/gcp-gke@1.0.4
  go: circleci/go@1.3.0

# TPU REFERENCES
# Reusable steps, shared through YAML anchors and aliased from the TPU jobs.
references:
  # Clones ml-testing-accelerators and checks out a pinned commit as `stable`.
  # NOTE(review): `git clone` has no repository URL — looks truncated; confirm.
  checkout_ml_testing: &checkout_ml_testing
    run:
      name: Checkout ml-testing-accelerators
      command: |
        git clone
        cd ml-testing-accelerators
        git fetch origin 5e88ac24f631c27045e62f0e8d5dfcf34e425e25:stable
        git checkout stable

  # Builds the TPU test image and pushes it, tagged with the workflow job id.
  # When CIRCLE_PR_NUMBER is set, the PR head ref is passed to the build as
  # the GITHUB_REF build arg.
  build_push_docker: &build_push_docker
    run:
      name: Configure Docker
      command: |
        gcloud --quiet auth configure-docker
        cd docker/transformers-pytorch-tpu
        if [ -z "$CIRCLE_PR_NUMBER" ]; then
          docker build --tag "$GCR_IMAGE_PATH:$CIRCLE_WORKFLOW_JOB_ID" -f Dockerfile --build-arg "TEST_IMAGE=1" .
        else
          docker build --tag "$GCR_IMAGE_PATH:$CIRCLE_WORKFLOW_JOB_ID" -f Dockerfile --build-arg "TEST_IMAGE=1" --build-arg "GITHUB_REF=pull/$CIRCLE_PR_NUMBER/head" .
        fi
        docker push "$GCR_IMAGE_PATH:$CIRCLE_WORKFLOW_JOB_ID"

  # Creates the Kubernetes job from the jsonnet template, polls it until it
  # succeeds, fails, or the poll budget runs out, prints the job's logs,
  # deletes the pushed image, and exits with the resulting status code
  # (0 = succeeded, 1 = failed, 2 = still unfinished after all checks).
  # The whole script is a single `&&` chain, so any failing step skips the
  # remaining steps, including the image delete and the final `exit`.
  # NOTE(review): `go get` has no package argument — looks truncated; the
  # script later calls `jsonnet` from $HOME/go/bin, so presumably a jsonnet
  # command package was installed here. Confirm.
  deploy_cluster: &deploy_cluster
    run:
      name: Deploy the job on the kubernetes cluster
      command: |
        go get && \
        export PATH=$PATH:$HOME/go/bin && \
        kubectl create -f docker/transformers-pytorch-tpu/dataset.yaml || true && \
        job_name=$(jsonnet -J ml-testing-accelerators/ docker/transformers-pytorch-tpu/bert-base-cased.jsonnet --ext-str image=$GCR_IMAGE_PATH --ext-str image-tag=$CIRCLE_WORKFLOW_JOB_ID | kubectl create -f -) && \
        job_name=${job_name#job.batch/} && \
        job_name=${job_name% created} && \
        echo "Waiting on kubernetes job: $job_name" && \
        i=0 && \
        # 30 checks spaced 30s apart = 900s total.
        max_checks=30 && \
        status_code=2 && \
        # Check on the job periodically. Set the status code depending on what
        # happened to the job in Kubernetes. If we try max_checks times and
        # still the job hasn't finished, give up and return the starting
        # non-zero status code.
        while [ $i -lt $max_checks ]; do ((i++)); if kubectl get jobs $job_name -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then status_code=1 && break; elif kubectl get jobs $job_name -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1" ; then status_code=0 && break; else echo "Job not finished yet"; fi; sleep 30; done && \
        echo "Done waiting. Job status code: $status_code" && \
        # Allow time for logs to flush.
        sleep 60 && \
        echo "JOB_NAME: $job_name" && \
        gcloud logging read "resource.type=k8s_container resource.labels.project_id=$GOOGLE_PROJECT_ID resource.labels.location=$GOOGLE_COMPUTE_ZONE resource.labels.cluster_name=$GKE_CLUSTER resource.labels.namespace_name=default resource.labels.pod_name:$job_name" --limit 10000000 --order asc --format 'value(textPayload)' --project=$GOOGLE_PROJECT_ID && \
        echo "Done with log retrieval attempt." && \
        gcloud container images delete "$GCR_IMAGE_PATH:$CIRCLE_WORKFLOW_JOB_ID" --force-delete-tags && \
        exit $status_code

  # Deletes Kubernetes jobs whose AGE column is expressed in hours or days.
  delete_gke_jobs: &delete_gke_jobs
    run:
      name: Delete GKE Jobs
      command: |
        # Match jobs whose age matches patterns like '1h' or '1d', i.e. any job
        # that has been around longer than 1hr. First print all columns for
        # matches, then execute the delete.
        kubectl get job | awk 'match($4,/[0-9]+[dh]/) {print $0}'
        stale_jobs=$(kubectl get job | awk 'match($4,/[0-9]+[dh]/) {print $1}')
        # `kubectl delete job` with no names exits non-zero, so skip the delete
        # when nothing matched instead of failing the whole cleanup job.
        if [ -n "$stale_jobs" ]; then
          kubectl delete job $stale_jobs
        else
          echo "No jobs older than 1h to delete."
        fi

# NOTE(review): every cache key below uses `checksum ""` with an empty file
# name — looks truncated; confirm which file the keys should be derived from.
jobs:
  # Full test suite with both torch and TF installed, plus coverage upload.
  run_tests_torch_and_tf:
    working_directory: ~/transformers
    docker:
      - image: circleci/python:3.6
    environment:
      OMP_NUM_THREADS: 1
    resource_class: xlarge
    parallelism: 1
    steps:
      - checkout
      - restore_cache:
          keys:
            - v0.3-torch_and_tf-{{ checksum "" }}
            - v0.3-{{ checksum "" }}
      - run: pip install --upgrade pip
      - run: pip install .[sklearn,tf-cpu,torch,testing]
      - run: pip install codecov pytest-cov
      - save_cache:
          key: v0.3-{{ checksum "" }}
          paths:
            - '~/.cache/pip'
      - run: python -m pytest -n 8 --dist=loadfile -s ./tests/ --cov | tee output.txt
      - run: codecov
      - store_artifacts:
          path: ~/transformers/output.txt
          destination: test_output.txt

  # Test suite with only the torch extras installed.
  run_tests_torch:
    working_directory: ~/transformers
    docker:
      - image: circleci/python:3.7
    environment:
      OMP_NUM_THREADS: 1
    resource_class: xlarge
    parallelism: 1
    steps:
      - checkout
      - restore_cache:
          keys:
            - v0.3-torch-{{ checksum "" }}
            - v0.3-{{ checksum "" }}
      - run: pip install --upgrade pip
      - run: pip install .[sklearn,torch,testing]
      - save_cache:
          key: v0.3-torch-{{ checksum "" }}
          paths:
            - '~/.cache/pip'
      - run: python -m pytest -n 8 --dist=loadfile -s ./tests/ | tee output.txt
      - store_artifacts:
          path: ~/transformers/output.txt
          destination: test_output.txt

  # Test suite with only the TF (CPU) extras installed.
  run_tests_tf:
    working_directory: ~/transformers
    docker:
      - image: circleci/python:3.7
    environment:
      OMP_NUM_THREADS: 1
    resource_class: xlarge
    parallelism: 1
    steps:
      - checkout
      - restore_cache:
          keys:
            - v0.3-tf-{{ checksum "" }}
            - v0.3-{{ checksum "" }}
      - run: pip install --upgrade pip
      - run: pip install .[sklearn,tf-cpu,testing]
      - save_cache:
          key: v0.3-tf-{{ checksum "" }}
          paths:
            - '~/.cache/pip'
      - run: python -m pytest -n 8 --dist=loadfile -s ./tests/ | tee output.txt
      - store_artifacts:
          path: ~/transformers/output.txt
          destination: test_output.txt

  # Test suite run with RUN_CUSTOM_TOKENIZERS set and the `ja` extras installed.
  run_tests_custom_tokenizers:
    working_directory: ~/transformers
    docker:
      - image: circleci/python:3.6
    environment:
      # Quoted so the value stays the literal string "yes"; a bare `yes` is a
      # boolean under YAML 1.1 parsers.
      RUN_CUSTOM_TOKENIZERS: "yes"
    steps:
      - checkout
      - restore_cache:
          keys:
            - v0.3-custom_tokenizers-{{ checksum "" }}
            - v0.3-{{ checksum "" }}
      - run: pip install --upgrade pip
      - run: pip install .[ja,testing]
      - save_cache:
          key: v0.3-custom_tokenizers-{{ checksum "" }}
          paths:
            - '~/.cache/pip'
      - run: python -m pytest -s ./tests/ | tee output.txt
      - store_artifacts:
          path: ~/transformers/output.txt
          destination: test_output.txt

  # Runs the tests under ./examples/ with the torch extras installed.
  run_examples_torch:
    working_directory: ~/transformers
    docker:
      - image: circleci/python:3.6
    environment:
      OMP_NUM_THREADS: 1
    resource_class: xlarge
    parallelism: 1
    steps:
      - checkout
      - restore_cache:
          keys:
            - v0.3-torch_examples-{{ checksum "" }}
            - v0.3-{{ checksum "" }}
      - run: pip install --upgrade pip
      - run: pip install .[sklearn,torch,testing]
      - run: pip install -r examples/requirements.txt
      - save_cache:
          key: v0.3-torch_examples-{{ checksum "" }}
          paths:
            - '~/.cache/pip'
      - run: python -m pytest -n 8 --dist=loadfile -rA -s ./examples/ | tee output.txt
      - store_artifacts:
          path: ~/transformers/output.txt
          destination: test_output.txt

  # Builds the HTML docs; SPHINXOPTS="-W" turns Sphinx warnings into errors.
  build_doc:
    working_directory: ~/transformers
    docker:
      - image: circleci/python:3.6
    steps:
      - checkout
      - restore_cache:
          keys:
            - v0.3-build_doc-{{ checksum "" }}
            - v0.3-{{ checksum "" }}
      - run: pip install --upgrade pip
      - run: pip install .[tf,torch,docs]
      - save_cache:
          key: v0.3-build_doc-{{ checksum "" }}
          paths:
            - '~/.cache/pip'
      - run: cd docs && make html SPHINXOPTS="-W"
      - store_artifacts:
          path: ./docs/_build

  # Deploys the docs; restricted to master in the workflow below.
  deploy_doc:
    working_directory: ~/transformers
    docker:
      - image: circleci/python:3.6
    steps:
      - add_ssh_keys:
          fingerprints:
            - "5b:7a:95:18:07:8c:aa:76:4c:60:35:88:ad:60:56:71"
      - checkout
      - restore_cache:
          keys:
            - v0.3-deploy_doc-{{ checksum "" }}
            - v0.3-{{ checksum "" }}
      - run: pip install .[tf,torch,docs]
      - save_cache:
          key: v0.3-deploy_doc-{{ checksum "" }}
          paths:
            - '~/.cache/pip'
      # NOTE(review): the path ends at the directory — the script name looks
      # truncated; confirm which script under .circleci/ should run.
      - run: ./.circleci/

  # Formatting, import-order and lint checks (black, isort, flake8).
  check_code_quality:
    working_directory: ~/transformers
    docker:
      - image: circleci/python:3.6
    resource_class: medium
    parallelism: 1
    steps:
      - checkout
      - restore_cache:
          keys:
            - v0.3-code_quality-{{ checksum "" }}
            - v0.3-{{ checksum "" }}
      - run: pip install --upgrade pip
      # we need a version of isort with
      # NOTE(review): the comment above and the VCS URL below both look
      # truncated; restore the intended isort source before use.
      - run: pip install git+git://
      - run: pip install .[tf,torch,quality]
      - save_cache:
          key: v0.3-code_quality-{{ checksum "" }}
          paths:
            - '~/.cache/pip'
      - run: black --check --line-length 119 --target-version py35 examples templates tests src utils
      - run: isort --check-only --recursive examples templates tests src utils
      - run: flake8 examples templates tests src utils

  # Runs a repository utility script after installing `requests`.
  check_repository_consistency:
    working_directory: ~/transformers
    docker:
      - image: circleci/python:3.6
    resource_class: small
    parallelism: 1
    steps:
      - checkout
      - run: pip install requests
      # NOTE(review): the script name after ./utils/ looks truncated; confirm.
      - run: python ./utils/

  # TPU JOBS
  # Builds the test image and runs the example job on the GKE cluster.
  run_examples_tpu:
    docker:
      - image: circleci/python:3.6
    environment:
      OMP_NUM_THREADS: 1
    resource_class: xlarge
    parallelism: 1
    steps:
      - checkout
      - go/install
      - *checkout_ml_testing
      - gcp-gke/install
      - gcp-gke/update-kubeconfig-with-credentials:
          cluster: $GKE_CLUSTER
          perform-login: true
      - setup_remote_docker
      - *build_push_docker
      - *deploy_cluster

  # Removes old Kubernetes jobs from the GKE cluster.
  cleanup-gke-jobs:
    docker:
      - image: circleci/python:3.6
    steps:
      - gcp-gke/install
      - gcp-gke/update-kubeconfig-with-credentials:
          cluster: $GKE_CLUSTER
          perform-login: true
      - *delete_gke_jobs

# Branch filter aliased by workflow jobs that must only run on master.
workflow_filters: &workflow_filters
  filters:
    branches:
      only:
        - master

workflows:
  version: 2
  build_and_test:
    jobs:
      - check_code_quality
      - check_repository_consistency
      - run_examples_torch
      - run_tests_custom_tokenizers
      - run_tests_torch_and_tf
      - run_tests_torch
      - run_tests_tf
      - build_doc
      - deploy_doc: *workflow_filters
#  tpu_testing_jobs:
#    triggers:
#      - schedule:
#          # Runs once a day at 08:00 UTC (minute 0 of hour 8).
#          cron: "0 8 * * *"
#          filters:
#            branches:
#              only:
#                - master
#    jobs:
#      - cleanup-gke-jobs
#      - run_examples_tpu