Publish nightly images one image per matrix job, gated on that image's
integration-test result.

Review notes (text before the first file header is ignored by patch tools):
  * This patch was rebuilt from a copy whose newlines and indentation had been
    collapsed. Indentation and blank context lines are reconstructed; verify
    them against the target files before applying.
  * docker_publish.yml: a bare image name such as cpu is now wrapped as
    ["cpu"]. The earlier fallback produced [cpu], which is not valid JSON.
  * docker_publish.yml: docker-sync is skipped when the image list is [],
    because an empty matrix fails the run.
  * integration.yml: the failure flag is written to GITHUB_OUTPUT before the
    cleanup / log commands, which can themselves fail under bash -e.
  * nightly.yml: always() is replaced by !cancelled() so a cancelled run does
    not go on to publish.
  * push_image_from_ECR.sh: docker tag is executed (it was still echoed while
    docker push had been made live), image and mode are validated, and
    expansions are quoted.
  * The index lines keep the blob ids of the original patch; they were not
    recomputed.

diff --git a/.github/workflows/docker_publish.yml b/.github/workflows/docker_publish.yml
index e4fdedeee..32d595e02 100644
--- a/.github/workflows/docker_publish.yml
+++ b/.github/workflows/docker_publish.yml
@@ -15,6 +15,11 @@ on:
         description: 'specify which sha value the image was built with.'
         required: false
         default: ''
+      arch:
+        description: 'which images to build [ cpu, cpu-full, pytorch-inf2, pytorch-gpu, tensorrt-llm, lmi, aarch64]'
+        type: string
+        required: false
+        default: '["cpu", "cpu-full", "pytorch-inf2", "pytorch-gpu", "tensorrt-llm", "lmi", "aarch64"]'
   workflow_call:
     inputs:
       mode:
@@ -27,6 +32,11 @@ on:
         description: 'specify which sha value the image aws built with.'
         required: false
         default: ''
+      arch:
+        description: 'which images to build [ cpu, cpu-full, pytorch-inf2, pytorch-gpu, tensorrt-llm, lmi, aarch64]'
+        type: string
+        required: false
+        default: '["cpu", "cpu-full", "pytorch-inf2", "pytorch-gpu", "tensorrt-llm", "lmi", "aarch64"]'
 
 permissions:
   id-token: write
@@ -38,25 +48,17 @@ env:
   ECR_REPO_REGION: "us-east-1"
 
 jobs:
-  create-aarch64-runner:
-    runs-on: [ self-hosted, scheduler ]
-    steps:
-      - name: Create new Graviton instance
-        id: create_aarch64
-        run: |
-          cd /home/ubuntu/djl_benchmark_script/scripts
-          token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
-          https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \
-          --fail \
-          | jq '.token' | tr -d '"' )
-          ./start_instance.sh action_graviton $token djl-serving
-    outputs:
-      aarch64_instance_id: ${{ steps.create_aarch64.outputs.action_graviton_instance_id }}
-
-  nightly-aarch64:
-    runs-on: [ self-hosted, aarch64 ]
+  docker-sync:
+    # An empty image list would expand to an empty matrix, which fails the run; skip instead.
+    if: ${{ inputs.arch != '[]' }}
+    runs-on: ubuntu-latest
     timeout-minutes: 60
-    needs: create-aarch64-runner
+    strategy:
+      fail-fast: false
+      matrix:
+        # inputs.arch is either a JSON array or one bare image name (e.g. cpu).
+        # A bare name is quoted and wrapped so that fromJson receives valid JSON.
+        arch: ${{ startsWith(inputs.arch, '[') && fromJson(inputs.arch) || fromJson(format('["{0}"]', inputs.arch)) }}
     steps:
       - uses: actions/checkout@v4
       - name: Clean docker env
@@ -88,11 +90,11 @@ jobs:
       - name: Pull and sync to docker hub
         working-directory: serving/docker
         run: |
-          ./scripts/push_image_from_ECR.sh $DJL_VERSION deepjavalibrary/djl-serving ${{ inputs.mode }} ${{ inputs.commit_sha }}
+          ./scripts/push_image_from_ECR.sh $DJL_VERSION deepjavalibrary/djl-serving ${{ inputs.mode }} ${{ matrix.arch }} ${{ inputs.commit_sha }}
       - name: Pull and sync to ECR
         working-directory: serving/docker
         run: |
-          ./scripts/push_image_from_ECR.sh $DJL_VERSION $AWS_STAGING_ECR_REPO ${{ inputs.mode }} ${{ inputs.commit_sha }}
+          ./scripts/push_image_from_ECR.sh $DJL_VERSION $AWS_STAGING_ECR_REPO ${{ inputs.mode }} ${{ matrix.arch }} ${{ inputs.commit_sha }}
       - name: Retag image for release latest
         if: ${{ inputs.mode == 'release' }}
         working-directory: serving/docker
@@ -103,13 +105,3 @@ jobs:
         working-directory: serving/docker
         run: |
           yes | docker system prune -a --volumes
-  stop-aarch64-runner:
-    if: always()
-    runs-on: [ self-hosted, scheduler ]
-    needs: [nightly-aarch64, create-aarch64-runner]
-    steps:
-      - name: Stop all instances
-        run: |
-          cd /home/ubuntu/djl_benchmark_script/scripts
-          instance_id=${{ needs.create-aarch64-runner.outputs.aarch64_instance_id }}
-          ./stop_instance.sh $instance_id
\ No newline at end of file
diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml
index 17781a671..9646b4c0a 100644
--- a/.github/workflows/integration.yml
+++ b/.github/workflows/integration.yml
@@ -24,6 +24,19 @@ on:
         required: false
         type: string
         default: ''
+    outputs:
+      failure_cpu:
+        value: ${{ jobs.test.outputs.failure_cpu || '0' }}
+      failure_gpu:
+        value: ${{ jobs.test.outputs.failure_gpu || '0' }}
+      failure_aarch64:
+        value: ${{ jobs.test.outputs.failure_aarch64 || '0' }}
+      failure_lmi:
+        value: ${{ jobs.test.outputs.failure_lmi || '0' }}
+      failure_trtllm:
+        value: ${{ jobs.test.outputs.failure_trtllm || '0' }}
+      failure_neuron:
+        value: ${{ jobs.test.outputs.failure_neuron || jobs.transformers-neuronx-container-unit-tests.outputs.failure || '0' }}
 
 permissions:
   id-token: write
@@ -125,45 +138,72 @@ jobs:
           - test: TestCpuFull
             instance: ubuntu-latest
             gh-runner: true
+            failure-prefix: cpu
           - test: TestCpuBoth
             instance: ubuntu-latest
             gh-runner: true
+            failure-prefix: cpu
           - test: TestGpu
             instance: g6
+            failure-prefix: gpu
           - test: TestAarch64
             instance: aarch64
+            failure-prefix: aarch64
           - test: TestHfHandler
             instance: g6
+            failure-prefix: lmi
           - test: TestTrtLlmHandler1
             instance: g6
+            failure-prefix: trtllm
           - test: TestTrtLlmHandler2
             instance: g6
+            failure-prefix: trtllm
           - test: TestSchedulerSingleGPU
             instance: g6
+            failure-prefix: lmi
           - test: TestSchedulerMultiGPU
             instance: g6
+            failure-prefix: lmi
           - test: TestLmiDist1
             instance: g6
+            failure-prefix: lmi
           - test: TestLmiDist2
             instance: g6
+            failure-prefix: lmi
           - test: TestVllm1
             instance: g6
+            failure-prefix: lmi
           - test: TestVllmLora
             instance: g6
+            failure-prefix: lmi
           - test: TestLmiDistLora
             instance: g6
+            failure-prefix: lmi
           - test: TestNeuronx1
             instance: inf2
+            failure-prefix: neuron
           - test: TestNeuronx2
             instance: inf2
+            failure-prefix: neuron
           - test: TestNeuronxRollingBatch
             instance: inf2
+            failure-prefix: neuron
           - test: TestMultiModal
             instance: g6
+            failure-prefix: lmi
           - test: TestTextEmbedding
             instance: g6
+            failure-prefix: lmi
           - test: TestLmiDistPipelineParallel
             instance: g6
+            failure-prefix: lmi
+    outputs:
+      failure_cpu: ${{ steps.test-failure.outputs.failure_cpu }}
+      failure_gpu: ${{ steps.test-failure.outputs.failure_gpu }}
+      failure_aarch64: ${{ steps.test-failure.outputs.failure_aarch64 }}
+      failure_lmi: ${{ steps.test-failure.outputs.failure_lmi }}
+      failure_trtllm: ${{ steps.test-failure.outputs.failure_trtllm }}
+      failure_neuron: ${{ steps.test-failure.outputs.failure_neuron }}
     steps:
       - uses: actions/checkout@v4
       - name: Clean env
@@ -222,6 +262,7 @@ jobs:
           rm -rf outputs
           rm awscurl
       - name: On Failure
+        id: test-failure
         if: ${{ failure() }}
         working-directory: tests/integration
         run: |
@@ -229,4 +270,7 @@ jobs:
           sudo rm -rf outputs && sudo rm -rf models
+          # Record the failure first so that an error in the cleanup below cannot hide it.
+          failure_prefix="${{ matrix.test.failure-prefix }}"
+          echo "failure_${failure_prefix}=1" >> "$GITHUB_OUTPUT"
           rm awscurl
           ./remove_container.sh
       - name: Upload test logs
@@ -245,6 +289,8 @@ jobs:
       - SHA-${{ github.sha }}
     timeout-minutes: 15
     needs: create-runners
+    outputs:
+      failure: ${{ steps.failure.outputs.failure }}
     steps:
       - uses: actions/checkout@v4
       - name: Clean env
@@ -300,10 +346,13 @@ jobs:
           # Fail on failed tests
           if grep -F "failed" logs/results.log &>/dev/null; then exit 1; fi
       - name: On fail step
+        id: failure
         if: ${{ failure() }}
         working-directory: engines/python/setup
         run: |
+          # Set the flag before printing the log so that a missing log file cannot hide the failure.
+          echo "failure=1" >> "$GITHUB_OUTPUT"
           cat logs/results.log
       - name: Upload test logs
         uses: actions/upload-artifact@v4
         with:
diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml
index 4e940e873..a608408e4 100644
--- a/.github/workflows/nightly.yml
+++ b/.github/workflows/nightly.yml
@@ -55,10 +55,48 @@ jobs:
     secrets: inherit
     with:
       tag-suffix: ${{ needs.get_image_tag_suffix.outputs.test_image_tag_suffix }}
+  determine_images_to_publish:
+    # Runs even when integration tests fail, so images whose tests passed are still published.
+    # Uses !cancelled() rather than always() so that a cancelled run does not publish.
+    if: ${{ !cancelled() }}
+    needs: [ integration-test ]
+    runs-on: ubuntu-latest
+    outputs:
+      images: ${{ steps.generate-images.outputs.images }}
+    steps:
+      - name: Generate image list from test results
+        id: generate-images
+        run: |
+          images=()
+          if [[ "${{ needs.integration-test.outputs.failure_cpu }}" == "0" ]]; then
+            images+=("cpu")
+            images+=("cpu-full")
+          fi
+          if [[ "${{ needs.integration-test.outputs.failure_gpu }}" == "0" ]]; then
+            images+=("pytorch-gpu")
+          fi
+          if [[ "${{ needs.integration-test.outputs.failure_aarch64 }}" == "0" ]]; then
+            images+=("aarch64")
+          fi
+          if [[ "${{ needs.integration-test.outputs.failure_lmi }}" == "0" ]]; then
+            images+=("lmi")
+          fi
+          if [[ "${{ needs.integration-test.outputs.failure_trtllm }}" == "0" ]]; then
+            images+=("tensorrt-llm")
+          fi
+          if [[ "${{ needs.integration-test.outputs.failure_neuron }}" == "0" ]]; then
+            images+=("pytorch-inf2")
+          fi
+          json_images=$(jq --compact-output --null-input '$ARGS.positional' --args -- "${images[@]}")
+          echo "images are ${json_images}"
+          echo "images=${json_images}" >> "$GITHUB_OUTPUT"
   publish:
-    needs: [integration-test, get_image_tag_suffix]
+    # Must not be skipped just because an upstream test job failed; the image list already encodes that.
+    if: ${{ !cancelled() }}
+    needs: [determine_images_to_publish]
     uses: ./.github/workflows/docker_publish.yml
     secrets: inherit
     with:
       mode: ${{ inputs.mode || 'nightly' }}
       commit_sha: ${{ github.sha }}
+      arch: ${{ needs.determine_images_to_publish.outputs.images }}
diff --git a/serving/docker/scripts/push_image_from_ECR.sh b/serving/docker/scripts/push_image_from_ECR.sh
index 729348a6a..55b545f65 100755
--- a/serving/docker/scripts/push_image_from_ECR.sh
+++ b/serving/docker/scripts/push_image_from_ECR.sh
@@ -16,27 +16,36 @@ fi
 version=$1
 to_repo=$2
 mode=$3
-commit_sha=${4:-$GITHUB_SHA} # Use parameter expansion for default value
+image=$4
+commit_sha=${5:-$GITHUB_SHA} # Use parameter expansion for default value
 
-images=(cpu aarch64 cpu-full pytorch-inf2 pytorch-gpu lmi tensorrt-llm)
 
 from_repo=$AWS_TMP_ECR_REPO
 
 set -x
 
-for image in "${images[@]}"; do
-  if [[ "$mode" == "release" ]]; then
-    if [[ "$image" == "cpu" ]]; then
-      tag=$version
-    else
-      tag="$version-$image"
-    fi
-  fi
-
-  if [[ "$mode" == "nightly" ]]; then
-    tag="$image-nightly"
-  fi
-  docker pull $from_repo:$image-$mode-$commit_sha
-  echo docker tag $from_repo:$image-$mode-$commit_sha $to_repo:$tag
-  echo docker push $to_repo:$tag
-done
\ No newline at end of file
+# The image name is now a required positional argument; stop early if it is missing.
+if [[ -z "$image" ]]; then
+  echo "image (4th argument) is required" >&2
+  exit 1
+fi
+
+# Map the mode to the destination tag; reject any other mode so that $tag is never unset.
+if [[ "$mode" == "release" ]]; then
+  if [[ "$image" == "cpu" ]]; then
+    tag=$version
+  else
+    tag="$version-$image"
+  fi
+elif [[ "$mode" == "nightly" ]]; then
+  tag="$image-nightly"
+else
+  echo "unsupported mode: $mode" >&2
+  exit 1
+fi
+
+# Retag locally before pushing: the destination tag must exist locally for docker push to upload it.
+source_image="$from_repo:$image-$mode-$commit_sha"
+docker pull "$source_image"
+docker tag "$source_image" "$to_repo:$tag"
+docker push "$to_repo:$tag"