Merge branch 'main' into meng_ut
chunhuanMeng authored Jan 3, 2025
2 parents 9df808a + f634c3c commit 47ac81a
Showing 26 changed files with 500 additions and 55 deletions.
4 changes: 2 additions & 2 deletions .github/ci_expected_accuracy/check_expected.py
@@ -16,14 +16,14 @@


# load csv files
- test_data= pd.read_csv(args.csv_file)
+ test_data= pd.read_csv(args.csv_file, comment='#')
# test_data = test_data.reset_index() # make sure indexes pair with number of rows
# test_data = test_data.sort_values(by=["name"], ascending=True)
test_names = [row["name"] for index, row in test_data.iterrows()]

current_path = pathlib.Path(__file__).parent.resolve()
refer_file = str(current_path) + "/" + args.category + "_" + args.suite + "_" + args.mode + ".csv"
- refer_data= pd.read_csv(refer_file)
+ refer_data= pd.read_csv(refer_file, comment='#')
# refer_data = refer_data.reset_index() # make sure indexes pair with number of rows
# refer_data = refer_data.sort_values(by=["name"], ascending=True)
refer_names = [row["name"] for index, row in refer_data.iterrows()]
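
The new comment='#' argument tells pandas to skip CSV lines that start with '#', which is what lets the expected-accuracy tables below disable individual models by commenting them out. A minimal sketch of the behavior (the CSV excerpt here is hypothetical):

import io
import pandas as pd

# Hypothetical excerpt from an expected-accuracy CSV; the '#' line is skipped.
csv_text = """name,training,inference
DistilBertForMaskedLM,pass,pass
# Skip yolov3 for a known torchbench issue
yolov3,eager_fail_to_run,eager_fail_to_run
"""

data = pd.read_csv(io.StringIO(csv_text), comment='#')
print(data["name"].tolist())  # ['DistilBertForMaskedLM', 'yolov3']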
@@ -13,7 +13,8 @@ CamemBert,pass,pass,pass,pass,pass
DebertaForMaskedLM,pass,pass,pass,pass,pass
DebertaForQuestionAnswering,pass,pass,pass,pass,pass
DebertaV2ForMaskedLM,pass_due_to_skip,pass_due_to_skip,pass_due_to_skip,pass_due_to_skip,pass_due_to_skip
- DebertaV2ForQuestionAnswering,pass,pass,pass,pass,pass
+ # Skip DebertaV2ForQuestionAnswering issue: https://github.com/intel/torch-xpu-ops/issues/1216
+ DebertaV2ForQuestionAnswering,fail_accuracy,fail_accuracy,fail_accuracy,pass,pass
DistilBertForMaskedLM,pass,pass,pass,pass,pass
DistilBertForQuestionAnswering,pass,pass,pass,pass,pass
DistillGPT2,pass,pass,pass,pass,pass
@@ -102,5 +102,6 @@ torch_multimodal_clip,pass,pass,pass,eager_fail_to_run,eager_fail_to_run
tts_angular,pass,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run
vgg16,pass,pass,pass,pass,pass
vision_maskrcnn,pass,pass,pass,eager_fail_to_run,eager_fail_to_run
- yolov3,pass,pass,pass,pass,pass
+ # Skip yolov3 for known torchbench issue: https://github.com/intel/torch-xpu-ops/issues/1229
+ yolov3,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run
hf_Roberta_base,pass,pass,pass,pass,pass
@@ -102,5 +102,6 @@ torch_multimodal_clip,pass,pass,pass,eager_fail_to_run,eager_fail_to_run
tts_angular,pass,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run
vgg16,pass,pass,pass,pass,pass
vision_maskrcnn,pass,pass,pass,eager_fail_to_run,eager_fail_to_run
- yolov3,pass,pass,pass,pass,pass
+ # Skip yolov3 for known torchbench issue: https://github.com/intel/torch-xpu-ops/issues/1229
+ yolov3,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run
hf_Roberta_base,pass,pass,pass,pass,pass
31 changes: 31 additions & 0 deletions .github/workflows/_linux_transformers.yml
@@ -51,6 +51,7 @@ jobs:
python: ${{ inputs.python != '' && inputs.python || '3.10' }}
pytorch: ${{ inputs.pytorch != '' && inputs.pytorch || 'nightly' }}
transformers: ${{ inputs.transformers != '' && inputs.transformers || 'v4.47.0' }}
+ PYTORCH_DEBUG_XPU_FALLBACK: '1'
TRANSFORMERS_TEST_DEVICE_SPEC: 'spec.py'
steps:
- name: Checkout torch-xpu-ops
@@ -112,6 +113,8 @@ jobs:
lspci -d ::0380 | tee ${{ github.workspace }}/transformers/tests_log/lspci_0380.txt
echo "GPU render nodes:"
cat /sys/class/drm/render*/device/device | tee ${{ github.workspace }}/transformers/tests_log/device_IDs.txt
echo "xpu-smi output:"
xpu-smi discovery -y --json --dump -1
- name: Sanity check installed packages
run: |
source activate huggingface_transformers_test
@@ -267,6 +270,32 @@ jobs:
echo "| $test_group | $file | $error | $comment |"
done <_failures_uniq.txt
} >> $GITHUB_STEP_SUMMARY
+ - name: Print not implemented XPU backend ops
+   run: |
+     cd transformers
+     {
+       echo "### Not implemented ops"
+       echo "| Test group | Operator | Status |"
+       echo "| --- | --- | --- |"
+       rm -rf _ops.txt && touch _ops.txt
+       for log in $(find reports -name failures_line.txt); do
+         # Each failures_line.txt is located in: reports/$test_group/failures_line.txt
+         test_group=$(echo $log | cut -f2 -d/)
+         ops=$(grep NotImplementedError $log | grep "for the XPU device" | sed "s/.*The operator '\(.*\)' is not.*/\1/")
+         for op in $ops; do
+           echo "| $test_group | <pre>$op</pre> | not implemented |" >> _ops.txt
+         done
+       done
+       for log in $(find reports -name warnings.txt); do
+         # Each warnings.txt is located in: reports/$test_group/warnings.txt
+         test_group=$(echo $log | cut -f2 -d/)
+         ops=$(grep UserWarning $log | grep "on the XPU backend" | sed "s/.*The operator '\(.*\) on the XPU.*/\1/")
+         for op in $ops; do
+           echo "| $test_group | <pre>$op</pre> | fallback to CPU happens |" >> _ops.txt
+         done
+       done
+       sort _ops.txt | uniq
+     } >> $GITHUB_STEP_SUMMARY
- name: Print annotations
if: ${{ ! cancelled() }}
run: |
@@ -313,6 +342,8 @@ jobs:
# printing annotations with key environment variables
echo "| jobs.$GITHUB_JOB.env.ZE_AFFINITY_MASK | $ZE_AFFINITY_MASK |"
echo "| jobs.$GITHUB_JOB.env.NEOReadDebugKeys | $NEOReadDebugKeys |"
echo "| jobs.$GITHUB_JOB.env.PYTORCH_ENABLE_XPU_FALLBACK | $PYTORCH_ENABLE_XPU_FALLBACK |"
echo "| jobs.$GITHUB_JOB.env.PYTORCH_DEBUG_XPU_FALLBACK | $PYTORCH_DEBUG_XPU_FALLBACK |"
} >> $GITHUB_STEP_SUMMARY
- name: Upload Test log
if: ${{ ! cancelled() }}
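
The new "Print not implemented XPU backend ops" step scrapes the per-group test reports for operators that raised NotImplementedError or emitted a CPU-fallback UserWarning (those warnings are produced when PYTORCH_DEBUG_XPU_FALLBACK=1, the env var added above). A rough Python equivalent of the two grep/sed extractions, with made-up log lines for illustration:

import re

# Made-up log lines shaped like the ones the step greps for.
lines = [
    "NotImplementedError: The operator 'aten::foo.out' is not currently implemented for the XPU device.",
    "UserWarning: The operator 'aten::bar' is not currently supported on the XPU backend and will fall back to run on the CPU.",
]

not_implemented = re.compile(r"The operator '([^']+)'.*for the XPU device")
cpu_fallback = re.compile(r"The operator '([^']+)'.*on the XPU backend")

for line in lines:
    if m := not_implemented.search(line):
        print(f"| {m.group(1)} | not implemented |")
    elif m := cpu_fallback.search(line):
        print(f"| {m.group(1)} | fallback to CPU happens |")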
2 changes: 1 addition & 1 deletion .github/workflows/nightly_ondemand.yml
@@ -156,7 +156,7 @@ jobs:
fi
echo "TORCH_BRANCH_ID=$(git rev-parse --abbrev-ref HEAD)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
echo "TORCH_COMMIT_ID=$(git rev-parse HEAD)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
echo "TORCHBENCH_COMMIT_ID=$(<third_party/torch-xpu-ops/.github/ci_commit_pins/torchbench.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
echo "TORCHBENCH_COMMIT_ID=$(<.github/ci_commit_pins/torchbench.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
echo "TORCHVISION_COMMIT_ID=$(<.github/ci_commit_pins/vision.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
echo "TORCHAUDIO_COMMIT_ID=$(<.github/ci_commit_pins/audio.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
echo "TRANSFORMERS_VERSION=$(<.ci/docker/ci_commit_pins/huggingface.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
2 changes: 1 addition & 1 deletion .github/workflows/nightly_ondemand_rolling.yml
@@ -158,7 +158,7 @@ jobs:
fi
echo "TORCH_BRANCH_ID=$(git rev-parse --abbrev-ref HEAD)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
echo "TORCH_COMMIT_ID=$(git rev-parse HEAD)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
echo "TORCHBENCH_COMMIT_ID=$(<third_party/torch-xpu-ops/.github/ci_commit_pins/torchbench.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
echo "TORCHBENCH_COMMIT_ID=$(<.github/ci_commit_pins/torchbench.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
echo "TORCHVISION_COMMIT_ID=$(<.github/ci_commit_pins/vision.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
echo "TORCHAUDIO_COMMIT_ID=$(<.github/ci_commit_pins/audio.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
echo "TRANSFORMERS_VERSION=$(<.ci/docker/ci_commit_pins/huggingface.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
2 changes: 1 addition & 1 deletion .github/workflows/nightly_ondemand_whl.yml
@@ -132,7 +132,7 @@ jobs:
echo "TORCHAUDIO_COMMIT_ID=$(python -c 'import torchaudio; print(torchaudio.version.git_version)')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
echo "TRITON_COMMIT_ID=$(python -c 'import triton; print(triton.__version__)')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
cd ../pytorch
echo "TORCHBENCH_COMMIT_ID=$(<third_party/torch-xpu-ops/.github/ci_commit_pins/torchbench.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
echo "TORCHBENCH_COMMIT_ID=$(<.github/ci_commit_pins/torchbench.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
echo "TRANSFORMERS_VERSION=$(<.ci/docker/ci_commit_pins/huggingface.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
echo "TIMM_COMMIT_ID=$(<.ci/docker/ci_commit_pins/timm.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
echo "MODEL_ONLY_NAME=${{ inputs.model }}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
6 changes: 3 additions & 3 deletions .github/workflows/pull.yml
@@ -90,7 +90,7 @@ jobs:
cd ../pytorch
echo "TRITON_COMMIT_ID=$(<.ci/docker/ci_commit_pins/triton-xpu.txt)" >> "${GITHUB_ENV}"
echo "TORCHVISION_COMMIT_ID=$(<.github/ci_commit_pins/vision.txt)" >> "${GITHUB_ENV}"
echo "TORCHBENCH_COMMIT_ID=$(<third_party/torch-xpu-ops/.github/ci_commit_pins/torchbench.txt)" >> "${GITHUB_ENV}"
echo "TORCHBENCH_COMMIT_ID=$(<.github/ci_commit_pins/torchbench.txt)" >> "${GITHUB_ENV}"
echo "TORCHAUDIO_COMMIT_ID=$(<.github/ci_commit_pins/audio.txt)" >> "${GITHUB_ENV}"
echo "TRANSFORMERS_VERSION=$(<.ci/docker/ci_commit_pins/huggingface.txt)" >> "${GITHUB_ENV}"
echo "TIMM_COMMIT_ID=$(<.ci/docker/ci_commit_pins/timm.txt)" >> "${GITHUB_ENV}"
@@ -144,9 +144,9 @@ jobs:
run: |
rm -rf ${{ github.workspace }}/upload_files
cp -r ${{ github.workspace }}/../pytorch/inductor_log ${{ github.workspace }}/upload_files
failed_case=$(grep "Real failed: models: *[1-9]" ${{ github.workspace }}/upload_files/summary_accuracy.log |wc -l || true)
failed_case=$(grep "Real failed models: *[1-9]" ${{ github.workspace }}/upload_files/summary_accuracy.log |wc -l || true)
if [ ${failed_case} -ne 0 ];then
grep -E "Real failed: models: [1-9]|Summary for" ${{ github.workspace }}/summary_accuracy.log
grep -E "Real failed models: [1-9]|Summary for" ${{ github.workspace }}/upload_files/summary_accuracy.log
exit 1
fi
- name: Upload Inductor XPU E2E Data
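
The corrected pattern matches summary lines of the form "Real failed models: N"; the stray colon in the old pattern ("Real failed: models:") did not match that format, so real failures could slip through unreported, and the second grep now also reads the copied log under upload_files. A sketch of the same gate in Python, assuming that summary format:

import re
import sys

# Assumed shape of summary_accuracy.log lines.
summary_lines = [
    "Summary for huggingface bfloat16 inference accuracy",
    "Real failed models: 0",
    "Summary for torchbench bfloat16 training accuracy",
    "Real failed models: 2",
]

failed = [ln for ln in summary_lines if re.search(r"Real failed models: *[1-9]", ln)]
if failed:
    print("\n".join(failed))
    sys.exit(1)  # mirror the workflow's exit 1 so the job fails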
11 changes: 11 additions & 0 deletions src/ATen/native/xpu/SoftMax.cpp
@@ -76,6 +76,17 @@ TORCH_IMPL_FUNC(log_softmax_xpu_out)
xpu::_log_softmax_kernel(input, dim, half_to_float, output);
}

+ Tensor _safe_softmax_xpu(
+     const Tensor& self,
+     int64_t dim,
+     std::optional<ScalarType> dtype) {
+   // TODO: uncomment after XPU softmax support half_to_float=true
+   // if (self.scalar_type() == ScalarType::Half && dtype == ScalarType::Float)
+   //   return xpu::_safe_softmax_kernel(self, dim_, true);
+   Tensor converted = dtype.has_value() ? self.toType(dtype.value()) : self;
+   return xpu::_safe_softmax_kernel(converted, dim, false);
+ }

Tensor masked_softmax_xpu(
const Tensor& input_,
const Tensor& mask_,
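
The point of a "safe" softmax is how fully-masked rows are treated: a row that is entirely -inf yields NaN under a plain softmax (0/0), while the safe variant returns zeros for it, which is what attention code expects for fully-masked queries. A reference sketch of that semantic in Python (this models the intended behavior, not the XPU kernel itself):

import torch

def safe_softmax_reference(x: torch.Tensor, dim: int) -> torch.Tensor:
    # A plain softmax maps an all -inf row to NaN; the "safe" variant
    # zeroes such rows instead.
    out = torch.softmax(x, dim)
    fully_masked = x.eq(float("-inf")).all(dim, keepdim=True)
    return torch.where(fully_masked, torch.zeros_like(out), out)

x = torch.tensor([[0.0, 1.0], [float("-inf"), float("-inf")]])
print(safe_softmax_reference(x, dim=-1))
# tensor([[0.2689, 0.7311],
#         [0.0000, 0.0000]])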
29 changes: 10 additions & 19 deletions src/ATen/native/xpu/sycl/BatchNormKernels.cpp
@@ -1287,7 +1287,7 @@ struct BatchNormTransformInputVectorizedKernelFunctor {
} else {
invstd =
static_cast<stat_accscalar_t>(1) /
-             device_sqrt(
+             std::sqrt(
static_cast<stat_accscalar_t>(var_or_invstd_[plane]) + epsilon_);
}

@@ -1302,25 +1302,16 @@
for (index_t feature_vec_begin = item.get_local_id(1) * VEC_SIZE;
feature_vec_begin < fs;
feature_vec_begin += VEC_SIZE * item.get_local_range(1)) {
-       auto remaining = fs - feature_vec_begin;
-       if (remaining < VEC_SIZE) {
-         for (index_t idx = 0; idx < remaining; ++idx) {
-           index_t feature = feature_vec_begin + idx;
-           o[feature] = static_cast<input_scalar_t>(
-               gamma * (i[feature] - mean) * invstd + beta);
-         }
-       } else {
-         using vec_t = memory::aligned_vector<input_scalar_t, VEC_SIZE>;
-         vec_t vec;
+       using vec_t = memory::aligned_vector<input_scalar_t, VEC_SIZE>;
+       vec_t vec;
#pragma unroll
-         for (int vt = 0; vt < VEC_SIZE; ++vt) {
-           index_t feature = feature_vec_begin + vt;
-           vec[vt] = static_cast<input_scalar_t>(
-               gamma * (i[feature] - mean) * invstd + beta);
-         }
-         input_scalar_t* write_ptr = &o[feature_vec_begin];
-         *(reinterpret_cast<vec_t*>(write_ptr)) = vec;
-       }
+       for (int vt = 0; vt < VEC_SIZE; ++vt) {
+         index_t feature = feature_vec_begin + vt;
+         vec[vt] = static_cast<input_scalar_t>(
+             gamma * (i[feature] - mean) * invstd + beta);
+       }
+       input_scalar_t* write_ptr = &o[feature_vec_begin];
+       *(reinterpret_cast<vec_t*>(write_ptr)) = vec;
}
}
}
@@ -1459,7 +1450,7 @@ void batch_norm_elemt_template(
auto output_ptr = (char*)output_reshaped.data_ptr();
if (output_reshaped.is_contiguous() &&
memory::can_vectorize_up_to<input_scalar_t>(output_ptr) >= 4 &&
-       sizeof(input_scalar_t) < sizeof(float)) {
+       sizeof(input_scalar_t) < sizeof(float) && input.size(2) % 4 == 0) {
auto kfn = BatchNormTransformInputVectorizedKernelFunctor<
4,
input_scalar_t,
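
Dropping the scalar tail loop is only safe because the vectorized kernel is now dispatched solely when the innermost dimension is a multiple of the vector width (the new input.size(2) % 4 == 0 condition), so every 4-wide vec_t store stays in bounds. A small Python model of that guard (names are illustrative):

VEC_SIZE = 4  # the kernel's vector width

def every_chunk_in_bounds(fs: int) -> bool:
    # Each vectorized store writes [begin, begin + VEC_SIZE); when
    # fs % VEC_SIZE == 0, no chunk can run past the end of the row.
    return all(begin + VEC_SIZE <= fs for begin in range(0, fs, VEC_SIZE))

print(every_chunk_in_bounds(128))  # True:  eligible for the vectorized kernel
print(every_chunk_in_bounds(130))  # False: the last chunk would overrun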
