diff --git a/.github/workflows/job_gpu_tests.yml b/.github/workflows/job_gpu_tests.yml new file mode 100644 index 00000000000000..7ba71afec09748 --- /dev/null +++ b/.github/workflows/job_gpu_tests.yml @@ -0,0 +1,134 @@ +name: GPU + +on: + workflow_call: + inputs: + test_type: + description: 'Type of tests to execute' + type: string + required: true + device: + description: 'Device name (igpu or dgpu)' + type: string + required: true + runner: + description: 'Runner labels by which the runner will be chosen. Example: [ "self-hosted", "igpu" ]' + type: string + required: true + container: + description: 'JSON to be converted to the value of the "container" configuration for the job' + type: string + required: false + default: '{"image": null}' + +jobs: + GPU: + timeout-minutes: 80 + runs-on: ${{ fromJSON(inputs.runner) }} + container: ${{ fromJSON(inputs.container) }} + defaults: + run: + shell: bash + env: + DEBIAN_FRONTEND: noninteractive # to prevent apt-get from waiting user input + INSTALL_DIR: ${{ github.workspace }}/install + INSTALL_TEST_DIR: ${{ github.workspace }}/install/tests + GTEST_PARALLEL_SCRIPT: ${{ github.workspace }}/gtest_parallel.py + steps: + - name: Download OpenVINO package + uses: actions/download-artifact@65a9edc5881444af0b9093a5e628f2fe47ea3b2e # v4.1.7 + with: + name: 'openvino_package' + path: ${{ env.INSTALL_DIR }} + + - name: Download OpenVINO tests package + uses: actions/download-artifact@65a9edc5881444af0b9093a5e628f2fe47ea3b2e # v4.1.7 + with: + name: 'openvino_tests' + path: ${{ env.INSTALL_TEST_DIR }} + + # Needed as ${{ github.workspace }} is not working correctly when using Docker + - name: Setup Variables + run: | + echo "INSTALL_DIR=$GITHUB_WORKSPACE/install" >> "$GITHUB_ENV" + echo "INSTALL_TEST_DIR=$GITHUB_WORKSPACE/install/tests" >> "$GITHUB_ENV" + echo "GTEST_PARALLEL_SCRIPT=$GITHUB_WORKSPACE/gtest_parallel.py" >> "$GITHUB_ENV" + + - name: Extract OpenVINO packages + run: | + pushd $INSTALL_DIR + tar -xzf openvino_package.tar.gz -C $INSTALL_DIR + popd + pushd $INSTALL_TEST_DIR + tar -xzf openvino_tests.tar.gz -C $INSTALL_DIR + popd + + - name: Install dependencies (Linux) + run: | + $INSTALL_DIR/install_dependencies/install_openvino_dependencies.sh -c=core -c=dev -c=gpu -y + + apt-get update && apt-get install -y wget software-properties-common ca-certificates gpg-agent tzdata clinfo + env: + DEBIAN_FRONTEND: noninteractive # to prevent apt-get from waiting user input + TZ: "Europe/London" # to prevent tzdata from waiting user input + - name: Setup Python ${{ env.PYTHON_VERSION }} + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 + with: + python-version: ${{ env.PYTHON_VERSION }} + + - name: Get gtest-parallel script + run: wget https://raw.githubusercontent.com/google/gtest-parallel/master/gtest_parallel.py + + - name: Install compute runtime drivers + run: | + wget https://github.com/intel/intel-graphics-compiler/releases/download/igc-1.0.15985.7/intel-igc-core_1.0.15985.7_amd64.deb + wget https://github.com/intel/intel-graphics-compiler/releases/download/igc-1.0.15985.7/intel-igc-opencl_1.0.15985.7_amd64.deb + wget https://github.com/intel/compute-runtime/releases/download/24.05.28454.6/intel-level-zero-gpu-dbgsym_1.3.28454.6_amd64.ddeb + wget https://github.com/intel/compute-runtime/releases/download/24.05.28454.6/intel-level-zero-gpu_1.3.28454.6_amd64.deb + wget https://github.com/intel/compute-runtime/releases/download/24.05.28454.6/intel-opencl-icd-dbgsym_24.05.28454.6_amd64.ddeb + wget 
https://github.com/intel/compute-runtime/releases/download/24.05.28454.6/intel-opencl-icd_24.05.28454.6_amd64.deb + wget https://github.com/intel/compute-runtime/releases/download/24.05.28454.6/libigdgmm12_22.3.11_amd64.deb + dpkg -i *.deb + + - name: Install media & display runtimes + if: ${{ inputs.device == 'dgpu' }} + run: | + apt-get update && apt-get install -y \ + libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \ + libglapi-mesa libgles2-mesa-dev libglx-mesa0 libigdgmm11 libxatracker2 mesa-va-drivers \ + mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all + + - name: Verify devices + run: clinfo + + # + # Tests + # + + - name: OpenVINO GPU ${{ inputs.test_type }} Tests + id: run_tests + run: | + source ${INSTALL_DIR}/setupvars.sh + + TEST_RESULTS_DIR="${{ inputs.device }}_${{ inputs.test_type }}_tests" + echo "test_results_dir=$TEST_RESULTS_DIR" >> $GITHUB_OUTPUT + + rm -rf ${INSTALL_TEST_DIR}/${TEST_RESULTS_DIR} && mkdir -p ${INSTALL_TEST_DIR}/${TEST_RESULTS_DIR} + + test_filter='' + if [[ "${{ inputs.test_type }}" == "unit" ]]; then + # Ticket: 138018 + test_filter='-*scatter_nd_update_gpu.dynamic_padded_output*:*border_gpu.basic_zero_input*:*bicubic_zeros_no_align_data1x1*:*bicubic_border_align_batches*:*bilinear_zeros_no_align_data1x1*:*non_zero_gpu.empty_input*:*mark_shape_of_subgraphs.concat_with_empty_tensor_inputs*:*concat_cpu_impl.dynamic_4d_f*:*border_gpu.basic_zero_input_dynamic*:*network_test.model_with_empty_input_is_not_dynamic*:*bicubic_zeros_align_data1x1*' + else + test_filter='*smoke*' + fi + python3 ${GTEST_PARALLEL_SCRIPT} ${INSTALL_TEST_DIR}/ov_gpu_${{ inputs.test_type }}_tests --dump_json_test_results=${INSTALL_TEST_DIR}/${TEST_RESULTS_DIR}/ov_gpu_${{ inputs.test_type }}_tests.json -- --report_unique_name --gtest_filter=$test_filter + + + - name: Upload Test Results + uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808 # v4.3.3 + if: always() + with: + name: test-results-${{ inputs.test_type }}-${{ inputs.device }} + path: ${{ env.INSTALL_TEST_DIR }}/${{ steps.run_tests.outputs.test_results_dir }} + if-no-files-found: 'error' diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index 462e30e44103b8..744e693b1cff51 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -652,119 +652,44 @@ jobs: affected-components: ${{ needs.smart_ci.outputs.affected_components }} if: fromJSON(needs.smart_ci.outputs.affected_components).TOKENIZERS - GPU: - name: GPU Tests + iGPU: + name: iGPU Tests needs: [ Build, Smart_CI ] - if: fromJSON(needs.smart_ci.outputs.affected_components).GPU - timeout-minutes: 80 - runs-on: [ self-hosted, gpu ] + uses: ./.github/workflows/job_gpu_tests.yml strategy: max-parallel: 2 fail-fast: false matrix: TEST_TYPE: ['unit', 'func'] - container: - image: ubuntu:20.04 - options: --device /dev/dri:/dev/dri --group-add 109 --group-add 44 - volumes: - - /dev/dri:/dev/dri - defaults: - run: - shell: bash - env: - DEBIAN_FRONTEND: noninteractive # to prevent apt-get from waiting user input - INSTALL_DIR: ${{ github.workspace }}/install - INSTALL_TEST_DIR: ${{ github.workspace }}/install/tests - GTEST_PARALLEL_SCRIPT: ${{ github.workspace }}/gtest_parallel.py - steps: - - name: Download OpenVINO package - uses: actions/download-artifact@65a9edc5881444af0b9093a5e628f2fe47ea3b2e # v4.1.7 - with: - name: 'openvino_package' - path: ${{ env.INSTALL_DIR }} - - - name: Download OpenVINO tests package - uses: 
actions/download-artifact@65a9edc5881444af0b9093a5e628f2fe47ea3b2e # v4.1.7 - with: - name: 'openvino_tests' - path: ${{ env.INSTALL_TEST_DIR }} - - # Needed as ${{ github.workspace }} is not working correctly when using Docker - - name: Setup Variables - run: | - echo "INSTALL_DIR=$GITHUB_WORKSPACE/install" >> "$GITHUB_ENV" - echo "INSTALL_TEST_DIR=$GITHUB_WORKSPACE/install/tests" >> "$GITHUB_ENV" - echo "GTEST_PARALLEL_SCRIPT=$GITHUB_WORKSPACE/gtest_parallel.py" >> "$GITHUB_ENV" - - - name: Extract OpenVINO packages - run: | - pushd $INSTALL_DIR - tar -xzf openvino_package.tar.gz -C $INSTALL_DIR - popd - pushd $INSTALL_TEST_DIR - tar -xzf openvino_tests.tar.gz -C $INSTALL_DIR - popd - - - name: Install dependencies (Linux) - run: | - $INSTALL_DIR/install_dependencies/install_openvino_dependencies.sh -c=core -c=dev -c=gpu -y - - apt-get update && apt-get install -y wget software-properties-common ca-certificates gpg-agent tzdata - env: - DEBIAN_FRONTEND: noninteractive # to prevent apt-get from waiting user input - TZ: "Europe/London" # to prevent tzdata from waiting user input - - - name: Setup Python ${{ env.PYTHON_VERSION }} - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 - with: - python-version: ${{ env.PYTHON_VERSION }} - - - name: Get gtest-parallel script - run: wget https://raw.githubusercontent.com/google/gtest-parallel/master/gtest_parallel.py - - - name: Install GPU Drivers - run: | - wget https://github.com/intel/intel-graphics-compiler/releases/download/igc-1.0.15985.7/intel-igc-core_1.0.15985.7_amd64.deb - wget https://github.com/intel/intel-graphics-compiler/releases/download/igc-1.0.15985.7/intel-igc-opencl_1.0.15985.7_amd64.deb - wget https://github.com/intel/compute-runtime/releases/download/24.05.28454.6/intel-level-zero-gpu-dbgsym_1.3.28454.6_amd64.ddeb - wget https://github.com/intel/compute-runtime/releases/download/24.05.28454.6/intel-level-zero-gpu_1.3.28454.6_amd64.deb - wget https://github.com/intel/compute-runtime/releases/download/24.05.28454.6/intel-opencl-icd-dbgsym_24.05.28454.6_amd64.ddeb - wget https://github.com/intel/compute-runtime/releases/download/24.05.28454.6/intel-opencl-icd_24.05.28454.6_amd64.deb - wget https://github.com/intel/compute-runtime/releases/download/24.05.28454.6/libigdgmm12_22.3.11_amd64.deb - dpkg -i *.deb - - # - # Tests - # - - - name: OpenVINO GPU ${{ matrix.TEST_TYPE }} Tests - run: | - source ${INSTALL_DIR}/setupvars.sh - - rm -rf ${INSTALL_TEST_DIR}/gpu_${{ matrix.TEST_TYPE }}_tests && mkdir -p ${INSTALL_TEST_DIR}/gpu_${{ matrix.TEST_TYPE }}_tests - - test_filter='' - if [[ "${{ matrix.TEST_TYPE }}" == "unit" ]]; then - # Ticket: 138018 - test_filter='-*scatter_nd_update_gpu.dynamic_padded_output*:*border_gpu.basic_zero_input*:*bicubic_zeros_no_align_data1x1*:*bicubic_border_align_batches*:*bilinear_zeros_no_align_data1x1*:*non_zero_gpu.empty_input*:*mark_shape_of_subgraphs.concat_with_empty_tensor_inputs*:*concat_cpu_impl.dynamic_4d_f*:*border_gpu.basic_zero_input_dynamic*:*network_test.model_with_empty_input_is_not_dynamic*:*bicubic_zeros_align_data1x1*' - else - test_filter='*smoke*' - fi - python3 ${GTEST_PARALLEL_SCRIPT} ${INSTALL_TEST_DIR}/ov_gpu_${{ matrix.TEST_TYPE }}_tests --dump_json_test_results=${INSTALL_TEST_DIR}/gpu_${{ matrix.TEST_TYPE }}_tests/ov_gpu_${{ matrix.TEST_TYPE }}_tests.json -- --report_unique_name --gtest_filter=$test_filter - + with: + device: 'igpu' + test_type: ${{ matrix.TEST_TYPE }} + runner: "[ 'self-hosted', 'igpu' ]" + container: '{"image": "ubuntu:20.04", 
"volumes": ["/dev/dri:/dev/dri"], "options": "--group-add 109 --group-add 44 + --device /dev/dri:/dev/dri"}' + if: fromJSON(needs.smart_ci.outputs.affected_components).GPU - - name: Upload Test Results - uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808 # v4.3.3 - if: always() - with: - name: test-results-${{ matrix.TEST_TYPE }}-gpu - path: ${{ env.INSTALL_TEST_DIR }}/gpu_${{ matrix.TEST_TYPE }}_tests - if-no-files-found: 'error' + dGPU: + name: dGPU Tests + needs: [ Build, Smart_CI ] + uses: ./.github/workflows/job_gpu_tests.yml + strategy: + max-parallel: 2 + fail-fast: false + matrix: + TEST_TYPE: ['unit', 'func'] + with: + device: 'dgpu' + test_type: ${{ matrix.TEST_TYPE }} + runner: "[ 'self-hosted', 'dgpu' ]" + container: '{"image": "ubuntu:20.04", "volumes": ["/dev/dri:/dev/dri"], "options": "--group-add 109 --group-add 44 + --device /dev/dri/card0:/dev/dri/card0 --device /dev/dri/renderD128:/dev/dri/renderD128"}' + if: ${{ github.event_name == 'schedule' }} Overall_Status: name: ci/gha_overall_status needs: [Smart_CI, Build, Debian_Packages, Samples, Conformance, ONNX_Runtime, CXX_Unit_Tests, Python_Unit_Tests, TensorFlow_Layer_Tests, - CPU_Functional_Tests, TensorFlow_Models_Tests_Precommit, PyTorch_Models_Tests, NVIDIA_Plugin, Openvino_tokenizers, GPU] + CPU_Functional_Tests, TensorFlow_Models_Tests_Precommit, PyTorch_Models_Tests, NVIDIA_Plugin, Openvino_tokenizers, iGPU] if: ${{ always() }} runs-on: ubuntu-latest steps: diff --git a/docs/articles_en/assets/snippets/multi_threading.py b/docs/articles_en/assets/snippets/multi_threading.py index 9a5baa1e7575b1..6994b26a0d6552 100644 --- a/docs/articles_en/assets/snippets/multi_threading.py +++ b/docs/articles_en/assets/snippets/multi_threading.py @@ -37,7 +37,7 @@ # ! [ov:intel_cpu:multi_threading:part0] # ! [ov:intel_cpu:multi_threading:part1] -# Disable CPU threads pinning for inference when system supoprt it +# Disable CPU threads pinning for inference when the system supports it compiled_model_4 = core.compile_model( model=model, device_name=device_name, diff --git a/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/cpu-device.rst b/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/cpu-device.rst index b45ff8140031e6..d95f97959f5b2a 100644 --- a/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/cpu-device.rst +++ b/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/cpu-device.rst @@ -3,7 +3,11 @@ CPU Device ========== +.. toctree:: + :maxdepth: 1 + :hidden: + cpu-device/performance-hint-and-threads-scheduling .. meta:: :description: The CPU plugin in the Intel® Distribution of OpenVINO™ toolkit @@ -246,12 +250,6 @@ For more details, see the :doc:`optimization guide <../optimize-inference>` and on data transfer between NUMA nodes. In that case it is better to use the ``ov::hint::PerformanceMode::LATENCY`` performance hint. For more details see the :doc:`performance hints <../optimize-inference/high-level-performance-hints>` overview. - .. 
toctree:: - :maxdepth: 1 - :hidden: - - cpu-device/performance-hint-and-threads-scheduling - Dynamic Shapes +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ diff --git a/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/cpu-device/performance-hint-and-threads-scheduling.rst b/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/cpu-device/performance-hint-and-threads-scheduling.rst index 93c8c0bd6b36c7..3087bcf2d95783 100644 --- a/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/cpu-device/performance-hint-and-threads-scheduling.rst +++ b/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/cpu-device/performance-hint-and-threads-scheduling.rst @@ -1,6 +1,5 @@ -.. {#openvino_docs_OV_UG_supported_plugins_CPU_Hints_Threading} -Performance Hints and Threads Scheduling +Performance Hints and Threads Scheduling ======================================== .. meta:: @@ -8,37 +7,46 @@ Performance Hints and Threads Scheduling detects CPU architecture and sets low-level properties based on performance hints automatically. -While all supported devices in OpenVINO offer low-level performance settings, it is advisable not to widely use these settings unless targeting specific platforms and models. The recommended approach is configuring performance in OpenVINO Runtime using high-level performance hints property ``ov::hint::performance_mode``. Performance hints ensure optimal portability and scalability of the applications across various platforms and models. - -To simplify the configuration of hardware devices, OpenVINO offers two performance hints: the latency hint ``ov::hint::PerformanceMode::LATENCY`` and the throughput hint ``ov::hint::PerformanceMode::THROUGHPUT``. - -- ``ov::inference_num_threads`` limits number of logical processors used for CPU inference. - If the number set by the user is greater than the number of logical processors on the platform, multi-threading scheduler only uses the platform number for CPU inference. -- ``ov::num_streams`` limits number of infer requests that can be run in parallel. - If the number set by the user is greater than the number of inference threads, multi-threading scheduler only uses the number of inference threads to ensure that there is at least one thread per stream. -- ``ov::hint::scheduling_core_type`` limits the type of CPU cores for CPU inference when user runs inference on a hybird platform that includes both Performance-cores (P-cores) with Efficient-cores (E-cores). - If user platform only has one type of CPU cores, this property has no effect, and CPU inference always uses this unique core type. -- ``ov::hint::enable_hyper_threading`` limits the use of one or two logical processors per CPU core when platform has CPU hyperthreading enabled. +While all supported devices in OpenVINO offer low-level performance settings, it is advisable +not to use these settings widely unless targeting specific platforms and models. The recommended +approach is to configure performance in OpenVINO Runtime using the high-level performance hints +property ``ov::hint::performance_mode``. Performance hints ensure optimal portability and +scalability of applications across various platforms and models. + +To simplify the configuration of hardware devices, OpenVINO offers two performance hints: the +latency hint ``ov::hint::PerformanceMode::LATENCY`` and the throughput hint +``ov::hint::PerformanceMode::THROUGHPUT``. 
+ +- ``ov::inference_num_threads`` limits the number of logical processors used for CPU inference. + If the number set by the user is greater than the number of logical processors on the platform, + the multi-threading scheduler only uses the platform number for CPU inference. +- ``ov::num_streams`` limits the number of infer requests that can be run in parallel. + If the number set by the user is greater than the number of inference threads, the multi-threading + scheduler only uses the number of inference threads to ensure that there is at least one thread per stream. +- ``ov::hint::scheduling_core_type`` specifies the type of CPU cores for CPU inference when the user runs + inference on a hybrid platform that includes both Performance-cores (P-cores) and Efficient-cores (E-cores). + If the user platform only has one type of CPU core, this property has no effect, and CPU inference always uses this unique core type. +- ``ov::hint::enable_hyper_threading`` limits the use of one or two logical processors per CPU + core when the platform has CPU hyperthreading enabled. If there is only one logical processor per CPU core, such as Efficient-cores, this property has no effect, and CPU inference uses all logical processors. -- ``ov::hint::enable_cpu_pinning`` enable CPU pinning during CPU inference. - If user enable this property but inference scenario cannot support it, this property will be disabled during model compilation. - -For additional details on the above configurations, refer to: +- ``ov::hint::enable_cpu_pinning`` enables CPU pinning during CPU inference. + If the user enables this property but the inference scenario does not support it, this property will be disabled during model compilation. -- `Multi-stream Execution `__ +For additional details on the above configurations, refer to `Multi-stream Execution `__. Latency Hint ################################### -In this scenario, the default setting of ``ov::hint::scheduling_core_type`` is determined by the model precision and the ratio of P-cores and E-cores. +In this scenario, the default setting of ``ov::hint::scheduling_core_type`` is determined by +the model precision and the ratio of P-cores and E-cores. .. note:: - P-cores is short for Performance-cores and E-cores is for Efficient-cores. These are available after 12th Gen Intel® Core™ Processor. + P-cores is short for Performance-cores and E-cores stands for Efficient-cores. These types of cores are available starting with the 12th Gen Intel® Core™ processors. .. _Core Type Table of Latency Hint: +----------------------------+---------------------+---------------------+ -| | INT8 model | FP32 model | +| | INT8 Model | FP32 Model | +============================+=====================+=====================+ | E-cores / P-cores < 2 | P-cores | P-cores | +----------------------------+---------------------+---------------------+ @@ -49,38 +57,39 @@ In this scenario, the default setting of ``ov::hint::scheduling_core_type`` is d .. note:: - Both P-cores and E-cores may be used for any configuration starting from 14th Gen Intel® Core™ Processor on Windows. + Both P-cores and E-cores may be used for any configuration starting with 14th Gen Intel® Core™ processors on Windows. 
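As a hedged example of the core-type selection described above, the sketch below pins inference to P-cores only and reads the effective value back; it assumes the ``SchedulingCoreType`` enum exposed by ``openvino.properties.hint`` and uses a placeholder model path.

.. code-block:: python

   import openvino as ov
   import openvino.properties.hint as hints

   core = ov.Core()
   model = core.read_model("model.xml")  # placeholder path

   compiled = core.compile_model(
       model,
       "CPU",
       {
           hints.performance_mode: hints.PerformanceMode.LATENCY,
           # Override the default core-type choice from the table above
           hints.scheduling_core_type: hints.SchedulingCoreType.PCORE_ONLY,
       },
   )

   # The plugin may still adjust the value based on internal heuristics
   print(compiled.get_property(hints.scheduling_core_type))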
-Then the default settings of low-level performance properties on Windows and Linux are as follows: +Then the default settings for low-level performance properties on Windows and Linux are as follows: -+--------------------------------------+----------------------------------------------------------------+----------------------------------------------------------------+ -| Property | Windows | Linux | -+======================================+================================================================+================================================================+ -| ``ov::num_streams`` | 1 | 1 | -+--------------------------------------+----------------------------------------------------------------+----------------------------------------------------------------+ -| ``ov::inference_num_threads`` | is equal to number of P-cores or P-cores+E-cores on one socket | is equal to number of P-cores or P-cores+E-cores on one socket | -+--------------------------------------+----------------------------------------------------------------+----------------------------------------------------------------+ -| ``ov::hint::scheduling_core_type`` | `Core Type Table of Latency Hint`_ | `Core Type Table of Latency Hint`_ | -+--------------------------------------+----------------------------------------------------------------+----------------------------------------------------------------+ -| ``ov::hint::enable_hyper_threading`` | No | No | -+--------------------------------------+----------------------------------------------------------------+----------------------------------------------------------------+ -| ``ov::hint::enable_cpu_pinning`` | No / Not Supported | Yes except using P-cores and E-cores together | -+--------------------------------------+----------------------------------------------------------------+----------------------------------------------------------------+ ++--------------------------------------+------------------------------------------------------------------------+--------------------------------------------------------------------+ +| Property | Windows | Linux | ++======================================+========================================================================+====================================================================+ +| ``ov::num_streams`` | 1 | 1 | ++--------------------------------------+------------------------------------------------------------------------+--------------------------------------------------------------------+ +| ``ov::inference_num_threads`` | is equal to the number of P-cores or P-cores+E-cores on one socket | is equal to the number of P-cores or P-cores+E-cores on one socket | ++--------------------------------------+------------------------------------------------------------------------+--------------------------------------------------------------------+ +| ``ov::hint::scheduling_core_type`` | `Core Type Table of Latency Hint`_ | `Core Type Table of Latency Hint`_ | ++--------------------------------------+------------------------------------------------------------------------+--------------------------------------------------------------------+ +| ``ov::hint::enable_hyper_threading`` | No | No | ++--------------------------------------+------------------------------------------------------------------------+--------------------------------------------------------------------+ +| ``ov::hint::enable_cpu_pinning`` | No / Not Supported | Yes except using P-cores and E-cores together | 
++--------------------------------------+------------------------------------------------------------------------+--------------------------------------------------------------------+ .. note:: - - ``ov::hint::scheduling_core_type`` might be adjusted for particular inferred model on particular platform based on internal heuristics to guarantee best performance. + - ``ov::hint::scheduling_core_type`` may be adjusted for a particular inferred model on a specific platform based on internal heuristics to guarantee optimal performance. - Both P-cores and E-cores are used for the Latency Hint on Intel® Core™ Ultra Processors on Windows, except in the case of large language models. - - In case hyper-threading is enabled, two logical processors share hardware resource of one CPU core. OpenVINO do not expect to use both logical processors in one stream for one infer request. So ``ov::hint::enable_hyper_threading`` is ``No`` in this scenario. - - ``ov::hint::enable_cpu_pinning`` is disabled by default on Windows/Mac, and enabled on Linux. Such default settings are aligned with typical workloads running in corresponding environment to guarantee better OOB performance. + - In case hyper-threading is enabled, two logical processors share the hardware resources of one CPU core. OpenVINO does not expect to use both logical processors in one stream for a single infer request. So ``ov::hint::enable_hyper_threading`` is set to ``No`` in this scenario. + - ``ov::hint::enable_cpu_pinning`` is disabled by default on Windows and macOS, and enabled on Linux. Such default settings are aligned with typical workloads running in the corresponding environments to guarantee better out-of-the-box (OOB) performance. Throughput Hint ###################################### -In this scenario, thread scheduling first evaluates the memory pressure of the model being inferred on the current platform, and determines the number of threads per stream, as shown below. +In this scenario, thread scheduling first evaluates the memory pressure of the model being +inferred on the current platform, and determines the number of threads per stream, as shown below. +-----------------+-----------------------+ -| Memory Pressure | Threads per stream | +| Memory Pressure | Threads per Stream | +=================+=======================+ | low | 1 P-core or 2 E-cores | +-----------------+-----------------------+ @@ -89,12 +98,13 @@ In this scenario, thread scheduling first evaluates the memory pressure of the m | high | 3 or 4 or 5 | +-----------------+-----------------------+ -Then the value of ``ov::num_streams`` is calculated as ``ov::inference_num_threads`` divided by the number of threads per stream. The default settings of low-level performance properties on Windows and Linux are as follows: +Then the value of ``ov::num_streams`` is calculated by dividing ``ov::inference_num_threads`` +by the number of threads per stream. 
The default settings for low-level performance properties on Windows and Linux are as follows: +--------------------------------------+-------------------------------+-------------------------------+ | Property | Windows | Linux | +======================================+===============================+===============================+ -| ``ov::num_streams`` | Calculate as above | Calculate as above | +| ``ov::num_streams`` | Calculated as above | Calculated as above | +--------------------------------------+-------------------------------+-------------------------------+ | ``ov::inference_num_threads`` | Number of P-cores and E-cores | Number of P-cores and E-cores | +--------------------------------------+-------------------------------+-------------------------------+ @@ -107,16 +117,17 @@ Then the value of ``ov::num_streams`` is calculated as ``ov::inference_num_threa .. note:: - - By default, different core types are not mixed within single stream in this scenario. And cores from different numa nodes are not mixed within single stream. + - By default, different core types are not mixed within a single stream in this scenario. The cores from different NUMA nodes are not mixed within a single stream. Multi-Threading Optimization ############################################## -User can use the following properties to limit available CPU resource for model inference. If the platform or operating system can support this behavior, OpenVINO Runtime will perform multi-threading scheduling based on limited available CPU. +The following properties can be used to limit the available CPU resources for model inference. +If the platform or operating system supports this behavior, the OpenVINO Runtime will perform multi-threading scheduling based on the limited available CPU. -- ``ov::inference_num_threads`` -- ``ov::hint::scheduling_core_type`` -- ``ov::hint::enable_hyper_threading`` +- ``ov::inference_num_threads`` +- ``ov::hint::scheduling_core_type`` +- ``ov::hint::enable_hyper_threading`` .. tab-set:: @@ -137,9 +148,11 @@ User can use the following properties to limit available CPU resource for model .. note:: - ``ov::hint::scheduling_core_type`` and ``ov::hint::enable_hyper_threading`` only support Intel® x86-64 CPU on Linux and Windows in current release. + ``ov::hint::scheduling_core_type`` and ``ov::hint::enable_hyper_threading`` only support Intel® x86-64 CPU on Linux and Windows in the current release. -In some use cases, OpenVINO Runtime will enable CPU threads pinning by default for better performance. User can also turn it on or off using property ``ov::hint::enable_cpu_pinning``. Disable threads pinning might be beneficial in complex applications with several workloads executed in parallel. +In some use cases, OpenVINO Runtime will enable CPU thread pinning by default for better performance. +Users can also turn this feature on or off using the property ``ov::hint::enable_cpu_pinning``. +Disabling thread pinning may be beneficial in complex applications where several workloads are executed in parallel. .. 
tab-set:: diff --git a/docs/openvino_sphinx_theme/openvino_sphinx_theme/static/css/openvino_sphinx_theme.css b/docs/openvino_sphinx_theme/openvino_sphinx_theme/static/css/openvino_sphinx_theme.css index b180a5a096eaf3..8c038c795542e6 100644 --- a/docs/openvino_sphinx_theme/openvino_sphinx_theme/static/css/openvino_sphinx_theme.css +++ b/docs/openvino_sphinx_theme/openvino_sphinx_theme/static/css/openvino_sphinx_theme.css @@ -55,13 +55,6 @@ body { border-color: rgb(var(--ost-color-primary)); } -/* Scrollbox Extension */ - -.scrollbox { - overflow-y:scroll; - height:300px; - margin-bottom: 20px; -} /* Syntax Highlighting */ diff --git a/docs/sphinx_setup/_static/css/custom.css b/docs/sphinx_setup/_static/css/custom.css index a9536c7aa05401..57e2b35a395e06 100644 --- a/docs/sphinx_setup/_static/css/custom.css +++ b/docs/sphinx_setup/_static/css/custom.css @@ -129,7 +129,7 @@ nav.bd-links li > a:hover { text-decoration: underline } -ul#navbar-main-elements > li:hover { +ul#navbar-main-elements > li:hover { text-decoration: underline; color: #fff; } @@ -223,7 +223,7 @@ details.sd-dropdown:not([open]).sd-card { /* Ttile is at the same place for both open and close states */ .sd-card-header { border-radius: 0px !important; - + } /* Ttile is at the same place for both open and close states */ @@ -262,7 +262,7 @@ details.sd-dropdown .sd-summary-title { min-width: 125px!important; } -[aria-labelledby="version-selector"] .dropdown-item { +[aria-labelledby="version-selector"] .dropdown-item { padding: 0.25rem 0.5rem!important; } @@ -437,21 +437,21 @@ div.highlight { /* =================================================== */ @media (max-width: 720px) { - .container, + .container, .container-lg, .container-md, .container-sm, .container-xl { max-width: 1850px; } - + .transition-banner { margin-top: 2rem; } } @media (min-width: 1200px) { - .container, + .container, .container-lg, .container-md, .container-sm, @@ -921,6 +921,7 @@ div.highlight { /* Content formatting for the benchmark pages */ +/* =================================================== */ .picker-options { margin: 15px 0; } @@ -1223,7 +1224,7 @@ table#model-accuracy-and-perf-int8-fp32-table td.data { .newsletter-submit-btn:before { font-family: "Font Awesome 5 Free"; - content: "\f0e0\00a0"; + content: "\f0e0\00a0"; font-size: 1rem; } @@ -1307,3 +1308,29 @@ input:-webkit-autofill { -webkit-box-shadow: 0 0 0px 1000px white inset; } +/* Scrollbox Extension */ +/* =================================================== */ +.scrollbox { + overflow-y:scroll; + height:300px; + margin-bottom: 20px; +} + +/* overriding the 'back to top btn' style from webpack://pydata_sphinx_theme/src/pydata_sphinx_theme/assets/styles/base/_base.scss */ +/* =================================================== */ +#pst-back-to-top { + top: unset; + bottom: 3rem; + left: unset; + right: -2rem; + background-color: #0068b5; + font-size: .8rem; + border-radius: .25rem !important; +} + +/* hide the header for the side menu */ +/* =================================================== */ + +nav.bd-links p.bd-links__title { + display: none; +} \ No newline at end of file diff --git a/src/bindings/python/src/pyopenvino/core/async_infer_queue.cpp b/src/bindings/python/src/pyopenvino/core/async_infer_queue.cpp index 52ba997e6ac2c5..dbb6608b50f0b5 100644 --- a/src/bindings/python/src/pyopenvino/core/async_infer_queue.cpp +++ b/src/bindings/python/src/pyopenvino/core/async_infer_queue.cpp @@ -15,6 +15,7 @@ #include "pyopenvino/core/common.hpp" #include "pyopenvino/core/infer_request.hpp" 
+#include "pyopenvino/utils/utils.hpp" namespace py = pybind11; @@ -64,7 +65,7 @@ class AsyncInferQueue { }); size_t idle_handle = m_idle_handles.front(); // wait for request to make sure it returned from callback - m_requests[idle_handle].m_request.wait(); + m_requests[idle_handle].m_request->wait(); if (m_errors.size() > 0) throw m_errors.front(); return idle_handle; @@ -75,7 +76,7 @@ class AsyncInferQueue { // release GIL to avoid deadlock on python callback py::gil_scoped_release release; for (auto&& request : m_requests) { - request.m_request.wait(); + request.m_request->wait(); } // acquire the mutex to access m_errors std::lock_guard lock(m_mutex); @@ -87,7 +88,7 @@ class AsyncInferQueue { for (size_t handle = 0; handle < m_requests.size(); handle++) { // auto end_time = m_requests[handle].m_end_time; // TODO: pass it bellow? like in InferRequestWrapper - m_requests[handle].m_request.set_callback([this, handle /* ... */](std::exception_ptr exception_ptr) { + m_requests[handle].m_request->set_callback([this, handle /* ... */](std::exception_ptr exception_ptr) { *m_requests[handle].m_end_time = Time::now(); { // acquire the mutex to access m_idle_handles @@ -110,14 +111,17 @@ class AsyncInferQueue { } void set_custom_callbacks(py::function f_callback) { + // need to acquire GIL before py::function deletion + auto callback_sp = Common::utils::wrap_pyfunction(std::move(f_callback)); + for (size_t handle = 0; handle < m_requests.size(); handle++) { - m_requests[handle].m_request.set_callback([this, f_callback, handle](std::exception_ptr exception_ptr) { + m_requests[handle].m_request->set_callback([this, callback_sp, handle](std::exception_ptr exception_ptr) { *m_requests[handle].m_end_time = Time::now(); if (exception_ptr == nullptr) { // Acquire GIL, execute Python function py::gil_scoped_acquire acquire; try { - f_callback(m_requests[handle], m_user_ids[handle]); + (*callback_sp)(m_requests[handle], m_user_ids[handle]); } catch (const py::error_already_set& py_error) { // This should behave the same as assert(!PyErr_Occurred()) // since constructor for pybind11's error_already_set is @@ -193,13 +197,13 @@ void regclass_AsyncInferQueue(py::module m) { // Set new inputs label/id from user self.m_user_ids[handle] = userdata; // Update inputs if there are any - self.m_requests[handle].m_request.set_input_tensor(inputs); + self.m_requests[handle].m_request->set_input_tensor(inputs); // Now GIL can be released - we are NOT working with Python objects in this block { py::gil_scoped_release release; *self.m_requests[handle].m_start_time = Time::now(); // Start InferRequest in asynchronus mode - self.m_requests[handle].m_request.start_async(); + self.m_requests[handle].m_request->start_async(); } }, py::arg("inputs"), @@ -239,13 +243,13 @@ void regclass_AsyncInferQueue(py::module m) { // Set new inputs label/id from user self.m_user_ids[handle] = userdata; // Update inputs if there are any - Common::set_request_tensors(self.m_requests[handle].m_request, inputs); + Common::set_request_tensors(*self.m_requests[handle].m_request, inputs); // Now GIL can be released - we are NOT working with Python objects in this block { py::gil_scoped_release release; *self.m_requests[handle].m_start_time = Time::now(); // Start InferRequest in asynchronus mode - self.m_requests[handle].m_request.start_async(); + self.m_requests[handle].m_request->start_async(); } }, py::arg("inputs"), diff --git a/src/bindings/python/src/pyopenvino/core/common.cpp b/src/bindings/python/src/pyopenvino/core/common.cpp index 
9f57b794e2bff6..179002127960cd 100644 --- a/src/bindings/python/src/pyopenvino/core/common.cpp +++ b/src/bindings/python/src/pyopenvino/core/common.cpp @@ -433,10 +433,14 @@ ov::op::v0::Constant create_shared(py::array& array) { // If ndim is equal to 0, creates scalar Constant. // If size is equal to 0, creates empty Constant. if (array_helpers::is_contiguous(array)) { - auto memory = std::make_shared>( + auto buffer = new ov::SharedBuffer( static_cast((array.ndim() == 0 || array.size() == 0) ? array.mutable_data() : array.mutable_data(0)), array.ndim() == 0 ? array.itemsize() : array.nbytes(), array); + std::shared_ptr> memory(buffer, [](ov::SharedBuffer* buffer) { + py::gil_scoped_acquire acquire; + delete buffer; + }); return ov::op::v0::Constant(type_helpers::get_ov_type(array), array_helpers::get_shape(array), memory); } // If passed array is not C-style, throw an error. @@ -614,7 +618,7 @@ uint32_t get_optimal_number_of_requests(const ov::CompiledModel& actual) { py::dict outputs_to_dict(InferRequestWrapper& request, bool share_outputs, bool decode_strings) { py::dict res; for (const auto& out : request.m_outputs) { - auto t = request.m_request.get_tensor(out); + auto t = request.m_request->get_tensor(out); if (t.get_element_type() == ov::element::string) { if (share_outputs) { PyErr_WarnEx(PyExc_RuntimeWarning, "Result of a string type will be copied to OVDict!", 1); diff --git a/src/bindings/python/src/pyopenvino/core/infer_request.cpp b/src/bindings/python/src/pyopenvino/core/infer_request.cpp index 93a52b1dad681f..9f572d273dc5f3 100644 --- a/src/bindings/python/src/pyopenvino/core/infer_request.cpp +++ b/src/bindings/python/src/pyopenvino/core/infer_request.cpp @@ -18,7 +18,7 @@ inline py::object run_sync_infer(InferRequestWrapper& self, bool share_outputs, { py::gil_scoped_release release; *self.m_start_time = Time::now(); - self.m_request.infer(); + self.m_request->infer(); *self.m_end_time = Time::now(); } return Common::outputs_to_dict(self, share_outputs, decode_strings); @@ -38,7 +38,7 @@ void regclass_InferRequest(py::module m) { cls.def( "set_tensors", [](InferRequestWrapper& self, const py::dict& inputs) { - Common::set_request_tensors(self.m_request, inputs); + Common::set_request_tensors(*self.m_request, inputs); }, py::arg("inputs"), R"( @@ -51,7 +51,7 @@ void regclass_InferRequest(py::module m) { cls.def( "set_tensors", [](InferRequestWrapper& self, const std::string& tensor_name, const std::vector& tensors) { - self.m_request.set_tensors(tensor_name, tensors); + self.m_request->set_tensors(tensor_name, tensors); }, py::arg("tensor_name"), py::arg("tensors"), @@ -73,7 +73,7 @@ void regclass_InferRequest(py::module m) { cls.def( "set_tensors", [](InferRequestWrapper& self, const ov::Output& port, const std::vector& tensors) { - self.m_request.set_tensors(port, tensors); + self.m_request->set_tensors(port, tensors); }, py::arg("port"), py::arg("tensors"), @@ -100,7 +100,7 @@ void regclass_InferRequest(py::module m) { [](InferRequestWrapper& self, const py::dict& outputs) { auto outputs_map = Common::containers::cast_to_tensor_index_map(outputs); for (auto&& output : outputs_map) { - self.m_request.set_output_tensor(output.first, output.second); + self.m_request->set_output_tensor(output.first, output.second); } }, py::arg("outputs"), @@ -117,7 +117,7 @@ void regclass_InferRequest(py::module m) { [](InferRequestWrapper& self, const py::dict& inputs) { auto inputs_map = Common::containers::cast_to_tensor_index_map(inputs); for (auto&& input : inputs_map) { - 
self.m_request.set_input_tensor(input.first, input.second); + self.m_request->set_input_tensor(input.first, input.second); } }, py::arg("inputs"), @@ -131,7 +131,7 @@ void regclass_InferRequest(py::module m) { cls.def( "set_input_tensors", [](InferRequestWrapper& self, const std::vector& tensors) { - self.m_request.set_input_tensors(tensors); + self.m_request->set_input_tensors(tensors); }, py::arg("tensors"), R"( @@ -148,7 +148,7 @@ void regclass_InferRequest(py::module m) { cls.def( "set_input_tensors", [](InferRequestWrapper& self, size_t idx, const std::vector& tensors) { - self.m_request.set_input_tensors(idx, tensors); + self.m_request->set_input_tensors(idx, tensors); }, py::arg("idx"), py::arg("tensors"), @@ -168,7 +168,7 @@ void regclass_InferRequest(py::module m) { cls.def( "infer", [](InferRequestWrapper& self, const ov::Tensor& inputs, bool share_outputs, bool decode_strings) { - self.m_request.set_input_tensor(inputs); + self.m_request->set_input_tensor(inputs); return run_sync_infer(self, share_outputs, decode_strings); }, py::arg("inputs"), @@ -197,7 +197,7 @@ void regclass_InferRequest(py::module m) { "infer", [](InferRequestWrapper& self, const py::dict& inputs, bool share_outputs, bool decode_strings) { // Update inputs if there are any - Common::set_request_tensors(self.m_request, inputs); + Common::set_request_tensors(*self.m_request, inputs); // Call Infer function return run_sync_infer(self, share_outputs, decode_strings); }, @@ -222,7 +222,7 @@ void regclass_InferRequest(py::module m) { "start_async", [](InferRequestWrapper& self, const ov::Tensor& inputs, py::object& userdata) { // Update inputs if there are any - self.m_request.set_input_tensor(inputs); + self.m_request->set_input_tensor(inputs); if (!userdata.is(py::none())) { if (self.m_user_callback_defined) { self.m_userdata = userdata; @@ -232,7 +232,7 @@ void regclass_InferRequest(py::module m) { } py::gil_scoped_release release; *self.m_start_time = Time::now(); - self.m_request.start_async(); + self.m_request->start_async(); }, py::arg("inputs"), py::arg("userdata"), @@ -261,7 +261,7 @@ void regclass_InferRequest(py::module m) { "start_async", [](InferRequestWrapper& self, const py::dict& inputs, py::object& userdata) { // Update inputs if there are any - Common::set_request_tensors(self.m_request, inputs); + Common::set_request_tensors(*self.m_request, inputs); if (!userdata.is(py::none())) { if (self.m_user_callback_defined) { self.m_userdata = userdata; @@ -271,7 +271,7 @@ void regclass_InferRequest(py::module m) { } py::gil_scoped_release release; *self.m_start_time = Time::now(); - self.m_request.start_async(); + self.m_request->start_async(); }, py::arg("inputs"), py::arg("userdata"), @@ -293,7 +293,7 @@ void regclass_InferRequest(py::module m) { cls.def( "cancel", [](InferRequestWrapper& self) { - self.m_request.cancel(); + self.m_request->cancel(); }, R"( Cancels inference request. @@ -303,7 +303,7 @@ void regclass_InferRequest(py::module m) { "wait", [](InferRequestWrapper& self) { py::gil_scoped_release release; - self.m_request.wait(); + self.m_request->wait(); }, R"( Waits for the result to become available. 
@@ -316,7 +316,7 @@ void regclass_InferRequest(py::module m) { "wait_for", [](InferRequestWrapper& self, const int timeout) { py::gil_scoped_release release; - return self.m_request.wait_for(std::chrono::milliseconds(timeout)); + return self.m_request->wait_for(std::chrono::milliseconds(timeout)); }, py::arg("timeout"), R"( @@ -337,7 +337,11 @@ void regclass_InferRequest(py::module m) { [](InferRequestWrapper& self, py::function callback, py::object& userdata) { self.m_userdata = userdata; self.m_user_callback_defined = true; - self.m_request.set_callback([&self, callback](std::exception_ptr exception_ptr) { + + // need to acquire GIL before py::function deletion + auto callback_sp = Common::utils::wrap_pyfunction(std::move(callback)); + + self.m_request->set_callback([&self, callback_sp](std::exception_ptr exception_ptr) { *self.m_end_time = Time::now(); try { if (exception_ptr) { @@ -348,7 +352,7 @@ void regclass_InferRequest(py::module m) { } // Acquire GIL, execute Python function py::gil_scoped_acquire acquire; - callback(self.m_userdata); + (*callback_sp)(self.m_userdata); }); }, py::arg("callback"), @@ -365,7 +369,7 @@ void regclass_InferRequest(py::module m) { cls.def( "get_tensor", [](InferRequestWrapper& self, const std::string& name) { - return self.m_request.get_tensor(name); + return self.m_request->get_tensor(name); }, py::arg("name"), R"( @@ -380,7 +384,7 @@ void regclass_InferRequest(py::module m) { cls.def( "get_tensor", [](InferRequestWrapper& self, const ov::Output& port) { - return self.m_request.get_tensor(port); + return self.m_request->get_tensor(port); }, py::arg("port"), R"( @@ -395,7 +399,7 @@ void regclass_InferRequest(py::module m) { cls.def( "get_tensor", [](InferRequestWrapper& self, const ov::Output& port) { - return self.m_request.get_tensor(port); + return self.m_request->get_tensor(port); }, py::arg("port"), R"( @@ -410,7 +414,7 @@ void regclass_InferRequest(py::module m) { cls.def( "get_input_tensor", [](InferRequestWrapper& self, size_t idx) { - return self.m_request.get_input_tensor(idx); + return self.m_request->get_input_tensor(idx); }, py::arg("index"), R"( @@ -427,7 +431,7 @@ void regclass_InferRequest(py::module m) { cls.def( "get_input_tensor", [](InferRequestWrapper& self) { - return self.m_request.get_input_tensor(); + return self.m_request->get_input_tensor(); }, R"( Gets input tensor of InferRequest. @@ -440,7 +444,7 @@ void regclass_InferRequest(py::module m) { cls.def( "get_output_tensor", [](InferRequestWrapper& self, size_t idx) { - return self.m_request.get_output_tensor(idx); + return self.m_request->get_output_tensor(idx); }, py::arg("index"), R"( @@ -456,7 +460,7 @@ void regclass_InferRequest(py::module m) { cls.def( "get_output_tensor", [](InferRequestWrapper& self) { - return self.m_request.get_output_tensor(); + return self.m_request->get_output_tensor(); }, R"( Gets output tensor of InferRequest. 
@@ -469,7 +473,7 @@ void regclass_InferRequest(py::module m) { cls.def( "set_tensor", [](InferRequestWrapper& self, const std::string& name, const ov::Tensor& tensor) { - self.m_request.set_tensor(name, tensor); + self.m_request->set_tensor(name, tensor); }, py::arg("name"), py::arg("tensor"), @@ -486,7 +490,7 @@ void regclass_InferRequest(py::module m) { cls.def( "set_tensor", [](InferRequestWrapper& self, const ov::Output& port, const ov::Tensor& tensor) { - self.m_request.set_tensor(port, tensor); + self.m_request->set_tensor(port, tensor); }, py::arg("port"), py::arg("tensor"), @@ -503,7 +507,7 @@ void regclass_InferRequest(py::module m) { cls.def( "set_tensor", [](InferRequestWrapper& self, const ov::Output& port, const ov::Tensor& tensor) { - self.m_request.set_tensor(port, tensor); + self.m_request->set_tensor(port, tensor); }, py::arg("port"), py::arg("tensor"), @@ -520,7 +524,7 @@ void regclass_InferRequest(py::module m) { cls.def( "set_input_tensor", [](InferRequestWrapper& self, size_t idx, const ov::Tensor& tensor) { - self.m_request.set_input_tensor(idx, tensor); + self.m_request->set_input_tensor(idx, tensor); }, py::arg("index"), py::arg("tensor"), @@ -538,7 +542,7 @@ void regclass_InferRequest(py::module m) { cls.def( "set_input_tensor", [](InferRequestWrapper& self, const ov::Tensor& tensor) { - self.m_request.set_input_tensor(tensor); + self.m_request->set_input_tensor(tensor); }, py::arg("tensor"), R"( @@ -553,7 +557,7 @@ void regclass_InferRequest(py::module m) { cls.def( "set_output_tensor", [](InferRequestWrapper& self, size_t idx, const ov::Tensor& tensor) { - self.m_request.set_output_tensor(idx, tensor); + self.m_request->set_output_tensor(idx, tensor); }, py::arg("index"), py::arg("tensor"), @@ -570,7 +574,7 @@ void regclass_InferRequest(py::module m) { cls.def( "set_output_tensor", [](InferRequestWrapper& self, const ov::Tensor& tensor) { - self.m_request.set_output_tensor(tensor); + self.m_request->set_output_tensor(tensor); }, py::arg("tensor"), R"( @@ -585,7 +589,7 @@ void regclass_InferRequest(py::module m) { cls.def( "get_profiling_info", [](InferRequestWrapper& self) { - return self.m_request.get_profiling_info(); + return self.m_request->get_profiling_info(); }, py::call_guard(), R"( @@ -602,7 +606,7 @@ void regclass_InferRequest(py::module m) { cls.def( "query_state", [](InferRequestWrapper& self) { - return self.m_request.query_state(); + return self.m_request->query_state(); }, py::call_guard(), R"( @@ -617,7 +621,7 @@ void regclass_InferRequest(py::module m) { cls.def( "reset_state", [](InferRequestWrapper& self) { - return self.m_request.reset_state(); + return self.m_request->reset_state(); }, R"( Resets all internal variable states for relevant infer request to @@ -627,7 +631,7 @@ void regclass_InferRequest(py::module m) { cls.def( "get_compiled_model", [](InferRequestWrapper& self) { - return self.m_request.get_compiled_model(); + return self.m_request->get_compiled_model(); }, R"( Returns the compiled model. 
@@ -700,7 +704,7 @@ void regclass_InferRequest(py::module m) { cls.def_property_readonly( "profiling_info", [](InferRequestWrapper& self) { - return self.m_request.get_profiling_info(); + return self.m_request->get_profiling_info(); }, py::call_guard(), R"( diff --git a/src/bindings/python/src/pyopenvino/core/infer_request.hpp b/src/bindings/python/src/pyopenvino/core/infer_request.hpp index 69f0412a1745c9..719d0374af6ff3 100644 --- a/src/bindings/python/src/pyopenvino/core/infer_request.hpp +++ b/src/bindings/python/src/pyopenvino/core/infer_request.hpp @@ -32,7 +32,7 @@ class InferRequestWrapper { const std::vector>& outputs, bool set_default_callback = true, py::object userdata = py::none()) - : m_request{std::move(request)}, + : m_request{InferRequestWrapper::wrap_infer_request_to_sp(std::move(request))}, m_inputs{inputs}, m_outputs{outputs}, m_userdata{userdata} { @@ -44,7 +44,7 @@ class InferRequestWrapper { // Bump reference counter auto end_time = m_end_time; // Set standard callback which saves "end-time" for inference call - m_request.set_callback([end_time](std::exception_ptr exception_ptr) { + m_request->set_callback([end_time](std::exception_ptr exception_ptr) { *end_time = Time::now(); try { if (exception_ptr) { @@ -73,7 +73,7 @@ class InferRequestWrapper { } // Original ov::InferRequest class that is held by this wrapper - ov::InferRequest m_request; + std::shared_ptr m_request; // Inputs and Outputs inherrited from ov::CompiledModel std::vector> m_inputs; std::vector> m_outputs; @@ -91,11 +91,18 @@ class InferRequestWrapper { tensors.reserve(v.size()); for (auto&& node : v) { - tensors.push_back(m_request.get_tensor(node)); + tensors.push_back(m_request->get_tensor(node)); } return tensors; } + + static std::shared_ptr wrap_infer_request_to_sp(ov::InferRequest request) { + return std::shared_ptr(new ov::InferRequest(std::move(request)), [](ov::InferRequest* request) { + py::gil_scoped_release release; + delete request; + }); + } }; void regclass_InferRequest(py::module m); diff --git a/src/bindings/python/src/pyopenvino/frontend/extension.cpp b/src/bindings/python/src/pyopenvino/frontend/extension.cpp index a4f2e9cae1ca0c..4446ea2c9acc33 100644 --- a/src/bindings/python/src/pyopenvino/frontend/extension.cpp +++ b/src/bindings/python/src/pyopenvino/frontend/extension.cpp @@ -30,19 +30,26 @@ void regclass_frontend_TelemetryExtension(py::module m) { py::function& send_event, py::function& send_error, py::function& send_stack_trace) { + auto send_event_sp = Common::utils::wrap_pyfunction(send_event); + auto send_error_sp = Common::utils::wrap_pyfunction(send_error); + auto send_stack_trace_sp = Common::utils::wrap_pyfunction(send_stack_trace); + return std::make_shared( event_category, - [send_event](const std::string& category, const std::string& action, const std::string& label, int value) { + [send_event_sp](const std::string& category, + const std::string& action, + const std::string& label, + int value) { py::gil_scoped_acquire acquire; - send_event(category, action, label, value); + (*send_event_sp)(category, action, label, value); }, - [send_error](const std::string& category, const std::string& error_message) { + [send_error_sp](const std::string& category, const std::string& error_message) { py::gil_scoped_acquire acquire; - send_error(category, error_message); + (*send_error_sp)(category, error_message); }, - [send_stack_trace](const std::string& category, const std::string& error_message) { + [send_stack_trace_sp](const std::string& category, const std::string& 
error_message) { py::gil_scoped_acquire acquire; - send_stack_trace(category, error_message); + (*send_stack_trace_sp)(category, error_message); }); })); diff --git a/src/bindings/python/src/pyopenvino/utils/utils.cpp b/src/bindings/python/src/pyopenvino/utils/utils.cpp index 27f015b14272c2..feeac2d7a02a73 100644 --- a/src/bindings/python/src/pyopenvino/utils/utils.cpp +++ b/src/bindings/python/src/pyopenvino/utils/utils.cpp @@ -419,5 +419,12 @@ ov::Any py_object_to_any(const py::object& py_obj) { } OPENVINO_ASSERT(false, "Unsupported attribute type."); } +std::shared_ptr<py::function> wrap_pyfunction(py::function f_callback) { + auto callback_sp = std::shared_ptr<py::function>(new py::function(std::move(f_callback)), [](py::function* c) { + py::gil_scoped_acquire acquire; + delete c; + }); + return callback_sp; +} }; // namespace utils }; // namespace Common diff --git a/src/bindings/python/src/pyopenvino/utils/utils.hpp b/src/bindings/python/src/pyopenvino/utils/utils.hpp index 1e0e7f23069d2e..e4048b3f52feb3 100644 --- a/src/bindings/python/src/pyopenvino/utils/utils.hpp +++ b/src/bindings/python/src/pyopenvino/utils/utils.hpp @@ -58,5 +58,7 @@ namespace utils { ov::pass::Serialize::Version convert_to_version(const std::string& version); + std::shared_ptr<py::function> wrap_pyfunction(py::function f_callback); + }; // namespace utils }; // namespace Common diff --git a/src/bindings/python/tests/test_graph/test_op.py b/src/bindings/python/tests/test_graph/test_op.py index 2bd609ef5278f1..5a8abdc55ea86c 100644 --- a/src/bindings/python/tests/test_graph/test_op.py +++ b/src/bindings/python/tests/test_graph/test_op.py @@ -107,9 +107,7 @@ def test_custom_add_model(): def test_custom_op(): model = create_snake_model() - # todo: CVS-141744 - # it hangs with AUTO plugin, but works well with CPU - compiled_model = compile_model(model, "CPU") + compiled_model = compile_model(model) assert isinstance(compiled_model, CompiledModel) request = compiled_model.create_infer_request() diff --git a/src/common/snippets/src/pass/mha_tokenization.cpp b/src/common/snippets/src/pass/mha_tokenization.cpp index 08deb95b12ec22..d928cdd1d33eba 100644 --- a/src/common/snippets/src/pass/mha_tokenization.cpp +++ b/src/common/snippets/src/pass/mha_tokenization.cpp @@ -16,7 +16,7 @@ namespace { bool is_supported_tensor(const ov::descriptor::Tensor& t) { - return t.get_partial_shape().is_static() && ov::snippets::utils::one_of(t.get_shape().size(), 3lu, 4lu); + return t.get_partial_shape().rank().is_static() && ov::snippets::utils::one_of(t.get_partial_shape().size(), 3lu, 4lu); } bool is_supported_intermediate_op(const std::shared_ptr<ov::Node>& node) { @@ -68,6 +68,10 @@ void tokenize_broadcast(const std::shared_ptr<ov::Node>& interm_op, ov::NodeVect // TODO: Can we reuse AppropriateForSubgraph here? 
Seems like it's huge check for Broadcast if (broadcast && broadcast->get_broadcast_spec().m_type == ov::op::AutoBroadcastType::NUMPY && broadcast->get_output_target_inputs(0).size() == 1) { + // TODO: Add support of Broadcast with ShapeOf subgraph on second input + if (!ov::is_type(broadcast->input_value(1).get_node_shared_ptr())) + continue; + broadcast_nodes.push_back(broadcast); const auto pshape = broadcast->get_input_partial_shape(0); @@ -96,10 +100,17 @@ void tokenize_broadcast(const std::shared_ptr& interm_op, ov::NodeVect bool tokenize_reshape_around_softmax(std::shared_ptr& interm_op, std::shared_ptr& reshape, ov::NodeVector& ordered_ops) { reshape = ov::as_type_ptr(interm_op); if (reshape) { - const auto in_shape = reshape->get_input_shape(0); - const auto out_shape = reshape->get_output_shape(0); - if (in_shape.back() != out_shape.back() || reshape->get_output_target_inputs(0).size() != 1) + // TODO: Add support of Reshape with ShapeOf subgraph on second input + if (!ov::is_type(reshape->input_value(1).get_node_shared_ptr())) + return false; + + const auto in_shape = reshape->get_input_partial_shape(0); + const auto out_shape = reshape->get_output_partial_shape(0); + const auto in_last_dim = *in_shape.crbegin(); + const auto out_last_dim = *out_shape.crbegin(); + if (in_last_dim.is_dynamic() || out_last_dim.is_dynamic() || in_last_dim != out_last_dim || reshape->get_output_target_inputs(0).size() != 1) return false; + ordered_ops.push_back(reshape); interm_op = reshape->get_output_target_inputs(0).begin()->get_node()->shared_from_this(); } @@ -204,8 +215,7 @@ bool ov::snippets::pass::TokenizeMHASnippets::is_matmul0_supported(const std::sh ov::snippets::pass::TokenizeMHASnippets::TokenizeMHASnippets(const SnippetsTokenization::Config& config) { MATCHER_SCOPE(TokenizeMHASnippets); - auto m_matmul0 = std::make_shared(ov::pass::pattern::any_input(ov::pass::pattern::has_static_shape()), - ov::pass::pattern::any_input(ov::pass::pattern::has_static_shape())); + auto m_matmul0 = std::make_shared(ov::pass::pattern::any_input(), ov::pass::pattern::any_input()); register_matcher(std::make_shared(m_matmul0, matcher_name), [OV_CAPTURE_CPY_AND_THIS](ov::pass::pattern::Matcher &m) { @@ -224,20 +234,14 @@ ov::snippets::pass::TokenizeMHASnippets::TokenizeMHASnippets(const SnippetsToken // Example: // Buffer - i32 [32, 128] -> ~ Loop ~ -> Buffer - i8 [32, 128] // After each Loop iteration we should increment pointers of Buffers: accordingly on 4 byte and 1 byte for scalar case. - // It means that these Buffers cannot be inplace => Each Buffer should have the own register + // It means that these increments are not proportional => Each Buffer should have the own register // For that we can just check the following "branches": // - Between MatMul0 and MatMul1 - Softmax is sync point. The operations between MatMul0 -> Softmax and Softmax -> MatMul1 // will be fused into one loop after conversion to snippet dialect (Because it's just FQ, Eltwise nodes) - // - Between MatMul0 and Transpose1 - At the moment operations after Transpose1 cannot be fused in Transpose Loop (to avoid performance regressions). + // - Between MatMul0 and Transpose1 - At the moment operations after Transpose1 cannot be fused in inner Transpose Loop + // (to avoid performance regressions due to scalar calculations). 
// But operations after Transpose1 and before MatMul0 will be fused into one loop as well (look at first point) - // Note: If the pass is updated, need to check the new possible branches for potential non-inplace Buffers! - // Default value is 2 because - // - Firstly, Softmax always needs Buffers - // - Secondly, Softmax needs 2 Buffers but they can be inplace - One virtual port is enough for Softmax => buffer_count = 1 - // - Thirdly, MatMul requires unique Buffers on inputs and outputs because blocking implementation increments input/output pointers during computations - // However, all of the Buffers are usually reused by the next MatMul and Softmax. - // So on sufficiently large subgraphs we use only one additional unique buffer => buffer_count increments by 1 - size_t buffer_count = 2; + size_t uniqie_buffer_reg_group_count = 1; // After MatMul0 there is always one Buffer std::string fused_names; ov::NodeVector ordered_ops; @@ -260,24 +264,20 @@ ov::snippets::pass::TokenizeMHASnippets::TokenizeMHASnippets(const SnippetsToken if (!is_matmul0_supported(matmul0)) return false; - const auto matmul0_prc = op::Brgemm::get_output_type(matmul0->get_input_element_type(0), matmul0->get_input_element_type(1)); - // Between MatMul0 and Softmax will be the one Loop because of LoopFusing optimization. - // The Loop will have one Buffer with the same shape both on input and output. - // Need to check for precision to get if we need one more register for Buffer - if (matmul0_prc.size() != ov::element::f32.size()) { - if (buffer_count < 2) - buffer_count++; - } - ordered_ops.push_back(matmul0); const auto pattern_rank = matmul0->get_output_partial_shape(0).size(); + const auto ops_count_before_softmax = ordered_ops.size(); auto interm_op = matmul0->get_output_target_inputs(0).begin()->get_node()->shared_from_this(); // Add supported operations which are between MatMul0 and Softmax to ordered_ops if (!update_intermediate_supported_ops(interm_op, ordered_ops, hidden_virtual_ports_count, potential_body_params_count)) return false; + // If there are Eltwise ops before Softmax, there will be one more Buffer + if (ops_count_before_softmax != ordered_ops.size() && interm_op->get_output_partial_shape(0).rbegin()->is_dynamic()) + uniqie_buffer_reg_group_count++; + std::shared_ptr reshape0 = nullptr; if (!tokenize_reshape_around_softmax(interm_op, reshape0, ordered_ops)) return false; @@ -294,6 +294,11 @@ ov::snippets::pass::TokenizeMHASnippets::TokenizeMHASnippets(const SnippetsToken if (axis != rank.get_length() - 1 || interm_op->get_output_target_inputs(0).size() != 1) return false; + + // Softmax needs at least one buffer + if (interm_op->get_output_partial_shape(0).rbegin()->is_dynamic()) + uniqie_buffer_reg_group_count++; + ordered_ops.push_back(interm_op); interm_op = interm_op->get_output_target_inputs(0).begin()->get_node()->shared_from_this(); @@ -302,7 +307,7 @@ ov::snippets::pass::TokenizeMHASnippets::TokenizeMHASnippets(const SnippetsToken return false; if (((reshape0 == nullptr) != (reshape1 == nullptr)) || - (reshape0 && reshape1 && (reshape0->get_input_shape(0) != reshape1->get_output_shape(0)))) + (reshape0 && reshape1 && (reshape0->get_input_partial_shape(0) != reshape1->get_output_partial_shape(0)))) return false; // Add supported operations which are between Softmax and MatMul1 to ordered_ops @@ -310,8 +315,7 @@ ov::snippets::pass::TokenizeMHASnippets::TokenizeMHASnippets(const SnippetsToken return false; const auto matmul1 = ov::as_type_ptr(interm_op); - if (!matmul1 ||
matmul1->get_output_target_inputs(0).size() != 1 || - matmul1->get_transpose_a() || matmul1->get_transpose_b()) + if (!matmul1 || matmul1->get_transpose_a() || matmul1->get_transpose_b()) return false; const auto matmul1_out_type = op::Brgemm::get_output_type(matmul1->get_input_element_type(0), @@ -328,8 +332,9 @@ ov::snippets::pass::TokenizeMHASnippets::TokenizeMHASnippets(const SnippetsToken // Between Softmax and MatMul1 will be the one Loop because of LoopFusing optimization. // The Loop will have one Buffer with the same shape both on input and output. // Need to check for precision to get if we need one more register for Buffer - if (matmul1->get_input_element_type(0).size() != ov::element::f32.size()) { - buffer_count++; + const auto matmul0_prc = op::Brgemm::get_output_type(matmul0->get_input_element_type(0), matmul0->get_input_element_type(1)); + if (matmul1->get_input_element_type(0).size() != matmul0_prc.size() || matmul1->get_input_partial_shape(0).is_dynamic()) { + uniqie_buffer_reg_group_count++; } /***********************/ @@ -358,6 +363,7 @@ ov::snippets::pass::TokenizeMHASnippets::TokenizeMHASnippets(const SnippetsToken // There is transformation ExplicitTransposeMatMulInputs that set supported order and transposed_b(false). // We can allow to call this pass only if ops have scalar shapes to avoid shape mismatching const auto is_transposed_b_0 = matmul0->get_transpose_b(); + bool has_matmul0_has_ops_on_input = false; while (is_supported_intermediate_op(parent)) { // All supported ops have only one output port if (parent->get_output_target_inputs(0).size() != 1) @@ -379,6 +385,11 @@ ov::snippets::pass::TokenizeMHASnippets::TokenizeMHASnippets(const SnippetsToken ordered_ops.insert(ordered_ops.begin(), parent); // [107731] To go always through 0-th port - is it safe? 
parent = parent->get_input_node_shared_ptr(0); + has_matmul0_has_ops_on_input = true; + } + // If there are ops on the second input of MatMul0 -> there will always be a unique Buffer + if (has_matmul0_has_ops_on_input) { + uniqie_buffer_reg_group_count++; } auto tokenize_transpose = [&](const std::shared_ptr& transpose, @@ -412,7 +423,9 @@ ov::snippets::pass::TokenizeMHASnippets::TokenizeMHASnippets(const SnippetsToken bool are_ops_after_matmul1 = false; auto child = matmul1->get_output_target_inputs(0).begin()->get_node()->shared_from_this(); - while (is_supported_intermediate_op(child)) { + const auto can_be_ops_after_matmul1_tokenized = matmul1->get_output_target_inputs(0).size() == 1; + bool has_matmul1_has_ops_on_output = false; + while (can_be_ops_after_matmul1_tokenized && is_supported_intermediate_op(child)) { are_ops_after_matmul1 = true; // All supported ops have only one output port if (child->get_output_target_inputs(0).size() != 1) @@ -427,19 +440,23 @@ ov::snippets::pass::TokenizeMHASnippets::TokenizeMHASnippets(const SnippetsToken // TODO [75567]: move this plugin-specific constraint to the plugin callback // We cannot collapse op to Subgraph if count of potential Parameter and Result count is higher 12 - if (potential_body_params_count + child->get_output_target_inputs(0).size() + hidden_virtual_ports_count + buffer_count > 12) { + if (potential_body_params_count + child->get_output_target_inputs(0).size() + hidden_virtual_ports_count + uniqie_buffer_reg_group_count > 12) { break; } ordered_ops.push_back(child); child = child->get_output_target_inputs(0).begin()->get_node()->shared_from_this(); + has_matmul1_has_ops_on_output = true; + } + if (has_matmul1_has_ops_on_output) { + uniqie_buffer_reg_group_count++; } // At the moment Snippets don't support nodes between MatMul1 and Transpose3 due to Loop and strided calculations limitations // MatMul1 // // Transpose3 - if (!are_ops_after_matmul1) { + if (can_be_ops_after_matmul1_tokenized && !are_ops_after_matmul1) { auto transpose3 = config.get_mha_token_enable_transpose_on_output() ?
ov::as_type_ptr(child) : nullptr; if (is_valid_transpose(transpose3, config.get_mha_supported_transpose_ranks(), get_fusion_transpose_order(pattern_rank)) && transpose3->get_input_element_type(0) == matmul1_out_type) { // To avoid Convert between MatMul1 and Transpose3 @@ -455,7 +472,7 @@ ov::snippets::pass::TokenizeMHASnippets::TokenizeMHASnippets(const SnippetsToken // TODO [75567]: move this plugin-specific constraint to the plugin callback const auto last_node = ordered_ops.back(); - if (potential_body_params_count + last_node->get_output_size() + hidden_virtual_ports_count + buffer_count > 11) { + if (potential_body_params_count + last_node->get_output_size() + hidden_virtual_ports_count + uniqie_buffer_reg_group_count > 11) { return false; } diff --git a/src/common/snippets/tests/src/pass/mha_tokenization.cpp b/src/common/snippets/tests/src/pass/mha_tokenization.cpp index 6438dff516cded..b411aace066203 100644 --- a/src/common/snippets/tests/src/pass/mha_tokenization.cpp +++ b/src/common/snippets/tests/src/pass/mha_tokenization.cpp @@ -39,6 +39,30 @@ TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA_4D) { run(); } +TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA_4D_Dynamic) { + const auto &f = MHAFunction(std::vector{{-1, -1, -1, -1}, {-1, -1, -1, -1}, {-1, -1, -1, -1}, {-1, -1, -1, -1}}, + std::vector({ov::element::f32, ov::element::f32, ov::element::f32, ov::element::f32}), true, false); + model = f.getOriginal(); + model_ref = f.getReference(); + run(); +} + +TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA_4D_Dynamic_M) { + const auto &f = MHAFunction(std::vector{{1, -1, 12, 64}, {1, 128, 12, 64}, {1, 12, -1, 128}, {1, 128, 12, 64}}, + std::vector({ov::element::f32, ov::element::f32, ov::element::f32, ov::element::f32}), true, false); + model = f.getOriginal(); + model_ref = f.getReference(); + run(); +} + +TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA_4D_Dynamic_K) { + const auto &f = MHAFunction(std::vector{{1, 128, 12, -1}, {1, 128, 12, -1}, {1, 12, 128, 128}, {1, 128, 12, 64}}, + std::vector({ov::element::f32, ov::element::f32, ov::element::f32, ov::element::f32}), true, false); + model = f.getOriginal(); + model_ref = f.getReference(); + run(); +} + TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA_3D) { const auto &f = MHAFunction(std::vector{{128, 12, 64}, {128, 12, 64}, {12, 128, 128}, {128, 12, 64}}, std::vector({ov::element::f32, ov::element::f32, ov::element::f32, ov::element::f32})); @@ -47,8 +71,15 @@ TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA_3D) { run(); } -TEST_F(SKIP_TokenizeMHASnippetsTests /* CVS-114607 */, smoke_Snippets_MHA_with_MatMul0_Transpose) { - GTEST_SKIP(); +TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA_3D_Dynamic) { + const auto &f = MHAFunction(std::vector{{-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}}, + std::vector({ov::element::f32, ov::element::f32, ov::element::f32, ov::element::f32}), true, false); + model = f.getOriginal(); + model_ref = f.getReference(); + run(); +} + +TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA_with_MatMul0_Transpose) { const auto &f = MHAMatMul0TransposeFunction(std::vector{{1, 128, 12, 64}, {1, 128, 12, 64}, {1, 12, 128, 128}, {1, 128, 12, 64}}, std::vector({ov::element::f32, ov::element::f32, ov::element::f32, ov::element::f32})); model = f.getOriginal(); @@ -56,6 +87,16 @@ TEST_F(SKIP_TokenizeMHASnippetsTests /* CVS-114607 */, smoke_Snippets_MHA_with_M run(); } +TEST_F(SKIP_TokenizeMHASnippetsTests /* CVS-142098 */, smoke_Snippets_MHA_with_MatMul0_Transpose_Dynamic) { + 
GTEST_SKIP(); + const auto &f = MHAMatMul0TransposeFunction(std::vector{{-1, -1, -1, -1}, {-1, -1, -1, -1}, {-1, -1, -1, -1}, {-1, -1, -1, -1}}, + std::vector({ov::element::f32, ov::element::f32, ov::element::f32, ov::element::f32}), + false); + model = f.getOriginal(); + model_ref = f.getReference(); + run(); +} + TEST_F(SKIP_TokenizeMHASnippetsTests /* CVS-114607 */, smoke_Snippets_MHA_with_int_Matmuls) { GTEST_SKIP(); const auto &f = MHAINT8MatMulTypeRelaxedFunction(std::vector{{1, 128, 12, 64}, {1, 128, 12, 64}, {1, 12, 128, 128}, {1, 128, 12, 64}}); @@ -71,6 +112,14 @@ TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA_Transpose_extraction) { run(); } +TEST_F(SKIP_TokenizeMHASnippetsTests /* CVS-142098 */, smoke_Snippets_MHA_Dynamic_Transpose_extraction) { + GTEST_SKIP(); + const auto& f = MHATransposedInputFunction(std::vector{{-1, -1, -1, -1}, {-1, -1, -1, -1}, {-1, -1, -1, -1}}, true); + model = f.getOriginal(); + model_ref = f.getReference(); + run(); +} + TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA_Transpose_extraction_and_unsupported_existing_transpose) { const auto& f = MHATransposedInputFunction(std::vector{{1, 128, 12, 64}, {1, 12, 64, 128}, {1, 128, 12, 64}}, true, std::vector{0, 3, 1, 2}); @@ -79,6 +128,15 @@ TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA_Transpose_extraction_and_uns run(); } +TEST_F(SKIP_TokenizeMHASnippetsTests /* CVS-142098 */, smoke_Snippets_MHA_Dynamic_Transpose_extraction_and_unsupported_existing_transpose) { + GTEST_SKIP(); + const auto& f = MHATransposedInputFunction(std::vector{{-1, -1, -1, -1}, {-1, -1, -1, -1}, {-1, -1, -1, -1}}, true, + std::vector{0, 3, 1, 2}); + model = f.getOriginal(); + model_ref = f.getReference(); + run(); +} + TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA_Transpose_fusion) { const auto& f = MHATransposedInputFunction(std::vector{{1, 128, 12, 64}, {1, 64, 128, 12}, {1, 128, 12, 64}}, false, std::vector{0, 2, 1, 3}); @@ -87,6 +145,14 @@ TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA_Transpose_fusion) { run(); } +TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA_Dyanmic_Transpose_fusion) { + const auto& f = MHATransposedInputFunction(std::vector{{-1, -1, -1, -1}, {-1, -1, -1, -1}, {-1, -1, -1, -1}}, false, + std::vector{0, 2, 1, 3}); + model = f.getOriginal(); + model_ref = f.getReference(); + run(); +} + TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA3D_SplitM) { const auto& f = MHASplitMFunction(std::vector{{128, 12, 64}, {128, 12, 64}, {12, 128, 128}, {128, 12, 64}}, std::vector({ov::element::f32, ov::element::f32, ov::element::f32, ov::element::f32}), diff --git a/src/common/transformations/src/transformations/common_optimizations/fuse_rotary_positional_embeddings.cpp b/src/common/transformations/src/transformations/common_optimizations/fuse_rotary_positional_embeddings.cpp index 2f3ddd5d843ae3..86507326c25a44 100644 --- a/src/common/transformations/src/transformations/common_optimizations/fuse_rotary_positional_embeddings.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/fuse_rotary_positional_embeddings.cpp @@ -555,7 +555,7 @@ ov::pass::RoPEFusionQwen::RoPEFusionQwen(int split_output_id) { {{"special_zero", true}}); auto slice_Slice_543 = GenSlice(view_Reshape_424, 0, head_size, 1, 3); // tensor_array - auto hidden_states = makePattern("f32[?,?,?]"); // + auto hidden_states = makePattern(); // auto ShapeOf_485735 = makePattern({hidden_states}, {}); auto Multiply_567524 = makePattern({ShapeOf_485735, {-1}}, {{"auto_broadcast", "numpy"}}); auto Gather_377635 = 
makePattern({Multiply_567524, {1}, 0}, {{"batch_dims", 0}}); diff --git a/src/common/transformations/src/transformations/symbolic_transformations/symbol_optimization.cpp b/src/common/transformations/src/transformations/symbolic_transformations/symbol_optimization.cpp index 1a4507c08dc9f0..3bf315bebf4467 100644 --- a/src/common/transformations/src/transformations/symbolic_transformations/symbol_optimization.cpp +++ b/src/common/transformations/src/transformations/symbolic_transformations/symbol_optimization.cpp @@ -16,6 +16,7 @@ #include "openvino/op/shape_of.hpp" #include "openvino/op/squeeze.hpp" #include "openvino/op/util/multi_subgraph_base.hpp" +#include "openvino/op/util/op_types.hpp" #include "transformations/utils/utils.hpp" namespace { @@ -222,7 +223,101 @@ void optimize_value_usage(ov::Output& output, STS_map& symbol_shape_so } } -void save_shape_sources(const ov::Output& output, STS_map& symbol_shape_source) { +std::vector> topological_order(const std::shared_ptr& m) { + auto order = m->get_ordered_ops(); + + // step 1: split model into parameter-related and parameter-unrelated ops + const std::string op_depends_on_parameter = "topological_sort_op_depends_on"; + // values: true - parameter dependent; false otherwise + for (const auto& op : order) { + if (ov::as_type_ptr(op)) { + op->get_rt_info()[op_depends_on_parameter] = true; + } else if (ov::as_type_ptr(op) || ov::as_type_ptr(op) || + ov::as_type_ptr(op) || + std::dynamic_pointer_cast(op)) { + op->get_rt_info()[op_depends_on_parameter] = false; + } else { // deduce op type from inputs + const auto& inputs = op->input_values(); + op->get_rt_info()[op_depends_on_parameter] = + std::any_of(inputs.begin(), + inputs.end(), + [&op_depends_on_parameter](const ov::Output& input) { + return input.get_node_shared_ptr()->get_rt_info()[op_depends_on_parameter].as(); + }); + } + } + // step 2: starting from Result -- assign weight to ops: + // if parameter dependent, the weight is the maximum of the output ops' weights plus one + // else the weight is the maximum of the output ops' weights + // this step doesn't assign weights to all the ops; this is intentional and will be used in the following step + const std::string weight_rt_info_name = "topological_sort_weight"; + for (auto it = order.rbegin(); it != order.rend(); ++it) { + const auto& op = *it; + int64_t weight = 0; + if (ov::as_type_ptr(op)) { + op->get_rt_info()[weight_rt_info_name] = weight; + } else { + bool output_has_weight = false; + for (const auto& output : op->outputs()) { + for (const auto& input : output.get_target_inputs()) { + const auto& output_op = input.get_node(); + const auto& rt_info = output_op->get_rt_info(); + if (!rt_info.count(weight_rt_info_name)) + continue; + output_has_weight = true; + auto output_weight = rt_info.at(weight_rt_info_name).as(); + weight = output_weight > weight ?
output_weight : weight; + } + } + if (output_has_weight) { + if (op->get_rt_info()[op_depends_on_parameter].as()) { + weight += 1; + } + op->get_rt_info()[weight_rt_info_name] = weight; + } + } + // step 3: propagate weights to all the remaining nodes: + // if weight is already assigned -- skip operation + // else the operation weight is the minimum of the input ops' weights + // if none of the operation inputs has a weight -- this op is isolated and this algorithm doesn't make sense; + // such cases are extremely rare and rather theoretical, so to handle them we return the original ov::Model op order + std::map>> level_to_vector; + for (const auto& op : order) { + if (!op->get_rt_info().count(weight_rt_info_name)) { + int64_t weight = std::numeric_limits::max(); + for (const auto& input : op->input_values()) { + const auto& rt_info = input.get_node_shared_ptr()->get_rt_info(); + if (!rt_info.count(weight_rt_info_name)) + continue; + auto input_weight = rt_info.at(weight_rt_info_name).as(); + weight = input_weight < weight ? input_weight : weight; + } + if (weight != std::numeric_limits::max()) + op->get_rt_info()[weight_rt_info_name] = weight; + else + return m->get_ordered_ops(); + } + level_to_vector[op->get_rt_info().at(weight_rt_info_name).as()].push_back(op); + } + // finalization: levels are traversed in descending order; ops within a level keep the get_ordered_ops order + std::vector> result; + result.reserve(order.size()); + for (auto it = level_to_vector.rbegin(); it != level_to_vector.rend(); ++it) { + const auto& item = *it; + result.insert(result.end(), item.second.begin(), item.second.end()); + for (const auto& op : item.second) { + op->get_rt_info().erase(weight_rt_info_name); + op->get_rt_info().erase(op_depends_on_parameter); + } + } + return result; +} + +void save_shape_sources(const std::shared_ptr& op, STS_map& symbol_shape_source) { + if (!ov::is_type(op) && !ov::is_type(op)) + return; + const auto& output = op->input_value(0); for (const auto& d : output.get_partial_shape()) { if (d.is_static()) continue; @@ -240,7 +335,7 @@ bool ov::pass::OptimizeSymbolsUsedAsValues::run_on_model(const std::shared_ptrget_ordered_ops()) { + for (const auto& op : topological_order(m)) { // Result has output port which has shared (during validate_and_infer_type) tensor with input port. // Transformations may replace input of Result.
After replacement and before Result::validate_and_infer_type -- // output tensor of Result may contain inaccurate shape / symbols due to the sharing with tensor which may be @@ -252,10 +347,9 @@ bool ov::pass::OptimizeSymbolsUsedAsValues::run_on_model(const std::shared_ptroutputs()) { + for (auto& output : op->outputs()) optimize_value_usage(output, symbol_shape_source, symbol_value_source); - save_shape_sources(output, symbol_shape_source); - } + save_shape_sources(op, symbol_shape_source); } return true; } diff --git a/src/common/transformations/tests/sdpa_to_paged_attention_test.cpp b/src/common/transformations/tests/sdpa_to_paged_attention_test.cpp new file mode 100644 index 00000000000000..0443e7b82de5cc --- /dev/null +++ b/src/common/transformations/tests/sdpa_to_paged_attention_test.cpp @@ -0,0 +1,27 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "openvino/pass/sdpa_to_paged_attention.hpp" + +#include + +#include "common_test_utils/test_common.hpp" +#include "openvino/op/add.hpp" +#include "openvino/op/scaled_dot_product_attention.hpp" +#include "openvino/pass/manager.hpp" + +using namespace ov; + +TEST(SDPATOPATest, SDPANotPresent) { + const auto p0 = std::make_shared(element::f32, Shape{1, 32, 32}); + const auto p1 = std::make_shared(element::f32, Shape{1, 32, 32}); + const auto add = std::make_shared(p0, p1); + const auto result = std::make_shared(add); + + auto model = std::make_shared(ResultVector{result}, ParameterVector{p0, p1}); + + ov::pass::Manager manager; + manager.register_pass(); + EXPECT_THROW(manager.run_passes(model), ov::Exception); +} \ No newline at end of file diff --git a/src/common/transformations/tests/symbolic_transformations/label_optimization.cpp b/src/common/transformations/tests/symbolic_transformations/label_optimization.cpp index 881d02b20d295a..eb108e4c6591ba 100644 --- a/src/common/transformations/tests/symbolic_transformations/label_optimization.cpp +++ b/src/common/transformations/tests/symbolic_transformations/label_optimization.cpp @@ -75,22 +75,16 @@ TEST_F(TransformationTestsF, ApplySymbolEquivalence_Concat_Values) { auto input_2 = make_shared(element::f32, PartialShape::dynamic(4)); auto concat = make_shared(OutputVector{input_1, input_2}, -1); - auto shape_1 = make_shared(input_1); - auto gather_1 = make_shared(shape_1, - v0::Constant::create(element::i64, {1}, {3}), - v0::Constant::create(element::i64, {}, {0})); - - auto shape_2 = make_shared(input_2); - auto gather_2 = make_shared(shape_2, - v0::Constant::create(element::i64, {1}, {3}), - v0::Constant::create(element::i64, {}, {0})); - - auto sum = make_shared(gather_1, gather_2); + auto shape = make_shared(concat); + auto gather = make_shared(shape, + v0::Constant::create(element::i64, {1}, {-1}), + v0::Constant::create(element::i64, {}, {0})); auto reshape = make_shared( concat, - make_shared(OutputVector{sum, v0::Constant::create(element::i64, {1}, {-1})}, 0), + make_shared(OutputVector{gather, v0::Constant::create(element::i64, {1}, {-1})}, 0), false); + model_ref = make_shared(NodeVector{reshape}, ParameterVector{input_2, input_1}); } } diff --git a/src/core/reference/include/openvino/reference/atanh.hpp b/src/core/reference/include/openvino/reference/atanh.hpp index 5ba554d55179e3..56be82694d55e4 100644 --- a/src/core/reference/include/openvino/reference/atanh.hpp +++ b/src/core/reference/include/openvino/reference/atanh.hpp @@ -18,9 +18,9 @@ T atanh(const T in) { return std::atanh(in); } -template ::value>::type* = nullptr> 
+// Integral types don't support NAN and INFINITY, use integral limits instead for special values. +template ::value && std::is_signed::value>::type* = nullptr> T atanh(const T in) { - // Integral type not support NAN and INFINITY, use integral limits instead for special values. if (in > 0) { return std::numeric_limits::max(); } else if (in < 0) { @@ -29,6 +29,11 @@ T atanh(const T in) { return 0; } } + +template ::value>::type* = nullptr> +T atanh(const T in) { + return in > 0 ? std::numeric_limits::max() : 0; +} } // namespace func /** diff --git a/src/core/reference/include/openvino/reference/matmul.hpp b/src/core/reference/include/openvino/reference/matmul.hpp index 964bbc5c4a264d..92d6fa3cefb6b6 100644 --- a/src/core/reference/include/openvino/reference/matmul.hpp +++ b/src/core/reference/include/openvino/reference/matmul.hpp @@ -161,7 +161,7 @@ void matmul(const T* arg0, broadcast_axes, sizeof(T)); - arg0_shape_tmp = arg0_br_target_shape; + arg0_shape_tmp = std::move(arg0_br_target_shape); arg0_rank = arg0_shape_tmp.size(); arg0_new_data.swap(tmp); arg0_data = arg0_new_data.data(); @@ -175,7 +175,7 @@ void matmul(const T* arg0, arg1_br_target_shape, broadcast_axes, sizeof(T)); - arg1_shape_tmp = arg1_br_target_shape; + arg1_shape_tmp = std::move(arg1_br_target_shape); arg1_rank = arg1_shape_tmp.size(); arg1_new_data.swap(tmp); arg1_data = arg1_new_data.data(); diff --git a/src/core/reference/src/op/einsum.cpp b/src/core/reference/src/op/einsum.cpp index 74027f424ecb7a..b8b23964346225 100644 --- a/src/core/reference/src/op/einsum.cpp +++ b/src/core/reference/src/op/einsum.cpp @@ -124,7 +124,7 @@ std::unordered_map> compute_label_dim_map(const for (size_t ind = 0; ind < num_broadcasted_dims; ++ind) { label_dims.push_back(static_cast(current_dim + ind)); } - resulted_map[label] = label_dims; + resulted_map[label] = std::move(label_dims); current_dim += num_broadcasted_dims; } else if (resulted_map.find(label) != resulted_map.end()) { resulted_map[label].push_back(static_cast(current_dim)); @@ -132,7 +132,7 @@ std::unordered_map> compute_label_dim_map(const } else { std::vector label_dims; label_dims.push_back(static_cast(current_dim)); - resulted_map[label] = label_dims; + resulted_map[label] = std::move(label_dims); ++current_dim; } } @@ -350,8 +350,8 @@ void reduce_input(ov::TensorVector& inputs, reference::reduce_sum(input_ptr.data(), output_ptr.data(), input_shape, reduced_axes); // update a vector of inputs and input subscripts - inputs[input_ind] = output_ptr; - input_subscripts[input_ind] = new_input_subscript; + inputs[input_ind] = std::move(output_ptr); + input_subscripts[input_ind] = std::move(new_input_subscript); } /// \brief Transpose input to layout specified through the required subscript @@ -408,7 +408,7 @@ void transpose_input(ov::TensorVector& inputs, output_shape); // update a vector of inputs and input subscripts - inputs[input_ind] = output_ptr; + inputs[input_ind] = std::move(output_ptr); input_subscripts[input_ind] = required_subscript; } @@ -452,7 +452,7 @@ void broadcast_input(ov::TensorVector& inputs, broadcast_axes, input.get_element_type().size()); - input = output; + input = std::move(output); } /// \brief Build identity tensor that will be used to zero non-diagonal tensor @@ -528,7 +528,7 @@ ov::Tensor build_multi_identity(const ov::Tensor& input, multi_identity.get_shape(), identity.get_shape(), ov::op::AutoBroadcastType::NUMPY); - multi_identity = mul_output; + multi_identity = std::move(mul_output); } return multi_identity; } @@ -545,7 +545,7 @@ void 
extract_diagonal(ov::TensorVector& inputs, std::vector& input_ const auto& input_ptr = inputs[input_ind]; const auto& input_subscript = input_subscripts[input_ind]; - const auto input_shape = input_ptr.get_shape(); + const auto& input_shape = input_ptr.get_shape(); std::string resultant_subscript = ""; constexpr char ellipsis[] = "..."; @@ -591,8 +591,8 @@ void extract_diagonal(ov::TensorVector& inputs, std::vector& input_ auto result = ov::Tensor(input_ptr.get_element_type(), result_shape); reference::reduce_sum(mul_output.data(), result.data(), mul_output.get_shape(), reduced_axes); - inputs[input_ind] = result; - input_subscripts[input_ind] = resultant_subscript; + inputs[input_ind] = std::move(result); + input_subscripts[input_ind] = std::move(resultant_subscript); } /// \brief Reshape input to the new shape specified by sub-shapes of the diff --git a/src/core/reference/src/op/fft.cpp b/src/core/reference/src/op/fft.cpp index 9c88b21fd8d1b8..1e0c04eb4c4e35 100644 --- a/src/core/reference/src/op/fft.cpp +++ b/src/core/reference/src/op/fft.cpp @@ -306,7 +306,8 @@ InfoForFFTCalculation get_info_for_calculation(const Shape& input_data_shape, const int64_t complex_data_rank = static_cast(input_data_shape.size() - 1); const auto reversed_output_shape = fft_common::reverse_shape_of_emulated_complex_tensor(output_shape); - auto fft_axes = get_axes(axes_data, axes_data_shape, complex_data_rank); + auto& fft_axes = result.fft_axes; + fft_axes = get_axes(axes_data, axes_data_shape, complex_data_rank); fft_axes = fft_common::reverse_fft_axes(fft_axes, complex_data_rank); const int64_t fft_rank = fft_axes.size(); @@ -320,30 +321,22 @@ InfoForFFTCalculation get_info_for_calculation(const Shape& input_data_shape, const auto outer_strides = fft_common::compute_strides(outer_lengths); const int64_t outer_size = outer_strides[outer_rank]; - const int64_t buffer_size = compute_buffer_size(fft_lengths); - const auto output_strides = fft_common::compute_strides(reversed_output_shape); - const auto output_fft_strides = get_lengths(output_strides, fft_axes); - const auto output_outer_strides = get_lengths(output_strides, outer_axes); const auto reversed_input_shape = fft_common::reverse_shape_of_emulated_complex_tensor(input_data_shape); - const auto input_fft_lengths = get_lengths(reversed_input_shape, fft_axes); const auto input_strides = fft_common::compute_strides(reversed_input_shape); - const auto input_fft_strides = get_lengths(input_strides, fft_axes); - const auto input_outer_strides = get_lengths(input_strides, outer_axes); - result.fft_axes = fft_axes; result.fft_lengths = fft_lengths; result.fft_strides = fft_strides; result.outer_strides = outer_strides; - result.output_fft_strides = output_fft_strides; - result.output_outer_strides = output_outer_strides; - result.input_fft_lengths = input_fft_lengths; - result.input_fft_strides = input_fft_strides; - result.input_outer_strides = input_outer_strides; + result.output_fft_strides = get_lengths(output_strides, fft_axes); + result.output_outer_strides = get_lengths(output_strides, outer_axes); + result.input_fft_lengths = get_lengths(reversed_input_shape, fft_axes); + result.input_fft_strides = get_lengths(input_strides, fft_axes); + result.input_outer_strides = get_lengths(input_strides, outer_axes); result.fft_rank = fft_rank; result.fft_size = fft_size; result.outer_size = outer_size; - result.buffer_size = buffer_size; + result.buffer_size = compute_buffer_size(fft_lengths); return result; } diff --git 
a/src/core/reference/src/op/interpolate.cpp b/src/core/reference/src/op/interpolate.cpp index 3b4adc340507cf..ff9bf20eb1a293 100644 --- a/src/core/reference/src/op/interpolate.cpp +++ b/src/core/reference/src/op/interpolate.cpp @@ -93,10 +93,10 @@ InterpolateEvalHelper::InfoForGenericLinearONNXMode InterpolateEvalHelper::get_i result.batch_size = batch_size; result.num_channels = num_channels; result.spatial_rank = static_cast(spatial_rank); - result.input_index_multipliers = input_index_multipliers; - result.output_index_multipliers = output_index_multipliers; - result.input_spatial_shape = input_spatial_shape; - result.output_spatial_shape = output_spatial_shape; + result.input_index_multipliers = std::move(input_index_multipliers); + result.output_index_multipliers = std::move(output_index_multipliers); + result.input_spatial_shape = std::move(input_spatial_shape); + result.output_spatial_shape = std::move(output_spatial_shape); return result; } @@ -134,10 +134,10 @@ InterpolateEvalHelper::InfoForLinearMode InterpolateEvalHelper::get_info_for_lin InfoForLinearMode result; result.antialias = antialias; - result.a = a; - result.r = r; + result.a = std::move(a); + result.r = std::move(r); result.prod_a = prod_a; - result.shape_for_indices = shape_for_indices; + result.shape_for_indices = std::move(shape_for_indices); return result; } @@ -163,8 +163,8 @@ InterpolateEvalHelper::ICoords InterpolateEvalHelper::get_icoords(const Coordina icoords_r[axis] = static_cast(std::round(in_coord)); } - result.icoords = icoords; - result.icoords_r = icoords_r; + result.icoords = std::move(icoords); + result.icoords_r = std::move(icoords_r); return result; } @@ -218,7 +218,7 @@ InterpolateEvalHelper::LinearModeInnerIterationResult InterpolateEvalHelper::inn Coordinate inner_coord{unsigned_inner_coords_vector}; result.w = w; - result.inner_coord = inner_coord; + result.inner_coord = std::move(inner_coord); return result; } diff --git a/src/core/reference/src/op/loop.cpp b/src/core/reference/src/op/loop.cpp index f6cbae6ffaec46..17d9a57e538b93 100644 --- a/src/core/reference/src/op/loop.cpp +++ b/src/core/reference/src/op/loop.cpp @@ -51,7 +51,7 @@ void loop(const std::shared_ptr& func, ov::Tensor in_tensor(func->get_parameters().at(cur_iter_idx)->get_element_type(), func->get_parameters().at(cur_iter_idx)->get_shape()); std::memset(in_tensor.data(), 0, in_tensor.get_byte_size()); - inputs_to_body.at(cur_iter_idx) = in_tensor; + inputs_to_body.at(cur_iter_idx) = std::move(in_tensor); } // Port map processing: inputs and back edges diff --git a/src/core/src/pass/sdpa_to_paged_attention.cpp b/src/core/src/pass/sdpa_to_paged_attention.cpp index 1eaf15c928db01..0d71c6a4b0d8dc 100644 --- a/src/core/src/pass/sdpa_to_paged_attention.cpp +++ b/src/core/src/pass/sdpa_to_paged_attention.cpp @@ -7,6 +7,7 @@ #include "openvino/cc/pass/itt.hpp" #include "openvino/op/constant.hpp" #include "openvino/op/gather.hpp" +#include "openvino/op/scaled_dot_product_attention.hpp" #include "openvino/op/shape_of.hpp" #include "openvino/op/unsqueeze.hpp" #include "openvino/pass/manager.hpp" @@ -29,6 +30,11 @@ static std::shared_ptr setName(std::shared_ptr nod bool ov::pass::SDPAToPagedAttention::run_on_model(const std::shared_ptr& model) { RUN_ON_MODEL_SCOPE(SDPAToPagedAttention); + + OPENVINO_ASSERT(ov::op::util::has_op_with_type(model), + "No ScaledDotProductAttention operation observed in the graph, cannot perform " + "the SDPAToPagedAttention transformation."); + auto max_context_len = setName(std::make_shared(element::i32,
PartialShape{}), "max_context_len"); ParameterVector model_remaining_params = { setName(std::make_shared(element::i32, PartialShape{-1}), "past_lens"), diff --git a/src/plugins/intel_cpu/src/nodes/deconv.cpp b/src/plugins/intel_cpu/src/nodes/deconv.cpp index 853dfc20d11299..d3f1ae0ba691a5 100644 --- a/src/plugins/intel_cpu/src/nodes/deconv.cpp +++ b/src/plugins/intel_cpu/src/nodes/deconv.cpp @@ -495,7 +495,7 @@ void Deconvolution::getSupportedDescriptors() { creatorsMap.at(format)->createSharedDesc(getOriginalInputPrecisionAtPort(0), getInputShapeAtPort(i))); } - for (size_t i = 0; i < getChildEdges().size(); ++i) { + for (size_t i = 0; i < config.outConfs.size(); ++i) { config.outConfs[i].setMemDesc( creatorsMap.at(format)->createSharedDesc(getOriginalOutputPrecisionAtPort(0), getOutputShapeAtPort(i))); } @@ -1145,7 +1145,7 @@ void Deconvolution::initSupportedPrimitiveDescriptors() { creatorsMap.at(format)->createSharedDesc(getOriginalInputPrecisionAtPort(0), getInputShapeAtPort(i))); } - for (size_t i = 0; i < getChildEdges().size(); ++i) { + for (size_t i = 0; i < config.outConfs.size(); ++i) { config.outConfs[i].setMemDesc( creatorsMap.at(format)->createSharedDesc(getOriginalOutputPrecisionAtPort(0), getOutputShapeAtPort(i))); } diff --git a/src/plugins/intel_cpu/src/nodes/unique.cpp b/src/plugins/intel_cpu/src/nodes/unique.cpp index ad322756ab28e3..130213dfcb8703 100644 --- a/src/plugins/intel_cpu/src/nodes/unique.cpp +++ b/src/plugins/intel_cpu/src/nodes/unique.cpp @@ -225,41 +225,31 @@ void Unique::flattenTensorExec() { } } } else { - uniDataTmpPtr[0] = srcDataPtr[0]; - if (definedOutputs[FIRST_UNIQUE_IDX]) { - firstTmpPtr[0] = 0; - } - if (definedOutputs[INPUT_TO_UNIQ_IDX]) { - inToOutTmpPtr[0] = 0; - } + std::unordered_map uniq; + uniq.reserve(inputLen); + if (definedOutputs[OCCURRENCES_NUM]) { std::fill(occurTmpPtr, occurTmpPtr + inputLen, 1); } - uniqueLen = 1; - - for (size_t i = 1; i < inputLen; i++) { - bool found = false; - size_t j = 0; - for (; j < uniqueLen; j++) { - if (uniDataTmpPtr[j] == srcDataPtr[i]) { - found = true; - break; - } - } - if (!found) { - uniDataTmpPtr[uniqueLen] = srcDataPtr[i]; + + for (size_t i = 0, j = 0; i < inputLen; ++i) { + auto it = uniq.emplace(srcDataPtr[i], j); + inToOutTmpPtr[i] = it.first->second; + if (it.second) { if (definedOutputs[FIRST_UNIQUE_IDX]) { - firstTmpPtr[uniqueLen] = i; + firstTmpPtr[j] = i; } - uniqueLen++; + ++j; } else { if (definedOutputs[OCCURRENCES_NUM]) { - occurTmpPtr[j]++; + occurTmpPtr[inToOutTmpPtr[i]]++; } } - if (definedOutputs[INPUT_TO_UNIQ_IDX]) { - inToOutTmpPtr[i] = j; - } + } + + uniqueLen = static_cast(uniq.size()); + for (const auto& it : uniq) { + uniDataTmpPtr[it.second] = it.first; } } diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp index 3d13cab76dbb23..006935a85e85de 100644 --- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp +++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp @@ -867,7 +867,7 @@ void Transformations::MainSnippets(void) { #if defined(OPENVINO_ARCH_X86_64) auto is_supported_matmul = [this](const std::shared_ptr& n) { const auto matmul = ov::as_type_ptr(n); - if (!matmul) + if (!matmul || matmul->is_dynamic()) return false; const auto in_type0 = matmul->get_input_element_type(0); const auto in_type1 = matmul->get_input_element_type(1); diff --git 
a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/arm/deconv_multiple_output_edges.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/arm/deconv_multiple_output_edges.cpp new file mode 100644 index 00000000000000..b2cb4785fb5720 --- /dev/null +++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/arm/deconv_multiple_output_edges.cpp @@ -0,0 +1,70 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "common_test_utils/node_builders/constant.hpp" +#include "common_test_utils/node_builders/eltwise.hpp" +#include "shared_test_classes/base/ov_subgraph.hpp" +#include "utils/cpu_test_utils.hpp" +#include "common_test_utils/node_builders/convolution_backprop_data.hpp" + +using namespace CPUTestUtils; + +namespace ov { +namespace test { + +// Subgraph: +/* +┌──────────────────┐ ┌──────────────────┐ +│ INPUT │ │ WEIGHTS │ +└─────────┬────────┘ └─────────┬────────┘ + │ ┌──────────────────┐ │ + └──────┤ DECONVOLUTION ├────┘ + └──┬───────────┬───┘ + │ │ + ┌───────────────┴──┐ ┌──┴───────────────┐ + │ MULTIPLY │ │ MULTIPLY │ + └──────────────────┘ └──────────────────┘ + +Verify deconvolution node correctly handles + multiple output edges on a single output port + */ + +class DeconvMultipleOutputEdges : virtual public SubgraphBaseStaticTest { +public: + void SetUp() override { + auto ngPrc = ov::element::f32; + const ov::Shape inShape = {2, 12, 7, 7}; + const ov::Shape weiShape = {12, 6, 3, 3}; + ov::ParameterVector inputParams{std::make_shared(ngPrc, inShape), + std::make_shared(ngPrc, weiShape)}; + + auto deconv = utils::make_convolution_backprop_data(inputParams[0], + inputParams[1], + ov::element::f32, + ov::Strides({1, 1}), + ov::CoordinateDiff({0, 0}), + ov::CoordinateDiff({0, 0}), + ov::Strides({1, 1}), + ov::op::PadType::NOTSET, + false); + deconv->get_rt_info() = CPUTestsBase::makeCPUInfo({nchw}, {nchw}, {}); + + const auto const1 = ov::test::utils::make_constant(ngPrc, std::vector{2, 6, 9, 9}); + const auto const2 = ov::test::utils::make_constant(ngPrc, std::vector{2, 6, 9, 9}); + + const auto mul1 = utils::make_eltwise(deconv->output(0), const1, utils::EltwiseTypes::MULTIPLY); + const auto mul2 = utils::make_eltwise(deconv->output(0), const2, utils::EltwiseTypes::MULTIPLY); + + NodeVector results{mul1, mul2}; + function = std::make_shared(results, inputParams, "DeconvMultipleOutputEdges"); + targetDevice = ov::test::utils::DEVICE_CPU; + } +}; + +TEST_F(DeconvMultipleOutputEdges, smoke_DeconvMultipleOutputEdges_CPU) { + run(); +} + +} // namespace test +} // namespace ov diff --git a/src/plugins/intel_cpu/thirdparty/onednn b/src/plugins/intel_cpu/thirdparty/onednn index b0cd612cd3a378..a320d02d6e733c 160000 --- a/src/plugins/intel_cpu/thirdparty/onednn +++ b/src/plugins/intel_cpu/thirdparty/onednn @@ -1 +1 @@ -Subproject commit b0cd612cd3a378fb2dd73a84efddfca1df2a22db +Subproject commit a320d02d6e733c775724901675cbc8944391459d diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl index 635aa4d796d3db..b1aaded5ad7780 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl @@ -202,8 +202,7 @@ KERNEL(sdpa_opt)( #define QUERY_BLOCK_SIZE 1 INPUT0_TYPE val = BLOCK_READN(INPUT0_TYPE, QUERY_BLOCK_SIZE, query_input, query_offset); - - query_local[query_local_offset] = val; + query_local[query_local_offset] = val * scale_val; 
query_local_offset += QUERY_STEP_LOCAL; query_offset += query_pitch; } @@ -338,7 +337,6 @@ KERNEL(sdpa_opt)( for (uint seq_len = sgid * SUBGROUP_SIZE + sglid; seq_len < partition_seq_len; seq_len += (HEAD_SIZE)) { // Read value from SLM and apply scale qk_val[seq_idx] = qk_local[seq_idx * SEQ_LEN_PARTITION_SIZE + seq_len]; - qk_val[seq_idx] *= scale_val; // Apply attention mask #if IS_CAUSAL diff --git a/src/tests/ov_helpers/ov_snippets_models/include/subgraph_mha.hpp b/src/tests/ov_helpers/ov_snippets_models/include/subgraph_mha.hpp index 3e3cb995b70555..9ec265c9322f5c 100644 --- a/src/tests/ov_helpers/ov_snippets_models/include/subgraph_mha.hpp +++ b/src/tests/ov_helpers/ov_snippets_models/include/subgraph_mha.hpp @@ -42,8 +42,9 @@ namespace snippets { */ class MHAFunction : public SnippetsFunctionBase { public: - explicit MHAFunction(const std::vector& inputShapes, const std::vector& precisions, bool with_mul = true) - : SnippetsFunctionBase(inputShapes), with_mul(with_mul), precisions(precisions) { + explicit MHAFunction(const std::vector& inputShapes, const std::vector& precisions, + bool with_mul = true, bool with_reshape = true) + : SnippetsFunctionBase(inputShapes), with_mul(with_mul), with_reshape(with_reshape), precisions(precisions) { OPENVINO_ASSERT(input_shapes.size() == 4, "Got invalid number of input shapes"); OPENVINO_ASSERT(precisions.size() == 4, "Got invalid number of input precisions"); } @@ -51,8 +52,9 @@ class MHAFunction : public SnippetsFunctionBase { std::shared_ptr initOriginal() const override; std::shared_ptr initReference() const override; - bool with_mul = true; - std::vector precisions; + const bool with_mul = true; + const bool with_reshape = true; + const std::vector precisions; }; class MHASplitMFunction : public MHAFunction { @@ -85,8 +87,9 @@ class MHASplitMFunction : public MHAFunction { */ class MHAMatMul0TransposeFunction : public SnippetsFunctionBase { public: - explicit MHAMatMul0TransposeFunction(const std::vector& inputShapes, const std::vector& precisions) - : SnippetsFunctionBase(inputShapes), precisions(precisions) { + explicit MHAMatMul0TransposeFunction(const std::vector& inputShapes, const std::vector& precisions, + bool with_reshape = true) + : SnippetsFunctionBase(inputShapes), with_reshape(with_reshape), precisions(precisions) { OPENVINO_ASSERT(input_shapes.size() == 4, "Got invalid number of input shapes"); OPENVINO_ASSERT(precisions.size() == 4, "Got invalid number of input precisions"); } @@ -94,7 +97,8 @@ class MHAMatMul0TransposeFunction : public SnippetsFunctionBase { std::shared_ptr initOriginal() const override; std::shared_ptr initReference() const override; - std::vector precisions; + const bool with_reshape = true; + const std::vector precisions; }; /* Graph: diff --git a/src/tests/ov_helpers/ov_snippets_models/src/subgraph_mha.cpp b/src/tests/ov_helpers/ov_snippets_models/src/subgraph_mha.cpp index 3157c53fbb32de..f923a9a3aa168e 100644 --- a/src/tests/ov_helpers/ov_snippets_models/src/subgraph_mha.cpp +++ b/src/tests/ov_helpers/ov_snippets_models/src/subgraph_mha.cpp @@ -70,26 +70,35 @@ std::shared_ptr MHAFunction::initOriginal() const { std::shared_ptr matmul_parent1 = transpose1; if (with_mul) { ov::Shape shape(rank, 1); - shape[rank - 3] = transpose1->get_output_shape(0)[rank - 3]; - std::vector mulConstData(ov::shape_size(shape)); + if (transpose1->get_output_partial_shape(0).is_static()) { + shape[rank - 3] = transpose1->get_output_shape(0)[rank - 3]; + } const auto mulConst = ov::test::utils::make_constant(precisions[1], 
shape); matmul_parent1 = std::make_shared(transpose1, mulConst); } const auto matMul0 = std::make_shared(transpose0, matmul_parent1); const auto add = std::make_shared(matMul0, addParam); - const auto interm_shape = add->get_output_shape(0); - const auto batch = std::accumulate(interm_shape.cbegin(), interm_shape.cbegin() + rank - 1, 1, std::multiplies()); - const auto reshape0ConstData = std::vector{ batch, -1 }; - const auto reshape1ConstData = interm_shape; - const auto reshape0Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{reshape0ConstData.size()}, reshape0ConstData); - const auto reshape1Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{reshape1ConstData.size()}, reshape1ConstData); + auto softmax_out = add->output(0); + if (with_reshape) { + const auto interm_shape = add->get_output_shape(0); + const auto batch = std::accumulate(interm_shape.cbegin(), interm_shape.cbegin() + rank - 1, 1, std::multiplies()); + const auto reshape0ConstData = std::vector{ batch, -1 }; + const auto reshape1ConstData = interm_shape; + const auto reshape0Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{reshape0ConstData.size()}, reshape0ConstData); + const auto reshape1Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{reshape1ConstData.size()}, reshape1ConstData); + + const auto reshape0 = std::make_shared(add, reshape0Const, true); + const auto softMax = std::make_shared(reshape0, 1); + const auto reshape1 = std::make_shared(softMax, reshape1Const, true); + softmax_out = reshape1->output(0); + } else { + const auto softMax = std::make_shared(add, rank - 1); + softmax_out = softMax->output(0); + } - const auto reshape0 = std::make_shared(add, reshape0Const, true); - const auto softMax = std::make_shared(reshape0, 1); - const auto reshape1 = std::make_shared(softMax, reshape1Const, true); const auto transpose2 = std::make_shared(transpose2Param, transpose2Const); - const auto matMul1 = std::make_shared(reshape1, transpose2); + const auto matMul1 = std::make_shared(softmax_out, transpose2); const auto transpose3 = std::make_shared(matMul1, transpose3Const); ov::ResultVector results{std::make_shared(transpose3)}; @@ -124,13 +133,19 @@ std::shared_ptr MHAFunction::initReference() const { std::shared_ptr matmul_parent1 = transpose1; if (with_mul) { ov::Shape shape(rank, 1); - shape[rank - 3] = transpose1->get_output_shape(0)[rank - 3]; - std::vector mulConstData(ov::shape_size(shape)); + if (transpose1->get_output_partial_shape(0).is_static()) { + shape[rank - 3] = transpose1->get_output_shape(0)[rank - 3]; + } const auto mulConst = ov::test::utils::make_constant(precisions[1], shape); - const auto mulParam = std::make_shared(precisions[1], mulConst->get_shape()); - matmul_parent1 = std::make_shared(transpose1, mulParam); - subgraph_params = {transpose0Param, transpose1Param, mulParam, addParam, transpose2Param}; - subgraph_inputs = {data0, data1, mulConst, data2, data3}; + + if (ov::shape_size(shape) > 1) { + const auto mulParam = std::make_shared(precisions[1], mulConst->get_shape()); + matmul_parent1 = std::make_shared(transpose1, mulParam); + subgraph_params = {transpose0Param, transpose1Param, mulParam, addParam, transpose2Param}; + subgraph_inputs = {data0, data1, mulConst, data2, data3}; + } else { + matmul_parent1 = std::make_shared(transpose1, mulConst); + } } const auto matMul0 = std::make_shared(transpose0, matmul_parent1); @@ -182,16 +197,22 @@ std::shared_ptr MHASplitMFunction::initReference() const { std::shared_ptr matmul_parent1 = 
transpose1; if (with_mul) { ov::Shape shape(rank - 1, 1); - shape[rank - 4] = transpose1->get_output_shape(0)[rank - 4]; - ov::Shape reshape_shape = shape; - reshape_shape.insert(reshape_shape.cbegin() + rank - 3, 1); - std::vector mulConstData(ov::shape_size(shape)); + if (transpose1->get_output_partial_shape(0).is_static()) { + shape[rank - 4] = transpose1->get_output_shape(0)[rank - 4]; + } const auto mulConst = ov::test::utils::make_constant(precisions[1], shape); - const auto reshape_mul = make_reshape(mulConst, reshape_shape); - const auto mulParam = std::make_shared(precisions[1], reshape_mul->get_shape()); - matmul_parent1 = std::make_shared(transpose1, mulParam); - subgraph_params = {transpose0Param, transpose1Param, mulParam, addParam, transpose2Param}; - subgraph_inputs = {reshape0, reshape1, reshape_mul, reshape2, reshape3}; + + if (ov::shape_size(shape) > 1) { + ov::Shape reshape_shape = shape; + reshape_shape.insert(reshape_shape.cbegin() + rank - 3, 1); + const auto mulReshape = make_reshape(mulConst, reshape_shape); + const auto mulParam = std::make_shared(precisions[1], mulReshape->get_shape()); + matmul_parent1 = std::make_shared(transpose1, mulParam); + subgraph_params = {transpose0Param, transpose1Param, mulParam, addParam, transpose2Param}; + subgraph_inputs = {reshape0, reshape1, mulReshape, reshape2, reshape3}; + } else { + matmul_parent1 = std::make_shared(transpose1, mulConst); + } } const auto matMul0 = std::make_shared(transpose0, matmul_parent1); @@ -217,47 +238,42 @@ std::shared_ptr MHAMatMul0TransposeFunction::initOriginal() const { auto transpose2Param = std::make_shared(precisions[3], input_shapes[3]); ov::ParameterVector ngraphParam = {transpose0Param, transpose1Param, addParam, transpose2Param}; - std::vector constantShapes; - constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()})); - constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()})); - constantShapes.push_back(ov::Shape({1, input_shapes[1].get_shape()[2], 1, 1})); - constantShapes.push_back(ov::Shape({2})); - constantShapes.push_back(ov::Shape({4})); - constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()})); - constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()})); - - const auto order = std::vector{0, 2, 1, 3}; - auto transpose0Const = ov::op::v0::Constant::create(ov::element::i64, constantShapes[0], order); - auto transpose1Const = ov::op::v0::Constant::create(ov::element::i64, constantShapes[1], order); - auto transpose2Const = ov::op::v0::Constant::create(ov::element::i64, constantShapes[5], order); - auto transpose3Const = ov::op::v0::Constant::create(ov::element::i64, constantShapes[6], order); - - std::vector mulConstData(1); - auto mulConst = ov::test::utils::make_constant(precisions[1], ov::Shape{1}); - - std::vector reshape0ConstData = {static_cast(input_shapes[0].get_shape()[0] * - input_shapes[0].get_shape()[1] * input_shapes[0].get_shape()[2]), - -1}; - auto reshape0Const = ov::op::v0::Constant::create(ov::element::i64, constantShapes[3], reshape0ConstData); + const auto rank = input_shapes[0].size(); + const auto fusion_order = get_fusion_order(rank); - std::vector reshape1ConstData = {static_cast(input_shapes[0].get_shape()[0]), - static_cast(input_shapes[0].get_shape()[2]), - static_cast(input_shapes[0].get_shape()[1]), - static_cast(input_shapes[0].get_shape()[1])}; - auto reshape1Const = ov::op::v0::Constant::create(ov::element::i64, constantShapes[4], reshape1ConstData); + const auto transpose0Const = 
ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, fusion_order); + const auto transpose1Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, fusion_order); + const auto transpose2Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, fusion_order); + const auto transpose3Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, fusion_order); - float transA = false; - float transB = false; const auto transpose0 = std::make_shared(transpose0Param, transpose0Const); const auto transpose1 = std::make_shared(transpose1Param, transpose1Const); + + const auto mulConst = ov::test::utils::make_constant(precisions[1], ov::Shape{1}); const auto mul = std::make_shared(transpose1, mulConst); - const auto matMul0 = std::make_shared(transpose0, mul, transA, true); + const auto matMul0 = std::make_shared(transpose0, mul, false, true); const auto add = std::make_shared(matMul0, addParam); - const auto reshape0 = std::make_shared(add, reshape0Const, true); - const auto softMax = std::make_shared(reshape0, 1); - const auto reshape1 = std::make_shared(softMax, reshape1Const, true); + + auto softmax_out = add->output(0); + if (with_reshape) { + const auto interm_shape = add->get_output_shape(0); + const auto batch = std::accumulate(interm_shape.cbegin(), interm_shape.cbegin() + rank - 1, 1, std::multiplies()); + const auto reshape0ConstData = std::vector{ batch, -1 }; + const auto reshape1ConstData = interm_shape; + const auto reshape0Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{reshape0ConstData.size()}, reshape0ConstData); + const auto reshape1Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{reshape1ConstData.size()}, reshape1ConstData); + + const auto reshape0 = std::make_shared(add, reshape0Const, true); + const auto softMax = std::make_shared(reshape0, 1); + const auto reshape1 = std::make_shared(softMax, reshape1Const, true); + softmax_out = reshape1->output(0); + } else { + const auto softMax = std::make_shared(add, rank - 1); + softmax_out = softMax->output(0); + } + const auto transpose2 = std::make_shared(transpose2Param, transpose2Const); - const auto matMul1 = std::make_shared(reshape1, transpose2, transA, transB); + const auto matMul1 = std::make_shared(softmax_out, transpose2); const auto transpose3 = std::make_shared(matMul1, transpose3Const); ov::ResultVector results{std::make_shared(transpose3)}; @@ -269,58 +285,38 @@ std::shared_ptr MHAMatMul0TransposeFunction::initReference() const { auto data2 = std::make_shared(precisions[2], input_shapes[2]); auto data3 = std::make_shared(precisions[3], input_shapes[3]); ov::ParameterVector ngraphParams = {data0, data1, data2, data3}; + NodeVector subgraph_inputs = {data0, data1, data2, data3}; auto transpose0Param = std::make_shared(precisions[0], input_shapes[0]); auto transpose1Param = std::make_shared(precisions[1], input_shapes[1]); auto addParam = std::make_shared(precisions[2], input_shapes[2]); auto transpose2Param = std::make_shared(precisions[3], input_shapes[3]); - std::vector constantShapes; - constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()})); - constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()})); - constantShapes.push_back(ov::Shape({1, input_shapes[1].get_shape()[2], 1, 1})); - constantShapes.push_back(ov::Shape({2})); - constantShapes.push_back(ov::Shape({4})); - constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()})); - 
constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()})); - - auto transpose0Const = ov::op::v0::Constant::create(ov::element::i64, constantShapes[0], std::vector{0, 2, 1, 3}); - auto transpose1Const = ov::op::v0::Constant::create(ov::element::i64, constantShapes[1], std::vector{0, 2, 3, 1}); - auto transpose2Const = ov::op::v0::Constant::create(ov::element::i64, constantShapes[5], std::vector{0, 2, 1, 3}); - auto transpose3Const = ov::op::v0::Constant::create(ov::element::i64, constantShapes[6], std::vector{0, 2, 1, 3}); - - std::vector mulConstData(1); - auto mulConst = ov::test::utils::make_constant(precisions[1], ov::Shape{1}); - ov::ParameterVector subgraphParams = {transpose0Param, transpose1Param, addParam, transpose2Param}; + ov::ParameterVector subgraph_params = {transpose0Param, transpose1Param, addParam, transpose2Param}; - std::vector reshape0ConstData = {static_cast(input_shapes[0].get_shape()[0] * - input_shapes[0].get_shape()[1] * input_shapes[0].get_shape()[2]), - -1}; - auto reshape0Const = ov::op::v0::Constant::create(ov::element::i64, constantShapes[3], reshape0ConstData); + const auto rank = input_shapes[0].size(); + const auto fusion_order = get_fusion_order(rank); + const auto decomposed_order = get_decomposed_order(rank); - std::vector reshape1ConstData = {static_cast(input_shapes[0].get_shape()[0]), - static_cast(input_shapes[0].get_shape()[2]), - static_cast(input_shapes[0].get_shape()[1]), - static_cast(input_shapes[0].get_shape()[1])}; - auto reshape1Const = ov::op::v0::Constant::create(ov::element::i64, constantShapes[4], reshape1ConstData); + const auto transpose0Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, fusion_order); + const auto transpose1Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, decomposed_order); + const auto transpose2Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, fusion_order); + const auto transpose3Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, fusion_order); - float transA = false; - float transB = false; const auto transpose0 = std::make_shared(transpose0Param, transpose0Const); const auto transpose1 = std::make_shared(transpose1Param, transpose1Const); + + const auto mulConst = ov::test::utils::make_constant(precisions[1], ov::Shape{1}); const auto mul = std::make_shared(transpose1, mulConst); - const auto matMul0 = std::make_shared(transpose0, mul, transA, transB); + const auto matMul0 = std::make_shared(transpose0, mul); const auto add = std::make_shared(matMul0, addParam); - const auto reshape0 = std::make_shared(add, reshape0Const, true); - const auto softMax = std::make_shared(reshape0, 1); - const auto reshape1 = std::make_shared(softMax, reshape1Const, true); + const auto softMax = std::make_shared(add, rank - 1); const auto transpose2 = std::make_shared(transpose2Param, transpose2Const); - const auto matMul1 = std::make_shared(reshape1, transpose2, transA, transB); + const auto matMul1 = std::make_shared(softMax, transpose2); const auto transpose3 = std::make_shared(matMul1, transpose3Const); - auto subgraph = std::make_shared( - NodeVector{data0, data1, data2, data3}, - std::make_shared(NodeVector{transpose3}, subgraphParams)); + auto subgraph = std::make_shared(subgraph_inputs, + std::make_shared(NodeVector{transpose3}, subgraph_params)); return std::make_shared(NodeVector{subgraph}, ngraphParams); } @@ -982,9 +978,9 @@ std::shared_ptr MHATransposedInputFunction::initReference() const { } } - const auto param0 = 
std::make_shared(precision, data0->get_shape()); - const auto param1 = std::make_shared(precision, in1->get_shape()); - const auto param2 = std::make_shared(precision, data2->get_shape()); + const auto param0 = std::make_shared(precision, data0->get_output_partial_shape(0)); + const auto param1 = std::make_shared(precision, in1->get_output_partial_shape(0)); + const auto param2 = std::make_shared(precision, data2->get_output_partial_shape(0)); std::shared_ptr matmul0_in1 = param1; if (!m_order.empty() && is_supported) {