diff --git a/.github/workflows/job_gpu_tests.yml b/.github/workflows/job_gpu_tests.yml new file mode 100644 index 00000000000000..7ba71afec09748 --- /dev/null +++ b/.github/workflows/job_gpu_tests.yml @@ -0,0 +1,134 @@ +name: GPU + +on: + workflow_call: + inputs: + test_type: + description: 'Type of tests to execute' + type: string + required: true + device: + description: 'Device name (igpu or dgpu)' + type: string + required: true + runner: + description: 'Runner labels by which the runner will be chosen. Example: [ "self-hosted", "igpu" ]' + type: string + required: true + container: + description: 'JSON to be converted to the value of the "container" configuration for the job' + type: string + required: false + default: '{"image": null}' + +jobs: + GPU: + timeout-minutes: 80 + runs-on: ${{ fromJSON(inputs.runner) }} + container: ${{ fromJSON(inputs.container) }} + defaults: + run: + shell: bash + env: + DEBIAN_FRONTEND: noninteractive # to prevent apt-get from waiting user input + INSTALL_DIR: ${{ github.workspace }}/install + INSTALL_TEST_DIR: ${{ github.workspace }}/install/tests + GTEST_PARALLEL_SCRIPT: ${{ github.workspace }}/gtest_parallel.py + steps: + - name: Download OpenVINO package + uses: actions/download-artifact@65a9edc5881444af0b9093a5e628f2fe47ea3b2e # v4.1.7 + with: + name: 'openvino_package' + path: ${{ env.INSTALL_DIR }} + + - name: Download OpenVINO tests package + uses: actions/download-artifact@65a9edc5881444af0b9093a5e628f2fe47ea3b2e # v4.1.7 + with: + name: 'openvino_tests' + path: ${{ env.INSTALL_TEST_DIR }} + + # Needed as ${{ github.workspace }} is not working correctly when using Docker + - name: Setup Variables + run: | + echo "INSTALL_DIR=$GITHUB_WORKSPACE/install" >> "$GITHUB_ENV" + echo "INSTALL_TEST_DIR=$GITHUB_WORKSPACE/install/tests" >> "$GITHUB_ENV" + echo "GTEST_PARALLEL_SCRIPT=$GITHUB_WORKSPACE/gtest_parallel.py" >> "$GITHUB_ENV" + + - name: Extract OpenVINO packages + run: | + pushd $INSTALL_DIR + tar -xzf openvino_package.tar.gz -C $INSTALL_DIR + popd + pushd $INSTALL_TEST_DIR + tar -xzf openvino_tests.tar.gz -C $INSTALL_DIR + popd + + - name: Install dependencies (Linux) + run: | + $INSTALL_DIR/install_dependencies/install_openvino_dependencies.sh -c=core -c=dev -c=gpu -y + + apt-get update && apt-get install -y wget software-properties-common ca-certificates gpg-agent tzdata clinfo + env: + DEBIAN_FRONTEND: noninteractive # to prevent apt-get from waiting user input + TZ: "Europe/London" # to prevent tzdata from waiting user input + - name: Setup Python ${{ env.PYTHON_VERSION }} + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 + with: + python-version: ${{ env.PYTHON_VERSION }} + + - name: Get gtest-parallel script + run: wget https://raw.githubusercontent.com/google/gtest-parallel/master/gtest_parallel.py + + - name: Install compute runtime drivers + run: | + wget https://github.com/intel/intel-graphics-compiler/releases/download/igc-1.0.15985.7/intel-igc-core_1.0.15985.7_amd64.deb + wget https://github.com/intel/intel-graphics-compiler/releases/download/igc-1.0.15985.7/intel-igc-opencl_1.0.15985.7_amd64.deb + wget https://github.com/intel/compute-runtime/releases/download/24.05.28454.6/intel-level-zero-gpu-dbgsym_1.3.28454.6_amd64.ddeb + wget https://github.com/intel/compute-runtime/releases/download/24.05.28454.6/intel-level-zero-gpu_1.3.28454.6_amd64.deb + wget https://github.com/intel/compute-runtime/releases/download/24.05.28454.6/intel-opencl-icd-dbgsym_24.05.28454.6_amd64.ddeb + wget 
https://github.com/intel/compute-runtime/releases/download/24.05.28454.6/intel-opencl-icd_24.05.28454.6_amd64.deb + wget https://github.com/intel/compute-runtime/releases/download/24.05.28454.6/libigdgmm12_22.3.11_amd64.deb + dpkg -i *.deb + + - name: Install media & display runtimes + if: ${{ inputs.device == 'dgpu' }} + run: | + apt-get update && apt-get install -y \ + libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \ + libglapi-mesa libgles2-mesa-dev libglx-mesa0 libigdgmm11 libxatracker2 mesa-va-drivers \ + mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all + + - name: Verify devices + run: clinfo + + # + # Tests + # + + - name: OpenVINO GPU ${{ inputs.test_type }} Tests + id: run_tests + run: | + source ${INSTALL_DIR}/setupvars.sh + + TEST_RESULTS_DIR="${{ inputs.device }}_${{ inputs.test_type }}_tests" + echo "test_results_dir=$TEST_RESULTS_DIR" >> $GITHUB_OUTPUT + + rm -rf ${INSTALL_TEST_DIR}/${TEST_RESULTS_DIR} && mkdir -p ${INSTALL_TEST_DIR}/${TEST_RESULTS_DIR} + + test_filter='' + if [[ "${{ inputs.test_type }}" == "unit" ]]; then + # Ticket: 138018 + test_filter='-*scatter_nd_update_gpu.dynamic_padded_output*:*border_gpu.basic_zero_input*:*bicubic_zeros_no_align_data1x1*:*bicubic_border_align_batches*:*bilinear_zeros_no_align_data1x1*:*non_zero_gpu.empty_input*:*mark_shape_of_subgraphs.concat_with_empty_tensor_inputs*:*concat_cpu_impl.dynamic_4d_f*:*border_gpu.basic_zero_input_dynamic*:*network_test.model_with_empty_input_is_not_dynamic*:*bicubic_zeros_align_data1x1*' + else + test_filter='*smoke*' + fi + python3 ${GTEST_PARALLEL_SCRIPT} ${INSTALL_TEST_DIR}/ov_gpu_${{ inputs.test_type }}_tests --dump_json_test_results=${INSTALL_TEST_DIR}/${TEST_RESULTS_DIR}/ov_gpu_${{ inputs.test_type }}_tests.json -- --report_unique_name --gtest_filter=$test_filter + + + - name: Upload Test Results + uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808 # v4.3.3 + if: always() + with: + name: test-results-${{ inputs.test_type }}-${{ inputs.device }} + path: ${{ env.INSTALL_TEST_DIR }}/${{ steps.run_tests.outputs.test_results_dir }} + if-no-files-found: 'error' diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index 462e30e44103b8..744e693b1cff51 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -652,119 +652,44 @@ jobs: affected-components: ${{ needs.smart_ci.outputs.affected_components }} if: fromJSON(needs.smart_ci.outputs.affected_components).TOKENIZERS - GPU: - name: GPU Tests + iGPU: + name: iGPU Tests needs: [ Build, Smart_CI ] - if: fromJSON(needs.smart_ci.outputs.affected_components).GPU - timeout-minutes: 80 - runs-on: [ self-hosted, gpu ] + uses: ./.github/workflows/job_gpu_tests.yml strategy: max-parallel: 2 fail-fast: false matrix: TEST_TYPE: ['unit', 'func'] - container: - image: ubuntu:20.04 - options: --device /dev/dri:/dev/dri --group-add 109 --group-add 44 - volumes: - - /dev/dri:/dev/dri - defaults: - run: - shell: bash - env: - DEBIAN_FRONTEND: noninteractive # to prevent apt-get from waiting user input - INSTALL_DIR: ${{ github.workspace }}/install - INSTALL_TEST_DIR: ${{ github.workspace }}/install/tests - GTEST_PARALLEL_SCRIPT: ${{ github.workspace }}/gtest_parallel.py - steps: - - name: Download OpenVINO package - uses: actions/download-artifact@65a9edc5881444af0b9093a5e628f2fe47ea3b2e # v4.1.7 - with: - name: 'openvino_package' - path: ${{ env.INSTALL_DIR }} - - - name: Download OpenVINO tests package - uses: 
actions/download-artifact@65a9edc5881444af0b9093a5e628f2fe47ea3b2e # v4.1.7 - with: - name: 'openvino_tests' - path: ${{ env.INSTALL_TEST_DIR }} - - # Needed as ${{ github.workspace }} is not working correctly when using Docker - - name: Setup Variables - run: | - echo "INSTALL_DIR=$GITHUB_WORKSPACE/install" >> "$GITHUB_ENV" - echo "INSTALL_TEST_DIR=$GITHUB_WORKSPACE/install/tests" >> "$GITHUB_ENV" - echo "GTEST_PARALLEL_SCRIPT=$GITHUB_WORKSPACE/gtest_parallel.py" >> "$GITHUB_ENV" - - - name: Extract OpenVINO packages - run: | - pushd $INSTALL_DIR - tar -xzf openvino_package.tar.gz -C $INSTALL_DIR - popd - pushd $INSTALL_TEST_DIR - tar -xzf openvino_tests.tar.gz -C $INSTALL_DIR - popd - - - name: Install dependencies (Linux) - run: | - $INSTALL_DIR/install_dependencies/install_openvino_dependencies.sh -c=core -c=dev -c=gpu -y - - apt-get update && apt-get install -y wget software-properties-common ca-certificates gpg-agent tzdata - env: - DEBIAN_FRONTEND: noninteractive # to prevent apt-get from waiting user input - TZ: "Europe/London" # to prevent tzdata from waiting user input - - - name: Setup Python ${{ env.PYTHON_VERSION }} - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 - with: - python-version: ${{ env.PYTHON_VERSION }} - - - name: Get gtest-parallel script - run: wget https://raw.githubusercontent.com/google/gtest-parallel/master/gtest_parallel.py - - - name: Install GPU Drivers - run: | - wget https://github.com/intel/intel-graphics-compiler/releases/download/igc-1.0.15985.7/intel-igc-core_1.0.15985.7_amd64.deb - wget https://github.com/intel/intel-graphics-compiler/releases/download/igc-1.0.15985.7/intel-igc-opencl_1.0.15985.7_amd64.deb - wget https://github.com/intel/compute-runtime/releases/download/24.05.28454.6/intel-level-zero-gpu-dbgsym_1.3.28454.6_amd64.ddeb - wget https://github.com/intel/compute-runtime/releases/download/24.05.28454.6/intel-level-zero-gpu_1.3.28454.6_amd64.deb - wget https://github.com/intel/compute-runtime/releases/download/24.05.28454.6/intel-opencl-icd-dbgsym_24.05.28454.6_amd64.ddeb - wget https://github.com/intel/compute-runtime/releases/download/24.05.28454.6/intel-opencl-icd_24.05.28454.6_amd64.deb - wget https://github.com/intel/compute-runtime/releases/download/24.05.28454.6/libigdgmm12_22.3.11_amd64.deb - dpkg -i *.deb - - # - # Tests - # - - - name: OpenVINO GPU ${{ matrix.TEST_TYPE }} Tests - run: | - source ${INSTALL_DIR}/setupvars.sh - - rm -rf ${INSTALL_TEST_DIR}/gpu_${{ matrix.TEST_TYPE }}_tests && mkdir -p ${INSTALL_TEST_DIR}/gpu_${{ matrix.TEST_TYPE }}_tests - - test_filter='' - if [[ "${{ matrix.TEST_TYPE }}" == "unit" ]]; then - # Ticket: 138018 - test_filter='-*scatter_nd_update_gpu.dynamic_padded_output*:*border_gpu.basic_zero_input*:*bicubic_zeros_no_align_data1x1*:*bicubic_border_align_batches*:*bilinear_zeros_no_align_data1x1*:*non_zero_gpu.empty_input*:*mark_shape_of_subgraphs.concat_with_empty_tensor_inputs*:*concat_cpu_impl.dynamic_4d_f*:*border_gpu.basic_zero_input_dynamic*:*network_test.model_with_empty_input_is_not_dynamic*:*bicubic_zeros_align_data1x1*' - else - test_filter='*smoke*' - fi - python3 ${GTEST_PARALLEL_SCRIPT} ${INSTALL_TEST_DIR}/ov_gpu_${{ matrix.TEST_TYPE }}_tests --dump_json_test_results=${INSTALL_TEST_DIR}/gpu_${{ matrix.TEST_TYPE }}_tests/ov_gpu_${{ matrix.TEST_TYPE }}_tests.json -- --report_unique_name --gtest_filter=$test_filter - + with: + device: 'igpu' + test_type: ${{ matrix.TEST_TYPE }} + runner: "[ 'self-hosted', 'igpu' ]" + container: '{"image": "ubuntu:20.04", 
"volumes": ["/dev/dri:/dev/dri"], "options": "--group-add 109 --group-add 44 + --device /dev/dri:/dev/dri"}' + if: fromJSON(needs.smart_ci.outputs.affected_components).GPU - - name: Upload Test Results - uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808 # v4.3.3 - if: always() - with: - name: test-results-${{ matrix.TEST_TYPE }}-gpu - path: ${{ env.INSTALL_TEST_DIR }}/gpu_${{ matrix.TEST_TYPE }}_tests - if-no-files-found: 'error' + dGPU: + name: dGPU Tests + needs: [ Build, Smart_CI ] + uses: ./.github/workflows/job_gpu_tests.yml + strategy: + max-parallel: 2 + fail-fast: false + matrix: + TEST_TYPE: ['unit', 'func'] + with: + device: 'dgpu' + test_type: ${{ matrix.TEST_TYPE }} + runner: "[ 'self-hosted', 'dgpu' ]" + container: '{"image": "ubuntu:20.04", "volumes": ["/dev/dri:/dev/dri"], "options": "--group-add 109 --group-add 44 + --device /dev/dri/card0:/dev/dri/card0 --device /dev/dri/renderD128:/dev/dri/renderD128"}' + if: ${{ github.event_name == 'schedule' }} Overall_Status: name: ci/gha_overall_status needs: [Smart_CI, Build, Debian_Packages, Samples, Conformance, ONNX_Runtime, CXX_Unit_Tests, Python_Unit_Tests, TensorFlow_Layer_Tests, - CPU_Functional_Tests, TensorFlow_Models_Tests_Precommit, PyTorch_Models_Tests, NVIDIA_Plugin, Openvino_tokenizers, GPU] + CPU_Functional_Tests, TensorFlow_Models_Tests_Precommit, PyTorch_Models_Tests, NVIDIA_Plugin, Openvino_tokenizers, iGPU] if: ${{ always() }} runs-on: ubuntu-latest steps: diff --git a/docs/articles_en/assets/snippets/multi_threading.py b/docs/articles_en/assets/snippets/multi_threading.py index 9a5baa1e7575b1..6994b26a0d6552 100644 --- a/docs/articles_en/assets/snippets/multi_threading.py +++ b/docs/articles_en/assets/snippets/multi_threading.py @@ -37,7 +37,7 @@ # ! [ov:intel_cpu:multi_threading:part0] # ! [ov:intel_cpu:multi_threading:part1] -# Disable CPU threads pinning for inference when system supoprt it +# Disable CPU threads pinning for inference when the system supports it compiled_model_4 = core.compile_model( model=model, device_name=device_name, diff --git a/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/cpu-device.rst b/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/cpu-device.rst index b45ff8140031e6..d95f97959f5b2a 100644 --- a/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/cpu-device.rst +++ b/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/cpu-device.rst @@ -3,7 +3,11 @@ CPU Device ========== +.. toctree:: + :maxdepth: 1 + :hidden: + cpu-device/performance-hint-and-threads-scheduling .. meta:: :description: The CPU plugin in the Intel® Distribution of OpenVINO™ toolkit @@ -246,12 +250,6 @@ For more details, see the :doc:`optimization guide <../optimize-inference>` and on data transfer between NUMA nodes. In that case it is better to use the ``ov::hint::PerformanceMode::LATENCY`` performance hint. For more details see the :doc:`performance hints <../optimize-inference/high-level-performance-hints>` overview. - .. 
toctree:: - :maxdepth: 1 - :hidden: - - cpu-device/performance-hint-and-threads-scheduling - Dynamic Shapes +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ diff --git a/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/cpu-device/performance-hint-and-threads-scheduling.rst b/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/cpu-device/performance-hint-and-threads-scheduling.rst index 93c8c0bd6b36c7..3087bcf2d95783 100644 --- a/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/cpu-device/performance-hint-and-threads-scheduling.rst +++ b/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/cpu-device/performance-hint-and-threads-scheduling.rst @@ -1,6 +1,5 @@ -.. {#openvino_docs_OV_UG_supported_plugins_CPU_Hints_Threading} -Performance Hints and Threads Scheduling +Performance Hints and Threads Scheduling ======================================== .. meta:: @@ -8,37 +7,46 @@ Performance Hints and Threads Scheduling detects CPU architecture and sets low-level properties based on performance hints automatically. -While all supported devices in OpenVINO offer low-level performance settings, it is advisable not to widely use these settings unless targeting specific platforms and models. The recommended approach is configuring performance in OpenVINO Runtime using high-level performance hints property ``ov::hint::performance_mode``. Performance hints ensure optimal portability and scalability of the applications across various platforms and models. - -To simplify the configuration of hardware devices, OpenVINO offers two performance hints: the latency hint ``ov::hint::PerformanceMode::LATENCY`` and the throughput hint ``ov::hint::PerformanceMode::THROUGHPUT``. - -- ``ov::inference_num_threads`` limits number of logical processors used for CPU inference. - If the number set by the user is greater than the number of logical processors on the platform, multi-threading scheduler only uses the platform number for CPU inference. -- ``ov::num_streams`` limits number of infer requests that can be run in parallel. - If the number set by the user is greater than the number of inference threads, multi-threading scheduler only uses the number of inference threads to ensure that there is at least one thread per stream. -- ``ov::hint::scheduling_core_type`` limits the type of CPU cores for CPU inference when user runs inference on a hybird platform that includes both Performance-cores (P-cores) with Efficient-cores (E-cores). - If user platform only has one type of CPU cores, this property has no effect, and CPU inference always uses this unique core type. -- ``ov::hint::enable_hyper_threading`` limits the use of one or two logical processors per CPU core when platform has CPU hyperthreading enabled. +While all supported devices in OpenVINO offer low-level performance settings, it is advisable +not to use these settings widely unless targeting specific platforms and models. The recommended +approach is to configure performance in OpenVINO Runtime using the high-level performance hints +property ``ov::hint::performance_mode``. Performance hints ensure optimal portability and +scalability of applications across various platforms and models. + +To simplify the configuration of hardware devices, OpenVINO offers two performance hints: the +latency hint ``ov::hint::PerformanceMode::LATENCY`` and the throughput hint +``ov::hint::PerformanceMode::THROUGHPUT``. 
+ +- ``ov::inference_num_threads`` limits the number of logical processors used for CPU inference. + If the number set by the user is greater than the number of logical processors on the platform, + the multi-threading scheduler only uses the platform number for CPU inference. +- ``ov::num_streams`` limits the number of infer requests that can be run in parallel. + If the number set by the user is greater than the number of inference threads, the multi-threading + scheduler only uses the number of inference threads to ensure that there is at least one thread per stream. +- ``ov::hint::scheduling_core_type`` specifies the type of CPU cores for CPU inference when the user runs + inference on a hybrid platform that includes both Performance-cores (P-cores) and Efficient-cores (E-cores). + If the user platform only has one type of CPU core, this property has no effect, and CPU inference always uses this unique core type. +- ``ov::hint::enable_hyper_threading`` limits the use of one or two logical processors per CPU + core when the platform has CPU hyperthreading enabled. If there is only one logical processor per CPU core, such as Efficient-cores, this property has no effect, and CPU inference uses all logical processors. -- ``ov::hint::enable_cpu_pinning`` enable CPU pinning during CPU inference. - If user enable this property but inference scenario cannot support it, this property will be disabled during model compilation. - -For additional details on the above configurations, refer to: +- ``ov::hint::enable_cpu_pinning`` enables CPU pinning during CPU inference. + If the user enables this property but the inference scenario does not support it, this property will be disabled during model compilation. -- `Multi-stream Execution `__ +For additional details on the above configurations, refer to `Multi-stream Execution `__. Latency Hint ################################### -In this scenario, the default setting of ``ov::hint::scheduling_core_type`` is determined by the model precision and the ratio of P-cores and E-cores. +In this scenario, the default setting of ``ov::hint::scheduling_core_type`` is determined by +the model precision and the ratio of P-cores and E-cores. .. note:: - P-cores is short for Performance-cores and E-cores is for Efficient-cores. These are available after 12th Gen Intel® Core™ Processor. + P-cores is short for Performance-cores and E-cores stands for Efficient-cores. These types of cores are available starting with the 12th Gen Intel® Core™ processors. .. _Core Type Table of Latency Hint: +----------------------------+---------------------+---------------------+ -| | INT8 model | FP32 model | +| | INT8 Model | FP32 Model | +============================+=====================+=====================+ | E-cores / P-cores < 2 | P-cores | P-cores | +----------------------------+---------------------+---------------------+ @@ -49,38 +57,39 @@ In this scenario, the default setting of ``ov::hint::scheduling_core_type`` is d .. note:: - Both P-cores and E-cores may be used for any configuration starting from 14th Gen Intel® Core™ Processor on Windows. + Both P-cores and E-cores may be used for any configuration starting with 14th Gen Intel® Core™ processors on Windows. 
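As a hedged example of the core-type selection described above, the sketch below pins inference to P-cores only and reads the effective value back; it assumes the ``SchedulingCoreType`` enum exposed by ``openvino.properties.hint`` and uses a placeholder model path.

.. code-block:: python

   import openvino as ov
   import openvino.properties.hint as hints

   core = ov.Core()
   model = core.read_model("model.xml")  # placeholder path

   compiled = core.compile_model(
       model,
       "CPU",
       {
           hints.performance_mode: hints.PerformanceMode.LATENCY,
           # Override the default core-type choice from the table above
           hints.scheduling_core_type: hints.SchedulingCoreType.PCORE_ONLY,
       },
   )

   # The plugin may still adjust the value based on internal heuristics
   print(compiled.get_property(hints.scheduling_core_type))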
-Then the default settings of low-level performance properties on Windows and Linux are as follows: +Then the default settings for low-level performance properties on Windows and Linux are as follows: -+--------------------------------------+----------------------------------------------------------------+----------------------------------------------------------------+ -| Property | Windows | Linux | -+======================================+================================================================+================================================================+ -| ``ov::num_streams`` | 1 | 1 | -+--------------------------------------+----------------------------------------------------------------+----------------------------------------------------------------+ -| ``ov::inference_num_threads`` | is equal to number of P-cores or P-cores+E-cores on one socket | is equal to number of P-cores or P-cores+E-cores on one socket | -+--------------------------------------+----------------------------------------------------------------+----------------------------------------------------------------+ -| ``ov::hint::scheduling_core_type`` | `Core Type Table of Latency Hint`_ | `Core Type Table of Latency Hint`_ | -+--------------------------------------+----------------------------------------------------------------+----------------------------------------------------------------+ -| ``ov::hint::enable_hyper_threading`` | No | No | -+--------------------------------------+----------------------------------------------------------------+----------------------------------------------------------------+ -| ``ov::hint::enable_cpu_pinning`` | No / Not Supported | Yes except using P-cores and E-cores together | -+--------------------------------------+----------------------------------------------------------------+----------------------------------------------------------------+ ++--------------------------------------+------------------------------------------------------------------------+--------------------------------------------------------------------+ +| Property | Windows | Linux | ++======================================+========================================================================+====================================================================+ +| ``ov::num_streams`` | 1 | 1 | ++--------------------------------------+------------------------------------------------------------------------+--------------------------------------------------------------------+ +| ``ov::inference_num_threads`` | is equal to the number of P-cores or P-cores+E-cores on one socket | is equal to the number of P-cores or P-cores+E-cores on one socket | ++--------------------------------------+------------------------------------------------------------------------+--------------------------------------------------------------------+ +| ``ov::hint::scheduling_core_type`` | `Core Type Table of Latency Hint`_ | `Core Type Table of Latency Hint`_ | ++--------------------------------------+------------------------------------------------------------------------+--------------------------------------------------------------------+ +| ``ov::hint::enable_hyper_threading`` | No | No | ++--------------------------------------+------------------------------------------------------------------------+--------------------------------------------------------------------+ +| ``ov::hint::enable_cpu_pinning`` | No / Not Supported | Yes except using P-cores and E-cores together | 
++--------------------------------------+------------------------------------------------------------------------+--------------------------------------------------------------------+ .. note:: - - ``ov::hint::scheduling_core_type`` might be adjusted for particular inferred model on particular platform based on internal heuristics to guarantee best performance. + - ``ov::hint::scheduling_core_type`` may be adjusted for a particular inferred model on a specific platform based on internal heuristics to guarantee optimal performance. - Both P-cores and E-cores are used for the Latency Hint on Intel® Core™ Ultra Processors on Windows, except in the case of large language models. - - In case hyper-threading is enabled, two logical processors share hardware resource of one CPU core. OpenVINO do not expect to use both logical processors in one stream for one infer request. So ``ov::hint::enable_hyper_threading`` is ``No`` in this scenario. - - ``ov::hint::enable_cpu_pinning`` is disabled by default on Windows/Mac, and enabled on Linux. Such default settings are aligned with typical workloads running in corresponding environment to guarantee better OOB performance. + - In case hyper-threading is enabled, two logical processors share the hardware resources of one CPU core. OpenVINO does not expect to use both logical processors in one stream for a single infer request. So ``ov::hint::enable_hyper_threading`` is set to ``No`` in this scenario. + - ``ov::hint::enable_cpu_pinning`` is disabled by default on Windows and macOS, and enabled on Linux. Such default settings are aligned with typical workloads running in the corresponding environments to guarantee better out-of-the-box (OOB) performance. Throughput Hint ###################################### -In this scenario, thread scheduling first evaluates the memory pressure of the model being inferred on the current platform, and determines the number of threads per stream, as shown below. +In this scenario, thread scheduling first evaluates the memory pressure of the model being +inferred on the current platform, and determines the number of threads per stream, as shown below. +-----------------+-----------------------+ -| Memory Pressure | Threads per stream | +| Memory Pressure | Threads per Stream | +=================+=======================+ | low | 1 P-core or 2 E-cores | +-----------------+-----------------------+ @@ -89,12 +98,13 @@ In this scenario, thread scheduling first evaluates the memory pressure of the m | high | 3 or 4 or 5 | +-----------------+-----------------------+ -Then the value of ``ov::num_streams`` is calculated as ``ov::inference_num_threads`` divided by the number of threads per stream. The default settings of low-level performance properties on Windows and Linux are as follows: +Then the value of ``ov::num_streams`` is calculated by dividing ``ov::inference_num_threads`` +by the number of threads per stream. 
The default settings for low-level performance properties on Windows and Linux are as follows: +--------------------------------------+-------------------------------+-------------------------------+ | Property | Windows | Linux | +======================================+===============================+===============================+ -| ``ov::num_streams`` | Calculate as above | Calculate as above | +| ``ov::num_streams`` | Calculated as above | Calculated as above | +--------------------------------------+-------------------------------+-------------------------------+ | ``ov::inference_num_threads`` | Number of P-cores and E-cores | Number of P-cores and E-cores | +--------------------------------------+-------------------------------+-------------------------------+ @@ -107,16 +117,17 @@ Then the value of ``ov::num_streams`` is calculated as ``ov::inference_num_threa .. note:: - - By default, different core types are not mixed within single stream in this scenario. And cores from different numa nodes are not mixed within single stream. + - By default, different core types are not mixed within a single stream in this scenario. The cores from different NUMA nodes are not mixed within a single stream. Multi-Threading Optimization ############################################## -User can use the following properties to limit available CPU resource for model inference. If the platform or operating system can support this behavior, OpenVINO Runtime will perform multi-threading scheduling based on limited available CPU. +The following properties can be used to limit the available CPU resources for model inference. +If the platform or operating system supports this behavior, the OpenVINO Runtime will perform multi-threading scheduling based on the limited available CPU. -- ``ov::inference_num_threads`` -- ``ov::hint::scheduling_core_type`` -- ``ov::hint::enable_hyper_threading`` +- ``ov::inference_num_threads`` +- ``ov::hint::scheduling_core_type`` +- ``ov::hint::enable_hyper_threading`` .. tab-set:: @@ -137,9 +148,11 @@ User can use the following properties to limit available CPU resource for model .. note:: - ``ov::hint::scheduling_core_type`` and ``ov::hint::enable_hyper_threading`` only support Intel® x86-64 CPU on Linux and Windows in current release. + ``ov::hint::scheduling_core_type`` and ``ov::hint::enable_hyper_threading`` only support Intel® x86-64 CPU on Linux and Windows in the current release. -In some use cases, OpenVINO Runtime will enable CPU threads pinning by default for better performance. User can also turn it on or off using property ``ov::hint::enable_cpu_pinning``. Disable threads pinning might be beneficial in complex applications with several workloads executed in parallel. +In some use cases, OpenVINO Runtime will enable CPU thread pinning by default for better performance. +Users can also turn this feature on or off using the property ``ov::hint::enable_cpu_pinning``. +Disabling thread pinning may be beneficial in complex applications where several workloads are executed in parallel. .. 
tab-set:: diff --git a/docs/openvino_sphinx_theme/openvino_sphinx_theme/static/css/openvino_sphinx_theme.css b/docs/openvino_sphinx_theme/openvino_sphinx_theme/static/css/openvino_sphinx_theme.css index b180a5a096eaf3..8c038c795542e6 100644 --- a/docs/openvino_sphinx_theme/openvino_sphinx_theme/static/css/openvino_sphinx_theme.css +++ b/docs/openvino_sphinx_theme/openvino_sphinx_theme/static/css/openvino_sphinx_theme.css @@ -55,13 +55,6 @@ body { border-color: rgb(var(--ost-color-primary)); } -/* Scrollbox Extension */ - -.scrollbox { - overflow-y:scroll; - height:300px; - margin-bottom: 20px; -} /* Syntax Highlighting */ diff --git a/docs/sphinx_setup/_static/css/custom.css b/docs/sphinx_setup/_static/css/custom.css index a9536c7aa05401..57e2b35a395e06 100644 --- a/docs/sphinx_setup/_static/css/custom.css +++ b/docs/sphinx_setup/_static/css/custom.css @@ -129,7 +129,7 @@ nav.bd-links li > a:hover { text-decoration: underline } -ul#navbar-main-elements > li:hover { +ul#navbar-main-elements > li:hover { text-decoration: underline; color: #fff; } @@ -223,7 +223,7 @@ details.sd-dropdown:not([open]).sd-card { /* Ttile is at the same place for both open and close states */ .sd-card-header { border-radius: 0px !important; - + } /* Ttile is at the same place for both open and close states */ @@ -262,7 +262,7 @@ details.sd-dropdown .sd-summary-title { min-width: 125px!important; } -[aria-labelledby="version-selector"] .dropdown-item { +[aria-labelledby="version-selector"] .dropdown-item { padding: 0.25rem 0.5rem!important; } @@ -437,21 +437,21 @@ div.highlight { /* =================================================== */ @media (max-width: 720px) { - .container, + .container, .container-lg, .container-md, .container-sm, .container-xl { max-width: 1850px; } - + .transition-banner { margin-top: 2rem; } } @media (min-width: 1200px) { - .container, + .container, .container-lg, .container-md, .container-sm, @@ -921,6 +921,7 @@ div.highlight { /* Content formatting for the benchmark pages */ +/* =================================================== */ .picker-options { margin: 15px 0; } @@ -1223,7 +1224,7 @@ table#model-accuracy-and-perf-int8-fp32-table td.data { .newsletter-submit-btn:before { font-family: "Font Awesome 5 Free"; - content: "\f0e0\00a0"; + content: "\f0e0\00a0"; font-size: 1rem; } @@ -1307,3 +1308,29 @@ input:-webkit-autofill { -webkit-box-shadow: 0 0 0px 1000px white inset; } +/* Scrollbox Extension */ +/* =================================================== */ +.scrollbox { + overflow-y:scroll; + height:300px; + margin-bottom: 20px; +} + +/* overriding the 'back to top btn' style from webpack://pydata_sphinx_theme/src/pydata_sphinx_theme/assets/styles/base/_base.scss */ +/* =================================================== */ +#pst-back-to-top { + top: unset; + bottom: 3rem; + left: unset; + right: -2rem; + background-color: #0068b5; + font-size: .8rem; + border-radius: .25rem !important; +} + +/* hide the header for the side menu */ +/* =================================================== */ + +nav.bd-links p.bd-links__title { + display: none; +} \ No newline at end of file diff --git a/src/bindings/python/src/pyopenvino/core/async_infer_queue.cpp b/src/bindings/python/src/pyopenvino/core/async_infer_queue.cpp index 52ba997e6ac2c5..dbb6608b50f0b5 100644 --- a/src/bindings/python/src/pyopenvino/core/async_infer_queue.cpp +++ b/src/bindings/python/src/pyopenvino/core/async_infer_queue.cpp @@ -15,6 +15,7 @@ #include "pyopenvino/core/common.hpp" #include "pyopenvino/core/infer_request.hpp" 
+#include "pyopenvino/utils/utils.hpp" namespace py = pybind11; @@ -64,7 +65,7 @@ class AsyncInferQueue { }); size_t idle_handle = m_idle_handles.front(); // wait for request to make sure it returned from callback - m_requests[idle_handle].m_request.wait(); + m_requests[idle_handle].m_request->wait(); if (m_errors.size() > 0) throw m_errors.front(); return idle_handle; @@ -75,7 +76,7 @@ class AsyncInferQueue { // release GIL to avoid deadlock on python callback py::gil_scoped_release release; for (auto&& request : m_requests) { - request.m_request.wait(); + request.m_request->wait(); } // acquire the mutex to access m_errors std::lock_guard lock(m_mutex); @@ -87,7 +88,7 @@ class AsyncInferQueue { for (size_t handle = 0; handle < m_requests.size(); handle++) { // auto end_time = m_requests[handle].m_end_time; // TODO: pass it bellow? like in InferRequestWrapper - m_requests[handle].m_request.set_callback([this, handle /* ... */](std::exception_ptr exception_ptr) { + m_requests[handle].m_request->set_callback([this, handle /* ... */](std::exception_ptr exception_ptr) { *m_requests[handle].m_end_time = Time::now(); { // acquire the mutex to access m_idle_handles @@ -110,14 +111,17 @@ class AsyncInferQueue { } void set_custom_callbacks(py::function f_callback) { + // need to acquire GIL before py::function deletion + auto callback_sp = Common::utils::wrap_pyfunction(std::move(f_callback)); + for (size_t handle = 0; handle < m_requests.size(); handle++) { - m_requests[handle].m_request.set_callback([this, f_callback, handle](std::exception_ptr exception_ptr) { + m_requests[handle].m_request->set_callback([this, callback_sp, handle](std::exception_ptr exception_ptr) { *m_requests[handle].m_end_time = Time::now(); if (exception_ptr == nullptr) { // Acquire GIL, execute Python function py::gil_scoped_acquire acquire; try { - f_callback(m_requests[handle], m_user_ids[handle]); + (*callback_sp)(m_requests[handle], m_user_ids[handle]); } catch (const py::error_already_set& py_error) { // This should behave the same as assert(!PyErr_Occurred()) // since constructor for pybind11's error_already_set is @@ -193,13 +197,13 @@ void regclass_AsyncInferQueue(py::module m) { // Set new inputs label/id from user self.m_user_ids[handle] = userdata; // Update inputs if there are any - self.m_requests[handle].m_request.set_input_tensor(inputs); + self.m_requests[handle].m_request->set_input_tensor(inputs); // Now GIL can be released - we are NOT working with Python objects in this block { py::gil_scoped_release release; *self.m_requests[handle].m_start_time = Time::now(); // Start InferRequest in asynchronus mode - self.m_requests[handle].m_request.start_async(); + self.m_requests[handle].m_request->start_async(); } }, py::arg("inputs"), @@ -239,13 +243,13 @@ void regclass_AsyncInferQueue(py::module m) { // Set new inputs label/id from user self.m_user_ids[handle] = userdata; // Update inputs if there are any - Common::set_request_tensors(self.m_requests[handle].m_request, inputs); + Common::set_request_tensors(*self.m_requests[handle].m_request, inputs); // Now GIL can be released - we are NOT working with Python objects in this block { py::gil_scoped_release release; *self.m_requests[handle].m_start_time = Time::now(); // Start InferRequest in asynchronus mode - self.m_requests[handle].m_request.start_async(); + self.m_requests[handle].m_request->start_async(); } }, py::arg("inputs"), diff --git a/src/bindings/python/src/pyopenvino/core/common.cpp b/src/bindings/python/src/pyopenvino/core/common.cpp index 
9f57b794e2bff6..179002127960cd 100644 --- a/src/bindings/python/src/pyopenvino/core/common.cpp +++ b/src/bindings/python/src/pyopenvino/core/common.cpp @@ -433,10 +433,14 @@ ov::op::v0::Constant create_shared(py::array& array) { // If ndim is equal to 0, creates scalar Constant. // If size is equal to 0, creates empty Constant. if (array_helpers::is_contiguous(array)) { - auto memory = std::make_shared>( + auto buffer = new ov::SharedBuffer( static_cast((array.ndim() == 0 || array.size() == 0) ? array.mutable_data() : array.mutable_data(0)), array.ndim() == 0 ? array.itemsize() : array.nbytes(), array); + std::shared_ptr> memory(buffer, [](ov::SharedBuffer* buffer) { + py::gil_scoped_acquire acquire; + delete buffer; + }); return ov::op::v0::Constant(type_helpers::get_ov_type(array), array_helpers::get_shape(array), memory); } // If passed array is not C-style, throw an error. @@ -614,7 +618,7 @@ uint32_t get_optimal_number_of_requests(const ov::CompiledModel& actual) { py::dict outputs_to_dict(InferRequestWrapper& request, bool share_outputs, bool decode_strings) { py::dict res; for (const auto& out : request.m_outputs) { - auto t = request.m_request.get_tensor(out); + auto t = request.m_request->get_tensor(out); if (t.get_element_type() == ov::element::string) { if (share_outputs) { PyErr_WarnEx(PyExc_RuntimeWarning, "Result of a string type will be copied to OVDict!", 1); diff --git a/src/bindings/python/src/pyopenvino/core/infer_request.cpp b/src/bindings/python/src/pyopenvino/core/infer_request.cpp index 93a52b1dad681f..9f572d273dc5f3 100644 --- a/src/bindings/python/src/pyopenvino/core/infer_request.cpp +++ b/src/bindings/python/src/pyopenvino/core/infer_request.cpp @@ -18,7 +18,7 @@ inline py::object run_sync_infer(InferRequestWrapper& self, bool share_outputs, { py::gil_scoped_release release; *self.m_start_time = Time::now(); - self.m_request.infer(); + self.m_request->infer(); *self.m_end_time = Time::now(); } return Common::outputs_to_dict(self, share_outputs, decode_strings); @@ -38,7 +38,7 @@ void regclass_InferRequest(py::module m) { cls.def( "set_tensors", [](InferRequestWrapper& self, const py::dict& inputs) { - Common::set_request_tensors(self.m_request, inputs); + Common::set_request_tensors(*self.m_request, inputs); }, py::arg("inputs"), R"( @@ -51,7 +51,7 @@ void regclass_InferRequest(py::module m) { cls.def( "set_tensors", [](InferRequestWrapper& self, const std::string& tensor_name, const std::vector& tensors) { - self.m_request.set_tensors(tensor_name, tensors); + self.m_request->set_tensors(tensor_name, tensors); }, py::arg("tensor_name"), py::arg("tensors"), @@ -73,7 +73,7 @@ void regclass_InferRequest(py::module m) { cls.def( "set_tensors", [](InferRequestWrapper& self, const ov::Output& port, const std::vector& tensors) { - self.m_request.set_tensors(port, tensors); + self.m_request->set_tensors(port, tensors); }, py::arg("port"), py::arg("tensors"), @@ -100,7 +100,7 @@ void regclass_InferRequest(py::module m) { [](InferRequestWrapper& self, const py::dict& outputs) { auto outputs_map = Common::containers::cast_to_tensor_index_map(outputs); for (auto&& output : outputs_map) { - self.m_request.set_output_tensor(output.first, output.second); + self.m_request->set_output_tensor(output.first, output.second); } }, py::arg("outputs"), @@ -117,7 +117,7 @@ void regclass_InferRequest(py::module m) { [](InferRequestWrapper& self, const py::dict& inputs) { auto inputs_map = Common::containers::cast_to_tensor_index_map(inputs); for (auto&& input : inputs_map) { - 
self.m_request.set_input_tensor(input.first, input.second); + self.m_request->set_input_tensor(input.first, input.second); } }, py::arg("inputs"), @@ -131,7 +131,7 @@ void regclass_InferRequest(py::module m) { cls.def( "set_input_tensors", [](InferRequestWrapper& self, const std::vector& tensors) { - self.m_request.set_input_tensors(tensors); + self.m_request->set_input_tensors(tensors); }, py::arg("tensors"), R"( @@ -148,7 +148,7 @@ void regclass_InferRequest(py::module m) { cls.def( "set_input_tensors", [](InferRequestWrapper& self, size_t idx, const std::vector& tensors) { - self.m_request.set_input_tensors(idx, tensors); + self.m_request->set_input_tensors(idx, tensors); }, py::arg("idx"), py::arg("tensors"), @@ -168,7 +168,7 @@ void regclass_InferRequest(py::module m) { cls.def( "infer", [](InferRequestWrapper& self, const ov::Tensor& inputs, bool share_outputs, bool decode_strings) { - self.m_request.set_input_tensor(inputs); + self.m_request->set_input_tensor(inputs); return run_sync_infer(self, share_outputs, decode_strings); }, py::arg("inputs"), @@ -197,7 +197,7 @@ void regclass_InferRequest(py::module m) { "infer", [](InferRequestWrapper& self, const py::dict& inputs, bool share_outputs, bool decode_strings) { // Update inputs if there are any - Common::set_request_tensors(self.m_request, inputs); + Common::set_request_tensors(*self.m_request, inputs); // Call Infer function return run_sync_infer(self, share_outputs, decode_strings); }, @@ -222,7 +222,7 @@ void regclass_InferRequest(py::module m) { "start_async", [](InferRequestWrapper& self, const ov::Tensor& inputs, py::object& userdata) { // Update inputs if there are any - self.m_request.set_input_tensor(inputs); + self.m_request->set_input_tensor(inputs); if (!userdata.is(py::none())) { if (self.m_user_callback_defined) { self.m_userdata = userdata; @@ -232,7 +232,7 @@ void regclass_InferRequest(py::module m) { } py::gil_scoped_release release; *self.m_start_time = Time::now(); - self.m_request.start_async(); + self.m_request->start_async(); }, py::arg("inputs"), py::arg("userdata"), @@ -261,7 +261,7 @@ void regclass_InferRequest(py::module m) { "start_async", [](InferRequestWrapper& self, const py::dict& inputs, py::object& userdata) { // Update inputs if there are any - Common::set_request_tensors(self.m_request, inputs); + Common::set_request_tensors(*self.m_request, inputs); if (!userdata.is(py::none())) { if (self.m_user_callback_defined) { self.m_userdata = userdata; @@ -271,7 +271,7 @@ void regclass_InferRequest(py::module m) { } py::gil_scoped_release release; *self.m_start_time = Time::now(); - self.m_request.start_async(); + self.m_request->start_async(); }, py::arg("inputs"), py::arg("userdata"), @@ -293,7 +293,7 @@ void regclass_InferRequest(py::module m) { cls.def( "cancel", [](InferRequestWrapper& self) { - self.m_request.cancel(); + self.m_request->cancel(); }, R"( Cancels inference request. @@ -303,7 +303,7 @@ void regclass_InferRequest(py::module m) { "wait", [](InferRequestWrapper& self) { py::gil_scoped_release release; - self.m_request.wait(); + self.m_request->wait(); }, R"( Waits for the result to become available. 
@@ -316,7 +316,7 @@ void regclass_InferRequest(py::module m) { "wait_for", [](InferRequestWrapper& self, const int timeout) { py::gil_scoped_release release; - return self.m_request.wait_for(std::chrono::milliseconds(timeout)); + return self.m_request->wait_for(std::chrono::milliseconds(timeout)); }, py::arg("timeout"), R"( @@ -337,7 +337,11 @@ void regclass_InferRequest(py::module m) { [](InferRequestWrapper& self, py::function callback, py::object& userdata) { self.m_userdata = userdata; self.m_user_callback_defined = true; - self.m_request.set_callback([&self, callback](std::exception_ptr exception_ptr) { + + // need to acquire GIL before py::function deletion + auto callback_sp = Common::utils::wrap_pyfunction(std::move(callback)); + + self.m_request->set_callback([&self, callback_sp](std::exception_ptr exception_ptr) { *self.m_end_time = Time::now(); try { if (exception_ptr) { @@ -348,7 +352,7 @@ void regclass_InferRequest(py::module m) { } // Acquire GIL, execute Python function py::gil_scoped_acquire acquire; - callback(self.m_userdata); + (*callback_sp)(self.m_userdata); }); }, py::arg("callback"), @@ -365,7 +369,7 @@ void regclass_InferRequest(py::module m) { cls.def( "get_tensor", [](InferRequestWrapper& self, const std::string& name) { - return self.m_request.get_tensor(name); + return self.m_request->get_tensor(name); }, py::arg("name"), R"( @@ -380,7 +384,7 @@ void regclass_InferRequest(py::module m) { cls.def( "get_tensor", [](InferRequestWrapper& self, const ov::Output& port) { - return self.m_request.get_tensor(port); + return self.m_request->get_tensor(port); }, py::arg("port"), R"( @@ -395,7 +399,7 @@ void regclass_InferRequest(py::module m) { cls.def( "get_tensor", [](InferRequestWrapper& self, const ov::Output& port) { - return self.m_request.get_tensor(port); + return self.m_request->get_tensor(port); }, py::arg("port"), R"( @@ -410,7 +414,7 @@ void regclass_InferRequest(py::module m) { cls.def( "get_input_tensor", [](InferRequestWrapper& self, size_t idx) { - return self.m_request.get_input_tensor(idx); + return self.m_request->get_input_tensor(idx); }, py::arg("index"), R"( @@ -427,7 +431,7 @@ void regclass_InferRequest(py::module m) { cls.def( "get_input_tensor", [](InferRequestWrapper& self) { - return self.m_request.get_input_tensor(); + return self.m_request->get_input_tensor(); }, R"( Gets input tensor of InferRequest. @@ -440,7 +444,7 @@ void regclass_InferRequest(py::module m) { cls.def( "get_output_tensor", [](InferRequestWrapper& self, size_t idx) { - return self.m_request.get_output_tensor(idx); + return self.m_request->get_output_tensor(idx); }, py::arg("index"), R"( @@ -456,7 +460,7 @@ void regclass_InferRequest(py::module m) { cls.def( "get_output_tensor", [](InferRequestWrapper& self) { - return self.m_request.get_output_tensor(); + return self.m_request->get_output_tensor(); }, R"( Gets output tensor of InferRequest. 
@@ -469,7 +473,7 @@ void regclass_InferRequest(py::module m) { cls.def( "set_tensor", [](InferRequestWrapper& self, const std::string& name, const ov::Tensor& tensor) { - self.m_request.set_tensor(name, tensor); + self.m_request->set_tensor(name, tensor); }, py::arg("name"), py::arg("tensor"), @@ -486,7 +490,7 @@ void regclass_InferRequest(py::module m) { cls.def( "set_tensor", [](InferRequestWrapper& self, const ov::Output& port, const ov::Tensor& tensor) { - self.m_request.set_tensor(port, tensor); + self.m_request->set_tensor(port, tensor); }, py::arg("port"), py::arg("tensor"), @@ -503,7 +507,7 @@ void regclass_InferRequest(py::module m) { cls.def( "set_tensor", [](InferRequestWrapper& self, const ov::Output& port, const ov::Tensor& tensor) { - self.m_request.set_tensor(port, tensor); + self.m_request->set_tensor(port, tensor); }, py::arg("port"), py::arg("tensor"), @@ -520,7 +524,7 @@ void regclass_InferRequest(py::module m) { cls.def( "set_input_tensor", [](InferRequestWrapper& self, size_t idx, const ov::Tensor& tensor) { - self.m_request.set_input_tensor(idx, tensor); + self.m_request->set_input_tensor(idx, tensor); }, py::arg("index"), py::arg("tensor"), @@ -538,7 +542,7 @@ void regclass_InferRequest(py::module m) { cls.def( "set_input_tensor", [](InferRequestWrapper& self, const ov::Tensor& tensor) { - self.m_request.set_input_tensor(tensor); + self.m_request->set_input_tensor(tensor); }, py::arg("tensor"), R"( @@ -553,7 +557,7 @@ void regclass_InferRequest(py::module m) { cls.def( "set_output_tensor", [](InferRequestWrapper& self, size_t idx, const ov::Tensor& tensor) { - self.m_request.set_output_tensor(idx, tensor); + self.m_request->set_output_tensor(idx, tensor); }, py::arg("index"), py::arg("tensor"), @@ -570,7 +574,7 @@ void regclass_InferRequest(py::module m) { cls.def( "set_output_tensor", [](InferRequestWrapper& self, const ov::Tensor& tensor) { - self.m_request.set_output_tensor(tensor); + self.m_request->set_output_tensor(tensor); }, py::arg("tensor"), R"( @@ -585,7 +589,7 @@ void regclass_InferRequest(py::module m) { cls.def( "get_profiling_info", [](InferRequestWrapper& self) { - return self.m_request.get_profiling_info(); + return self.m_request->get_profiling_info(); }, py::call_guard(), R"( @@ -602,7 +606,7 @@ void regclass_InferRequest(py::module m) { cls.def( "query_state", [](InferRequestWrapper& self) { - return self.m_request.query_state(); + return self.m_request->query_state(); }, py::call_guard(), R"( @@ -617,7 +621,7 @@ void regclass_InferRequest(py::module m) { cls.def( "reset_state", [](InferRequestWrapper& self) { - return self.m_request.reset_state(); + return self.m_request->reset_state(); }, R"( Resets all internal variable states for relevant infer request to @@ -627,7 +631,7 @@ void regclass_InferRequest(py::module m) { cls.def( "get_compiled_model", [](InferRequestWrapper& self) { - return self.m_request.get_compiled_model(); + return self.m_request->get_compiled_model(); }, R"( Returns the compiled model. 
@@ -700,7 +704,7 @@ void regclass_InferRequest(py::module m) { cls.def_property_readonly( "profiling_info", [](InferRequestWrapper& self) { - return self.m_request.get_profiling_info(); + return self.m_request->get_profiling_info(); }, py::call_guard(), R"( diff --git a/src/bindings/python/src/pyopenvino/core/infer_request.hpp b/src/bindings/python/src/pyopenvino/core/infer_request.hpp index 69f0412a1745c9..719d0374af6ff3 100644 --- a/src/bindings/python/src/pyopenvino/core/infer_request.hpp +++ b/src/bindings/python/src/pyopenvino/core/infer_request.hpp @@ -32,7 +32,7 @@ class InferRequestWrapper { const std::vector>& outputs, bool set_default_callback = true, py::object userdata = py::none()) - : m_request{std::move(request)}, + : m_request{InferRequestWrapper::wrap_infer_request_to_sp(std::move(request))}, m_inputs{inputs}, m_outputs{outputs}, m_userdata{userdata} { @@ -44,7 +44,7 @@ class InferRequestWrapper { // Bump reference counter auto end_time = m_end_time; // Set standard callback which saves "end-time" for inference call - m_request.set_callback([end_time](std::exception_ptr exception_ptr) { + m_request->set_callback([end_time](std::exception_ptr exception_ptr) { *end_time = Time::now(); try { if (exception_ptr) { @@ -73,7 +73,7 @@ class InferRequestWrapper { } // Original ov::InferRequest class that is held by this wrapper - ov::InferRequest m_request; + std::shared_ptr m_request; // Inputs and Outputs inherrited from ov::CompiledModel std::vector> m_inputs; std::vector> m_outputs; @@ -91,11 +91,18 @@ class InferRequestWrapper { tensors.reserve(v.size()); for (auto&& node : v) { - tensors.push_back(m_request.get_tensor(node)); + tensors.push_back(m_request->get_tensor(node)); } return tensors; } + + static std::shared_ptr wrap_infer_request_to_sp(ov::InferRequest request) { + return std::shared_ptr(new ov::InferRequest(std::move(request)), [](ov::InferRequest* request) { + py::gil_scoped_release release; + delete request; + }); + } }; void regclass_InferRequest(py::module m); diff --git a/src/bindings/python/src/pyopenvino/frontend/extension.cpp b/src/bindings/python/src/pyopenvino/frontend/extension.cpp index a4f2e9cae1ca0c..4446ea2c9acc33 100644 --- a/src/bindings/python/src/pyopenvino/frontend/extension.cpp +++ b/src/bindings/python/src/pyopenvino/frontend/extension.cpp @@ -30,19 +30,26 @@ void regclass_frontend_TelemetryExtension(py::module m) { py::function& send_event, py::function& send_error, py::function& send_stack_trace) { + auto send_event_sp = Common::utils::wrap_pyfunction(send_event); + auto send_error_sp = Common::utils::wrap_pyfunction(send_error); + auto send_stack_trace_sp = Common::utils::wrap_pyfunction(send_stack_trace); + return std::make_shared( event_category, - [send_event](const std::string& category, const std::string& action, const std::string& label, int value) { + [send_event_sp](const std::string& category, + const std::string& action, + const std::string& label, + int value) { py::gil_scoped_acquire acquire; - send_event(category, action, label, value); + (*send_event_sp)(category, action, label, value); }, - [send_error](const std::string& category, const std::string& error_message) { + [send_error_sp](const std::string& category, const std::string& error_message) { py::gil_scoped_acquire acquire; - send_error(category, error_message); + (*send_error_sp)(category, error_message); }, - [send_stack_trace](const std::string& category, const std::string& error_message) { + [send_stack_trace_sp](const std::string& category, const std::string& 
error_message) { py::gil_scoped_acquire acquire; - send_stack_trace(category, error_message); + (*send_stack_trace_sp)(category, error_message); }); })); diff --git a/src/bindings/python/src/pyopenvino/utils/utils.cpp b/src/bindings/python/src/pyopenvino/utils/utils.cpp index 27f015b14272c2..feeac2d7a02a73 100644 --- a/src/bindings/python/src/pyopenvino/utils/utils.cpp +++ b/src/bindings/python/src/pyopenvino/utils/utils.cpp @@ -419,5 +419,12 @@ ov::Any py_object_to_any(const py::object& py_obj) { } OPENVINO_ASSERT(false, "Unsupported attribute type."); } +std::shared_ptr<py::function> wrap_pyfunction(py::function f_callback) { + auto callback_sp = std::shared_ptr<py::function>(new py::function(std::move(f_callback)), [](py::function* c) { + py::gil_scoped_acquire acquire; + delete c; + }); + return callback_sp; +} }; // namespace utils }; // namespace Common diff --git a/src/bindings/python/src/pyopenvino/utils/utils.hpp b/src/bindings/python/src/pyopenvino/utils/utils.hpp index 1e0e7f23069d2e..e4048b3f52feb3 100644 --- a/src/bindings/python/src/pyopenvino/utils/utils.hpp +++ b/src/bindings/python/src/pyopenvino/utils/utils.hpp @@ -58,5 +58,7 @@ namespace utils { ov::pass::Serialize::Version convert_to_version(const std::string& version); + std::shared_ptr<py::function> wrap_pyfunction(py::function f_callback); + }; // namespace utils }; // namespace Common diff --git a/src/bindings/python/tests/test_graph/test_op.py b/src/bindings/python/tests/test_graph/test_op.py index 2bd609ef5278f1..5a8abdc55ea86c 100644 --- a/src/bindings/python/tests/test_graph/test_op.py +++ b/src/bindings/python/tests/test_graph/test_op.py @@ -107,9 +107,7 @@ def test_custom_add_model(): def test_custom_op(): model = create_snake_model() - # todo: CVS-141744 - # it hangs with AUTO plugin, but works well with CPU - compiled_model = compile_model(model, "CPU") + compiled_model = compile_model(model) assert isinstance(compiled_model, CompiledModel) request = compiled_model.create_infer_request() diff --git a/src/common/snippets/src/pass/mha_tokenization.cpp b/src/common/snippets/src/pass/mha_tokenization.cpp index 08deb95b12ec22..d928cdd1d33eba 100644 --- a/src/common/snippets/src/pass/mha_tokenization.cpp +++ b/src/common/snippets/src/pass/mha_tokenization.cpp @@ -16,7 +16,7 @@ namespace { bool is_supported_tensor(const ov::descriptor::Tensor& t) { - return t.get_partial_shape().is_static() && ov::snippets::utils::one_of(t.get_shape().size(), 3lu, 4lu); + return t.get_partial_shape().rank().is_static() && ov::snippets::utils::one_of(t.get_partial_shape().size(), 3lu, 4lu); } bool is_supported_intermediate_op(const std::shared_ptr<ov::Node>& node) { @@ -68,6 +68,10 @@ void tokenize_broadcast(const std::shared_ptr<ov::Node>& interm_op, ov::NodeVect // TODO: Can we reuse AppropriateForSubgraph here? 
Seems like it's huge check for Broadcast if (broadcast && broadcast->get_broadcast_spec().m_type == ov::op::AutoBroadcastType::NUMPY && broadcast->get_output_target_inputs(0).size() == 1) { + // TODO: Add support of Broadcast with ShapeOf subgraph on second input + if (!ov::is_type(broadcast->input_value(1).get_node_shared_ptr())) + continue; + broadcast_nodes.push_back(broadcast); const auto pshape = broadcast->get_input_partial_shape(0); @@ -96,10 +100,17 @@ void tokenize_broadcast(const std::shared_ptr& interm_op, ov::NodeVect bool tokenize_reshape_around_softmax(std::shared_ptr& interm_op, std::shared_ptr& reshape, ov::NodeVector& ordered_ops) { reshape = ov::as_type_ptr(interm_op); if (reshape) { - const auto in_shape = reshape->get_input_shape(0); - const auto out_shape = reshape->get_output_shape(0); - if (in_shape.back() != out_shape.back() || reshape->get_output_target_inputs(0).size() != 1) + // TODO: Add support of Reshape with ShapeOf subgraph on second input + if (!ov::is_type(reshape->input_value(1).get_node_shared_ptr())) + return false; + + const auto in_shape = reshape->get_input_partial_shape(0); + const auto out_shape = reshape->get_output_partial_shape(0); + const auto in_last_dim = *in_shape.crbegin(); + const auto out_last_dim = *out_shape.crbegin(); + if (in_last_dim.is_dynamic() || out_last_dim.is_dynamic() || in_last_dim != out_last_dim || reshape->get_output_target_inputs(0).size() != 1) return false; + ordered_ops.push_back(reshape); interm_op = reshape->get_output_target_inputs(0).begin()->get_node()->shared_from_this(); } @@ -204,8 +215,7 @@ bool ov::snippets::pass::TokenizeMHASnippets::is_matmul0_supported(const std::sh ov::snippets::pass::TokenizeMHASnippets::TokenizeMHASnippets(const SnippetsTokenization::Config& config) { MATCHER_SCOPE(TokenizeMHASnippets); - auto m_matmul0 = std::make_shared(ov::pass::pattern::any_input(ov::pass::pattern::has_static_shape()), - ov::pass::pattern::any_input(ov::pass::pattern::has_static_shape())); + auto m_matmul0 = std::make_shared(ov::pass::pattern::any_input(), ov::pass::pattern::any_input()); register_matcher(std::make_shared(m_matmul0, matcher_name), [OV_CAPTURE_CPY_AND_THIS](ov::pass::pattern::Matcher &m) { @@ -224,20 +234,14 @@ ov::snippets::pass::TokenizeMHASnippets::TokenizeMHASnippets(const SnippetsToken // Example: // Buffer - i32 [32, 128] -> ~ Loop ~ -> Buffer - i8 [32, 128] // After each Loop iteration we should increment pointers of Buffers: accordingly on 4 byte and 1 byte for scalar case. - // It means that these Buffers cannot be inplace => Each Buffer should have the own register + // It means that these increments are not proportional => Each Buffer should have the own register // For that we can just check the following "branches": // - Between MatMul0 and MatMul1 - Softmax is sync point. The operations between MatMul0 -> Softmax and Softmax -> MatMul1 // will be fused into one loop after conversion to snippet dialect (Because it's just FQ, Eltwise nodes) - // - Between MatMul0 and Transpose1 - At the moment operations after Transpose1 cannot be fused in Transpose Loop (to avoid performance regressions). + // - Between MatMul0 and Transpose1 - At the moment operations after Transpose1 cannot be fused in inner Transpose Loop + // (to avoid performance regressions due to scalar calculations). 
// But operations after Transpose1 and before MatMul0 will be fused into one loop as well (look at first point) - // Note: If the pass is updated, need to check the new possible branches for potential non-inplace Buffers! - // Default value is 2 because - // - Firstly, Softmax always needs Buffers - // - Secondly, Softmax needs 2 Buffers but they can be inplace - One virtual port is enough for Softmax => buffer_count = 1 - // - Thirdly, MatMul requires unique Buffers on inputs and outputs because blocking implementation increments input/output pointers during computations - // However, all of the Buffers are usually reused by the next MatMul and Softmax. - // So on sufficiently large subgraphs we use only one additional unique buffer => buffer_count increments by 1 - size_t buffer_count = 2; + size_t uniqie_buffer_reg_group_count = 1; // After MatMul0 there is always one Buffer std::string fused_names; ov::NodeVector ordered_ops; @@ -260,24 +264,20 @@ ov::snippets::pass::TokenizeMHASnippets::TokenizeMHASnippets(const SnippetsToken if (!is_matmul0_supported(matmul0)) return false; - const auto matmul0_prc = op::Brgemm::get_output_type(matmul0->get_input_element_type(0), matmul0->get_input_element_type(1)); - // Between MatMul0 and Softmax will be the one Loop because of LoopFusing optimization. - // The Loop will have one Buffer with the same shape both on input and output. - // Need to check for precision to get if we need one more register for Buffer - if (matmul0_prc.size() != ov::element::f32.size()) { - if (buffer_count < 2) - buffer_count++; - } - ordered_ops.push_back(matmul0); const auto pattern_rank = matmul0->get_output_partial_shape(0).size(); + const auto ops_count_before_softmax = ordered_ops.size(); auto interm_op = matmul0->get_output_target_inputs(0).begin()->get_node()->shared_from_this(); // Add supported operations which are between MatMul0 and Softmax to ordered_ops if (!update_intermediate_supported_ops(interm_op, ordered_ops, hidden_virtual_ports_count, potential_body_params_count)) return false; + // If there are Eltwise ops before Softmax, there will be one more Buffer + if (ops_count_before_softmax != ordered_ops.size() && interm_op->get_output_partial_shape(0).rbegin()->is_dynamic()) + uniqie_buffer_reg_group_count++; + std::shared_ptr reshape0 = nullptr; if (!tokenize_reshape_around_softmax(interm_op, reshape0, ordered_ops)) return false; @@ -294,6 +294,11 @@ ov::snippets::pass::TokenizeMHASnippets::TokenizeMHASnippets(const SnippetsToken if (axis != rank.get_length() - 1 || interm_op->get_output_target_inputs(0).size() != 1) return false; + + // Softmax needs at least one buffer + if (interm_op->get_output_partial_shape(0).rbegin()->is_dynamic()) + uniqie_buffer_reg_group_count++; + ordered_ops.push_back(interm_op); interm_op = interm_op->get_output_target_inputs(0).begin()->get_node()->shared_from_this(); @@ -302,7 +307,7 @@ ov::snippets::pass::TokenizeMHASnippets::TokenizeMHASnippets(const SnippetsToken return false; if (((reshape0 == nullptr) != (reshape1 == nullptr)) || - (reshape0 && reshape1 && (reshape0->get_input_shape(0) != reshape1->get_output_shape(0)))) + (reshape0 && reshape1 && (reshape0->get_input_partial_shape(0) != reshape1->get_output_partial_shape(0)))) return false; // Add supported operations which are between Softmax and MatMul1 to ordered_ops @@ -310,8 +315,7 @@ ov::snippets::pass::TokenizeMHASnippets::TokenizeMHASnippets(const SnippetsToken return false; const auto matmul1 = ov::as_type_ptr(interm_op); - if (!matmul1 ||
matmul1->get_output_target_inputs(0).size() != 1 || - matmul1->get_transpose_a() || matmul1->get_transpose_b()) + if (!matmul1 || matmul1->get_transpose_a() || matmul1->get_transpose_b()) return false; const auto matmul1_out_type = op::Brgemm::get_output_type(matmul1->get_input_element_type(0), @@ -328,8 +332,9 @@ ov::snippets::pass::TokenizeMHASnippets::TokenizeMHASnippets(const SnippetsToken // Between Softmax and MatMul1 will be the one Loop because of LoopFusing optimization. // The Loop will have one Buffer with the same shape both on input and output. // Need to check for precision to get if we need one more register for Buffer - if (matmul1->get_input_element_type(0).size() != ov::element::f32.size()) { - buffer_count++; + const auto matmul0_prc = op::Brgemm::get_output_type(matmul0->get_input_element_type(0), matmul0->get_input_element_type(1)); + if (matmul1->get_input_element_type(0).size() != matmul0_prc.size() || matmul1->get_input_partial_shape(0).is_dynamic()) { + uniqie_buffer_reg_group_count++; } /***********************/ @@ -358,6 +363,7 @@ ov::snippets::pass::TokenizeMHASnippets::TokenizeMHASnippets(const SnippetsToken // There is transformation ExplicitTransposeMatMulInputs that set supported order and transposed_b(false). // We can allow to call this pass only if ops have scalar shapes to avoid shape mismatching const auto is_transposed_b_0 = matmul0->get_transpose_b(); + bool has_matmul0_has_ops_on_input = false; while (is_supported_intermediate_op(parent)) { // All supported ops have only one output port if (parent->get_output_target_inputs(0).size() != 1) @@ -379,6 +385,11 @@ ov::snippets::pass::TokenizeMHASnippets::TokenizeMHASnippets(const SnippetsToken ordered_ops.insert(ordered_ops.begin(), parent); // [107731] To go always through 0-th port - is it safe? 
parent = parent->get_input_node_shared_ptr(0); + has_matmul0_has_ops_on_input = true; + } + // If there are ops on the second input of MatMul0 -> there will always be a unique Buffer + if (has_matmul0_has_ops_on_input) { + uniqie_buffer_reg_group_count++; } auto tokenize_transpose = [&](const std::shared_ptr& transpose, @@ -412,7 +423,9 @@ ov::snippets::pass::TokenizeMHASnippets::TokenizeMHASnippets(const SnippetsToken bool are_ops_after_matmul1 = false; auto child = matmul1->get_output_target_inputs(0).begin()->get_node()->shared_from_this(); - while (is_supported_intermediate_op(child)) { + const auto can_be_ops_after_matmul1_tokenized = matmul1->get_output_target_inputs(0).size() == 1; + bool has_matmul1_has_ops_on_output = false; + while (can_be_ops_after_matmul1_tokenized && is_supported_intermediate_op(child)) { are_ops_after_matmul1 = true; // All supported ops have only one output port if (child->get_output_target_inputs(0).size() != 1) @@ -427,19 +440,23 @@ ov::snippets::pass::TokenizeMHASnippets::TokenizeMHASnippets(const SnippetsToken // TODO [75567]: move this plugin-specific constraint to the plugin callback // We cannot collapse op to Subgraph if count of potential Parameter and Result count is higher 12 - if (potential_body_params_count + child->get_output_target_inputs(0).size() + hidden_virtual_ports_count + buffer_count > 12) { + if (potential_body_params_count + child->get_output_target_inputs(0).size() + hidden_virtual_ports_count + uniqie_buffer_reg_group_count > 12) { break; } ordered_ops.push_back(child); child = child->get_output_target_inputs(0).begin()->get_node()->shared_from_this(); + has_matmul1_has_ops_on_output = true; + } + if (has_matmul1_has_ops_on_output) { + uniqie_buffer_reg_group_count++; } // At the moment Snippets don't support nodes between MatMul1 and Transpose3 due to Loop and strided calculations limitations // MatMul1 // // Transpose3 - if (!are_ops_after_matmul1) { + if (can_be_ops_after_matmul1_tokenized && !are_ops_after_matmul1) { auto transpose3 = config.get_mha_token_enable_transpose_on_output() ?
ov::as_type_ptr(child) : nullptr; if (is_valid_transpose(transpose3, config.get_mha_supported_transpose_ranks(), get_fusion_transpose_order(pattern_rank)) && transpose3->get_input_element_type(0) == matmul1_out_type) { // To avoid Convert between MatMul1 and Transpose3 @@ -455,7 +472,7 @@ ov::snippets::pass::TokenizeMHASnippets::TokenizeMHASnippets(const SnippetsToken // TODO [75567]: move this plugin-specific constraint to the plugin callback const auto last_node = ordered_ops.back(); - if (potential_body_params_count + last_node->get_output_size() + hidden_virtual_ports_count + buffer_count > 11) { + if (potential_body_params_count + last_node->get_output_size() + hidden_virtual_ports_count + uniqie_buffer_reg_group_count > 11) { return false; } diff --git a/src/common/snippets/tests/src/pass/mha_tokenization.cpp b/src/common/snippets/tests/src/pass/mha_tokenization.cpp index 6438dff516cded..b411aace066203 100644 --- a/src/common/snippets/tests/src/pass/mha_tokenization.cpp +++ b/src/common/snippets/tests/src/pass/mha_tokenization.cpp @@ -39,6 +39,30 @@ TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA_4D) { run(); } +TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA_4D_Dynamic) { + const auto &f = MHAFunction(std::vector{{-1, -1, -1, -1}, {-1, -1, -1, -1}, {-1, -1, -1, -1}, {-1, -1, -1, -1}}, + std::vector({ov::element::f32, ov::element::f32, ov::element::f32, ov::element::f32}), true, false); + model = f.getOriginal(); + model_ref = f.getReference(); + run(); +} + +TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA_4D_Dynamic_M) { + const auto &f = MHAFunction(std::vector{{1, -1, 12, 64}, {1, 128, 12, 64}, {1, 12, -1, 128}, {1, 128, 12, 64}}, + std::vector({ov::element::f32, ov::element::f32, ov::element::f32, ov::element::f32}), true, false); + model = f.getOriginal(); + model_ref = f.getReference(); + run(); +} + +TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA_4D_Dynamic_K) { + const auto &f = MHAFunction(std::vector{{1, 128, 12, -1}, {1, 128, 12, -1}, {1, 12, 128, 128}, {1, 128, 12, 64}}, + std::vector({ov::element::f32, ov::element::f32, ov::element::f32, ov::element::f32}), true, false); + model = f.getOriginal(); + model_ref = f.getReference(); + run(); +} + TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA_3D) { const auto &f = MHAFunction(std::vector{{128, 12, 64}, {128, 12, 64}, {12, 128, 128}, {128, 12, 64}}, std::vector({ov::element::f32, ov::element::f32, ov::element::f32, ov::element::f32})); @@ -47,8 +71,15 @@ TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA_3D) { run(); } -TEST_F(SKIP_TokenizeMHASnippetsTests /* CVS-114607 */, smoke_Snippets_MHA_with_MatMul0_Transpose) { - GTEST_SKIP(); +TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA_3D_Dynamic) { + const auto &f = MHAFunction(std::vector{{-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}}, + std::vector({ov::element::f32, ov::element::f32, ov::element::f32, ov::element::f32}), true, false); + model = f.getOriginal(); + model_ref = f.getReference(); + run(); +} + +TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA_with_MatMul0_Transpose) { const auto &f = MHAMatMul0TransposeFunction(std::vector{{1, 128, 12, 64}, {1, 128, 12, 64}, {1, 12, 128, 128}, {1, 128, 12, 64}}, std::vector({ov::element::f32, ov::element::f32, ov::element::f32, ov::element::f32})); model = f.getOriginal(); @@ -56,6 +87,16 @@ TEST_F(SKIP_TokenizeMHASnippetsTests /* CVS-114607 */, smoke_Snippets_MHA_with_M run(); } +TEST_F(SKIP_TokenizeMHASnippetsTests /* CVS-142098 */, smoke_Snippets_MHA_with_MatMul0_Transpose_Dynamic) { + 
GTEST_SKIP(); + const auto &f = MHAMatMul0TransposeFunction(std::vector{{-1, -1, -1, -1}, {-1, -1, -1, -1}, {-1, -1, -1, -1}, {-1, -1, -1, -1}}, + std::vector({ov::element::f32, ov::element::f32, ov::element::f32, ov::element::f32}), + false); + model = f.getOriginal(); + model_ref = f.getReference(); + run(); +} + TEST_F(SKIP_TokenizeMHASnippetsTests /* CVS-114607 */, smoke_Snippets_MHA_with_int_Matmuls) { GTEST_SKIP(); const auto &f = MHAINT8MatMulTypeRelaxedFunction(std::vector{{1, 128, 12, 64}, {1, 128, 12, 64}, {1, 12, 128, 128}, {1, 128, 12, 64}}); @@ -71,6 +112,14 @@ TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA_Transpose_extraction) { run(); } +TEST_F(SKIP_TokenizeMHASnippetsTests /* CVS-142098 */, smoke_Snippets_MHA_Dynamic_Transpose_extraction) { + GTEST_SKIP(); + const auto& f = MHATransposedInputFunction(std::vector{{-1, -1, -1, -1}, {-1, -1, -1, -1}, {-1, -1, -1, -1}}, true); + model = f.getOriginal(); + model_ref = f.getReference(); + run(); +} + TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA_Transpose_extraction_and_unsupported_existing_transpose) { const auto& f = MHATransposedInputFunction(std::vector{{1, 128, 12, 64}, {1, 12, 64, 128}, {1, 128, 12, 64}}, true, std::vector{0, 3, 1, 2}); @@ -79,6 +128,15 @@ TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA_Transpose_extraction_and_uns run(); } +TEST_F(SKIP_TokenizeMHASnippetsTests /* CVS-142098 */, smoke_Snippets_MHA_Dynamic_Transpose_extraction_and_unsupported_existing_transpose) { + GTEST_SKIP(); + const auto& f = MHATransposedInputFunction(std::vector{{-1, -1, -1, -1}, {-1, -1, -1, -1}, {-1, -1, -1, -1}}, true, + std::vector{0, 3, 1, 2}); + model = f.getOriginal(); + model_ref = f.getReference(); + run(); +} + TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA_Transpose_fusion) { const auto& f = MHATransposedInputFunction(std::vector{{1, 128, 12, 64}, {1, 64, 128, 12}, {1, 128, 12, 64}}, false, std::vector{0, 2, 1, 3}); @@ -87,6 +145,14 @@ TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA_Transpose_fusion) { run(); } +TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA_Dyanmic_Transpose_fusion) { + const auto& f = MHATransposedInputFunction(std::vector{{-1, -1, -1, -1}, {-1, -1, -1, -1}, {-1, -1, -1, -1}}, false, + std::vector{0, 2, 1, 3}); + model = f.getOriginal(); + model_ref = f.getReference(); + run(); +} + TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA3D_SplitM) { const auto& f = MHASplitMFunction(std::vector{{128, 12, 64}, {128, 12, 64}, {12, 128, 128}, {128, 12, 64}}, std::vector({ov::element::f32, ov::element::f32, ov::element::f32, ov::element::f32}), diff --git a/src/common/transformations/src/transformations/common_optimizations/fuse_rotary_positional_embeddings.cpp b/src/common/transformations/src/transformations/common_optimizations/fuse_rotary_positional_embeddings.cpp index 2f3ddd5d843ae3..86507326c25a44 100644 --- a/src/common/transformations/src/transformations/common_optimizations/fuse_rotary_positional_embeddings.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/fuse_rotary_positional_embeddings.cpp @@ -555,7 +555,7 @@ ov::pass::RoPEFusionQwen::RoPEFusionQwen(int split_output_id) { {{"special_zero", true}}); auto slice_Slice_543 = GenSlice(view_Reshape_424, 0, head_size, 1, 3); // tensor_array - auto hidden_states = makePattern("f32[?,?,?]"); // + auto hidden_states = makePattern(); // auto ShapeOf_485735 = makePattern({hidden_states}, {}); auto Multiply_567524 = makePattern({ShapeOf_485735, {-1}}, {{"auto_broadcast", "numpy"}}); auto Gather_377635 = 
makePattern({Multiply_567524, {1}, 0}, {{"batch_dims", 0}}); diff --git a/src/common/transformations/src/transformations/symbolic_transformations/symbol_optimization.cpp b/src/common/transformations/src/transformations/symbolic_transformations/symbol_optimization.cpp index 1a4507c08dc9f0..3bf315bebf4467 100644 --- a/src/common/transformations/src/transformations/symbolic_transformations/symbol_optimization.cpp +++ b/src/common/transformations/src/transformations/symbolic_transformations/symbol_optimization.cpp @@ -16,6 +16,7 @@ #include "openvino/op/shape_of.hpp" #include "openvino/op/squeeze.hpp" #include "openvino/op/util/multi_subgraph_base.hpp" +#include "openvino/op/util/op_types.hpp" #include "transformations/utils/utils.hpp" namespace { @@ -222,7 +223,101 @@ void optimize_value_usage(ov::Output& output, STS_map& symbol_shape_so } } -void save_shape_sources(const ov::Output& output, STS_map& symbol_shape_source) { +std::vector> topological_order(const std::shared_ptr& m) { + auto order = m->get_ordered_ops(); + + // step 1: split model into parameter-related and parameter-unrelated ops + const std::string op_depends_on_parameter = "topological_sort_op_depends_on"; + // values: true - parameter dependent; false otherwise + for (const auto& op : order) { + if (ov::as_type_ptr(op)) { + op->get_rt_info()[op_depends_on_parameter] = true; + } else if (ov::as_type_ptr(op) || ov::as_type_ptr(op) || + ov::as_type_ptr(op) || + std::dynamic_pointer_cast(op)) { + op->get_rt_info()[op_depends_on_parameter] = false; + } else { // deduce op type from inputs + const auto& inputs = op->input_values(); + op->get_rt_info()[op_depends_on_parameter] = + std::any_of(inputs.begin(), + inputs.end(), + [&op_depends_on_parameter](const ov::Output& input) { + return input.get_node_shared_ptr()->get_rt_info()[op_depends_on_parameter].as(); + }); + } + } + // step 2: starting from Result -- assign weight to ops: + // if parameter dependent, the weight is the maximum of the output ops' weights plus one + // else the weight is the maximum of the output ops' weights + // this step doesn't assign weights to all the ops; this is intentional and will be used in the following step + const std::string weight_rt_info_name = "topological_sort_weight"; + for (auto it = order.rbegin(); it != order.rend(); ++it) { + const auto& op = *it; + int64_t weight = 0; + if (ov::as_type_ptr(op)) { + op->get_rt_info()[weight_rt_info_name] = weight; + } else { + bool output_has_weight = false; + for (const auto& output : op->outputs()) { + for (const auto& input : output.get_target_inputs()) { + const auto& output_op = input.get_node(); + const auto& rt_info = output_op->get_rt_info(); + if (!rt_info.count(weight_rt_info_name)) + continue; + output_has_weight = true; + auto output_weight = rt_info.at(weight_rt_info_name).as(); + weight = output_weight > weight ?
output_weight : weight; + } + } + if (output_has_weight) { + if (op->get_rt_info()[op_depends_on_parameter].as()) { + weight += 1; + } + op->get_rt_info()[weight_rt_info_name] = weight; + } + } + // step 3: propagate weights to all the remaining nodes: + // if weight is already assigned -- skip operation + // else the operation weight is the minimum of the input ops' weights + // if none of the operation inputs has a weight -- this op is isolated and this algorithm doesn't make sense; + // such cases are extremely rare and rather theoretical, so to handle them we return the original ov::Model op order + std::map>> level_to_vector; + for (const auto& op : order) { + if (!op->get_rt_info().count(weight_rt_info_name)) { + int64_t weight = std::numeric_limits::max(); + for (const auto& input : op->input_values()) { + const auto& rt_info = input.get_node_shared_ptr()->get_rt_info(); + if (!rt_info.count(weight_rt_info_name)) + continue; + auto input_weight = rt_info.at(weight_rt_info_name).as(); + weight = input_weight < weight ? input_weight : weight; + } + if (weight != std::numeric_limits::max()) + op->get_rt_info()[weight_rt_info_name] = weight; + else + return m->get_ordered_ops(); + } + level_to_vector[op->get_rt_info().at(weight_rt_info_name).as()].push_back(op); + } + // finalization: levels are traversed in descending order; ops within a level keep the get_ordered_ops order + std::vector> result; + result.reserve(order.size()); + for (auto it = level_to_vector.rbegin(); it != level_to_vector.rend(); ++it) { + const auto& item = *it; + result.insert(result.end(), item.second.begin(), item.second.end()); + for (const auto& op : item.second) { + op->get_rt_info().erase(weight_rt_info_name); + op->get_rt_info().erase(op_depends_on_parameter); + } + } + return result; +} + +void save_shape_sources(const std::shared_ptr& op, STS_map& symbol_shape_source) { + if (!ov::is_type(op) && !ov::is_type(op)) + return; + const auto& output = op->input_value(0); for (const auto& d : output.get_partial_shape()) { if (d.is_static()) continue; @@ -240,7 +335,7 @@ bool ov::pass::OptimizeSymbolsUsedAsValues::run_on_model(const std::shared_ptrget_ordered_ops()) { + for (const auto& op : topological_order(m)) { // Result has output port which has shared (during validate_and_infer_type) tensor with input port. // Transformations may replace input of Result.
After replacement and before Result::validate_and_infer_type -- // output tensor of Result may contain inaccurate shape / symbols due to the sharing with tensor which may be @@ -252,10 +347,9 @@ bool ov::pass::OptimizeSymbolsUsedAsValues::run_on_model(const std::shared_ptroutputs()) { + for (auto& output : op->outputs()) optimize_value_usage(output, symbol_shape_source, symbol_value_source); - save_shape_sources(output, symbol_shape_source); - } + save_shape_sources(op, symbol_shape_source); } return true; } diff --git a/src/common/transformations/tests/sdpa_to_paged_attention_test.cpp b/src/common/transformations/tests/sdpa_to_paged_attention_test.cpp new file mode 100644 index 00000000000000..0443e7b82de5cc --- /dev/null +++ b/src/common/transformations/tests/sdpa_to_paged_attention_test.cpp @@ -0,0 +1,27 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "openvino/pass/sdpa_to_paged_attention.hpp" + +#include + +#include "common_test_utils/test_common.hpp" +#include "openvino/op/add.hpp" +#include "openvino/op/scaled_dot_product_attention.hpp" +#include "openvino/pass/manager.hpp" + +using namespace ov; + +TEST(SDPATOPATest, SDPANotPresent) { + const auto p0 = std::make_shared(element::f32, Shape{1, 32, 32}); + const auto p1 = std::make_shared(element::f32, Shape{1, 32, 32}); + const auto add = std::make_shared(p0, p1); + const auto result = std::make_shared(add); + + auto model = std::make_shared(ResultVector{result}, ParameterVector{p0, p1}); + + ov::pass::Manager manager; + manager.register_pass(); + EXPECT_THROW(manager.run_passes(model), ov::Exception); +} \ No newline at end of file diff --git a/src/common/transformations/tests/symbolic_transformations/label_optimization.cpp b/src/common/transformations/tests/symbolic_transformations/label_optimization.cpp index 881d02b20d295a..eb108e4c6591ba 100644 --- a/src/common/transformations/tests/symbolic_transformations/label_optimization.cpp +++ b/src/common/transformations/tests/symbolic_transformations/label_optimization.cpp @@ -75,22 +75,16 @@ TEST_F(TransformationTestsF, ApplySymbolEquivalence_Concat_Values) { auto input_2 = make_shared(element::f32, PartialShape::dynamic(4)); auto concat = make_shared(OutputVector{input_1, input_2}, -1); - auto shape_1 = make_shared(input_1); - auto gather_1 = make_shared(shape_1, - v0::Constant::create(element::i64, {1}, {3}), - v0::Constant::create(element::i64, {}, {0})); - - auto shape_2 = make_shared(input_2); - auto gather_2 = make_shared(shape_2, - v0::Constant::create(element::i64, {1}, {3}), - v0::Constant::create(element::i64, {}, {0})); - - auto sum = make_shared(gather_1, gather_2); + auto shape = make_shared(concat); + auto gather = make_shared(shape, + v0::Constant::create(element::i64, {1}, {-1}), + v0::Constant::create(element::i64, {}, {0})); auto reshape = make_shared( concat, - make_shared(OutputVector{sum, v0::Constant::create(element::i64, {1}, {-1})}, 0), + make_shared(OutputVector{gather, v0::Constant::create(element::i64, {1}, {-1})}, 0), false); + model_ref = make_shared(NodeVector{reshape}, ParameterVector{input_2, input_1}); } } diff --git a/src/core/reference/include/openvino/reference/atanh.hpp b/src/core/reference/include/openvino/reference/atanh.hpp index 5ba554d55179e3..56be82694d55e4 100644 --- a/src/core/reference/include/openvino/reference/atanh.hpp +++ b/src/core/reference/include/openvino/reference/atanh.hpp @@ -18,9 +18,9 @@ T atanh(const T in) { return std::atanh(in); } -template ::value>::type* = nullptr> 
+// Integral types don't support NAN and INFINITY, use integral limits instead for special values. +template ::value && std::is_signed::value>::type* = nullptr> T atanh(const T in) { - // Integral type not support NAN and INFINITY, use integral limits instead for special values. if (in > 0) { return std::numeric_limits::max(); } else if (in < 0) { @@ -29,6 +29,11 @@ T atanh(const T in) { return 0; } } + +template ::value>::type* = nullptr> +T atanh(const T in) { + return in > 0 ? std::numeric_limits::max() : 0; +} } // namespace func /** diff --git a/src/core/reference/include/openvino/reference/matmul.hpp b/src/core/reference/include/openvino/reference/matmul.hpp index 964bbc5c4a264d..92d6fa3cefb6b6 100644 --- a/src/core/reference/include/openvino/reference/matmul.hpp +++ b/src/core/reference/include/openvino/reference/matmul.hpp @@ -161,7 +161,7 @@ void matmul(const T* arg0, broadcast_axes, sizeof(T)); - arg0_shape_tmp = arg0_br_target_shape; + arg0_shape_tmp = std::move(arg0_br_target_shape); arg0_rank = arg0_shape_tmp.size(); arg0_new_data.swap(tmp); arg0_data = arg0_new_data.data(); @@ -175,7 +175,7 @@ void matmul(const T* arg0, arg1_br_target_shape, broadcast_axes, sizeof(T)); - arg1_shape_tmp = arg1_br_target_shape; + arg1_shape_tmp = std::move(arg1_br_target_shape); arg1_rank = arg1_shape_tmp.size(); arg1_new_data.swap(tmp); arg1_data = arg1_new_data.data(); diff --git a/src/core/reference/src/op/einsum.cpp b/src/core/reference/src/op/einsum.cpp index 74027f424ecb7a..b8b23964346225 100644 --- a/src/core/reference/src/op/einsum.cpp +++ b/src/core/reference/src/op/einsum.cpp @@ -124,7 +124,7 @@ std::unordered_map> compute_label_dim_map(const for (size_t ind = 0; ind < num_broadcasted_dims; ++ind) { label_dims.push_back(static_cast(current_dim + ind)); } - resulted_map[label] = label_dims; + resulted_map[label] = std::move(label_dims); current_dim += num_broadcasted_dims; } else if (resulted_map.find(label) != resulted_map.end()) { resulted_map[label].push_back(static_cast(current_dim)); @@ -132,7 +132,7 @@ std::unordered_map> compute_label_dim_map(const } else { std::vector label_dims; label_dims.push_back(static_cast(current_dim)); - resulted_map[label] = label_dims; + resulted_map[label] = std::move(label_dims); ++current_dim; } } @@ -350,8 +350,8 @@ void reduce_input(ov::TensorVector& inputs, reference::reduce_sum(input_ptr.data(), output_ptr.data(), input_shape, reduced_axes); // update a vector of inputs and input subscripts - inputs[input_ind] = output_ptr; - input_subscripts[input_ind] = new_input_subscript; + inputs[input_ind] = std::move(output_ptr); + input_subscripts[input_ind] = std::move(new_input_subscript); } /// \brief Transpose input to layout specified through the required subscript @@ -408,7 +408,7 @@ void transpose_input(ov::TensorVector& inputs, output_shape); // update a vector of inputs and input subscripts - inputs[input_ind] = output_ptr; + inputs[input_ind] = std::move(output_ptr); input_subscripts[input_ind] = required_subscript; } @@ -452,7 +452,7 @@ void broadcast_input(ov::TensorVector& inputs, broadcast_axes, input.get_element_type().size()); - input = output; + input = std::move(output); } /// \brief Build identity tensor that will be used to zero non-diagonal tensor @@ -528,7 +528,7 @@ ov::Tensor build_multi_identity(const ov::Tensor& input, multi_identity.get_shape(), identity.get_shape(), ov::op::AutoBroadcastType::NUMPY); - multi_identity = mul_output; + multi_identity = std::move(mul_output); } return multi_identity; } @@ -545,7 +545,7 @@ void 
extract_diagonal(ov::TensorVector& inputs, std::vector& input_ const auto& input_ptr = inputs[input_ind]; const auto& input_subscript = input_subscripts[input_ind]; - const auto input_shape = input_ptr.get_shape(); + const auto& input_shape = input_ptr.get_shape(); std::string resultant_subscript = ""; constexpr char ellipsis[] = "..."; @@ -591,8 +591,8 @@ void extract_diagonal(ov::TensorVector& inputs, std::vector& input_ auto result = ov::Tensor(input_ptr.get_element_type(), result_shape); reference::reduce_sum(mul_output.data(), result.data(), mul_output.get_shape(), reduced_axes); - inputs[input_ind] = result; - input_subscripts[input_ind] = resultant_subscript; + inputs[input_ind] = std::move(result); + input_subscripts[input_ind] = std::move(resultant_subscript); } /// \brief Reshape input to the new shape specified by sub-shapes of the diff --git a/src/core/reference/src/op/fft.cpp b/src/core/reference/src/op/fft.cpp index 9c88b21fd8d1b8..1e0c04eb4c4e35 100644 --- a/src/core/reference/src/op/fft.cpp +++ b/src/core/reference/src/op/fft.cpp @@ -306,7 +306,8 @@ InfoForFFTCalculation get_info_for_calculation(const Shape& input_data_shape, const int64_t complex_data_rank = static_cast(input_data_shape.size() - 1); const auto reversed_output_shape = fft_common::reverse_shape_of_emulated_complex_tensor(output_shape); - auto fft_axes = get_axes(axes_data, axes_data_shape, complex_data_rank); + auto& fft_axes = result.fft_axes; + fft_axes = get_axes(axes_data, axes_data_shape, complex_data_rank); fft_axes = fft_common::reverse_fft_axes(fft_axes, complex_data_rank); const int64_t fft_rank = fft_axes.size(); @@ -320,30 +321,22 @@ InfoForFFTCalculation get_info_for_calculation(const Shape& input_data_shape, const auto outer_strides = fft_common::compute_strides(outer_lengths); const int64_t outer_size = outer_strides[outer_rank]; - const int64_t buffer_size = compute_buffer_size(fft_lengths); - const auto output_strides = fft_common::compute_strides(reversed_output_shape); - const auto output_fft_strides = get_lengths(output_strides, fft_axes); - const auto output_outer_strides = get_lengths(output_strides, outer_axes); const auto reversed_input_shape = fft_common::reverse_shape_of_emulated_complex_tensor(input_data_shape); - const auto input_fft_lengths = get_lengths(reversed_input_shape, fft_axes); const auto input_strides = fft_common::compute_strides(reversed_input_shape); - const auto input_fft_strides = get_lengths(input_strides, fft_axes); - const auto input_outer_strides = get_lengths(input_strides, outer_axes); - result.fft_axes = fft_axes; result.fft_lengths = fft_lengths; result.fft_strides = fft_strides; result.outer_strides = outer_strides; - result.output_fft_strides = output_fft_strides; - result.output_outer_strides = output_outer_strides; - result.input_fft_lengths = input_fft_lengths; - result.input_fft_strides = input_fft_strides; - result.input_outer_strides = input_outer_strides; + result.output_fft_strides = get_lengths(output_strides, fft_axes); + result.output_outer_strides = get_lengths(output_strides, outer_axes); + result.input_fft_lengths = get_lengths(reversed_input_shape, fft_axes); + result.input_fft_strides = get_lengths(input_strides, fft_axes); + result.input_outer_strides = get_lengths(input_strides, outer_axes); result.fft_rank = fft_rank; result.fft_size = fft_size; result.outer_size = outer_size; - result.buffer_size = buffer_size; + result.buffer_size = compute_buffer_size(fft_lengths); return result; } diff --git 
a/src/core/reference/src/op/interpolate.cpp b/src/core/reference/src/op/interpolate.cpp index 3b4adc340507cf..ff9bf20eb1a293 100644 --- a/src/core/reference/src/op/interpolate.cpp +++ b/src/core/reference/src/op/interpolate.cpp @@ -93,10 +93,10 @@ InterpolateEvalHelper::InfoForGenericLinearONNXMode InterpolateEvalHelper::get_i result.batch_size = batch_size; result.num_channels = num_channels; result.spatial_rank = static_cast(spatial_rank); - result.input_index_multipliers = input_index_multipliers; - result.output_index_multipliers = output_index_multipliers; - result.input_spatial_shape = input_spatial_shape; - result.output_spatial_shape = output_spatial_shape; + result.input_index_multipliers = std::move(input_index_multipliers); + result.output_index_multipliers = std::move(output_index_multipliers); + result.input_spatial_shape = std::move(input_spatial_shape); + result.output_spatial_shape = std::move(output_spatial_shape); return result; } @@ -134,10 +134,10 @@ InterpolateEvalHelper::InfoForLinearMode InterpolateEvalHelper::get_info_for_lin InfoForLinearMode result; result.antialias = antialias; - result.a = a; - result.r = r; + result.a = std::move(a); + result.r = std::move(r); result.prod_a = prod_a; - result.shape_for_indices = shape_for_indices; + result.shape_for_indices = std::move(shape_for_indices); return result; } @@ -163,8 +163,8 @@ InterpolateEvalHelper::ICoords InterpolateEvalHelper::get_icoords(const Coordina icoords_r[axis] = static_cast(std::round(in_coord)); } - result.icoords = icoords; - result.icoords_r = icoords_r; + result.icoords = std::move(icoords); + result.icoords_r = std::move(icoords_r); return result; } @@ -218,7 +218,7 @@ InterpolateEvalHelper::LinearModeInnerIterationResult InterpolateEvalHelper::inn Coordinate inner_coord{unsigned_inner_coords_vector}; result.w = w; - result.inner_coord = inner_coord; + result.inner_coord = std::move(inner_coord); return result; } diff --git a/src/core/reference/src/op/loop.cpp b/src/core/reference/src/op/loop.cpp index f6cbae6ffaec46..17d9a57e538b93 100644 --- a/src/core/reference/src/op/loop.cpp +++ b/src/core/reference/src/op/loop.cpp @@ -51,7 +51,7 @@ void loop(const std::shared_ptr& func, ov::Tensor in_tensor(func->get_parameters().at(cur_iter_idx)->get_element_type(), func->get_parameters().at(cur_iter_idx)->get_shape()); std::memset(in_tensor.data(), 0, in_tensor.get_byte_size()); - inputs_to_body.at(cur_iter_idx) = in_tensor; + inputs_to_body.at(cur_iter_idx) = std::move(in_tensor); } // Port map processing: inputs and back edges diff --git a/src/core/src/pass/sdpa_to_paged_attention.cpp b/src/core/src/pass/sdpa_to_paged_attention.cpp index 1eaf15c928db01..0d71c6a4b0d8dc 100644 --- a/src/core/src/pass/sdpa_to_paged_attention.cpp +++ b/src/core/src/pass/sdpa_to_paged_attention.cpp @@ -7,6 +7,7 @@ #include "openvino/cc/pass/itt.hpp" #include "openvino/op/constant.hpp" #include "openvino/op/gather.hpp" +#include "openvino/op/scaled_dot_product_attention.hpp" #include "openvino/op/shape_of.hpp" #include "openvino/op/unsqueeze.hpp" #include "openvino/pass/manager.hpp" @@ -29,6 +30,11 @@ static std::shared_ptr setName(std::shared_ptr nod bool ov::pass::SDPAToPagedAttention::run_on_model(const std::shared_ptr& model) { RUN_ON_MODEL_SCOPE(SDPAToPagedAttention); + + OPENVINO_ASSERT(ov::op::util::has_op_with_type(model), + "No ScaledDotProductAttention operation observed in the graph, cannot perform " + "the SDPAToPagedAttention transformation."); + auto max_context_len = setName(std::make_shared(element::i32,
PartialShape{}), "max_context_len"); ParameterVector model_remaining_params = { setName(std::make_shared(element::i32, PartialShape{-1}), "past_lens"), diff --git a/src/plugins/intel_cpu/src/nodes/deconv.cpp b/src/plugins/intel_cpu/src/nodes/deconv.cpp index 853dfc20d11299..d3f1ae0ba691a5 100644 --- a/src/plugins/intel_cpu/src/nodes/deconv.cpp +++ b/src/plugins/intel_cpu/src/nodes/deconv.cpp @@ -495,7 +495,7 @@ void Deconvolution::getSupportedDescriptors() { creatorsMap.at(format)->createSharedDesc(getOriginalInputPrecisionAtPort(0), getInputShapeAtPort(i))); } - for (size_t i = 0; i < getChildEdges().size(); ++i) { + for (size_t i = 0; i < config.outConfs.size(); ++i) { config.outConfs[i].setMemDesc( creatorsMap.at(format)->createSharedDesc(getOriginalOutputPrecisionAtPort(0), getOutputShapeAtPort(i))); } @@ -1145,7 +1145,7 @@ void Deconvolution::initSupportedPrimitiveDescriptors() { creatorsMap.at(format)->createSharedDesc(getOriginalInputPrecisionAtPort(0), getInputShapeAtPort(i))); } - for (size_t i = 0; i < getChildEdges().size(); ++i) { + for (size_t i = 0; i < config.outConfs.size(); ++i) { config.outConfs[i].setMemDesc( creatorsMap.at(format)->createSharedDesc(getOriginalOutputPrecisionAtPort(0), getOutputShapeAtPort(i))); } diff --git a/src/plugins/intel_cpu/src/nodes/unique.cpp b/src/plugins/intel_cpu/src/nodes/unique.cpp index ad322756ab28e3..130213dfcb8703 100644 --- a/src/plugins/intel_cpu/src/nodes/unique.cpp +++ b/src/plugins/intel_cpu/src/nodes/unique.cpp @@ -225,41 +225,31 @@ void Unique::flattenTensorExec() { } } } else { - uniDataTmpPtr[0] = srcDataPtr[0]; - if (definedOutputs[FIRST_UNIQUE_IDX]) { - firstTmpPtr[0] = 0; - } - if (definedOutputs[INPUT_TO_UNIQ_IDX]) { - inToOutTmpPtr[0] = 0; - } + std::unordered_map uniq; + uniq.reserve(inputLen); + if (definedOutputs[OCCURRENCES_NUM]) { std::fill(occurTmpPtr, occurTmpPtr + inputLen, 1); } - uniqueLen = 1; - - for (size_t i = 1; i < inputLen; i++) { - bool found = false; - size_t j = 0; - for (; j < uniqueLen; j++) { - if (uniDataTmpPtr[j] == srcDataPtr[i]) { - found = true; - break; - } - } - if (!found) { - uniDataTmpPtr[uniqueLen] = srcDataPtr[i]; + + for (size_t i = 0, j = 0; i < inputLen; ++i) { + auto it = uniq.emplace(srcDataPtr[i], j); + inToOutTmpPtr[i] = it.first->second; + if (it.second) { if (definedOutputs[FIRST_UNIQUE_IDX]) { - firstTmpPtr[uniqueLen] = i; + firstTmpPtr[j] = i; } - uniqueLen++; + ++j; } else { if (definedOutputs[OCCURRENCES_NUM]) { - occurTmpPtr[j]++; + occurTmpPtr[inToOutTmpPtr[i]]++; } } - if (definedOutputs[INPUT_TO_UNIQ_IDX]) { - inToOutTmpPtr[i] = j; - } + } + + uniqueLen = static_cast(uniq.size()); + for (const auto& it : uniq) { + uniDataTmpPtr[it.second] = it.first; } } diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp index 3d13cab76dbb23..006935a85e85de 100644 --- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp +++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp @@ -867,7 +867,7 @@ void Transformations::MainSnippets(void) { #if defined(OPENVINO_ARCH_X86_64) auto is_supported_matmul = [this](const std::shared_ptr& n) { const auto matmul = ov::as_type_ptr(n); - if (!matmul) + if (!matmul || matmul->is_dynamic()) return false; const auto in_type0 = matmul->get_input_element_type(0); const auto in_type1 = matmul->get_input_element_type(1); diff --git 
a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/arm/deconv_multiple_output_edges.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/arm/deconv_multiple_output_edges.cpp new file mode 100644 index 00000000000000..b2cb4785fb5720 --- /dev/null +++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/arm/deconv_multiple_output_edges.cpp @@ -0,0 +1,70 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "common_test_utils/node_builders/constant.hpp" +#include "common_test_utils/node_builders/eltwise.hpp" +#include "shared_test_classes/base/ov_subgraph.hpp" +#include "utils/cpu_test_utils.hpp" +#include "common_test_utils/node_builders/convolution_backprop_data.hpp" + +using namespace CPUTestUtils; + +namespace ov { +namespace test { + +// Subgraph: +/* +┌──────────────────┐ ┌──────────────────┐ +│ INPUT │ │ WEIGHTS │ +└─────────┬────────┘ └─────────┬────────┘ + │ ┌──────────────────┐ │ + └──────┤ DECONVOLUTION ├────┘ + └──┬───────────┬───┘ + │ │ + ┌───────────────┴──┐ ┌──┴───────────────┐ + │ MULTIPLY │ │ MULTIPLY │ + └──────────────────┘ └──────────────────┘ + +Verify deconvolution node correctly handles + multiple output edges on a single output port + */ + +class DeconvMultipleOutputEdges : virtual public SubgraphBaseStaticTest { +public: + void SetUp() override { + auto ngPrc = ov::element::f32; + const ov::Shape inShape = {2, 12, 7, 7}; + const ov::Shape weiShape = {12, 6, 3, 3}; + ov::ParameterVector inputParams{std::make_shared(ngPrc, inShape), + std::make_shared(ngPrc, weiShape)}; + + auto deconv = utils::make_convolution_backprop_data(inputParams[0], + inputParams[1], + ov::element::f32, + ov::Strides({1, 1}), + ov::CoordinateDiff({0, 0}), + ov::CoordinateDiff({0, 0}), + ov::Strides({1, 1}), + ov::op::PadType::NOTSET, + false); + deconv->get_rt_info() = CPUTestsBase::makeCPUInfo({nchw}, {nchw}, {}); + + const auto const1 = ov::test::utils::make_constant(ngPrc, std::vector{2, 6, 9, 9}); + const auto const2 = ov::test::utils::make_constant(ngPrc, std::vector{2, 6, 9, 9}); + + const auto mul1 = utils::make_eltwise(deconv->output(0), const1, utils::EltwiseTypes::MULTIPLY); + const auto mul2 = utils::make_eltwise(deconv->output(0), const2, utils::EltwiseTypes::MULTIPLY); + + NodeVector results{mul1, mul2}; + function = std::make_shared(results, inputParams, "DeconvMultipleOutputEdges"); + targetDevice = ov::test::utils::DEVICE_CPU; + } +}; + +TEST_F(DeconvMultipleOutputEdges, smoke_DeconvMultipleOutputEdges_CPU) { + run(); +} + +} // namespace test +} // namespace ov diff --git a/src/plugins/intel_cpu/thirdparty/onednn b/src/plugins/intel_cpu/thirdparty/onednn index b0cd612cd3a378..a320d02d6e733c 160000 --- a/src/plugins/intel_cpu/thirdparty/onednn +++ b/src/plugins/intel_cpu/thirdparty/onednn @@ -1 +1 @@ -Subproject commit b0cd612cd3a378fb2dd73a84efddfca1df2a22db +Subproject commit a320d02d6e733c775724901675cbc8944391459d diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl index 635aa4d796d3db..b1aaded5ad7780 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl @@ -202,8 +202,7 @@ KERNEL(sdpa_opt)( #define QUERY_BLOCK_SIZE 1 INPUT0_TYPE val = BLOCK_READN(INPUT0_TYPE, QUERY_BLOCK_SIZE, query_input, query_offset); - - query_local[query_local_offset] = val; + query_local[query_local_offset] = val * scale_val; 
query_local_offset += QUERY_STEP_LOCAL; query_offset += query_pitch; } @@ -338,7 +337,6 @@ KERNEL(sdpa_opt)( for (uint seq_len = sgid * SUBGROUP_SIZE + sglid; seq_len < partition_seq_len; seq_len += (HEAD_SIZE)) { // Read value from SLM and apply scale qk_val[seq_idx] = qk_local[seq_idx * SEQ_LEN_PARTITION_SIZE + seq_len]; - qk_val[seq_idx] *= scale_val; // Apply attention mask #if IS_CAUSAL diff --git a/src/tests/ov_helpers/ov_snippets_models/include/subgraph_mha.hpp b/src/tests/ov_helpers/ov_snippets_models/include/subgraph_mha.hpp index 3e3cb995b70555..9ec265c9322f5c 100644 --- a/src/tests/ov_helpers/ov_snippets_models/include/subgraph_mha.hpp +++ b/src/tests/ov_helpers/ov_snippets_models/include/subgraph_mha.hpp @@ -42,8 +42,9 @@ namespace snippets { */ class MHAFunction : public SnippetsFunctionBase { public: - explicit MHAFunction(const std::vector& inputShapes, const std::vector& precisions, bool with_mul = true) - : SnippetsFunctionBase(inputShapes), with_mul(with_mul), precisions(precisions) { + explicit MHAFunction(const std::vector& inputShapes, const std::vector& precisions, + bool with_mul = true, bool with_reshape = true) + : SnippetsFunctionBase(inputShapes), with_mul(with_mul), with_reshape(with_reshape), precisions(precisions) { OPENVINO_ASSERT(input_shapes.size() == 4, "Got invalid number of input shapes"); OPENVINO_ASSERT(precisions.size() == 4, "Got invalid number of input precisions"); } @@ -51,8 +52,9 @@ class MHAFunction : public SnippetsFunctionBase { std::shared_ptr initOriginal() const override; std::shared_ptr initReference() const override; - bool with_mul = true; - std::vector precisions; + const bool with_mul = true; + const bool with_reshape = true; + const std::vector precisions; }; class MHASplitMFunction : public MHAFunction { @@ -85,8 +87,9 @@ class MHASplitMFunction : public MHAFunction { */ class MHAMatMul0TransposeFunction : public SnippetsFunctionBase { public: - explicit MHAMatMul0TransposeFunction(const std::vector& inputShapes, const std::vector& precisions) - : SnippetsFunctionBase(inputShapes), precisions(precisions) { + explicit MHAMatMul0TransposeFunction(const std::vector& inputShapes, const std::vector& precisions, + bool with_reshape = true) + : SnippetsFunctionBase(inputShapes), with_reshape(with_reshape), precisions(precisions) { OPENVINO_ASSERT(input_shapes.size() == 4, "Got invalid number of input shapes"); OPENVINO_ASSERT(precisions.size() == 4, "Got invalid number of input precisions"); } @@ -94,7 +97,8 @@ class MHAMatMul0TransposeFunction : public SnippetsFunctionBase { std::shared_ptr initOriginal() const override; std::shared_ptr initReference() const override; - std::vector precisions; + const bool with_reshape = true; + const std::vector precisions; }; /* Graph: diff --git a/src/tests/ov_helpers/ov_snippets_models/src/subgraph_mha.cpp b/src/tests/ov_helpers/ov_snippets_models/src/subgraph_mha.cpp index 3157c53fbb32de..f923a9a3aa168e 100644 --- a/src/tests/ov_helpers/ov_snippets_models/src/subgraph_mha.cpp +++ b/src/tests/ov_helpers/ov_snippets_models/src/subgraph_mha.cpp @@ -70,26 +70,35 @@ std::shared_ptr MHAFunction::initOriginal() const { std::shared_ptr matmul_parent1 = transpose1; if (with_mul) { ov::Shape shape(rank, 1); - shape[rank - 3] = transpose1->get_output_shape(0)[rank - 3]; - std::vector mulConstData(ov::shape_size(shape)); + if (transpose1->get_output_partial_shape(0).is_static()) { + shape[rank - 3] = transpose1->get_output_shape(0)[rank - 3]; + } const auto mulConst = ov::test::utils::make_constant(precisions[1], 
shape); matmul_parent1 = std::make_shared(transpose1, mulConst); } const auto matMul0 = std::make_shared(transpose0, matmul_parent1); const auto add = std::make_shared(matMul0, addParam); - const auto interm_shape = add->get_output_shape(0); - const auto batch = std::accumulate(interm_shape.cbegin(), interm_shape.cbegin() + rank - 1, 1, std::multiplies()); - const auto reshape0ConstData = std::vector{ batch, -1 }; - const auto reshape1ConstData = interm_shape; - const auto reshape0Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{reshape0ConstData.size()}, reshape0ConstData); - const auto reshape1Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{reshape1ConstData.size()}, reshape1ConstData); + auto softmax_out = add->output(0); + if (with_reshape) { + const auto interm_shape = add->get_output_shape(0); + const auto batch = std::accumulate(interm_shape.cbegin(), interm_shape.cbegin() + rank - 1, 1, std::multiplies()); + const auto reshape0ConstData = std::vector{ batch, -1 }; + const auto reshape1ConstData = interm_shape; + const auto reshape0Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{reshape0ConstData.size()}, reshape0ConstData); + const auto reshape1Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{reshape1ConstData.size()}, reshape1ConstData); + + const auto reshape0 = std::make_shared(add, reshape0Const, true); + const auto softMax = std::make_shared(reshape0, 1); + const auto reshape1 = std::make_shared(softMax, reshape1Const, true); + softmax_out = reshape1->output(0); + } else { + const auto softMax = std::make_shared(add, rank - 1); + softmax_out = softMax->output(0); + } - const auto reshape0 = std::make_shared(add, reshape0Const, true); - const auto softMax = std::make_shared(reshape0, 1); - const auto reshape1 = std::make_shared(softMax, reshape1Const, true); const auto transpose2 = std::make_shared(transpose2Param, transpose2Const); - const auto matMul1 = std::make_shared(reshape1, transpose2); + const auto matMul1 = std::make_shared(softmax_out, transpose2); const auto transpose3 = std::make_shared(matMul1, transpose3Const); ov::ResultVector results{std::make_shared(transpose3)}; @@ -124,13 +133,19 @@ std::shared_ptr MHAFunction::initReference() const { std::shared_ptr matmul_parent1 = transpose1; if (with_mul) { ov::Shape shape(rank, 1); - shape[rank - 3] = transpose1->get_output_shape(0)[rank - 3]; - std::vector mulConstData(ov::shape_size(shape)); + if (transpose1->get_output_partial_shape(0).is_static()) { + shape[rank - 3] = transpose1->get_output_shape(0)[rank - 3]; + } const auto mulConst = ov::test::utils::make_constant(precisions[1], shape); - const auto mulParam = std::make_shared(precisions[1], mulConst->get_shape()); - matmul_parent1 = std::make_shared(transpose1, mulParam); - subgraph_params = {transpose0Param, transpose1Param, mulParam, addParam, transpose2Param}; - subgraph_inputs = {data0, data1, mulConst, data2, data3}; + + if (ov::shape_size(shape) > 1) { + const auto mulParam = std::make_shared(precisions[1], mulConst->get_shape()); + matmul_parent1 = std::make_shared(transpose1, mulParam); + subgraph_params = {transpose0Param, transpose1Param, mulParam, addParam, transpose2Param}; + subgraph_inputs = {data0, data1, mulConst, data2, data3}; + } else { + matmul_parent1 = std::make_shared(transpose1, mulConst); + } } const auto matMul0 = std::make_shared(transpose0, matmul_parent1); @@ -182,16 +197,22 @@ std::shared_ptr MHASplitMFunction::initReference() const { std::shared_ptr matmul_parent1 = 
transpose1; if (with_mul) { ov::Shape shape(rank - 1, 1); - shape[rank - 4] = transpose1->get_output_shape(0)[rank - 4]; - ov::Shape reshape_shape = shape; - reshape_shape.insert(reshape_shape.cbegin() + rank - 3, 1); - std::vector mulConstData(ov::shape_size(shape)); + if (transpose1->get_output_partial_shape(0).is_static()) { + shape[rank - 4] = transpose1->get_output_shape(0)[rank - 4]; + } const auto mulConst = ov::test::utils::make_constant(precisions[1], shape); - const auto reshape_mul = make_reshape(mulConst, reshape_shape); - const auto mulParam = std::make_shared(precisions[1], reshape_mul->get_shape()); - matmul_parent1 = std::make_shared(transpose1, mulParam); - subgraph_params = {transpose0Param, transpose1Param, mulParam, addParam, transpose2Param}; - subgraph_inputs = {reshape0, reshape1, reshape_mul, reshape2, reshape3}; + + if (ov::shape_size(shape) > 1) { + ov::Shape reshape_shape = shape; + reshape_shape.insert(reshape_shape.cbegin() + rank - 3, 1); + const auto mulReshape = make_reshape(mulConst, reshape_shape); + const auto mulParam = std::make_shared(precisions[1], mulReshape->get_shape()); + matmul_parent1 = std::make_shared(transpose1, mulParam); + subgraph_params = {transpose0Param, transpose1Param, mulParam, addParam, transpose2Param}; + subgraph_inputs = {reshape0, reshape1, mulReshape, reshape2, reshape3}; + } else { + matmul_parent1 = std::make_shared(transpose1, mulConst); + } } const auto matMul0 = std::make_shared(transpose0, matmul_parent1); @@ -217,47 +238,42 @@ std::shared_ptr MHAMatMul0TransposeFunction::initOriginal() const { auto transpose2Param = std::make_shared(precisions[3], input_shapes[3]); ov::ParameterVector ngraphParam = {transpose0Param, transpose1Param, addParam, transpose2Param}; - std::vector constantShapes; - constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()})); - constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()})); - constantShapes.push_back(ov::Shape({1, input_shapes[1].get_shape()[2], 1, 1})); - constantShapes.push_back(ov::Shape({2})); - constantShapes.push_back(ov::Shape({4})); - constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()})); - constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()})); - - const auto order = std::vector{0, 2, 1, 3}; - auto transpose0Const = ov::op::v0::Constant::create(ov::element::i64, constantShapes[0], order); - auto transpose1Const = ov::op::v0::Constant::create(ov::element::i64, constantShapes[1], order); - auto transpose2Const = ov::op::v0::Constant::create(ov::element::i64, constantShapes[5], order); - auto transpose3Const = ov::op::v0::Constant::create(ov::element::i64, constantShapes[6], order); - - std::vector mulConstData(1); - auto mulConst = ov::test::utils::make_constant(precisions[1], ov::Shape{1}); - - std::vector reshape0ConstData = {static_cast(input_shapes[0].get_shape()[0] * - input_shapes[0].get_shape()[1] * input_shapes[0].get_shape()[2]), - -1}; - auto reshape0Const = ov::op::v0::Constant::create(ov::element::i64, constantShapes[3], reshape0ConstData); + const auto rank = input_shapes[0].size(); + const auto fusion_order = get_fusion_order(rank); - std::vector reshape1ConstData = {static_cast(input_shapes[0].get_shape()[0]), - static_cast(input_shapes[0].get_shape()[2]), - static_cast(input_shapes[0].get_shape()[1]), - static_cast(input_shapes[0].get_shape()[1])}; - auto reshape1Const = ov::op::v0::Constant::create(ov::element::i64, constantShapes[4], reshape1ConstData); + const auto transpose0Const = 
ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, fusion_order); + const auto transpose1Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, fusion_order); + const auto transpose2Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, fusion_order); + const auto transpose3Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, fusion_order); - float transA = false; - float transB = false; const auto transpose0 = std::make_shared(transpose0Param, transpose0Const); const auto transpose1 = std::make_shared(transpose1Param, transpose1Const); + + const auto mulConst = ov::test::utils::make_constant(precisions[1], ov::Shape{1}); const auto mul = std::make_shared(transpose1, mulConst); - const auto matMul0 = std::make_shared(transpose0, mul, transA, true); + const auto matMul0 = std::make_shared(transpose0, mul, false, true); const auto add = std::make_shared(matMul0, addParam); - const auto reshape0 = std::make_shared(add, reshape0Const, true); - const auto softMax = std::make_shared(reshape0, 1); - const auto reshape1 = std::make_shared(softMax, reshape1Const, true); + + auto softmax_out = add->output(0); + if (with_reshape) { + const auto interm_shape = add->get_output_shape(0); + const auto batch = std::accumulate(interm_shape.cbegin(), interm_shape.cbegin() + rank - 1, 1, std::multiplies()); + const auto reshape0ConstData = std::vector{ batch, -1 }; + const auto reshape1ConstData = interm_shape; + const auto reshape0Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{reshape0ConstData.size()}, reshape0ConstData); + const auto reshape1Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{reshape1ConstData.size()}, reshape1ConstData); + + const auto reshape0 = std::make_shared(add, reshape0Const, true); + const auto softMax = std::make_shared(reshape0, 1); + const auto reshape1 = std::make_shared(softMax, reshape1Const, true); + softmax_out = reshape1->output(0); + } else { + const auto softMax = std::make_shared(add, rank - 1); + softmax_out = softMax->output(0); + } + const auto transpose2 = std::make_shared(transpose2Param, transpose2Const); - const auto matMul1 = std::make_shared(reshape1, transpose2, transA, transB); + const auto matMul1 = std::make_shared(softmax_out, transpose2); const auto transpose3 = std::make_shared(matMul1, transpose3Const); ov::ResultVector results{std::make_shared(transpose3)}; @@ -269,58 +285,38 @@ std::shared_ptr MHAMatMul0TransposeFunction::initReference() const { auto data2 = std::make_shared(precisions[2], input_shapes[2]); auto data3 = std::make_shared(precisions[3], input_shapes[3]); ov::ParameterVector ngraphParams = {data0, data1, data2, data3}; + NodeVector subgraph_inputs = {data0, data1, data2, data3}; auto transpose0Param = std::make_shared(precisions[0], input_shapes[0]); auto transpose1Param = std::make_shared(precisions[1], input_shapes[1]); auto addParam = std::make_shared(precisions[2], input_shapes[2]); auto transpose2Param = std::make_shared(precisions[3], input_shapes[3]); - std::vector constantShapes; - constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()})); - constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()})); - constantShapes.push_back(ov::Shape({1, input_shapes[1].get_shape()[2], 1, 1})); - constantShapes.push_back(ov::Shape({2})); - constantShapes.push_back(ov::Shape({4})); - constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()})); - 
constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()})); - - auto transpose0Const = ov::op::v0::Constant::create(ov::element::i64, constantShapes[0], std::vector{0, 2, 1, 3}); - auto transpose1Const = ov::op::v0::Constant::create(ov::element::i64, constantShapes[1], std::vector{0, 2, 3, 1}); - auto transpose2Const = ov::op::v0::Constant::create(ov::element::i64, constantShapes[5], std::vector{0, 2, 1, 3}); - auto transpose3Const = ov::op::v0::Constant::create(ov::element::i64, constantShapes[6], std::vector{0, 2, 1, 3}); - - std::vector mulConstData(1); - auto mulConst = ov::test::utils::make_constant(precisions[1], ov::Shape{1}); - ov::ParameterVector subgraphParams = {transpose0Param, transpose1Param, addParam, transpose2Param}; + ov::ParameterVector subgraph_params = {transpose0Param, transpose1Param, addParam, transpose2Param}; - std::vector reshape0ConstData = {static_cast(input_shapes[0].get_shape()[0] * - input_shapes[0].get_shape()[1] * input_shapes[0].get_shape()[2]), - -1}; - auto reshape0Const = ov::op::v0::Constant::create(ov::element::i64, constantShapes[3], reshape0ConstData); + const auto rank = input_shapes[0].size(); + const auto fusion_order = get_fusion_order(rank); + const auto decomposed_order = get_decomposed_order(rank); - std::vector reshape1ConstData = {static_cast(input_shapes[0].get_shape()[0]), - static_cast(input_shapes[0].get_shape()[2]), - static_cast(input_shapes[0].get_shape()[1]), - static_cast(input_shapes[0].get_shape()[1])}; - auto reshape1Const = ov::op::v0::Constant::create(ov::element::i64, constantShapes[4], reshape1ConstData); + const auto transpose0Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, fusion_order); + const auto transpose1Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, decomposed_order); + const auto transpose2Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, fusion_order); + const auto transpose3Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, fusion_order); - float transA = false; - float transB = false; const auto transpose0 = std::make_shared(transpose0Param, transpose0Const); const auto transpose1 = std::make_shared(transpose1Param, transpose1Const); + + const auto mulConst = ov::test::utils::make_constant(precisions[1], ov::Shape{1}); const auto mul = std::make_shared(transpose1, mulConst); - const auto matMul0 = std::make_shared(transpose0, mul, transA, transB); + const auto matMul0 = std::make_shared(transpose0, mul); const auto add = std::make_shared(matMul0, addParam); - const auto reshape0 = std::make_shared(add, reshape0Const, true); - const auto softMax = std::make_shared(reshape0, 1); - const auto reshape1 = std::make_shared(softMax, reshape1Const, true); + const auto softMax = std::make_shared(add, rank - 1); const auto transpose2 = std::make_shared(transpose2Param, transpose2Const); - const auto matMul1 = std::make_shared(reshape1, transpose2, transA, transB); + const auto matMul1 = std::make_shared(softMax, transpose2); const auto transpose3 = std::make_shared(matMul1, transpose3Const); - auto subgraph = std::make_shared( - NodeVector{data0, data1, data2, data3}, - std::make_shared(NodeVector{transpose3}, subgraphParams)); + auto subgraph = std::make_shared(subgraph_inputs, + std::make_shared(NodeVector{transpose3}, subgraph_params)); return std::make_shared(NodeVector{subgraph}, ngraphParams); } @@ -982,9 +978,9 @@ std::shared_ptr MHATransposedInputFunction::initReference() const { } } - const auto param0 = 
std::make_shared(precision, data0->get_shape()); - const auto param1 = std::make_shared(precision, in1->get_shape()); - const auto param2 = std::make_shared(precision, data2->get_shape()); + const auto param0 = std::make_shared(precision, data0->get_output_partial_shape(0)); + const auto param1 = std::make_shared(precision, in1->get_output_partial_shape(0)); + const auto param2 = std::make_shared(precision, data2->get_output_partial_shape(0)); std::shared_ptr matmul0_in1 = param1; if (!m_order.empty() && is_supported) {