From 3f249aa69b65d5ae53c53cff648b41d4bfb0d1bb Mon Sep 17 00:00:00 2001
From: Dmitry Matveev <dmitry.matveev@intel.com>
Date: Fri, 29 Nov 2024 14:05:21 +0000
Subject: [PATCH] NPUW Hotfixes: Memory and L0 pipeline (#27826)

### Details:
- Keep tensors for decompression cut-off in a host-side closure rather
than in a lazy tensor - so they are not uploaded to the bank & detached after that
  - Uploading them led to 2x memory consumption and a subsequent crash
- Relaxed the requirements for enabling unfolded execution - so it may
still happen if there are single-call functions that require DCOFF
(previously, having those would reject the unfolded path)

### Tickets:
 - C-155523 (most likely related)

@smirnov-alexey please take care of the release branch cherry-pick
---
 .../intel_npu/src/plugin/npuw/compiled_model.cpp       |  5 +++--
 .../src/plugin/npuw/partitioning/patterns/dcoff.cpp    | 10 ++++------
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
index b9cdad9f4879db..f9573cb78f21ec 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
@@ -727,8 +727,9 @@ std::shared_ptr<ov::ISyncInferRequest> ov::npuw::CompiledModel::create_sync_infe
         const auto num_submodels = m_compiled_submodels.size();
         for (std::size_t idx = 0u; idx < num_submodels; idx++) {
             const auto& comp_model_desc = m_compiled_submodels[idx];
-            if (!comp_model_desc.replaced_by.has_value()) {
-                // not a funcall, do nothing
+            if (!comp_model_desc.replaced_by.has_value() || comp_model_desc.forced_to_fcall) {
+                // not a funcall, do nothing, or a subgraph that was forced to funcall
+                // (a 1-call function) - skip
                 continue;
             }
             const auto real_idx = comp_model_desc.replaced_by.value();
diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/dcoff.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/dcoff.cpp
index 31093a34871db9..93a43c9b82570a 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/dcoff.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/dcoff.cpp
@@ -97,12 +97,10 @@ ClosureRemap build_remap(const Function& fbody, const DCOFFParams& params_to) {
             LOG_DEBUG("This is an OK parameter, will be kept");
             m.closure_remap.push_back(i - fbody._param_offset);
 
-            // Check if unpack is indeed required
-            const auto& type = param->get_element_type();
-            if (type == ov::element::i4 || type == ov::element::u4 || type == ov::element::i8 ||
-                type == ov::element::u8) {
-                m.weights_to_unpack.insert(i - fbody._param_offset);
-            }
+            // FIXME: type should be queried from a lazy tensor
+            // and compared against param->get_element_type()
+            // to decide 100%
+            m.weights_to_unpack.insert(i - fbody._param_offset);
         }
 
         // Process zero points for parameters