diff --git a/include/imex/Conversion/Passes.h b/include/imex/Conversion/Passes.h
index 9fa75ca1e..c868dc887 100644
--- a/include/imex/Conversion/Passes.h
+++ b/include/imex/Conversion/Passes.h
@@ -25,6 +25,7 @@
 #include
 #include
 #include
+#include <imex/Conversion/RegionParallelLoopToGpu/RegionParallelLoopToGpu.h>
 #include
 #include
diff --git a/include/imex/Conversion/Passes.td b/include/imex/Conversion/Passes.td
index 2fffae412..d0e198eb2 100644
--- a/include/imex/Conversion/Passes.td
+++ b/include/imex/Conversion/Passes.td
@@ -109,6 +109,19 @@ def DropRegions: Pass<"drop-regions"> {
   let options = [];
 }

+//===----------------------------------------------------------------------===//
+// ConvertRegionParallelLoopToGpu
+//===----------------------------------------------------------------------===//
+
+def ConvertRegionParallelLoopToGpu : Pass<"convert-region-parallel-loops-to-gpu"> {
+  let summary = "Convert mapped scf.parallel ops within GPU regions to gpu.launch operations";
+  let description = [{
+    Convert mapped scf.parallel ops within GPU regions to gpu.launch operations.
+  }];
+  let constructor = "::imex::createConvertRegionParallelLoopToGpuPass()";
+  let dependentDialects = [];
+  let options = [];
+}

 //===----------------------------------------------------------------------===//
 // GPUToSPIRV
diff --git a/include/imex/Conversion/RegionParallelLoopToGpu/CMakeLists.txt b/include/imex/Conversion/RegionParallelLoopToGpu/CMakeLists.txt
new file mode 100644
index 000000000..e69de29bb
diff --git a/include/imex/Conversion/RegionParallelLoopToGpu/RegionParallelLoopToGpu.h b/include/imex/Conversion/RegionParallelLoopToGpu/RegionParallelLoopToGpu.h
new file mode 100644
index 000000000..d4f0437d8
--- /dev/null
+++ b/include/imex/Conversion/RegionParallelLoopToGpu/RegionParallelLoopToGpu.h
@@ -0,0 +1,34 @@
+//===- RegionParallelLoopToGpu.h --------------------------------*- C++ -*-===//
+//
+// Copyright 2024 Intel Corporation
+// Part of the IMEX Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Adds the conversion pattern from `scf.parallel` within `region.env_region`
+/// to `gpu.launch`.
+///
+//===----------------------------------------------------------------------===//

+#ifndef _RegionParallelLoopToGpu_H_INCLUDED_
+#define _RegionParallelLoopToGpu_H_INCLUDED_
+
+#include <memory>
+
+namespace mlir {
+class Pass;
+} // namespace mlir
+
+namespace imex {
+#define GEN_PASS_DECL_CONVERTREGIONPARALLELLOOPTOGPU
+#include "imex/Conversion/Passes.h.inc"
+
+/// Create a pass converting mapped `scf.parallel` ops within GPU regions to
+/// `gpu.launch` ops.
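+///
+/// Example registration in a pass pipeline (`pm` being an ::mlir::PassManager;
+/// the placement shown is illustrative, not prescribed by this header):
+///   pm.addPass(::imex::createConvertRegionParallelLoopToGpuPass());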
+std::unique_ptr<::mlir::Pass> createConvertRegionParallelLoopToGpuPass();
+
+} // namespace imex
+
+#endif // _RegionParallelLoopToGpu_H_INCLUDED_
diff --git a/include/imex/Transforms/Passes.h b/include/imex/Transforms/Passes.h
index 6e6f7d1b6..7392e2c8b 100644
--- a/include/imex/Transforms/Passes.h
+++ b/include/imex/Transforms/Passes.h
@@ -25,6 +25,7 @@ namespace imex {
 std::unique_ptr<mlir::Pass> createSerializeSPIRVPass();
 std::unique_ptr<mlir::Pass> createInsertGPUAllocsPass(const char *clientAPI = "vulkan");
+std::unique_ptr<mlir::Pass> createInsertGPUCopyPass();
 std::unique_ptr<mlir::Pass> createSetSPIRVCapabilitiesPass();
 std::unique_ptr<mlir::Pass>
 createSetSPIRVAbiAttributePass(const char *clientAPI = "vulkan");
@@ -40,6 +41,7 @@
 createOptimizeTransposePass(const std::string &device = "pvc");
 std::unique_ptr<mlir::Pass> createHoistTransposePass();
 std::unique_ptr<mlir::Pass> createVnniTransformationPass();
 std::unique_ptr<mlir::Pass> createEmulateNonNativeBF16Pass();
+std::unique_ptr<mlir::Pass> createTileLoopsPass();

 #define GEN_PASS_DECL
 #include "imex/Transforms/Passes.h.inc"
diff --git a/include/imex/Transforms/Passes.td b/include/imex/Transforms/Passes.td
index b97fdf891..ac9220a36 100644
--- a/include/imex/Transforms/Passes.td
+++ b/include/imex/Transforms/Passes.td
@@ -41,10 +41,20 @@ def InsertGPUAllocs : Pass<"insert-gpu-allocs", "::mlir::func::FuncOp"> {
     Option<"clientAPI", "client-api", "std::string", /*default=*/"\"opencl\"",
            "The client API to use for inserting gpu allocs">,
     Option<"inRegions", "in-regions", "bool", "false",
-           "Add gpu allocs only for memref.AllocOps within GPU regions">
+           "Add gpu allocs only for memref.AllocOps within GPU regions">,
+    Option<"hostShared", "host-shared", "bool", "true",
+           "If set, allocate shared memory accessible on both host and device.">
   ];
 }

+def InsertGPUCopy : Pass<"insert-gpu-copy", "::mlir::func::FuncOp"> {
+  let summary = "Converts memref.copy ops to gpu.memcpy ops if within an env region.";
+  let constructor = "imex::createInsertGPUCopyPass()";
+  let dependentDialects = ["::mlir::memref::MemRefDialect",
+                           "::mlir::gpu::GPUDialect",
+                           "::mlir::arith::ArithDialect"];
+}
+
 def SetSPIRVCapabilities : Pass<"set-spirv-capabilities"> {
   let summary = "Sets Spirv capabilities";
   let constructor = "imex::createSetSPIRVCapabilitiesPass()";
@@ -200,4 +210,28 @@ def HoistTranspose : Pass<"imex-xegpu-hoist-transpose"> {
   ];
 }

+def TileLoops : Pass<"tile-loops", "::mlir::func::FuncOp"> {
+  let summary = "Tile linalg.generic loops for GPU offloading";
+  let description = [{
+    Tiles loops defined with tensor inputs/outputs using the given tile sizes.
+    This pass should be applied after loop fusion and before bufferization.
+    It uses the `tileUsingSCF` method. To map a loop to GPU blocks and threads,
+    this pass should be called twice (see the example below). If `in-regions`
+    is set, only loops within GPU regions are tiled.
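+
+    For example, to tile first for GPU blocks and then for threads (the tile
+    sizes below are illustrative; see test/Transforms/tile-loops.mlir):
+
+      imex-opt -tile-loops='tile-sizes=32' -tile-loops='tile-sizes=1' in.mlir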
+  }];
+  let options = [
+    ListOption<"tileSizes", "tile-sizes", "int64_t", "Tile sizes">,
+    Option<"minTileFactor", "min-tile-factor", "int64_t", "2",
+           "Minimum factor between dimension size and a tile size">,
+    Option<"inRegions", "in-regions", "bool", "false",
+           "Convert loops only within GPU regions">
+  ];
+  let constructor = "imex::createTileLoopsPass()";
+  let dependentDialects = [
+    "::mlir::linalg::LinalgDialect",
+    "::mlir::scf::SCFDialect"
+  ];
+}
+
 #endif // _IMEX_TRANSFORMS_PASSES_TD_INCLUDED_
diff --git a/lib/Conversion/CMakeLists.txt b/lib/Conversion/CMakeLists.txt
index 715ce1d20..9828ea59d 100644
--- a/lib/Conversion/CMakeLists.txt
+++ b/lib/Conversion/CMakeLists.txt
@@ -2,6 +2,7 @@ add_subdirectory(ArithToVC)
 add_subdirectory(DistToStandard)
 add_subdirectory(NDArrayToLinalg)
 add_subdirectory(DropRegions)
+add_subdirectory(RegionParallelLoopToGpu)
 add_subdirectory(GPUToSPIRV)
 add_subdirectory(GPUToGPUX)
 add_subdirectory(GPUXToLLVM)
diff --git a/lib/Conversion/NDArrayToLinalg/NDArrayToLinalg.cpp b/lib/Conversion/NDArrayToLinalg/NDArrayToLinalg.cpp
index 96a5dfea6..a457382ff 100644
--- a/lib/Conversion/NDArrayToLinalg/NDArrayToLinalg.cpp
+++ b/lib/Conversion/NDArrayToLinalg/NDArrayToLinalg.cpp
@@ -526,12 +526,15 @@ struct CopyLowering
         ::mlir::MemRefType::get(tTyp.getShape(), tTyp.getElementType());
     auto mr = rewriter.create<::mlir::memref::AllocOp>(
         loc, mrTyp, dynDims, rewriter.getI64IntegerAttr(8));
-    // and copy if non-0
+    // and copy if not zero sized
     if (!retArTyp.hasZeroSize()) {
       auto srcMR =
           createToMemRef(loc, rewriter, src, srcArTyp.getMemRefType(src));
-      // create a region with given env, add copy op within it
-      auto env = rewriter.getStringAttr("protect_copy_op");
+      // wrap the copy in a region marking it either as a gpu copy or as non-deletable
+      bool hasGPUEnv = ::imex::ndarray::hasGPUEnv(srcArTyp) ||
+                       ::imex::ndarray::hasGPUEnv(retArTyp);
+      std::string regName = hasGPUEnv ? "gpu_copy_op" : "protect_copy_op";
+      auto env = rewriter.getStringAttr(regName);
       rewriter.create<::imex::region::EnvironmentRegionOp>(
           loc, env, std::nullopt, std::nullopt,
           [&srcMR, &mr](::mlir::OpBuilder &builder, ::mlir::Location loc) {
diff --git a/lib/Conversion/RegionParallelLoopToGpu/CMakeLists.txt b/lib/Conversion/RegionParallelLoopToGpu/CMakeLists.txt
new file mode 100644
index 000000000..868f59d40
--- /dev/null
+++ b/lib/Conversion/RegionParallelLoopToGpu/CMakeLists.txt
@@ -0,0 +1,12 @@
+add_imex_conversion_library(IMEXSCFToGPU
+  RegionParallelLoopToGpu.cpp
+
+  ADDITIONAL_HEADER_DIRS
+  ${MLIR_MAIN_INCLUDE_DIR}/imex/Conversion/SCFToGPU
+
+  DEPENDS
+  IMEXConversionPassIncGen
+
+  LINK_LIBS PUBLIC
+  MLIRSCFToGPU
+)
diff --git a/lib/Conversion/RegionParallelLoopToGpu/RegionParallelLoopToGpu.cpp b/lib/Conversion/RegionParallelLoopToGpu/RegionParallelLoopToGpu.cpp
new file mode 100644
index 000000000..20279bdfd
--- /dev/null
+++ b/lib/Conversion/RegionParallelLoopToGpu/RegionParallelLoopToGpu.cpp
@@ -0,0 +1,69 @@
+//===- RegionParallelLoopToGpu.cpp ----------------------------*- C++ -*-===//
+//
+// Copyright 2024 Intel Corporation
+// Part of the IMEX Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file extends the upstream ParallelLoopToGpu pass by applying the
+/// transform only if the parallel loop is within a GPU region
+/// (`region.env_region #region.gpu_env<...>`).
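+///
+/// Schematically (types and the mapping attribute elided), a mapped loop nest
+/// such as
+///
+///   region.env_region #region.gpu_env<...> {
+///     scf.parallel (%i, %j) = (...) to (...) step (...) { ... }
+///   }
+///
+/// becomes a `gpu.launch` inside the same region, while `scf.parallel` ops
+/// outside GPU regions are left untouched.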
+/// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include + +namespace imex { +#define GEN_PASS_DEF_CONVERTREGIONPARALLELLOOPTOGPU +#include "imex/Conversion/Passes.h.inc" + +namespace { +struct ConvertRegionParallelLoopToGpuPass + : public ::imex::impl::ConvertRegionParallelLoopToGpuBase< + ConvertRegionParallelLoopToGpuPass> { + ConvertRegionParallelLoopToGpuPass() = default; + + void runOnOperation() override { + ::mlir::RewritePatternSet patterns(&getContext()); + ::mlir::populateParallelLoopToGPUPatterns(patterns); + ::mlir::ConversionTarget target(getContext()); + target.markUnknownOpDynamicallyLegal( + [](::mlir::Operation *) { return true; }); + ::mlir::configureParallelLoopToGPULegality(target); + + // collect all gpu regions + ::mlir::SmallVector<::mlir::Operation *> ops; + getOperation()->walk([&](::imex::region::EnvironmentRegionOp op, + const ::mlir::WalkStage &stage) { + if (::imex::region::isGpuRegion(op)) { + ops.push_back(op); + return ::mlir::WalkResult::skip(); + } + return ::mlir::WalkResult::advance(); + }); + + // apply par-loop to gpu conversion to collected gpu regions + if (::mlir::failed( + ::mlir::applyPartialConversion(ops, target, std::move(patterns)))) { + signalPassFailure(); + } + ::mlir::finalizeParallelLoopToGPUConversion(getOperation()); + } +}; +} // namespace + +/// Create a pass to convert the Region dialect to the GPU dialect. +std::unique_ptr<::mlir::Pass> createConvertRegionParallelLoopToGpuPass() { + return std::make_unique(); +} + +} // namespace imex diff --git a/lib/Transforms/CMakeLists.txt b/lib/Transforms/CMakeLists.txt index 47d7e21b7..071397eca 100644 --- a/lib/Transforms/CMakeLists.txt +++ b/lib/Transforms/CMakeLists.txt @@ -4,6 +4,7 @@ add_mlir_library(IMEXTransforms CastIndex.cpp EmulateNonNativeBF16.cpp InsertGPUAllocs.cpp + InsertGPUCopy.cpp LowerMemRefCopy.cpp RemoveSingleElemVector.cpp RemoveTemporaries.cpp @@ -14,6 +15,7 @@ add_mlir_library(IMEXTransforms VnniTransformation.cpp OptimizeTranspose.cpp HoistTranspose.cpp + TileLoops.cpp ADDITIONAL_HEADER_DIRS ${PROJECT_SOURCE_DIR}/imex/Transforms diff --git a/lib/Transforms/InsertGPUAllocs.cpp b/lib/Transforms/InsertGPUAllocs.cpp index d9e350e2c..87010e536 100644 --- a/lib/Transforms/InsertGPUAllocs.cpp +++ b/lib/Transforms/InsertGPUAllocs.cpp @@ -105,7 +105,7 @@ class InsertGPUAllocsPass final auto allocResult = builder.create<::mlir::gpu::AllocOp>( alloc.getLoc(), alloc.getType(), /*asyncToken*/ nullptr, /*asyncDependencies*/ std::nullopt, alloc.getDynamicSizes(), - alloc.getSymbolOperands(), true); + alloc.getSymbolOperands(), /*hostShared*/ hostShared); alloc.replaceAllUsesWith(allocResult); alloc.erase(); } diff --git a/lib/Transforms/InsertGPUCopy.cpp b/lib/Transforms/InsertGPUCopy.cpp new file mode 100644 index 000000000..50af43f92 --- /dev/null +++ b/lib/Transforms/InsertGPUCopy.cpp @@ -0,0 +1,89 @@ +//===- InsertGPUCopy.cpp - InsertGPUCopy Pass -------*- C++ -*-===// +// +// Copyright 2022 Intel Corporation +// Part of the IMEX Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file replaces the memref.copy ops with gpu.memcpy ops if the +/// memref.copy resides in an environment region. 
+/// The environment region must have been created by a prior pass that knows
+/// the device/host memory semantics.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/Threading.h"
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace imex {
+#define GEN_PASS_DEF_INSERTGPUCOPY
+#include "imex/Transforms/Passes.h.inc"
+} // namespace imex
+
+namespace {
+
+// return true if the op has an EnvironmentRegionOp ancestor
+inline bool isInEnvRegion(::mlir::Operation *op) {
+  if (!op)
+    return false;
+  if (!op->getParentOfType<::imex::region::EnvironmentRegionOp>())
+    return false;
+  return true;
+}
+
+class InsertGPUCopyPass final
+    : public imex::impl::InsertGPUCopyBase<InsertGPUCopyPass> {
+public:
+  void runOnOperation() override {
+    auto func = getOperation();
+    auto &funcBody = func.getBody();
+    if (funcBody.empty()) {
+      return;
+    } else if (!llvm::hasSingleElement(funcBody)) {
+      func.emitError("Function must have exactly one block");
+      signalPassFailure();
+      return;
+    }
+
+    mlir::OpBuilder builder(func);
+    // collect copy ops in env regions
+    ::mlir::SmallVector<::mlir::memref::CopyOp> copyOpsInGpuRegion;
+
+    // traverse ops and identify memref.copy ops which are in an env region
+    (void)func.walk([&](::mlir::memref::CopyOp op) {
+      if (isInEnvRegion(op)) {
+        copyOpsInGpuRegion.emplace_back(op);
+      }
+    });
+
+    // replace the collected copy ops with gpu.memcpy ops
+    for (auto copyOp : copyOpsInGpuRegion) {
+      builder.setInsertionPoint(copyOp);
+      builder.create<::mlir::gpu::MemcpyOp>(
+          copyOp.getLoc(), /*resultTypes*/ ::mlir::TypeRange{},
+          /*asyncDependencies*/ ::mlir::ValueRange{}, copyOp.getTarget(),
+          copyOp.getSource());
+      copyOp.erase();
+    }
+  }
+};
+
+} // namespace
+
+namespace imex {
+std::unique_ptr<mlir::Pass> createInsertGPUCopyPass() {
+  return std::make_unique<InsertGPUCopyPass>();
+}
+} // namespace imex
diff --git a/lib/Transforms/TileLoops.cpp b/lib/Transforms/TileLoops.cpp
new file mode 100644
index 000000000..edd358662
--- /dev/null
+++ b/lib/Transforms/TileLoops.cpp
@@ -0,0 +1,141 @@
+//===- TileLoops.cpp ------------------------------------------*- C++ -*-===//
+//
+// Copyright 2023 Intel Corporation
+// Part of the IMEX Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file implements the TileLoops transform, which tiles loops for GPU
+/// mapping.
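+/// Each candidate linalg op (optionally only those inside GPU regions) is
+/// tiled with `mlir::scf::tileUsingSCF` into an `scf.forall` loop; user
+/// provided tile sizes are padded with zeros for any remaining loop
+/// dimensions.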
+///
+//===----------------------------------------------------------------------===//
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "llvm/Support/Threading.h"
+#include
+#include
+
+namespace imex {
+#define GEN_PASS_DEF_TILELOOPS
+#include "imex/Transforms/Passes.h.inc"
+} // namespace imex
+
+#define DEBUG_TYPE "tile-loops"
+
+#ifndef NDEBUG
+#define DEBUG_MSG(PREFIX, MSG)                                                 \
+  LLVM_DEBUG(llvm::dbgs() << PREFIX << ": " << MSG << "\n");
+#define DEBUG_OP(PREFIX, MSG, OP)                                              \
+  LLVM_DEBUG(llvm::dbgs() << PREFIX << ": " << MSG << " '" << OP->getName()    \
+                          << "' " << OP->getLoc() << "\n");
+#define DEBUG_OP_VEC(PREFIX, MSG, OPVEC)                                       \
+  LLVM_DEBUG(llvm::dbgs() << PREFIX << ": " << MSG << " (" << OPVEC.size()     \
+                          << ")\n");                                           \
+  for (auto op : OPVEC) {                                                      \
+    DEBUG_OP(PREFIX, "  ", op)                                                 \
+  }
+#endif
+
+using namespace imex;
+
+namespace {
+
+static ::mlir::FailureOr<::mlir::SmallVector<int64_t>>
+getDefaultTileSizes(::mlir::linalg::LinalgOp linalgOp,
+                    ::mlir::ArrayRef<int64_t> userProvidedTiles) {
+  // The user-provided tiles are considered from the outermost loop. If not
+  // enough tiles are provided, we pad with zeros.
+  if (!userProvidedTiles.empty()) {
+    size_t numParallelLoops = linalgOp.getNumParallelLoops();
+    size_t nonZeros = 0;
+    for (auto tile : userProvidedTiles)
+      if (tile != 0)
+        nonZeros++;
+    if (nonZeros > numParallelLoops ||
+        userProvidedTiles.size() > linalgOp.getNumLoops()) {
+      return ::mlir::failure();
+    }
+
+    ::mlir::SmallVector<int64_t> userTiles(linalgOp.getNumLoops(), 0);
+    for (auto tile : ::llvm::enumerate(userProvidedTiles))
+      userTiles[tile.index()] = tile.value();
+    return userTiles;
+  }
+  return ::mlir::failure();
+}
+
+struct TileLoops final : public imex::impl::TileLoopsBase<TileLoops> {
+
+  using TileLoopsBase::TileLoopsBase;
+
+  void runOnOperation() override {
+    ::mlir::func::FuncOp func = getOperation();
+    ::mlir::IRRewriter rewriter(&getContext());
+    transform(rewriter, func, this->tileSizes, this->minTileFactor);
+  }
+
+private:
+  void transform(::mlir::RewriterBase &rewriter, ::mlir::func::FuncOp func,
+                 ::mlir::ArrayRef<int64_t> tileSizes, int64_t minTileFactor) {
+    DEBUG_MSG("tile-loops", "Entering transform");
+    ::mlir::SmallVector<::mlir::Operation *> allLinalgOps;
+    func->walk([&](::mlir::linalg::LinalgOp linalgOp) {
+      if (!inRegions || ::imex::region::isInGpuRegion(linalgOp)) {
+        allLinalgOps.push_back(linalgOp);
+      }
+    });
+    DEBUG_OP_VEC("tile-loops", "  Found linalg ops", allLinalgOps);
+
+    for (auto op : allLinalgOps) {
+      DEBUG_OP("tile-loops", "  Tiling op:", op);
+      auto tiles = getDefaultTileSizes(
+          ::llvm::cast<::mlir::linalg::LinalgOp>(op), tileSizes);
+      if (failed(tiles)) {
+        DEBUG_MSG("tile-loops",
+                  "  Failed to compute default tile sizes. Aborting.");
+        return;
+      }
+      DEBUG_MSG("tile-loops", "  tile sizes:");
+      LLVM_DEBUG(llvm::dbgs() << "tile-loops: (");
+      LLVM_DEBUG(llvm::interleaveComma(*tiles, llvm::dbgs()));
+      LLVM_DEBUG(llvm::dbgs() << ")\n");
+
+      auto tilesRes =
+          ::mlir::getAsOpFoldResult(rewriter.getI64ArrayAttr(*tiles));
+      ::mlir::scf::SCFTilingOptions options;
+      options.setTileSizes(tilesRes);
+      options.setLoopType(::mlir::scf::SCFTilingOptions::LoopType::ForallOp);
+      auto tileOp = ::mlir::cast<::mlir::TilingInterface>(op);
+      ::mlir::FailureOr<::mlir::scf::SCFTilingResult> tilingResult =
+          mlir::scf::tileUsingSCF(rewriter, tileOp, options);
+      if (failed(tilingResult)) {
+        DEBUG_MSG("tile-loops", "  Failed to tile op. Aborting.");
Aborting."); + return; + } + DEBUG_MSG("tile-loops", " Tiling applied successfully."); + rewriter.replaceOp(op, tilingResult.value().replacements); + } + } +}; + +} // end anonymous namespace + +namespace imex { +std::unique_ptr createTileLoopsPass() { + return std::make_unique(); +} +} // namespace imex diff --git a/test/Conversion/NDArrayToLinalg/NDArrayToLinalg.mlir b/test/Conversion/NDArrayToLinalg/NDArrayToLinalg.mlir index e5aa868be..86c23cdf0 100644 --- a/test/Conversion/NDArrayToLinalg/NDArrayToLinalg.mlir +++ b/test/Conversion/NDArrayToLinalg/NDArrayToLinalg.mlir @@ -365,7 +365,7 @@ func.func @test_copy(%a: !ndarray.ndarray) -> !ndarray.ndarray { // CHECK-NEXT: tensor.dim // CHECK-NEXT: memref.alloc // CHECK-NEXT: bufferization.to_memref -// CHECK-NEXT: region.env_region "protect_copy_op" +// CHECK-NEXT: region.env_region "gpu_copy_op" // CHECK-NEXT: memref.copy // CHECK-NEXT: } // CHECK-NEXT: bufferization.to_tensor @@ -373,7 +373,7 @@ func.func @test_copy(%a: !ndarray.ndarray) -> !ndarray.ndarray { // CHECK-NEXT: tensor.dim // CHECK-NEXT: memref.alloc // CHECK-NEXT: bufferization.to_memref -// CHECK-NEXT: region.env_region "protect_copy_op" +// CHECK-NEXT: region.env_region "gpu_copy_op" // CHECK-NEXT: memref.copy // CHECK-NEXT: } // CHECK-NEXT: bufferization.to_tensor diff --git a/test/Conversion/SCFToGPU/RegionParallelLoopToGpu.mlir b/test/Conversion/SCFToGPU/RegionParallelLoopToGpu.mlir new file mode 100644 index 000000000..8e1d09a8a --- /dev/null +++ b/test/Conversion/SCFToGPU/RegionParallelLoopToGpu.mlir @@ -0,0 +1,69 @@ +// RUN: imex-opt --split-input-file -convert-region-parallel-loops-to-gpu %s -verify-diagnostics -o -| FileCheck %s + +// 2-d parallel loop mapped to block.y and block.x + +// ----- +func.func @test_convert_region_parloop_gpu(%arg0 : index, %arg1 : index, %arg2 : index, + %arg3 : index, %arg4 : index, + %buf : memref, + %res : memref) { + %step = arith.constant 2 : index + region.env_region #region.gpu_env { + scf.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3) + step (%arg4, %step) { + %val = memref.load %buf[%i0, %i1] : memref + memref.store %val, %res[%i1, %i0] : memref + } { mapping = [#gpu.loop_dim_map (d0), bound = (d0) -> (d0)>, #gpu.loop_dim_map (d0), bound = (d0) -> (d0)>] } + region.env_region_yield + } + return +} +// CHECK: test_convert_region_parloop_gpu +// CHECK: region.env_region #region.gpu_env +// CHECK: gpu.launch + +// ----- +func.func @test_convert_region_parloop_cpu(%arg0 : index, %arg1 : index, %arg2 : index, + %arg3 : index, %arg4 : index, + %buf : memref, + %res : memref) { + %step = arith.constant 2 : index + scf.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3) + step (%arg4, %step) { + %val = memref.load %buf[%i0, %i1] : memref + memref.store %val, %res[%i1, %i0] : memref + } { mapping = [#gpu.loop_dim_map (d0), bound = (d0) -> (d0)>, #gpu.loop_dim_map (d0), bound = (d0) -> (d0)>] } + return +} +// CHECK: test_convert_region_parloop_cpu +// CHECK: scf.parallel + +// ----- +func.func @test_convert_region_parloop_combined(%arg0 : index, %arg1 : index, %arg2 : index, + %arg3 : index, %arg4 : index, + %buf : memref, + %res : memref) { + %step = arith.constant 2 : index + region.env_region "something" { + region.env_region #region.gpu_env { + scf.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3) + step (%arg4, %step) { + %val = memref.load %buf[%i0, %i1] : memref + memref.store %val, %res[%i1, %i0] : memref + } { mapping = [#gpu.loop_dim_map (d0), bound = (d0) -> (d0)>, #gpu.loop_dim_map (d0), bound = (d0) -> (d0)>] 
+      region.env_region_yield
+    }
+
+    scf.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
+                                          step (%arg4, %step) {
+      %val = memref.load %buf[%i0, %i1] : memref<?x?xf32>
+      memref.store %val, %res[%i1, %i0] : memref<?x?xf32>
+    } { mapping = [#gpu.loop_dim_map<processor = block_y, map = (d0) -> (d0), bound = (d0) -> (d0)>, #gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>] }
+  }
+  return
+}
+// CHECK: test_convert_region_parloop_combined
+// CHECK: region.env_region "something" {
+// CHECK: region.env_region #region.gpu_env
+// CHECK: gpu.launch
+// CHECK: scf.parallel
diff --git a/test/Transforms/InsertGpuAllocs/gpu_region_alloc.mlir b/test/Transforms/InsertGpuAllocs/gpu_region_alloc.mlir
index a85db359b..0861a69b2 100644
--- a/test/Transforms/InsertGpuAllocs/gpu_region_alloc.mlir
+++ b/test/Transforms/InsertGpuAllocs/gpu_region_alloc.mlir
@@ -1,4 +1,4 @@
-// RUN: imex-opt --insert-gpu-allocs='in-regions=1' %s | FileCheck %s
+// RUN: imex-opt --insert-gpu-allocs='in-regions=1 host-shared=0' %s | FileCheck %s

 func.func @test_region_alloc() {
   %0 = memref.alloc() {alignment = 128 : i64} : memref<2x5xf32>
@@ -16,7 +16,7 @@
 // CHECK-LABEL: func.func @test_region_alloc
 // CHECK-NEXT: memref.alloc() {alignment = 128 : i64} : memref<2x5xf32>
 // CHECK-NEXT: region.env_region #region.gpu_env -> memref<2x5xf32> {
-// CHECK-NEXT: gpu.alloc host_shared () : memref<2x5xf32>
+// CHECK-NEXT: gpu.alloc () : memref<2x5xf32>
 // CHECK-NEXT: region.env_region_yield %memref : memref<2x5xf32>
 // CHECK-NEXT: }
 // CHECK: memref.dealloc %alloc : memref<2x5xf32>
diff --git a/test/Transforms/insert-gpu-copy.mlir b/test/Transforms/insert-gpu-copy.mlir
new file mode 100644
index 000000000..434711da2
--- /dev/null
+++ b/test/Transforms/insert-gpu-copy.mlir
@@ -0,0 +1,47 @@
+// RUN: imex-opt --split-input-file --insert-gpu-copy %s -verify-diagnostics -o - | FileCheck %s
+
+func.func @test_copy_gpu_region() -> (memref<?x?xf32>, memref<?x?xf32>) {
+  %c5 = arith.constant 5 : index
+  %alloc = memref.alloc(%c5, %c5) : memref<?x?xf32>
+  %0 = region.env_region #region.gpu_env -> memref<?x?xf32> {
+    %memref = gpu.alloc (%c5, %c5) : memref<?x?xf32>
+    memref.copy %alloc, %memref : memref<?x?xf32> to memref<?x?xf32>
+    region.env_region_yield %memref : memref<?x?xf32>
+  }
+  return %alloc, %0 : memref<?x?xf32>, memref<?x?xf32>
+}
+// CHECK-LABEL: func.func @test_copy_gpu_region
+// CHECK: [[ALLOC:%.*]] = memref.alloc
+// CHECK: region.env_region #region.gpu_env
+// CHECK-NEXT: [[ALLOC2:%.*]] = gpu.alloc
+// CHECK-NEXT: gpu.memcpy [[ALLOC2]], [[ALLOC]] : memref<?x?xf32>, memref<?x?xf32>
+// CHECK-NEXT: region.env_region_yield [[ALLOC2]] : memref<?x?xf32>
+
+func.func @test_copy_region() -> (memref<?x?xf32>, memref<?x?xf32>) {
+  %c5 = arith.constant 5 : index
+  %alloc = memref.alloc(%c5, %c5) : memref<?x?xf32>
+  %0 = region.env_region "string_attr" -> memref<?x?xf32> {
+    %memref = gpu.alloc (%c5, %c5) : memref<?x?xf32>
+    memref.copy %alloc, %memref : memref<?x?xf32> to memref<?x?xf32>
+    region.env_region_yield %memref : memref<?x?xf32>
+  }
+  return %alloc, %0 : memref<?x?xf32>, memref<?x?xf32>
+}
+// CHECK-LABEL: func.func @test_copy_region
+// CHECK: [[ALLOC:%.*]] = memref.alloc
+// CHECK: region.env_region "string_attr"
+// CHECK-NEXT: [[ALLOC2:%.*]] = gpu.alloc
+// CHECK-NEXT: gpu.memcpy [[ALLOC2]], [[ALLOC]] : memref<?x?xf32>, memref<?x?xf32>
+// CHECK-NEXT: region.env_region_yield [[ALLOC2]] : memref<?x?xf32>
+
+func.func @test_copy() -> (memref<?x?xf32>, memref<?x?xf32>) {
+  %c5 = arith.constant 5 : index
+  %alloc = memref.alloc(%c5, %c5) : memref<?x?xf32>
+  %memref = memref.alloc(%c5, %c5) : memref<?x?xf32>
+  memref.copy %alloc, %memref : memref<?x?xf32> to memref<?x?xf32>
+  return %alloc, %memref : memref<?x?xf32>, memref<?x?xf32>
+}
+// CHECK-LABEL: func.func @test_copy
+// CHECK: [[ALLOC:%.*]] = memref.alloc
+// CHECK: [[MEMREF:%.*]] = memref.alloc
+
+// CHECK: memref.copy [[ALLOC]], [[MEMREF]] : memref<?x?xf32> to memref<?x?xf32>
diff --git a/test/Transforms/tile-loops.mlir b/test/Transforms/tile-loops.mlir
new file mode 100644
index 000000000..098999f1a
--- /dev/null
+++ b/test/Transforms/tile-loops.mlir
@@ -0,0 +1,38 @@
+// RUN: imex-opt --split-input-file -tile-loops='tile-sizes=32' -tile-loops='tile-sizes=1' %s -verify-diagnostics -o - | FileCheck %s
+
+#map = affine_map<(d0) -> (d0)>
+module {
+  func.func @add(%arg0: tensor<129xf32>, %arg1: tensor<129xf32>, %arg2: tensor<129xf32>) -> tensor<129xf32> {
+    %0 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<129xf32>, tensor<129xf32>) outs(%arg2 : tensor<129xf32>) {
+    ^bb0(%in: f32, %in_0: f32, %out: f32):
+      %1 = arith.addf %in, %in_0 : f32
+      linalg.yield %1 : f32
+    } -> tensor<129xf32>
+    return %0 : tensor<129xf32>
+  }
+}
+// CHECK-LABEL: func.func @add
+// CHECK-NEXT: %[[FORALL:.*]] = scf.forall (%arg3) = (0) to (129) step (32) shared_outs(%arg4 = %arg2) -> (tensor<129xf32>) {
+// CHECK-NEXT: %[[C129:.*]] = arith.constant 129 : index
+// CHECK-NEXT: %[[MIN:.*]] = affine.min #map(%[[ARG3:.*]])
+// CHECK-NEXT: %[[APPLY1:.*]] = affine.apply #map1(%[[MIN]])
+// CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %arg0[%[[ARG3]]] [%[[MIN]]] [1] : tensor<129xf32> to tensor<?xf32>
+// CHECK-NEXT: %[[EXTRACTED_SLICE_0:.*]] = tensor.extract_slice %arg1[%[[ARG3]]] [%[[MIN]]] [1] : tensor<129xf32> to tensor<?xf32>
+// CHECK-NEXT: %[[EXTRACTED_SLICE_1:.*]] = tensor.extract_slice %arg4[%[[ARG3]]] [%[[MIN]]] [1] : tensor<129xf32> to tensor<?xf32>
+// CHECK-NEXT: %[[C0:.*]] = arith.constant 0 : index
+// CHECK: %[[FORALL:.*]] = scf.forall (%[[ARG5:.*]]) in (%[[MIN]]) shared_outs(%[[ARG6:.*]] = %[[EXTRACTED_SLICE_1]]) -> (tensor<?xf32>) {
+// CHECK-NEXT: %[[EXTRACTED_SLICE_4:.*]] = tensor.extract_slice %[[EXTRACTED_SLICE]][%[[ARG5]]] [1] [1] : tensor<?xf32> to tensor<1xf32>
+// CHECK-NEXT: %[[EXTRACTED_SLICE_5:.*]] = tensor.extract_slice %[[EXTRACTED_SLICE_0]][%[[ARG5]]] [1] [1] : tensor<?xf32> to tensor<1xf32>
+// CHECK-NEXT: %[[EXTRACTED_SLICE_6:.*]] = tensor.extract_slice %[[ARG6]][%[[ARG5]]] [1] [1] : tensor<?xf32> to tensor<1xf32>
+// CHECK-NEXT: %[[GENERIC:.*]] = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%[[EXTRACTED_SLICE_4]], %[[EXTRACTED_SLICE_5]] : tensor<1xf32>, tensor<1xf32>) outs(%[[EXTRACTED_SLICE_6]] : tensor<1xf32>) {
+// CHECK-NEXT: ^bb0(%[[IN:.*]]: f32, %[[IN_7:.*]]: f32, %[[OUT:.*]]: f32):
+// CHECK-NEXT: %[[ADDF:.*]] = arith.addf %[[IN]], %[[IN_7]] : f32
+// CHECK-NEXT: linalg.yield %[[ADDF]] : f32
+// CHECK-NEXT: } -> tensor<1xf32>
+// CHECK-NEXT: scf.forall.in_parallel {
+// CHECK-NEXT: tensor.parallel_insert_slice %[[GENERIC]] into %[[ARG6]][%[[ARG5]]] [1] [1] : tensor<1xf32> into tensor<?xf32>
+// CHECK-NEXT: }
+// CHECK-NEXT: }
+// CHECK: scf.forall.in_parallel {
+// CHECK-NEXT: tensor.parallel_insert_slice %[[FORALL]] into %arg4[%[[ARG3]]] [%[[MIN]]] [1] : tensor<?xf32> into tensor<129xf32>
+// CHECK-NEXT: }
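+
+// Note: the nested scf.forall ops above result from running -tile-loops twice
+// (tile-sizes=32, then tile-sizes=1); per the pass description, the outer and
+// inner loops can then be mapped to GPU blocks and threads, respectively.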