From dd5fe57285c3572dd306743a4c95b4d5c234a919 Mon Sep 17 00:00:00 2001 From: Kunal Kotwani Date: Mon, 31 Jul 2023 09:22:40 -0700 Subject: [PATCH] [Backport 2.x] Add safeguards limits for file cache (#8989) * Add safeguard limits for file cache during node level allocation (#8208) Signed-off-by: Kunal Kotwani (cherry picked from commit 91bfa01606974b947455fbc289e21a1aad096fa8) * Add restore level safeguards to prevent file cache oversubscription (#8606) Signed-off-by: Kunal Kotwani (cherry picked from commit a3aab67ee86bf171a7eb480f0933e0b955fbf4f3) --- CHANGELOG.md | 2 + .../cluster/ClusterInfoServiceIT.java | 28 +++ .../restore/RestoreSnapshotRequest.java | 2 +- .../org/opensearch/cluster/ClusterInfo.java | 24 +- .../cluster/InternalClusterInfoService.java | 15 +- .../cluster/routing/RoutingTable.java | 10 + .../decider/DiskThresholdDecider.java | 52 +++++ .../common/settings/ClusterSettings.java | 2 + .../store/remote/filecache/FileCache.java | 16 ++ .../main/java/org/opensearch/node/Node.java | 5 +- .../opensearch/snapshots/RestoreService.java | 82 ++++++- .../restore/RestoreSnapshotRequestTests.java | 4 + .../opensearch/cluster/ClusterInfoTests.java | 24 +- .../cluster/routing/RoutingTableTests.java | 43 ++++ .../allocation/DiskThresholdMonitorTests.java | 2 +- ...dexShardConstraintDeciderOverlapTests.java | 2 +- .../RemoteShardsBalancerBaseTestCase.java | 2 +- .../decider/DiskThresholdDeciderTests.java | 207 +++++++++++++++++- .../DiskThresholdDeciderUnitTests.java | 13 +- .../snapshots/SnapshotResiliencyTests.java | 143 +++++++++++- .../MockInternalClusterInfoService.java | 3 +- .../opensearch/test/OpenSearchTestCase.java | 10 + 22 files changed, 662 insertions(+), 29 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0016514fcdc68..8eb1832bb17c0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - Add server version as REST response header [#6583](https://github.com/opensearch-project/OpenSearch/issues/6583) - Start replication checkpointTimers on primary before segments upload to remote store. ([#8221]()https://github.com/opensearch-project/OpenSearch/pull/8221) - Introduce new static cluster setting to control slice computation for concurrent segment search. ([#8847](https://github.com/opensearch-project/OpenSearch/pull/8884)) +- Add configuration for file cache size to max remote data ratio to prevent oversubscription of file cache ([#8606](https://github.com/opensearch-project/OpenSearch/pull/8606)) ### Dependencies - Bump `org.apache.logging.log4j:log4j-core` from 2.17.1 to 2.20.0 ([#8307](https://github.com/opensearch-project/OpenSearch/pull/8307)) @@ -33,6 +34,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - Create separate SourceLookup instance per segment slice in SignificantTextAggregatorFactory ([#8807](https://github.com/opensearch-project/OpenSearch/pull/8807)) - Replace the deprecated IndexReader APIs with new storedFields() & termVectors() ([#7792](https://github.com/opensearch-project/OpenSearch/pull/7792)) - [Remote Store] Add support to restore only unassigned shards of an index ([#8792](https://github.com/opensearch-project/OpenSearch/pull/8792)) +- Add safeguard limits for file cache during node level allocation ([#8208](https://github.com/opensearch-project/OpenSearch/pull/8208)) ### Deprecated diff --git a/server/src/internalClusterTest/java/org/opensearch/cluster/ClusterInfoServiceIT.java b/server/src/internalClusterTest/java/org/opensearch/cluster/ClusterInfoServiceIT.java index 17e8526acfd74..508b8e21e42c1 100644 --- a/server/src/internalClusterTest/java/org/opensearch/cluster/ClusterInfoServiceIT.java +++ b/server/src/internalClusterTest/java/org/opensearch/cluster/ClusterInfoServiceIT.java @@ -50,6 +50,7 @@ import org.opensearch.core.common.Strings; import org.opensearch.index.IndexService; import org.opensearch.index.shard.IndexShard; +import org.opensearch.index.store.remote.filecache.FileCacheStats; import org.opensearch.index.store.Store; import org.opensearch.indices.IndicesService; import org.opensearch.indices.SystemIndexDescriptor; @@ -192,6 +193,11 @@ public void testClusterInfoServiceCollectsInformation() { logger.info("--> shard size: {}", size); assertThat("shard size is greater than 0", size, greaterThanOrEqualTo(0L)); } + + final Map nodeFileCacheStats = info.nodeFileCacheStats; + assertNotNull(nodeFileCacheStats); + assertThat("file cache is empty on non search nodes", nodeFileCacheStats.size(), Matchers.equalTo(0)); + ClusterService clusterService = internalTestCluster.getInstance(ClusterService.class, internalTestCluster.getClusterManagerName()); ClusterState state = clusterService.state(); for (ShardRouting shard : state.routingTable().allShards()) { @@ -209,6 +215,28 @@ public void testClusterInfoServiceCollectsInformation() { } } + public void testClusterInfoServiceCollectsFileCacheInformation() { + internalCluster().startNodes(1); + internalCluster().ensureAtLeastNumSearchAndDataNodes(2); + + InternalTestCluster internalTestCluster = internalCluster(); + // Get the cluster info service on the cluster-manager node + final InternalClusterInfoService infoService = (InternalClusterInfoService) internalTestCluster.getInstance( + ClusterInfoService.class, + internalTestCluster.getClusterManagerName() + ); + infoService.setUpdateFrequency(TimeValue.timeValueMillis(200)); + ClusterInfo info = infoService.refresh(); + assertNotNull("info should not be null", info); + final Map nodeFileCacheStats = info.nodeFileCacheStats; + assertNotNull(nodeFileCacheStats); + assertThat("file cache is enabled on both search nodes", nodeFileCacheStats.size(), Matchers.equalTo(2)); + + for (FileCacheStats fileCacheStats : nodeFileCacheStats.values()) { + assertThat("file cache is non empty", fileCacheStats.getTotal().getBytes(), greaterThan(0L)); + } + } + public void testClusterInfoServiceInformationClearOnError() { internalCluster().startNodes( 2, diff --git a/server/src/main/java/org/opensearch/action/admin/cluster/snapshots/restore/RestoreSnapshotRequest.java b/server/src/main/java/org/opensearch/action/admin/cluster/snapshots/restore/RestoreSnapshotRequest.java index 6c995b95d9a5f..123048c75985c 100644 --- a/server/src/main/java/org/opensearch/action/admin/cluster/snapshots/restore/RestoreSnapshotRequest.java +++ b/server/src/main/java/org/opensearch/action/admin/cluster/snapshots/restore/RestoreSnapshotRequest.java @@ -532,7 +532,7 @@ public String snapshotUuid() { /** * Sets the storage type for this request. */ - RestoreSnapshotRequest storageType(StorageType storageType) { + public RestoreSnapshotRequest storageType(StorageType storageType) { this.storageType = storageType; return this; } diff --git a/server/src/main/java/org/opensearch/cluster/ClusterInfo.java b/server/src/main/java/org/opensearch/cluster/ClusterInfo.java index 1acdf76c27172..c0832d7d8f2ae 100644 --- a/server/src/main/java/org/opensearch/cluster/ClusterInfo.java +++ b/server/src/main/java/org/opensearch/cluster/ClusterInfo.java @@ -32,6 +32,7 @@ package org.opensearch.cluster; +import org.opensearch.Version; import org.opensearch.cluster.routing.ShardRouting; import org.opensearch.common.unit.ByteSizeValue; import org.opensearch.core.common.io.stream.StreamInput; @@ -41,6 +42,7 @@ import org.opensearch.core.xcontent.XContentBuilder; import org.opensearch.core.index.shard.ShardId; import org.opensearch.index.store.StoreStats; +import org.opensearch.index.store.remote.filecache.FileCacheStats; import java.io.IOException; import java.util.Collections; @@ -64,9 +66,10 @@ public class ClusterInfo implements ToXContentFragment, Writeable { public static final ClusterInfo EMPTY = new ClusterInfo(); final Map routingToDataPath; final Map reservedSpace; + final Map nodeFileCacheStats; protected ClusterInfo() { - this(Map.of(), Map.of(), Map.of(), Map.of(), Map.of()); + this(Map.of(), Map.of(), Map.of(), Map.of(), Map.of(), Map.of()); } /** @@ -84,13 +87,15 @@ public ClusterInfo( final Map mostAvailableSpaceUsage, final Map shardSizes, final Map routingToDataPath, - final Map reservedSpace + final Map reservedSpace, + final Map nodeFileCacheStats ) { this.leastAvailableSpaceUsage = leastAvailableSpaceUsage; this.shardSizes = shardSizes; this.mostAvailableSpaceUsage = mostAvailableSpaceUsage; this.routingToDataPath = routingToDataPath; this.reservedSpace = reservedSpace; + this.nodeFileCacheStats = nodeFileCacheStats; } public ClusterInfo(StreamInput in) throws IOException { @@ -110,6 +115,11 @@ public ClusterInfo(StreamInput in) throws IOException { this.shardSizes = Collections.unmodifiableMap(sizeMap); this.routingToDataPath = Collections.unmodifiableMap(routingMap); this.reservedSpace = Collections.unmodifiableMap(reservedSpaceMap); + if (in.getVersion().onOrAfter(Version.V_2_10_0)) { + this.nodeFileCacheStats = in.readMap(StreamInput::readString, FileCacheStats::new); + } else { + this.nodeFileCacheStats = Map.of(); + } } @Override @@ -121,6 +131,9 @@ public void writeTo(StreamOutput out) throws IOException { if (out.getVersion().onOrAfter(StoreStats.RESERVED_BYTES_VERSION)) { out.writeMap(this.reservedSpace, (o, v) -> v.writeTo(o), (o, v) -> v.writeTo(o)); } + if (out.getVersion().onOrAfter(Version.V_2_10_0)) { + out.writeMap(this.nodeFileCacheStats, StreamOutput::writeString, (o, v) -> v.writeTo(o)); + } } public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException { @@ -194,6 +207,13 @@ public Map getNodeMostAvailableDiskUsages() { return Collections.unmodifiableMap(this.mostAvailableSpaceUsage); } + /** + * Returns a node id to file cache stats mapping for the nodes that have search roles assigned to it. + */ + public Map getNodeFileCacheStats() { + return Collections.unmodifiableMap(this.nodeFileCacheStats); + } + /** * Returns the shard size for the given shard routing or null it that metric is not available. */ diff --git a/server/src/main/java/org/opensearch/cluster/InternalClusterInfoService.java b/server/src/main/java/org/opensearch/cluster/InternalClusterInfoService.java index 0acc7bece439f..9c12d6bb3e7ea 100644 --- a/server/src/main/java/org/opensearch/cluster/InternalClusterInfoService.java +++ b/server/src/main/java/org/opensearch/cluster/InternalClusterInfoService.java @@ -59,6 +59,7 @@ import org.opensearch.common.util.concurrent.AbstractRunnable; import org.opensearch.core.concurrency.OpenSearchRejectedExecutionException; import org.opensearch.index.store.StoreStats; +import org.opensearch.index.store.remote.filecache.FileCacheStats; import org.opensearch.monitor.fs.FsInfo; import org.opensearch.threadpool.ThreadPool; import org.opensearch.transport.ReceiveTimeoutTransportException; @@ -72,6 +73,7 @@ import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicReference; import java.util.function.Consumer; +import java.util.stream.Collectors; /** * InternalClusterInfoService provides the ClusterInfoService interface, @@ -110,6 +112,7 @@ public class InternalClusterInfoService implements ClusterInfoService, ClusterSt private volatile Map leastAvailableSpaceUsages; private volatile Map mostAvailableSpaceUsages; + private volatile Map nodeFileCacheStats; private volatile IndicesStatsSummary indicesStatsSummary; // null if this node is not currently the cluster-manager private final AtomicReference refreshAndRescheduleRunnable = new AtomicReference<>(); @@ -122,6 +125,7 @@ public class InternalClusterInfoService implements ClusterInfoService, ClusterSt public InternalClusterInfoService(Settings settings, ClusterService clusterService, ThreadPool threadPool, Client client) { this.leastAvailableSpaceUsages = Map.of(); this.mostAvailableSpaceUsages = Map.of(); + this.nodeFileCacheStats = Map.of(); this.indicesStatsSummary = IndicesStatsSummary.EMPTY; this.threadPool = threadPool; this.client = client; @@ -208,7 +212,8 @@ public ClusterInfo getClusterInfo() { mostAvailableSpaceUsages, indicesStatsSummary.shardSizes, indicesStatsSummary.shardRoutingToDataPath, - indicesStatsSummary.reservedSpace + indicesStatsSummary.reservedSpace, + nodeFileCacheStats ); } @@ -221,6 +226,7 @@ protected CountDownLatch updateNodeStats(final ActionListener(listener, latch)); return latch; @@ -264,6 +270,13 @@ public void onResponse(NodesStatsResponse nodesStatsResponse) { ); leastAvailableSpaceUsages = Collections.unmodifiableMap(leastAvailableUsagesBuilder); mostAvailableSpaceUsages = Collections.unmodifiableMap(mostAvailableUsagesBuilder); + + nodeFileCacheStats = Collections.unmodifiableMap( + nodesStatsResponse.getNodes() + .stream() + .filter(nodeStats -> nodeStats.getNode().isSearchNode()) + .collect(Collectors.toMap(nodeStats -> nodeStats.getNode().getId(), NodeStats::getFileCacheStats)) + ); } @Override diff --git a/server/src/main/java/org/opensearch/cluster/routing/RoutingTable.java b/server/src/main/java/org/opensearch/cluster/routing/RoutingTable.java index 7934649a6d3eb..d6a67bc714689 100644 --- a/server/src/main/java/org/opensearch/cluster/routing/RoutingTable.java +++ b/server/src/main/java/org/opensearch/cluster/routing/RoutingTable.java @@ -295,6 +295,16 @@ public ShardsIterator allShardsIncludingRelocationTargets(String[] indices) { return allShardsSatisfyingPredicate(indices, shardRouting -> true, true); } + /** + * All the shards on the node which match the predicate + * @param predicate condition to match + * @return iterator over shards matching the predicate + */ + public ShardsIterator allShardsSatisfyingPredicate(Predicate predicate) { + String[] indices = indicesRouting.keySet().toArray(new String[0]); + return allShardsSatisfyingPredicate(indices, predicate, false); + } + private ShardsIterator allShardsSatisfyingPredicate( String[] indices, Predicate predicate, diff --git a/server/src/main/java/org/opensearch/cluster/routing/allocation/decider/DiskThresholdDecider.java b/server/src/main/java/org/opensearch/cluster/routing/allocation/decider/DiskThresholdDecider.java index 04d671596d7e1..61b96184abcc4 100644 --- a/server/src/main/java/org/opensearch/cluster/routing/allocation/decider/DiskThresholdDecider.java +++ b/server/src/main/java/org/opensearch/cluster/routing/allocation/decider/DiskThresholdDecider.java @@ -54,14 +54,21 @@ import org.opensearch.common.unit.ByteSizeValue; import org.opensearch.core.index.Index; import org.opensearch.core.index.shard.ShardId; +import org.opensearch.index.store.remote.filecache.FileCacheStats; import org.opensearch.snapshots.SnapshotShardSizeInfo; import java.util.List; import java.util.Map; import java.util.Set; +import java.util.stream.Collectors; +import java.util.stream.StreamSupport; +import static org.opensearch.cluster.routing.RoutingPool.REMOTE_CAPABLE; +import static org.opensearch.cluster.routing.RoutingPool.getNodePool; +import static org.opensearch.cluster.routing.RoutingPool.getShardPool; import static org.opensearch.cluster.routing.allocation.DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING; import static org.opensearch.cluster.routing.allocation.DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING; +import static org.opensearch.index.store.remote.filecache.FileCache.DATA_TO_FILE_CACHE_SIZE_RATIO_SETTING; /** * The {@link DiskThresholdDecider} checks that the node a shard is potentially @@ -167,6 +174,42 @@ public static long sizeOfRelocatingShards( @Override public Decision canAllocate(ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) { ClusterInfo clusterInfo = allocation.clusterInfo(); + + /* + The following block enables allocation for remote shards within safeguard limits of the filecache. + */ + if (REMOTE_CAPABLE.equals(getNodePool(node)) && REMOTE_CAPABLE.equals(getShardPool(shardRouting, allocation))) { + final List remoteShardsOnNode = StreamSupport.stream(node.spliterator(), false) + .filter(shard -> shard.primary() && REMOTE_CAPABLE.equals(getShardPool(shard, allocation))) + .collect(Collectors.toList()); + final long currentNodeRemoteShardSize = remoteShardsOnNode.stream() + .map(ShardRouting::getExpectedShardSize) + .mapToLong(Long::longValue) + .sum(); + + final long shardSize = getExpectedShardSize( + shardRouting, + 0L, + allocation.clusterInfo(), + allocation.snapshotShardSizeInfo(), + allocation.metadata(), + allocation.routingTable() + ); + + final FileCacheStats fileCacheStats = clusterInfo.getNodeFileCacheStats().getOrDefault(node.nodeId(), null); + final long nodeCacheSize = fileCacheStats != null ? fileCacheStats.getTotal().getBytes() : 0; + final long totalNodeRemoteShardSize = currentNodeRemoteShardSize + shardSize; + final double dataToFileCacheSizeRatio = DATA_TO_FILE_CACHE_SIZE_RATIO_SETTING.get(allocation.metadata().settings()); + if (dataToFileCacheSizeRatio > 0.0f && totalNodeRemoteShardSize > dataToFileCacheSizeRatio * nodeCacheSize) { + return allocation.decision( + Decision.NO, + NAME, + "file cache limit reached - remote shard size will exceed configured safeguard ratio" + ); + } + return Decision.YES; + } + Map usages = clusterInfo.getNodeMostAvailableDiskUsages(); final Decision decision = earlyTerminate(allocation, usages); if (decision != null) { @@ -422,6 +465,15 @@ public Decision canRemain(ShardRouting shardRouting, RoutingNode node, RoutingAl if (shardRouting.currentNodeId().equals(node.nodeId()) == false) { throw new IllegalArgumentException("Shard [" + shardRouting + "] is not allocated on node: [" + node.nodeId() + "]"); } + + /* + The following block prevents movement for remote shards since they do not use the local storage as + the primary source of data storage. + */ + if (REMOTE_CAPABLE.equals(getNodePool(node)) && REMOTE_CAPABLE.equals(getShardPool(shardRouting, allocation))) { + return Decision.ALWAYS; + } + final ClusterInfo clusterInfo = allocation.clusterInfo(); final Map usages = clusterInfo.getNodeLeastAvailableDiskUsages(); final Decision decision = earlyTerminate(allocation, usages); diff --git a/server/src/main/java/org/opensearch/common/settings/ClusterSettings.java b/server/src/main/java/org/opensearch/common/settings/ClusterSettings.java index f494cf91e0db9..b932f6983aafd 100644 --- a/server/src/main/java/org/opensearch/common/settings/ClusterSettings.java +++ b/server/src/main/java/org/opensearch/common/settings/ClusterSettings.java @@ -46,6 +46,7 @@ import org.opensearch.index.ShardIndexingPressureSettings; import org.opensearch.index.ShardIndexingPressureStore; import org.opensearch.search.SearchBootstrapSettings; +import org.opensearch.index.store.remote.filecache.FileCache; import org.opensearch.search.backpressure.settings.NodeDuressSettings; import org.opensearch.search.backpressure.settings.SearchBackpressureSettings; import org.opensearch.search.backpressure.settings.SearchShardTaskSettings; @@ -646,6 +647,7 @@ public void apply(Settings value, Settings current, Settings previous) { // Settings related to Searchable Snapshots Node.NODE_SEARCH_CACHE_SIZE_SETTING, + FileCache.DATA_TO_FILE_CACHE_SIZE_RATIO_SETTING, // Settings related to Remote Refresh Segment Pressure RemoteRefreshSegmentPressureSettings.REMOTE_REFRESH_SEGMENT_PRESSURE_ENABLED, diff --git a/server/src/main/java/org/opensearch/index/store/remote/filecache/FileCache.java b/server/src/main/java/org/opensearch/index/store/remote/filecache/FileCache.java index 0aa3740fb6ecb..47b891fdb8d21 100644 --- a/server/src/main/java/org/opensearch/index/store/remote/filecache/FileCache.java +++ b/server/src/main/java/org/opensearch/index/store/remote/filecache/FileCache.java @@ -11,6 +11,7 @@ import org.apache.lucene.store.IndexInput; import org.opensearch.common.breaker.CircuitBreaker; import org.opensearch.common.breaker.CircuitBreakingException; +import org.opensearch.common.settings.Setting; import org.opensearch.index.store.remote.utils.cache.CacheUsage; import org.opensearch.index.store.remote.utils.cache.RefCountedCache; import org.opensearch.index.store.remote.utils.cache.SegmentedCache; @@ -49,6 +50,21 @@ public class FileCache implements RefCountedCache { private final CircuitBreaker circuitBreaker; + /** + * Defines a limit of how much total remote data can be referenced as a ratio of the size of the disk reserved for + * the file cache. For example, if 100GB disk space is configured for use as a file cache and the + * remote_data_ratio of 5 is defined, then a total of 500GB of remote data can be loaded as searchable snapshots. + * This is designed to be a safeguard to prevent oversubscribing a cluster. + * Specify a value of zero for no limit, which is the default for compatibility reasons. + */ + public static final Setting DATA_TO_FILE_CACHE_SIZE_RATIO_SETTING = Setting.doubleSetting( + "cluster.filecache.remote_data_ratio", + 0.0, + 0.0, + Setting.Property.NodeScope, + Setting.Property.Dynamic + ); + public FileCache(SegmentedCache cache, CircuitBreaker circuitBreaker) { this.theCache = cache; this.circuitBreaker = circuitBreaker; diff --git a/server/src/main/java/org/opensearch/node/Node.java b/server/src/main/java/org/opensearch/node/Node.java index 4159b38d62dc8..9916d1e1e1ab3 100644 --- a/server/src/main/java/org/opensearch/node/Node.java +++ b/server/src/main/java/org/opensearch/node/Node.java @@ -945,8 +945,9 @@ protected Node( clusterModule.getAllocationService(), metadataCreateIndexService, metadataIndexUpgradeService, - clusterService.getClusterSettings(), - shardLimitValidator + shardLimitValidator, + indicesService, + clusterInfoService::getClusterInfo ); final DiskThresholdMonitor diskThresholdMonitor = new DiskThresholdMonitor( diff --git a/server/src/main/java/org/opensearch/snapshots/RestoreService.java b/server/src/main/java/org/opensearch/snapshots/RestoreService.java index 6bf2459c51a1d..029c5b0377d26 100644 --- a/server/src/main/java/org/opensearch/snapshots/RestoreService.java +++ b/server/src/main/java/org/opensearch/snapshots/RestoreService.java @@ -42,6 +42,7 @@ import org.opensearch.action.admin.cluster.snapshots.restore.RestoreSnapshotRequest; import org.opensearch.action.support.IndicesOptions; import org.opensearch.cluster.ClusterChangedEvent; +import org.opensearch.cluster.ClusterInfo; import org.opensearch.cluster.ClusterState; import org.opensearch.cluster.ClusterStateApplier; import org.opensearch.cluster.ClusterStateTaskConfig; @@ -70,6 +71,7 @@ import org.opensearch.cluster.routing.RoutingChangesObserver; import org.opensearch.cluster.routing.RoutingTable; import org.opensearch.cluster.routing.ShardRouting; +import org.opensearch.cluster.routing.ShardsIterator; import org.opensearch.cluster.routing.UnassignedInfo; import org.opensearch.cluster.routing.allocation.AllocationService; import org.opensearch.cluster.service.ClusterManagerTaskKeys; @@ -88,6 +90,9 @@ import org.opensearch.index.IndexSettings; import org.opensearch.index.shard.IndexShard; import org.opensearch.core.index.shard.ShardId; +import org.opensearch.index.snapshots.IndexShardSnapshotStatus; +import org.opensearch.index.store.remote.filecache.FileCacheStats; +import org.opensearch.indices.IndicesService; import org.opensearch.indices.ShardLimitValidator; import org.opensearch.repositories.IndexId; import org.opensearch.repositories.RepositoriesService; @@ -105,6 +110,7 @@ import java.util.Set; import java.util.function.Function; import java.util.function.Predicate; +import java.util.function.Supplier; import java.util.stream.Collectors; import static java.util.Collections.unmodifiableSet; @@ -120,6 +126,8 @@ import static org.opensearch.common.util.FeatureFlags.SEARCHABLE_SNAPSHOT_EXTENDED_COMPATIBILITY; import static org.opensearch.common.util.set.Sets.newHashSet; import static org.opensearch.index.store.remote.directory.RemoteSnapshotDirectory.SEARCHABLE_SNAPSHOT_EXTENDED_COMPATIBILITY_MINIMUM_VERSION; +import static org.opensearch.index.store.remote.filecache.FileCache.DATA_TO_FILE_CACHE_SIZE_RATIO_SETTING; +import static org.opensearch.node.Node.NODE_SEARCH_CACHE_SIZE_SETTING; import static org.opensearch.snapshots.SnapshotUtils.filterIndices; /** @@ -178,6 +186,10 @@ public class RestoreService implements ClusterStateApplier { private final ClusterSettings clusterSettings; + private final IndicesService indicesService; + + private final Supplier clusterInfoSupplier; + private final ClusterManagerTaskThrottler.ThrottlingKey restoreSnapshotTaskKey; private static final CleanRestoreStateTaskExecutor cleanRestoreStateTaskExecutor = new CleanRestoreStateTaskExecutor(); @@ -188,8 +200,9 @@ public RestoreService( AllocationService allocationService, MetadataCreateIndexService createIndexService, MetadataIndexUpgradeService metadataIndexUpgradeService, - ClusterSettings clusterSettings, - ShardLimitValidator shardLimitValidator + ShardLimitValidator shardLimitValidator, + IndicesService indicesService, + Supplier clusterInfoSupplier ) { this.clusterService = clusterService; this.repositoriesService = repositoriesService; @@ -201,6 +214,8 @@ public RestoreService( } this.clusterSettings = clusterService.getClusterSettings(); this.shardLimitValidator = shardLimitValidator; + this.indicesService = indicesService; + this.clusterInfoSupplier = clusterInfoSupplier; // Task is onboarded for throttling, it will get retried from associated TransportClusterManagerNodeAction. restoreSnapshotTaskKey = clusterService.registerClusterManagerTask(ClusterManagerTaskKeys.RESTORE_SNAPSHOT_KEY, true); @@ -447,7 +462,9 @@ public ClusterState execute(ClusterState currentState) { ClusterBlocks.Builder blocks = ClusterBlocks.builder().blocks(currentState.blocks()); RoutingTable.Builder rtBuilder = RoutingTable.builder(currentState.routingTable()); final Map shards; + final boolean isRemoteSnapshot = IndexModule.Type.REMOTE_SNAPSHOT.match(request.storageType().toString()); Set aliases = new HashSet<>(); + long totalRestorableRemoteIndexesSize = 0; if (indices.isEmpty() == false) { // We have some indices to restore @@ -458,17 +475,14 @@ public ClusterState execute(ClusterState currentState) { String index = indexEntry.getValue(); boolean partial = checkPartial(index); + IndexId snapshotIndexId = repositoryData.resolveIndexId(index); IndexMetadata snapshotIndexMetadata = updateIndexSettings( metadata.index(index), request.indexSettings(), request.ignoreIndexSettings() ); - if (IndexModule.Type.REMOTE_SNAPSHOT.match(request.storageType().toString())) { - snapshotIndexMetadata = addSnapshotToIndexSettings( - snapshotIndexMetadata, - snapshot, - repositoryData.resolveIndexId(index) - ); + if (isRemoteSnapshot) { + snapshotIndexMetadata = addSnapshotToIndexSettings(snapshotIndexMetadata, snapshot, snapshotIndexId); } final boolean isSearchableSnapshot = IndexModule.Type.REMOTE_SNAPSHOT.match( snapshotIndexMetadata.getSettings().get(IndexModule.INDEX_STORE_TYPE_SETTING.getKey()) @@ -494,7 +508,7 @@ public ClusterState execute(ClusterState currentState) { restoreUUID, snapshot, snapshotInfo.version(), - repositoryData.resolveIndexId(index), + snapshotIndexId, isSearchableSnapshot, isRemoteStoreShallowCopy, request.getSourceRemoteStoreRepository() @@ -618,6 +632,14 @@ public ClusterState execute(ClusterState currentState) { } for (int shard = 0; shard < snapshotIndexMetadata.getNumberOfShards(); shard++) { + if (isRemoteSnapshot) { + IndexShardSnapshotStatus.Copy shardStatus = repository.getShardSnapshotStatus( + snapshotInfo.snapshotId(), + snapshotIndexId, + new ShardId(metadata.index(index).getIndex(), shard) + ).asCopy(); + totalRestorableRemoteIndexesSize += shardStatus.getTotalSize(); + } if (!ignoreShards.contains(shard)) { shardsBuilder.put( new ShardId(renamedIndex, shard), @@ -654,6 +676,9 @@ public ClusterState execute(ClusterState currentState) { } checkAliasNameConflicts(indices, aliases); + if (isRemoteSnapshot) { + validateSearchableSnapshotRestorable(totalRestorableRemoteIndexesSize); + } Map updatedDataStreams = new HashMap<>(currentState.metadata().dataStreams()); updatedDataStreams.putAll( @@ -853,6 +878,45 @@ private IndexMetadata updateIndexSettings( return builder.settings(settingsBuilder).build(); } + private void validateSearchableSnapshotRestorable(long totalRestorableRemoteIndexesSize) { + ClusterInfo clusterInfo = clusterInfoSupplier.get(); + double remoteDataToFileCacheRatio = DATA_TO_FILE_CACHE_SIZE_RATIO_SETTING.get(clusterService.getSettings()); + Map nodeFileCacheStats = clusterInfo.getNodeFileCacheStats(); + if (nodeFileCacheStats.isEmpty() || remoteDataToFileCacheRatio <= 0.01f) { + return; + } + + long totalNodeFileCacheSize = clusterInfo.getNodeFileCacheStats() + .values() + .stream() + .map(fileCacheStats -> fileCacheStats.getTotal().getBytes()) + .mapToLong(Long::longValue) + .sum(); + + Predicate isRemoteSnapshotShard = shardRouting -> shardRouting.primary() + && indicesService.indexService(shardRouting.index()).getIndexSettings().isRemoteSnapshot(); + + ShardsIterator shardsIterator = clusterService.state() + .routingTable() + .allShardsSatisfyingPredicate(isRemoteSnapshotShard); + + long totalRestoredRemoteIndexesSize = shardsIterator.getShardRoutings() + .stream() + .map(clusterInfo::getShardSize) + .mapToLong(Long::longValue) + .sum(); + + if (totalRestoredRemoteIndexesSize + totalRestorableRemoteIndexesSize > remoteDataToFileCacheRatio + * totalNodeFileCacheSize) { + throw new SnapshotRestoreException( + snapshot, + "Size of the indexes to be restored exceeds the file cache bounds. Increase the file cache capacity on the cluster nodes using " + + NODE_SEARCH_CACHE_SIZE_SETTING.getKey() + + " setting." + ); + } + } + @Override public void onFailure(String source, Exception e) { logger.warn(() -> new ParameterizedMessage("[{}] failed to restore snapshot", snapshotId), e); diff --git a/server/src/test/java/org/opensearch/action/admin/cluster/snapshots/restore/RestoreSnapshotRequestTests.java b/server/src/test/java/org/opensearch/action/admin/cluster/snapshots/restore/RestoreSnapshotRequestTests.java index 5c20b3b262730..82b2cfb2e3e51 100644 --- a/server/src/test/java/org/opensearch/action/admin/cluster/snapshots/restore/RestoreSnapshotRequestTests.java +++ b/server/src/test/java/org/opensearch/action/admin/cluster/snapshots/restore/RestoreSnapshotRequestTests.java @@ -112,6 +112,10 @@ private RestoreSnapshotRequest randomState(RestoreSnapshotRequest instance) { instance.snapshotUuid(randomBoolean() ? null : randomAlphaOfLength(10)); } + instance.storageType( + randomBoolean() ? RestoreSnapshotRequest.StorageType.LOCAL : RestoreSnapshotRequest.StorageType.REMOTE_SNAPSHOT + ); + if (randomBoolean()) { instance.setSourceRemoteStoreRepository(randomAlphaOfLengthBetween(5, 10)); } diff --git a/server/src/test/java/org/opensearch/cluster/ClusterInfoTests.java b/server/src/test/java/org/opensearch/cluster/ClusterInfoTests.java index 6be4a5a7f6e66..4ec7db2f3d552 100644 --- a/server/src/test/java/org/opensearch/cluster/ClusterInfoTests.java +++ b/server/src/test/java/org/opensearch/cluster/ClusterInfoTests.java @@ -36,6 +36,7 @@ import org.opensearch.cluster.routing.TestShardRouting; import org.opensearch.common.io.stream.BytesStreamOutput; import org.opensearch.core.index.shard.ShardId; +import org.opensearch.index.store.remote.filecache.FileCacheStats; import org.opensearch.test.OpenSearchTestCase; import java.util.HashMap; @@ -49,7 +50,8 @@ public void testSerialization() throws Exception { randomDiskUsage(), randomShardSizes(), randomRoutingToDataPath(), - randomReservedSpace() + randomReservedSpace(), + randomFileCacheStats() ); BytesStreamOutput output = new BytesStreamOutput(); clusterInfo.writeTo(output); @@ -60,6 +62,7 @@ public void testSerialization() throws Exception { assertEquals(clusterInfo.shardSizes, result.shardSizes); assertEquals(clusterInfo.routingToDataPath, result.routingToDataPath); assertEquals(clusterInfo.reservedSpace, result.reservedSpace); + assertEquals(clusterInfo.getNodeFileCacheStats().size(), result.getNodeFileCacheStats().size()); } private static Map randomDiskUsage() { @@ -79,6 +82,25 @@ private static Map randomDiskUsage() { return builder; } + private static Map randomFileCacheStats() { + int numEntries = randomIntBetween(0, 16); + final Map builder = new HashMap<>(numEntries); + for (int i = 0; i < numEntries; i++) { + String key = randomAlphaOfLength(16); + FileCacheStats fileCacheStats = new FileCacheStats( + randomLong(), + randomLong(), + randomLong(), + randomLong(), + randomLong(), + randomLong(), + randomLong() + ); + builder.put(key, fileCacheStats); + } + return builder; + } + private static Map randomShardSizes() { int numEntries = randomIntBetween(0, 128); final Map builder = new HashMap<>(numEntries); diff --git a/server/src/test/java/org/opensearch/cluster/routing/RoutingTableTests.java b/server/src/test/java/org/opensearch/cluster/routing/RoutingTableTests.java index 0ff9d6f07751a..53f1d71947f7c 100644 --- a/server/src/test/java/org/opensearch/cluster/routing/RoutingTableTests.java +++ b/server/src/test/java/org/opensearch/cluster/routing/RoutingTableTests.java @@ -231,6 +231,49 @@ public void testShardsMatchingPredicateCount() { assertThat(clusterState.routingTable().shardsMatchingPredicateCount(predicate), is(2)); } + public void testAllShardsMatchingPredicate() { + MockAllocationService allocation = createAllocationService(Settings.EMPTY, new DelayedShardsMockGatewayAllocator()); + Metadata metadata = Metadata.builder() + .put(IndexMetadata.builder("test1").settings(settings(Version.CURRENT)).numberOfShards(1).numberOfReplicas(1)) + .put(IndexMetadata.builder("test2").settings(settings(Version.CURRENT)).numberOfShards(1).numberOfReplicas(1)) + .build(); + ClusterState clusterState = ClusterState.builder(org.opensearch.cluster.ClusterName.CLUSTER_NAME_SETTING.getDefault(Settings.EMPTY)) + .metadata(metadata) + .routingTable(RoutingTable.builder().addAsNew(metadata.index("test1")).addAsNew(metadata.index("test2")).build()) + .build(); + clusterState = ClusterState.builder(clusterState) + .nodes(DiscoveryNodes.builder().add(newNode("node1")).add(newNode("node2"))) + .build(); + clusterState = allocation.reroute(clusterState, "reroute"); + + Predicate predicate = s -> s.state() == ShardRoutingState.UNASSIGNED && s.unassignedInfo().isDelayed(); + assertThat(clusterState.routingTable().allShardsSatisfyingPredicate(predicate).size(), is(0)); + + // starting primaries + clusterState = startInitializingShardsAndReroute(allocation, clusterState); + // starting replicas + clusterState = startInitializingShardsAndReroute(allocation, clusterState); + // remove node2 and reroute + clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()).remove("node2")).build(); + // make sure both replicas are marked as delayed (i.e. not reallocated) + clusterState = allocation.disassociateDeadNodes(clusterState, true, "reroute"); + assertThat(clusterState.routingTable().allShardsSatisfyingPredicate(predicate).size(), is(2)); + + // Verifies true against all shards on the node (active/inactive) + assertThat(clusterState.routingTable().allShardsSatisfyingPredicate(shard -> true).size(), is(4)); + // Verifies false against all shards on the node (active/inactive) + assertThat(clusterState.routingTable().allShardsSatisfyingPredicate(shard -> false).size(), is(0)); + // Verifies against all primary shards on the node + assertThat(clusterState.routingTable().allShardsSatisfyingPredicate(ShardRouting::primary).size(), is(2)); + // Verifies a predicate which tests for inactive replicas + assertThat( + clusterState.routingTable() + .allShardsSatisfyingPredicate(shardRouting -> !shardRouting.primary() && !shardRouting.active()) + .size(), + is(2) + ); + } + public void testActivePrimaryShardsGrouped() { assertThat(this.emptyRoutingTable.activePrimaryShardsGrouped(new String[0], true).size(), is(0)); assertThat(this.emptyRoutingTable.activePrimaryShardsGrouped(new String[0], false).size(), is(0)); diff --git a/server/src/test/java/org/opensearch/cluster/routing/allocation/DiskThresholdMonitorTests.java b/server/src/test/java/org/opensearch/cluster/routing/allocation/DiskThresholdMonitorTests.java index 18140f3123a43..a8f8296cf9591 100644 --- a/server/src/test/java/org/opensearch/cluster/routing/allocation/DiskThresholdMonitorTests.java +++ b/server/src/test/java/org/opensearch/cluster/routing/allocation/DiskThresholdMonitorTests.java @@ -798,7 +798,7 @@ private static ClusterInfo clusterInfo( final Map diskUsages, final Map reservedSpace ) { - return new ClusterInfo(diskUsages, null, null, null, reservedSpace); + return new ClusterInfo(diskUsages, null, null, null, reservedSpace, Map.of()); } } diff --git a/server/src/test/java/org/opensearch/cluster/routing/allocation/IndexShardConstraintDeciderOverlapTests.java b/server/src/test/java/org/opensearch/cluster/routing/allocation/IndexShardConstraintDeciderOverlapTests.java index 6bfa39f0c8ffd..cf32d2b3cf00f 100644 --- a/server/src/test/java/org/opensearch/cluster/routing/allocation/IndexShardConstraintDeciderOverlapTests.java +++ b/server/src/test/java/org/opensearch/cluster/routing/allocation/IndexShardConstraintDeciderOverlapTests.java @@ -176,7 +176,7 @@ public DevNullClusterInfo( final Map shardSizes, final Map reservedSpace ) { - super(leastAvailableSpaceUsage, mostAvailableSpaceUsage, shardSizes, null, reservedSpace); + super(leastAvailableSpaceUsage, mostAvailableSpaceUsage, shardSizes, null, reservedSpace, Map.of()); } @Override diff --git a/server/src/test/java/org/opensearch/cluster/routing/allocation/RemoteShardsBalancerBaseTestCase.java b/server/src/test/java/org/opensearch/cluster/routing/allocation/RemoteShardsBalancerBaseTestCase.java index 9d7d0ebc5b2b1..dbb08a999877d 100644 --- a/server/src/test/java/org/opensearch/cluster/routing/allocation/RemoteShardsBalancerBaseTestCase.java +++ b/server/src/test/java/org/opensearch/cluster/routing/allocation/RemoteShardsBalancerBaseTestCase.java @@ -239,7 +239,7 @@ public DevNullClusterInfo( final Map mostAvailableSpaceUsage, final Map shardSizes ) { - super(leastAvailableSpaceUsage, mostAvailableSpaceUsage, shardSizes, null, Map.of()); + super(leastAvailableSpaceUsage, mostAvailableSpaceUsage, shardSizes, null, Map.of(), Map.of()); } @Override diff --git a/server/src/test/java/org/opensearch/cluster/routing/allocation/decider/DiskThresholdDeciderTests.java b/server/src/test/java/org/opensearch/cluster/routing/allocation/decider/DiskThresholdDeciderTests.java index 99bc6476aa228..bde8a45359814 100644 --- a/server/src/test/java/org/opensearch/cluster/routing/allocation/decider/DiskThresholdDeciderTests.java +++ b/server/src/test/java/org/opensearch/cluster/routing/allocation/decider/DiskThresholdDeciderTests.java @@ -69,6 +69,8 @@ import org.opensearch.common.settings.Settings; import org.opensearch.core.index.Index; import org.opensearch.core.index.shard.ShardId; +import org.opensearch.index.store.remote.filecache.FileCache; +import org.opensearch.index.store.remote.filecache.FileCacheStats; import org.opensearch.repositories.IndexId; import org.opensearch.snapshots.EmptySnapshotsInfoService; import org.opensearch.snapshots.InternalSnapshotsInfoService.SnapshotShard; @@ -82,6 +84,7 @@ import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Set; import java.util.concurrent.atomic.AtomicReference; import static java.util.Collections.emptyMap; @@ -282,6 +285,191 @@ public void testDiskThreshold() { assertThat(clusterState.getRoutingNodes().node("node4").size(), equalTo(1)); } + public void testDiskThresholdForRemoteShards() { + Settings diskSettings = Settings.builder() + .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_THRESHOLD_ENABLED_SETTING.getKey(), true) + .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), 0.7) + .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), 0.8) + .build(); + + Map usages = new HashMap<>(); + usages.put("node1", new DiskUsage("node1", "node1", "/dev/null", 100, 10)); // 90% used + usages.put("node2", new DiskUsage("node2", "node2", "/dev/null", 100, 35)); // 65% used + usages.put("node3", new DiskUsage("node3", "node3", "/dev/null", 100, 60)); // 40% used + + Map shardSizes = new HashMap<>(); + shardSizes.put("[test][0][p]", 10L); // 10 bytes + shardSizes.put("[test][0][r]", 10L); + + Map fileCacheStatsMap = new HashMap<>(); + fileCacheStatsMap.put("node1", new FileCacheStats(0, 0, 1000, 0, 0, 0, 0)); + fileCacheStatsMap.put("node2", new FileCacheStats(0, 0, 1000, 0, 0, 0, 0)); + fileCacheStatsMap.put("node3", new FileCacheStats(0, 0, 1000, 0, 0, 0, 0)); + final ClusterInfo clusterInfo = new DevNullClusterInfo(usages, usages, shardSizes, fileCacheStatsMap); + + ClusterSettings clusterSettings = new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS); + AllocationDeciders deciders = new AllocationDeciders( + new HashSet<>(Arrays.asList(new SameShardAllocationDecider(Settings.EMPTY, clusterSettings), makeDecider(diskSettings))) + ); + + ClusterInfoService cis = () -> { + logger.info("--> calling fake getClusterInfo"); + return clusterInfo; + }; + AllocationService strategy = new AllocationService( + deciders, + new TestGatewayAllocator(), + new BalancedShardsAllocator(Settings.EMPTY), + cis, + EmptySnapshotsInfoService.INSTANCE + ); + + Metadata metadata = Metadata.builder() + .put(IndexMetadata.builder("test").settings(remoteIndexSettings(Version.CURRENT)).numberOfShards(1).numberOfReplicas(1)) + .build(); + + final RoutingTable initialRoutingTable = RoutingTable.builder().addAsNew(metadata.index("test")).build(); + + ClusterState clusterState = ClusterState.builder(ClusterName.CLUSTER_NAME_SETTING.getDefault(Settings.EMPTY)) + .metadata(metadata) + .routingTable(initialRoutingTable) + .build(); + + Set defaultWithSearchRole = new HashSet<>(CLUSTER_MANAGER_DATA_ROLES); + defaultWithSearchRole.add(DiscoveryNodeRole.SEARCH_ROLE); + + logger.info("--> adding two nodes"); + clusterState = ClusterState.builder(clusterState) + .nodes(DiscoveryNodes.builder().add(newNode("node1", defaultWithSearchRole)).add(newNode("node2", defaultWithSearchRole))) + .build(); + clusterState = strategy.reroute(clusterState, "reroute"); + logShardStates(clusterState); + + // Primary shard should be initializing, replica should not + assertThat(clusterState.getRoutingNodes().shardsWithState(INITIALIZING).size(), equalTo(2)); + + logger.info("--> start the shards (primaries)"); + clusterState = startInitializingShardsAndReroute(strategy, clusterState); + + logShardStates(clusterState); + // Assert that we're able to start the primary + assertThat(clusterState.getRoutingNodes().shardsWithState(ShardRoutingState.STARTED).size(), equalTo(2)); + + logger.info("--> adding node3"); + + clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()).add(newNode("node3"))).build(); + clusterState = strategy.reroute(clusterState, "reroute"); + + logShardStates(clusterState); + // Assert that the replica is initialized now that node3 is available with enough space + assertThat(clusterState.getRoutingNodes().shardsWithState(ShardRoutingState.STARTED).size(), equalTo(2)); + assertThat(clusterState.getRoutingNodes().shardsWithState(ShardRoutingState.INITIALIZING).size(), equalTo(0)); + + logger.info("--> start the shards (replicas)"); + clusterState = startInitializingShardsAndReroute(strategy, clusterState); + + logShardStates(clusterState); + // Assert that the replica couldn't be started since node1 doesn't have enough space + assertThat(clusterState.getRoutingNodes().shardsWithState(ShardRoutingState.STARTED).size(), equalTo(2)); + assertThat(clusterState.getRoutingNodes().node("node1").size(), equalTo(1)); + assertThat(clusterState.getRoutingNodes().node("node2").size(), equalTo(1)); + assertThat(clusterState.getRoutingNodes().node("node3").size(), equalTo(0)); + } + + public void testFileCacheRemoteShardsDecisions() { + Settings diskSettings = Settings.builder() + .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_THRESHOLD_ENABLED_SETTING.getKey(), true) + .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), "60%") + .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), "70%") + .build(); + + // We have an index with 2 primary shards each taking 40 bytes. Each node has 100 bytes available + final Map usages = new HashMap<>(); + usages.put("node1", new DiskUsage("node1", "n1", "/dev/null", 100, 20)); // 80% used + usages.put("node2", new DiskUsage("node2", "n2", "/dev/null", 100, 100)); // 0% used + + final Map shardSizes = new HashMap<>(); + shardSizes.put("[test][0][p]", 40L); + shardSizes.put("[test][1][p]", 40L); + shardSizes.put("[foo][0][p]", 10L); + + // First node has filecache size as 0, second has 1000, greater than the shard sizes. + Map fileCacheStatsMap = new HashMap<>(); + fileCacheStatsMap.put("node1", new FileCacheStats(0, 0, 0, 0, 0, 0, 0)); + fileCacheStatsMap.put("node2", new FileCacheStats(0, 0, 1000, 0, 0, 0, 0)); + + final ClusterInfo clusterInfo = new DevNullClusterInfo(usages, usages, shardSizes, fileCacheStatsMap); + + Set defaultWithSearchRole = new HashSet<>(CLUSTER_MANAGER_DATA_ROLES); + defaultWithSearchRole.add(DiscoveryNodeRole.SEARCH_ROLE); + + DiskThresholdDecider diskThresholdDecider = makeDecider(diskSettings); + Metadata metadata = Metadata.builder() + .put(IndexMetadata.builder("test").settings(remoteIndexSettings(Version.CURRENT)).numberOfShards(2).numberOfReplicas(0)) + .persistentSettings(Settings.builder().put(FileCache.DATA_TO_FILE_CACHE_SIZE_RATIO_SETTING.getKey(), 5).build()) + .build(); + + RoutingTable initialRoutingTable = RoutingTable.builder().addAsNew(metadata.index("test")).build(); + + DiscoveryNode discoveryNode1 = new DiscoveryNode( + "node1", + buildNewFakeTransportAddress(), + emptyMap(), + defaultWithSearchRole, + Version.CURRENT + ); + DiscoveryNode discoveryNode2 = new DiscoveryNode( + "node2", + buildNewFakeTransportAddress(), + emptyMap(), + defaultWithSearchRole, + Version.CURRENT + ); + DiscoveryNodes discoveryNodes = DiscoveryNodes.builder().add(discoveryNode1).add(discoveryNode2).build(); + + ClusterState baseClusterState = ClusterState.builder(ClusterName.CLUSTER_NAME_SETTING.getDefault(Settings.EMPTY)) + .metadata(metadata) + .routingTable(initialRoutingTable) + .nodes(discoveryNodes) + .build(); + + // Two shards consuming each 80% of disk space while 70% is allowed, so shard 0 isn't allowed here + ShardRouting firstRouting = TestShardRouting.newShardRouting("test", 0, "node1", null, true, ShardRoutingState.STARTED); + ShardRouting secondRouting = TestShardRouting.newShardRouting("test", 1, "node1", null, true, ShardRoutingState.STARTED); + RoutingNode firstRoutingNode = new RoutingNode("node1", discoveryNode1, firstRouting, secondRouting); + RoutingNode secondRoutingNode = new RoutingNode("node2", discoveryNode2); + + RoutingTable.Builder builder = RoutingTable.builder() + .add( + IndexRoutingTable.builder(firstRouting.index()) + .addIndexShard(new IndexShardRoutingTable.Builder(firstRouting.shardId()).addShard(firstRouting).build()) + .addIndexShard(new IndexShardRoutingTable.Builder(secondRouting.shardId()).addShard(secondRouting).build()) + ); + ClusterState clusterState = ClusterState.builder(baseClusterState).routingTable(builder.build()).build(); + RoutingAllocation routingAllocation = new RoutingAllocation( + null, + new RoutingNodes(clusterState), + clusterState, + clusterInfo, + null, + System.nanoTime() + ); + routingAllocation.debugDecision(true); + Decision decision = diskThresholdDecider.canRemain(firstRouting, firstRoutingNode, routingAllocation); + assertThat(decision.type(), equalTo(Decision.Type.YES)); + + decision = diskThresholdDecider.canAllocate(firstRouting, firstRoutingNode, routingAllocation); + assertThat(decision.type(), equalTo(Decision.Type.NO)); + + assertThat( + ((Decision.Single) decision).getExplanation(), + containsString("file cache limit reached - remote shard size will exceed configured safeguard ratio") + ); + + decision = diskThresholdDecider.canAllocate(firstRouting, secondRoutingNode, routingAllocation); + assertThat(decision.type(), equalTo(Decision.Type.YES)); + } + public void testDiskThresholdWithAbsoluteSizes() { Settings diskSettings = Settings.builder() .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_THRESHOLD_ENABLED_SETTING.getKey(), true) @@ -862,7 +1050,8 @@ public void testShardRelocationsTakenIntoAccount() { Map.of( new ClusterInfo.NodeAndPath("node1", "/dev/null"), new ClusterInfo.ReservedSpace.Builder().add(new ShardId("", "", 0), between(51, 200)).build() - ) + ), + Map.of() ) ); clusterState = applyStartedShardsUntilNoChange(clusterState, strategy); @@ -1454,16 +1643,26 @@ static class DevNullClusterInfo extends ClusterInfo { final Map mostAvailableSpaceUsage, final Map shardSizes ) { - this(leastAvailableSpaceUsage, mostAvailableSpaceUsage, shardSizes, Map.of()); + this(leastAvailableSpaceUsage, mostAvailableSpaceUsage, shardSizes, Map.of(), Map.of()); + } + + DevNullClusterInfo( + final Map leastAvailableSpaceUsage, + final Map mostAvailableSpaceUsage, + final Map shardSizes, + final Map nodeFileCacheStats + ) { + this(leastAvailableSpaceUsage, mostAvailableSpaceUsage, shardSizes, Map.of(), nodeFileCacheStats); } DevNullClusterInfo( final Map leastAvailableSpaceUsage, final Map mostAvailableSpaceUsage, final Map shardSizes, - Map reservedSpace + Map reservedSpace, + final Map nodeFileCacheStats ) { - super(leastAvailableSpaceUsage, mostAvailableSpaceUsage, shardSizes, null, reservedSpace); + super(leastAvailableSpaceUsage, mostAvailableSpaceUsage, shardSizes, null, reservedSpace, nodeFileCacheStats); } @Override diff --git a/server/src/test/java/org/opensearch/cluster/routing/allocation/decider/DiskThresholdDeciderUnitTests.java b/server/src/test/java/org/opensearch/cluster/routing/allocation/decider/DiskThresholdDeciderUnitTests.java index 1de59587a0395..622d3a2c920b5 100644 --- a/server/src/test/java/org/opensearch/cluster/routing/allocation/decider/DiskThresholdDeciderUnitTests.java +++ b/server/src/test/java/org/opensearch/cluster/routing/allocation/decider/DiskThresholdDeciderUnitTests.java @@ -127,7 +127,7 @@ public void testCanAllocateUsesMaxAvailableSpace() { final Map shardSizes = new HashMap<>(); shardSizes.put("[test][0][p]", 10L); // 10 bytes - final ClusterInfo clusterInfo = new ClusterInfo(leastAvailableUsages, mostAvailableUsage, shardSizes, Map.of(), Map.of()); + final ClusterInfo clusterInfo = new ClusterInfo(leastAvailableUsages, mostAvailableUsage, shardSizes, Map.of(), Map.of(), Map.of()); RoutingAllocation allocation = new RoutingAllocation( new AllocationDeciders(Collections.singleton(decider)), clusterState.getRoutingNodes(), @@ -203,7 +203,7 @@ public void testCannotAllocateDueToLackOfDiskResources() { // way bigger than available space final long shardSize = randomIntBetween(110, 1000); shardSizes.put("[test][0][p]", shardSize); - ClusterInfo clusterInfo = new ClusterInfo(leastAvailableUsages, mostAvailableUsage, shardSizes, Map.of(), Map.of()); + ClusterInfo clusterInfo = new ClusterInfo(leastAvailableUsages, mostAvailableUsage, shardSizes, Map.of(), Map.of(), Map.of()); RoutingAllocation allocation = new RoutingAllocation( new AllocationDeciders(Collections.singleton(decider)), clusterState.getRoutingNodes(), @@ -320,7 +320,14 @@ public void testCanRemainUsesLeastAvailableSpace() { shardSizes.put("[test][1][p]", 10L); shardSizes.put("[test][2][p]", 10L); - final ClusterInfo clusterInfo = new ClusterInfo(leastAvailableUsages, mostAvailableUsage, shardSizes, shardRoutingMap, Map.of()); + final ClusterInfo clusterInfo = new ClusterInfo( + leastAvailableUsages, + mostAvailableUsage, + shardSizes, + shardRoutingMap, + Map.of(), + Map.of() + ); RoutingAllocation allocation = new RoutingAllocation( new AllocationDeciders(Collections.singleton(decider)), clusterState.getRoutingNodes(), diff --git a/server/src/test/java/org/opensearch/snapshots/SnapshotResiliencyTests.java b/server/src/test/java/org/opensearch/snapshots/SnapshotResiliencyTests.java index a121a190096b4..e4dec5163c400 100644 --- a/server/src/test/java/org/opensearch/snapshots/SnapshotResiliencyTests.java +++ b/server/src/test/java/org/opensearch/snapshots/SnapshotResiliencyTests.java @@ -34,6 +34,7 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; +import org.mockito.Mockito; import org.opensearch.ExceptionsHelper; import org.opensearch.Version; import org.opensearch.action.ActionListener; @@ -105,6 +106,8 @@ import org.opensearch.client.AdminClient; import org.opensearch.client.node.NodeClient; import org.opensearch.cluster.ClusterChangedEvent; +import org.opensearch.cluster.ClusterInfo; +import org.opensearch.cluster.ClusterInfoService; import org.opensearch.cluster.ClusterModule; import org.opensearch.cluster.ClusterName; import org.opensearch.cluster.ClusterState; @@ -176,7 +179,9 @@ import org.opensearch.index.seqno.RetentionLeaseSyncer; import org.opensearch.index.shard.PrimaryReplicaSyncer; import org.opensearch.index.store.RemoteSegmentStoreDirectoryFactory; +import org.opensearch.index.store.remote.filecache.FileCache; import org.opensearch.index.store.remote.filecache.FileCacheCleaner; +import org.opensearch.index.store.remote.filecache.FileCacheStats; import org.opensearch.indices.IndicesModule; import org.opensearch.indices.IndicesService; import org.opensearch.indices.ShardLimitValidator; @@ -213,6 +218,7 @@ import org.opensearch.test.OpenSearchTestCase; import org.opensearch.test.disruption.DisruptableMockTransport; import org.opensearch.threadpool.ThreadPool; +import org.opensearch.transport.RemoteTransportException; import org.opensearch.transport.TransportException; import org.opensearch.transport.TransportInterceptor; import org.opensearch.transport.TransportRequest; @@ -245,6 +251,7 @@ import static java.util.Collections.emptyMap; import static java.util.Collections.emptySet; +import static org.mockito.Mockito.when; import static org.opensearch.action.support.ActionTestUtils.assertNoFailureListener; import static org.opensearch.env.Environment.PATH_HOME_SETTING; import static org.opensearch.monitor.StatusInfo.Status.HEALTHY; @@ -260,6 +267,7 @@ import static org.hamcrest.Matchers.lessThanOrEqualTo; import static org.hamcrest.Matchers.notNullValue; import static org.mockito.Mockito.mock; +import static org.opensearch.node.Node.NODE_SEARCH_CACHE_SIZE_SETTING; public class SnapshotResiliencyTests extends OpenSearchTestCase { @@ -413,6 +421,106 @@ public void testSuccessfulSnapshotAndRestore() { assertEquals(0, snapshotInfo.failedShards()); } + public void testSearchableSnapshotOverSubscription() { + setupTestCluster(1, 2, 2); + + String repoName = "repo"; + String snapshotName = "snapshot"; + final String index = "test"; + final int shards = randomIntBetween(1, 10); + final int documents = randomIntBetween(0, 100); + + final TestClusterNodes.TestClusterNode clusterManagerNode = testClusterNodes.currentClusterManager( + testClusterNodes.nodes.values().iterator().next().clusterService.state() + ); + + Map nodeFileCacheStats = new HashMap<>(); + for (TestClusterNodes.TestClusterNode node : testClusterNodes.nodes.values()) { + nodeFileCacheStats.put(node.node.getId(), new FileCacheStats(0, 1, 0, 0, 0, 0, 0)); + } + ClusterInfo clusterInfo = new ClusterInfo(Map.of(), Map.of(), Map.of(), Map.of(), Map.of(), nodeFileCacheStats); + testClusterNodes.nodes.values().forEach(node -> when(node.getMockClusterInfoService().getClusterInfo()).thenReturn(clusterInfo)); + + final StepListener createSnapshotResponseListener = new StepListener<>(); + + continueOrDie(createRepoAndIndex(repoName, index, shards), createIndexResponse -> { + final Runnable afterIndexing = () -> client().admin() + .cluster() + .prepareCreateSnapshot(repoName, snapshotName) + .setWaitForCompletion(true) + .execute(createSnapshotResponseListener); + if (documents == 0) { + afterIndexing.run(); + } else { + final BulkRequest bulkRequest = new BulkRequest().setRefreshPolicy(WriteRequest.RefreshPolicy.IMMEDIATE); + for (int i = 0; i < documents; ++i) { + bulkRequest.add(new IndexRequest(index).source(Collections.singletonMap("foo", "bar" + i))); + } + final StepListener bulkResponseStepListener = new StepListener<>(); + client().bulk(bulkRequest, bulkResponseStepListener); + continueOrDie(bulkResponseStepListener, bulkResponse -> { + assertFalse("Failures in bulk response: " + bulkResponse.buildFailureMessage(), bulkResponse.hasFailures()); + assertEquals(documents, bulkResponse.getItems().length); + afterIndexing.run(); + }); + } + }); + + final StepListener deleteIndexListener = new StepListener<>(); + + continueOrDie( + createSnapshotResponseListener, + createSnapshotResponse -> client().admin().indices().delete(new DeleteIndexRequest(index), deleteIndexListener) + ); + + final StepListener restoreSnapshotResponseListener = new StepListener<>(); + continueOrDie( + deleteIndexListener, + ignored -> client().admin() + .cluster() + .restoreSnapshot( + new RestoreSnapshotRequest(repoName, snapshotName).waitForCompletion(true) + .storageType(RestoreSnapshotRequest.StorageType.REMOTE_SNAPSHOT), + restoreSnapshotResponseListener + ) + ); + + final AtomicBoolean exceptionVerified = new AtomicBoolean(); + + restoreSnapshotResponseListener.whenComplete(null, restoreSnapshotException -> { + Throwable throwable = restoreSnapshotException; + if (restoreSnapshotException instanceof RemoteTransportException) { + throwable = restoreSnapshotException.getCause(); + } + try { + assertTrue(throwable instanceof SnapshotRestoreException); + assertTrue( + throwable.getMessage() + .contains( + "Size of the indexes to be restored exceeds the file cache bounds. Increase the file cache capacity on the cluster nodes using " + + NODE_SEARCH_CACHE_SIZE_SETTING.getKey() + + " setting." + ) + ); + } catch (SnapshotRestoreException ignored) {} + exceptionVerified.set(true); + }); + + runUntil(exceptionVerified::get, TimeUnit.MINUTES.toMillis(5L)); + assertTrue(exceptionVerified.get()); + SnapshotsInProgress finalSnapshotsInProgress = clusterManagerNode.clusterService.state().custom(SnapshotsInProgress.TYPE); + assertFalse(finalSnapshotsInProgress.entries().stream().anyMatch(entry -> entry.state().completed() == false)); + final Repository repository = clusterManagerNode.repositoriesService.repository(repoName); + Collection snapshotIds = getRepositoryData(repository).getSnapshotIds(); + assertThat(snapshotIds, hasSize(1)); + + final SnapshotInfo snapshotInfo = repository.getSnapshotInfo(snapshotIds.iterator().next()); + assertEquals(SnapshotState.SUCCESS, snapshotInfo.state()); + assertThat(snapshotInfo.indices(), containsInAnyOrder(index)); + assertEquals(shards, snapshotInfo.successfulShards()); + assertEquals(0, snapshotInfo.failedShards()); + } + public void testSnapshotWithNodeDisconnects() { final int dataNodes = randomIntBetween(2, 10); final int clusterManagerNodes = randomFrom(1, 3, 5); @@ -1415,6 +1523,11 @@ private void setupTestCluster(int clusterManagerNodes, int dataNodes) { startCluster(); } + private void setupTestCluster(int clusterManagerNodes, int dataNodes, int searchNodes) { + testClusterNodes = new TestClusterNodes(clusterManagerNodes, dataNodes, searchNodes); + startCluster(); + } + private void scheduleSoon(Runnable runnable) { deterministicTaskQueue.scheduleAt(deterministicTaskQueue.getCurrentTimeMillis() + randomLongBetween(0, 100L), runnable); } @@ -1465,6 +1578,7 @@ private Environment createEnvironment(String nodeName) { ClusterBootstrapService.INITIAL_CLUSTER_MANAGER_NODES_SETTING.getKey(), ClusterBootstrapService.INITIAL_CLUSTER_MANAGER_NODES_SETTING.get(Settings.EMPTY) ) + .put(FileCache.DATA_TO_FILE_CACHE_SIZE_RATIO_SETTING.getKey(), 5) .put(MappingUpdatedAction.INDICES_MAX_IN_FLIGHT_UPDATES_SETTING.getKey(), 1000) // o.w. some tests might block .build() ); @@ -1488,6 +1602,10 @@ private final class TestClusterNodes { private final Set disconnectedNodes = new HashSet<>(); TestClusterNodes(int clusterManagerNodes, int dataNodes) { + this(clusterManagerNodes, dataNodes, 0); + } + + TestClusterNodes(int clusterManagerNodes, int dataNodes, int searchNodes) { for (int i = 0; i < clusterManagerNodes; ++i) { nodes.computeIfAbsent("node" + i, nodeName -> { try { @@ -1506,6 +1624,15 @@ private final class TestClusterNodes { } }); } + for (int i = 0; i < searchNodes; ++i) { + nodes.computeIfAbsent("search-node" + i, nodeName -> { + try { + return newSearchNode(nodeName); + } catch (IOException e) { + throw new AssertionError(e); + } + }); + } } public TestClusterNode nodeById(final String nodeId) { @@ -1524,6 +1651,10 @@ private TestClusterNode newDataNode(String nodeName) throws IOException { return newNode(nodeName, DiscoveryNodeRole.DATA_ROLE); } + private TestClusterNode newSearchNode(String nodeName) throws IOException { + return newNode(nodeName, DiscoveryNodeRole.SEARCH_ROLE); + } + private TestClusterNode newNode(String nodeName, DiscoveryNodeRole role) throws IOException { return new TestClusterNode( new DiscoveryNode( @@ -1667,6 +1798,8 @@ private final class TestClusterNode { private final ThreadPool threadPool; + private final ClusterInfoService clusterInfoService; + private Coordinator coordinator; TestClusterNode(DiscoveryNode node) throws IOException { @@ -1784,6 +1917,7 @@ public void onFailure(final Exception e) { final NamedXContentRegistry namedXContentRegistry = new NamedXContentRegistry(Collections.emptyList()); final ScriptService scriptService = new ScriptService(settings, emptyMap(), emptyMap()); client = new NodeClient(settings, threadPool); + clusterInfoService = Mockito.mock(ClusterInfoService.class); final SetOnce rerouteServiceSetOnce = new SetOnce<>(); final SnapshotsInfoService snapshotsInfoService = new InternalSnapshotsInfoService( settings, @@ -1993,8 +2127,9 @@ public void onFailure(final Exception e) { new SystemIndices(emptyMap()), null ), - clusterSettings, - shardLimitValidator + shardLimitValidator, + indicesService, + clusterInfoService::getClusterInfo ); actions.put( PutMappingAction.INSTANCE, @@ -2207,6 +2342,10 @@ protected void assertSnapshotOrGenericThread() { } } + public ClusterInfoService getMockClusterInfoService() { + return clusterInfoService; + } + public void restart() { testClusterNodes.disconnectNode(this); final ClusterState oldState = this.clusterService.state(); diff --git a/test/framework/src/main/java/org/opensearch/cluster/MockInternalClusterInfoService.java b/test/framework/src/main/java/org/opensearch/cluster/MockInternalClusterInfoService.java index 6634d1b4dbafc..6354cf18e8b62 100644 --- a/test/framework/src/main/java/org/opensearch/cluster/MockInternalClusterInfoService.java +++ b/test/framework/src/main/java/org/opensearch/cluster/MockInternalClusterInfoService.java @@ -132,7 +132,8 @@ class SizeFakingClusterInfo extends ClusterInfo { delegate.getNodeMostAvailableDiskUsages(), delegate.shardSizes, delegate.routingToDataPath, - delegate.reservedSpace + delegate.reservedSpace, + delegate.nodeFileCacheStats ); } diff --git a/test/framework/src/main/java/org/opensearch/test/OpenSearchTestCase.java b/test/framework/src/main/java/org/opensearch/test/OpenSearchTestCase.java index 788c93d7c6d5d..dee82451ba68b 100644 --- a/test/framework/src/main/java/org/opensearch/test/OpenSearchTestCase.java +++ b/test/framework/src/main/java/org/opensearch/test/OpenSearchTestCase.java @@ -108,6 +108,7 @@ import org.opensearch.env.Environment; import org.opensearch.env.NodeEnvironment; import org.opensearch.env.TestEnvironment; +import org.opensearch.index.IndexModule; import org.opensearch.index.IndexSettings; import org.opensearch.index.analysis.AnalysisRegistry; import org.opensearch.index.analysis.AnalyzerScope; @@ -116,6 +117,7 @@ import org.opensearch.index.analysis.NamedAnalyzer; import org.opensearch.index.analysis.TokenFilterFactory; import org.opensearch.index.analysis.TokenizerFactory; +import org.opensearch.index.store.remote.filecache.FileCache; import org.opensearch.indices.analysis.AnalysisModule; import org.opensearch.monitor.jvm.JvmInfo; import org.opensearch.plugins.AnalysisPlugin; @@ -1198,6 +1200,14 @@ public static Settings.Builder settings(Version version) { return builder; } + public static Settings.Builder remoteIndexSettings(Version version) { + Settings.Builder builder = Settings.builder() + .put(FileCache.DATA_TO_FILE_CACHE_SIZE_RATIO_SETTING.getKey(), 5) + .put(IndexMetadata.SETTING_VERSION_CREATED, version) + .put(IndexModule.INDEX_STORE_TYPE_SETTING.getKey(), IndexModule.Type.REMOTE_SNAPSHOT.getSettingsKey()); + return builder; + } + /** * Returns size random values */