diff --git a/server/src/main/java/org/opensearch/cluster/routing/RoutingNodes.java b/server/src/main/java/org/opensearch/cluster/routing/RoutingNodes.java
index 938a603c459c9..27056559d3ddf 100644
--- a/server/src/main/java/org/opensearch/cluster/routing/RoutingNodes.java
+++ b/server/src/main/java/org/opensearch/cluster/routing/RoutingNodes.java
@@ -44,6 +44,7 @@
import org.opensearch.common.Randomness;
import org.opensearch.common.annotation.PublicApi;
import org.opensearch.common.collect.Tuple;
+import org.opensearch.common.settings.ClusterSettings;
import org.opensearch.core.Assertions;
import org.opensearch.core.index.Index;
import org.opensearch.core.index.shard.ShardId;
@@ -61,12 +62,15 @@
import java.util.NoSuchElementException;
import java.util.Objects;
import java.util.Queue;
+import java.util.Random;
import java.util.Set;
import java.util.function.Function;
import java.util.function.Predicate;
import java.util.stream.Collectors;
import java.util.stream.Stream;
+import static org.opensearch.node.remotestore.RemoteStoreNodeService.isMigratingToRemoteStore;
+
/**
* {@link RoutingNodes} represents a copy the routing information contained in the {@link ClusterState cluster state}.
* It can be either initialized as mutable or immutable (see {@link #RoutingNodes(ClusterState, boolean)}), allowing
@@ -418,6 +422,29 @@ public ShardRouting activeReplicaWithOldestVersion(ShardId shardId) {
.orElse(null);
}
+ /**
+ * Returns one active replica shard on a remote node for the given shard id or null
if
+ * no such replica is found.
+ *
+ * Since we aim to continue moving forward during remote store migration, replicas already migrated to remote nodes
+ * are preferred for primary promotion
+ */
+ public ShardRouting activeReplicaOnRemoteNode(ShardId shardId) {
+ Listtrue
iff all replicas are active for the given shard routing. Otherwise false
*/
@@ -735,11 +762,17 @@ private void unassignPrimaryAndPromoteActiveReplicaIfExists(
RoutingChangesObserver routingChangesObserver
) {
assert failedShard.primary();
- ShardRouting activeReplica;
- if (metadata.isSegmentReplicationEnabled(failedShard.getIndexName())) {
- activeReplica = activeReplicaWithOldestVersion(failedShard.shardId());
- } else {
- activeReplica = activeReplicaWithHighestVersion(failedShard.shardId());
+ ShardRouting activeReplica = null;
+ if (isMigratingToRemoteStore(new ClusterSettings(metadata.settings(), ClusterSettings.BUILT_IN_CLUSTER_SETTINGS))) {
+ // we might not find any replica on remote node
+ activeReplica = activeReplicaOnRemoteNode(failedShard.shardId());
+ }
+ if (activeReplica == null) {
+ if (metadata.isSegmentReplicationEnabled(failedShard.getIndexName())) {
+ activeReplica = activeReplicaWithOldestVersion(failedShard.shardId());
+ } else {
+ activeReplica = activeReplicaWithHighestVersion(failedShard.shardId());
+ }
}
if (activeReplica == null) {
moveToUnassigned(failedShard, unassignedInfo);
diff --git a/server/src/test/java/org/opensearch/cluster/routing/allocation/FailedShardsRoutingTests.java b/server/src/test/java/org/opensearch/cluster/routing/allocation/FailedShardsRoutingTests.java
index db4cedbbbe7b5..04e37e7d958d0 100644
--- a/server/src/test/java/org/opensearch/cluster/routing/allocation/FailedShardsRoutingTests.java
+++ b/server/src/test/java/org/opensearch/cluster/routing/allocation/FailedShardsRoutingTests.java
@@ -40,6 +40,7 @@
import org.opensearch.cluster.metadata.IndexMetadata;
import org.opensearch.cluster.metadata.Metadata;
import org.opensearch.cluster.node.DiscoveryNode;
+import org.opensearch.cluster.node.DiscoveryNodeRole;
import org.opensearch.cluster.node.DiscoveryNodes;
import org.opensearch.cluster.routing.RoutingNodes;
import org.opensearch.cluster.routing.RoutingTable;
@@ -47,13 +48,18 @@
import org.opensearch.cluster.routing.allocation.command.AllocationCommands;
import org.opensearch.cluster.routing.allocation.command.MoveAllocationCommand;
import org.opensearch.cluster.routing.allocation.decider.ClusterRebalanceAllocationDecider;
+import org.opensearch.common.UUIDs;
import org.opensearch.common.settings.Settings;
+import org.opensearch.common.util.FeatureFlags;
import org.opensearch.core.index.shard.ShardId;
import org.opensearch.indices.replication.common.ReplicationType;
+import org.opensearch.node.remotestore.RemoteStoreNodeService;
import org.opensearch.test.VersionUtils;
import java.util.ArrayList;
import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
import java.util.Set;
import static org.opensearch.cluster.ClusterName.CLUSTER_NAME_SETTING;
@@ -61,6 +67,10 @@
import static org.opensearch.cluster.routing.ShardRoutingState.RELOCATING;
import static org.opensearch.cluster.routing.ShardRoutingState.STARTED;
import static org.opensearch.cluster.routing.ShardRoutingState.UNASSIGNED;
+import static org.opensearch.common.util.FeatureFlags.REMOTE_STORE_MIGRATION_EXPERIMENTAL;
+import static org.opensearch.node.remotestore.RemoteStoreNodeAttribute.REMOTE_STORE_CLUSTER_STATE_REPOSITORY_NAME_ATTRIBUTE_KEY;
+import static org.opensearch.node.remotestore.RemoteStoreNodeService.MIGRATION_DIRECTION_SETTING;
+import static org.opensearch.node.remotestore.RemoteStoreNodeService.REMOTE_STORE_COMPATIBILITY_MODE_SETTING;
import static org.hamcrest.Matchers.anyOf;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.lessThan;
@@ -812,4 +822,134 @@ private void testReplicaIsPromoted(boolean isSegmentReplicationEnabled) {
}
}
}
+
+ public void testPreferReplicaOnRemoteNodeForPrimaryPromotion() {
+ FeatureFlags.initializeFeatureFlags(Settings.builder().put(REMOTE_STORE_MIGRATION_EXPERIMENTAL, "true").build());
+ AllocationService allocation = createAllocationService(Settings.builder().build());
+
+ // segment replication enabled
+ Settings.Builder settingsBuilder = settings(Version.CURRENT).put(IndexMetadata.SETTING_REPLICATION_TYPE, ReplicationType.SEGMENT);
+
+ // remote store migration metadata settings
+ Metadata metadata = Metadata.builder()
+ .put(IndexMetadata.builder("test").settings(settingsBuilder).numberOfShards(1).numberOfReplicas(4))
+ .persistentSettings(
+ Settings.builder()
+ .put(REMOTE_STORE_COMPATIBILITY_MODE_SETTING.getKey(), RemoteStoreNodeService.CompatibilityMode.MIXED.mode)
+ .put(MIGRATION_DIRECTION_SETTING.getKey(), RemoteStoreNodeService.Direction.REMOTE_STORE.direction)
+ .build()
+ )
+ .build();
+
+ RoutingTable initialRoutingTable = RoutingTable.builder().addAsNew(metadata.index("test")).build();
+
+ ClusterState clusterState = ClusterState.builder(CLUSTER_NAME_SETTING.getDefault(Settings.EMPTY))
+ .metadata(metadata)
+ .routingTable(initialRoutingTable)
+ .build();
+
+ ShardId shardId = new ShardId(metadata.index("test").getIndex(), 0);
+
+ // add a remote node and start primary shard
+ Map