From ccd3f50eea04fb53511fe514233866a2fd304f1e Mon Sep 17 00:00:00 2001 From: zane-neo Date: Wed, 16 Oct 2024 21:51:05 +0800 Subject: [PATCH] fix cluster not able to spin up issue when disk usage exceeds threshold (#15258) * fix cluster not able to spin up issue when disk usage exceeds threshold Signed-off-by: zane-neo * Add comment to changes Signed-off-by: zane-neo * Add UT to ensure the keepAliveThread starts before node starts Signed-off-by: zane-neo * remove unused imports Signed-off-by: zane-neo * Fix forbidden API calls check failed issue Signed-off-by: zane-neo * format code Signed-off-by: zane-neo * format code Signed-off-by: zane-neo * change setInstance method to static Signed-off-by: zane-neo * Add countdownlatch in test to coordinate the thread to avoid concureency issue caused test failure Signed-off-by: zane-neo --------- Signed-off-by: zane-neo --- CHANGELOG.md | 2 + .../opensearch/bootstrap/BootstrapTests.java | 42 +++++++++++++++++++ .../org/opensearch/bootstrap/Bootstrap.java | 21 +++++++++- 3 files changed, 63 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 73db9bb9ed7af..c581f9ae8811e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -83,6 +83,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - Fix protobuf-java leak through client library dependencies ([#16254](https://github.com/opensearch-project/OpenSearch/pull/16254)) - Fix multi-search with template doesn't return status code ([#16265](https://github.com/opensearch-project/OpenSearch/pull/16265)) - Fix wrong default value when setting `index.number_of_routing_shards` to null on index creation ([#16331](https://github.com/opensearch-project/OpenSearch/pull/16331)) +- Fix disk usage exceeds threshold cluster can't spin up issue ([#15258](https://github.com/opensearch-project/OpenSearch/pull/15258))) + ### Security diff --git a/distribution/tools/keystore-cli/src/test/java/org/opensearch/bootstrap/BootstrapTests.java b/distribution/tools/keystore-cli/src/test/java/org/opensearch/bootstrap/BootstrapTests.java index e9219de218aef..7aa63a2736a8c 100644 --- a/distribution/tools/keystore-cli/src/test/java/org/opensearch/bootstrap/BootstrapTests.java +++ b/distribution/tools/keystore-cli/src/test/java/org/opensearch/bootstrap/BootstrapTests.java @@ -31,6 +31,7 @@ package org.opensearch.bootstrap; +import org.opensearch.common.logging.LogConfigurator; import org.opensearch.common.settings.KeyStoreCommandTestCase; import org.opensearch.common.settings.KeyStoreWrapper; import org.opensearch.common.settings.SecureSettings; @@ -38,6 +39,7 @@ import org.opensearch.common.util.io.IOUtils; import org.opensearch.core.common.settings.SecureString; import org.opensearch.env.Environment; +import org.opensearch.node.Node; import org.opensearch.test.OpenSearchTestCase; import org.junit.After; import org.junit.Before; @@ -51,8 +53,14 @@ import java.nio.file.Path; import java.util.ArrayList; import java.util.List; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; import static org.hamcrest.Matchers.equalTo; +import static org.mockito.Mockito.doAnswer; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.verify; public class BootstrapTests extends OpenSearchTestCase { Environment env; @@ -131,4 +139,38 @@ private void assertPassphraseRead(String source, String expected) { } } + public void testInitExecutionOrder() throws Exception { + AtomicInteger order = new AtomicInteger(0); + CountDownLatch countDownLatch = new CountDownLatch(1); + Thread mockThread = new Thread(() -> { + assertEquals(0, order.getAndIncrement()); + countDownLatch.countDown(); + }); + + Node mockNode = mock(Node.class); + doAnswer(invocation -> { + try { + boolean threadStarted = countDownLatch.await(1000, TimeUnit.MILLISECONDS); + assertTrue( + "Waited for one second but the keepAliveThread isn't started, please check the execution order of" + + "keepAliveThread.start and node.start", + threadStarted + ); + } catch (InterruptedException e) { + fail("Thread interrupted"); + } + assertEquals(1, order.getAndIncrement()); + return null; + }).when(mockNode).start(); + + LogConfigurator.registerErrorListener(); + Bootstrap testBootstrap = new Bootstrap(mockThread, mockNode); + Bootstrap.setInstance(testBootstrap); + + Bootstrap.startInstance(testBootstrap); + + verify(mockNode).start(); + assertEquals(2, order.get()); + } + } diff --git a/server/src/main/java/org/opensearch/bootstrap/Bootstrap.java b/server/src/main/java/org/opensearch/bootstrap/Bootstrap.java index 4e167d10b99fa..757e2c9da6e49 100644 --- a/server/src/main/java/org/opensearch/bootstrap/Bootstrap.java +++ b/server/src/main/java/org/opensearch/bootstrap/Bootstrap.java @@ -93,6 +93,17 @@ final class Bootstrap { private final Thread keepAliveThread; private final Spawner spawner = new Spawner(); + // For testing purpose + static void setInstance(Bootstrap bootstrap) { + INSTANCE = bootstrap; + } + + // For testing purpose + Bootstrap(Thread keepAliveThread, Node node) { + this.keepAliveThread = keepAliveThread; + this.node = node; + } + /** creates a new instance */ Bootstrap() { keepAliveThread = new Thread(new Runnable() { @@ -336,8 +347,10 @@ private static Environment createEnvironment( } private void start() throws NodeValidationException { - node.start(); + // keepAliveThread should start first than node to ensure the cluster can spin up successfully in edge cases: + // https://github.com/opensearch-project/OpenSearch/issues/14791 keepAliveThread.start(); + node.start(); } static void stop() throws IOException { @@ -410,7 +423,7 @@ static void init(final boolean foreground, final Path pidFile, final boolean qui throw new BootstrapException(e); } - INSTANCE.start(); + startInstance(INSTANCE); // We don't close stderr if `--quiet` is passed, because that // hides fatal startup errors. For example, if OpenSearch is @@ -462,6 +475,10 @@ static void init(final boolean foreground, final Path pidFile, final boolean qui } } + static void startInstance(Bootstrap instance) throws NodeValidationException { + instance.start(); + } + @SuppressForbidden(reason = "System#out") private static void closeSystOut() { System.out.close();