Skip to content

Commit

Permalink
feat: TOOLS-2873 add indexes-memory-budget healthcheck (#285)
Browse files: browse the repository at this point in the history
* feat: TOOLS-2873 add indexes-memory-budget healthcheck

* fix: TOOLS-2895 available_bin_names healthcheck trigger
  • Loading branch information
Jesse S authored Apr 22, 2024
1 parent 3031818 commit a3cdcb2
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 15 deletions.
19 changes: 10 additions & 9 deletions lib/health/health_checker.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,15 +100,15 @@ def _create_health_result_dict(self):
res[HealthResultType.STATUS_COUNTERS] = copy.deepcopy(self.status_counters)

res[HealthResultType.EXCEPTIONS] = {}
res[HealthResultType.EXCEPTIONS][
HealthResultType.EXCEPTIONS_SYNTAX
] = copy.deepcopy(self.syntax_exceptions)
res[HealthResultType.EXCEPTIONS][
HealthResultType.EXCEPTIONS_PROCESSING
] = copy.deepcopy(self.health_exceptions)
res[HealthResultType.EXCEPTIONS][
HealthResultType.EXCEPTIONS_OTHER
] = copy.deepcopy(self.other_exceptions)
res[HealthResultType.EXCEPTIONS][HealthResultType.EXCEPTIONS_SYNTAX] = (
copy.deepcopy(self.syntax_exceptions)
)
res[HealthResultType.EXCEPTIONS][HealthResultType.EXCEPTIONS_PROCESSING] = (
copy.deepcopy(self.health_exceptions)
)
res[HealthResultType.EXCEPTIONS][HealthResultType.EXCEPTIONS_OTHER] = (
copy.deepcopy(self.other_exceptions)
)

res[HealthResultType.ASSERT] = copy.deepcopy(self.assert_outputs)
res[HealthResultType.DEBUG_MESSAGES] = copy.deepcopy(self.debug_outputs)
Expand Down Expand Up @@ -349,6 +349,7 @@ def _execute_queries(self, query_source=None, is_source_file=True):
if self.no_valid_version:
self._increment_counter(HealthResultCounter.QUERY_SKIPPED_COUNTER)
continue

if self._is_assert_query(query):
self._increment_counter(HealthResultCounter.ASSERT_QUERY_COUNTER)

Expand Down
22 changes: 16 additions & 6 deletions lib/health/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,12 +215,12 @@
/* NB: add checks for nodes that are not homogeneous (memory, number of CPUs, etc.) */
SET CONSTRAINT VERSION < 7.0
SET CONSTRAINT VERSION < 7.0;
s = select "available_bin_names", "available-bin-names" from NAMESPACE save;
r = group by NAMESPACE do s > 3200;
r = group by NAMESPACE do s > 6400;
ASSERT(r, True, "Low namespace available bin names.", "LIMITS", WARNING,
"Listed node[s] have low available bin name (< 3200) for corresponding namespace[s]. Maximum unique bin names allowed per namespace are 32k. Please run 'show statistics namespace like available' to get actual values. Possible improperly modeled data.",
"Listed node[s] have low available bin name (< 6400) for corresponding namespace[s]. Maximum unique bin names allowed per namespace are 64k. Please run 'show statistics namespace like available' to get actual values. Possible improperly modeled data.",
"Namespace available bin names check.");
Expand Down Expand Up @@ -329,6 +329,15 @@
ASSERT(warn, True, "Low namespace disk available pct.", "OPERATIONS", WARNING,
"Listed namespace[s] have lower than normal (< 20 %) available disk space. Probable cause - namespace size misconfiguration.",
"Namespace disk available pct check.");
SET CONSTRAINT VERSION >= 7.1.0;
used_bytes = select "index_used_bytes" as "stats" from NAMESPACE.STATISTICS save;
stop_used_bytes = select "indexes-memory-budget" as "stats" from NAMESPACE.CONFIG save;
budget_configured = do stop_used_bytes > 0;
critical = do used_bytes <= stop_used_bytes;
ASSERT(critical, True, "High namespace index memory used pct (stop-write enabled).", "OPERATIONS", CRITICAL,
"Listed namespace[s] have higher than normal memory usage for indexes. Probable cause - namespace size misconfiguration.",
"Critical Namespace index memory used pct check.", budget_configured);
SET CONSTRAINT VERSION >= 7.0.0;
used = select "data_used_pct" as "stats" from NAMESPACE.STATISTICS save;
Expand Down Expand Up @@ -1567,7 +1576,7 @@
"Non-zero sindex background ops query error check");
// Should be constrained to just 5.7
SET CONSTRAINT VERSION < 6.0
SET CONSTRAINT VERSION < 6.0;
// Scan Background OPS statistics
s = select "scan_ops_bg_complete" as "cnt" from NAMESPACE.STATISTICS;
Expand All @@ -1592,7 +1601,7 @@
"Listed namespace[s] show non-zero scan background ops errors. Please run 'show statistics namespace like scan_ops_bg' to see values.",
"Non-zero scan background ops error check");
SET CONSTRAINT VERSION > 3.9
SET CONSTRAINT VERSION > 3.9;
// Scan Agg statistics
s = select "scan_aggr_complete" as "cnt" from NAMESPACE.STATISTICS;
Expand Down Expand Up @@ -1754,7 +1763,7 @@
// XDR Write statistics
SET CONSTRAINT VERSION < 4.5.1
SET CONSTRAINT VERSION < 4.5.1;
s = select "xdr_write_success" as "cnt", "xdr_client_write_success" as "cnt" from NAMESPACE.STATISTICS;
t = select "xdr_write_timeout" as "cnt" from NAMESPACE.STATISTICS;
Expand Down Expand Up @@ -1950,6 +1959,7 @@
"Namespace partition-tree-sprigs check for Community edition",
e);
// Should be further restricted to < 7.0;
SET CONSTRAINT VERSION >= 4.2;
cs = select "cluster_size" from SERVICE.STATISTICS;
Expand Down

0 comments on commit a3cdcb2

Please sign in to comment.