From e553ce15c1b5e565a7dab7daa823bbf3425eeb9c Mon Sep 17 00:00:00 2001 From: Kate Osborn <50597707+kate-osborn@users.noreply.github.com> Date: Fri, 3 Jan 2025 17:17:05 -0700 Subject: [PATCH] Improve usage reporting in automated tests (#2971) Problem: Pipelines are failing intermittently due to r33 reporting errors. Solutions: - In the graceful recovery and reconfig tests, do not fail if there are errors in the NGINX logs. Instead, log the errors and continue. In the reconfig tests, all NGINX error logs are captured in the results file. Since the graceful recovery tests are part of the functional tests that run on every push to a PR and merge to main, it is not desirable to write the error logs to a file. Instead, if an unexpected error is seen in the NGINX logs, the test is skipped and the error is logged in the test output. Since we are sending traffic through NGINX during the graceful recovery tests, we will still catch any unrecoverable NGINX errors. - Reporting for conformance tests is disabled. --- .github/workflows/conformance.yml | 2 +- tests/framework/nginx_log_levels.go | 10 +++ tests/suite/graceful_recovery_test.go | 97 ++++++++++++++++++--------- tests/suite/reconfig_test.go | 8 ++- tests/suite/scale_test.go | 2 +- 5 files changed, 85 insertions(+), 34 deletions(-) create mode 100644 tests/framework/nginx_log_levels.go diff --git a/.github/workflows/conformance.yml b/.github/workflows/conformance.yml index fbf04b5cf9..c89301da54 100644 --- a/.github/workflows/conformance.yml +++ b/.github/workflows/conformance.yml @@ -142,7 +142,7 @@ jobs: - name: Setup license file for plus if: ${{ inputs.image == 'plus' }} env: - PLUS_LICENSE: ${{ secrets.JWT_PLUS_REPORTING }} + PLUS_LICENSE: ${{ secrets.JWT_PLUS_EXCEPTION_REPORTING }} run: echo "${PLUS_LICENSE}" > license.jwt - name: Setup conformance tests diff --git a/tests/framework/nginx_log_levels.go b/tests/framework/nginx_log_levels.go new file mode 100644 index 0000000000..25a3dfb00f --- /dev/null +++ b/tests/framework/nginx_log_levels.go @@ -0,0 +1,10 @@ +package framework + +// NGINX Log Prefixes for various log levels. +const ( + CritNGINXLog = "[crit]" + ErrorNGINXLog = "[error]" + WarnNGINXLog = "[warn]" + AlertNGINXLog = "[alert]" + EmergNGINXLog = "[emerg]" +) diff --git a/tests/suite/graceful_recovery_test.go b/tests/suite/graceful_recovery_test.go index a45b23635a..c952ad4cc7 100644 --- a/tests/suite/graceful_recovery_test.go +++ b/tests/suite/graceful_recovery_test.go @@ -6,6 +6,7 @@ import ( "fmt" "net/http" "os/exec" + "slices" "strings" "time" @@ -28,7 +29,7 @@ const ( ngfContainerName = "nginx-gateway" ) -// Since checkContainerLogsForErrors may experience interference from previous tests (as explained in the function +// Since checkNGFContainerLogsForErrors may experience interference from previous tests (as explained in the function // documentation), this test is recommended to be run separate from other tests. var _ = Describe("Graceful Recovery test", Ordered, Label("graceful-recovery"), func() { files := []string{ @@ -189,6 +190,9 @@ func runRestartNodeTest(teaURL, coffeeURL string, files []string, ns *core.Names } checkNGFFunctionality(teaURL, coffeeURL, ngfPodName, "", files, ns) + if errorLogs := getUnexpectedNginxErrorLogs(ngfPodName); errorLogs != "" { + Skip(fmt.Sprintf("NGINX has unexpected error logs: \n%s", errorLogs)) + } } func runRecoveryTest(teaURL, coffeeURL, ngfPodName, containerName string, files []string, ns *core.Namespace) { @@ -219,6 +223,9 @@ func runRecoveryTest(teaURL, coffeeURL, ngfPodName, containerName string, files } checkNGFFunctionality(teaURL, coffeeURL, ngfPodName, containerName, files, ns) + if errorLogs := getUnexpectedNginxErrorLogs(ngfPodName); errorLogs != "" { + Skip(fmt.Sprintf("NGINX has unexpected error logs: \n%s", errorLogs)) + } } func restartContainer(ngfPodName, containerName string) { @@ -347,13 +354,15 @@ func checkNGFFunctionality(teaURL, coffeeURL, ngfPodName, containerName string, WithPolling(500 * time.Millisecond). Should(Succeed()) - checkContainerLogsForErrors(ngfPodName, containerName == nginxContainerName) + // When the NGINX process is killed, some errors are expected in the NGF logs while we wait for the + // NGINX container to be restarted. Therefore, we don't want to check the NGF logs for errors when the container + // we restarted was NGINX. + if containerName != nginxContainerName { + checkNGFContainerLogsForErrors(ngfPodName) + } } -// checkContainerLogsForErrors checks both nginx and NGF container's logs for any possible errors. -// When the NGINX process is killed, some errors are expected in the NGF logs while we wait for the -// NGINX container to be restarted. -func checkContainerLogsForErrors(ngfPodName string, checkNginxLogsOnly bool) { +func getNginxErrorLogs(ngfPodName string) string { nginxLogs, err := resourceManager.GetPodLogs( ngfNamespace, ngfPodName, @@ -361,38 +370,64 @@ func checkContainerLogsForErrors(ngfPodName string, checkNginxLogsOnly bool) { ) Expect(err).ToNot(HaveOccurred()) + errPrefixes := []string{ + framework.CritNGINXLog, + framework.ErrorNGINXLog, + framework.WarnNGINXLog, + framework.AlertNGINXLog, + framework.EmergNGINXLog, + } + errorLogs := "" + for _, line := range strings.Split(nginxLogs, "\n") { - Expect(line).ToNot(ContainSubstring("[crit]"), line) - Expect(line).ToNot(ContainSubstring("[alert]"), line) - Expect(line).ToNot(ContainSubstring("[emerg]"), line) - if strings.Contains(line, "[error]") { - expectedError1 := "connect() failed (111: Connection refused)" - expectedError2 := "could not be resolved (host not found) during usage report" - expectedError3 := "server returned 429" - // FIXME(salonichf5) remove this error message check - // when https://github.com/nginxinc/nginx-gateway-fabric/issues/2090 is completed. - expectedError4 := "no live upstreams while connecting to upstream" - Expect(line).To(Or( - ContainSubstring(expectedError1), - ContainSubstring(expectedError2), - ContainSubstring(expectedError3), - ContainSubstring(expectedError4), - )) + for _, prefix := range errPrefixes { + if strings.Contains(line, prefix) { + errorLogs += line + "\n" + break + } } } - if !checkNginxLogsOnly { - ngfLogs, err := resourceManager.GetPodLogs( - ngfNamespace, - ngfPodName, - &core.PodLogOptions{Container: ngfContainerName}, - ) - Expect(err).ToNot(HaveOccurred()) + return errorLogs +} + +func getUnexpectedNginxErrorLogs(ngfPodName string) string { + expectedErrStrings := []string{ + "connect() failed (111: Connection refused)", + "could not be resolved (host not found) during usage report", + "server returned 429", + // FIXME(salonichf5) remove this error message check + // when https://github.com/nginxinc/nginx-gateway-fabric/issues/2090 is completed. + "no live upstreams while connecting to upstream", + } - for _, line := range strings.Split(ngfLogs, "\n") { - Expect(line).ToNot(ContainSubstring("\"level\":\"error\""), line) + unexpectedErrors := "" + + errorLogs := getNginxErrorLogs(ngfPodName) + + for _, line := range strings.Split(errorLogs, "\n") { + if !slices.ContainsFunc(expectedErrStrings, func(s string) bool { + return strings.Contains(line, s) + }) { + unexpectedErrors += line } } + + return unexpectedErrors +} + +// checkNGFContainerLogsForErrors checks NGF container's logs for any possible errors. +func checkNGFContainerLogsForErrors(ngfPodName string) { + ngfLogs, err := resourceManager.GetPodLogs( + ngfNamespace, + ngfPodName, + &core.PodLogOptions{Container: ngfContainerName}, + ) + Expect(err).ToNot(HaveOccurred()) + + for _, line := range strings.Split(ngfLogs, "\n") { + Expect(line).ToNot(ContainSubstring("\"level\":\"error\""), line) + } } func checkLeaderLeaseChange(originalLeaseName string) error { diff --git a/tests/suite/reconfig_test.go b/tests/suite/reconfig_test.go index aa39e2c366..3f1d677851 100644 --- a/tests/suite/reconfig_test.go +++ b/tests/suite/reconfig_test.go @@ -405,7 +405,8 @@ var _ = Describe("Reconfiguration Performance Testing", Ordered, Label("nfr", "r ).WithTimeout(metricExistTimeout).WithPolling(metricExistPolling).Should(Succeed()) } - checkContainerLogsForErrors(ngfPodName, false) + checkNGFContainerLogsForErrors(ngfPodName) + nginxErrorLogs := getNginxErrorLogs(ngfPodName) reloadCount, err := framework.GetReloadCount(promInstance, ngfPodName) Expect(err).ToNot(HaveOccurred()) @@ -447,6 +448,7 @@ var _ = Describe("Reconfiguration Performance Testing", Ordered, Label("nfr", "r TimeToReadyAvgSingle: timeToReadyAvgSingle, NGINXReloads: int(reloadCount), NGINXReloadAvgTime: int(reloadAvgTime), + NGINXErrorLogs: nginxErrorLogs, EventsCount: int(eventsCount), EventsAvgTime: int(eventsAvgTime), } @@ -596,6 +598,7 @@ type reconfigTestResults struct { TestDescription string TimeToReadyTotal string TimeToReadyAvgSingle string + NGINXErrorLogs string EventsBuckets []framework.Bucket ReloadBuckets []framework.Bucket NumResources int @@ -627,6 +630,9 @@ const reconfigResultTemplate = ` {{- range .EventsBuckets }} - {{ .Le }}ms: {{ .Val }} {{- end }} + +### NGINX Error Logs +{{ .NGINXErrorLogs }} ` func writeReconfigResults(dest io.Writer, results reconfigTestResults) error { diff --git a/tests/suite/scale_test.go b/tests/suite/scale_test.go index 42982883fc..27919a7e46 100644 --- a/tests/suite/scale_test.go +++ b/tests/suite/scale_test.go @@ -368,7 +368,7 @@ The logs are attached only if there are errors. ) nginxErrors := checkLogErrors( "nginx", - []string{"[error]", "[emerg]", "[crit]", "[alert]"}, + []string{framework.ErrorNGINXLog, framework.EmergNGINXLog, framework.CritNGINXLog, framework.AlertNGINXLog}, nil, filepath.Join(testResultsDir, framework.CreateResultsFilename("log", "nginx", *plusEnabled)), )