From ca100879a411a32076187d4fd968d9b60ee6ab80 Mon Sep 17 00:00:00 2001 From: Krzysztof Kiewicz Date: Tue, 14 May 2024 18:25:23 +0200 Subject: [PATCH] Support `fieldname` passed in script (#56) In such a visualization we receive such 2 requests: Screenshot 2024-05-07 at 22 48 05 Screenshot 2024-05-07 at 22 48 25 Screenshot 2024-05-07 at 22 48 13 Responses are wrong: Screenshot 2024-05-07 at 22 48 30 Screenshot 2024-05-07 at 22 48 17 So far I'll just add support for this to those 3 aggregations. Adding everywhere would be a lot more work, and we don't see it anywhere else. It seems we're close to support 100% or almost 100% of clickable stuff in OpenSearch, so I'd rather finish that than waste time here. Looks to me like something which can be solved in general after `mapping` issue, but it doesn't hurt to have some working code to base on + tests sooner. After: Screenshot 2024-05-07 at 23 56 28 --- quesma/queryparser/aggregation_parser.go | 103 +++++++-- quesma/queryparser/aggregation_parser_test.go | 33 ++- .../aggregation_requests.go | 210 ++++++++++++++++++ 3 files changed, 326 insertions(+), 20 deletions(-) diff --git a/quesma/queryparser/aggregation_parser.go b/quesma/queryparser/aggregation_parser.go index 614721a55..7c4b31990 100644 --- a/quesma/queryparser/aggregation_parser.go +++ b/quesma/queryparser/aggregation_parser.go @@ -11,6 +11,7 @@ import ( "mitmproxy/quesma/model/bucket_aggregations" "mitmproxy/quesma/model/metrics_aggregations" "mitmproxy/quesma/util" + "regexp" "slices" "strconv" "strings" @@ -34,14 +35,15 @@ type aggrQueryBuilder struct { } type metricsAggregation struct { - AggrType string - FieldNames []string // on these fields we're doing aggregation. Array, because e.g. 'top_hits' can have multiple fields - FieldType clickhouse.DateTimeType // field type of FieldNames[0]. If it's a date field, a slightly different response is needed - Percentiles map[string]float64 // Only for percentiles aggregation - Keyed bool // Only for percentiles aggregation - SortBy string // Only for top_metrics - Size int // Only for top_metrics - Order string // Only for top_metrics + AggrType string + FieldNames []string // on these fields we're doing aggregation. Array, because e.g. 'top_hits' can have multiple fields + FieldType clickhouse.DateTimeType // field type of FieldNames[0]. If it's a date field, a slightly different response is needed + Percentiles map[string]float64 // Only for percentiles aggregation + Keyed bool // Only for percentiles aggregation + SortBy string // Only for top_metrics + Size int // Only for top_metrics + Order string // Only for top_metrics + IsFieldNameCompound bool // Only for a few aggregations, where we have only 1 field. It's a compound, so e.g. toHour(timestamp), not just "timestamp" } const metricsAggregationDefaultFieldType = clickhouse.Invalid @@ -84,7 +86,13 @@ func (b *aggrQueryBuilder) buildMetricsAggregation(metricsAggr metricsAggregatio query := b.buildAggregationCommon(metadata) switch metricsAggr.AggrType { case "sum", "min", "max", "avg": - query.NonSchemaFields = append(query.NonSchemaFields, metricsAggr.AggrType+`OrNull("`+getFirstFieldName()+`")`) + var fieldNameProperlyQuoted string + if metricsAggr.IsFieldNameCompound { + fieldNameProperlyQuoted = getFirstFieldName() + } else { + fieldNameProperlyQuoted = strconv.Quote(getFirstFieldName()) + } + query.NonSchemaFields = append(query.NonSchemaFields, metricsAggr.AggrType+`OrNull(`+fieldNameProperlyQuoted+`)`) case "quantile": // Sorting here useful mostly for determinism in tests. // It wasn't there before, and everything worked fine. We could safely remove it, if needed. @@ -430,11 +438,12 @@ func (cw *ClickhouseQueryTranslator) tryMetricsAggregation(queryMap QueryMap) (m metricsAggregations := []string{"sum", "avg", "min", "max", "cardinality", "value_count", "stats"} for k, v := range queryMap { if slices.Contains(metricsAggregations, k) { - fieldName := cw.parseFieldField(v, k) + fieldName, isFieldNameFromScript := cw.parseFieldFieldMaybeScript(v, k) return metricsAggregation{ - AggrType: k, - FieldNames: []string{fieldName}, - FieldType: cw.Table.GetDateTimeType(cw.Ctx, fieldName), + AggrType: k, + FieldNames: []string{fieldName}, + FieldType: cw.Table.GetDateTimeType(cw.Ctx, fieldName), + IsFieldNameCompound: isFieldNameFromScript, }, true } } @@ -536,7 +545,13 @@ func (cw *ClickhouseQueryTranslator) tryBucketAggregation(currentAggr *aggrQuery success = true // returned in most cases if histogram, ok := queryMap["histogram"]; ok { currentAggr.Type = bucket_aggregations.NewHistogram(cw.Ctx) - fieldName := strconv.Quote(cw.parseFieldField(histogram, "histogram")) + fieldName, isFieldNameFromScript := cw.parseFieldFieldMaybeScript(histogram, "histogram") + var fieldNameProperlyQuoted string + if isFieldNameFromScript { + fieldNameProperlyQuoted = fieldName + } else { + fieldNameProperlyQuoted = strconv.Quote(fieldName) + } var interval float64 intervalQueryMap, ok := histogram.(QueryMap)["interval"] if !ok { @@ -556,9 +571,9 @@ func (cw *ClickhouseQueryTranslator) tryBucketAggregation(currentAggr *aggrQuery default: logger.ErrorWithCtx(cw.Ctx).Msgf("unexpected type of interval: %T, value: %v", intervalRaw, intervalRaw) } - groupByStr := fieldName + groupByStr := fieldNameProperlyQuoted if interval != 1 { - groupByStr = fmt.Sprintf("floor(%s / %f) * %f", fieldName, interval, interval) + groupByStr = fmt.Sprintf("floor(%s / %f) * %f", fieldNameProperlyQuoted, interval, interval) } currentAggr.GroupByFields = append(currentAggr.GroupByFields, groupByStr) currentAggr.NonSchemaFields = append(currentAggr.NonSchemaFields, groupByStr) @@ -664,6 +679,62 @@ func (cw *ClickhouseQueryTranslator) parseFieldField(shouldBeMap any, aggregatio return "" } +// parseFieldFieldMaybeScript is basically almost a copy of parseFieldField above, but it also handles a basic script, if "field" is missing. +func (cw *ClickhouseQueryTranslator) parseFieldFieldMaybeScript(shouldBeMap any, aggregationType string) (field string, isFromScript bool) { + isFromScript = false + Map, ok := shouldBeMap.(QueryMap) + if !ok { + logger.WarnWithCtx(cw.Ctx).Msgf("%s aggregation is not a map, but %T, value: %v", aggregationType, shouldBeMap, shouldBeMap) + return + } + // maybe "field" field + if fieldRaw, ok := Map["field"]; ok { + if field, ok = fieldRaw.(string); ok { + return + } else { + logger.WarnWithCtx(cw.Ctx).Msgf("field is not a string, but %T, value: %v", fieldRaw, fieldRaw) + } + } + + // else: maybe script + if fieldName, ok := cw.parseFieldFromScriptField(Map); ok { + return fmt.Sprintf("toHour(`%s`)", fieldName), true + } + + logger.WarnWithCtx(cw.Ctx).Msgf("field not found in %s aggregation: %v", aggregationType, Map) + return +} + +func (cw *ClickhouseQueryTranslator) parseFieldFromScriptField(queryMap QueryMap) (fieldName string, success bool) { + scriptRaw, exists := queryMap["script"] + if !exists { + return + } + script, ok := scriptRaw.(QueryMap) + if !ok { + logger.WarnWithCtx(cw.Ctx).Msgf("script is not a JsonMap, but %T, value: %v", scriptRaw, scriptRaw) + return + } + + sourceRaw, exists := script["source"] + if !exists { + logger.WarnWithCtx(cw.Ctx).Msgf("source not found in script: %v", script) + return + } + source, ok := sourceRaw.(string) + if !ok { + logger.WarnWithCtx(cw.Ctx).Msgf("source is not a string, but %T, value: %v", sourceRaw, sourceRaw) + } + + // source must look like "doc['field_name'].value.getHour()" or "doc['field_name'].value.hourOfDay" + wantedRegex := regexp.MustCompile(`^doc\['(\w+)']\.value\.(?:getHour\(\)|hourOfDay)$`) + matches := wantedRegex.FindStringSubmatch(source) + if len(matches) == 2 { + return matches[1], true + } + return +} + func (cw *ClickhouseQueryTranslator) parseFilters(filtersMap QueryMap) []filter { var filters []filter filtersMap = filtersMap["filters"].(QueryMap) diff --git a/quesma/queryparser/aggregation_parser_test.go b/quesma/queryparser/aggregation_parser_test.go index c74316752..a4c9d7c68 100644 --- a/quesma/queryparser/aggregation_parser_test.go +++ b/quesma/queryparser/aggregation_parser_test.go @@ -581,10 +581,7 @@ func Test2AggregationParserExternalTestcases(t *testing.T) { // Let's leave those commented debugs for now, they'll be useful in next PRs for j, aggregation := range aggregations { - // fmt.Println("--- Aggregation "+strconv.Itoa(j)+":", aggregation) - // fmt.Println() - // fmt.Println("--- SQL string ", aggregation.String()) - // fmt.Println() + // fmt.Printf("--- Aggregation %d: %+v\n\n---SQL string: %s\n\n", j, aggregation, aggregation.String()) // fmt.Println("--- Group by: ", aggregation.GroupByFields) if test.ExpectedSQLs[j] != "NoDBQuery" { util.AssertSqlEqual(t, test.ExpectedSQLs[j], aggregation.String()) @@ -637,3 +634,31 @@ func Test_quoteArray(t *testing.T) { assert.Equal(t, inputs[i], test.input) // check that original array isn't changed } } + +func Test_parseFieldFromScriptField(t *testing.T) { + goodQueryMap := func(sourceField string) QueryMap { + return QueryMap{"script": QueryMap{"source": sourceField}} + } + testcases := []struct { + queryMap QueryMap + expectedMatch string + expectedSuccess bool + }{ + {goodQueryMap("doc['field1'].value.getHour()"), "field1", true}, + {goodQueryMap("doc['field1'].value.getHour() + doc['field2'].value.getHour()"), "", false}, + {goodQueryMap("doc['field1'].value.hourOfDay"), "field1", true}, + {goodQueryMap("doc['field1'].value"), "", false}, + {goodQueryMap("value.getHour() + doc['field2'].value.getHour()"), "", false}, + {QueryMap{}, "", false}, + {QueryMap{"script": QueryMap{}}, "", false}, + {QueryMap{"script": QueryMap{"source": ""}}, "", false}, + {QueryMap{"script": "script"}, "", false}, + {QueryMap{"script": QueryMap{"source": 1}}, "", false}, + } + cw := ClickhouseQueryTranslator{Ctx: context.Background()} + for _, tc := range testcases { + fieldName, success := cw.parseFieldFromScriptField(tc.queryMap) + assert.Equal(t, tc.expectedSuccess, success) + assert.Equal(t, tc.expectedMatch, fieldName) + } +} diff --git a/quesma/testdata/opensearch-visualize/aggregation_requests.go b/quesma/testdata/opensearch-visualize/aggregation_requests.go index 81a0e47e9..a4a429551 100644 --- a/quesma/testdata/opensearch-visualize/aggregation_requests.go +++ b/quesma/testdata/opensearch-visualize/aggregation_requests.go @@ -1276,4 +1276,214 @@ var AggregationTests = []testdata.AggregationTestCase{ "ORDER BY (toInt64(toUnixTimestamp64Milli(`timestamp`)/3600000))", }, }, + { // [8] + TestName: "Min/max with simple script. Reproduce: Visualize -> Line -> Metrics: Count, Buckets: X-Asis Histogram", + QueryRequestJson: ` + { + "_source": { + "excludes": [] + }, + "aggs": { + "maxAgg": { + "max": { + "script": { + "lang": "painless", + "source": "doc['timestamp'].value.getHour()" + } + } + }, + "minAgg": { + "min": { + "script": { + "lang": "painless", + "source": "doc['timestamp'].value.getHour()" + } + } + } + }, + "docvalue_fields": [ + { + "field": "@timestamp", + "format": "date_time" + }, + { + "field": "timestamp", + "format": "date_time" + }, + { + "field": "utc_time", + "format": "date_time" + } + ], + "query": { + "bool": { + "filter": [], + "must": [ + { + "match_all": {} + } + ], + "must_not": [], + "should": [] + } + }, + "script_fields": { + "hour_of_day": { + "script": { + "lang": "painless", + "source": "doc['timestamp'].value.getHour()" + } + } + }, + "size": 0, + "stored_fields": [ + "*" + ] + }`, + ExpectedResponse: ` + { + "_shards": { + "failed": 0, + "skipped": 0, + "successful": 1, + "total": 1 + }, + "aggregations": { + "maxAgg": { + "value": 23.0 + }, + "minAgg": { + "value": 0.0 + } + }, + "hits": { + "hits": [], + "max_score": null, + "total": { + "relation": "eq", + "value": 13059 + } + }, + "timed_out": false, + "took": 17 + }`, + ExpectedResults: [][]model.QueryResultRow{ + {{Cols: []model.QueryResultCol{model.NewQueryResultCol("value", uint64(13059))}}}, + {{Cols: []model.QueryResultCol{model.NewQueryResultCol(`maxOrNull("todo")`, 23.0)}}}, + {{Cols: []model.QueryResultCol{model.NewQueryResultCol(`minOrNull("todo")`, 0.0)}}}, + }, + ExpectedSQLs: []string{ + `SELECT count() FROM ` + testdata.QuotedTableName + ` `, + "SELECT maxOrNull(toHour(`timestamp`)) FROM " + testdata.QuotedTableName + ` `, + "SELECT minOrNull(toHour(`timestamp`)) FROM " + testdata.QuotedTableName + ` `, + }, + }, + { // [9] + TestName: "Histogram with simple script. Reproduce: Visualize -> Line -> Metrics: Count, Buckets: X-Asis Histogram", + QueryRequestJson: ` + { + "_source": { + "excludes": [] + }, + "aggs": { + "2": { + "histogram": { + "interval": 1, + "min_doc_count": 1, + "script": { + "lang": "painless", + "source": "doc['timestamp'].value.getHour()" + } + } + } + }, + "docvalue_fields": [ + { + "field": "@timestamp", + "format": "date_time" + }, + { + "field": "timestamp", + "format": "date_time" + }, + { + "field": "utc_time", + "format": "date_time" + } + ], + "query": { + "bool": { + "filter": [], + "must": [ + { + "match_all": {} + } + ], + "must_not": [], + "should": [] + } + }, + "script_fields": { + "hour_of_day": { + "script": { + "lang": "painless", + "source": "doc['timestamp'].value.getHour()" + } + } + }, + "size": 0, + "stored_fields": [ + "*" + ] + }`, + ExpectedResponse: ` + { + "_shards": { + "failed": 0, + "skipped": 0, + "successful": 1, + "total": 1 + }, + "aggregations": { + "2": { + "buckets": [ + { + "doc_count": 44, + "key": 0.0 + }, + { + "doc_count": 43, + "key": 1.0 + }, + { + "doc_count": 34, + "key": 2.0 + } + ] + } + }, + "hits": { + "hits": [], + "max_score": null, + "total": { + "relation": "eq", + "value": 886 + } + }, + "timed_out": false, + "took": 41 + }`, + ExpectedResults: [][]model.QueryResultRow{ + {{Cols: []model.QueryResultCol{model.NewQueryResultCol("value", uint64(886))}}}, + { + {Cols: []model.QueryResultCol{model.NewQueryResultCol("key", 0.0), model.NewQueryResultCol("doc_count", 44)}}, + {Cols: []model.QueryResultCol{model.NewQueryResultCol("key", 1.0), model.NewQueryResultCol("doc_count", 43)}}, + {Cols: []model.QueryResultCol{model.NewQueryResultCol("key", 2.0), model.NewQueryResultCol("doc_count", 34)}}, + }, + }, + ExpectedSQLs: []string{ + `SELECT count() FROM ` + testdata.QuotedTableName + ` `, + "SELECT toHour(`timestamp`), count() FROM " + testdata.QuotedTableName + " GROUP BY (toHour(`timestamp`)) ORDER BY (toHour(`timestamp`))", + }, + }, }