From c54fcc20a0b46dea5770066b01c9fb5831214d93 Mon Sep 17 00:00:00 2001
From: Krzysztof Kiewicz <krzysztof@quesma.com>
Date: Sun, 5 Jan 2025 17:01:44 +0100
Subject: [PATCH] `ip_range`: support for `ipv6` (#1157)

---
 docs/public/docs/limitations.md               |   2 +-
 quesma/go.mod                                 |   1 +
 quesma/go.sum                                 |   6 +-
 quesma/model/README.md                        |   2 +-
 quesma/model/bucket_aggregations/ip_range.go  |  28 ++--
 .../pancake_aggregation_parser_buckets.go     |  32 ++++-
 .../kibana-visualize/aggregation_requests.go  | 127 ++++++++++++++++--
 7 files changed, 170 insertions(+), 28 deletions(-)

diff --git a/docs/public/docs/limitations.md b/docs/public/docs/limitations.md
index 87ca9a913..66b00acef 100644
--- a/docs/public/docs/limitations.md
+++ b/docs/public/docs/limitations.md
@@ -34,7 +34,7 @@ Currently supported:
   including: `boolean`, `match`, `match phrase`, `multi-match`, `query string`, `nested`, `match all`, `exists`, `prefix`, `range`, `term`, `terms`, `wildcard`
 - most popular [Aggregations](https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations.html),
   including: `avg`, `cardinality`, `max`, `min`, `percentile ranks`, `percentiles`, `stats`, `sum`, `top hits`, `top metrics`, `value counts`,
-  `date histogram`, `date range`, `filter`, `filters`, `histogram`, `range`, `singificant terms`, `terms`, `ip prefix`
+  `date histogram`, `date range`, `filter`, `filters`, `histogram`, `range`, `significant terms`, `terms`, `ip prefix`, `ip range`
 
 Which as a result allows you to run Kibana/OSD queries and dashboards on data residing in ClickHouse/Hydrolix.
 
diff --git a/quesma/go.mod b/quesma/go.mod
index 055da1e49..cae03c989 100644
--- a/quesma/go.mod
+++ b/quesma/go.mod
@@ -41,6 +41,7 @@ require (
 require (
 	filippo.io/edwards25519 v1.1.0 // indirect
 	github.com/H0llyW00dzZ/cidr v1.2.1 // indirect
+	github.com/apparentlymart/go-cidr v1.1.0 // indirect
 	github.com/go-viper/mapstructure/v2 v2.2.1 // indirect
 	github.com/hashicorp/errwrap v1.0.0 // indirect
 	github.com/jackc/chunkreader/v2 v2.0.1 // indirect
diff --git a/quesma/go.sum b/quesma/go.sum
index 39f2d9293..2254fcd1d 100644
--- a/quesma/go.sum
+++ b/quesma/go.sum
@@ -7,15 +7,17 @@ github.com/ClickHouse/clickhouse-go/v2 v2.30.0 h1:AG4D/hW39qa58+JHQIFOSnxyL46H6h
 github.com/ClickHouse/clickhouse-go/v2 v2.30.0/go.mod h1:i9ZQAojcayW3RsdCb3YR+n+wC2h65eJsZCscZ1Z1wyo=
 github.com/DATA-DOG/go-sqlmock v1.5.2 h1:OcvFkGmslmlZibjAjaHm3L//6LiuBgolP7OputlJIzU=
 github.com/DATA-DOG/go-sqlmock v1.5.2/go.mod h1:88MAG/4G7SMwSE3CeA0ZKzrT5CiOU3OJ+JlNzwDqpNU=
-github.com/H0llyW00dzZ/cidr v1.2.1 h1:DfRHX+RqVVKZijQGO1aJSaWvN9Saan8sycK/4wrfY5g=
-github.com/H0llyW00dzZ/cidr v1.2.1/go.mod h1:S+EgYkMandSAN27mGNG/CB3jeoXDAyalsvvVFpWdnXc=
 github.com/DataDog/go-sqllexer v0.0.18 h1:ErBvoO7/srJLdA2ebwd+HPqD4g1kN++BP64A8qvmh9U=
 github.com/DataDog/go-sqllexer v0.0.18/go.mod h1:KwkYhpFEVIq+BfobkTC1vfqm4gTi65skV/DpDBXtexc=
+github.com/H0llyW00dzZ/cidr v1.2.1 h1:DfRHX+RqVVKZijQGO1aJSaWvN9Saan8sycK/4wrfY5g=
+github.com/H0llyW00dzZ/cidr v1.2.1/go.mod h1:S+EgYkMandSAN27mGNG/CB3jeoXDAyalsvvVFpWdnXc=
 github.com/Masterminds/semver/v3 v3.1.1/go.mod h1:VPu/7SZ7ePZ3QOrcuXROw5FAcLl4a0cBrbBpGY/8hQs=
 github.com/andybalholm/brotli v1.1.1 h1:PR2pgnyFznKEugtsUo0xLdDop5SKXd5Qf5ysW+7XdTA=
 github.com/andybalholm/brotli v1.1.1/go.mod h1:05ib4cKhjx3OQYUY22hTVd34Bc8upXjOLL2rKwwZBoA=
 github.com/antlr4-go/antlr/v4 v4.13.1 h1:SqQKkuVZ+zWkMMNkjy5FZe5mr5WURWnlpmOuzYWrPrQ=
 github.com/antlr4-go/antlr/v4 v4.13.1/go.mod h1:GKmUxMtwp6ZgGwZSva4eWPC5mS6vUAmOABFgjdkM7Nw=
+github.com/apparentlymart/go-cidr v1.1.0 h1:2mAhrMoF+nhXqxTzSZMUzDHkLjmIHC+Zzn4tdgBZjnU=
+github.com/apparentlymart/go-cidr v1.1.0/go.mod h1:EBcsNrHc3zQeuaeCeCtQruQm+n9/YjEn/vI25Lg7Gwc=
 github.com/barkimedes/go-deepcopy v0.0.0-20220514131651-17c30cfc62df h1:GSoSVRLoBaFpOOds6QyY1L8AX7uoY+Ln3BHc22W40X0=
 github.com/barkimedes/go-deepcopy v0.0.0-20220514131651-17c30cfc62df/go.mod h1:hiVxq5OP2bUGBRNS3Z/bt/reCLFNbdcST6gISi1fiOM=
 github.com/cockroachdb/apd v1.1.0 h1:3LFP3629v+1aKXU5Q37mxmRxX/pIu1nijXydLShEq5I=
diff --git a/quesma/model/README.md b/quesma/model/README.md
index 70564f204..837567c89 100644
--- a/quesma/model/README.md
+++ b/quesma/model/README.md
@@ -28,7 +28,7 @@ More info: https://www.elastic.co/guide/en/elasticsearch/reference/current/searc
  Median absolute deviation |          :x:           | Global                       |        :x:         | Moving function        |    :wavy_dash:     |
  Min                       |   :white_check_mark:   | Histogram                    | :white_check_mark: | Moving percentiles     |        :x:         |
  Percentile ranks          |   :white_check_mark:   | IP prefix                    | :white_check_mark: | Normalize              |        :x:         |
- Percentiles               |   :white_check_mark:   | IP range                     |        :x:         | Percentiles bucket     |        :x:         |
+ Percentiles               |   :white_check_mark:   | IP range                     | :white_check_mark: | Percentiles bucket     |        :x:         |
  Rate                      |          :x:           | Missing                      |        :x:         | Serial differencing    | :white_check_mark: |
  Scripted metric           |          :x:           | Multi-terms                  | :white_check_mark: | Stats bucket           |        :x:         |
  Stats                     |   :white_check_mark:   | Nested                       |        :x:         | Sum bucket             | :white_check_mark: |
diff --git a/quesma/model/bucket_aggregations/ip_range.go b/quesma/model/bucket_aggregations/ip_range.go
index e34bdbd1f..5befd892b 100644
--- a/quesma/model/bucket_aggregations/ip_range.go
+++ b/quesma/model/bucket_aggregations/ip_range.go
@@ -5,6 +5,7 @@ package bucket_aggregations
 import (
 	"context"
 	"fmt"
+	"net/netip"
 	"quesma/logger"
 	"quesma/model"
 	"reflect"
@@ -14,8 +15,6 @@ import (
 // So instead of "<= 255.255.255.255", it uses "< ::1:0:0:0"
 const BiggestIpv4 = "::1:0:0:0"
 
-// Current limitation: we expect Clickhouse field to be IPv4 (and not IPv6)
-
 // Clickhouse table to test SQLs:
 // CREATE TABLE __quesma_table_name (clientip IPv4) ENGINE=Log
 // INSERT INTO __quesma_table_name VALUES ('0.0.0.0'), ('5.5.5.5'), ('90.180.90.180'), ('128.200.0.8'),  ('192.168.1.67'), ('222.168.22.67')
@@ -95,23 +94,34 @@ func NewIpInterval(begin, end string, key *string) IpInterval {
 }
 
 func (interval IpInterval) ToWhereClause(field model.Expr) model.Expr {
-	isBegin := interval.begin != UnboundedInterval
-	isEnd := interval.end != UnboundedInterval && interval.end != BiggestIpv4
+	hasBegin := interval.hasBeginInResponse()
+	hasEnd := interval.hasEndInResponse()
 
 	begin := model.NewInfixExpr(field, ">=", model.NewLiteralSingleQuoteString(interval.begin))
 	end := model.NewInfixExpr(field, "<", model.NewLiteralSingleQuoteString(interval.end))
 
-	if isBegin && isEnd {
+	if hasBegin && hasEnd {
 		return model.NewInfixExpr(begin, "AND", end)
-	} else if isBegin {
+	} else if hasBegin {
 		return begin
-	} else if isEnd {
+	} else if hasEnd {
 		return end
 	} else {
 		return model.TrueExpr
 	}
 }
 
+// hasBeginInResponse reports whether 'from' belongs in the response: begin must be bounded and must not be `::`.
+func (interval IpInterval) hasBeginInResponse() bool {
+	addr, err := netip.ParseAddr(interval.begin) // ParseAddr, not MustParseAddr: a malformed begin must not panic
+	return interval.begin != UnboundedInterval && (err != nil || addr != netip.IPv6Unspecified()) // unparseable => keep 'from' as given
+}
+
+// hasEndInResponse returns true if we should add 'to' field to the response, i.e. iff end is not UnboundedInterval.
+func (interval IpInterval) hasEndInResponse() bool {
+	return interval.end != UnboundedInterval
+}
+
 // String returns key part of the response, e.g. "1.0-2.0", or "*-6.55"
 func (interval IpInterval) String() string {
 	if interval.key != nil {
@@ -166,10 +176,10 @@ func (query *IpRange) CombinatorTranslateSqlResponseToJson(subGroup CombinatorGr
 	}
 
 	interval := query.intervals[subGroup.idx]
-	if interval.begin != UnboundedInterval {
+	if interval.hasBeginInResponse() {
 		response["from"] = interval.begin
 	}
-	if interval.end != UnboundedInterval {
+	if interval.hasEndInResponse() {
 		response["to"] = interval.end
 	}
 
diff --git a/quesma/queryparser/pancake_aggregation_parser_buckets.go b/quesma/queryparser/pancake_aggregation_parser_buckets.go
index 4b65d0319..f33f0eab0 100644
--- a/quesma/queryparser/pancake_aggregation_parser_buckets.go
+++ b/quesma/queryparser/pancake_aggregation_parser_buckets.go
@@ -6,9 +6,11 @@ package queryparser
 import (
 	"fmt"
 	"github.com/H0llyW00dzZ/cidr"
+	cidr2 "github.com/apparentlymart/go-cidr/cidr"
 	"github.com/pkg/errors"
 	"math"
 	"net"
+	"net/netip"
 	"quesma/clickhouse"
 	"quesma/logger"
 	"quesma/model"
@@ -410,23 +412,39 @@ func (cw *ClickhouseQueryTranslator) parseIpRange(aggregation *pancakeAggregatio
 	rangesRaw := params["ranges"].([]any)
 	ranges := make([]bucket_aggregations.IpInterval, 0, len(rangesRaw))
 	for _, rangeRaw := range rangesRaw {
+		var begin, end string
 		var key *string
 		if keyIfPresent, exists := cw.parseStringFieldExistCheck(rangeRaw.(QueryMap), "key"); exists {
 			key = &keyIfPresent
 		}
-		var begin, end string
 		if maskIfExists, exists := cw.parseStringFieldExistCheck(rangeRaw.(QueryMap), "mask"); exists {
 			_, ipNet, err := net.ParseCIDR(maskIfExists)
 			if err != nil {
 				return err
 			}
-			beginAsInt, endAsInt := cidr.IPv4ToRange(ipNet)
-			begin = util.IntToIpv4(beginAsInt)
-			// endAsInt is inclusive, we do +1, because we need it exclusive
-			if endAsInt != math.MaxUint32 {
-				end = util.IntToIpv4(endAsInt + 1)
+			if ipNet.IP.To4() != nil {
+				// it's ipv4
+				beginAsInt, endAsInt := cidr.IPv4ToRange(ipNet)
+				begin = util.IntToIpv4(beginAsInt)
+				// endAsInt is inclusive, we do +1, because we need it exclusive
+				if endAsInt != math.MaxUint32 {
+					end = util.IntToIpv4(endAsInt + 1)
+				} else {
+					end = bucket_aggregations.BiggestIpv4 // "255.255.255.255 + 1", so to say (value in compliance with Elastic)
+				}
+			} else if ipNet.IP.To16() != nil {
+				// it's ipv6
+				beginInclusive, endInclusive := cidr2.AddressRange(ipNet)
+				begin = beginInclusive.String()
+				// we do +1 (.Next()), because we need end to be exclusive
+				endExclusive := netip.MustParseAddr(endInclusive.String()).Next()
+				if endExclusive.IsValid() {
+					end = endExclusive.String()
+				} else { // invalid means endInclusive was already the biggest possible value (ff...ff)
+					end = bucket_aggregations.UnboundedInterval
+				}
 			} else {
-				end = bucket_aggregations.BiggestIpv4 // "255.255.255.255 + 1", so to say (value in compliance with Elastic)
+				return fmt.Errorf("invalid mask: %s", maskIfExists)
 			}
 			if key == nil {
 				key = &maskIfExists
diff --git a/quesma/testdata/kibana-visualize/aggregation_requests.go b/quesma/testdata/kibana-visualize/aggregation_requests.go
index 23c393893..fc207cb37 100644
--- a/quesma/testdata/kibana-visualize/aggregation_requests.go
+++ b/quesma/testdata/kibana-visualize/aggregation_requests.go
@@ -3375,9 +3375,6 @@ var AggregationTests = []testdata.AggregationTestCase{
 		TestName: "IP range, with ranges as CIDR masks. In Kibana: Add panel > Aggregation Based > Area. Buckets: X-asis: IP Range",
 		QueryRequestJson: `
 		{
-			"_source": {
-				"excludes": []
-			},
 			"aggs": {
 				"2": {
 					"ip_range": {
@@ -3451,7 +3448,7 @@ var AggregationTests = []testdata.AggregationTestCase{
 			}},
 		},
 		ExpectedPancakeSQL: `
-			SELECT countIf("clientip">='255.255.255.252') AS "range_0__aggr__2__count",
+			SELECT countIf(("clientip">='255.255.255.252' AND "clientip"<'::1:0:0:0')) AS "range_0__aggr__2__count",
 			  countIf("clientip">='128.129.130.131') AS "range_1__aggr__2__count",
 			  countIf(("clientip">='10.0.7.96' AND "clientip"<'10.0.7.128')) AS
 			  "range_2__aggr__2__count"
@@ -3461,9 +3458,6 @@ var AggregationTests = []testdata.AggregationTestCase{
 		TestName: "IP range, with ranges as CIDR masks, keyed=true. In Kibana: Add panel > Aggregation Based > Area. Buckets: X-asis: IP Range",
 		QueryRequestJson: `
 		{
-			"_source": {
-				"excludes": []
-			},
 			"aggs": {
 				"2": {
 					"ip_range": {
@@ -3535,10 +3529,127 @@ var AggregationTests = []testdata.AggregationTestCase{
 			}},
 		},
 		ExpectedPancakeSQL: `
-			SELECT countIf("clientip">='255.255.255.254') AS "range_0__aggr__2__count",
+			SELECT countIf(("clientip">='255.255.255.254' AND "clientip"<'::1:0:0:0')) AS "range_0__aggr__2__count",
 			  countIf("clientip">='128.129.130.131') AS "range_1__aggr__2__count",
 			  countIf(("clientip">='10.0.7.96' AND "clientip"<'10.0.7.128')) AS
 			  "range_2__aggr__2__count"
 			FROM __quesma_table_name`,
 	},
+	{ // [27]
+		TestName: "IP range ipv6",
+		QueryRequestJson: `
+		{
+			"aggs": {
+				"2": {
+					"ip_range": {
+						"field": "clientip",
+						"ranges": [
+							{
+								"from": "1::132:13:21:23:122:22"
+							},
+							{
+								"to": "1::132:13:21:23:122:22"
+							},
+							{
+								"to": "ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff"
+							}
+						]
+					}
+				}
+			},
+			"size": 0,
+			"track_total_hits": true
+		}`,
+		ExpectedResponse: `
+		{
+			"aggregations": {
+				"2": {
+					"buckets": [
+						{
+							"key": "1::132:13:21:23:122:22-*",
+							"from": "1::132:13:21:23:122:22",
+							"doc_count": 7290
+						},
+						{
+							"key": "*-1::132:13:21:23:122:22",
+							"to": "1::132:13:21:23:122:22",
+							"doc_count": 6784
+						},
+						{
+							"key": "*-ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff",
+							"to": "ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff",
+							"doc_count": 999999
+						}
+					]
+				}
+			}
+		}`,
+		ExpectedPancakeResults: []model.QueryResultRow{
+			{Cols: []model.QueryResultCol{
+				model.NewQueryResultCol("range_0__aggr__2__count", int64(7290)),
+				model.NewQueryResultCol("range_1__aggr__2__count", int64(6784)),
+				model.NewQueryResultCol("range_2__aggr__2__count", int64(999999)),
+			}},
+		},
+		ExpectedPancakeSQL: `
+			SELECT countIf("clientip">='1::132:13:21:23:122:22') AS
+			  "range_0__aggr__2__count",
+			  countIf("clientip"<'1::132:13:21:23:122:22') AS "range_1__aggr__2__count",
+			  countIf("clientip"<'ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff') AS
+			  "range_2__aggr__2__count"
+			FROM __quesma_table_name`,
+	},
+	{ // [28]
+		TestName: "IP range ipv6 with mask",
+		QueryRequestJson: `
+		{
+			"aggs": {
+				"2": {
+					"ip_range": {
+						"field": "clientip",
+						"ranges": [
+							{
+								"mask": "::/2"
+							},
+							{
+								"mask": "ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff/127"
+							}
+						]
+					}
+				}
+			},
+			"size": 0,
+			"track_total_hits": true
+		}`,
+		ExpectedResponse: `
+		{
+			"aggregations": {
+				"2": {
+					"buckets": [
+						{
+							"key": "::/2",
+							"to": "4000::",
+							"doc_count": 1
+						},
+						{
+							"key": "ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff/127",
+							"from": "ffff:ffff:ffff:ffff:ffff:ffff:ffff:fffe",
+							"doc_count": 0
+						}
+					]
+				}
+			}
+		}`,
+		ExpectedPancakeResults: []model.QueryResultRow{
+			{Cols: []model.QueryResultCol{
+				model.NewQueryResultCol("range_0__aggr__2__count", int64(1)),
+				model.NewQueryResultCol("range_1__aggr__2__count", int64(0)),
+			}},
+		},
+		ExpectedPancakeSQL: `
+			SELECT countIf("clientip"<'4000::') AS "range_0__aggr__2__count",
+			  countIf("clientip">='ffff:ffff:ffff:ffff:ffff:ffff:ffff:fffe') AS
+			  "range_1__aggr__2__count"
+			FROM __quesma_table_name`,
+	},
 }