From 03e489df5df61b43162087a0b95ea259224fc1ac Mon Sep 17 00:00:00 2001 From: Darnell Andries Date: Fri, 10 Jan 2025 14:49:28 -0800 Subject: [PATCH] Address Web Discovery feedback --- .../content_scraper_browsertest.cc | 38 +++--- .../web_discovery/browser/hash_detection.cc | 18 ++- .../web_discovery/browser/hash_detection.h | 4 +- .../browser/payload_generator.cc | 14 +-- .../browser/payload_generator_unittest.cc | 110 +++++++----------- .../web_discovery/browser/privacy_guard.cc | 30 ++--- .../web_discovery/browser/privacy_guard.h | 2 +- .../browser/privacy_guard_unittest.cc | 48 ++++---- .../web_discovery/browser/regex_util.cc | 11 +- components/web_discovery/browser/regex_util.h | 14 ++- 10 files changed, 126 insertions(+), 163 deletions(-) diff --git a/browser/web_discovery/content_scraper_browsertest.cc b/browser/web_discovery/content_scraper_browsertest.cc index 0443833b80fa..d21044485817 100644 --- a/browser/web_discovery/content_scraper_browsertest.cc +++ b/browser/web_discovery/content_scraper_browsertest.cc @@ -13,6 +13,7 @@ #include "base/path_service.h" #include "base/test/bind.h" #include "base/test/scoped_feature_list.h" +#include "base/test/values_test_util.h" #include "brave/components/constants/brave_paths.h" #include "brave/components/web_discovery/browser/patterns.h" #include "brave/components/web_discovery/browser/server_config_loader.h" @@ -199,24 +200,17 @@ IN_PROC_BROWSER_TEST_F(WebDiscoveryContentScraperTest, RendererScrape) { ASSERT_EQ(fields->size(), 2u); - const auto* href_value = (*fields)[0].FindString("href"); - const auto* text_value = (*fields)[0].FindString("text"); - const auto* query_value_str = (*fields)[0].FindString("q"); - ASSERT_TRUE(href_value); - ASSERT_TRUE(text_value); - ASSERT_TRUE(query_value_str); - EXPECT_EQ(*href_value, "https://example.com/foo1"); - EXPECT_EQ(*text_value, "Foo1"); - EXPECT_EQ(*query_value_str, "A query"); - - href_value = (*fields)[1].FindString("href"); - text_value = (*fields)[1].FindString("text"); + base::ExpectDictStringValue("https://example.com/foo1", + (*fields)[0], "href"); + base::ExpectDictStringValue("Foo1", (*fields)[0], "text"); + base::ExpectDictStringValue("A query", (*fields)[0], "q"); + + base::ExpectDictStringValue("https://example.com/foo2", + (*fields)[1], "href"); + base::ExpectDictStringValue("Foo2", (*fields)[1], "text"); + const auto* query_value = (*fields)[1].Find("q"); - ASSERT_TRUE(href_value); - ASSERT_TRUE(text_value); ASSERT_TRUE(query_value); - EXPECT_EQ(*href_value, "https://example.com/foo2"); - EXPECT_EQ(*text_value, "Foo2"); EXPECT_TRUE(query_value->is_none()); field_map_it = scrape_result->fields.find("dont>match"); @@ -224,9 +218,7 @@ IN_PROC_BROWSER_TEST_F(WebDiscoveryContentScraperTest, RendererScrape) { fields = &field_map_it->second; ASSERT_EQ(fields->size(), 1u); - const auto* url_query_value = (*fields)[0].FindString("q2"); - ASSERT_TRUE(url_query_value); - EXPECT_EQ(*url_query_value, "testquery"); + base::ExpectDictStringValue("testquery", (*fields)[0], "q2"); }(); run_loop_->Quit(); })); @@ -255,12 +247,8 @@ IN_PROC_BROWSER_TEST_F(WebDiscoveryContentScraperTest, RustParseAndScrape) { ASSERT_EQ(fields->size(), 1u); - const auto* text_value = (*fields)[0].FindString("text"); - const auto* input_value = (*fields)[0].FindString("input"); - ASSERT_TRUE(text_value); - ASSERT_TRUE(input_value); - EXPECT_EQ(*text_value, "Foo3"); - EXPECT_EQ(*input_value, "Foo4"); + base::ExpectDictStringValue("Foo3", (*fields)[0], "text"); + base::ExpectDictStringValue("Foo4", (*fields)[0], "input"); field_map_it = scrape_result->fields.find("dont>match"); ASSERT_TRUE(field_map_it != scrape_result->fields.end()); diff --git a/components/web_discovery/browser/hash_detection.cc b/components/web_discovery/browser/hash_detection.cc index 4442860016b0..32d6ab032e95 100644 --- a/components/web_discovery/browser/hash_detection.cc +++ b/components/web_discovery/browser/hash_detection.cc @@ -9,7 +9,6 @@ #include "base/containers/fixed_flat_map.h" #include "brave/components/web_discovery/browser/hash_detection_matrix.h" -#include "brave/components/web_discovery/browser/util.h" namespace web_discovery { @@ -32,9 +31,7 @@ constexpr double kClassifierThreshold = 0.015; } // namespace -bool IsHashLikely(std::string value, double threshold_multiplier) { - TransformToAlphanumeric(value); - +bool IsHashLikely(std::string_view value, double threshold_multiplier) { if (value.empty()) { return false; } @@ -42,8 +39,19 @@ bool IsHashLikely(std::string value, double threshold_multiplier) { double log_prob_sum = 0.0; size_t add_count = 0; for (size_t i = 0; i < value.length() - 1; i++) { + if (!std::isalnum(value[i])) { + continue; + } + size_t next_char_i; + for (next_char_i = i + 1; + next_char_i < value.length() && !std::isalnum(value[next_char_i]); + next_char_i++) { + } + if (!std::isalnum(value[next_char_i])) { + break; + } auto matrix_pos_a = kTokenMap.find(value[i]); - auto matrix_pos_b = kTokenMap.find(value[i + 1]); + auto matrix_pos_b = kTokenMap.find(value[next_char_i]); if (matrix_pos_a == kTokenMap.end() || matrix_pos_b == kTokenMap.end()) { continue; } diff --git a/components/web_discovery/browser/hash_detection.h b/components/web_discovery/browser/hash_detection.h index 41ba3021fb12..0b0ee97c64cd 100644 --- a/components/web_discovery/browser/hash_detection.h +++ b/components/web_discovery/browser/hash_detection.h @@ -6,14 +6,14 @@ #ifndef BRAVE_COMPONENTS_WEB_DISCOVERY_BROWSER_HASH_DETECTION_H_ #define BRAVE_COMPONENTS_WEB_DISCOVERY_BROWSER_HASH_DETECTION_H_ -#include +#include namespace web_discovery { // Uses a pre-trained Markov chain classifier to detect the likelihood // of a hash in a given piece of text. Used in privacy guard functions // for detecting potentially private URLs/queries. -bool IsHashLikely(std::string value, double probability_multiplier = 1.0); +bool IsHashLikely(std::string_view value, double probability_multiplier = 1.0); } // namespace web_discovery diff --git a/components/web_discovery/browser/payload_generator.cc b/components/web_discovery/browser/payload_generator.cc index e8f12819d6b1..1be8d190ce67 100644 --- a/components/web_discovery/browser/payload_generator.cc +++ b/components/web_discovery/browser/payload_generator.cc @@ -190,13 +190,13 @@ std::vector GenerateQueryPayloads( base::Value::Dict GenerateAlivePayload(const ServerConfig& server_config, std::string date_hour) { - base::Value::Dict payload; - payload.Set(kActionKey, kAliveAction); - base::Value::Dict inner_payload; - inner_payload.Set(kStatusFieldName, true); - inner_payload.Set(kTimestampFieldName, date_hour); - inner_payload.Set(kCountryCodeFieldName, server_config.location); - payload.Set(kInnerPayloadKey, std::move(inner_payload)); + auto inner_payload = base::Value::Dict() + .Set(kStatusFieldName, true) + .Set(kTimestampFieldName, date_hour) + .Set(kCountryCodeFieldName, server_config.location); + auto payload = base::Value::Dict() + .Set(kActionKey, kAliveAction) + .Set(kInnerPayloadKey, std::move(inner_payload)); return payload; } diff --git a/components/web_discovery/browser/payload_generator_unittest.cc b/components/web_discovery/browser/payload_generator_unittest.cc index 446052798197..70ed4634dd14 100644 --- a/components/web_discovery/browser/payload_generator_unittest.cc +++ b/components/web_discovery/browser/payload_generator_unittest.cc @@ -8,6 +8,7 @@ #include #include +#include "base/test/values_test_util.h" #include "brave/components/web_discovery/browser/content_scraper.h" #include "brave/components/web_discovery/browser/patterns.h" #include "testing/gtest/include/gtest/gtest.h" @@ -69,30 +70,21 @@ TEST_F(WebDiscoveryPayloadGeneratorTest, GenerateQueryPayloads) { auto scrape_result = std::make_unique(test_url, "test_id"); std::vector single_dicts; - base::Value::Dict single_dict1; - single_dict1.Set("ab", "value1"); - single_dict1.Set("cd", "value2"); - single_dicts.push_back(std::move(single_dict1)); - base::Value::Dict single_dict2; - single_dict2.Set("ef", "value3"); - single_dict2.Set("gh", "value4"); - single_dicts.push_back(std::move(single_dict2)); + single_dicts.push_back( + base::Value::Dict().Set("ab", "value1").Set("cd", "value2")); + single_dicts.push_back( + base::Value::Dict().Set("ef", "value3").Set("gh", "value4")); scrape_result->fields[".single-element"] = std::move(single_dicts); std::vector result_dicts1; - base::Value::Dict result_dict1, result_dict2, result_dict3, result_dict4, - result_dict5; - result_dict1.Set("njk", "joinvalue1"); - result_dict2.Set("abc", "joinvalue2"); - result_dict3.Set("njk", "joinvalue3"); - result_dict4.Set("abc", "joinvalue4"); - result_dicts1.push_back(std::move(result_dict1)); - result_dicts1.push_back(std::move(result_dict2)); - result_dicts1.push_back(std::move(result_dict3)); - result_dicts1.push_back(std::move(result_dict4)); + result_dicts1.push_back(base::Value::Dict().Set("njk", "joinvalue1")); + result_dicts1.push_back(base::Value::Dict().Set("abc", "joinvalue2")); + result_dicts1.push_back(base::Value::Dict().Set("njk", "joinvalue3")); + result_dicts1.push_back(base::Value::Dict().Set("abc", "joinvalue4")); + std::vector result_dicts2; - result_dict5.Set("qurl", "https://example.com/test1"); - result_dicts2.push_back(std::move(result_dict5)); + result_dicts2.push_back( + base::Value::Dict().Set("qurl", "https://example.com/test1")); scrape_result->fields["#results"] = std::move(result_dicts1); scrape_result->fields["qurl"] = std::move(result_dicts2); @@ -100,50 +92,40 @@ TEST_F(WebDiscoveryPayloadGeneratorTest, GenerateQueryPayloads) { ASSERT_EQ(payloads.size(), 3u); const auto* payload = &payloads[0]; - const auto* action = payload->FindString(kActionKey); const auto* inner_payload = payload->FindDict(kInnerPayloadKey); - ASSERT_TRUE(action && inner_payload); - EXPECT_EQ(*action, "single_action"); + ASSERT_TRUE(inner_payload); + base::ExpectDictStringValue("single_action", *payload, kActionKey); EXPECT_EQ(inner_payload->size(), 3u); - const auto* ctry = inner_payload->FindString("ctry"); - const auto* val1 = inner_payload->FindString("ab"); - const auto* val2 = inner_payload->FindString("cd"); - ASSERT_TRUE(ctry && val1 && val2); - EXPECT_EQ(*ctry, "us"); - EXPECT_EQ(*val1, "value1"); - EXPECT_EQ(*val2, "value2"); + base::ExpectDictStringValue("us", *inner_payload, "ctry"); + base::ExpectDictStringValue("value1", *inner_payload, "ab"); + base::ExpectDictStringValue("value2", *inner_payload, "cd"); payload = &payloads[1]; - action = payload->FindString(kActionKey); inner_payload = payload->FindDict(kInnerPayloadKey); - ASSERT_TRUE(action && inner_payload); - EXPECT_EQ(*action, "single_action"); + ASSERT_TRUE(inner_payload); + base::ExpectDictStringValue("single_action", *payload, kActionKey); EXPECT_EQ(inner_payload->size(), 3u); - ctry = inner_payload->FindString("ctry"); - val1 = inner_payload->FindString("ef"); - val2 = inner_payload->FindString("gh"); - ASSERT_TRUE(ctry && val1 && val2); - EXPECT_EQ(*ctry, "us"); - EXPECT_EQ(*val1, "value3"); - EXPECT_EQ(*val2, "value4"); + base::ExpectDictStringValue("us", *inner_payload, "ctry"); + base::ExpectDictStringValue("value3", *inner_payload, "ef"); + base::ExpectDictStringValue("value4", *inner_payload, "gh"); payload = &payloads[2]; - action = payload->FindString(kActionKey); inner_payload = payload->FindDict(kInnerPayloadKey); - ASSERT_TRUE(action && inner_payload); - EXPECT_EQ(*action, "query"); + ASSERT_TRUE(inner_payload); + base::ExpectDictStringValue("query", *payload, kActionKey); EXPECT_EQ(inner_payload->size(), 2u); - const auto* qurl = inner_payload->FindString("qurl"); const auto* r_dict = inner_payload->FindDict("r"); - ASSERT_TRUE(qurl && r_dict); - EXPECT_EQ(*qurl, "https://example.com/test1"); + ASSERT_TRUE(r_dict); EXPECT_EQ(r_dict->size(), 4u); + base::ExpectDictStringValue("https://example.com/test1", *inner_payload, + "qurl"); + const auto* r0_dict = r_dict->FindDict("0"); const auto* r1_dict = r_dict->FindDict("1"); const auto* r2_dict = r_dict->FindDict("2"); @@ -153,15 +135,10 @@ TEST_F(WebDiscoveryPayloadGeneratorTest, GenerateQueryPayloads) { EXPECT_EQ(r1_dict->size(), 1u); EXPECT_EQ(r2_dict->size(), 1u); EXPECT_EQ(r3_dict->size(), 1u); - const auto* r0_val = r0_dict->FindString("njk"); - const auto* r1_val = r1_dict->FindString("abc"); - const auto* r2_val = r2_dict->FindString("njk"); - const auto* r3_val = r3_dict->FindString("abc"); - ASSERT_TRUE(r0_val && r1_val && r2_val && r2_val); - EXPECT_EQ(*r0_val, "joinvalue1"); - EXPECT_EQ(*r1_val, "joinvalue2"); - EXPECT_EQ(*r2_val, "joinvalue3"); - EXPECT_EQ(*r3_val, "joinvalue4"); + base::ExpectDictStringValue("joinvalue1", *r0_dict, "njk"); + base::ExpectDictStringValue("joinvalue2", *r1_dict, "abc"); + base::ExpectDictStringValue("joinvalue3", *r2_dict, "njk"); + base::ExpectDictStringValue("joinvalue4", *r3_dict, "abc"); } TEST_F(WebDiscoveryPayloadGeneratorTest, GenerateAlivePayload) { @@ -169,20 +146,15 @@ TEST_F(WebDiscoveryPayloadGeneratorTest, GenerateAlivePayload) { auto alive_payload = GenerateAlivePayload(*server_config_.get(), date_hour); - const auto* action = alive_payload.FindString("action"); - const auto* inner_payload = alive_payload.FindDict("payload"); + base::ExpectDictStringValue("alive", alive_payload, "action"); - ASSERT_TRUE(action && inner_payload); + const auto* inner_payload = alive_payload.FindDict("payload"); - EXPECT_EQ(*action, "alive"); - const auto* ctry = inner_payload->FindString("ctry"); - const auto* ts = inner_payload->FindString("t"); - const auto status = inner_payload->FindBool("status"); + ASSERT_TRUE(inner_payload); - ASSERT_TRUE(ctry && ts && status); - EXPECT_EQ(*ctry, "us"); - EXPECT_EQ(*ts, date_hour); - EXPECT_EQ(*status, true); + base::ExpectDictStringValue("us", *inner_payload, "ctry"); + base::ExpectDictStringValue(date_hour, *inner_payload, "t"); + base::ExpectDictBooleanValue(true, *inner_payload, "status"); } TEST_F(WebDiscoveryPayloadGeneratorTest, ExcludePrivateResult) { @@ -222,10 +194,8 @@ TEST_F(WebDiscoveryPayloadGeneratorTest, ExcludePrivateResult) { const auto* ri_dict = r_dict->FindDict(base::NumberToString(i)); ASSERT_TRUE(ri_dict); - const auto* url = ri_dict->FindString("u"); - ASSERT_TRUE(url); - - EXPECT_EQ(*url, "https://example.com/result" + base::NumberToString(i)); + base::ExpectDictStringValue( + "https://example.com/result" + base::NumberToString(i), *ri_dict, "u"); } } diff --git a/components/web_discovery/browser/privacy_guard.cc b/components/web_discovery/browser/privacy_guard.cc index 59156b1f36e6..6528a3725ba5 100644 --- a/components/web_discovery/browser/privacy_guard.cc +++ b/components/web_discovery/browser/privacy_guard.cc @@ -51,28 +51,28 @@ constexpr char kMaskedURLSuffix[] = "/ (PROTECTED)"; bool ContainsForbiddenKeywords(const GURL& url) { auto path_and_query = base::StrCat({url.path_piece(), "?", url.query_piece()}); - if (g_regex_util.CheckPathAndQueryStringKeywords(path_and_query)) { + if (g_regex_util->CheckPathAndQueryStringKeywords(path_and_query)) { return true; } if (!url.ref_piece().empty() && - g_regex_util.CheckQueryStringOrRefKeywords("#" + url.ref())) { + g_regex_util->CheckQueryStringOrRefKeywords("#" + url.ref())) { return true; } if (!url.query_piece().empty() && - g_regex_util.CheckQueryStringOrRefKeywords("?" + url.query())) { + g_regex_util->CheckQueryStringOrRefKeywords("?" + url.query())) { return true; } return false; } -bool IsPrivateDomainLikely(const std::string_view host) { +bool IsPrivateDomainLikely(std::string_view host) { auto dot_split = base::SplitString(host, ".", base::WhitespaceHandling::KEEP_WHITESPACE, base::SPLIT_WANT_ALL); if (dot_split.size() > kMaxDotSplitDomainSize) { return true; } - if (g_regex_util.CheckForLongNumber(host, kMaxDomainNumberLength)) { + if (g_regex_util->CheckForLongNumber(host, kMaxDomainNumberLength)) { return true; } auto hyphen_split = @@ -135,15 +135,15 @@ bool IsPrivateQueryLikely(const std::string& query) { VLOG(1) << "Ignoring query due to long split length"; return true; } - if (g_regex_util.CheckForLongNumber(query, kMaxQueryNumberLength)) { + if (g_regex_util->CheckForLongNumber(query, kMaxQueryNumberLength)) { VLOG(1) << "Ignoring query due to long number"; return true; } - if (g_regex_util.CheckForEmail(query)) { + if (g_regex_util->CheckForEmail(query)) { VLOG(1) << "Ignoring query due to inclusion of email"; return true; } - if (g_regex_util.CheckQueryHTTPCredentials(query)) { + if (g_regex_util->CheckQueryHTTPCredentials(query)) { VLOG(1) << "Ignoring query due to potential inclusion of HTTP credentials"; return true; } @@ -178,8 +178,8 @@ GURL GeneratePrivateSearchURL(const GURL& original_url, query_encoded_str})); } -bool ShouldDropLongURL(const GURL& url) { - if (g_regex_util.CheckForEmail(url.spec())) { +bool ShouldMaskURL(const GURL& url) { + if (g_regex_util->CheckForEmail(url.spec())) { return true; } if (!url.query_piece().empty()) { @@ -192,14 +192,14 @@ bool ShouldDropLongURL(const GURL& url) { if (query_parts.size() > kMaxQueryStringParts) { return true; } - if (g_regex_util.CheckForLongNumber(url.query_piece(), - kMaxQueryStringOrPathNumberLength)) { + if (g_regex_util->CheckForLongNumber(url.query_piece(), + kMaxQueryStringOrPathNumberLength)) { return true; } } if (!url.path_piece().empty()) { - if (g_regex_util.CheckForLongNumber(url.path_piece(), - kMaxQueryStringOrPathNumberLength)) { + if (g_regex_util->CheckForLongNumber(url.path_piece(), + kMaxQueryStringOrPathNumberLength)) { return true; } } @@ -234,7 +234,7 @@ std::optional MaskURL(const GURL& url) { return std::nullopt; } - if (!ShouldDropLongURL(url)) { + if (!ShouldMaskURL(url)) { return url.spec(); } diff --git a/components/web_discovery/browser/privacy_guard.h b/components/web_discovery/browser/privacy_guard.h index 4719be6fba81..e9250d38042b 100644 --- a/components/web_discovery/browser/privacy_guard.h +++ b/components/web_discovery/browser/privacy_guard.h @@ -32,7 +32,7 @@ GURL GeneratePrivateSearchURL(const GURL& original_url, // Checks if a URL should be dropped due to its length or content. // Currently only used for determining whether to mask a URL // in the function below. -bool ShouldDropLongURL(const GURL& url); +bool ShouldMaskURL(const GURL& url); // Masks a URL to protect privacy. Returns nullopt if URL is invalid. std::optional MaskURL(const GURL& url); diff --git a/components/web_discovery/browser/privacy_guard_unittest.cc b/components/web_discovery/browser/privacy_guard_unittest.cc index 52abdcafa763..ef73e89f0aa0 100644 --- a/components/web_discovery/browser/privacy_guard_unittest.cc +++ b/components/web_discovery/browser/privacy_guard_unittest.cc @@ -100,48 +100,44 @@ TEST_F(WebDiscoveryPrivacyGuardTest, GeneratePrivateSearchURL) { "https://example.com/find?testquery=special+chars+%40%23%24%25%5E%26%3D"); } -TEST_F(WebDiscoveryPrivacyGuardTest, ShouldDropLongURL) { - EXPECT_FALSE( - ShouldDropLongURL(GURL("https://www.search1.com/search?q=test"))); - EXPECT_FALSE(ShouldDropLongURL( +TEST_F(WebDiscoveryPrivacyGuardTest, ShouldMaskURL) { + EXPECT_FALSE(ShouldMaskURL(GURL("https://www.search1.com/search?q=test"))); + EXPECT_FALSE(ShouldMaskURL( GURL("https://search2.com/search?query=testing+a+nice+query"))); - EXPECT_FALSE(ShouldDropLongURL( + EXPECT_FALSE(ShouldMaskURL( GURL("https://search2.com/search?query=quick+fox&country=us&d=1"))); - EXPECT_FALSE(ShouldDropLongURL(GURL("https://www.website.com/page/test"))); + EXPECT_FALSE(ShouldMaskURL(GURL("https://www.website.com/page/test"))); - EXPECT_TRUE(ShouldDropLongURL( + EXPECT_TRUE(ShouldMaskURL( GURL("https://www.website.com/page/test?id=12823871923991"))); - EXPECT_TRUE(ShouldDropLongURL( - GURL("https://www.website.com/page/test1283192831292"))); - EXPECT_TRUE(ShouldDropLongURL( - GURL("https://www.website.com/page/1283192831292?q=1"))); - EXPECT_TRUE(ShouldDropLongURL( + EXPECT_TRUE( + ShouldMaskURL(GURL("https://www.website.com/page/test1283192831292"))); + EXPECT_TRUE( + ShouldMaskURL(GURL("https://www.website.com/page/1283192831292?q=1"))); + EXPECT_TRUE(ShouldMaskURL( GURL("https://www.website.com/page/test?a=1&b=2&c=3&d=4&e=5"))); - EXPECT_TRUE(ShouldDropLongURL( + EXPECT_TRUE(ShouldMaskURL( GURL("https://www.website.com/page/" "test?query=a+super+long+query+string+that+is+too+long"))); EXPECT_TRUE( - ShouldDropLongURL(GURL("https://www.website.com/page/ayLxezLhK1Lh1H1"))); - EXPECT_TRUE(ShouldDropLongURL(GURL("https://www.website.com/page/WebLogic"))); - EXPECT_TRUE(ShouldDropLongURL(GURL("https://www.website.com/page/admin"))); - EXPECT_TRUE(ShouldDropLongURL(GURL("https://www.website.com/page/edit/"))); - EXPECT_TRUE( - ShouldDropLongURL(GURL("https://www.website.com/page/doc?share=1"))); - EXPECT_TRUE( - ShouldDropLongURL(GURL("https://www.website.com/page/doc?user=abc"))); + ShouldMaskURL(GURL("https://www.website.com/page/ayLxezLhK1Lh1H1"))); + EXPECT_TRUE(ShouldMaskURL(GURL("https://www.website.com/page/WebLogic"))); + EXPECT_TRUE(ShouldMaskURL(GURL("https://www.website.com/page/admin"))); + EXPECT_TRUE(ShouldMaskURL(GURL("https://www.website.com/page/edit/"))); + EXPECT_TRUE(ShouldMaskURL(GURL("https://www.website.com/page/doc?share=1"))); + EXPECT_TRUE(ShouldMaskURL(GURL("https://www.website.com/page/doc?user=abc"))); + EXPECT_TRUE(ShouldMaskURL(GURL("https://www.website.com/page/doc#logout"))); EXPECT_TRUE( - ShouldDropLongURL(GURL("https://www.website.com/page/doc#logout"))); + ShouldMaskURL(GURL("https://www.website.com/page/doc?password=abc"))); EXPECT_TRUE( - ShouldDropLongURL(GURL("https://www.website.com/page/doc?password=abc"))); + ShouldMaskURL(GURL("https://user:pass@www.website.com/page/test"))); EXPECT_TRUE( - ShouldDropLongURL(GURL("https://user:pass@www.website.com/page/test"))); - EXPECT_TRUE(ShouldDropLongURL( - GURL("https://www.website.com/page/test?e=test@test.com"))); + ShouldMaskURL(GURL("https://www.website.com/page/test?e=test@test.com"))); } TEST_F(WebDiscoveryPrivacyGuardTest, MaskURL) { diff --git a/components/web_discovery/browser/regex_util.cc b/components/web_discovery/browser/regex_util.cc index eb5b998a0169..d6e6d09c1e35 100644 --- a/components/web_discovery/browser/regex_util.cc +++ b/components/web_discovery/browser/regex_util.cc @@ -48,15 +48,14 @@ namespace web_discovery { RegexUtil::RegexUtil() = default; RegexUtil::~RegexUtil() = default; -bool RegexUtil::CheckForEmail(const std::string_view str) { +bool RegexUtil::CheckForEmail(std::string_view str) { if (!email_regex_) { email_regex_.emplace(kEmailRegex); } return re2::RE2::PartialMatch(str, *email_regex_); } -bool RegexUtil::CheckForLongNumber(const std::string_view str, - size_t max_length) { +bool RegexUtil::CheckForLongNumber(std::string_view str, size_t max_length) { if (!long_number_regexes_.contains(max_length)) { auto regex_str = base::StrCat({kLongNumberRegexPrefix, base::NumberToString(max_length + 1), @@ -67,7 +66,7 @@ bool RegexUtil::CheckForLongNumber(const std::string_view str, } bool RegexUtil::CheckPathAndQueryStringKeywords( - const std::string_view path_and_query) { + std::string_view path_and_query) { if (path_and_query_string_keyword_regexes_.empty()) { for (const auto& regex_str : kPathAndQueryStringCheckRegexes) { path_and_query_string_keyword_regexes_.emplace_back(regex_str); @@ -81,7 +80,7 @@ bool RegexUtil::CheckPathAndQueryStringKeywords( return false; } -bool RegexUtil::CheckQueryStringOrRefKeywords(const std::string_view str) { +bool RegexUtil::CheckQueryStringOrRefKeywords(std::string_view str) { if (query_string_and_ref_keyword_regexes_.empty()) { for (const auto& regex_str : kQueryStringAndRefCheckRegexes) { query_string_and_ref_keyword_regexes_.emplace_back(regex_str); @@ -95,7 +94,7 @@ bool RegexUtil::CheckQueryStringOrRefKeywords(const std::string_view str) { return false; } -bool RegexUtil::CheckQueryHTTPCredentials(const std::string_view str) { +bool RegexUtil::CheckQueryHTTPCredentials(std::string_view str) { if (!http_password_regex_) { http_password_regex_.emplace(kHttpPasswordRegex); } diff --git a/components/web_discovery/browser/regex_util.h b/components/web_discovery/browser/regex_util.h index 03765c5f3a61..e30b92167ef7 100644 --- a/components/web_discovery/browser/regex_util.h +++ b/components/web_discovery/browser/regex_util.h @@ -12,12 +12,14 @@ #include #include "base/containers/flat_map.h" +#include "base/no_destructor.h" #include "third_party/re2/src/re2/re2.h" namespace web_discovery { // Lazily creates and caches pre-compiled regexes, mainly used for // privacy risk assessment of page URLs/contents. +// This class is not thread safe. class RegexUtil { public: RegexUtil(); @@ -26,11 +28,11 @@ class RegexUtil { RegexUtil(const RegexUtil&) = delete; RegexUtil& operator=(const RegexUtil&) = delete; - bool CheckForEmail(const std::string_view str); - bool CheckForLongNumber(const std::string_view str, size_t max_length); - bool CheckPathAndQueryStringKeywords(const std::string_view path_and_query); - bool CheckQueryStringOrRefKeywords(const std::string_view str); - bool CheckQueryHTTPCredentials(const std::string_view str); + bool CheckForEmail(std::string_view str); + bool CheckForLongNumber(std::string_view str, size_t max_length); + bool CheckPathAndQueryStringKeywords(std::string_view path_and_query); + bool CheckQueryStringOrRefKeywords(std::string_view str); + bool CheckQueryHTTPCredentials(std::string_view str); private: std::optional email_regex_; @@ -42,7 +44,7 @@ class RegexUtil { std::optional non_alphanumeric_regex_; }; -static RegexUtil g_regex_util; +static base::NoDestructor g_regex_util; } // namespace web_discovery