Skip to content

Commit

Permalink
Add Web Discovery reporting & double fetching
Browse files Browse the repository at this point in the history
  • Loading branch information
DJAndries committed Aug 1, 2024
1 parent 4b7529f commit 37e3465
Show file tree
Hide file tree
Showing 17 changed files with 2,083 additions and 0 deletions.
13 changes: 13 additions & 0 deletions components/web_discovery/browser/BUILD.gn
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@ static_library("browser") {
"credential_manager.h",
"credential_signer.cc",
"credential_signer.h",
"double_fetcher.cc",
"double_fetcher.h",
"ecdh_aes.cc",
"ecdh_aes.h",
"hash_detection.cc",
"hash_detection.h",
"patterns.cc",
Expand All @@ -26,10 +30,16 @@ static_library("browser") {
"privacy_guard.h",
"regex_util.cc",
"regex_util.h",
"reporter.cc",
"reporter.h",
"request_queue.cc",
"request_queue.h",
"rsa.cc",
"rsa.h",
"server_config_loader.cc",
"server_config_loader.h",
"signature_basename.cc",
"signature_basename.h",
"util.cc",
"util.h",
"web_discovery_service.cc",
Expand Down Expand Up @@ -63,11 +73,14 @@ source_set("unit_tests") {
testonly = true
sources = [
"credential_manager_unittest.cc",
"double_fetcher_unittest.cc",
"hash_detection_unittest.cc",
"patterns_unittest.cc",
"payload_generator_unittest.cc",
"privacy_guard_unittest.cc",
"reporter_unittest.cc",
"server_config_loader_unittest.cc",
"signature_basename_unittest.cc",
]
deps = [
":browser",
Expand Down
132 changes: 132 additions & 0 deletions components/web_discovery/browser/double_fetcher.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
/* Copyright (c) 2024 The Brave Authors. All rights reserved.
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this file,
* You can obtain one at https://mozilla.org/MPL/2.0/. */

#include "brave/components/web_discovery/browser/double_fetcher.h"

#include <utility>

#include "brave/components/web_discovery/browser/pref_names.h"
#include "brave/components/web_discovery/browser/request_queue.h"
#include "brave/components/web_discovery/browser/util.h"
#include "components/prefs/pref_service.h"
#include "services/network/public/cpp/shared_url_loader_factory.h"
#include "services/network/public/cpp/simple_url_loader.h"
#include "services/network/public/mojom/url_response_head.mojom.h"

namespace web_discovery {

// File-local constants used by DoubleFetcher.
namespace {
// Keys of the dictionary persisted for each queued double fetch (written in
// ScheduleDoubleFetch, read back in OnFetchTimer / OnRequestComplete).
// `kUrlKey` holds the URL spec, `kAssociatedDataKey` the caller-supplied value.
constexpr char kUrlKey[] = "url";
constexpr char kAssociatedDataKey[] = "assoc_data";

// Scheduling parameters handed to the RequestQueue constructor.
// NOTE(review): exact semantics are defined by RequestQueue, which is not
// visible here; presumably entries older than `kRequestMaxAge` are discarded
// and requests fire at an interval between the min and max below
// (i.e. one minute +/- five seconds) - confirm against request_queue.h.
constexpr base::TimeDelta kRequestMaxAge = base::Hours(1);
constexpr base::TimeDelta kMinRequestInterval =
base::Minutes(1) - base::Seconds(5);
constexpr base::TimeDelta kMaxRequestInterval =
base::Minutes(1) + base::Seconds(5);
// Retry budget passed to RequestQueue.
constexpr size_t kMaxRetries = 3;
// Upper bound (2 MiB) passed to SimpleURLLoader::DownloadToString as the
// maximum response body size.
constexpr size_t kMaxDoubleFetchResponseSize = 2 * 1024 * 1024;

// Traffic annotation attached to every double fetch request created in
// OnFetchTimer. The raw string is consumed by annotation tooling; do not
// reformat it.
constexpr net::NetworkTrafficAnnotationTag kFetchNetworkTrafficAnnotation =
net::DefineNetworkTrafficAnnotation("wdp_doublefetch", R"(
semantics {
sender: "Brave Web Discovery Double Fetch"
description:
"Retrieves a page of interest without cookies for
scraping and reporting via Web Discovery."
trigger:
"Requests are sent minutes after the original
page request is made by the user."
data: "Page data"
destination: WEBSITE
}
policy {
cookies_allowed: NO
setting:
"Users can opt-in or out via brave://settings/search"
})");

} // namespace

// Constructs a double fetcher.
//
// `profile_prefs` and `shared_url_loader_factory` are stored as non-owning
// pointers. `callback` is invoked from OnRequestComplete with the result of
// each finished double fetch.
//
// The RequestQueue member is configured with the scheduling constants defined
// at the top of this file and with OnFetchTimer as its callback.
DoubleFetcher::DoubleFetcher(
    PrefService* profile_prefs,
    network::SharedURLLoaderFactory* shared_url_loader_factory,
    FetchedCallback callback)
    : profile_prefs_(profile_prefs),
      shared_url_loader_factory_(shared_url_loader_factory),
      // base::Unretained(this) relies on `request_queue_` being a member of
      // this object, so the queue cannot outlive `this`.
      // NOTE(review): assumes RequestQueue never runs the callback after its
      // own destruction - confirm against request_queue.cc.
      request_queue_(profile_prefs,
                     kScheduledDoubleFetches,
                     kRequestMaxAge,
                     kMinRequestInterval,
                     kMaxRequestInterval,
                     kMaxRetries,
                     base::BindRepeating(&DoubleFetcher::OnFetchTimer,
                                         base::Unretained(this))),
      // `callback` is a by-value sink parameter; move it instead of copying.
      callback_(std::move(callback)) {}

// Defaulted here rather than in the header: the header only forward-declares
// network::SimpleURLLoader, and destroying the std::unique_ptr member needs
// the complete type, which this file includes.
DoubleFetcher::~DoubleFetcher() = default;

void DoubleFetcher::ScheduleDoubleFetch(const GURL& url,
base::Value associated_data) {
base::Value::Dict fetch_dict;
fetch_dict.Set(kUrlKey, url.spec());
fetch_dict.Set(kAssociatedDataKey, std::move(associated_data));

request_queue_.ScheduleRequest(base::Value(std::move(fetch_dict)));
}

// Request queue callback: starts the network fetch for a queued entry.
//
// `request_data` is expected to be the dictionary built by
// ScheduleDoubleFetch. Entries that are not a dictionary, lack a URL string,
// or carry a URL that does not parse to a valid GURL cannot be fetched. They
// are reported to the queue as complete (`true`, i.e. not to be retried) and
// no fetch is started.
//
// Otherwise the URL is downloaded to a string, capped at
// `kMaxDoubleFetchResponseSize`, and OnRequestComplete is invoked with the
// result.
void DoubleFetcher::OnFetchTimer(const base::Value& request_data) {
  const auto* fetch_dict = request_data.GetIfDict();
  const auto* url_str = fetch_dict ? fetch_dict->FindString(kUrlKey) : nullptr;

  GURL url = url_str ? GURL(*url_str) : GURL();
  if (!url.is_valid()) {
    // Malformed persisted entry: retrying can never succeed, so drop it.
    request_queue_.NotifyRequestComplete(true);
    return;
  }

  auto resource_request = CreateResourceRequest(url);
  url_loader_ = network::SimpleURLLoader::Create(
      std::move(resource_request), kFetchNetworkTrafficAnnotation);
  // base::Unretained(this) relies on `url_loader_` being owned by this object:
  // destroying the loader cancels the pending callback.
  url_loader_->DownloadToString(
      shared_url_loader_factory_.get(),
      base::BindOnce(&DoubleFetcher::OnRequestComplete, base::Unretained(this),
                     std::move(url)),
      kMaxDoubleFetchResponseSize);
}

// Completion handler for the download started in OnFetchTimer.
//
// `url` is the URL that was fetched; `response_body` is the downloaded body,
// or std::nullopt if the loader produced none. ProcessCompletedRequest decides
// whether the attempt counts as successful (and may clear the body); that
// result is reported to the request queue.
//
// If the queue hands back the request data and it contains associated data,
// the fetched callback is run with the URL, the associated data and the body.
// NOTE(review): presumably the queue returns the data only once the request is
// finished (success or retries exhausted) - confirm against request_queue.cc.
void DoubleFetcher::OnRequestComplete(
    GURL url,
    std::optional<std::string> response_body) {
  const bool result = ProcessCompletedRequest(&response_body);

  auto request_data = request_queue_.NotifyRequestComplete(result);
  if (!request_data) {
    return;
  }

  // Use GetIfDict() (as OnFetchTimer does) so that unexpected persisted data
  // is ignored instead of tripping a CHECK.
  const auto* request_dict = request_data->GetIfDict();
  const auto* assoc_data =
      request_dict ? request_dict->Find(kAssociatedDataKey) : nullptr;
  if (!assoc_data) {
    return;
  }

  // Last use of `response_body`: move it to avoid copying a body that may be
  // as large as the download size cap.
  callback_.Run(url, *assoc_data, std::move(response_body));
}

// Inspects the response of the most recent load held by `url_loader_`.
//
// Returns false when the attempt should be treated as failed (the result is
// forwarded to RequestQueue::NotifyRequestComplete, which handles retries):
//   - no response info or no HTTP response headers are available, or
//   - the server answered with a 5xx status.
// Returns true otherwise. For non-2xx statuses below 500 the request is not
// retried, but `*response_body` is cleared so no body is reported.
//
// NOTE(review): the `!response_body` test guards the pointer only. An empty
// optional is deliberately not treated as a failure here, which keeps non-5xx
// error responses from being retried.
bool DoubleFetcher::ProcessCompletedRequest(
    std::optional<std::string>* response_body) {
  const auto* response_info = url_loader_->ResponseInfo();
  // `headers` may be null (e.g. a response without HTTP headers); guard it
  // before dereferencing.
  if (!response_body || !response_info || !response_info->headers) {
    return false;
  }

  const auto response_code = response_info->headers->response_code();
  if (response_code >= 200 && response_code < 300) {
    return true;
  }
  if (response_code >= 500) {
    // Only retry failures due to server error
    return false;
  }
  // Other non-success status: finished, but there is no usable body.
  response_body->reset();
  return true;
}

} // namespace web_discovery
67 changes: 67 additions & 0 deletions components/web_discovery/browser/double_fetcher.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
/* Copyright (c) 2024 The Brave Authors. All rights reserved.
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this file,
* You can obtain one at https://mozilla.org/MPL/2.0/. */

#ifndef BRAVE_COMPONENTS_WEB_DISCOVERY_BROWSER_DOUBLE_FETCHER_H_
#define BRAVE_COMPONENTS_WEB_DISCOVERY_BROWSER_DOUBLE_FETCHER_H_

#include <memory>
#include <optional>
#include <string>

#include "base/memory/raw_ptr.h"
#include "base/values.h"
#include "brave/components/web_discovery/browser/request_queue.h"
#include "url/gurl.h"

class PrefService;

namespace network {
class SharedURLLoaderFactory;
class SimpleURLLoader;
} // namespace network

namespace web_discovery {

// Makes anonymous requests to relevant page URLs, without involvement of the
// user's session. In the case of search engine result pages, the result of the
// double fetch will be scraped for search engine results for a future
// submission. Uses `RequestQueue` to persist and schedule double fetches.
// Requests will be sent at somewhat random intervals averaging one minute.
class DoubleFetcher {
public:
// Invoked when a double fetch has finished. Receives the fetched URL, the
// associated data supplied to ScheduleDoubleFetch, and the response body.
// The body is std::nullopt when no usable body is available (for example,
// the implementation clears it for non-2xx responses that are not retried).
using FetchedCallback =
base::RepeatingCallback<void(const GURL& url,
const base::Value& associated_data,
std::optional<std::string> response_body)>;
// `profile_prefs` and `shared_url_loader_factory` are stored as non-owning
// raw_ptr members.
// NOTE(review): callers presumably must keep both alive for the lifetime of
// this object - confirm at the call site.
DoubleFetcher(PrefService* profile_prefs,
network::SharedURLLoaderFactory* shared_url_loader_factory,
FetchedCallback callback);
~DoubleFetcher();

// Not copyable.
DoubleFetcher(const DoubleFetcher&) = delete;
DoubleFetcher& operator=(const DoubleFetcher&) = delete;

// Queues a double fetch for a given URL. The associated data will be stored
// beside the queue request, and will be passed to the `FetchedCallback`
// upon completion.
void ScheduleDoubleFetch(const GURL& url, base::Value associated_data);

private:
// Callback given to `request_queue_`; starts the network request described
// by `request_data`.
void OnFetchTimer(const base::Value& request_data);
// Completion handler of the URL loader; reports the outcome to
// `request_queue_` and runs `callback_` when request data is returned.
void OnRequestComplete(GURL url, std::optional<std::string> response_body);
// Examines the loader's response. The returned bool is forwarded to
// RequestQueue::NotifyRequestComplete; `*response_body` may be cleared.
bool ProcessCompletedRequest(std::optional<std::string>* response_body);

// Non-owning.
raw_ptr<PrefService> profile_prefs_;
// Non-owning; used to issue the double fetch requests.
raw_ptr<network::SharedURLLoaderFactory> shared_url_loader_factory_;
// Loader of the most recently started fetch; replaced on each OnFetchTimer.
std::unique_ptr<network::SimpleURLLoader> url_loader_;

// Persists and schedules the pending double fetches. Declared after
// `url_loader_` and before `callback_`; member order fixes construction and
// destruction order, so do not reorder casually.
RequestQueue request_queue_;

// Run from OnRequestComplete with the result of each finished fetch.
FetchedCallback callback_;
};

} // namespace web_discovery

#endif // BRAVE_COMPONENTS_WEB_DISCOVERY_BROWSER_DOUBLE_FETCHER_H_
Loading

0 comments on commit 37e3465

Please sign in to comment.