diff --git a/crates/learner/src/lib.rs b/crates/learner/src/lib.rs index 723f1ee..e20db56 100644 --- a/crates/learner/src/lib.rs +++ b/crates/learner/src/lib.rs @@ -218,7 +218,7 @@ pub const IACR_CONFIG: &str = include_str!("../config/retrievers/iacr.toml"); pub mod prelude { pub use crate::{ database::DatabaseInstruction, error::LearnerError, resource::Resource, - retriever::ResponseProcessor, + retriever::response::ResponseProcessor, }; } diff --git a/crates/learner/src/resource/mod.rs b/crates/learner/src/resource/mod.rs index b8475c6..b20e222 100644 --- a/crates/learner/src/resource/mod.rs +++ b/crates/learner/src/resource/mod.rs @@ -50,6 +50,14 @@ mod shared; pub use paper::*; pub use shared::*; +#[derive(Debug, Clone, Serialize, Deserialize)] +pub enum ResourceType { + Paper, + Book, + // We can add more built-in types as needed + Custom(PathBuf), // For user-defined resource types via config +} + /// Core trait that defines the behavior of a resource in the system. /// /// This trait provides a common interface for all resource types, whether they are diff --git a/crates/learner/src/retriever/config.rs b/crates/learner/src/retriever/config.rs new file mode 100644 index 0000000..6a95ff8 --- /dev/null +++ b/crates/learner/src/retriever/config.rs @@ -0,0 +1,105 @@ +use response::ResponseFormat; + +use super::*; + +/// Configuration for a specific paper source retriever. +/// +/// This struct defines how to interact with a particular paper source's API, +/// including URL patterns, authentication, and response parsing rules. 
+/// +/// # Examples +/// +/// Example TOML configuration: +/// +/// ```toml +/// name = "arxiv" +/// base_url = "http://export.arxiv.org/api/query" +/// pattern = "^(\\d{4}\\.\\d{4,5})$" +/// source = "arxiv" +/// endpoint_template = "http://export.arxiv.org/api/query?id_list={identifier}" +/// +/// [response_format] +/// type = "xml" +/// strip_namespaces = true +/// +/// [response_format.field_maps] +/// title = { path = "entry/title" } +/// abstract = { path = "entry/summary" } +/// publication_date = { path = "entry/published" } +/// authors = { path = "entry/author/name" } +/// ``` +#[derive(Debug, Clone, Deserialize)] +pub struct RetrieverConfig { + /// Name of this retriever configuration + pub name: String, + /// The type of resource this retriever should yield + pub resource_type: ResourceType, + /// Base URL for API requests + pub base_url: String, + /// Regex pattern for matching and extracting paper identifiers + #[serde(deserialize_with = "deserialize_regex")] + pub pattern: Regex, + /// Source identifier for papers from this retriever + pub source: String, + /// Template for constructing API endpoint URLs + pub endpoint_template: String, + /// Format and parsing configuration for API responses + pub response_format: ResponseFormat, + /// Optional HTTP headers for API requests + #[serde(default)] + pub headers: HashMap<String, String>, +} + +impl RetrieverConfig { + /// Extracts the canonical identifier from an input string. + /// + /// Uses the configured regex pattern to extract the standardized + /// identifier from various input formats (URLs, DOIs, etc.). 
+ /// + /// # Arguments + /// + /// * `input` - Input string containing a paper identifier + /// + /// # Returns + /// + /// Returns a Result containing either: + /// - The extracted identifier as a string slice + /// - A LearnerError if the input doesn't match the pattern + pub fn extract_identifier<'a>(&self, input: &'a str) -> Result<&'a str> { + self + .pattern + .captures(input) + .and_then(|cap| cap.get(1)) + .map(|m| m.as_str()) + .ok_or(LearnerError::InvalidIdentifier) + } + + pub async fn retrieve_paper(&self, input: &str) -> Result<Paper> { + let identifier = self.extract_identifier(input)?; + let url = self.endpoint_template.replace("{identifier}", identifier); + + debug!("Fetching from {} via: {}", self.name, url); + + let client = reqwest::Client::new(); + let mut request = client.get(&url); + + // Add any configured headers + for (key, value) in &self.headers { + request = request.header(key, value); + } + + let response = request.send().await?; + let data = response.bytes().await?; + + trace!("{} response: {}", self.name, String::from_utf8_lossy(&data)); + + let response_processor = match &self.response_format { + ResponseFormat::Xml(config) => config as &dyn ResponseProcessor, + ResponseFormat::Json(config) => config as &dyn ResponseProcessor, + }; + let mut paper = response_processor.process_response(&data).await?; + paper.source = self.source.clone(); + paper.source_identifier = identifier.to_string(); + Ok(paper) + } +} diff --git a/crates/learner/src/retriever/mod.rs b/crates/learner/src/retriever/mod.rs index 11c0726..c2cc8ce 100644 --- a/crates/learner/src/retriever/mod.rs +++ b/crates/learner/src/retriever/mod.rs @@ -66,10 +66,12 @@ use std::collections::HashMap; +use self::{config::RetrieverConfig, response::Transform}; use super::*; +use crate::resource::ResourceType; -pub mod json; -pub mod xml; +pub mod config; +pub mod response; /// Main entry point for paper retrieval operations. 
/// @@ -122,189 +124,6 @@ impl Retriever { pub fn is_empty(&self) -> bool { self.configs.is_empty() } } -/// Configuration for a specific paper source retriever. -/// -/// This struct defines how to interact with a particular paper source's API, -/// including URL patterns, authentication, and response parsing rules. -/// -/// # Examples -/// -/// Example TOML configuration: -/// -/// ```toml -/// name = "arxiv" -/// base_url = "http://export.arxiv.org/api/query" -/// pattern = "^\\d{4}\\.\\d{4,5}$" -/// source = "arxiv" -/// endpoint_template = "http://export.arxiv.org/api/query?id_list={identifier}" -/// -/// [response_format] -/// type = "xml" -/// strip_namespaces = true -/// -/// [response_format.field_maps] -/// title = { path = "entry/title" } -/// abstract = { path = "entry/summary" } -/// publication_date = { path = "entry/published" } -/// authors = { path = "entry/author/name" } -/// ``` -#[derive(Debug, Clone, Deserialize)] -pub struct RetrieverConfig { - /// Name of this retriever configuration - pub name: String, - /// Base URL for API requests - pub base_url: String, - /// Regex pattern for matching and extracting paper identifiers - #[serde(deserialize_with = "deserialize_regex")] - pub pattern: Regex, - /// Source identifier for papers from this retriever - pub source: String, - /// Template for constructing API endpoint URLs - pub endpoint_template: String, - /// Format and parsing configuration for API responses - pub response_format: ResponseFormat, - /// Optional HTTP headers for API requests - #[serde(default)] - pub headers: HashMap<String, String>, -} - -/// Available response format handlers. -/// -/// Specifies how to parse and extract paper metadata from API responses -/// in different formats. 
-/// -/// # Examples -/// -/// XML configuration: -/// ```toml -/// [response_format] -/// type = "xml" -/// strip_namespaces = true -/// -/// [response_format.field_maps] -/// title = { path = "entry/title" } -/// ``` -/// -/// JSON configuration: -/// ```toml -/// [response_format] -/// type = "json" -/// -/// [response_format.field_maps] -/// title = { path = "message/title/0" } -/// ``` -#[derive(Debug, Clone, Deserialize)] -#[serde(tag = "type")] -pub enum ResponseFormat { - /// XML response parser configuration - #[serde(rename = "xml")] - Xml(xml::XmlConfig), - /// JSON response parser configuration - #[serde(rename = "json")] - Json(json::JsonConfig), -} - -/// Field mapping configuration. -/// -/// Defines how to extract and transform specific fields from API responses. -/// -/// # Examples -/// -/// ```toml -/// [field_maps.title] -/// path = "entry/title" -/// transform = { type = "replace", pattern = "\\s+", replacement = " " } -/// ``` -#[derive(Debug, Clone, Deserialize)] -pub struct FieldMap { - /// Path to field in response (e.g., JSON path or XPath) - pub path: String, - /// Optional transformation to apply to extracted value - #[serde(default)] - pub transform: Option<Transform>, -} - -/// Available field value transformations. -/// -/// Transformations that can be applied to extracted field values -/// before constructing the final Paper object. 
-/// -/// # Examples -/// -/// ```toml -/// # Clean up whitespace -/// transform = { type = "replace", pattern = "\\s+", replacement = " " } -/// -/// # Convert date format -/// transform = { type = "date", from_format = "%Y-%m-%d", to_format = "%Y-%m-%dT00:00:00Z" } -/// -/// # Construct full URL -/// transform = { type = "url", base = "https://example.com/", suffix = ".pdf" } -/// ``` -#[derive(Debug, Clone, Deserialize)] -#[serde(tag = "type")] -pub enum Transform { - /// Replace text using regex pattern - Replace { - /// Regular expression pattern to match - pattern: String, - /// Text to replace matched patterns with - replacement: String, - }, - /// Convert between date formats - Date { - /// Source date format string using chrono syntax (e.g., "%Y-%m-%d") - from_format: String, - /// Target date format string using chrono syntax (e.g., "%Y-%m-%dT%H:%M:%SZ") - to_format: String, - }, - /// Construct URL from parts - Url { - /// Base URL template, may contain {value} placeholder - base: String, - /// Optional suffix to append to the URL (e.g., ".pdf") - suffix: Option<String>, - }, -} - -/// Trait for processing API responses into Paper objects. -/// -/// Implementors of this trait handle the conversion of raw API response data -/// into structured Paper metadata. The trait is implemented separately for -/// different response formats (XML, JSON) to provide a unified interface for -/// paper retrieval. -/// -/// # Examples -/// -/// ```no_run -/// # use learner::{retriever::ResponseProcessor, resource::Paper}; -/// # use learner::error::LearnerError; -/// struct CustomProcessor; -/// -/// #[async_trait::async_trait] -/// impl ResponseProcessor for CustomProcessor { -/// async fn process_response(&self, data: &[u8]) -> Result<Paper, LearnerError> { -/// // Parse response data and construct Paper -/// todo!() -/// } -/// } -/// ``` -#[async_trait] -pub trait ResponseProcessor: Send + Sync { - /// Process raw response data into a Paper object. 
- /// - /// # Arguments - /// - /// * `data` - Raw bytes from the API response - /// - /// # Returns - /// - /// Returns a Result containing either: - /// - A fully populated Paper object - /// - A LearnerError if parsing fails - async fn process_response(&self, data: &[u8]) -> Result<Paper>; -} - impl Retriever { /// Creates a new empty retriever with no configurations. /// @@ -577,84 +396,6 @@ impl Retriever { } } -impl RetrieverConfig { - /// Extracts the canonical identifier from an input string. - /// - /// Uses the configured regex pattern to extract the standardized - /// identifier from various input formats (URLs, DOIs, etc.). - /// - /// # Arguments - /// - /// * `input` - Input string containing a paper identifier - /// - /// # Returns - /// - /// Returns a Result containing either: - /// - The extracted identifier as a string slice - /// - A LearnerError if the input doesn't match the pattern - pub fn extract_identifier<'a>(&self, input: &'a str) -> Result<&'a str> { - self - .pattern - .captures(input) - .and_then(|cap| cap.get(1)) - .map(|m| m.as_str()) - .ok_or(LearnerError::InvalidIdentifier) - } - - /// Retrieves a paper using this configuration. - /// - /// This method: - /// 1. Extracts the canonical identifier - /// 2. Constructs the API URL - /// 3. Makes the HTTP request - /// 4. 
Processes the response - /// - /// # Arguments - /// - /// * `input` - Paper identifier or URL - /// - /// # Returns - /// - /// Returns a Result containing either: - /// - The retrieved Paper object - /// - A LearnerError if any step fails - /// - /// # Errors - /// - /// This method will return an error if: - /// - The identifier cannot be extracted - /// - The HTTP request fails - /// - The response cannot be parsed - pub async fn retrieve_paper(&self, input: &str) -> Result<Paper> { - let identifier = self.extract_identifier(input)?; - let url = self.endpoint_template.replace("{identifier}", identifier); - - debug!("Fetching from {} via: {}", self.name, url); - - let client = reqwest::Client::new(); - let mut request = client.get(&url); - - // Add any configured headers - for (key, value) in &self.headers { - request = request.header(key, value); - } - - let response = request.send().await?; - let data = response.bytes().await?; - - trace!("{} response: {}", self.name, String::from_utf8_lossy(&data)); - - let response_processor = match &self.response_format { - ResponseFormat::Xml(config) => config as &dyn ResponseProcessor, - ResponseFormat::Json(config) => config as &dyn ResponseProcessor, - }; - let mut paper = response_processor.process_response(&data).await?; - paper.source = self.source.clone(); - paper.source_identifier = identifier.to_string(); - Ok(paper) - } -} - /// Custom deserializer for converting string patterns into Regex objects. 
/// /// Used with serde's derive functionality to automatically deserialize diff --git a/crates/learner/src/retriever/json.rs b/crates/learner/src/retriever/response/json.rs similarity index 100% rename from crates/learner/src/retriever/json.rs rename to crates/learner/src/retriever/response/json.rs diff --git a/crates/learner/src/retriever/response/mod.rs b/crates/learner/src/retriever/response/mod.rs new file mode 100644 index 0000000..73f938d --- /dev/null +++ b/crates/learner/src/retriever/response/mod.rs @@ -0,0 +1,141 @@ +use super::*; + +pub mod json; +pub mod xml; + +/// Available response format handlers. +/// +/// Specifies how to parse and extract paper metadata from API responses +/// in different formats. +/// +/// # Examples +/// +/// XML configuration: +/// ```toml +/// [response_format] +/// type = "xml" +/// strip_namespaces = true +/// +/// [response_format.field_maps] +/// title = { path = "entry/title" } +/// ``` +/// +/// JSON configuration: +/// ```toml +/// [response_format] +/// type = "json" +/// +/// [response_format.field_maps] +/// title = { path = "message/title/0" } +/// ``` +#[derive(Debug, Clone, Deserialize)] +#[serde(tag = "type")] +pub enum ResponseFormat { + /// XML response parser configuration + #[serde(rename = "xml")] + Xml(xml::XmlConfig), + /// JSON response parser configuration + #[serde(rename = "json")] + Json(json::JsonConfig), +} + +/// Field mapping configuration. +/// +/// Defines how to extract and transform specific fields from API responses. +/// +/// # Examples +/// +/// ```toml +/// [field_maps.title] +/// path = "entry/title" +/// transform = { type = "replace", pattern = "\\s+", replacement = " " } +/// ``` +#[derive(Debug, Clone, Deserialize)] +pub struct FieldMap { + /// Path to field in response (e.g., JSON path or XPath) + pub path: String, + /// Optional transformation to apply to extracted value + #[serde(default)] + pub transform: Option<Transform>, +} + +/// Available field value transformations. 
+/// +/// Transformations that can be applied to extracted field values +/// before constructing the final Paper object. +/// +/// # Examples +/// +/// ```toml +/// # Clean up whitespace +/// transform = { type = "replace", pattern = "\\s+", replacement = " " } +/// +/// # Convert date format +/// transform = { type = "date", from_format = "%Y-%m-%d", to_format = "%Y-%m-%dT00:00:00Z" } +/// +/// # Construct full URL +/// transform = { type = "url", base = "https://example.com/", suffix = ".pdf" } +/// ``` +#[derive(Debug, Clone, Deserialize)] +#[serde(tag = "type")] +pub enum Transform { + /// Replace text using regex pattern + Replace { + /// Regular expression pattern to match + pattern: String, + /// Text to replace matched patterns with + replacement: String, + }, + /// Convert between date formats + Date { + /// Source date format string using chrono syntax (e.g., "%Y-%m-%d") + from_format: String, + /// Target date format string using chrono syntax (e.g., "%Y-%m-%dT%H:%M:%SZ") + to_format: String, + }, + /// Construct URL from parts + Url { + /// Base URL template, may contain {value} placeholder + base: String, + /// Optional suffix to append to the URL (e.g., ".pdf") + suffix: Option<String>, + }, +} + +/// Trait for processing API responses into Paper objects. +/// +/// Implementors of this trait handle the conversion of raw API response data +/// into structured Paper metadata. The trait is implemented separately for +/// different response formats (XML, JSON) to provide a unified interface for +/// paper retrieval. 
+/// +/// # Examples +/// +/// ```no_run +/// # use learner::{retriever::response::ResponseProcessor, resource::Paper}; +/// # use learner::error::LearnerError; +/// struct CustomProcessor; +/// +/// #[async_trait::async_trait] +/// impl ResponseProcessor for CustomProcessor { +/// async fn process_response(&self, data: &[u8]) -> Result<Paper, LearnerError> { +/// // Parse response data and construct Paper +/// todo!() +/// } +/// } +/// ``` +#[async_trait] +pub trait ResponseProcessor: Send + Sync { + /// Process raw response data into a Paper object. + /// + /// # Arguments + /// + /// * `data` - Raw bytes from the API response + /// + /// # Returns + /// + /// Returns a Result containing either: + /// - A fully populated Paper object + /// - A LearnerError if parsing fails + async fn process_response(&self, data: &[u8]) -> Result<Paper>; +} diff --git a/crates/learner/src/retriever/xml.rs b/crates/learner/src/retriever/response/xml.rs similarity index 100% rename from crates/learner/src/retriever/xml.rs rename to crates/learner/src/retriever/response/xml.rs diff --git a/crates/learner/tests/workflows/build_retriever.rs b/crates/learner/tests/workflows/build_retriever.rs index 29d6995..3a7051b 100644 --- a/crates/learner/tests/workflows/build_retriever.rs +++ b/crates/learner/tests/workflows/build_retriever.rs @@ -1,6 +1,8 @@ use std::fs::read_to_string; -use learner::retriever::{ResponseFormat, RetrieverConfig, Transform}; +use learner::retriever::response::{ResponseFormat, Transform}; + +use super::*; #[test] fn test_arxiv_config_deserialization() { diff --git a/crates/learner/tests/workflows/mod.rs b/crates/learner/tests/workflows/mod.rs index 3adb178..add8da3 100644 --- a/crates/learner/tests/workflows/mod.rs +++ b/crates/learner/tests/workflows/mod.rs @@ -1,4 +1,4 @@ -use learner::retriever::RetrieverConfig; +use learner::retriever::config::RetrieverConfig; use super::*; diff --git a/crates/sdk/src/validate.rs b/crates/sdk/src/validate.rs index 5d4f80e..92bb056 100644 --- 
a/crates/sdk/src/validate.rs +++ b/crates/sdk/src/validate.rs @@ -2,7 +2,7 @@ use std::fs::read_to_string; use learner::{ resource::ResourceConfig, - retriever::{ResponseFormat, RetrieverConfig}, + retriever::{config::RetrieverConfig, response::ResponseFormat}, }; use super::*;