From 4fd71ad9f9dc070b744e1fd434673e25e59cc3e8 Mon Sep 17 00:00:00 2001 From: Martin Pool Date: Sat, 21 Dec 2024 07:38:12 -0800 Subject: [PATCH 01/13] Use ptr::write when writing uninitialized memory --- crates/duckdb/examples/hello-ext/main.rs | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/crates/duckdb/examples/hello-ext/main.rs b/crates/duckdb/examples/hello-ext/main.rs index 6f159e9a..823065e0 100644 --- a/crates/duckdb/examples/hello-ext/main.rs +++ b/crates/duckdb/examples/hello-ext/main.rs @@ -12,6 +12,7 @@ use libduckdb_sys as ffi; use std::{ error::Error, ffi::{c_char, c_void, CString}, + ptr, }; #[repr(C)] @@ -47,14 +48,19 @@ impl VTab for HelloVTab { bind.add_result_column("column0", LogicalTypeHandle::from(LogicalTypeId::Varchar)); let param = bind.get_parameter(0).to_string(); unsafe { - (*data).name = CString::new(param).unwrap().into_raw(); + ptr::write( + data, + HelloBindData { + name: CString::new(param).unwrap().into_raw(), + }, + ); } Ok(()) } unsafe fn init(_: &InitInfo, data: *mut HelloInitData) -> Result<(), Box> { unsafe { - (*data).done = false; + ptr::write(data, HelloInitData { done: false }); } Ok(()) } From f109bc9b2c8290c14e7e6bf9504338164be16a56 Mon Sep 17 00:00:00 2001 From: Martin Pool Date: Sat, 21 Dec 2024 07:38:43 -0800 Subject: [PATCH 02/13] Use smaller unsafe blocks --- crates/duckdb/examples/hello-ext/main.rs | 30 ++++++++++++------------ 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/crates/duckdb/examples/hello-ext/main.rs b/crates/duckdb/examples/hello-ext/main.rs index 823065e0..67c886f9 100644 --- a/crates/duckdb/examples/hello-ext/main.rs +++ b/crates/duckdb/examples/hello-ext/main.rs @@ -1,3 +1,5 @@ +#![warn(unsafe_op_in_unsafe_fn)] + extern crate duckdb; extern crate duckdb_loadable_macros; extern crate libduckdb_sys; @@ -66,22 +68,20 @@ impl VTab for HelloVTab { } unsafe fn func(func: &FunctionInfo, output: &mut DataChunkHandle) -> Result<(), Box> { - let init_info = func.get_init_data::(); - let bind_info = func.get_bind_data::(); + let init_info = unsafe { func.get_init_data::().as_mut().unwrap() }; + let bind_info = unsafe { func.get_bind_data::().as_mut().unwrap() }; - unsafe { - if (*init_info).done { - output.set_len(0); - } else { - (*init_info).done = true; - let vector = output.flat_vector(0); - let name = CString::from_raw((*bind_info).name); - let result = CString::new(format!("Hello {}", name.to_str()?))?; - // Can't consume the CString - (*bind_info).name = CString::into_raw(name); - vector.insert(0, result); - output.set_len(1); - } + if init_info.done { + output.set_len(0); + } else { + init_info.done = true; + let vector = output.flat_vector(0); + let name = unsafe { CString::from_raw((*bind_info).name) }; + let result = CString::new(format!("Hello {}", name.to_str()?))?; + // Can't consume the CString + bind_info.name = CString::into_raw(name); + vector.insert(0, result); + output.set_len(1); } Ok(()) } From b6df65027653c687c9a07275366a5a0641329d4f Mon Sep 17 00:00:00 2001 From: Martin Pool Date: Sat, 21 Dec 2024 07:41:36 -0800 Subject: [PATCH 03/13] Rust BindData/InitData can just use Rust types --- crates/duckdb/examples/hello-ext/main.rs | 29 ++++-------------------- 1 file changed, 5 insertions(+), 24 deletions(-) diff --git a/crates/duckdb/examples/hello-ext/main.rs b/crates/duckdb/examples/hello-ext/main.rs index 67c886f9..9d729d48 100644 --- a/crates/duckdb/examples/hello-ext/main.rs +++ b/crates/duckdb/examples/hello-ext/main.rs @@ -17,23 +17,12 @@ use std::{ ptr, }; -#[repr(C)] struct HelloBindData { - name: *mut c_char, + name: String, } -impl Free for HelloBindData { - fn free(&mut self) { - unsafe { - if self.name.is_null() { - return; - } - drop(CString::from_raw(self.name)); - } - } -} +impl Free for HelloBindData {} -#[repr(C)] struct HelloInitData { done: bool, } @@ -48,14 +37,9 @@ impl VTab for HelloVTab { unsafe fn bind(bind: &BindInfo, data: *mut HelloBindData) -> Result<(), Box> { bind.add_result_column("column0", LogicalTypeHandle::from(LogicalTypeId::Varchar)); - let param = bind.get_parameter(0).to_string(); + let name = bind.get_parameter(0).to_string(); unsafe { - ptr::write( - data, - HelloBindData { - name: CString::new(param).unwrap().into_raw(), - }, - ); + ptr::write(data, HelloBindData { name }); } Ok(()) } @@ -76,10 +60,7 @@ impl VTab for HelloVTab { } else { init_info.done = true; let vector = output.flat_vector(0); - let name = unsafe { CString::from_raw((*bind_info).name) }; - let result = CString::new(format!("Hello {}", name.to_str()?))?; - // Can't consume the CString - bind_info.name = CString::into_raw(name); + let result = CString::new(format!("Hello {}", bind_info.name))?; vector.insert(0, result); output.set_len(1); } From d9b0952653f692206bd55474e81c4d14fb65d86f Mon Sep 17 00:00:00 2001 From: Martin Pool Date: Sat, 21 Dec 2024 07:43:44 -0800 Subject: [PATCH 04/13] Use unsafe blocks inside unsafe fns generated by macro This will prevent the macro generating a warning in edition 2024 --- crates/duckdb-loadable-macros/src/lib.rs | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/crates/duckdb-loadable-macros/src/lib.rs b/crates/duckdb-loadable-macros/src/lib.rs index d82cde71..136f27f5 100644 --- a/crates/duckdb-loadable-macros/src/lib.rs +++ b/crates/duckdb-loadable-macros/src/lib.rs @@ -133,8 +133,10 @@ pub fn duckdb_entrypoint(_attr: TokenStream, item: TokenStream) -> TokenStream { /// Will be called by duckdb #[no_mangle] pub unsafe extern "C" fn #c_entrypoint(db: *mut c_void) { - let connection = Connection::open_from_raw(db.cast()).expect("can't open db connection"); - #prefixed_original_function(connection).expect("init failed"); + unsafe { + let connection = Connection::open_from_raw(db.cast()).expect("can't open db connection"); + #prefixed_original_function(connection).expect("init failed"); + } } /// # Safety @@ -142,7 +144,9 @@ pub fn duckdb_entrypoint(_attr: TokenStream, item: TokenStream) -> TokenStream { /// Predefined function, don't need to change unless you are sure #[no_mangle] pub unsafe extern "C" fn #c_entrypoint_version() -> *const c_char { - ffi::duckdb_library_version() + unsafe { + ffi::duckdb_library_version() + } } From 49c841fe7945f6b0a6aef2de36fb5916886cc7a4 Mon Sep 17 00:00:00 2001 From: Martin Pool Date: Sat, 21 Dec 2024 07:44:56 -0800 Subject: [PATCH 05/13] Fix unused import --- crates/duckdb/src/vtab/mod.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/crates/duckdb/src/vtab/mod.rs b/crates/duckdb/src/vtab/mod.rs index 9249fb1e..7f034c14 100644 --- a/crates/duckdb/src/vtab/mod.rs +++ b/crates/duckdb/src/vtab/mod.rs @@ -20,7 +20,7 @@ mod excel; pub use function::{BindInfo, FunctionInfo, InitInfo, TableFunction}; pub use value::Value; -use crate::core::{DataChunkHandle, LogicalTypeHandle, LogicalTypeId}; +use crate::core::{DataChunkHandle, LogicalTypeHandle}; use ffi::{duckdb_bind_info, duckdb_data_chunk, duckdb_function_info, duckdb_init_info}; use ffi::duckdb_malloc; @@ -194,6 +194,7 @@ impl InnerConnection { mod test { use super::*; use crate::core::Inserter; + use crate::core::LogicalTypeId; use std::{ error::Error, ffi::{c_char, CString}, From a032088ec5741566311e681c35ef03c8d21eb87a Mon Sep 17 00:00:00 2001 From: Martin Pool Date: Sat, 21 Dec 2024 07:49:31 -0800 Subject: [PATCH 06/13] Better safety docs for vtab methods Contrary to the previous docs, the instances passed to these functions are *not* initialized by the caller. Rather, the called function is responsible for writing into uninitialized memory. --- crates/duckdb/src/vtab/mod.rs | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/crates/duckdb/src/vtab/mod.rs b/crates/duckdb/src/vtab/mod.rs index 7f034c14..cb7b9208 100644 --- a/crates/duckdb/src/vtab/mod.rs +++ b/crates/duckdb/src/vtab/mod.rs @@ -65,27 +65,22 @@ pub trait VTab: Sized { /// /// # Safety /// - /// This function is unsafe because it dereferences raw pointers (`data`) and manipulates the memory directly. - /// The caller must ensure that: - /// - /// - The `data` pointer is valid and points to a properly initialized `BindData` instance. - /// - The lifetime of `data` must outlive the execution of `bind` to avoid dangling pointers, especially since - /// `bind` does not take ownership of `data`. - /// - Concurrent access to `data` (if applicable) must be properly synchronized. - /// - The `bind` object must be valid and correctly initialized. + /// `data` points to an *uninitialized* block of memory large enough to hold a `Self::BindData` value. + /// The implementation should initialize this memory with the appropriate data for the table function, + /// without reading the existing memory, + /// typically using [`std::ptr::write`] or similar. unsafe fn bind(bind: &BindInfo, data: *mut Self::BindData) -> Result<(), Box>; + /// Initialize the table function /// /// # Safety /// - /// This function is unsafe because it performs raw pointer dereferencing on the `data` argument. - /// The caller is responsible for ensuring that: - /// - /// - The `data` pointer is non-null and points to a valid `InitData` instance. - /// - There is no data race when accessing `data`, meaning if `data` is accessed from multiple threads, - /// proper synchronization is required. - /// - The lifetime of `data` extends beyond the scope of this call to avoid use-after-free errors. + /// `data` points to an *uninitialized* block of memory large enough to hold a `Self::InitData` value. + /// The implementation should initialize this memory with the appropriate data for the table function, + /// without reading the existing memory, + /// typically using [`std::ptr::write`] or similar. unsafe fn init(init: &InitInfo, data: *mut Self::InitData) -> Result<(), Box>; + /// The actual function /// /// # Safety From 80c7c0c63a5ee26c8c4db5c4e728485727baba1e Mon Sep 17 00:00:00 2001 From: Martin Pool Date: Sat, 21 Dec 2024 07:59:17 -0800 Subject: [PATCH 07/13] Similar safety fixes in vtab tests --- crates/duckdb/src/vtab/mod.rs | 53 +++++++++++++---------------------- 1 file changed, 19 insertions(+), 34 deletions(-) diff --git a/crates/duckdb/src/vtab/mod.rs b/crates/duckdb/src/vtab/mod.rs index cb7b9208..078369ea 100644 --- a/crates/duckdb/src/vtab/mod.rs +++ b/crates/duckdb/src/vtab/mod.rs @@ -193,25 +193,15 @@ mod test { use std::{ error::Error, ffi::{c_char, CString}, + ptr, }; - #[repr(C)] struct HelloBindData { - name: *mut c_char, + name: String, } - impl Free for HelloBindData { - fn free(&mut self) { - unsafe { - if self.name.is_null() { - return; - } - drop(CString::from_raw(self.name)); - } - } - } + impl Free for HelloBindData {} - #[repr(C)] struct HelloInitData { done: bool, } @@ -226,37 +216,32 @@ mod test { unsafe fn bind(bind: &BindInfo, data: *mut HelloBindData) -> Result<(), Box> { bind.add_result_column("column0", LogicalTypeHandle::from(LogicalTypeId::Varchar)); - let param = bind.get_parameter(0).to_string(); + let name = bind.get_parameter(0).to_string(); unsafe { - (*data).name = CString::new(param).unwrap().into_raw(); + ptr::write(data, HelloBindData { name }); } Ok(()) } unsafe fn init(_: &InitInfo, data: *mut HelloInitData) -> Result<(), Box> { unsafe { - (*data).done = false; + ptr::write(data, HelloInitData { done: false }); } Ok(()) } unsafe fn func(func: &FunctionInfo, output: &mut DataChunkHandle) -> Result<(), Box> { - let init_info = func.get_init_data::(); - let bind_info = func.get_bind_data::(); - - unsafe { - if (*init_info).done { - output.set_len(0); - } else { - (*init_info).done = true; - let vector = output.flat_vector(0); - let name = CString::from_raw((*bind_info).name); - let result = CString::new(format!("Hello {}", name.to_str()?))?; - // Can't consume the CString - (*bind_info).name = CString::into_raw(name); - vector.insert(0, result); - output.set_len(1); - } + let init_info = unsafe { func.get_init_data::().as_mut().unwrap() }; + let bind_info = unsafe { func.get_bind_data::().as_ref().unwrap() }; + + if init_info.done { + output.set_len(0); + } else { + init_info.done = true; + let vector = output.flat_vector(0); + let result = CString::new(format!("Hello {}", bind_info.name))?; + vector.insert(0, result); + output.set_len(1); } Ok(()) } @@ -273,10 +258,10 @@ mod test { unsafe fn bind(bind: &BindInfo, data: *mut HelloBindData) -> Result<(), Box> { bind.add_result_column("column0", LogicalTypeHandle::from(LogicalTypeId::Varchar)); - let param = bind.get_named_parameter("name").unwrap().to_string(); + let name = bind.get_named_parameter("name").unwrap().to_string(); assert!(bind.get_named_parameter("unknown_name").is_none()); unsafe { - (*data).name = CString::new(param).unwrap().into_raw(); + ptr::write(data, HelloBindData { name }); } Ok(()) } From 5947f6d463c82b4114bd9784fceea958f9c9e144 Mon Sep 17 00:00:00 2001 From: Martin Pool Date: Sat, 21 Dec 2024 08:41:40 -0800 Subject: [PATCH 08/13] VTab::bind and init are now safe Rather than passing a pointer to a block of uninitialized memory, which can easily lead to UB, these functions now just return Rust objects. This improves #414 by reducing the amount of unsafe code needed from extensions. --- crates/duckdb/examples/hello-ext/main.rs | 15 +--- crates/duckdb/src/vtab/mod.rs | 107 +++++++++-------------- 2 files changed, 46 insertions(+), 76 deletions(-) diff --git a/crates/duckdb/examples/hello-ext/main.rs b/crates/duckdb/examples/hello-ext/main.rs index 9d729d48..3f19e69e 100644 --- a/crates/duckdb/examples/hello-ext/main.rs +++ b/crates/duckdb/examples/hello-ext/main.rs @@ -14,7 +14,6 @@ use libduckdb_sys as ffi; use std::{ error::Error, ffi::{c_char, c_void, CString}, - ptr, }; struct HelloBindData { @@ -35,20 +34,14 @@ impl VTab for HelloVTab { type InitData = HelloInitData; type BindData = HelloBindData; - unsafe fn bind(bind: &BindInfo, data: *mut HelloBindData) -> Result<(), Box> { + fn bind(bind: &BindInfo) -> Result> { bind.add_result_column("column0", LogicalTypeHandle::from(LogicalTypeId::Varchar)); let name = bind.get_parameter(0).to_string(); - unsafe { - ptr::write(data, HelloBindData { name }); - } - Ok(()) + Ok(HelloBindData { name }) } - unsafe fn init(_: &InitInfo, data: *mut HelloInitData) -> Result<(), Box> { - unsafe { - ptr::write(data, HelloInitData { done: false }); - } - Ok(()) + fn init(_: &InitInfo) -> Result> { + Ok(HelloInitData { done: false }) } unsafe fn func(func: &FunctionInfo, output: &mut DataChunkHandle) -> Result<(), Box> { diff --git a/crates/duckdb/src/vtab/mod.rs b/crates/duckdb/src/vtab/mod.rs index 078369ea..06bf541f 100644 --- a/crates/duckdb/src/vtab/mod.rs +++ b/crates/duckdb/src/vtab/mod.rs @@ -1,8 +1,11 @@ -use crate::{error::Error, inner_connection::InnerConnection, Connection, Result}; +// #![warn(unsafe_op_in_unsafe_fn)] -use super::{ffi, ffi::duckdb_free}; use std::ffi::c_void; +use crate::{error::Error, inner_connection::InnerConnection, Connection, Result}; + +use super::ffi; + mod function; mod value; @@ -23,26 +26,12 @@ pub use value::Value; use crate::core::{DataChunkHandle, LogicalTypeHandle}; use ffi::{duckdb_bind_info, duckdb_data_chunk, duckdb_function_info, duckdb_init_info}; -use ffi::duckdb_malloc; -use std::mem::size_of; - -/// duckdb_malloc a struct of type T -/// used for the bind_info and init_info -/// # Safety -/// This function is obviously unsafe -unsafe fn malloc_data_c() -> *mut T { - duckdb_malloc(size_of::()).cast() -} - -/// free bind or info data +/// Given a raw pointer to a box, free the box and the data contained within it. /// /// # Safety -/// This function is obviously unsafe -/// TODO: maybe we should use a Free trait here -unsafe extern "C" fn drop_data_c(v: *mut c_void) { - let actual = v.cast::(); - (*actual).free(); - duckdb_free(v); +/// The pointer must be a valid pointer to a `Box` created by `Box::into_raw`. +unsafe extern "C" fn drop_boxed(v: *mut c_void) { + drop(unsafe { Box::from_raw(v.cast::()) }); } /// Free trait for the bind and init data @@ -59,27 +48,15 @@ pub trait VTab: Sized { /// The data type of the bind data type InitData: Sized + Free; /// The data type of the init data - type BindData: Sized + Free; + type BindData: Sized + Free; // TODO: and maybe Send + Sync as this might be called from multiple threads? + + // TODO: Get rid of Free, just use Drop? /// Bind data to the table function - /// - /// # Safety - /// - /// `data` points to an *uninitialized* block of memory large enough to hold a `Self::BindData` value. - /// The implementation should initialize this memory with the appropriate data for the table function, - /// without reading the existing memory, - /// typically using [`std::ptr::write`] or similar. - unsafe fn bind(bind: &BindInfo, data: *mut Self::BindData) -> Result<(), Box>; + fn bind(bind: &BindInfo) -> Result>; /// Initialize the table function - /// - /// # Safety - /// - /// `data` points to an *uninitialized* block of memory large enough to hold a `Self::InitData` value. - /// The implementation should initialize this memory with the appropriate data for the table function, - /// without reading the existing memory, - /// typically using [`std::ptr::write`] or similar. - unsafe fn init(init: &InitInfo, data: *mut Self::InitData) -> Result<(), Box>; + fn init(init: &InitInfo) -> Result>; /// The actual function /// @@ -130,11 +107,16 @@ where T: VTab, { let info = InitInfo::from(info); - let data = malloc_data_c::(); - let result = T::init(&info, data); - info.set_init_data(data.cast(), Some(drop_data_c::)); - if result.is_err() { - info.set_error(&result.err().unwrap().to_string()); + match T::init(&info) { + Ok(init_data) => { + info.set_init_data( + Box::into_raw(Box::new(init_data)) as *mut c_void, + Some(drop_boxed::), + ); + } + Err(e) => { + info.set_error(&e.to_string()); + } } } @@ -143,11 +125,16 @@ where T: VTab, { let info = BindInfo::from(info); - let data = malloc_data_c::(); - let result = T::bind(&info, data); - info.set_bind_data(data.cast(), Some(drop_data_c::)); - if result.is_err() { - info.set_error(&result.err().unwrap().to_string()); + match T::bind(&info) { + Ok(bind_data) => { + info.set_bind_data( + Box::into_raw(Box::new(bind_data)) as *mut c_void, + Some(drop_boxed::), + ); + } + Err(e) => { + info.set_error(&e.to_string()); + } } } @@ -193,7 +180,6 @@ mod test { use std::{ error::Error, ffi::{c_char, CString}, - ptr, }; struct HelloBindData { @@ -214,20 +200,14 @@ mod test { type InitData = HelloInitData; type BindData = HelloBindData; - unsafe fn bind(bind: &BindInfo, data: *mut HelloBindData) -> Result<(), Box> { + fn bind(bind: &BindInfo) -> Result> { bind.add_result_column("column0", LogicalTypeHandle::from(LogicalTypeId::Varchar)); let name = bind.get_parameter(0).to_string(); - unsafe { - ptr::write(data, HelloBindData { name }); - } - Ok(()) + Ok(HelloBindData { name }) } - unsafe fn init(_: &InitInfo, data: *mut HelloInitData) -> Result<(), Box> { - unsafe { - ptr::write(data, HelloInitData { done: false }); - } - Ok(()) + fn init(_: &InitInfo) -> Result> { + Ok(HelloInitData { done: false }) } unsafe fn func(func: &FunctionInfo, output: &mut DataChunkHandle) -> Result<(), Box> { @@ -256,18 +236,15 @@ mod test { type InitData = HelloInitData; type BindData = HelloBindData; - unsafe fn bind(bind: &BindInfo, data: *mut HelloBindData) -> Result<(), Box> { + fn bind(bind: &BindInfo) -> Result> { bind.add_result_column("column0", LogicalTypeHandle::from(LogicalTypeId::Varchar)); let name = bind.get_named_parameter("name").unwrap().to_string(); assert!(bind.get_named_parameter("unknown_name").is_none()); - unsafe { - ptr::write(data, HelloBindData { name }); - } - Ok(()) + Ok(HelloBindData { name }) } - unsafe fn init(init_info: &InitInfo, data: *mut HelloInitData) -> Result<(), Box> { - HelloVTab::init(init_info, data) + fn init(init_info: &InitInfo) -> Result> { + HelloVTab::init(init_info) } unsafe fn func(func: &FunctionInfo, output: &mut DataChunkHandle) -> Result<(), Box> { From 4103139d9d0a310d5f496bc449d6a034334d8aed Mon Sep 17 00:00:00 2001 From: Martin Pool Date: Sat, 21 Dec 2024 09:07:36 -0800 Subject: [PATCH 09/13] vtab::Free is no longer needed BindInfo and InitInfo will be dropped in the usual way when freed by duckdb core. Any necessary destructors can be in Drop impls. --- crates/duckdb/examples/hello-ext/main.rs | 6 +----- crates/duckdb/src/vtab/mod.rs | 16 ++-------------- 2 files changed, 3 insertions(+), 19 deletions(-) diff --git a/crates/duckdb/examples/hello-ext/main.rs b/crates/duckdb/examples/hello-ext/main.rs index 3f19e69e..f8ba43d9 100644 --- a/crates/duckdb/examples/hello-ext/main.rs +++ b/crates/duckdb/examples/hello-ext/main.rs @@ -6,7 +6,7 @@ extern crate libduckdb_sys; use duckdb::{ core::{DataChunkHandle, Inserter, LogicalTypeHandle, LogicalTypeId}, - vtab::{BindInfo, Free, FunctionInfo, InitInfo, VTab}, + vtab::{BindInfo, FunctionInfo, InitInfo, VTab}, Connection, Result, }; use duckdb_loadable_macros::duckdb_entrypoint; @@ -20,16 +20,12 @@ struct HelloBindData { name: String, } -impl Free for HelloBindData {} - struct HelloInitData { done: bool, } struct HelloVTab; -impl Free for HelloInitData {} - impl VTab for HelloVTab { type InitData = HelloInitData; type BindData = HelloBindData; diff --git a/crates/duckdb/src/vtab/mod.rs b/crates/duckdb/src/vtab/mod.rs index 06bf541f..b6fc940f 100644 --- a/crates/duckdb/src/vtab/mod.rs +++ b/crates/duckdb/src/vtab/mod.rs @@ -34,23 +34,15 @@ unsafe extern "C" fn drop_boxed(v: *mut c_void) { drop(unsafe { Box::from_raw(v.cast::()) }); } -/// Free trait for the bind and init data -pub trait Free { - /// Free the data - fn free(&mut self) {} -} - /// Duckdb table function trait /// /// See to the HelloVTab example for more details /// pub trait VTab: Sized { /// The data type of the bind data - type InitData: Sized + Free; + type InitData: Sized; /// The data type of the init data - type BindData: Sized + Free; // TODO: and maybe Send + Sync as this might be called from multiple threads? - - // TODO: Get rid of Free, just use Drop? + type BindData: Sized; // TODO: and maybe Send + Sync as this might be called from multiple threads? /// Bind data to the table function fn bind(bind: &BindInfo) -> Result>; @@ -186,16 +178,12 @@ mod test { name: String, } - impl Free for HelloBindData {} - struct HelloInitData { done: bool, } struct HelloVTab; - impl Free for HelloInitData {} - impl VTab for HelloVTab { type InitData = HelloInitData; type BindData = HelloBindData; From 843c6e768299d9f49f52f5ae8f97aea11e70e82c Mon Sep 17 00:00:00 2001 From: Martin Pool Date: Sat, 21 Dec 2024 09:30:35 -0800 Subject: [PATCH 10/13] BindData and InitData should be Send+Sync It's not completely clear but it looks like the engine could run the table fn from multiple threads, so requiring this seems safer --- crates/duckdb/src/vtab/mod.rs | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/crates/duckdb/src/vtab/mod.rs b/crates/duckdb/src/vtab/mod.rs index b6fc940f..fbca5d5e 100644 --- a/crates/duckdb/src/vtab/mod.rs +++ b/crates/duckdb/src/vtab/mod.rs @@ -39,12 +39,21 @@ unsafe extern "C" fn drop_boxed(v: *mut c_void) { /// See to the HelloVTab example for more details /// pub trait VTab: Sized { - /// The data type of the bind data - type InitData: Sized; - /// The data type of the init data - type BindData: Sized; // TODO: and maybe Send + Sync as this might be called from multiple threads? + /// The data type of the init data. + /// + /// The init data tracks the state of the table function and is global across threads. + /// + /// The init data is shared across threads so must be `Send + Sync`. + type InitData: Sized + Send + Sync; + + /// The data type of the bind data. + /// + /// The bind data is shared across threads so must be `Send + Sync`. + type BindData: Sized + Send + Sync; /// Bind data to the table function + /// + /// This function is used for determining the return type of a table producing function and returning bind data fn bind(bind: &BindInfo) -> Result>; /// Initialize the table function From 6ecbc3cee831323ce6d3259f1013b779756023a4 Mon Sep 17 00:00:00 2001 From: Martin Pool Date: Sat, 21 Dec 2024 09:56:16 -0800 Subject: [PATCH 11/13] Add a safe & typed interface to get bind_data --- crates/duckdb/examples/hello-ext/main.rs | 4 +-- crates/duckdb/src/vtab/function.rs | 37 +++++++++++++-------- crates/duckdb/src/vtab/mod.rs | 42 +++++++++++++----------- 3 files changed, 48 insertions(+), 35 deletions(-) diff --git a/crates/duckdb/examples/hello-ext/main.rs b/crates/duckdb/examples/hello-ext/main.rs index f8ba43d9..158e74d0 100644 --- a/crates/duckdb/examples/hello-ext/main.rs +++ b/crates/duckdb/examples/hello-ext/main.rs @@ -40,9 +40,9 @@ impl VTab for HelloVTab { Ok(HelloInitData { done: false }) } - unsafe fn func(func: &FunctionInfo, output: &mut DataChunkHandle) -> Result<(), Box> { + fn func(func: &FunctionInfo, output: &mut DataChunkHandle) -> Result<(), Box> { let init_info = unsafe { func.get_init_data::().as_mut().unwrap() }; - let bind_info = unsafe { func.get_bind_data::().as_mut().unwrap() }; + let bind_info = func.get_bind_data(); if init_info.done { output.set_len(0); diff --git a/crates/duckdb/src/vtab/function.rs b/crates/duckdb/src/vtab/function.rs index 9d14b510..18c15209 100644 --- a/crates/duckdb/src/vtab/function.rs +++ b/crates/duckdb/src/vtab/function.rs @@ -9,10 +9,11 @@ use super::{ duckdb_table_function_set_init, duckdb_table_function_set_local_init, duckdb_table_function_set_name, duckdb_table_function_supports_projection_pushdown, idx_t, }, - LogicalTypeHandle, Value, + LogicalTypeHandle, VTab, Value, }; use std::{ ffi::{c_void, CString}, + marker::PhantomData, os::raw::c_char, }; @@ -334,9 +335,12 @@ use super::ffi::{ /// An interface to store and retrieve data during the function execution stage #[derive(Debug)] -pub struct FunctionInfo(duckdb_function_info); +pub struct FunctionInfo { + ptr: duckdb_function_info, + _vtab: std::marker::PhantomData, +} -impl FunctionInfo { +impl FunctionInfo { /// Report that an error has occurred while executing the function. /// /// # Arguments @@ -344,44 +348,49 @@ impl FunctionInfo { pub fn set_error(&self, error: &str) { let c_str = CString::new(error).unwrap(); unsafe { - duckdb_function_set_error(self.0, c_str.as_ptr() as *const c_char); + duckdb_function_set_error(self.ptr, c_str.as_ptr()); } } + /// Gets the bind data set by [`BindInfo::set_bind_data`] during the bind. /// /// Note that the bind data should be considered as read-only. /// For tracking state, use the init data instead. - /// - /// # Arguments - /// * `returns`: The bind data object - pub fn get_bind_data(&self) -> *mut T { - unsafe { duckdb_function_get_bind_data(self.0).cast() } + pub fn get_bind_data(&self) -> &V::BindData { + unsafe { + let bind_data: *const V::BindData = duckdb_function_get_bind_data(self.ptr).cast(); + bind_data.as_ref().unwrap() + } } + /// Gets the init data set by [`InitInfo::set_init_data`] during the init. /// /// # Arguments /// * `returns`: The init data object pub fn get_init_data(&self) -> *mut T { - unsafe { duckdb_function_get_init_data(self.0).cast() } + unsafe { duckdb_function_get_init_data(self.ptr).cast() } } /// Retrieves the extra info of the function as set in [`TableFunction::set_extra_info`] /// /// # Arguments /// * `returns`: The extra info pub fn get_extra_info(&self) -> *mut T { - unsafe { duckdb_function_get_extra_info(self.0).cast() } + unsafe { duckdb_function_get_extra_info(self.ptr).cast() } } /// Gets the thread-local init data set by [`InitInfo::set_init_data`] during the local_init. /// /// # Arguments /// * `returns`: The init data object pub fn get_local_init_data(&self) -> *mut T { - unsafe { duckdb_function_get_local_init_data(self.0).cast() } + unsafe { duckdb_function_get_local_init_data(self.ptr).cast() } } } -impl From for FunctionInfo { +impl From for FunctionInfo { fn from(ptr: duckdb_function_info) -> Self { - Self(ptr) + Self { + ptr, + _vtab: PhantomData, + } } } diff --git a/crates/duckdb/src/vtab/mod.rs b/crates/duckdb/src/vtab/mod.rs index fbca5d5e..d8cf1ffa 100644 --- a/crates/duckdb/src/vtab/mod.rs +++ b/crates/duckdb/src/vtab/mod.rs @@ -59,21 +59,13 @@ pub trait VTab: Sized { /// Initialize the table function fn init(init: &InitInfo) -> Result>; - /// The actual function + /// Generate rows from the table function. /// - /// # Safety + /// The implementation should populate the `output` parameter with the rows to be returned. /// - /// This function is unsafe because it: - /// - /// - Dereferences multiple raw pointers (`func` to access `init_info` and `bind_info`). - /// - /// The caller must ensure that: - /// - /// - All pointers (`func`, `output`, internal `init_info`, and `bind_info`) are valid and point to the expected types of data structures. - /// - The `init_info` and `bind_info` data pointed to remains valid and is not freed until after this function completes. - /// - No other threads are concurrently mutating the data pointed to by `init_info` and `bind_info` without proper synchronization. - /// - The `output` parameter is correctly initialized and can safely be written to. - unsafe fn func(func: &FunctionInfo, output: &mut DataChunkHandle) -> Result<(), Box>; + /// When the table function is done, the implementation should set the length of the output to 0. + fn func(func: &FunctionInfo, output: &mut DataChunkHandle) -> Result<(), Box>; + /// Does the table function support pushdown /// default is false fn supports_pushdown() -> bool { @@ -95,7 +87,7 @@ unsafe extern "C" fn func(info: duckdb_function_info, output: duckdb_data_chu where T: VTab, { - let info = FunctionInfo::from(info); + let info = FunctionInfo::::from(info); let mut data_chunk_handle = DataChunkHandle::new_unowned(output); let result = T::func(&info, &mut data_chunk_handle); if result.is_err() { @@ -207,16 +199,16 @@ mod test { Ok(HelloInitData { done: false }) } - unsafe fn func(func: &FunctionInfo, output: &mut DataChunkHandle) -> Result<(), Box> { + fn func(func: &FunctionInfo, output: &mut DataChunkHandle) -> Result<(), Box> { let init_info = unsafe { func.get_init_data::().as_mut().unwrap() }; - let bind_info = unsafe { func.get_bind_data::().as_ref().unwrap() }; + let bind_data = func.get_bind_data(); if init_info.done { output.set_len(0); } else { init_info.done = true; let vector = output.flat_vector(0); - let result = CString::new(format!("Hello {}", bind_info.name))?; + let result = CString::new(format!("Hello {}", bind_data.name))?; vector.insert(0, result); output.set_len(1); } @@ -244,8 +236,20 @@ mod test { HelloVTab::init(init_info) } - unsafe fn func(func: &FunctionInfo, output: &mut DataChunkHandle) -> Result<(), Box> { - HelloVTab::func(func, output) + fn func(func: &FunctionInfo, output: &mut DataChunkHandle) -> Result<(), Box> { + let init_info = unsafe { func.get_init_data::().as_mut().unwrap() }; + let bind_info = func.get_bind_data(); + + if init_info.done { + output.set_len(0); + } else { + init_info.done = true; + let vector = output.flat_vector(0); + let result = CString::new(format!("Hello {}", bind_info.name))?; + vector.insert(0, result); + output.set_len(1); + } + Ok(()) } fn named_parameters() -> Option> { From cf11a091d21cdf5a03e8e33e982ded8d7d4169da Mon Sep 17 00:00:00 2001 From: Martin Pool Date: Sat, 21 Dec 2024 10:14:48 -0800 Subject: [PATCH 12/13] Also safely retrieve the init_data --- crates/duckdb/examples/hello-ext/main.rs | 17 ++++++++++------- crates/duckdb/src/vtab/function.rs | 16 ++++++++++++---- crates/duckdb/src/vtab/mod.rs | 22 ++++++++++++---------- 3 files changed, 34 insertions(+), 21 deletions(-) diff --git a/crates/duckdb/examples/hello-ext/main.rs b/crates/duckdb/examples/hello-ext/main.rs index 158e74d0..8a6edad3 100644 --- a/crates/duckdb/examples/hello-ext/main.rs +++ b/crates/duckdb/examples/hello-ext/main.rs @@ -1,4 +1,5 @@ #![warn(unsafe_op_in_unsafe_fn)] +#![warn(unsafe)] // extensions can be safe extern crate duckdb; extern crate duckdb_loadable_macros; @@ -14,6 +15,7 @@ use libduckdb_sys as ffi; use std::{ error::Error, ffi::{c_char, c_void, CString}, + sync::atomic::{AtomicBool, Ordering}, }; struct HelloBindData { @@ -21,7 +23,7 @@ struct HelloBindData { } struct HelloInitData { - done: bool, + done: AtomicBool, } struct HelloVTab; @@ -37,19 +39,20 @@ impl VTab for HelloVTab { } fn init(_: &InitInfo) -> Result> { - Ok(HelloInitData { done: false }) + Ok(HelloInitData { + done: AtomicBool::new(false), + }) } fn func(func: &FunctionInfo, output: &mut DataChunkHandle) -> Result<(), Box> { - let init_info = unsafe { func.get_init_data::().as_mut().unwrap() }; - let bind_info = func.get_bind_data(); + let init_data = func.get_init_data(); + let bind_data = func.get_bind_data(); - if init_info.done { + if init_data.done.swap(true, Ordering::Relaxed) { output.set_len(0); } else { - init_info.done = true; let vector = output.flat_vector(0); - let result = CString::new(format!("Hello {}", bind_info.name))?; + let result = CString::new(format!("Hello {}", bind_data.name))?; vector.insert(0, result); output.set_len(1); } diff --git a/crates/duckdb/src/vtab/function.rs b/crates/duckdb/src/vtab/function.rs index 18c15209..276610cc 100644 --- a/crates/duckdb/src/vtab/function.rs +++ b/crates/duckdb/src/vtab/function.rs @@ -337,7 +337,7 @@ use super::ffi::{ #[derive(Debug)] pub struct FunctionInfo { ptr: duckdb_function_info, - _vtab: std::marker::PhantomData, + _vtab: PhantomData, } impl FunctionInfo { @@ -363,13 +363,21 @@ impl FunctionInfo { } } - /// Gets the init data set by [`InitInfo::set_init_data`] during the init. + /// Get a reference to the init data set by [`InitInfo::set_init_data`] during the init. + /// + /// This returns a shared reference because the init data is shared between multiple threads. + /// It may internally be mutable. /// /// # Arguments /// * `returns`: The init data object - pub fn get_init_data(&self) -> *mut T { - unsafe { duckdb_function_get_init_data(self.ptr).cast() } + pub fn get_init_data(&self) -> &V::InitData { + // Safety: A pointer to a box of the init data is stored during vtab init. + unsafe { + let init_data: *const V::InitData = duckdb_function_get_init_data(self.ptr).cast(); + init_data.as_ref().unwrap() + } } + /// Retrieves the extra info of the function as set in [`TableFunction::set_extra_info`] /// /// # Arguments diff --git a/crates/duckdb/src/vtab/mod.rs b/crates/duckdb/src/vtab/mod.rs index d8cf1ffa..32bd479d 100644 --- a/crates/duckdb/src/vtab/mod.rs +++ b/crates/duckdb/src/vtab/mod.rs @@ -170,6 +170,8 @@ mod test { use super::*; use crate::core::Inserter; use crate::core::LogicalTypeId; + use std::sync::atomic::AtomicBool; + use std::sync::atomic::Ordering; use std::{ error::Error, ffi::{c_char, CString}, @@ -180,7 +182,7 @@ mod test { } struct HelloInitData { - done: bool, + done: AtomicBool, } struct HelloVTab; @@ -196,17 +198,18 @@ mod test { } fn init(_: &InitInfo) -> Result> { - Ok(HelloInitData { done: false }) + Ok(HelloInitData { + done: AtomicBool::new(false), + }) } fn func(func: &FunctionInfo, output: &mut DataChunkHandle) -> Result<(), Box> { - let init_info = unsafe { func.get_init_data::().as_mut().unwrap() }; + let init_data = func.get_init_data(); let bind_data = func.get_bind_data(); - if init_info.done { + if init_data.done.swap(true, Ordering::Relaxed) { output.set_len(0); } else { - init_info.done = true; let vector = output.flat_vector(0); let result = CString::new(format!("Hello {}", bind_data.name))?; vector.insert(0, result); @@ -237,15 +240,14 @@ mod test { } fn func(func: &FunctionInfo, output: &mut DataChunkHandle) -> Result<(), Box> { - let init_info = unsafe { func.get_init_data::().as_mut().unwrap() }; - let bind_info = func.get_bind_data(); + let init_data = func.get_init_data(); + let bind_data = func.get_bind_data(); - if init_info.done { + if init_data.done.swap(true, Ordering::Relaxed) { output.set_len(0); } else { - init_info.done = true; let vector = output.flat_vector(0); - let result = CString::new(format!("Hello {}", bind_info.name))?; + let result = CString::new(format!("Hello {}", bind_data.name))?; vector.insert(0, result); output.set_len(1); } From 9400a51c1baa9acaf604660950ff10aa6c33c1eb Mon Sep 17 00:00:00 2001 From: Martin Pool Date: Sat, 21 Dec 2024 10:43:19 -0800 Subject: [PATCH 13/13] Add unsafe blocks, rm unnecessary cast --- crates/duckdb/src/vtab/function.rs | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/crates/duckdb/src/vtab/function.rs b/crates/duckdb/src/vtab/function.rs index 276610cc..cb32ee77 100644 --- a/crates/duckdb/src/vtab/function.rs +++ b/crates/duckdb/src/vtab/function.rs @@ -139,7 +139,9 @@ impl From for InitInfo { impl InitInfo { /// # Safety pub unsafe fn set_init_data(&self, data: *mut c_void, freeer: Option) { - duckdb_init_set_init_data(self.0, data, freeer); + unsafe { + duckdb_init_set_init_data(self.0, data, freeer); + } } /// Returns the column indices of the projected columns at the specified positions. @@ -189,7 +191,7 @@ impl InitInfo { /// * `error`: The error message pub fn set_error(&self, error: &str) { let c_str = CString::new(error).unwrap(); - unsafe { duckdb_init_set_error(self.0, c_str.as_ptr() as *const c_char) } + unsafe { duckdb_init_set_error(self.0, c_str.as_ptr()) } } } @@ -310,7 +312,9 @@ impl TableFunction { /// /// # Safety pub unsafe fn set_extra_info(&self, extra_info: *mut c_void, destroy: duckdb_delete_callback_t) { - duckdb_table_function_set_extra_info(self.ptr, extra_info, destroy); + unsafe { + duckdb_table_function_set_extra_info(self.ptr, extra_info, destroy); + } } /// Sets the thread-local init function of the table function