From d2dfe834942b5cfbda2b3b93fea64d205158da85 Mon Sep 17 00:00:00 2001 From: Joshua David Barillas Date: Thu, 9 Jan 2025 20:18:29 -0500 Subject: [PATCH] Add quote_style option to csv IO functions (#1049) * Create datatype for quote style and trait implementations * Add quote style to CSV IO functions * Add quote_style option to polars backend * Add quote_style type and options to backend * Add quote_style * Add tests * Add documentation on supported quote_style values --- lib/explorer/backend/data_frame.ex | 9 ++- lib/explorer/data_frame.ex | 25 +++++++- lib/explorer/polars_backend/data_frame.ex | 26 ++++++-- lib/explorer/polars_backend/lazy_frame.ex | 10 +-- lib/explorer/polars_backend/native.ex | 8 +-- native/explorer/src/dataframe/io.rs | 9 ++- native/explorer/src/datatypes.rs | 40 ++++++++++++ native/explorer/src/lazyframe/io.rs | 4 +- test/explorer/data_frame/csv_test.exs | 75 +++++++++++++++++++++++ 9 files changed, 185 insertions(+), 21 deletions(-) diff --git a/lib/explorer/backend/data_frame.ex b/lib/explorer/backend/data_frame.ex index 360f50e46..ad2e0cc29 100644 --- a/lib/explorer/backend/data_frame.ex +++ b/lib/explorer/backend/data_frame.ex @@ -13,6 +13,7 @@ defmodule Explorer.Backend.DataFrame do @type ok_result() :: :ok | {:error, Exception.t()} @type io_result(t) :: {:ok, t} | {:error, Exception.t()} + @type quote_style :: :necessary | :always | :non_numeric | :never # Generic result @type result(t) :: {:ok, t} | {:error, term()} @@ -65,10 +66,16 @@ defmodule Explorer.Backend.DataFrame do entry :: fs_entry(), header? :: boolean(), delimiter :: String.t(), + quote_style :: quote_style, streaming :: boolean() ) :: ok_result() - @callback dump_csv(df, header? :: boolean(), delimiter :: String.t()) :: io_result(binary()) + @callback dump_csv( + df, + header? :: boolean(), + delimiter :: String.t(), + quote_style :: quote_style + ) :: io_result(binary()) @callback load_csv( contents :: String.t(), diff --git a/lib/explorer/data_frame.ex b/lib/explorer/data_frame.ex index ac0984201..329f04998 100644 --- a/lib/explorer/data_frame.ex +++ b/lib/explorer/data_frame.ex @@ -651,6 +651,9 @@ defmodule Explorer.DataFrame do * `:delimiter` - A single character used to separate fields within a record. (default: `","`) + * `:quote_style` - The quoting style to use. Possible values are `:necessary`, `:always`, `:non_numeric`, and `:never`. + (default: `:necessary`) + * `:config` - An optional struct, keyword list or map, normally associated with remote file systems. See [IO section](#module-io-operations) for more details. (default: `nil`) @@ -664,13 +667,21 @@ defmodule Explorer.DataFrame do @spec to_csv(df :: DataFrame.t(), filename :: fs_entry() | String.t(), opts :: Keyword.t()) :: :ok | {:error, Exception.t()} def to_csv(df, filename, opts \\ []) do - opts = Keyword.validate!(opts, header: true, delimiter: ",", streaming: true, config: nil) + opts = + Keyword.validate!(opts, + header: true, + delimiter: ",", + quote_style: :necessary, + streaming: true, + config: nil + ) with {:ok, entry} <- normalise_entry(filename, opts[:config]) do Shared.apply_dataframe(df, :to_csv, [ entry, opts[:header], opts[:delimiter], + opts[:quote_style], opts[:streaming] ]) end @@ -702,6 +713,8 @@ defmodule Explorer.DataFrame do * `:header` - Should the column names be written as the first line of the file? (default: `true`) * `:delimiter` - A single character used to separate fields within a record. (default: `","`) + * `:quote_style` - The quoting style to use. Possible values are `:necessary`, `:always`, `:non_numeric`, and `:never`. + (default: `:necessary`) ## Examples @@ -713,8 +726,14 @@ defmodule Explorer.DataFrame do @spec dump_csv(df :: DataFrame.t(), opts :: Keyword.t()) :: {:ok, String.t()} | {:error, Exception.t()} def dump_csv(df, opts \\ []) do - opts = Keyword.validate!(opts, header: true, delimiter: ",") - Shared.apply_dataframe(df, :dump_csv, [opts[:header], opts[:delimiter]], false) + opts = Keyword.validate!(opts, header: true, delimiter: ",", quote_style: :necessary) + + Shared.apply_dataframe( + df, + :dump_csv, + [opts[:header], opts[:delimiter], opts[:quote_style]], + false + ) end @doc """ diff --git a/lib/explorer/polars_backend/data_frame.ex b/lib/explorer/polars_backend/data_frame.ex index 4a18b4ae0..c5f6ec460 100644 --- a/lib/explorer/polars_backend/data_frame.ex +++ b/lib/explorer/polars_backend/data_frame.ex @@ -145,28 +145,42 @@ defmodule Explorer.PolarsBackend.DataFrame do end @impl true - def to_csv(%DataFrame{data: df}, %Local.Entry{} = entry, header?, delimiter, _streaming) do + def to_csv( + %DataFrame{data: df}, + %Local.Entry{} = entry, + header?, + delimiter, + quote_style, + _streaming + ) do <> = delimiter - case Native.df_to_csv(df, entry.path, header?, delimiter) do + case Native.df_to_csv(df, entry.path, header?, delimiter, quote_style) do {:ok, _} -> :ok {:error, error} -> {:error, RuntimeError.exception(error)} end end @impl true - def to_csv(%DataFrame{data: df}, %S3.Entry{} = entry, header?, delimiter, _streaming) do + def to_csv( + %DataFrame{data: df}, + %S3.Entry{} = entry, + header?, + delimiter, + quote_style, + _streaming + ) do <> = delimiter - case Native.df_to_csv_cloud(df, entry, header?, delimiter) do + case Native.df_to_csv_cloud(df, entry, header?, delimiter, quote_style) do {:ok, _} -> :ok {:error, error} -> {:error, RuntimeError.exception(error)} end end @impl true - def dump_csv(%DataFrame{} = df, header?, <>) do - case Native.df_dump_csv(df.data, header?, delimiter) do + def dump_csv(%DataFrame{} = df, header?, <>, quote_style) do + case Native.df_dump_csv(df.data, header?, delimiter, quote_style) do {:ok, string} -> {:ok, string} {:error, error} -> {:error, RuntimeError.exception(error)} end diff --git a/lib/explorer/polars_backend/lazy_frame.ex b/lib/explorer/polars_backend/lazy_frame.ex index 013ab3621..5d70eae31 100644 --- a/lib/explorer/polars_backend/lazy_frame.ex +++ b/lib/explorer/polars_backend/lazy_frame.ex @@ -351,20 +351,20 @@ defmodule Explorer.PolarsBackend.LazyFrame do end @impl true - def to_csv(%DF{} = ldf, %Local.Entry{} = entry, header?, delimiter, streaming) do + def to_csv(%DF{} = ldf, %Local.Entry{} = entry, header?, delimiter, quote_style, streaming) do <> = delimiter - case Native.lf_to_csv(ldf.data, entry.path, header?, delimiter, streaming) do + case Native.lf_to_csv(ldf.data, entry.path, header?, delimiter, quote_style, streaming) do {:ok, _} -> :ok {:error, error} -> {:error, RuntimeError.exception(error)} end end @impl true - def to_csv(%DF{} = ldf, %S3.Entry{} = entry, header?, delimiter, _streaming) do + def to_csv(%DF{} = ldf, %S3.Entry{} = entry, header?, delimiter, quote_style, _streaming) do eager_df = compute(ldf) - Eager.to_csv(eager_df, entry, header?, delimiter, false) + Eager.to_csv(eager_df, entry, header?, delimiter, quote_style, false) end @impl true @@ -639,7 +639,7 @@ defmodule Explorer.PolarsBackend.LazyFrame do covariance: 3, nil_count: 1, dummies: 3, - dump_csv: 3, + dump_csv: 4, dump_ipc: 2, dump_ipc_stream: 2, dump_ndjson: 1, diff --git a/lib/explorer/polars_backend/native.ex b/lib/explorer/polars_backend/native.ex index 9a2a82740..ee890834c 100644 --- a/lib/explorer/polars_backend/native.ex +++ b/lib/explorer/polars_backend/native.ex @@ -76,7 +76,7 @@ defmodule Explorer.PolarsBackend.Native do def df_concat_columns(_dfs), do: err() def df_drop(_df, _name), do: err() def df_dtypes(_df), do: err() - def df_dump_csv(_df, _has_headers, _delimiter), do: err() + def df_dump_csv(_df, _has_headers, _delimiter, _quote_style), do: err() def df_dump_ndjson(_df), do: err() def df_dump_parquet(_df, _compression), do: err() def df_dump_ipc(_df, _compression), do: err() @@ -156,8 +156,8 @@ defmodule Explorer.PolarsBackend.Native do def df_slice_by_indices(_df, _indices, _groups), do: err() def df_slice_by_series(_df, _series, _groups), do: err() def df_transpose(_df, _keep_names_as, _new_col_names), do: err() - def df_to_csv(_df, _filename, _has_headers, _delimiter), do: err() - def df_to_csv_cloud(_df, _ex_entry, _has_headers, _delimiter), do: err() + def df_to_csv(_df, _filename, _has_headers, _delimiter, _quote_style), do: err() + def df_to_csv_cloud(_df, _ex_entry, _has_headers, _delimiter, _quote_style), do: err() def df_to_dummies(_df, _columns), do: err() def df_to_ipc(_df, _filename, _compression), do: err() def df_to_ipc_cloud(_df, _ex_entry, _compression), do: err() @@ -274,7 +274,7 @@ defmodule Explorer.PolarsBackend.Native do def lf_to_parquet_cloud(_df, _filename, _compression), do: err() def lf_to_ipc(_df, _filename, _compression, _streaming), do: err() def lf_to_ipc_cloud(_df, _cloud_entry, _compression), do: err() - def lf_to_csv(_df, _filename, _header, _delimiter, _streaming), do: err() + def lf_to_csv(_df, _filename, _header, _delimiter, _quote_style, _streaming), do: err() def lf_sql(_df, _sql_string, _table_name), do: err() # Series diff --git a/native/explorer/src/dataframe/io.rs b/native/explorer/src/dataframe/io.rs index 8a6d5617e..dd565c998 100644 --- a/native/explorer/src/dataframe/io.rs +++ b/native/explorer/src/dataframe/io.rs @@ -15,7 +15,7 @@ use rustler::{Binary, Env, NewBinary}; use std::fs::File; use std::io::{BufReader, BufWriter, Cursor}; -use crate::datatypes::{ExParquetCompression, ExS3Entry, ExSeriesDtype}; +use crate::datatypes::{ExParquetCompression, ExQuoteStyle, ExS3Entry, ExSeriesDtype}; use crate::{ExDataFrame, ExplorerError}; #[cfg(feature = "cloud")] @@ -100,12 +100,14 @@ pub fn df_to_csv( filename: &str, include_headers: bool, delimiter: u8, + quote_style: ExQuoteStyle, ) -> Result<(), ExplorerError> { let file = File::create(filename)?; let mut buf_writer = BufWriter::new(file); CsvWriter::new(&mut buf_writer) .include_header(include_headers) .with_separator(delimiter) + .with_quote_style(quote_style.into()) .finish(&mut data.clone())?; Ok(()) } @@ -117,12 +119,14 @@ pub fn df_to_csv_cloud( ex_entry: ExS3Entry, include_headers: bool, delimiter: u8, + quote_style: ExQuoteStyle, ) -> Result<(), ExplorerError> { let mut cloud_writer = build_aws_s3_cloud_writer(ex_entry)?; CsvWriter::new(&mut cloud_writer) .include_header(include_headers) .with_separator(delimiter) + .with_quote_style(quote_style.into()) .finish(&mut data.clone())?; let _ = cloud_writer.finish()?; @@ -136,12 +140,14 @@ pub fn df_dump_csv( data: ExDataFrame, include_headers: bool, delimiter: u8, + quote_style: ExQuoteStyle, ) -> Result { let mut buf = vec![]; CsvWriter::new(&mut buf) .include_header(include_headers) .with_separator(delimiter) + .with_quote_style(quote_style.into()) .finish(&mut data.clone())?; let mut values_binary = NewBinary::new(env, buf.len()); @@ -675,6 +681,7 @@ pub fn df_to_csv_cloud( _ex_entry: ExS3Entry, _has_headers: bool, _delimiter: u8, + _quote_style: ExQuoteStyle, ) -> Result<(), ExplorerError> { Err(ExplorerError::Other("Explorer was compiled without the \"aws\" feature enabled. \ This is mostly due to this feature being incompatible with your computer's architecture. \ diff --git a/native/explorer/src/datatypes.rs b/native/explorer/src/datatypes.rs index ae8f5c113..7a72625ff 100644 --- a/native/explorer/src/datatypes.rs +++ b/native/explorer/src/datatypes.rs @@ -785,6 +785,14 @@ impl TryFrom for ParquetCompression { } } +#[derive(NifTaggedEnum)] +pub enum QuoteStyle { + Necessary, + Always, + NonNumeric, + Never, +} + // ========================= // ====== FSS Structs ====== // ========================= @@ -849,3 +857,35 @@ impl From for Expr { ex_expr.clone_inner() } } + +use polars::prelude::QuoteStyle as PolarsQuoteStyle; + +#[derive(NifTaggedEnum)] +pub enum ExQuoteStyle { + Necessary, + Always, + NonNumeric, + Never, +} + +impl From for PolarsQuoteStyle { + fn from(style: ExQuoteStyle) -> Self { + match style { + ExQuoteStyle::Necessary => PolarsQuoteStyle::Necessary, + ExQuoteStyle::Always => PolarsQuoteStyle::Always, + ExQuoteStyle::NonNumeric => PolarsQuoteStyle::NonNumeric, + ExQuoteStyle::Never => PolarsQuoteStyle::Never, + } + } +} + +impl From for ExQuoteStyle { + fn from(style: PolarsQuoteStyle) -> Self { + match style { + PolarsQuoteStyle::Necessary => ExQuoteStyle::Necessary, + PolarsQuoteStyle::Always => ExQuoteStyle::Always, + PolarsQuoteStyle::NonNumeric => ExQuoteStyle::NonNumeric, + PolarsQuoteStyle::Never => ExQuoteStyle::Never, + } + } +} diff --git a/native/explorer/src/lazyframe/io.rs b/native/explorer/src/lazyframe/io.rs index 903d7a908..bf59d8074 100644 --- a/native/explorer/src/lazyframe/io.rs +++ b/native/explorer/src/lazyframe/io.rs @@ -4,7 +4,7 @@ use std::io::BufWriter; use std::num::NonZeroUsize; use crate::dataframe::io::schema_from_dtypes_pairs; -use crate::datatypes::{ExParquetCompression, ExS3Entry, ExSeriesDtype}; +use crate::datatypes::{ExParquetCompression, ExQuoteStyle, ExS3Entry, ExSeriesDtype}; use crate::{ExLazyFrame, ExplorerError}; #[rustler::nif] @@ -256,6 +256,7 @@ pub fn lf_to_csv( filename: &str, include_headers: bool, delimiter: u8, + quote_style: ExQuoteStyle, streaming: bool, ) -> Result<(), ExplorerError> { let lf = data.clone_inner(); @@ -283,6 +284,7 @@ pub fn lf_to_csv( CsvWriter::new(&mut buf_writer) .include_header(include_headers) .with_separator(delimiter) + .with_quote_style(quote_style.into()) .finish(&mut df.clone())?; Ok(()) } diff --git a/test/explorer/data_frame/csv_test.exs b/test/explorer/data_frame/csv_test.exs index c865ec4c9..60ec3424b 100644 --- a/test/explorer/data_frame/csv_test.exs +++ b/test/explorer/data_frame/csv_test.exs @@ -916,4 +916,79 @@ defmodule Explorer.DataFrame.CSVTest do end defp http_endpoint(bypass), do: "http://localhost:#{bypass.port}" + + describe "quote_style option" do + @tag :tmp_dir + test "necessary quote_style", %{tmp_dir: tmp_dir} do + df = DF.new(a: ["a,b", "c", "d,e"], b: [1, 2, 3]) + path = tmp_csv(tmp_dir, "") + + :ok = DF.to_csv!(df, path, quote_style: :necessary) + contents = File.read!(path) + + assert contents == """ + a,b + "a,b",1 + c,2 + "d,e",3 + """ + end + + @tag :tmp_dir + test "always quote_style", %{tmp_dir: tmp_dir} do + df = DF.new(a: ["a", "b"], b: [1, 2]) + path = tmp_csv(tmp_dir, "") + + :ok = DF.to_csv!(df, path, quote_style: :always) + contents = File.read!(path) + + assert contents == """ + "a","b" + "a","1" + "b","2" + """ + end + + @tag :tmp_dir + test "non_numeric quote_style", %{tmp_dir: tmp_dir} do + df = DF.new(a: ["abc", "def"], b: [1, 2]) + path = tmp_csv(tmp_dir, "") + + :ok = DF.to_csv!(df, path, quote_style: :non_numeric) + contents = File.read!(path) + + assert contents == """ + "a","b" + "abc",1 + "def",2 + """ + end + + @tag :tmp_dir + test "never quote_style", %{tmp_dir: tmp_dir} do + df = DF.new(a: ["a,b", "c"], b: [1, 2]) + path = tmp_csv(tmp_dir, "") + + :ok = DF.to_csv!(df, path, quote_style: :never) + contents = File.read!(path) + + assert contents == """ + a,b + a,b,1 + c,2 + """ + end + + @tag :tmp_dir + test "invalid quote_style", %{tmp_dir: tmp_dir} do + df = DF.new(a: ["a"], b: [1]) + path = tmp_csv(tmp_dir, "") + + assert_raise ErlangError, + "Erlang error: :invalid_variant", + fn -> + DF.to_csv!(df, path, quote_style: :invalid) + end + end + end end