Skip to content

Commit

Permalink
Merge pull request #31 from mrufsvold/develop
Browse files Browse the repository at this point in the history
1.1.0
  • Loading branch information
mrufsvold authored Jul 7, 2023
2 parents 6fbbf89 + 7a4dace commit d818a3b
Show file tree
Hide file tree
Showing 16 changed files with 1,382 additions and 743 deletions.
8 changes: 6 additions & 2 deletions Project.toml
Original file line number Diff line number Diff line change
@@ -1,17 +1,21 @@
name = "ExpandNestedData"
uuid = "8a7d223a-a7dc-4abf-8bc1-b0ce2ace9adc"
authors = ["Micah Rufsvold <[email protected]>"]
version = "1.0.0"
version = "1.1.0"

[deps]
Compat = "34da2185-b29b-5c13-b0c7-acf172513d20"
DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
PooledArrays = "2dfb63ee-cc39-5dd5-95bd-886bf059d720"
StructTypes = "856f2bd8-1eba-4b0a-8007-ebc267875bd4"
SumTypes = "8e1ec7a9-0e02-4297-b0fe-6433085c89f2"
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
TypedTables = "9d95f2ec-7b3d-5a63-8d20-e2491e220bb9"

[compat]
DataStructures = "0.18"
Compat = "3.42, 4"
DataStructures = "0.18.14"
PooledArrays = "1.4"
StructTypes = "1.10"
Tables = "1"
Expand Down
32 changes: 32 additions & 0 deletions benchmarks/benchmarks.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
using ExpandNestedData

# Small heterogeneous record: an integer, a string, and one nested dictionary.
small_dict = Dict(
    :a => 1,
    :b => "2",
    :c => Dict(:e => Symbol(3), :f => 4),
)

# 1000 entries, every one of them a reference to the same `small_dict` object.
many_records = fill(small_dict, 1000)

"""
    make_deep_dict(depth=1)

Build a nested `Dict` fixture that is `depth` levels deep.

For `depth == 1` the result is the single-entry leaf `Dict(Symbol(1) => 1)`. For any
larger depth the dictionary has the keys `Symbol(1)`, `Symbol(2)` and `Symbol(3)`, each
mapped to a separately built dictionary of depth `depth - 1`.

# Throws
- `ArgumentError`: if `depth` is smaller than 1. Such a value can never reach the
  `depth == 1` base case, so it would otherwise recurse until the stack overflows.
"""
function make_deep_dict(depth=1)
    depth >= 1 || throw(ArgumentError("depth must be at least 1, got $depth"))
    if depth == 1
        return Dict(Symbol(depth) => 1)
    end
    return Dict(Symbol(i) => make_deep_dict(depth - 1) for i in 1:3)
end

# @btime expand($small_dict; lazy_columns=true, column_style=:nested)
# @profview ExpandNestedData.expand(small_dict; lazy_columns=true, column_style=:nested);
# deep_dict = make_deep_dict(10)
# @btime expand($deep_dict; lazy_columns=true, column_style=:nested);
# @descend expand(many_records; lazy_columns=true, column_style=:nested)
# @profview ExpandNestedData.expand(many_records; lazy_columns=true, column_style=:nested)
# @profview_allocs ExpandNestedData.expand(many_records; lazy_columns=true, column_style=:nested)
# @btime expand($many_records; lazy_columns=true, column_style=:nested);
70 changes: 34 additions & 36 deletions src/ExpandTypes.jl → src/ColumnDefinitions.jl
Original file line number Diff line number Diff line change
@@ -1,31 +1,8 @@
# Enumeration of step kinds; an `UnpackStep` stores one of these values in its `type` field.
@enum StepType dict arr leaf default merge_cols stack_cols columns
module ColumnDefinitions
using ..ColumnSetManagers: ColumnSet, unnamed
import ..join_names
export ColumnDefinition, get_field_path, get_column_name, get_default_value, get_pool_arrays, make_column_def_child_copies, current_path_name, construct_column_definitions

# Field-less marker type.
# NOTE(review): presumably a sentinel for a value that is absent during expansion — confirm against its usages.
struct ExpandMissing end
"""
    UnpackStep{N,T,C}

Immutable record bundling a `StepType` tag with a name, a data payload, a level and a
path node. The types of `name`, `data` and `path_node` are the parameters `N`, `T` and `C`.
Fields are read through `get_step_type`, `get_name`, `get_data`, `get_level` and
`get_path_node`.
"""
struct UnpackStep{N,T,C}
# One of the `StepType` enum values.
type::StepType
name::N
data::T
# NOTE(review): presumably the nesting depth the step applies to — confirm against callers.
level::Int64
path_node::C
end
# Read-only accessors, one per `UnpackStep` field.
function get_step_type(step::UnpackStep)
    return step.type
end

function get_name(step::UnpackStep)
    return step.name
end

function get_data(step::UnpackStep)
    return step.data
end

function get_level(step::UnpackStep)
    return step.level
end

function get_path_node(step::UnpackStep)
    return step.path_node
end

"""NameValueContainer is an abstraction on Dict and DataType structs so that we can get their
contents without worrying about `getkey` or `getproperty`, etc.
"""
const NameValueContainer = Union{StructTypes.DictType, StructTypes.DataType}
# Both aliases are declared `const`: a non-constant global has no fixed type, so every
# function that tests against it would otherwise have to look the binding up dynamically.
# `Container` additionally admits the array-like trait type.
const Container = Union{StructTypes.DictType, StructTypes.DataType, StructTypes.ArrayType}

# Predicates on the `StructTypes.StructType` trait instance returned for `t`.
function is_NameValueContainer(t)
    return StructTypes.StructType(t) isa NameValueContainer
end

function is_container(t)
    return StructTypes.StructType(t) isa Container
end

# A value type is a concrete type whose trait is not one of the `Container` traits.
function is_value_type(t::Type)
    if is_container(t)
        return false
    end
    return isconcretetype(t)
end

##### ColumnDefinition #####
############################

"""ColumnDefinition provides a mechanism for specifying details for extracting data from a nested data source"""
struct ColumnDefinition
Expand All @@ -50,7 +27,6 @@ get_pool_arrays(c::ColumnDefinition) = c.pool_arrays
## Keyword Args
* `column_name::Symbol`: A name for the resulting column. If `nothing`, defaults to joining the `field_path` with snake case format.
* `flatten_arrays::Bool`: When a leaf node is an array, should the values be flattened into separate rows or treated as a single value. Default: `true`
* `default_value`: When the field_path keys do not exist on one or more branches, fill with this value. Default: `missing`
* `pool_arrays::Bool`: When collecting values for this column, choose whether to use `PooledArrays` instead of `Base.Vector`. Default: `false` (use `Vector`)
* `name_join_pattern::String`: The separator for joining field paths into column names. Default: "_"
Expand All @@ -62,30 +38,52 @@ function ColumnDefinition(field_path; kwargs...)
end
"""
    ColumnDefinition(field_path::Tuple; column_name=nothing, default_value=missing, pool_arrays=false, name_join_pattern="_")

Build a `ColumnDefinition` for the tuple `field_path`.

When `column_name` is `nothing`, the name is generated by passing the path and
`name_join_pattern` to `join_names`; a trailing `unnamed` entry is dropped from the path
before joining. The stored field path itself is never shortened.
"""
function ColumnDefinition(
    field_path::T;
    column_name=nothing,
    default_value=missing,
    pool_arrays=false,
    name_join_pattern::String = "_",
) where {T <: Tuple}
    if column_name === nothing
        # Only the generated name skips a trailing `unnamed` marker.
        path = last(field_path) == unnamed ? field_path[1:end-1] : field_path
        column_name = join_names(path, name_join_pattern)
    end
    return ColumnDefinition(field_path, column_name, default_value, pool_arrays)
end
"""
    ColumnDefinition(field_path, column_names::Dict; pool_arrays::Bool, name_join_pattern="_")

Build a `ColumnDefinition` for `field_path`, taking the column name from `column_names`
when it has an entry for `field_path`. Without an entry, `column_name=nothing` is
forwarded to the keyword constructor.
"""
function ColumnDefinition(field_path, column_names::Dict; pool_arrays::Bool, name_join_pattern = "_")
    # One lookup: the stored name when present, `nothing` otherwise.
    column_name = get(column_names, field_path, nothing)
    return ColumnDefinition(
        field_path;
        column_name=column_name,
        pool_arrays=pool_arrays,
        name_join_pattern=name_join_pattern,
    )
end
function construct_column_definitions(columns, column_names, pool_arrays, name_join_pattern)
paths = keys(columns)
"""
    construct_column_definitions(col_set, column_names, pool_arrays, name_join_pattern)

Broadcast the `ColumnDefinition(field_path, column_names; ...)` constructor over
`keys(col_set)`, sharing the same `column_names`, `pool_arrays` and `name_join_pattern`
for every key.
"""
function construct_column_definitions(col_set, column_names, pool_arrays, name_join_pattern)
    # `Ref` keeps the dictionary from being broadcast over element by element.
    shared_names = Ref(column_names)
    return ColumnDefinition.(
        keys(col_set),
        shared_names;
        pool_arrays=pool_arrays,
        name_join_pattern=name_join_pattern,
    )
end

function current_path_name(c::ColumnDefinition, level)

"""
    current_path_name(c::ColumnDefinition, level::Integer)

Return the entry of `c`'s field path at position `level`.
"""
function current_path_name(c::ColumnDefinition, level::Integer)
    # `Integer` instead of `Int64`: on 32-bit platforms `Int` is `Int32`, which an
    # `Int64` annotation would reject with a `MethodError`.
    return get_field_path(c)[level]
end
# Distinct field-path entries at depth `level`, taken over every definition in `defs`.
function get_unique_current_names(defs, level)
    return unique(current_path_name(def, level) for def in defs)
end
function make_column_def_child_copies(column_defs::Vector{ColumnDefinition}, name, level)
return filter(

"""
get_unique_current_names(defs, level)
Get all unique names for the given depth level for a list of ColumnDefinitions
"""
function get_unique_current_names(defs::AbstractArray{ColumnDefinition}, level)
    # Lazily pull the path entry at `level` from each definition, then deduplicate.
    names_at_level = (current_path_name(def, level) for def in defs)
    return unique(names_at_level)
end

"""
make_column_def_child_copies(column_defs::AbstractArray{ColumnDefinition}, name, level)
Return a view of the column definitions that have children for the given name at the given level.
"""
function make_column_def_child_copies(column_defs::AbstractArray{ColumnDefinition}, name, level::Integer)
    # Keep a definition only when its path has `name` at `level` and continues below
    # that level. `level` is an `Integer` so `Int32` levels are accepted as well.
    mask = map(column_defs) do def
        is_current_name(def, name, level) && has_more_keys(def, level)
    end
    # A view, not a copy: the result shares storage with `column_defs`.
    return view(column_defs, mask)
end
"""
is_current_name(column_def::ColumnDefinition, name, level)
Check if name matches the field path for column_def at level
"""
function is_current_name(column_def::ColumnDefinition, name, level)
    # Compare the path entry at `level` with `name` using `==`.
    entry = current_path_name(column_def, level)
    return entry == name
end
"""
has_more_keys(column_def, level)
Check if there are more keys in the field path below the given level
"""
function has_more_keys(column_def, level)
    # True when the field path is longer than `level`.
    path_length = length(get_field_path(column_def))
    return path_length > level
end

end # ColumnDefinitions
89 changes: 0 additions & 89 deletions src/ColumnSet.jl

This file was deleted.

Loading

2 comments on commit d818a3b

@mrufsvold
Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/87059

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v1.1.0 -m "<description of version>" d818a3b3c7206101dab2085e6e907c800ff297b4
git push origin v1.1.0

Please sign in to comment.