diff --git a/Project.toml b/Project.toml index 6791309..8267ba8 100644 --- a/Project.toml +++ b/Project.toml @@ -1,17 +1,21 @@ name = "ExpandNestedData" uuid = "8a7d223a-a7dc-4abf-8bc1-b0ce2ace9adc" authors = ["Micah Rufsvold "] -version = "1.0.0" +version = "1.1.0" [deps] +Compat = "34da2185-b29b-5c13-b0c7-acf172513d20" DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" +Logging = "56ddb016-857b-54e1-b83d-db4d58db5568" PooledArrays = "2dfb63ee-cc39-5dd5-95bd-886bf059d720" StructTypes = "856f2bd8-1eba-4b0a-8007-ebc267875bd4" +SumTypes = "8e1ec7a9-0e02-4297-b0fe-6433085c89f2" Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" TypedTables = "9d95f2ec-7b3d-5a63-8d20-e2491e220bb9" [compat] -DataStructures = "0.18" +Compat = "3.42, 4" +DataStructures = "0.18.14" PooledArrays = "1.4" StructTypes = "1.10" Tables = "1" diff --git a/benchmarks/benchmarks.jl b/benchmarks/benchmarks.jl new file mode 100644 index 0000000..f58d514 --- /dev/null +++ b/benchmarks/benchmarks.jl @@ -0,0 +1,32 @@ +using ExpandNestedData + +small_dict = Dict( + :a => 1, + :b => "2", + :c => Dict(:e => Symbol(3), :f => 4) +) + + +many_records = [ + small_dict + for _ in 1:1000 +] + +function make_deep_dict(depth=1) + if depth == 1 + return Dict(Symbol(depth) => 1) + end + return Dict( + Symbol(i) => make_deep_dict(depth-1) + for i in 1:3 + ) +end + +# @btime expand($small_dict; lazy_columns=true, column_style=:nested) +# @profview ExpandNestedData.expand(small_dict; lazy_columns=true, column_style=:nested); +# deep_dict = make_deep_dict(10) +# @btime expand($deep_dict; lazy_columns=true, column_style=:nested); +# @descend expand(many_records; lazy_columns=true, column_style=:nested) +# @profview ExpandNestedData.expand(many_records; lazy_columns=true, column_style=:nested) +# @profview_allocs ExpandNestedData.expand(many_records; lazy_columns=true, column_style=:nested) +# @btime expand($many_records; lazy_columns=true, column_style=:nested); \ No newline at end of file diff --git 
a/src/ExpandTypes.jl b/src/ColumnDefinitions.jl similarity index 61% rename from src/ExpandTypes.jl rename to src/ColumnDefinitions.jl index ccc5019..a4bd579 100644 --- a/src/ExpandTypes.jl +++ b/src/ColumnDefinitions.jl @@ -1,31 +1,8 @@ -@enum StepType dict arr leaf default merge_cols stack_cols columns +module ColumnDefinitions +using ..ColumnSetManagers: ColumnSet, unnamed +import ..join_names +export ColumnDefinition, get_field_path, get_column_name, get_default_value, get_pool_arrays, make_column_def_child_copies, current_path_name, construct_column_definitions -struct ExpandMissing end -struct UnpackStep{N,T,C} - type::StepType - name::N - data::T - level::Int64 - path_node::C -end -get_step_type(u::UnpackStep) = u.type -get_name(u::UnpackStep) = u.name -get_data(u::UnpackStep) = u.data -get_level(u::UnpackStep) = u.level -get_path_node(u::UnpackStep) = u.path_node - -"""NameValueContainer is an abstraction on Dict and DataType structs so that we can get their -contents without worrying about `getkey` or `getproperty`, etc. -""" -NameValueContainer = Union{StructTypes.DictType, StructTypes.DataType} -Container = Union{StructTypes.DictType, StructTypes.DataType, StructTypes.ArrayType} - -is_NameValueContainer(t) = typeof(StructTypes.StructType(t)) <: NameValueContainer -is_container(t) = typeof(StructTypes.StructType(t)) <: Container -is_value_type(t::Type) = !is_container(t) && isconcretetype(t) - -##### ColumnDefinition ##### -############################ """ColumnDefinition provides a mechanism for specifying details for extracting data from a nested data source""" struct ColumnDefinition @@ -50,7 +27,6 @@ get_pool_arrays(c::ColumnDefinition) = c.pool_arrays ## Keyword Args * `column_name::Symbol`: A name for the resulting column. If `nothing`, defaults to joining the `field_path` with snake case format. -* `flatten_arrays::Bool`: When a leaf node is an array, should the values be flattened into separate rows or treated as a single value. 
Default: `true` * `default_value`: When the field_path keys do not exist on one or more branches, fill with this value. Default: `missing` * `pool_arrays::Bool`: When collecting values for this column, choose whether to use `PooledArrays` instead of `Base.Vector`. Default: `false` (use `Vector`) * `name_join_pattern::String`: The separator for joining field paths into column names. Default: "_" @@ -62,30 +38,52 @@ function ColumnDefinition(field_path; kwargs...) end function ColumnDefinition(field_path::T; column_name=nothing, default_value=missing, pool_arrays=false, name_join_pattern::String = "_") where {T <: Tuple} if column_name isa Nothing - path = last(field_path) == :unnamed ? field_path[1:end-1] : field_path + path = last(field_path) == unnamed ? field_path[1:end-1] : field_path column_name = join_names(path, name_join_pattern) end ColumnDefinition(field_path, column_name, default_value, pool_arrays) end function ColumnDefinition(field_path, column_names::Dict; pool_arrays::Bool, name_join_pattern = "_") - column_name = field_path in keys(column_names) ? column_names[field_path] : nothing + column_name = haskey(column_names, field_path) ? 
column_names[field_path] : nothing ColumnDefinition(field_path; column_name=column_name, pool_arrays=pool_arrays, name_join_pattern = name_join_pattern) end -function construct_column_definitions(columns, column_names, pool_arrays, name_join_pattern) - paths = keys(columns) +function construct_column_definitions(col_set, column_names, pool_arrays, name_join_pattern) + paths = keys(col_set) return ColumnDefinition.(paths, Ref(column_names); pool_arrays=pool_arrays, name_join_pattern) end -function current_path_name(c::ColumnDefinition, level) + +function current_path_name(c::ColumnDefinition, level::Int64) fp = get_field_path(c) return fp[level] end -get_unique_current_names(defs, level) = unique((current_path_name(def, level) for def in defs)) -function make_column_def_child_copies(column_defs::Vector{ColumnDefinition}, name, level) - return filter( + +""" + get_unique_current_names(defs, level) +Get all unique names for the given depth level for a list of ColumnDefinitions +""" +get_unique_current_names(defs::AbstractArray{ColumnDefinition}, level) = unique((current_path_name(def, level) for def in defs)) + +""" + make_column_def_child_copies(column_defs::Vector{ColumnDefinition}, name, level) +Return a column definitions that have children for the given name at the given level. 
+""" +function make_column_def_child_copies(column_defs::AbstractArray{ColumnDefinition}, name, level::Int64) + mask = map( def -> is_current_name(def, name, level) && length(get_field_path(def)) > level, column_defs ) + return view(column_defs, mask) end +""" + is_current_name(column_def::ColumnDefinition, name, level) +Check if name matches the field path for column_def at level +""" is_current_name(column_def::ColumnDefinition, name, level) = current_path_name(column_def, level) == name +""" + has_more_keys(column_def, level) +Check if there are more keys in the field path below the given level +""" has_more_keys(column_def, level) = level < length(get_field_path(column_def)) + +end # ColumnDefinitions diff --git a/src/ColumnSet.jl b/src/ColumnSet.jl deleted file mode 100644 index f2ee42e..0000000 --- a/src/ColumnSet.jl +++ /dev/null @@ -1,89 +0,0 @@ -##### ColumnSet ##### -##################### - -# Convenience alias for a dictionary of columns -ColumnSet = Dict{Tuple, NestedIterator} -columnset(col, level) = ColumnSet(Tuple(() for _ in 1:level) => col) -init_column_set(step) = init_column_set(get_data(step), get_name(step), get_level(step)) -function init_column_set(data, name, level) - col_set = columnset(NestedIterator(data), level) - prepend_name!(col_set, name, level) - return col_set -end - -column_length(cols) = cols |> values |> first |> length - -"""apply_in_place!(cols, f, args...) -Apply f to each key, column pair by popping the value and readding -the key (this prevents mismatching key hashes after manipulating a ColumnSet)""" -function apply_in_place!(cols, f, args...) - initial_keys = collect(keys(cols)) - for key in initial_keys - val = pop!(cols, key) - key, val = f(key, val, args...) 
- cols[key] = val - end -end - -""" -prepend_name!(cols, name, level) -Set the given name for all column keys at the given level -""" -function prepend_name!(cols, name, level) - level < 1 && return nothing - apply_in_place!(cols, _prepend_name, name, level) -end -function _prepend_name(key, val, name, level) - new_key = Tuple(i==level ? name : k for (i,k) in enumerate(key)) - return new_key, val -end - -""" -repeat_each_column!(cols, n) - -Given a column set, apply repeat_each to all columns in place -""" -function repeat_each_column!(cols, n) - apply_in_place!(cols,_repeat_each_column, n) -end -function _repeat_each_column(key, val, n) - return key, repeat_each(val, n) -end - -""" -cycle_columns_to_length!(cols::ColumnSet) - -Given a column set where the length of all columns is some factor of the length of the longest -column, cycle all the short columns to match the length of the longest -""" -function cycle_columns_to_length!(cols::ColumnSet) - col_lengths = cols |> values .|> length - longest = col_lengths |> maximum - apply_in_place!(cols, cols_to_length, longest) - return cols -end -function cols_to_length(key, val, longest) - catchup_mult = longest ÷ length(val) - return key, cycle(val, catchup_mult) -end - -""" -get_column(cols::ColumnSet, name, default::NestedIterator) - -Get a column from a set with a given name, if no column with that name is found -construct a new column with same length as column set -""" -get_column(cols::ColumnSet, name, default::NestedIterator) = name in keys(cols) ? 
cols[name] : cycle(default, column_length(cols) ÷ length(default)) - - - -"""Return a missing column for each member of a child path""" -function make_missing_column_set(path_node) - missing_column_set = Dict( - get_field_path(value_node) => get_default(value_node) - for value_node in get_all_value_nodes(path_node) - ) - return missing_column_set -end - - diff --git a/src/ColumnSetManager.jl b/src/ColumnSetManager.jl new file mode 100644 index 0000000..4373512 --- /dev/null +++ b/src/ColumnSetManager.jl @@ -0,0 +1,335 @@ +module ColumnSetManagers +using DataStructures: OrderedRobinDict, Stack +using ..NameLists: NameID, NameList, top_level_id, unnamed_id, unnamed, max_id +using ..NestedIterators +import ..get_name +import ..get_id +import ..collect_tuple +export NameID, NameList, top_level_id, unnamed, unnamed_id +export ColumnSet, cycle_columns_to_length!, repeat_each_column!, get_first_key, get_total_length, column_length, set_length! +export ColumnSetManager, get_id, get_name, get_id_for_path, get_column_set, free_column_set!, build_final_column_set, init_column_set, reconstruct_field_path + + + +#### ColumnSet #### +################### +"""A dict-like set of columns. The keys are Int64 ids for actual names that are stored +in the ColumnSetManager""" +mutable struct ColumnSet + cols::Vector{Pair{NameID, RawNestedIterator}} + len::Int64 +end +ColumnSet() = ColumnSet(Pair{NameID, RawNestedIterator}[], 0) +function ColumnSet(p::Pair...) 
+ cs = ColumnSet(Pair{NameID, RawNestedIterator}[p...], 0) + sort_keys!(cs) + reset_length!(cs) + return cs +end + +# Dict Interface +function Base.empty!(cs::ColumnSet) + empty!(cs.cols) + cs.len = 0 +end + +Base.haskey(cs::ColumnSet, k) = insorted((k,0), cs.cols; by=first) + +function Base.setindex!(cs::ColumnSet, v, k) + insert!(cs, (k=>v)) + cs.len = max(cs.len, length(v)) + return cs +end + +function Base.getindex(cs::ColumnSet, k) + i = searchsortedfirst(cs.cols, (k,0); by=first) + p = cs.cols[i] + if p[1] != k + throw(KeyError(k)) + end + return p[2] +end + + +""" + get_column!(cols::ColumnSet, name, default::RawNestedIterator) +Get a column from a set with a given name, if no column with that name is found +construct a new column with same length as column set +""" +function Base.pop!(cols::ColumnSet, name_id::NameID, default::RawNestedIterator) + return if haskey(cols,name_id) + last(pop!(cols,name_id)) + else + cycle(default, column_length(cols)) + end +end +function Base.pop!(cs::ColumnSet, k) + i = searchsortedfirst(cs.cols, (k,0); by=first) + p = popat!(cs.cols, i) + if p[1] != k + throw(KeyError(k)) + end + return p +end + +function Base.push!(cs::ColumnSet, p::Pair) + push!(cs.cols, p) +end + +function Base.merge!(cs1::ColumnSet, cs2::ColumnSet) + append!(cs1.cols, cs2.cols) + sort_keys!(cs1) + cs1.len = max(cs1.len, cs2.len) + return cs1 +end + +Base.keys(cs::ColumnSet) = (first(p) for p in cs.cols) +Base.values(cs::ColumnSet) = (last(p) for p in cs.cols) +Base.pairs(cs::ColumnSet) = (p for p in cs.cols) +Base.isequal(::ColumnSet, ::ColumnSet) = throw(ErrorException("To compare ColumnSet with ColumnSet, you must pass a ColumnSetManager.")) +function Base.isequal(cs::ColumnSet, o::ColumnSet, csm) + all(isequal.(keys(cs), keys(o))) && all(isequal.(values(cs), values(o), Ref(csm))) +end + +Base.length(cs::ColumnSet) = length(cs.cols) +Base.insert!(cs::ColumnSet, p::Pair) = insert!(cs.cols, searchsortedfirst(cs.cols, p; by=first), p) + 
+sort_keys!(cs::ColumnSet) = sort!(cs.cols; by=first, alg=InsertionSort) +"""Get the length of the longest column. This is almost always the length of all columns in the set +except in the midst of merging multiple sets together""" +column_length(cols::ColumnSet) = cols.len + +"""Check for the longest column in a set and update column length of set""" +function reset_length!(cs::ColumnSet) + len = 0 + for v in values(cs) + len = max(len, length(v)) + end + set_length!(cs, len) + return cs +end + +"""Force column length of set""" +function set_length!(cs::ColumnSet, l) + cs.len = l +end + + +""" +The ColumnSetManager creates IDs and stores for keys in the input data and for full field paths. +It also keeps ColumnSets that are no longer in use and recycles them when a new ColumnSet is needed +""" +struct ColumnSetManager{T} + # todo using dict here reduced JET errors, need to test performance later + name_to_id::OrderedRobinDict{Any, NameID} + id_generator::T + column_sets::Stack{ColumnSet} + name_list_collector::Vector{NameID} +end +function ColumnSetManager() + name_to_id = OrderedRobinDict{Any, NameID}(unnamed => unnamed_id) + id_generator = Iterators.Stateful(Iterators.countfrom(2)) + column_sets = Stack{ColumnSet}() + name_list_collector = NameID[] + return ColumnSetManager( + name_to_id, + id_generator, + column_sets, + name_list_collector, + ) +end + + +""" + get_id(csm::ColumnSetManager, name) +Get an id for a new or existing name within a field path +""" +function get_id(csm::ColumnSetManager, name) + if haskey(csm.name_to_id, name) + return csm.name_to_id[name] + end + id = NameID(first(csm.id_generator)) + csm.name_to_id[name] = id + return id +end + +get_id(::ColumnSetManager, name::NameID) = name + + +""" + get_id(csm::ColumnSetManager, field_path::Cons{Int64}) + +Get an id for the linked list of ids that constitute a field path in the core loop +""" +function get_id(csm::ColumnSetManager, name_list::NameList) + field_path = 
collect_name_ids(csm::ColumnSetManager, name_list::NameList) + path_tuple = collect_tuple(field_path) + return get_id(csm, path_tuple) +end + +function collect_name_ids(csm::ColumnSetManager, name_list::NameList) + empty!(csm.name_list_collector) + head::NameList = name_list + while head.i != top_level_id + push!(csm.name_list_collector, head.i) + head = head.tail_i + end + # need to reverse field path because we stack the last on top as we descend through the data structure + return Iterators.reverse(csm.name_list_collector) +end + + +""" + get_id_for_path(csm::ColumnSetManager, field_path:Tuple) + +Given a path of actual names, create an id for each name, then create a id for the +new id path, return that final id +""" +function get_id_for_path(csm::ColumnSetManager, field_path::Tuple) + path_tuple = tuple((get_id(csm, name) for name in field_path)...) + id = get_id(csm, path_tuple) + return id +end + +""" + get_name(csm::ColumnSetManager, id) +Return the name associated with an id +""" +function get_name(csm::ColumnSetManager, id::NameID) + return csm.name_to_id.keys[id.id] +end + +""" + reconstruct_field_path(csm::ColumnSetManager, id) +Given an id for a field_path, reconstruct a tuple of actual names. +For heterogenous nodes (with both containers and values) the field path has an "unnamed" value. This +is dropped when reconstructing the field path. +""" +function reconstruct_field_path(csm::ColumnSetManager, id::NameID, include_unnamed=false) + id_path = get_name(csm, id) + symbol_gen = include_unnamed ? + (Symbol(get_name(csm, name_id)) for name_id in id_path) : + (Symbol(get_name(csm, name_id)) for name_id in id_path if name_id != unnamed_id) + return tuple(symbol_gen...) 
+end + +""" + get_column_set(csm::ColumnSetManager) +Get a new ColumnSet from the manager +""" +function get_column_set(csm::ColumnSetManager) + col_set = if !isempty(csm.column_sets) + pop!(csm.column_sets) + else + ColumnSet() + end + return col_set +end + +""" + free_column_set!(csm::ColumnSetManager, column_set::ColumnSet) +Return a ColumnSet so that it can be recycled in future `get_column_set` calls +""" +function free_column_set!(csm::ColumnSetManager, column_set::ColumnSet) + empty!(column_set) + push!(csm.column_sets, column_set) +end + +""" + Base.merge!(csm::ColumnSetManager, cs1, cs2) +Merge cs2 into cs1 and free cs2 +""" +function Base.merge!(csm::ColumnSetManager, cs1::ColumnSet, cs2::ColumnSet) + merge!(cs1, cs2) + free_column_set!(csm, cs2) + return cs1 +end + +""" + init_column_set(csm::ColumnSetManager, name::Cons{Int64}, data) +Create a new ColumnSet containing an id for name and a RawNestedIterator around data +""" +function init_column_set(csm::ColumnSetManager, name::NameList, data) + col = RawNestedIterator(csm, data) + cs = get_column_set(csm) + id = get_id(csm, name) + cs[id] = col + return cs +end + +""" + build_final_column_set(csm::ColumnSetManager, raw_cs) +Take a ColumnSet with ID keys and reconstruct a column_set with actual names keys +""" +function build_final_column_set(csm::ColumnSetManager, raw_cs) + # todo -- we could track the longest field_path and then make the tuple length known + # todo -- can the final columnset be changed to symbols at this point? 
+ final_cs = OrderedRobinDict{Tuple, NestedIterator}() + for (raw_id, column) in pairs(raw_cs) + field_path = reconstruct_field_path(csm, raw_id) + final_cs[field_path] = NestedIterator(csm, column) + end + return final_cs +end + +""" + get_total_length(vec_of_col_sets) +Add up the column_length of all columns in a vector +""" +function get_total_length(vec_of_col_sets) + len = 0 + for col_set in vec_of_col_sets + len += column_length(col_set) + end + return len +end + + +"""apply_in_place!(cols, f, args...) +Apply f to each key, column pair by popping the value and readding +the key (this prevents mismatching key hashes after manipulating a ColumnSet)""" +function apply_in_place!(cols, f, args...) + for i in eachindex(cols) + k, v = cols[i] + val = f(v, args...) + cols[i] = Pair(k,val) + end +end + +""" +repeat_each_column!(cols, n) + +Given a column set, apply repeat_each to all columns in place +""" +function repeat_each_column!(col_set::ColumnSet, n) + apply_in_place!(col_set.cols, repeat_each, n) + col_set.len *= n + return col_set +end + +""" +cycle_columns_to_length!(cols::ColumnSet) + +Given a column set where the length of all columns is some factor of the length of the longest +column, cycle all the short columns to match the length of the longest +""" +function cycle_columns_to_length!(col_set::ColumnSet) + longest = col_set.len + apply_in_place!(col_set.cols, cols_to_length, longest) + return col_set +end +function cols_to_length(val, longest) + catchup_mult = longest ÷ length(val) + return cycle(val, catchup_mult) +end + +""" + get_first_key(cs::ColumnSet) +Return the lowest value id key from a columnset +""" +get_first_key(cs::ColumnSet) = length(cs) > 0 ? 
first(first(cs.cols)) : max_id + + + + +end # ColumnSetManagers diff --git a/src/Core.jl b/src/Core.jl index 4b0cc7a..6f3e451 100644 --- a/src/Core.jl +++ b/src/Core.jl @@ -1,5 +1,3 @@ -using DataStructures: Stack - """ expand(data, column_defs=nothing; default_value = missing, @@ -33,14 +31,17 @@ function expand(data, column_definitions=nothing; name_join_pattern = "_") typed_column_style = get_column_style(column_style) - path_graph = make_path_graph(column_definitions) - columns = create_columns(data, path_graph; default_value=default_value) + csm = ColumnSetManager() + path_graph = make_path_graph(csm, column_definitions) + + raw_columns = create_columns(data, path_graph, csm, default_value) + columns = build_final_column_set(csm, raw_columns) final_path_graph = column_definitions isa Nothing ? - make_path_graph(construct_column_definitions(columns, column_names, pool_arrays, name_join_pattern)) : + make_path_graph(csm, construct_column_definitions(columns, column_names, pool_arrays, name_join_pattern)) : path_graph - expanded_table = ExpandedTable(columns, final_path_graph; lazy_columns=lazy_columns, pool_arrays=pool_arrays) + expanded_table = ExpandedTable(columns, final_path_graph, csm; lazy_columns=lazy_columns, pool_arrays=pool_arrays) final_table = if typed_column_style == flat_columns as_flat_table(expanded_table) @@ -50,82 +51,105 @@ function expand(data, column_definitions=nothing; return final_table end -"""Wrap an object in the correct UnpackStep""" -function wrap_object(name::N, data::T, level::Int64, path_node::C, step_type::S=nothing) where {N,T,C,S} - if T <: ExpandMissing - return default_object(name, level, path_node) - end - struct_t = typeof(StructTypes.StructType(T)) - obj_type = if S <: StepType - step_type - elseif struct_t <: StructTypes.ArrayType - arr - elseif struct_t <: NameValueContainer - dict - else - leaf - end - return UnpackStep{N,T,C}(obj_type, name, data, level, path_node) -end - -# Helper functions for building Unpack steps 
-default_object(name::N, level, path_node::C) where {N,C} = UnpackStep{N, Nothing, C}(default, name, nothing, level, path_node) -stack_instruction(name::N, col_n, level) where N = UnpackStep{N, Int64, Nothing}(stack_cols, name, col_n, level, nothing) -merge_instruction(name::N, col_n, level) where N = UnpackStep{N, Int64, Nothing}(merge_cols, name, col_n, level, nothing) -column_set_step(cols::T) where T = UnpackStep{Nothing, T, Nothing}(columns, nothing, cols, 0, nothing) - - -function create_columns(data, path_graph; default_value=missing, kwargs...) - default_column = NestedIterator(default_value) +function create_columns(data, path_graph, csm, default_value=missing) + default_column = RawNestedIterator(csm, default_value) @assert length(default_column) == 1 "The default value must have a length of 1. If you want the value to have a length, try wrapping in a Tuple with `(default_val,)`" column_stack = ColumnSet[] instruction_stack = Stack{UnpackStep}() - push!(instruction_stack, wrap_object(:top_level, data, 0, path_graph)) + + push!(instruction_stack, wrap_object(NameList(), data, path_graph)) while !isempty(instruction_stack) step = pop!(instruction_stack) - dispatch_step!(step, default_column, column_stack, instruction_stack) + dispatch_step!(step, default_column, column_stack, instruction_stack, csm) end @assert length(column_stack) == 1 "Internal Error, more than one column stack resulted" return first(column_stack) end + + """ dispatch_step!(step, default_column, column_stack, instruction_stack) Generic dispatch to the correct function for this step """ -function dispatch_step!(step, default_column, column_stack, instruction_stack) - step_type = get_step_type(step) - - if step_type == columns - push!(column_stack, get_data(step)) - elseif step_type == default - level = get_level(step) - col_set = columnset(default_column, level) - prepend_name!(col_set, get_name(step), level) - push!(column_stack, col_set) - elseif step_type == merge_cols - 
merge_cols!(step, column_stack) - elseif step_type == stack_cols - stack_cols!(step, column_stack, default_column) - elseif step_type == dict - process_dict!(step, instruction_stack) - elseif step_type == arr - process_array!(step, instruction_stack) - elseif step_type == leaf - process_leaf!(step, instruction_stack) +function dispatch_step!(step, default_column, column_stack, instruction_stack, csm) + @debug "dispatching" step=step + @cases step begin + DictStep(n,d,p) => process_dict!(n, d, p, instruction_stack, csm) + ArrayStep(n,d,p) => process_array!(n, d, p, instruction_stack, csm) + LeafStep(n,d) => process_leaf!(n, d, instruction_stack, csm) + DefaultStep(n) => create_default_column_set!(n, default_column, column_stack, csm) + MergeStep(d) => merge_cols!(d, column_stack, csm) + StackStep(d) => stack_cols!(d, column_stack, default_column, csm) + NewColumnSetStep(cs) => push!(column_stack, cs) end return nothing end """ -process_leaf!(step, instruction_stack) + process_leaf!(step, instruction_stack, csm) Take a value at the end of a path and wrap it in a new ColumnSet """ -function process_leaf!(step, instruction_stack) - push!(instruction_stack, column_set_step(init_column_set(step))) +function process_leaf!(name_list, data, instruction_stack, csm) + push!(instruction_stack, init_column_set_step(csm, name_list, data)) end +""" + create_default_column_set!(step, default_column, column_stack, csm) +Build a column set with a single column which is the default column for the run +""" +function create_default_column_set!(name_list, default_column, column_stack, csm) + name_id = get_id(csm, name_list) + col_set = get_column_set(csm) + col_set[name_id] = default_column + push!(column_stack, col_set) +end + +""" + process_dict!(step::UnpackStep, instruction_stack) + +Handle a NameValuePair container (struct or dict) by calling process on all values with a +new UnpackStep that has a name matching the key. 
If ColumnDefinitions are provided, then +only grab the keys that apply and add default columns where a key is missing. +""" +function process_dict!(parent_name_list, data, node, instruction_stack, csm) + data_name_ids = get_id.(Ref(csm), get_names(data)) + @debug "processing NameValueContainer" step_type=:dict dtype=typeof(data) key_ids=data_name_ids + + child_nodes = @cases node begin + Path => [c for c in get_children(node) if get_name(c) != unnamed_id] + Value => throw(ErrorException("Got value node in process_dict, should have been passed to process_leaf")) + Simple => (SimpleNode(id) for id in data_name_ids) + end + + if length(child_nodes) == 0 + push!(instruction_stack, empty_column_set_step(csm)) + return nothing + end + + push!(instruction_stack, MergeStep(length(child_nodes))) + + for child_node in child_nodes + name_id = get_name(child_node) + @debug "getting information for child" name=name node=child_node + name_list = NameList(parent_name_list, name_id) + # TODO we have to do this lookup twice (once to make id, once to get name back) + # it would be better to zip up the name_ids with the values as they're constructed + name = get_name(csm, name_id) + child_data = get_value(data, name, ExpandMissing()) + @debug "child data retrieved" data=child_data + data_has_name = name_id in data_name_ids + next_step = @cases child_node begin + Path => wrap_container_val(data_has_name, name_list, child_data, child_node, csm) + Value => wrap_object(name_list, child_data, child_node, LeafStep) + Simple => wrap_object(name_list, child_data, child_node) + end + @debug "Adding next step" child_name=name step=next_step + push!(instruction_stack, next_step) + end + return nothing +end """ process_array!(step::UnpackStep, instruction_stack) @@ -136,26 +160,25 @@ If it is all "values", return it to be processed as a leaf If it is a mix, take the loose "values" and process as a leaf. Then merge that ColumnSet with the ColumnSet resulting from stacking the containers. 
""" -function process_array!(step::UnpackStep{N,T,C}, instruction_stack) where {N,T,C} - arr = get_data(step) - name = get_name(step) - level = get_level(step) - path_node = get_path_node(step) - element_count = length(arr) - +function process_array!(name_list, arr::T, node, instruction_stack, csm) where {T <: AbstractArray} + element_count = length(arr)::Int64 + @debug "Processing array" dtype=T arr_len=element_count if element_count == 0 # If we have column defs, but the array is empty, that means we need to make a # missing column_set - next_step = !(C <: Union{SimpleNode, Nothing}) ? - column_set_step(make_missing_column_set(path_node)) : - default_object(name, level, path_node) - push!(instruction_stack, next_step) + @cases node begin + [Path,Value] => empty_arr_path!(csm, node, instruction_stack) + Simple => empty_arr_simple!(name_list, instruction_stack) + end return nothing elseif element_count == 1 - push!(instruction_stack, wrap_object(name, first(arr), level, path_node)) + @cases node begin + [Path,Value,Simple] => push!(instruction_stack, wrap_object(name_list, first(arr), node)) + end + return nothing elseif all_eltypes_are_values(T) - push!(instruction_stack, column_set_step(init_column_set(arr, name, level))) + push!(instruction_stack, init_column_set_step(csm, name_list, arr)) return nothing end @@ -164,84 +187,50 @@ function process_array!(step::UnpackStep{N,T,C}, instruction_stack) where {N,T,C # Arrays with a mix need to be split and processed separately is_container_mask = is_container.(arr) container_count = sum(is_container_mask) - no_containers = container_count == 0 - all_containers = container_count == element_count - + all_containers, no_containers = @cases node begin + Simple => (container_count == element_count, container_count == 0) + Value => (false, true) + Path(_,c) => calculate_container_status_for_path_node(c, container_count) + end + @debug "element_types" all_containers=all_containers no_containers=no_containers if no_containers 
- push!(instruction_stack, column_set_step(init_column_set(arr, name, level))) + push!(instruction_stack, init_column_set_step(csm, name_list, arr)) return nothing end # The loose values will need to by merged into the stacked objects below if !all_containers - push!(instruction_stack, merge_instruction(name, 2, level)) - loose_values = [e for (f,e) in zip(is_container_mask, arr) if !f] - push!(instruction_stack, wrap_object(:unnamed, loose_values, level+1, path_node, leaf)) + push!(instruction_stack, MergeStep(2)) + loose_values = view(arr, .!is_container_mask) + next_step = length(loose_values) == 0 ? + missing_column_set_step(csm, node) : + wrap_object(NameList(name_list, unnamed_id), loose_values, node, LeafStep) + @debug "loose values" next_step=next_step + push!(instruction_stack, next_step) end - push!(instruction_stack, stack_instruction(name, container_count, level)) + container_count > 1 && push!(instruction_stack, StackStep(container_count)) - containers = all_containers ? arr : [e for (f,e) in zip(is_container_mask, arr) if f] + containers = view(arr, is_container_mask) for container in containers - push!(instruction_stack, wrap_object(name, container, level, path_node)) + next_step = wrap_object(name_list, container, node) + @debug "adding container element" next_step=next_step + push!(instruction_stack, next_step) end end -""" - process_dict!(step::UnpackStep, instruction_stack) -Handle a NameValuePair container (struct or dict) by calling process on all values with a -new UnpackStep that has a name matching the key. If ColumnDefinitions are provided, then -only grab the keys that apply and add default columns where a key is missing. -""" -function process_dict!(step::UnpackStep{N,T,C}, instruction_stack) where {N,T,C} - data = get_data(step) - level = get_level(step) - column_defs_provided = !(C <: Union{Nothing, SimpleNode}) - path_node = get_path_node(step) - data_names = get_names(data) - - child_nodes = column_defs_provided ? 
get_children(path_node) : SimpleNode.(data_names) - - names_num = length(child_nodes) - if names_num == 0 - push!(instruction_stack, column_set_step(ColumnSet())) - return nothing - end - push!(instruction_stack, merge_instruction(get_name(step), length(child_nodes), level)) - - for child_node in child_nodes - name = get_name(child_node) - # both are always true when unguided - should_have_child = !(child_node isa ValueNode) - data_has_name = name in data_names - child_data = get_value(data, name, ExpandMissing()) - - # CASE 1: Expected a child node and found one, unpack it (captures all unguided) - next_step = if should_have_child && data_has_name - wrap_object(name, child_data, level+1, child_node) - # CASE 2: Expected a child node, but don't find it - elseif should_have_child && !data_has_name - column_set_step(make_missing_column_set(child_node)) - # CASE 3: We don't expect a child node: wrap any value in a new column - elseif !should_have_child - wrap_object(name, child_data, level+1, child_node, leaf) - end - push!(instruction_stack, next_step) - end - return nothing -end ########### """ - merge_cols!(step, column_stack) + merge_cols!(set_num, column_stack, csm) Take N ColumnSets from the column_stack and merge them. This means repeating the values of each ColumnSet such that you get the Cartesian Product of their join. 
""" -function merge_cols!(step, column_stack) +function merge_cols!(set_num, column_stack, csm) col_set = pop!(column_stack) multiplier = 1 - for _ in 2:get_data(step) + for _ in 2:set_num new_col_set = pop!(column_stack) if length(new_col_set) == 0 continue @@ -250,36 +239,46 @@ function merge_cols!(step, column_stack) # to make a product of values repeat_each_column!(new_col_set, multiplier) multiplier *= column_length(new_col_set) - merge!(col_set, new_col_set) + merge!(csm, col_set, new_col_set) end if length(col_set) > 1 # catch up short columns with the total length for this group cycle_columns_to_length!(col_set) end - prepend_name!(col_set, get_name(step), get_level(step)) push!(column_stack, col_set) return nothing end """ - stack_cols!(step, column_stack, default_col) + stack_cols!(column_set_num, column_stack, default_col, csm) Take the ColumnSets that were created by processing the elements of an array and stack them together. If a column name is present in one set but not in the other, then insert a default column. 
""" -function stack_cols!(step, column_stack, default_col) - columns_to_stack = @view column_stack[end-get_data(step)+1:end] - prepend_name!.(columns_to_stack, Ref(get_name(step)), get_level(step)) - unique_names = columns_to_stack .|> keys |> Iterators.flatten |> unique - column_set = ColumnSet() - for name in unique_names - # For each unique column name, get that column for the results of processing each element - # in this array, and then stack them all - column_set[name] = columns_to_stack .|> - (col_set -> get_column(col_set, name, default_col)) |> - (cols -> foldl(stack, cols)) +function stack_cols!(column_set_num, column_stack, default_col, csm) + columns_to_stack = @view column_stack[end-column_set_num+1:end] + + new_column_set = get_column_set(csm) + total_len = get_total_length(columns_to_stack) + set_length!(new_column_set, total_len) + + # Since the column_sets are already sorted by key, the minimum first key in a columnset + # We go down each columnset and check if it has a matching key. + # From there, we either pop! 
the column if the key matches or create a default column and add + # it to the stack + vcat = NestedVcat(csm) + while !all(length(cs)==0 for cs in columns_to_stack) + first_key = minimum(get_first_key, columns_to_stack) + matching_cols = (pop!(cs, first_key, default_col) for cs in columns_to_stack) + push!(new_column_set, first_key=>foldl(vcat, matching_cols)) end - deleteat!(column_stack, length(column_stack)-get_data(step)+1:length(column_stack)) - push!(column_stack, column_set) + + # free the column_sets that are no longer needed + for _ in 1:column_set_num + cs = pop!(column_stack) + free_column_set!(csm, cs) + end + + push!(column_stack, new_column_set) return nothing end diff --git a/src/CoreHelpers.jl b/src/CoreHelpers.jl new file mode 100644 index 0000000..fd98790 --- /dev/null +++ b/src/CoreHelpers.jl @@ -0,0 +1,126 @@ +using .PathGraph +using .PathGraph: Node, get_all_value_nodes, get_field_path, get_default +using .ColumnSetManagers +using .ColumnSetManagers: ColumnSetManager, get_column_set, get_id_for_path, get_name + +@sum_type UnpackStep :hidden begin + DictStep(::NameList, ::Any, ::Node) + ArrayStep(::NameList, ::AbstractArray, ::Node) + LeafStep(::NameList, ::Any) + DefaultStep(::NameList) + MergeStep(::Int64) + StackStep(::Int64) + NewColumnSetStep(::ColumnSet) +end + +DictStep(name_list, data, path_node) = UnpackStep'.DictStep(name_list, data, path_node) +ArrayStep(name_list, arr, path_node) = UnpackStep'.ArrayStep(name_list, arr, path_node) +LeafStep(name_list, data) = UnpackStep'.LeafStep(name_list, data) +LeafStep(name_list, data, _) = UnpackStep'.LeafStep(name_list, data) +DefaultStep(name_list) = UnpackStep'.DefaultStep(name_list) +MergeStep(num_columns) = UnpackStep'.MergeStep(num_columns) +StackStep(num_columns) = UnpackStep'.StackStep(num_columns) +NewColumnSetStep(col_set) = UnpackStep'.NewColumnSetStep(col_set) + +# A couple predefined new column set step creators +missing_column_set_step(csm, path_node) = 
NewColumnSetStep(make_missing_column_set(csm, path_node)) +init_column_set_step(csm, name_list, data) = NewColumnSetStep(init_column_set(csm, name_list, data)) +empty_column_set_step(csm) = NewColumnSetStep(get_column_set(csm)) + +function PathGraph.get_name(u::UnpackStep) + return @cases u begin + [DictStep,ArrayStep,LeafStep,DefaultStep](n) => n + [MergeStep,StackStep,NewColumnSetStep] => throw(ErrorException("step has no name")) + end +end +function get_column_number(u::UnpackStep) + return @cases u begin + [MergeStep,StackStep](n) => n + [DictStep,ArrayStep,LeafStep,DefaultStep,NewColumnSetStep] => throw( + ErrorException("step does not have a column number")) + end +end +function get_data(u::UnpackStep) + return @cases u begin + [DictStep,ArrayStep,LeafStep](n,d) => d + [DefaultStep,MergeStep,StackStep,NewColumnSetStep] => throw(ErrorException("step do not have a data field")) + end +end +function get_path_node(u::UnpackStep) + return @cases u begin + [DictStep,ArrayStep](n,d,p) => p + [LeafStep,DefaultStep,MergeStep,StackStep,NewColumnSetStep] => throw( + ErrorException("step does not contain a path node")) + end +end +function ColumnSetManagers.get_column_set(u::UnpackStep) + return @cases u begin + [NewColumnSetStep](c) => c + [LeafStep,DefaultStep,MergeStep,StackStep,DictStep,ArrayStep] => throw( + ErrorException("Only NewColumnSetStep has a column_set field")) + end + node +end + +"""Wrap an object in the correct UnpackStep""" +function wrap_object(name::NameList, data::T, path_node::Node, step_type::S=nothing) where {T,S} + @debug "running wrap object" dtype=T name=name + if T <: ExpandMissing + @debug "got missing path for" name=name + return UnpackStep'.DefaultStep(name) + elseif !(S <: Nothing) + @debug "enforced step_type" step_type=step_type name=name + return step_type(name, data, path_node) + end + struct_t = typeof(StructTypes.StructType(T)) + @debug "StructType calculated" t=struct_t + _step_type = if struct_t <: StructTypes.ArrayType + 
UnpackStep'.ArrayStep + elseif struct_t <: NameValueContainer + DictStep + else + LeafStep + end + @debug "wrapping step" step_type=_step_type + return _step_type(name, data, path_node) +end + +function empty_arr_simple!(name, instruction_stack) + next_step = UnpackStep'.DefaultStep(name) + push!(instruction_stack, next_step) +end + +function empty_arr_path!(csm, path_node, instruction_stack) + next_step = missing_column_set_step(csm, path_node) + push!(instruction_stack, next_step) +end + +function calculate_container_status_for_path_node(child_nodes, container_count) + # for path nodes, we need to check if there is :unnamed (indicating that there should be loose values) + # if so, override all_containers so we check for loose + if !any(unnamed_id == get_name(n) for n in child_nodes) + # otherwise, we ignore any non-containers + return (true, false) + end + (false, container_count == 0) +end + +function wrap_container_val(data_has_name::Bool, name_list::NameList, data, node::Node, csm::ColumnSetManager) + @debug "wrap_container val for" data=data + if data_has_name + return wrap_object(name_list, data, node) + end + return missing_column_set_step(csm, node) +end + +"""Return a missing column for each member of a child path""" +function make_missing_column_set(csm, path_node::Node) + missing_column_set = get_column_set(csm) + + for value_node in get_all_value_nodes(path_node) + id = get_field_path(value_node) + missing_column_set[id] = get_default(value_node) + end + + return missing_column_set +end diff --git a/src/ExpandNestedData.jl b/src/ExpandNestedData.jl index 51fc00d..d9fc076 100644 --- a/src/ExpandNestedData.jl +++ b/src/ExpandNestedData.jl @@ -1,17 +1,38 @@ module ExpandNestedData +using Base: merge! 
+using DataStructures +using DataStructures: Stack, OrderedRobinDict +using Logging using PooledArrays using StructTypes +using SumTypes +using TypedTables: Table export expand export ColumnDefinition -export nested_columns, flat_columns + +"""NameValueContainer is an abstraction on Dict and DataType structs so that we can get their +contents without worrying about `getkey` or `getproperty`, etc. +""" +NameValueContainer = Union{StructTypes.DictType, StructTypes.DataType} +Container = Union{StructTypes.DictType, StructTypes.DataType, StructTypes.ArrayType} +struct ExpandMissing end + +function get_name end +function get_id end include("Utils.jl") +include("NameLists.jl") include("NestedIterators.jl") -include("ColumnSet.jl") -include("ExpandTypes.jl") +include("ColumnSetManager.jl") +include("ColumnDefinitions.jl") include("PathGraph.jl") +using .NestedIterators +using .ColumnSetManagers +using .ColumnDefinitions +using .PathGraph include("ExpandedTable.jl") include("Core.jl") +include("CoreHelpers.jl") end diff --git a/src/ExpandedTable.jl b/src/ExpandedTable.jl index 59f37a0..ee10898 100644 --- a/src/ExpandedTable.jl +++ b/src/ExpandedTable.jl @@ -1,6 +1,3 @@ -using Tables -using TypedTables - @enum ColumnStyle flat_columns nested_columns get_column_style(s::Symbol) = (flat=flat_columns, nested=nested_columns)[s] @@ -11,10 +8,10 @@ struct ExpandedTable end """Construct an ExpandedTable from the results of `create_columns`""" -function ExpandedTable(columns::Dict{K, T}, path_graph; lazy_columns, kwargs...) where {K, T<: NestedIterator{<:Any}} - column_tuple = make_column_tuple(columns, path_graph, lazy_columns) +function ExpandedTable(columns::OrderedRobinDict{K, T}, path_graph, csm; lazy_columns, kwargs...) 
where {K, T<: NestedIterator{<:Any}} + column_tuple = make_column_tuple(columns, path_graph, lazy_columns, csm) col_lookup = Dict( - get_final_name(val_node) => get_field_path(val_node) + get_name(csm, get_final_name(val_node)) => reconstruct_field_path(csm, get_field_path(val_node), true) for val_node in get_all_value_nodes(path_graph) ) return ExpandedTable(col_lookup, column_tuple) @@ -22,26 +19,38 @@ end """Build a nested NamedTuple of TypedTables from the columns following the same nesting structure as the source data""" -function make_column_tuple(columns, path_graph::AbstractPathNode, lazy_columns::Bool) - kvs = [] - for child in get_children(path_graph) - push!(kvs, Symbol(get_name(child)) => make_column_tuple(columns, child, lazy_columns)) +function make_column_tuple(col_set, node::Node, lazy_columns::Bool, csm) + column_t = lazy_columns ? NestedIterator : Union{Vector, PooledArray} + return make_column_tuple(col_set, node, column_t, csm) +end +function make_column_tuple(col_set, node::Node, column_t::Type{T}, csm) where T + return @cases node begin + Path(n,c) => new_level(col_set, n, c, column_t, csm) + Value(n, _, fp_id, pool, _) => new_column(col_set, n, fp_id, pool, column_t, csm) + Simple => throw(ErrorException("there should be no simple nodes when building the column tuple for the final table")) + end +end +function new_level(col_set, name_id, child_nodes, column_t::Type{T}, csm) where T + children_table = get_children_table(col_set, child_nodes, column_t, csm) + if name_id == top_level_id + return children_table end + return get_name(csm, name_id) => children_table +end - children_tuple = NamedTuple(kvs) - return Table(children_tuple) -end -function make_column_tuple(columns, path_graph::ValueNode, lazy_columns::Bool) - lazy_column = columns[get_field_path(path_graph)] - value_column = lazy_columns ? 
lazy_column : collect(lazy_column, get_pool_arrays(path_graph)) - if length(get_children(path_graph)) > 0 - d = Dict(:unnamed => value_column) - for child in get_children(path_graph) - d[Symbol(get_name(child))] = make_column_tuple(columns, child, lazy_columns) - end - return Table(NamedTuple(d)) +function get_children_table(col_set, child_nodes, column_t::Type{T}, csm) where T + keyval_pairs = Vector{Pair{Symbol, Union{Table,T}}}(undef, length(child_nodes)) + for (i, child) in enumerate(child_nodes) + keyval_pairs[i] = make_column_tuple(col_set, child, column_t, csm) end - return value_column + return Table(NamedTuple(keyval_pairs)) + +end +function new_column(col_set, name_id, field_path_id, pool_arrays, ::Type{T}, csm) where T + field_path = reconstruct_field_path(csm, field_path_id) + lazy_column = col_set[field_path] + value_column = T <: NestedIterator ? lazy_column : collect(lazy_column, pool_arrays) + return get_name(csm, name_id) => value_column end # Get Tables diff --git a/src/NameLists.jl b/src/NameLists.jl new file mode 100644 index 0000000..067ae55 --- /dev/null +++ b/src/NameLists.jl @@ -0,0 +1,28 @@ +module NameLists +#### Linked List for Key/Names #### +################################### +# An ID referring to a key/name in the input +struct NameID + id::Int64 +end +Base.isless(n::NameID, o::NameID) = n.id < o.id + +# Points to current head of a NameList +struct NameList + tail_i::Union{NameList, Nothing} + i::NameID +end +NameList() = NameList(nothing, top_level_id) + +#### Constants #### +################### + +const no_name_id = NameID(-1) +"""A NameID for TOP_LEVEL""" +const top_level_id = NameID(0) +"""the id for unnamed key. 
This happens when an array has loose values and containers""" +const unnamed_id = NameID(1) +"""the name to use for unnamed keys""" +const unnamed = :expand_nested_data_unnamed +const max_id = NameID(typemax(Int64)) +end \ No newline at end of file diff --git a/src/NestedIterators.jl b/src/NestedIterators.jl index ffed7fe..7dee10c 100644 --- a/src/NestedIterators.jl +++ b/src/NestedIterators.jl @@ -1,17 +1,76 @@ -"""NestedIterator is a container for instructions that build columns""" -struct NestedIterator{T} <: AbstractArray{T, 1} - get_index::Function +module NestedIterators +using PooledArrays +using SumTypes +using Compat +using ..NameLists: NameID, no_name_id +import ..get_name +import ..get_id +export RawNestedIterator, NestedIterator, seed, repeat_each, cycle, NestedVcat + +@sum_type IterCapture :hidden begin + RawSeed(::NameID) + RawRepeat(::Int64) + RawCycle(::Int64) + RawVcat(::Int64, ::Vector{IterCapture}, ::Vector{IterCapture}) +end + +mutable struct RawNestedIterator + # todo use linked list + get_index::Vector{IterCapture} column_length::Int64 - el_type::Type{T} + el_type::Type one_value::Bool - unique_val::Ref{T} + unique_val::NameID end +""" +RawNestedIterator(csm, data; total_length=0, default_value=missing) -Base.length(ni::NestedIterator) = ni.column_length -Base.size(ni::NestedIterator) = (ni.column_length,) -Base.getindex(ni::NestedIterator, i) = ni.get_index(i) -Base.eachindex(ni::NestedIterator) = 1:length(ni) -Base.collect(x::NestedIterator, pool_arrays=false) = pool_arrays ? PooledArray(x) : Vector(x) +Construct a new RawNestedIterator seeded with the value data +# Args +csm::ColumnSetManager +data::Any: seed value +total_length::Int: Cycle the values to reach total_length (must be evenly divisible by the length of `data`) +default_value: Value to fill if data is empty +""" +function RawNestedIterator(csm, data::T; total_length::Int=0, default_value=missing) where T + value = if T <: AbstractArray + length(data) == 0 ? 
(default_value,) : data + else + (data,) + end + is_one = allequal(value) + len = length(value) + val_T = typeof(value) + id = get_id(csm, value) + ncycle = total_length == 0 ? 1 : total_length ÷ len + return RawNestedIterator(id, val_T, is_one, len, ncycle) +end + +function RawNestedIterator(value_id::NameID, ::Type{T}, is_one::Bool, len::Int64, ncycle::Int64) where T + E = eltype(T) + f = IterCapture[IterCapture'.RawSeed(value_id), IterCapture'.RawCycle(len)] + unique_val = is_one ? value_id : no_name_id + return RawNestedIterator(f, len*ncycle, E, is_one, unique_val) +end +RawNestedIterator() = RawNestedIterator(IterCapture[], 0, Union{}, false, no_name_id) + +Base.length(rni::RawNestedIterator) = rni.column_length +Base.size(rni::RawNestedIterator) = (rni.column_length,) +Base.collect(rni::RawNestedIterator, csm) = collect(NestedIterator(csm, rni)) +Base.isequal(::RawNestedIterator, ::RawNestedIterator) = throw(ErrorException("To compare RawNestedIterator with RawNestedIterators, you must pass a ColumnSetManager.")) +function Base.isequal(rni1::RawNestedIterator, rni2::RawNestedIterator, csm) + rni1.column_length == rni2.column_length || return false + rni1.el_type === rni2.el_type || return false + if rni1.one_value != rni2.one_value + return false + else + rni1.unique_val == rni2.unique_val && return true + # if one iter was seeded with (1,) but the other was seeded with [1], the unique_val ids + # will be different, so we need to check the actual values + isequal(first(get_name(csm, rni1.unique_val)), first(get_name(csm,rni2.unique_val))) && return true + end + return isequal(collect(rni1, csm), collect(rni2, csm)) +end abstract type InstructionCapture <: Function end @@ -20,6 +79,10 @@ struct Seed{T} <: InstructionCapture data::T end (s::Seed)(i) = s.data[i] +struct RawSeed + data_id::NameID +end +Seed(csm, raw_seed::RawSeed) = get_name(csm, raw_seed.data_id) """Captures the repeat value for a repeat_each call""" struct UnrepeatEach <: InstructionCapture @@ 
-27,14 +90,14 @@ struct UnrepeatEach <: InstructionCapture end (u::UnrepeatEach)(i) = ceil(Int64, i/u.n) -"""repeat_each(c, N) will return an array where each source element appears N times in a row""" -function repeat_each(c::NestedIterator{T}, n) where T +function repeat_each(c::RawNestedIterator, n) # when there is only one unique value, we can skip composing the repeat_each step - return if c.one_value - NestedIterator(c.get_index, c.column_length * n, T, true, c.unique_val) - else - NestedIterator(c.get_index ∘ UnrepeatEach(n), c.column_length * n, T, false, c.unique_val) + c.column_length *= n + if c.one_value + return c end + push!(c.get_index,IterCapture'.RawRepeat(n)) + return c end """Captures the repeat value for a cycle call""" @@ -43,69 +106,96 @@ struct Uncycle <: InstructionCapture end (u::Uncycle)(i) = mod((i-1),u.n) + 1 """cycle(c, n) cycles through an array N times""" -function cycle(c::NestedIterator{T}, n) where T +function cycle(c::RawNestedIterator, n) + original_len = c.column_length # when there is only one unique value, we can skip composing the uncycle step - return if c.one_value && !(typeof(c.get_index) <: Seed) - NestedIterator(c.get_index, c.column_length * n, T, true, c.unique_val) - else - l = length(c) - NestedIterator(c.get_index ∘ Uncycle(l), c.column_length * n, T, false, c.unique_val) + c.column_length *= n + if c.one_value + return c end + push!(c.get_index,IterCapture'.RawCycle(original_len)) + return c end -"""Captures the two getindex functions of stacked NestedIterators. f_len tells which index to break over to g.""" -struct Unstack{F, G} <: InstructionCapture +"""Captures the two getindex functions of vcated NestedIterators. f_len tells which index to break over to g.""" +struct Unvcat{F, G} <: InstructionCapture f_len::Int64 f::F g::G end -(u::Unstack)(i) = i > u.f_len ? u.g(i-u.f_len) : u.f(i) +(u::Unvcat)(i) = i > u.f_len ? 
u.g(i-u.f_len) : u.f(i) -"""stack(c1::NestedIterator, c2::NestedIterator) +"""_vcat(csm::ColumnSetManager, c1::RawNestedIterator, c2::RawNestedIterator) Return a single NestedIterator which is the result of vcat(c1,c2) """ -function stack(c1::NestedIterator{T}, c2::NestedIterator{U}) where {T, U} - type = Union{T, U} - len = (c1,c2) .|> length |> sum - - if T <: U - only_one_value = c1.one_value && c2.one_value && isequal(c1.unique_val[], c2.unique_val[]) - if only_one_value - return NestedIterator(c1.get_index, len, type, true, c1.unique_val) - end +function _vcat(csm, c1::RawNestedIterator, c2::RawNestedIterator) + c1_len = length(c1) + c2_len = length(c2) + c1_len == 0 && return c2 + c2_len == 0 && return c1 + + T1 = c1.el_type + T2 = c2.el_type + only_one_value = if T1 === T2 && c1.one_value && c2.one_value + v1 = get_single_value(csm, c1.unique_val, T1) + v2 = get_single_value(csm, c2.unique_val, T1) + isequal(v1, v2) + else + false + end + + type = Union{T1, T2} + len = c1_len + c2_len + + if only_one_value + c1.column_length = len + return c1 end - NestedIterator(Unstack(length(c1), c1.get_index, c2.get_index), len, type, false, Ref{type}()) + + return RawNestedIterator( + IterCapture[IterCapture'.RawVcat(c1_len, c1.get_index, c2.get_index)], + len, type, false, no_name_id + ) end -stack(c) = c +get_single_value(csm, id, ::Type{T}) where T = first(get_name(csm, id))::T +struct NestedVcat{T} <: Function + csm::T +end +(v::NestedVcat)(c1,c2) = _vcat(v.csm, c1, c2) +(v::NestedVcat)(c1) = c1 -""" - NestedIterator(data; total_length=nothing) +function build_get_index(csm, captures) + iter_funcs = Iterators.map(cap -> get_iter_func(csm, cap), captures) + return foldr(∘, iter_funcs) +end -Construct a new NestedIterator seeded with the value data -# Args -data::Any: seed value -total_length::Int: Cycle the values to reach total_length (must be even divisible by the length of `data`) -""" -function NestedIterator(data::T; total_length::Int=0, default_value=missing) 
where T - value = if T <: AbstractArray - length(data) == 0 ? (default_value,) : data - else - (data,) +function get_iter_func(csm, capture::IterCapture) + @cases capture begin + RawSeed(id) => Seed(get_name(csm, id)) + RawRepeat(n) => UnrepeatEach(n) + RawCycle(n) => Uncycle(n) + RawVcat(len, iter1, iter2) => Unvcat(len, build_get_index(csm, iter1), build_get_index(csm, iter2)) end - len = length(value) - ncycle = total_length < 1 ? 1 : total_length ÷ len - return _NestedIterator(value, len, ncycle) end -function _NestedIterator(value::T, len::Int64, ncycle::Int64) where T - E = eltype(T) - f = Seed(value) - is_one = len == 1 - unique_val = Ref{E}() - if is_one - unique_val[] = first(value)::E +"""NestedIterator is a container for instructions that build columns""" +struct NestedIterator{T,F} <: AbstractArray{T, 1} + get_index::F + column_length::Int64 + el_type::Type{T} + function NestedIterator(get_index, column_length, el_type) + return new{el_type, typeof(get_index)}(get_index, column_length, el_type) end - ni = NestedIterator{E}(f, len, E, is_one, unique_val) - return cycle(ni, ncycle) end +function NestedIterator(csm, raw::RawNestedIterator) + get_index = build_get_index(csm, raw.get_index) + return NestedIterator(get_index, length(raw), raw.el_type) +end +Base.length(ni::NestedIterator) = ni.column_length +Base.size(ni::NestedIterator) = (ni.column_length,) +Base.getindex(ni::NestedIterator, i) = ni.get_index(i) +Base.eachindex(ni::NestedIterator) = 1:length(ni) +Base.collect(x::NestedIterator, pool_arrays=false) = pool_arrays ? 
PooledArray(x) : Vector(x) + +end #NestedIterators diff --git a/src/PathGraph.jl b/src/PathGraph.jl index 1a28a11..5887f08 100644 --- a/src/PathGraph.jl +++ b/src/PathGraph.jl @@ -1,109 +1,146 @@ -##### PathGraph ##### -##################### +module PathGraph +using SumTypes +using ..ColumnSetManagers: ColumnSetManager, NameID, get_id, unnamed_id, unnamed, top_level_id, get_id_for_path +using ..NestedIterators: RawNestedIterator +using ..ColumnDefinitions +using ..ColumnDefinitions: ColumnDefinition, + get_unique_current_names, + get_field_path, + get_pool_arrays, + get_default_value, + get_column_name, + has_more_keys, + current_path_name, + make_column_def_child_copies +import ..get_name + +export Node, SimpleNode, ValueNode, PathNode, get_name, get_children, get_all_value_nodes, get_default, make_path_graph, get_final_name + +@sum_type Node :hidden begin + Path(::NameID, ::Vector{Node}) + Value(::NameID, ::NameID, ::NameID, ::Bool, ::Ref{RawNestedIterator}) + Simple(::NameID) +end -abstract type AbstractPathNode end +PathNode(csm::ColumnSetManager, name, children::Vector{Node}) = PathNode(get_id(csm, name), children) +PathNode(name::NameID, children::Vector{Node}) = Node'.Path(name, children) -"""A node in the ColumnDefinition graph that has children""" -struct PathNode <: AbstractPathNode - name - children::Vector{AbstractPathNode} +function ValueNode(csm::ColumnSetManager, name, final_name, field_path, pool_arrays::Bool, default::RawNestedIterator) + ValueNode(get_id(csm, name), get_id(csm, final_name), get_id_for_path(csm, field_path), pool_arrays, default) end +ValueNode(name::NameID, final_name::NameID, field_path::NameID, pool_arrays::Bool, default::RawNestedIterator) = Node'.Value(name, final_name, field_path, pool_arrays, Ref{RawNestedIterator}(default)) -"""A node in the ColumnDefinition graph that points to a leaf/value""" -struct ValueNode <: AbstractPathNode - name - final_name::Symbol - children::Vector{AbstractPathNode} - field_path::Tuple - 
pool_arrays - default::NestedIterator -end +SimpleNode(csm::ColumnSetManager, name) = SimpleNode(get_id(csm, name)) +SimpleNode(name::NameID) = Node'.Simple(name) -"""A node to capture a name (for emulating node behavior when unguided)""" -struct SimpleNode <: AbstractPathNode - name +function get_name(node::Node) + return @cases node begin + Path(n,_) => n + Value(n,_,_,_,_) => n + Simple(n) => n + end +end +function get_children(node::Node) + return @cases node begin + Path(_,c) => c + [Value,Simple] => throw(ErrorException("Value and Simple nodes do not have children")) + end +end +function get_final_name(node::Node) + return @cases node begin + [Path, Simple] => throw(ErrorException("Path and Simple nodes do not have a final_name")) + Value(_,n,_,_,_) => n + end +end +function ColumnDefinitions.get_field_path(node::Node) + return @cases node begin + [Path, Simple] => throw(ErrorException("Path and Simple nodes do not have a field_path")) + Value(_,_,p,_,_) => p + end +end +function ColumnDefinitions.get_pool_arrays(node::Node) + return @cases node begin + [Path, Simple] => throw(ErrorException("Path and Simple nodes do not have a pool_arrays")) + Value(_,_,_,p,_) => p + end end -function ValueNode(name, field_path, pool_arrays, default; col_name) - ValueNode(name, col_name, ValueNode[], field_path, pool_arrays,default) +function get_default(node::Node) + return @cases node begin + [Path, Simple] => throw(ErrorException("Path and Simple nodes do not have a default")) + Value(_,_,_,_,d) => d[] + end end -get_children(n::AbstractPathNode) = n.children -get_name(n::AbstractPathNode) = n.name -get_final_name(n::ValueNode) = n.final_name -get_field_path(n::ValueNode) = n.field_path -get_pool_arrays(n::ValueNode) = n.pool_arrays -get_default(n::ValueNode) = n.default """Given a certain level index, return the rest of the path down to the value""" -function path_to_value(c::ValueNode, current_index) +function path_to_value(c::Node, current_index) fp = get_field_path(c) 
return fp[current_index:end] end -function get_all_value_nodes(node) - value_node_channel = Channel{ValueNode}() do ch +function get_all_value_nodes(node::Node) + value_node_channel = Channel{Node}() do ch get_all_value_nodes(node, ch) end return collect(value_node_channel) end -function get_all_value_nodes(node::T, ch) where {T} - if T <: ValueNode - put!(ch, node) - return nothing +function get_all_value_nodes(node::Node, ch) + @cases node begin + Path => get_all_value_nodes.(get_children(node), Ref(ch)) + Value => put!(ch, node) + Simple => throw(ErrorException("Cannot retrieve value nodes from a simple node")) end - get_all_value_nodes.(get_children(node), Ref(ch)) return nothing end - -""" -SIDE EFFECT: also appends :unnamed to any column defs that stop at a pathnode to capture any -loose values in an array at that level -""" -function make_path_nodes!(column_defs, level = 1) +function make_path_nodes!(csm, column_defs::AbstractArray{ColumnDefinition}, level = 1) unique_names = get_unique_current_names(column_defs, level) - nodes = Vector{AbstractPathNode}(undef, length(unique_names)) + nodes = Vector{Node}(undef, length(unique_names)) for (i, unique_name) in enumerate(unique_names) - matching_defs = filter(p -> current_path_name(p, level) == unique_name, column_defs) - are_value_nodes = [!has_more_keys(def, level) for def in matching_defs] - - all_value_nodes = all(are_value_nodes) - mix_of_node_types = !all_value_nodes && any(are_value_nodes) - - if all_value_nodes - # If we got to a value node, there should only be one. - def = first(matching_defs) - nodes[i] = ValueNode( - unique_name, get_field_path(def), get_pool_arrays(def), NestedIterator(get_default_value(def)); - col_name = get_column_name(def)) - continue - end - - with_children = !mix_of_node_types ? 
- matching_defs : - [def for (is_value, def) in zip(are_value_nodes, matching_defs) if !is_value] - children_column_defs = make_column_def_child_copies(with_children, unique_name, level) - - child_nodes = make_path_nodes!(children_column_defs, level+1) - if mix_of_node_types - without_child_idx = findfirst(identity, are_value_nodes) - without_child = matching_defs[without_child_idx] - value_column_node = ValueNode( - :unnamed, - (get_field_path(without_child)..., :unnamed), - get_pool_arrays(without_child), - NestedIterator(get_default_value(without_child)); - col_name=get_column_name(without_child)) - push!(child_nodes, value_column_node) - end - - nodes[i] = PathNode(unique_name, child_nodes) + nodes[i] = extract_path_node!(csm, column_defs, unique_name, level) end return nodes -end +end + +"""Analyze the column_defs that match the unique name at this level and create a node""" +function extract_path_node!(csm, column_defs, unique_name, level) + matching_defs = filter(p -> current_path_name(p, level) == unique_name, column_defs) + are_value_nodes = [!has_more_keys(def, level) for def in matching_defs] + + all_value_nodes = all(are_value_nodes) + mix_of_node_types = !all_value_nodes && any(are_value_nodes) + + if all_value_nodes + # If we got to a value node, there should only be one. 
+ def = first(matching_defs) + return ValueNode( + csm, unique_name, get_column_name(def), get_field_path(def), get_pool_arrays(def), RawNestedIterator(csm, get_default_value(def)) + ) + end + with_children = view(matching_defs, .!are_value_nodes) + children_column_defs = make_column_def_child_copies(with_children, unique_name, level) + + child_nodes = make_path_nodes!(csm, children_column_defs, level+1) + if mix_of_node_types + without_child_idx = findfirst(identity, are_value_nodes) + without_child = matching_defs[without_child_idx] + value_column_node = ValueNode( + csm, + unnamed_id, + get_column_name(without_child), + (get_field_path(without_child)..., unnamed), + get_pool_arrays(without_child), + RawNestedIterator(csm, get_default_value(without_child)) + ) + push!(child_nodes, value_column_node) + end + + return PathNode(csm, unique_name, child_nodes) +end """Create a graph of field_paths that models the structure of the nested data""" -make_path_graph(column_defs::Vector{ColumnDefinition}) = PathNode(:TOP_LEVEL, make_path_nodes!(column_defs)) -make_path_graph(::Nothing; _...) = nothing +make_path_graph(csm, column_defs) = PathNode(top_level_id, make_path_nodes!(csm, column_defs)) +make_path_graph(_, ::Nothing) = SimpleNode(unnamed_id) +end diff --git a/src/Utils.jl b/src/Utils.jl index eb47ab0..a43d67d 100644 --- a/src/Utils.jl +++ b/src/Utils.jl @@ -1,3 +1,7 @@ +is_NameValueContainer(t) = typeof(StructTypes.StructType(t)) <: NameValueContainer +is_container(t) = typeof(StructTypes.StructType(t)) <: Container +is_value_type(t::Type) = !is_container(t) && isconcretetype(t) + """Check if the eltype of a T are all value types (i.e. 
not containers)""" all_eltypes_are_values(::Type{T}) where T = all_is_value_type(eltype(T)) function all_is_value_type(::Type{T}) where T @@ -33,3 +37,19 @@ end """Link a list of keys into an underscore separted column name""" join_names(names, joiner="_") = names .|> string |> (s -> join(s, joiner)) |> Symbol +function safe_peel(itr) + try + return Iterators.peel(itr) + catch e + if e isa BoundsError + return nothing + end + throw(e) + end +end + +"""Collect an iterator into a tuple""" +collect_tuple(itr) = _collect_tuple(safe_peel(itr)) +_collect_tuple(peel_return) = _collect_tuple(peel_return...) +_collect_tuple(::Nothing) = () +_collect_tuple(val, rest::Iterators.Rest) = (val, collect_tuple(rest)...) diff --git a/test/Manifest.toml b/test/Manifest.toml deleted file mode 100644 index 4b408ce..0000000 --- a/test/Manifest.toml +++ /dev/null @@ -1,171 +0,0 @@ -# This file is machine-generated - editing it directly is not advised - -julia_version = "1.8.2" -manifest_format = "2.0" -project_hash = "c8f78ae9bfbaaa2c5622d1b2fc3e0bae29da3646" - -[[deps.Adapt]] -deps = ["LinearAlgebra"] -git-tree-sha1 = "195c5505521008abea5aee4f96930717958eac6f" -uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" -version = "3.4.0" - -[[deps.Artifacts]] -uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" - -[[deps.Base64]] -uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" - -[[deps.CompilerSupportLibraries_jll]] -deps = ["Artifacts", "Libdl"] -uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae" -version = "0.5.2+0" - -[[deps.DataAPI]] -git-tree-sha1 = "e08915633fcb3ea83bf9d6126292e5bc5c739922" -uuid = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a" -version = "1.13.0" - -[[deps.DataValueInterfaces]] -git-tree-sha1 = "bfc1187b79289637fa0ef6d4436ebdfe6905cbd6" -uuid = "e2d170a0-9d28-54be-80f0-106bbe20a464" -version = "1.0.0" - -[[deps.Dates]] -deps = ["Printf"] -uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" - -[[deps.Dictionaries]] -deps = ["Indexing", "Random", "Serialization"] -git-tree-sha1 = 
"e82c3c97b5b4ec111f3c1b55228cebc7510525a2" -uuid = "85a47980-9c8c-11e8-2b9f-f7ca1fa99fb4" -version = "0.3.25" - -[[deps.Future]] -deps = ["Random"] -uuid = "9fa8497b-333b-5362-9e8d-4d0656e87820" - -[[deps.Indexing]] -git-tree-sha1 = "ce1566720fd6b19ff3411404d4b977acd4814f9f" -uuid = "313cdc1a-70c2-5d6a-ae34-0150d3930a38" -version = "1.1.1" - -[[deps.InteractiveUtils]] -deps = ["Markdown"] -uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" - -[[deps.IteratorInterfaceExtensions]] -git-tree-sha1 = "a3f24677c21f5bbe9d2a714f95dcd58337fb2856" -uuid = "82899510-4779-5014-852e-03e436cf321d" -version = "1.0.0" - -[[deps.JSON3]] -deps = ["Dates", "Mmap", "Parsers", "SnoopPrecompile", "StructTypes", "UUIDs"] -git-tree-sha1 = "84b10656a41ef564c39d2d477d7236966d2b5683" -uuid = "0f8b85d8-7281-11e9-16c2-39a750bddbf1" -version = "1.12.0" - -[[deps.Libdl]] -uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" - -[[deps.LinearAlgebra]] -deps = ["Libdl", "libblastrampoline_jll"] -uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" - -[[deps.Logging]] -uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" - -[[deps.Markdown]] -deps = ["Base64"] -uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" - -[[deps.Mmap]] -uuid = "a63ad114-7e13-5084-954f-fe012c677804" - -[[deps.OpenBLAS_jll]] -deps = ["Artifacts", "CompilerSupportLibraries_jll", "Libdl"] -uuid = "4536629a-c528-5b80-bd46-f80d51c5b363" -version = "0.3.20+0" - -[[deps.OrderedCollections]] -git-tree-sha1 = "85f8e6578bf1f9ee0d11e7bb1b1456435479d47c" -uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" -version = "1.4.1" - -[[deps.Parsers]] -deps = ["Dates", "SnoopPrecompile"] -git-tree-sha1 = "b64719e8b4504983c7fca6cc9db3ebc8acc2a4d6" -uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0" -version = "2.5.1" - -[[deps.PooledArrays]] -deps = ["DataAPI", "Future"] -git-tree-sha1 = "a6062fe4063cdafe78f4a0a81cfffb89721b30e7" -uuid = "2dfb63ee-cc39-5dd5-95bd-886bf059d720" -version = "1.4.2" - -[[deps.Printf]] -deps = ["Unicode"] -uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" - 
-[[deps.Random]] -deps = ["SHA", "Serialization"] -uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" - -[[deps.SHA]] -uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" -version = "0.7.0" - -[[deps.Serialization]] -uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" - -[[deps.SnoopPrecompile]] -git-tree-sha1 = "f604441450a3c0569830946e5b33b78c928e1a85" -uuid = "66db9d55-30c0-4569-8b51-7e840670fc0c" -version = "1.0.1" - -[[deps.SplitApplyCombine]] -deps = ["Dictionaries", "Indexing"] -git-tree-sha1 = "48f393b0231516850e39f6c756970e7ca8b77045" -uuid = "03a91e81-4c3e-53e1-a0a4-9c0c8f19dd66" -version = "1.2.2" - -[[deps.StructTypes]] -deps = ["Dates", "UUIDs"] -git-tree-sha1 = "ca4bccb03acf9faaf4137a9abc1881ed1841aa70" -uuid = "856f2bd8-1eba-4b0a-8007-ebc267875bd4" -version = "1.10.0" - -[[deps.TableTraits]] -deps = ["IteratorInterfaceExtensions"] -git-tree-sha1 = "c06b2f539df1c6efa794486abfb6ed2022561a39" -uuid = "3783bdb8-4a98-5b6b-af9a-565f29a5fe9c" -version = "1.0.1" - -[[deps.Tables]] -deps = ["DataAPI", "DataValueInterfaces", "IteratorInterfaceExtensions", "LinearAlgebra", "OrderedCollections", "TableTraits", "Test"] -git-tree-sha1 = "c79322d36826aa2f4fd8ecfa96ddb47b174ac78d" -uuid = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" -version = "1.10.0" - -[[deps.Test]] -deps = ["InteractiveUtils", "Logging", "Random", "Serialization"] -uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" - -[[deps.TypedTables]] -deps = ["Adapt", "Dictionaries", "Indexing", "SplitApplyCombine", "Tables", "Unicode"] -git-tree-sha1 = "ec72e7a68a6ffdc507b751714ff3e84e09135d9e" -uuid = "9d95f2ec-7b3d-5a63-8d20-e2491e220bb9" -version = "1.4.1" - -[[deps.UUIDs]] -deps = ["Random", "SHA"] -uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" - -[[deps.Unicode]] -uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" - -[[deps.libblastrampoline_jll]] -deps = ["Artifacts", "Libdl", "OpenBLAS_jll"] -uuid = "8e850b90-86db-534c-a0d3-1478176c7d93" -version = "5.1.1+0" diff --git a/test/Project.toml b/test/Project.toml index 8a1cbdd..4bd0b43 
100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -1,4 +1,5 @@ [deps] +DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1" PooledArrays = "2dfb63ee-cc39-5dd5-95bd-886bf059d720" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/test/runtests.jl b/test/runtests.jl index ca49816..0f9b729 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -3,8 +3,7 @@ using Test using JSON3 using ExpandNestedData using TypedTables - -EN = ExpandNestedData +using DataStructures: OrderedRobinDict fieldequal(v1, v2) = (v1==v2) isa Bool ? v1==v2 : false fieldequal(::Nothing, ::Nothing) = true @@ -21,6 +20,21 @@ function fieldsequal(o1, o2) end return true end +function fieldsequal(o1::NamedTuple, o2::NamedTuple) + for name in keys(o1) + prop1 = getindex(o1, name) + prop2 = getindex(o2, name) + + if prop1 isa NamedTuple && prop2 isa NamedTuple + return fieldsequal(prop1, prop2) + end + if !fieldequal(prop1, prop2) + println("Didn't match on $name. 
Got $prop1 and $prop2") + return false + end + end + return true +end function get_rows(t, fields, len) return [ @@ -37,156 +51,341 @@ function unordered_equal(t1, t2) Set(get_rows(t1, fields,len)) == Set(get_rows(t2, fields,len)) end -@testset "Internals" begin - iter1 = ExpandNestedData.NestedIterator([1,2]) - @test [1,2] == collect(iter1) - @test [1,2,1,2] == collect(ExpandNestedData.cycle(iter1, 2)) - @test [1,1,2,2] == collect(ExpandNestedData.repeat_each(iter1, 2)) - @test [1,2,1,2] == collect(ExpandNestedData.stack(iter1, iter1)) - col_set = ExpandNestedData.ColumnSet( - (:a,) => ExpandNestedData.NestedIterator([1,2]), - (:b,) => ExpandNestedData.NestedIterator([3,4,5,6]), - ) - @test isequal( - ExpandNestedData.cycle_columns_to_length!(col_set), - ExpandNestedData.ColumnSet( - (:a,) => ExpandNestedData.NestedIterator([1,2,1,2]), - (:b,) => ExpandNestedData.NestedIterator([3,4,5,6]), - ) - ) +function all_equal(arr) + if length(arr) == 1 + return true + elseif length(arr) == 2 + return @inbounds isequal(arr[1], arr[2]) + end - col_set = ExpandNestedData.ColumnSet( - (:a,) => ExpandNestedData.NestedIterator([1,2]), - (:b,) => ExpandNestedData.NestedIterator([3,4,5,6]), - ) - @test fieldsequal(ExpandNestedData.ColumnDefinition((:a,)), ExpandNestedData.ColumnDefinition([:a])) + matches = true + el = arr[1] + @inbounds for i in 2:length(arr) + matches = isequal(el, arr[i]) && matches + el = arr[i] + end + return matches end -# Source Data -const simple_test_body = JSON3.read(""" -{"data" : [ - {"E" : 7, "D" : 1}, - {"E" : 8, "D" : 2} -]}""") -const expected_simple_table = (data_E=[7,8], data_D=[1,2]) - -const test_body_str = """ -{ - "a" : [ - {"b" : 1, "c" : 2}, - {"b" : 2}, - {"b" : [3, 4], "c" : 1}, - {"b" : []} - ], - "d" : 4 -} -""" -const test_body = JSON3.read(test_body_str) - -struct InternalObj - b - c -end -struct MainBody - a::Vector{InternalObj} - d -end -const struct_body = JSON3.read(test_body_str, MainBody) +@testset "ExpandNestedData" begin 
-const heterogenous_level_test_body = Dict( - :data => [ - Dict(:E => 8), - 5 - ] - ) + @testset "Internals" begin + @testset "NestedIterators and ColumnSets" begin + csm = ExpandNestedData.ColumnSetManager() + iter1_2() = ExpandNestedData.RawNestedIterator(csm, [1,2]) + @test [1,2] == collect(iter1_2(), csm) + @test [1,2,1,2] == collect(ExpandNestedData.cycle(iter1_2(), 2), csm) + @test [1,1,2,2] == collect(ExpandNestedData.repeat_each(iter1_2(), 2), csm) + ex_vcat = ExpandNestedData.NestedIterators.NestedVcat(csm) + @test [1,2,1,2] == collect(ex_vcat(iter1_2(), iter1_2()), csm) + col_set = ExpandNestedData.ColumnSet( + ExpandNestedData.NameID(2) => ExpandNestedData.RawNestedIterator(csm, [3,4,5,6]), + ExpandNestedData.NameID(1) => ExpandNestedData.RawNestedIterator(csm, [1,2]), + ) + @test collect(keys(col_set)) == [ExpandNestedData.NameID(1),ExpandNestedData.NameID(2)] + col_set2 = ExpandNestedData.ColumnSet( + ExpandNestedData.NameID(1) => ExpandNestedData.RawNestedIterator(csm, [1,2,1,2]), + ExpandNestedData.NameID(2) => ExpandNestedData.RawNestedIterator(csm, [3,4,5,6]), + ) + @test isequal(ExpandNestedData.cycle_columns_to_length!(col_set), col_set2, csm) + + # popping columns + @test ExpandNestedData.get_first_key(col_set) == ExpandNestedData.NameID(1) + default_col = pop!(col_set, ExpandNestedData.NameID(3), ExpandNestedData.RawNestedIterator(csm, [1])) + @test isequal(default_col, ExpandNestedData.RawNestedIterator(csm, [1,1,1,1]), csm) + popped_col = pop!(col_set, ExpandNestedData.NameID(2), ExpandNestedData.RawNestedIterator(csm, [1])) + @test collect(popped_col, csm) == [3,4,5,6] + @test collect(keys(col_set)) == [ExpandNestedData.NameID(1)] + + # column length + @test ExpandNestedData.get_total_length([col_set, col_set2]) == 8 + @test ExpandNestedData.column_length(ExpandNestedData.repeat_each_column!(col_set, 2)) == 8 + + # column set manager + csm = ExpandNestedData.ColumnSetManager() + cs = ExpandNestedData.get_column_set(csm) + @test isequal(cs, 
ExpandNestedData.ColumnSet(), csm) + ExpandNestedData.free_column_set!(csm, cs) + @test !isempty(csm.column_sets) + cs = ExpandNestedData.get_column_set(csm) + @test isempty(csm.column_sets) + + cs[ExpandNestedData.NameID(3)] = ExpandNestedData.RawNestedIterator() + cs[ExpandNestedData.NameID(1)] = ExpandNestedData.RawNestedIterator() + @test collect(keys(cs)) == [ExpandNestedData.NameID(1),ExpandNestedData.NameID(3)] + + name = :test_name + id = ExpandNestedData.get_id(csm, name) + @test id == ExpandNestedData.NameID(2) + @test id == ExpandNestedData.get_id(csm, name) + @test name == ExpandNestedData.get_name(csm, id) + field_path = (name,) + id_path = (id,) + id_for_path = ExpandNestedData.get_id(csm, id_path) + @test id_for_path == ExpandNestedData.get_id_for_path(csm, field_path) + + # NameLists + top = ExpandNestedData.NameList() + l = ExpandNestedData.NameList(top, id) + id_for_tuple_from_list = ExpandNestedData.get_id(csm, l) + @test id_for_tuple_from_list == id_for_path + @test ExpandNestedData.ColumnSetManagers.reconstruct_field_path(csm, id_for_tuple_from_list) == field_path + + # Rebuild ColumnSet + raw_cs = ExpandNestedData.ColumnSet(id_for_path => ExpandNestedData.RawNestedIterator(csm, [1])) + finalized_col = ExpandNestedData.NestedIterator(csm, ExpandNestedData.RawNestedIterator(csm, [1])) + @test OrderedRobinDict((name,) => finalized_col) == ExpandNestedData.build_final_column_set(csm, raw_cs) + end + + @testset "ColumnDefinitions and PathGraph" begin + @test fieldsequal(ColumnDefinition((:a,)), ColumnDefinition([:a])) + coldef = ColumnDefinition((:a,:b), Dict(); pool_arrays=false, name_join_pattern = "^") + @test coldef == ColumnDefinition((:a,:b), Symbol("a^b"), missing, false) + @test ExpandNestedData.current_path_name(coldef, 2) == :b + @test collect(ExpandNestedData.make_column_def_child_copies([coldef], :a, 1)) == [coldef] + + csm = ExpandNestedData.ColumnSetManager() + simple = ExpandNestedData.SimpleNode(csm, :a) + value = 
ExpandNestedData.ValueNode(csm, :a, :a, (:a,), false, ExpandNestedData.RawNestedIterator(csm, [1])) + path_n = ExpandNestedData.PathNode(csm, :a, ExpandNestedData.Node[value]) + @test all_equal(ExpandNestedData.get_name.((simple,value,path_n))) + for (f,result) in (( + ExpandNestedData.PathGraph.get_final_name, ExpandNestedData.NameID(2)), + (ExpandNestedData.PathGraph.get_field_path,ExpandNestedData.NameID(4)), + (ExpandNestedData.PathGraph.get_pool_arrays,false)) + @test_throws ErrorException f(simple) + @test_throws ErrorException f(path_n) + @test f(value) == result + end + + @test ExpandNestedData.get_all_value_nodes(path_n) == [value] + @test isequal(ExpandNestedData.get_default(value), ExpandNestedData.RawNestedIterator(csm, [1]),csm) + end + + @testset "Utils" begin + @test ExpandNestedData.all_eltypes_are_values(Vector{Union{Int64, String, Float64}}) + @test !ExpandNestedData.all_eltypes_are_values(Vector{Union{Int64, String, AbstractFloat}}) + @test !ExpandNestedData.all_eltypes_are_values(Vector{Union{Dict, String}}) + d = Dict(:a => 1, :b => 2) + @test ExpandNestedData.get_names(d) == keys(d) + struct _T_ + a + end + + @test collect(ExpandNestedData.get_names(_T_(1))) == collect(fieldnames(_T_)) + @test ExpandNestedData.get_value(d, :a, 3) == 1 + @test ExpandNestedData.get_value(d, :c, 3) == 3 + @test ExpandNestedData.get_value(_T_(1), :a, 3) == 1 + @test ExpandNestedData.join_names((:a,1,"hi"), ".") == Symbol("a.1.hi") + end -@testset "Unguided Expand" begin - actual_simple_table = EN.expand(simple_test_body) - @test unordered_equal(actual_simple_table, expected_simple_table) - @test eltype(actual_simple_table.data_D) == Int64 - - # Expanding Arrays - actual_expanded_table = EN.expand(test_body) - @test begin - expected_table_expanded = ( - a_b=[1,2,3,4,missing], - a_c=[2,missing,1,1, missing], - d=[4,4,4,4,4]) - unordered_equal(actual_expanded_table, expected_table_expanded) + @testset "Core" begin + csm = ExpandNestedData.ColumnSetManager() + 
name_list = ExpandNestedData.NameList() + node = ExpandNestedData.SimpleNode(ExpandNestedData.NameID(0)) + col_num = 5 + dict_step = ExpandNestedData.DictStep(name_list, Dict(), node) + array_step = ExpandNestedData.ArrayStep(name_list, [], node) + leaf_step = ExpandNestedData.LeafStep(name_list, 1) + default_step = ExpandNestedData.DefaultStep(name_list) + merge_step = ExpandNestedData.MergeStep(col_num) + stack_step = ExpandNestedData.StackStep(col_num) + col_step = ExpandNestedData.NewColumnSetStep(ExpandNestedData.get_column_set(csm)) + + # test get_name + for s in (dict_step, array_step, leaf_step, default_step) + @test ExpandNestedData.get_name(s) == name_list + end + for s in (merge_step, stack_step, col_step) + @test_throws ErrorException ExpandNestedData.get_name(s) + end + + # test get_data + for (s,expected) in ((dict_step, Dict()),(array_step,[]),(leaf_step,1)) + @test ExpandNestedData.get_data(s) == expected + end + for s in (default_step, merge_step, stack_step, col_step) + @test_throws ErrorException ExpandNestedData.get_data(s) + end + + # test get_column_number + for s in (merge_step, stack_step) + @test ExpandNestedData.get_column_number(s) == col_num + end + for s in (default_step, dict_step,array_step,leaf_step, col_step) + @test_throws ErrorException ExpandNestedData.get_column_number(s) + end + + # test get_path_node + for s in (dict_step,array_step) + @test ExpandNestedData.get_path_node(s) == node + end + for s in (default_step, leaf_step, col_step, merge_step, stack_step) + @test_throws ErrorException ExpandNestedData.get_path_node(s) + end + + # test get_column_set + for s in (col_step,) + @test isequal(ExpandNestedData.get_column_set(s), ExpandNestedData.ColumnSet(),csm) + end + for s in (dict_step,array_step, default_step, leaf_step, merge_step, stack_step) + @test_throws ErrorException ExpandNestedData.get_column_set(s) + end + + @test isequal( + ExpandNestedData.get_column_set(ExpandNestedData.empty_column_set_step(csm)), + 
ExpandNestedData.ColumnSet(), + csm) + + @test begin + column_defs = [ + ExpandNestedData.ColumnDefinition((:data,)), + ExpandNestedData.ColumnDefinition((:data, :E)) + ] + path_graph = ExpandNestedData.PathGraph.make_path_graph(csm, column_defs) + actual_col_set = ExpandNestedData.make_missing_column_set(csm, path_graph) + expected_col_set = ExpandNestedData.ColumnSet( + ExpandNestedData.get_id_for_path(csm, (:data, ExpandNestedData.unnamed)) => ExpandNestedData.RawNestedIterator(csm, [missing]), + ExpandNestedData.get_id_for_path(csm, (:data, :E)) => ExpandNestedData.RawNestedIterator(csm, [missing]) + ) + isequal(actual_col_set, expected_col_set,csm) + end + end end - # Using struct of struct as input - @test begin - expected_table_expanded = ( - new_column=[1,2,3,4,nothing], - a_c=[2,nothing,1,1, nothing], - d=[4,4,4,4,4]) - unordered_equal( - EN.expand(struct_body; default_value=nothing, column_names= Dict((:a, :b) => :new_column)), - expected_table_expanded) + @testset "DataStructure Internals" begin + d = OrderedRobinDict(:a => 1, :b => missing) + k = d.keys + @test k isa Vector{Symbol} + @test k[2] == :b + d[:b] = 5 + @test (d[:b]) == 5 end - @test (typeof(EN.expand(struct_body; pool_arrays=true, lazy_columns=false).d) == - typeof(PooledArray(Int64[]))) - - @test fieldsequal( - EN.expand(struct_body; column_style=:nested) |> rows |> last, - (a=(b=1,c=2), d=4) - ) - - @test unordered_equal(EN.expand(heterogenous_level_test_body), (data = [5], data_E = [8])) - - empty_dict_field = Dict( - :a => Dict(), - :b => 5 - ) - @test unordered_equal(EN.expand(empty_dict_field), (b = [5],)) - - @test begin - two_layer_deep = Dict( - :a => Dict( - :b => Dict( - :c => 1, - :d => 2, - ) + + + # Source Data + simple_test_body = JSON3.read(""" + {"data" : [ + {"E" : 7, "D" : 1}, + {"E" : 8, "D" : 2} + ]}""") + expected_simple_table = (data_E=[7,8], data_D=[1,2]) + + test_body_str = """ + { + "a" : [ + {"b" : 1, "c" : 2}, + {"b" : 2}, + {"b" : [3, 4], "c" : 1}, + {"b" : []} + 
], + "d" : 4 + } + """ + test_body = JSON3.read(test_body_str) + + struct InternalObj + b + c + end + struct MainBody + a::Vector{InternalObj} + d + end + struct_body = JSON3.read(test_body_str, MainBody) + + heterogenous_level_test_body = Dict( + :data => [ + Dict(:E => 8), + 5 + ] ) + + @testset "Unguided Expand" begin + actual_simple_table = ExpandNestedData.expand(simple_test_body) + @test unordered_equal(actual_simple_table, expected_simple_table) + @test eltype(actual_simple_table.data_D) == Int64 + + # Expanding Arrays + @test begin + actual_expanded_table = ExpandNestedData.expand(test_body) + expected_table_expanded = ( + a_b=[1,2,3,4,missing], + a_c=[2,missing,1,1, missing], + d=[4,4,4,4,4]) + unordered_equal(actual_expanded_table, expected_table_expanded) + end + + # Using struct of struct as input + @test begin + expected_table_expanded = ( + new_column=[1,2,3,4,nothing], + a_c=[2,nothing,1,1, nothing], + d=[4,4,4,4,4]) + unordered_equal( + ExpandNestedData.expand(struct_body; default_value=nothing, column_names= Dict((:a, :b) => :new_column)), + expected_table_expanded) + end + @test (typeof(ExpandNestedData.expand(struct_body; pool_arrays=true, lazy_columns=false).d) == + typeof(PooledArray(Int64[]))) + + @test fieldsequal((ExpandNestedData.expand(struct_body; column_style=:nested) |> rows |> last), (a=(b=1,c=2), d=4)) + + @test unordered_equal(ExpandNestedData.expand(heterogenous_level_test_body), (data = [5], data_E = [8])) + + empty_dict_field = Dict( + :a => Dict(), + :b => 5 ) - unordered_equal(EN.expand(two_layer_deep), (a_b_c = [1], a_b_d = [2])) + @test unordered_equal(ExpandNestedData.expand(empty_dict_field), (b = [5],)) + + @test begin + two_layer_deep = Dict( + :a => Dict( + :b => Dict( + :c => 1, + :d => 2, + ) + ) + ) + unordered_equal(ExpandNestedData.expand(two_layer_deep), (a_b_c = [1], a_b_d = [2])) + end end -end -@testset "Configured Expand" begin - columns_defs = [ - EN.ColumnDefinition((:d,)), - EN.ColumnDefinition((:a, :b)), - 
EN.ColumnDefinition((:a, :c); name_join_pattern = "?_#"), - EN.ColumnDefinition((:e, :f); default_value="Missing branch") + @testset "Configured Expand" begin + columns_defs = [ + ExpandNestedData.ColumnDefinition((:d,)), + ExpandNestedData.ColumnDefinition((:a, :b)), + ExpandNestedData.ColumnDefinition((:a, :c); name_join_pattern = "?_#"), + ExpandNestedData.ColumnDefinition((:e, :f); default_value="Missing branch") + ] + expected_table = NamedTuple((:d=>[4,4,4,4,4], :a_b=>[1,2,3,4, missing], Symbol("a?_#c")=>[2,missing,1,1, missing], + :e_f => repeat(["Missing branch"], 5)) + ) + @test unordered_equal(ExpandNestedData.expand(test_body, columns_defs), expected_table) + @test fieldsequal( + ExpandNestedData.expand(test_body, columns_defs; column_style=:nested) |> rows |> last, + (d=4, a=(b = 1, c = 2), e = (f="Missing branch",)) + ) + columns_defs = [ + ExpandNestedData.ColumnDefinition((:data,)), + ExpandNestedData.ColumnDefinition((:data, :E)) ] - expected_table = NamedTuple((:d=>[4,4,4,4,4], :a_b=>[1,2,3,4, missing], Symbol("a?_#c")=>[2,missing,1,1, missing], - :e_f => repeat(["Missing branch"], 5)) - ) - @test unordered_equal(EN.expand(test_body, columns_defs), expected_table) - @test fieldsequal( - EN.expand(test_body, columns_defs; column_style=:nested) |> rows |> last, - (d=4, a=(b = 1, c = 2), e = (f="Missing branch",)) - ) - columns_defs = [ - EN.ColumnDefinition((:data,)), - EN.ColumnDefinition((:data, :E)) - ] - @test unordered_equal(EN.expand(heterogenous_level_test_body, columns_defs), (data = [5], data_E = [8])) + @test unordered_equal(ExpandNestedData.expand(heterogenous_level_test_body, columns_defs), (data = [5], data_E = [8])) -end + end -@testset "superficial options" begin - # Expanding Arrays - actual_expanded_table = EN.expand(test_body; name_join_pattern = "?_#") - @test begin - expected_table_expanded = NamedTuple(( - Symbol("a?_#b")=>[1,2,3,4,missing], - Symbol("a?_#c")=>[2,missing,1,1, missing], - :d=>[4,4,4,4,4])) - 
unordered_equal(actual_expanded_table, expected_table_expanded) + @testset "superficial options" begin + # Expanding Arrays + actual_expanded_table = ExpandNestedData.expand(test_body; name_join_pattern = "?_#") + @test begin + expected_table_expanded = NamedTuple(( + Symbol("a?_#b")=>[1,2,3,4,missing], + Symbol("a?_#c")=>[2,missing,1,1, missing], + :d=>[4,4,4,4,4])) + unordered_equal(actual_expanded_table, expected_table_expanded) + end end end