-
Notifications
You must be signed in to change notification settings - Fork 49
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Data Model for tree structured Document Representation #1022
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||
---|---|---|---|---|
@@ -0,0 +1,390 @@ | ||||
from abc import abstractmethod, ABC | ||||
from enum import Enum | ||||
|
||||
|
||||
class Category(Enum): | ||||
Caption = 1 | ||||
Footnote = 2 | ||||
Formula = 3 | ||||
ListItem = 4 | ||||
PageFooter = 5 | ||||
PageHeader = 6 | ||||
Picture = 7 | ||||
SectionHeader = 8 | ||||
Table = 9 | ||||
Text = 10 | ||||
Title = 11 | ||||
Section = 101 | ||||
Group = 102 | ||||
|
||||
|
||||
class Node: | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. minor: Again we can defer this, but we should probably call this something like DocNode since we already use Node for the query plan. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. sure, or differentiate by package name. |
||||
def __init__(self, node_id, properties): | ||||
self._node_id = node_id | ||||
self._properties = properties | ||||
|
||||
def node_id(self): | ||||
return self._node_id | ||||
|
||||
@abstractmethod | ||||
def category(self): | ||||
pass | ||||
|
||||
def properties(self): | ||||
return self._properties | ||||
|
||||
@abstractmethod | ||||
def accept(self, visitor): | ||||
pass | ||||
|
||||
|
||||
class Leaf(Node): | ||||
def __init__(self, node_id, box, content, properties): | ||||
super().__init__(node_id, properties) | ||||
self._box = box | ||||
self._content = content | ||||
|
||||
def box(self): | ||||
return self._box | ||||
|
||||
def content(self): | ||||
return self._content | ||||
|
||||
|
||||
class Internal(Node): | ||||
def __init__(self, node_id, children, properties): | ||||
super().__init__(node_id, properties) | ||||
self._children = children | ||||
|
||||
def children(self) -> list[Node]: | ||||
return self._children | ||||
|
||||
|
||||
class Caption(Leaf): | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Certainly not critical, but would it make sense to create a parent class for all of the text-like leaf nodes (e.g. list items, text, etc.) I could imagine a lot of operations where you want to do something special for tables and images, but something common for all the text. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes, sure. |
||||
def __init__(self, node_id, box, content, properties): | ||||
super().__init__(node_id, box, content, properties) | ||||
|
||||
def category(self): | ||||
return Category.Caption | ||||
|
||||
def accept(self, visitor): | ||||
return visitor.visit_caption(self) | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would recommend moving the switch logic into the visitor class.
That way someone can override just the visit function if they want to handle the grouping differently rather than having to implement a whole bunch of functions. You may also want to implement more flexible visiting. For plan nodes we needed pre-traversal, post-traversal, in-place mutation, and single call for the entire tree. |
||||
|
||||
|
||||
class Footnote(Leaf): | ||||
def __init__(self, node_id, box, content, properties): | ||||
super().__init__(node_id, box, content, properties) | ||||
|
||||
def category(self): | ||||
return Category.Caption | ||||
|
||||
def accept(self, visitor): | ||||
return visitor.visit_footnote(self) | ||||
|
||||
|
||||
class Formula(Leaf): | ||||
def __init__(self, node_id, box, content, properties): | ||||
super().__init__(node_id, box, content, properties) | ||||
|
||||
def category(self): | ||||
return Category.Caption | ||||
|
||||
def accept(self, visitor): | ||||
return visitor.visit_formula(self) | ||||
|
||||
|
||||
class ListItem(Leaf): | ||||
def __init__(self, node_id, box, content, properties): | ||||
super().__init__(node_id, box, content, properties) | ||||
|
||||
def category(self): | ||||
return Category.Caption | ||||
|
||||
def accept(self, visitor): | ||||
return visitor.visit_list_item(self) | ||||
|
||||
|
||||
class PageFooter(Leaf): | ||||
def __init__(self, node_id, box, content, properties): | ||||
super().__init__(node_id, box, content, properties) | ||||
|
||||
def category(self): | ||||
return Category.Caption | ||||
|
||||
def accept(self, visitor): | ||||
return visitor.visit_page_footer(self) | ||||
|
||||
|
||||
class PageHeader(Leaf): | ||||
def __init__(self, node_id, box, content, properties): | ||||
super().__init__(node_id, box, content, properties) | ||||
|
||||
def category(self): | ||||
return Category.Caption | ||||
|
||||
def accept(self, visitor): | ||||
return visitor.visit_page_header(self) | ||||
|
||||
|
||||
class Picture(Leaf): | ||||
def __init__(self, node_id, box, content, properties): | ||||
super().__init__(node_id, box, content, properties) | ||||
|
||||
def category(self): | ||||
return Category.Caption | ||||
|
||||
def accept(self, visitor): | ||||
return visitor.visit_picture(self) | ||||
|
||||
|
||||
class SectionHeader(Leaf): | ||||
def __init__(self, node_id, box, content, properties): | ||||
super().__init__(node_id, box, content, properties) | ||||
|
||||
def category(self): | ||||
return Category.Caption | ||||
|
||||
def accept(self, visitor): | ||||
return visitor.visit_section_header(self) | ||||
|
||||
|
||||
class Table(Leaf): | ||||
def __init__(self, node_id, box, content, properties, continued=None): | ||||
super().__init__(node_id, box, content, properties) | ||||
|
||||
def category(self): | ||||
return Category.Caption | ||||
|
||||
def accept(self, visitor): | ||||
return visitor.visit_table(self) | ||||
|
||||
|
||||
class Text(Leaf): | ||||
def __init__(self, node_id, box, content, properties, continued=None): | ||||
super().__init__(node_id, box, content, properties) | ||||
self._continued = continued | ||||
|
||||
def category(self): | ||||
return Category.Caption | ||||
|
||||
def continued(self): | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What are the semantics here? Why does Text have a continued field, but not, say, tables? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is for the split objects. Table does have, it's just nothing is support there like merge across pages. |
||||
return self._continued | ||||
|
||||
def accept(self, visitor): | ||||
return visitor.visit_text(self) | ||||
|
||||
|
||||
class Title(Leaf): | ||||
def __init__(self, node_id, box, content, properties): | ||||
super().__init__(node_id, box, content, properties) | ||||
|
||||
def category(self): | ||||
return Category.Caption | ||||
|
||||
def accept(self, visitor): | ||||
return visitor.visit_title(self) | ||||
|
||||
|
||||
class Group(Internal): | ||||
""" | ||||
Group semantic related objects together, e.g. list items, figure and caption | ||||
""" | ||||
|
||||
def __init__(self, node_id, children, properties): | ||||
super().__init__(node_id, children, properties) | ||||
|
||||
def category(self): | ||||
return Category.Group | ||||
|
||||
def accept(self, visitor): | ||||
return visitor.visit_group(self) | ||||
|
||||
|
||||
class Section(Internal): | ||||
def __init__(self, node_id, children, header, properties): | ||||
super().__init__(node_id, children, properties) | ||||
self._header = header | ||||
|
||||
def category(self): | ||||
return Category.Section | ||||
|
||||
def accept(self, visitor): | ||||
return visitor.visit_section(self) | ||||
|
||||
|
||||
class Visitor(ABC): | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Just curious, what was your rationale for using a visitor like this vs a traverser like we do for plan nodes (
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So the consideration is to have an easy support of polymorphism and scalability. One thing I think a long time is how to handle split objects nicely where I end up with a linked list. If you have a better idea, let me know. I'm not aware of this nodetraverse before. I looked a little bit on the printtraverse, seems it still either depend on modifying each node for polymorphism or squeeze in a big function handling all various node types inside those before/after/visit function. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I commented on this above. The important thing about NodeTraverse is it lets you do pre and post traversal mutations of the tree. It also supported running a function once at the start of traversal of a tree. If there is no intent to mutate the tree, then the order doesn't matter, but then I would expect the visitor to not be returning things. Probably better to work this out in a design doc than code though. |
||||
@abstractmethod | ||||
def visit_caption(self, caption: Caption): | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can the visit methods mutate the tree or are they purely read only? Do they return anything? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm just assuming it could return anything considering the dynamic typing, no generic or subtying needed here at all? |
||||
pass | ||||
|
||||
@abstractmethod | ||||
def visit_footnote(self, footnote: Footnote): | ||||
pass | ||||
|
||||
@abstractmethod | ||||
def visit_formula(self, formula: Formula): | ||||
pass | ||||
|
||||
@abstractmethod | ||||
def visit_list_item(self, list_item: ListItem): | ||||
pass | ||||
|
||||
@abstractmethod | ||||
def visit_page_footer(self, page_footer: PageFooter): | ||||
pass | ||||
|
||||
@abstractmethod | ||||
def visit_page_header(self, page_header: PageHeader): | ||||
pass | ||||
|
||||
@abstractmethod | ||||
def visit_picture(self, picture: Picture): | ||||
pass | ||||
|
||||
@abstractmethod | ||||
def visit_section_header(self, section_header: SectionHeader): | ||||
pass | ||||
|
||||
@abstractmethod | ||||
def visit_table(self, table: Table): | ||||
pass | ||||
|
||||
@abstractmethod | ||||
def visit_text(self, text: Text): | ||||
pass | ||||
|
||||
@abstractmethod | ||||
def visit_title(self, title: Title): | ||||
pass | ||||
|
||||
@abstractmethod | ||||
def visit_group(self, group: Group): | ||||
pass | ||||
|
||||
@abstractmethod | ||||
def visit_section(self, section: Section): | ||||
pass | ||||
|
||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I feel like a visit_generic or something would be useful. Something like
and then the |
||||
|
||||
class NaiveSemanticVisitor(Visitor): | ||||
def visit_caption(self, caption): | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This code is exactly why I would have a single visit function, then it could just be:
This more than halves the lines of code, makes it clear which things are handled similarly, and likely does the right thing if a new node type is introduced. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hey, thanks for your feedback. I actually have different opinions on this.
|
||||
return caption.content() | ||||
|
||||
def visit_footnote(self, footnote): | ||||
return footnote.content() | ||||
|
||||
def visit_formula(self, formula): | ||||
return formula.content() | ||||
|
||||
def visit_list_item(self, list_item): | ||||
return list_item.content() | ||||
|
||||
def visit_page_footer(self, page_footer): | ||||
return page_footer.content() | ||||
|
||||
def visit_page_header(self, page_header): | ||||
return page_header.content() | ||||
|
||||
def visit_picture(self, picture): | ||||
raise Exception("Picture semantic output not implemented") | ||||
|
||||
def visit_section_header(self, section_header): | ||||
return section_header.content() | ||||
|
||||
def visit_table(self, table: Table): | ||||
content = table.content() | ||||
return content.to_csv() | ||||
|
||||
def visit_text(self, text: Text): | ||||
contents = text.content() | ||||
cur = text.continued() | ||||
while cur: | ||||
contents = contents.rstrip() + " " + cur.content() | ||||
cur = cur.continued() | ||||
|
||||
return contents | ||||
|
||||
def visit_title(self, title: Title): | ||||
return title.content() | ||||
|
||||
def visit_group(self, group: Group): | ||||
# merge content in the group naive | ||||
contents = [child.accept(self) for child in group.children()] | ||||
return " ".join(contents) | ||||
|
||||
def visit_section(self, section: Section): | ||||
# merge content in the section naive | ||||
contents = [child.accept(self) for child in section.children()] | ||||
return " ".join(contents) | ||||
|
||||
|
||||
class Page: | ||||
""" | ||||
Holds page objects not in the structure tree like page header, page footer | ||||
and footnote. | ||||
""" | ||||
|
||||
def __init__(self, nodes: list[Node]): | ||||
self._nodes = nodes | ||||
|
||||
|
||||
class Tokenizer: | ||||
@abstractmethod | ||||
def limit(self) -> int: | ||||
pass | ||||
|
||||
@abstractmethod | ||||
def count(self, semantic) -> int: | ||||
pass | ||||
|
||||
|
||||
class Document: | ||||
def __init__(self, root, nodes, pages, properties): | ||||
self._root = root | ||||
self._nodes = nodes | ||||
self._pages = pages | ||||
self._properties = properties | ||||
|
||||
def summary(self): | ||||
pass | ||||
|
||||
def filter(self): | ||||
pass | ||||
|
||||
def chunk(self, visitor, tokenizer): | ||||
""" | ||||
First pass use tokenizer to calculate token count for each node, second | ||||
round do traverse the tree and collect the node with token size smaller | ||||
than limit | ||||
@param visitor: a visitor defines how text is assembled from the tree | ||||
@param tokenizer: a tokenizer defines max token limit | ||||
""" | ||||
token_dict = {} | ||||
chunks = [] | ||||
|
||||
def post_traverse(node): | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. traversal should be an operation on a node so you don't have to reimplement it.
|
||||
if isinstance(node, Internal): | ||||
for child in node.children(): | ||||
post_traverse(child) | ||||
semantic = node.accept(visitor) | ||||
count = tokenizer.count(semantic) | ||||
token_dict[node.node_id()] = (count, semantic) | ||||
else: | ||||
semantic = node.accept(visitor) | ||||
count = tokenizer.count(semantic) | ||||
token_dict[node.node_id()] = (count, semantic) | ||||
|
||||
def pre_traverse(node): | ||||
count, semantic = token_dict[node.node_id()] | ||||
if count <= tokenizer.limit(): | ||||
chunks.append(semantic) | ||||
elif isinstance(node, Internal): | ||||
for child in node.children(): | ||||
pre_traverse(child) | ||||
else: | ||||
chunks.append(semantic) | ||||
|
||||
post_traverse(self._root) | ||||
pre_traverse(self._root) | ||||
|
||||
return chunks |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is your intention to merge this PR into Sycamore, or just to use it as a vehicle to discuss the data model?
(To be clear, I want to get this into Sycamore, just want to get a sense of whether we should discuss the nuts and bolts now defer that).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't have opinions on this. It's quite isolated from other transforms today, so should be able to put anywhere.