Skip to content

Commit

Permalink
Data Model for tree structured Document Representation
Browse files Browse the repository at this point in the history
  • Loading branch information
bohou-aryn committed Nov 13, 2024
1 parent 407cc20 commit 920aff1
Show file tree
Hide file tree
Showing 2 changed files with 378 additions and 0 deletions.
343 changes: 343 additions & 0 deletions lib/sycamore/sycamore/document.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,343 @@
from abc import abstractmethod, ABC
from enum import Enum


class Category(Enum):
    """Enumeration of the node categories used by the document tree.

    Members 1-11 share their names with the ``Leaf`` subclasses in this
    module; members 101 and 102 share their names with the ``Internal``
    subclasses (``Section`` and ``Group``).
    """

    # Categories named after leaf node classes.
    Caption = 1
    Footnote = 2
    Formula = 3
    ListItem = 4
    PageFooter = 5
    PageHeader = 6
    Picture = 7
    SectionHeader = 8
    Table = 9
    Text = 10
    Title = 11

    # Categories named after internal (container) node classes.
    Section = 101
    Group = 102


class Node(ABC):
    """Abstract base class for every node of the document tree.

    Deriving from ``ABC`` makes the ``@abstractmethod`` markers below
    effective: a subclass can only be instantiated once it implements both
    ``category`` and ``accept``.

    Args:
        node_id: Identifier of the node.
        metadata: Metadata attached to the node; stored as given.
    """

    def __init__(self, node_id, metadata):
        self._node_id = node_id
        self._metadata = metadata

    def node_id(self):
        """Return the identifier passed to the constructor."""
        return self._node_id

    @abstractmethod
    def category(self):
        """Return the category of this node (implemented by subclasses)."""

    def metadata(self):
        """Return the metadata passed to the constructor."""
        return self._metadata

    @abstractmethod
    def accept(self, visitor):
        """Hand this node to ``visitor`` and return the visitor's result.

        Args:
            visitor: Object exposing the ``visit_*`` method matching the
                concrete node type.
        """


class Leaf(Node):
    """Tree node without children that carries a box and a content payload.

    Args:
        node_id: Identifier forwarded to ``Node``.
        box: Value stored as the node's box (presumably its bounding box;
            it is not interpreted here).
        content: Payload of the node; stored as given.
        metadata: Metadata forwarded to ``Node``.
    """

    def __init__(self, node_id, box, content, metadata):
        super().__init__(node_id, metadata)
        self._box, self._content = box, content

    def content(self):
        """Return the payload passed to the constructor."""
        return self._content

    def box(self):
        """Return the box passed to the constructor."""
        return self._box


class Internal(Node):
    """Tree node that owns a list of child nodes.

    Args:
        node_id: Identifier forwarded to ``Node``.
        children: Child nodes; the list is stored as given (not copied).
        metadata: Metadata forwarded to ``Node``.
    """

    def __init__(self, node_id, children, metadata):
        super().__init__(node_id, metadata)
        self._children = children

    def children(self) -> list[Node]:
        """Return the child list passed to the constructor."""
        return self._children


class Caption(Leaf):
    """Leaf node for a caption.

    Construction is inherited unchanged from ``Leaf``:
    ``Caption(node_id, box, content, metadata)``.
    """

    def accept(self, visitor):
        """Dispatch to ``visitor.visit_caption`` and return its result."""
        return visitor.visit_caption(self)

    def category(self):
        """Return ``Category.Caption``."""
        return Category.Caption


class Footnote(Leaf):
    """Leaf node for a footnote.

    Args:
        node_id: Identifier of the node.
        box: Value stored as the node's box.
        content: Payload of the footnote.
        metadata: Metadata attached to the node.
    """

    def __init__(self, node_id, box, content, metadata):
        super().__init__(node_id, box, content, metadata)

    def category(self):
        """Return ``Category.Footnote``."""
        return Category.Footnote

    def accept(self, visitor):
        """Dispatch to ``visitor.visit_footnote`` and return its result."""
        return visitor.visit_footnote(self)


class Formula(Leaf):
    """Leaf node for a formula.

    Args:
        node_id: Identifier of the node.
        box: Value stored as the node's box.
        content: Payload of the formula.
        metadata: Metadata attached to the node.
    """

    def __init__(self, node_id, box, content, metadata):
        super().__init__(node_id, box, content, metadata)

    def category(self):
        """Return ``Category.Formula``."""
        return Category.Formula

    def accept(self, visitor):
        """Dispatch to ``visitor.visit_formula`` and return its result."""
        return visitor.visit_formula(self)


class ListItem(Leaf):
    """Leaf node for a single list item.

    Args:
        node_id: Identifier of the node.
        box: Value stored as the node's box.
        content: Payload of the list item.
        metadata: Metadata attached to the node.
    """

    def __init__(self, node_id, box, content, metadata):
        super().__init__(node_id, box, content, metadata)

    def category(self):
        """Return ``Category.ListItem``."""
        return Category.ListItem

    def accept(self, visitor):
        """Dispatch to ``visitor.visit_list_item`` and return its result."""
        return visitor.visit_list_item(self)


class PageFooter(Leaf):
    """Leaf node for a page footer.

    Args:
        node_id: Identifier of the node.
        box: Value stored as the node's box.
        content: Payload of the page footer.
        metadata: Metadata attached to the node.
    """

    def __init__(self, node_id, box, content, metadata):
        super().__init__(node_id, box, content, metadata)

    def category(self):
        """Return ``Category.PageFooter``."""
        return Category.PageFooter

    def accept(self, visitor):
        """Dispatch to ``visitor.visit_page_footer`` and return its result."""
        return visitor.visit_page_footer(self)


class PageHeader(Leaf):
    """Leaf node for a page header.

    Args:
        node_id: Identifier of the node.
        box: Value stored as the node's box.
        content: Payload of the page header.
        metadata: Metadata attached to the node.
    """

    def __init__(self, node_id, box, content, metadata):
        super().__init__(node_id, box, content, metadata)

    def category(self):
        """Return ``Category.PageHeader``."""
        return Category.PageHeader

    def accept(self, visitor):
        """Dispatch to ``visitor.visit_page_header`` and return its result."""
        return visitor.visit_page_header(self)


class Picture(Leaf):
    """Leaf node for a picture.

    Args:
        node_id: Identifier of the node.
        box: Value stored as the node's box.
        content: Payload of the picture.
        metadata: Metadata attached to the node.
    """

    def __init__(self, node_id, box, content, metadata):
        super().__init__(node_id, box, content, metadata)

    def category(self):
        """Return ``Category.Picture``."""
        return Category.Picture

    def accept(self, visitor):
        """Dispatch to ``visitor.visit_picture`` and return its result."""
        return visitor.visit_picture(self)


class SectionHeader(Leaf):
    """Leaf node for a section header.

    Args:
        node_id: Identifier of the node.
        box: Value stored as the node's box.
        content: Payload of the section header.
        metadata: Metadata attached to the node.
    """

    def __init__(self, node_id, box, content, metadata):
        super().__init__(node_id, box, content, metadata)

    def category(self):
        """Return ``Category.SectionHeader``."""
        return Category.SectionHeader

    def accept(self, visitor):
        """Dispatch to ``visitor.visit_section_header`` and return its result."""
        return visitor.visit_section_header(self)


class Table(Leaf):
    """Leaf node for a table.

    Args:
        node_id: Identifier of the node.
        box: Value stored as the node's box.
        content: Payload of the table. ``NaiveSemanticVisitor.visit_table``
            calls ``to_csv()`` on it.
        metadata: Metadata attached to the node.
        continued: Optional value describing the continuation of this table,
            or ``None`` (presumably the next ``Table`` fragment, mirroring
            ``Text`` - confirm with callers).
    """

    def __init__(self, node_id, box, content, metadata, continued=None):
        super().__init__(node_id, box, content, metadata)
        # Keep the argument instead of silently discarding it.
        self._continued = continued

    def category(self):
        """Return ``Category.Table``."""
        return Category.Table

    def continued(self):
        """Return the ``continued`` value passed to the constructor."""
        return self._continued

    def accept(self, visitor):
        """Dispatch to ``visitor.visit_table`` and return its result."""
        return visitor.visit_table(self)


class Text(Leaf):
    """Leaf node for a run of body text.

    Args:
        node_id: Identifier of the node.
        box: Value stored as the node's box.
        content: Payload of the text node.
        metadata: Metadata attached to the node.
        continued: Optional node that continues this text, or ``None``.
            ``NaiveSemanticVisitor.visit_text`` follows this chain by calling
            ``content()`` and ``continued()`` on each link.
    """

    def __init__(self, node_id, box, content, metadata, continued=None):
        super().__init__(node_id, box, content, metadata)
        self._continued = continued

    def category(self):
        """Return ``Category.Text``."""
        return Category.Text

    def continued(self):
        """Return the ``continued`` value passed to the constructor."""
        return self._continued

    def accept(self, visitor):
        """Dispatch to ``visitor.visit_text`` and return its result."""
        return visitor.visit_text(self)


class Title(Leaf):
    """Leaf node for a title.

    Args:
        node_id: Identifier of the node.
        box: Value stored as the node's box.
        content: Payload of the title.
        metadata: Metadata attached to the node.
    """

    def __init__(self, node_id, box, content, metadata):
        super().__init__(node_id, box, content, metadata)

    def category(self):
        """Return ``Category.Title``."""
        return Category.Title

    def accept(self, visitor):
        """Dispatch to ``visitor.visit_title`` and return its result."""
        return visitor.visit_title(self)


class Group(Internal):
    """
    Groups semantically related objects together, e.g. list items, or a
    figure and its caption.

    Construction is inherited unchanged from ``Internal``:
    ``Group(node_id, children, metadata)``.
    """

    def accept(self, visitor):
        """Dispatch to ``visitor.visit_group`` and return its result."""
        return visitor.visit_group(self)

    def category(self):
        """Return ``Category.Group``."""
        return Category.Group


class Section(Internal):
    """Internal node for a section: a list of children plus a header.

    Args:
        node_id: Identifier of the node.
        children: Child nodes of the section.
        header: Header associated with the section; stored as given.
        metadata: Metadata attached to the node.
    """

    def __init__(self, node_id, children, header, metadata):
        super().__init__(node_id, children, metadata)
        self._header = header

    def header(self):
        """Return the header passed to the constructor."""
        return self._header

    def category(self):
        """Return ``Category.Section``."""
        return Category.Section

    def accept(self, visitor):
        """Dispatch to ``visitor.visit_section`` and return its result."""
        return visitor.visit_section(self)


class Visitor(ABC):
    """Abstract visitor with one ``visit_*`` hook per concrete node class.

    Each node's ``accept`` method calls the hook matching its own type and
    returns whatever the hook returns.
    """

    @abstractmethod
    def visit_caption(self, caption: Caption):
        """Handle a ``Caption`` node."""

    @abstractmethod
    def visit_footnote(self, footnote: Footnote):
        """Handle a ``Footnote`` node."""

    @abstractmethod
    def visit_formula(self, formula: Formula):
        """Handle a ``Formula`` node."""

    @abstractmethod
    def visit_list_item(self, list_item: ListItem):
        """Handle a ``ListItem`` node."""

    @abstractmethod
    def visit_page_footer(self, page_footer: PageFooter):
        """Handle a ``PageFooter`` node."""

    @abstractmethod
    def visit_page_header(self, page_header: PageHeader):
        """Handle a ``PageHeader`` node."""

    @abstractmethod
    def visit_picture(self, picture: Picture):
        """Handle a ``Picture`` node."""

    @abstractmethod
    def visit_section_header(self, section_header: SectionHeader):
        """Handle a ``SectionHeader`` node."""

    @abstractmethod
    def visit_table(self, table: Table):
        """Handle a ``Table`` node."""

    @abstractmethod
    def visit_text(self, text: Text):
        """Handle a ``Text`` node."""

    @abstractmethod
    def visit_title(self, title: Title):
        """Handle a ``Title`` node."""

    @abstractmethod
    def visit_group(self, group: Group):
        """Handle a ``Group`` node."""

    @abstractmethod
    def visit_section(self, section: Section):
        """Handle a ``Section`` node."""


class NaiveSemanticVisitor(Visitor):
    """Visitor that renders a node as plain text in the simplest way.

    Most leaf nodes yield their content unchanged, tables are rendered via
    ``to_csv()``, continued text fragments are stitched together, and
    container nodes join the renderings of their children with a space.
    """

    def visit_caption(self, caption):
        """Return the caption's content."""
        return caption.content()

    def visit_footnote(self, footnote):
        """Return the footnote's content."""
        return footnote.content()

    def visit_formula(self, formula):
        """Return the formula's content."""
        return formula.content()

    def visit_list_item(self, list_item):
        """Return the list item's content."""
        return list_item.content()

    def visit_page_footer(self, page_footer):
        """Return the page footer's content."""
        return page_footer.content()

    def visit_page_header(self, page_header):
        """Return the page header's content."""
        return page_header.content()

    def visit_picture(self, picture):
        """Pictures have no textual rendering yet.

        Raises:
            NotImplementedError: Always. It is a subclass of ``Exception``,
                so handlers written against the previous bare ``Exception``
                keep working.
        """
        raise NotImplementedError("Picture semantic output not implemented")

    def visit_section_header(self, section_header):
        """Return the section header's content."""
        return section_header.content()

    def visit_table(self, table: Table):
        """Return the table rendered by its content's ``to_csv()`` method."""
        return table.content().to_csv()

    def visit_text(self, text: Text):
        """Return the text followed by all of its continuation fragments.

        Before each fragment is appended, trailing whitespace is stripped
        from what has been collected so far, and a single space is inserted.
        """
        contents = text.content()
        cur = text.continued()
        while cur:
            contents = contents.rstrip() + " " + cur.content()
            cur = cur.continued()

        return contents

    def visit_title(self, title: Title):
        """Return the title's content."""
        return title.content()

    def visit_group(self, group: Group):
        """Return the group's children rendered and joined with spaces."""
        return self._join_children(group)

    def visit_section(self, section: Section):
        """Return the section's children rendered and joined with spaces."""
        return self._join_children(section)

    def _join_children(self, node):
        """Naively merge a container: visit each child, join with a space."""
        return " ".join(child.accept(self) for child in node.children())


class Page:
    """
    Container for the page-level objects that live outside the structure
    tree, such as page headers, page footers and footnotes.

    Args:
        nodes: The nodes belonging to this page; the list is stored as given.
    """

    def __init__(self, nodes: list[Node]):
        self._nodes = nodes


class Document:
    """Top-level holder for a document's root, nodes, pages and metadata.

    Args:
        root: Stored as ``_root`` (presumably the root of the node tree).
        nodes: Stored as ``_nodes``.
        pages: Stored as ``_pages``.
        metadata: Stored as ``_metadata``.
    """

    def __init__(self, root, nodes, pages, metadata):
        self._root, self._nodes = root, nodes
        self._pages, self._metadata = pages, metadata

    def summary(self):
        """Placeholder; does nothing and returns ``None``."""
        return None

    def filter(self):
        """Placeholder; does nothing and returns ``None``."""
        return None

    def chunk(self):
        """Placeholder; does nothing and returns ``None``."""
        return None
35 changes: 35 additions & 0 deletions lib/sycamore/sycamore/tests/test_document.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@

from sycamore.document import Caption, NaiveSemanticVisitor, Table


def test_caption_accept():
    """A caption visited by ``NaiveSemanticVisitor`` yields its content."""
    caption = Caption(1, [1, 2, 3, 4], "Caption content", {"author": "John Doe"})

    rendered = caption.accept(NaiveSemanticVisitor())

    assert rendered == "Caption content"


def test_table_accept():
    """A table visited by ``NaiveSemanticVisitor`` yields its CSV rendering."""
    from sycamore.data import Table as TableContent, TableCell

    # (content, row, column, is_header) for a 2x2 table with one header row.
    cell_specs = [
        ("head1", 0, 0, True),
        ("head2", 0, 1, True),
        ("3", 1, 0, False),
        ("4", 1, 1, False),
    ]
    cells = [
        TableCell(content=text, rows=[row], cols=[col], is_header=header)
        for text, row, col, header in cell_specs
    ]
    table_node = Table(1, [1, 2, 3, 4], TableContent(cells), {"author": "John Doe"})

    rendered = table_node.accept(NaiveSemanticVisitor())

    assert rendered == "head1,head2\n3,4\n"

0 comments on commit 920aff1

Please sign in to comment.