Skip to content

Commit

Permalink
Fix bugs in json-encoding of documents (#713)
Browse files Browse the repository at this point in the history
* Add explicit support for bbox marshalling
* Make sure we encode bytes as base64 so they're always encodable as json
* Test this using materialize on our actual data.
  • Loading branch information
eric-anderson authored Aug 23, 2024
1 parent 88c180f commit 0a3ed08
Show file tree
Hide file tree
Showing 7 changed files with 57 additions and 4 deletions.
13 changes: 11 additions & 2 deletions lib/sycamore/sycamore/connectors/file/file_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,16 @@

class JSONEncodeWithUserDict(json.JSONEncoder):
    """JSON encoder with fallbacks for UserDict, bytes and BoundingBox values.

    The stock ``json.JSONEncoder`` rejects all three of these types; this
    subclass converts each into something JSON can represent.
    """

    def default(self, obj):
        """Return a JSON-serializable stand-in for ``obj``.

        Args:
            obj: A value the standard encoder could not serialize.

        Returns:
            * the wrapped ``data`` mapping for a ``UserDict``;
            * a base64 (ASCII) string for ``bytes``;
            * a dict with keys ``x1``, ``y1``, ``x2``, ``y2`` for a
              ``BoundingBox``.

        Raises:
            TypeError: from ``json.JSONEncoder.default`` for any other type.
        """
        if isinstance(obj, UserDict):
            return obj.data

        if isinstance(obj, bytes):
            # Arbitrary binary data (e.g. image payloads) is not necessarily
            # valid UTF-8, so decoding it directly can raise
            # UnicodeDecodeError. base64 output is always plain ASCII and
            # therefore always representable as a JSON string.
            import base64

            return base64.b64encode(obj).decode("utf-8")

        # Imported inside the method, as the original did; it is only needed
        # once the stdlib-only checks above have not matched.
        from sycamore.data.bbox import BoundingBox

        if isinstance(obj, BoundingBox):
            return {"x1": obj.x1, "y1": obj.y1, "x2": obj.x2, "y2": obj.y2}

        return json.JSONEncoder.default(self, obj)

Expand Down Expand Up @@ -100,7 +106,7 @@ def elements_to_bytes(doc: Document) -> bytes:
return out.getvalue().encode("utf-8")


def document_to_bytes(doc: Document) -> bytes:
def document_to_json_bytes(doc: Document) -> bytes:
"""
Returns a UTF-8 encoded json string of the document. Adds newline.
Beware this will try to interpret binary_representation as UTF-8.
Expand Down Expand Up @@ -173,6 +179,9 @@ class JsonWriter(Write):
files. Supports output to any Ray-supported filesystem. Typically
each source document (such as a PDF) ends up as a block. After an
explode(), there will be multiple documents in the block.
Warning: JSON writing is not reversible with JSON reading. Reading the
output back will give you a slightly different document.
"""

def __init__(
Expand Down
4 changes: 2 additions & 2 deletions lib/sycamore/sycamore/connectors/file/file_writer_ray.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from ray.data._internal.execution.interfaces import TaskContext
from urllib.parse import urlparse

from sycamore.connectors.file.file_writer import default_filename, default_doc_to_bytes, document_to_bytes
from sycamore.connectors.file.file_writer import default_filename, default_doc_to_bytes, document_to_json_bytes
from sycamore.data import Document, MetadataDocument
from sycamore.utils.time_trace import TimeTrace

Expand Down Expand Up @@ -75,5 +75,5 @@ def write_block_to_file(self, block: BlockAccessor, file: NativeFile) -> None:
if isinstance(doc, MetadataDocument):
continue
del doc.binary_representation # Doesn't make sense in JSON
binary = document_to_bytes(doc)
binary = document_to_json_bytes(doc)
file.write(binary)
11 changes: 11 additions & 0 deletions lib/sycamore/sycamore/materialize.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,17 @@ def finalize(self):

@staticmethod
def infer_fs(path: str) -> "pyarrow.FileSystem":
import re

if not re.match("^[a-z0-9]+://.", path):
# pyarrow expects URIs, accepts /dir/path, but rejects ./dir/path
# normalize everything to a URI.
p = Path(path)
if p.is_absolute():
path = p.as_uri()
else:
path = p.absolute().as_uri()

from pyarrow import fs

(fs, root) = fs.FileSystem.from_uri(path)
Expand Down
Binary file not shown.
Empty file.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import sycamore
from sycamore.materialize import MaterializeSourceMode
from sycamore.tests.config import TEST_DIR
from sycamore.connectors.file.file_writer import document_to_json_bytes

# To re-generate the input file, change the False to True, and in the root directory, run
# poetry run python lib/sycamore/sycamore/tests/unit/connectors/file/test_file_writer.py
if False:
    from sycamore.transforms.partition import ArynPartitioner

    # One-off fixture generator: partition a single PDF page (with images
    # extracted) and materialize the result into the directory that the test
    # below reads from.
    _pipeline = sycamore.init().read.binary(
        paths="./lib/sycamore/sycamore/tests/resources/data/pdfs/Ray_page11.pdf",
        binary_format="pdf",
    )
    _pipeline = _pipeline.partition(
        ArynPartitioner(extract_images=True, use_partitioning_service=False, use_cache=False)
    )
    _pipeline = _pipeline.materialize(
        path="./lib/sycamore/sycamore/tests/resources/data/materialize/json_writer",
        source_mode=MaterializeSourceMode.IF_PRESENT,
    )
    _pipeline.execute()


def test_json_bytes_with_bbox_image():
    """Check that materialized documents serialize to parseable JSON.

    Loads the pre-generated documents from the ``json_writer`` materialize
    directory and runs each through ``document_to_json_bytes``.
    """
    import json

    docs = (
        sycamore.init(exec_mode=sycamore.ExecMode.LOCAL)
        .read.materialize(path=TEST_DIR / "resources/data/materialize/json_writer")
        .take_all()
    )
    # Without this guard the loop below would pass vacuously if the fixture
    # directory were missing or empty.
    assert docs, "no documents loaded from the json_writer materialize fixture"

    # TODO: once we support writers in local mode, switch this to be
    # .write.json(tmpdir)
    # running as part of ray, it's too slow
    for d in docs:
        # Parsing the output verifies it is well-formed JSON, not merely that
        # encoding finished without raising.
        json.loads(document_to_json_bytes(d))

0 comments on commit 0a3ed08

Please sign in to comment.