From 767cca780d6fa3022dd38c583d4d0e4108196f41 Mon Sep 17 00:00:00 2001
From: Andy Friedman
Date: Wed, 21 Aug 2024 17:09:24 -0400
Subject: [PATCH] adding function to docset that returns document info instead of printing it

---
Review notes (this section is ignored by git am):
- The line structure of this patch was restored from a copy whose line
  breaks and indentation had been collapsed. The leading whitespace of the
  context lines is reconstructed, not recovered; verify it against
  docset.py, or apply with --ignore-whitespace.
- A docstring was added to get_doc_info during review, so the post-image
  blob id on the "index" line predates this revision. Regenerate the patch
  with git format-patch after applying if exact ids matter.

 lib/sycamore/sycamore/docset.py | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/lib/sycamore/sycamore/docset.py b/lib/sycamore/sycamore/docset.py
index 30a35b1be..a77537232 100644
--- a/lib/sycamore/sycamore/docset.py
+++ b/lib/sycamore/sycamore/docset.py
@@ -122,6 +122,28 @@ def _truncate(s):
 
             pprint.pp(document, stream=stream)
 
+    def get_doc_info(self, limit: int = -1):
+        """
+        Returns documents with their elements replaced by a short summary string.
+
+        Each document obtained from ``self.take(limit)`` has its ``data["elements"]`` entry
+        overwritten with ``"<N elements>"``, where N is the number of elements it held, so
+        the returned documents no longer carry their element lists.
+
+        Args:
+            limit: Passed through to ``take``. NOTE(review): -1 presumably means "no limit";
+                confirm against ``take``.
+
+        Returns:
+            A list of the modified documents.
+        """
+        doc_info = []
+        for document in self.take(limit):
+            num_elems = len(document.elements)
+            document.data["elements"] = f"<{num_elems} elements>"
+            doc_info.append(document)
+        return doc_info
+
     def count(self, include_metadata=False, **kwargs) -> int:
         """
         Counts the number of documents in the resulting dataset.