Skip to content

Commit

Permalink
[DEV-11721]: create dataset with email options (#333)
Browse files: browse the repository at this point in the history
* create dataset with email options

* comment on email options

* does this trigger new harness
  • Loading branch information
jacobmanderson authored Sep 19, 2024
1 parent 4235c4f commit 71d17a9
Show file tree
Hide file tree
Showing 3 changed files with 69 additions and 8 deletions.
9 changes: 7 additions & 2 deletions indico/queries/datasets.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -5,7 +5,6 @@
from pathlib import Path
from typing import Dict, List, Optional, Union

import deprecation
import jsons
import pandas as pd

Expand All @@ -22,6 +21,7 @@
from indico.queries.storage import UploadBatched, UploadImages
from indico.types.dataset import (
Dataset,
EmailOptions,
OcrEngine,
OcrInputLanguage,
OmnipageOcrOptionsInput,
Expand Down Expand Up @@ -228,6 +228,7 @@ def __init__(
omnipage_ocr_options: OmnipageOcrOptionsInput = None,
read_api_ocr_options: ReadApiOcrOptionsInput = None,
request_interval: Union[int, float] = 5,
email_options: EmailOptions = None,
):
self.files = files
self.name = name
Expand All @@ -240,6 +241,7 @@ def __init__(
self.omnipage_ocr_options = omnipage_ocr_options
self.read_api_ocr_options = read_api_ocr_options
self.request_interval = request_interval
self.email_options = email_options
if omnipage_ocr_options is not None and read_api_ocr_options is not None:
raise IndicoInputError(
"Must supply either omnipage or readapi options but not both."
Expand Down Expand Up @@ -279,6 +281,7 @@ def requests(self):
readapi_ocr_options=self.read_api_ocr_options,
omnipage_ocr_options=self.omnipage_ocr_options,
ocr_engine=self.ocr_engine,
email_options=self.email_options,
)
yield _AddFiles(
dataset_id=self.previous.id, metadata=file_metadata, autoprocess=True
Expand Down Expand Up @@ -376,6 +379,7 @@ def __init__(
ocr_engine: OcrEngine = None,
omnipage_ocr_options: OmnipageOcrOptionsInput = None,
readapi_ocr_options: ReadApiOcrOptionsInput = None,
email_options: EmailOptions = None,
):
if not dataset_type:
dataset_type = "TEXT"
Expand All @@ -386,7 +390,8 @@ def __init__(
"ocrEngine": ocr_engine.name,
"omnipageOptions": omnipage_ocr_options,
"readapiOptions": readapi_ocr_options,
}
},
"emailOptions": email_options,
}
super().__init__(
self.query,
Expand Down
45 changes: 39 additions & 6 deletions indico/types/dataset.py
Original file line number | Diff line number | Diff line change
@@ -1,9 +1,9 @@
from enum import Enum
from typing import List
from typing import List, Optional

from indico.errors import IndicoInputError
from indico.types.base import BaseType
from indico.types.datafile import Datafile
from indico.errors import IndicoInputError


class DataColumn(BaseType):
Expand Down Expand Up @@ -57,28 +57,35 @@ class Dataset(BaseType):

def labelset_by_name(self, name: str) -> LabelSet:
    """
    Return the first labelset of this dataset whose name equals ``name``.

    Args:
        name (str): Name of the labelset to look up.

    Returns:
        The first entry of ``self.labelsets`` whose ``name`` matches.

    Raises:
        IndicoInputError: If no labelset has that name. The message lists
            the names that are available.
    """
    # Single pass: return as soon as a match is found instead of building a
    # list of names first and then scanning the labelsets a second time.
    for labelset in self.labelsets:
        if labelset.name == name:
            return labelset
    raise IndicoInputError(
        f"No labelset found for {name}. Current labelset names include {[lab.name for lab in self.labelsets]}."
    )

def datacolumn_by_name(self, name: str) -> DataColumn:
    """
    Return the first datacolumn of this dataset whose name equals ``name``.

    Args:
        name (str): Name of the datacolumn to look up.

    Returns:
        The first entry of ``self.datacolumns`` whose ``name`` matches.

    Raises:
        IndicoInputError: If no datacolumn has that name. The message lists
            the names that are available.
    """
    # Single pass: return as soon as a match is found instead of building a
    # list of names first and then scanning the datacolumns a second time.
    for datacolumn in self.datacolumns:
        if datacolumn.name == name:
            return datacolumn
    raise IndicoInputError(
        f"No datacolumn found for {name}. Current datacolumn names include {[datacol.name for datacol in self.datacolumns]}."
    )


# Order in which a table is read: by row (ROW) or by column (COLUMN).
# Used as the type of the Omnipage `table_read_order` option.
class TableReadOrder(Enum):
ROW = 0
COLUMN = 1


# NOTE: the dataset-creation request sends the member *name* (it reads
# `ocr_engine.name` for the "ocrEngine" key), not the integer value.
class OcrEngine(Enum):
"""
Enum representing available OCR engines.
"""

OMNIPAGE = 0
READAPI = 1
READAPI_V2 = 2
READAPI_TABLES_V1 = 3


class OmnipageOcrOptionsInput(BaseType):
"""
Omnipage specific OCR options for dataset creation.
Expand All @@ -95,6 +102,7 @@ class OmnipageOcrOptionsInput(BaseType):
table_read_order(TableReadOrder): Read table by row or column.
"""

auto_rotate: bool
single_column: bool
upscale_images: bool
Expand All @@ -105,6 +113,7 @@ class OmnipageOcrOptionsInput(BaseType):
native_pdf: bool
table_read_order: TableReadOrder


class ReadApiOcrOptionsInput(BaseType):
"""
Read API OCR options.
Expand All @@ -115,20 +124,44 @@ class ReadApiOcrOptionsInput(BaseType):
upscale_images(bool): Scale up low resolution images.
languages(List[str]): List of languages to use.
"""

auto_rotate: bool
single_column: bool
upscale_images: bool
languages: List[str]


# An OCR input language, described by a display name and a code.
# NOTE(review): presumably `code` is the value accepted by the OCR
# `languages` option - confirm against the API.
class OcrInputLanguage(BaseType):
name: str
code: str

class OcrOptionsInput():

# Flags selecting which sections of a parsed email to include
# (header, body, attachments). Every flag is optional.
# Used as the type of `EmailOptions.include_sections`.
class IncludeSections(BaseType):
header: Optional[bool]
body: Optional[bool]
attachments: Optional[bool]


# Forwarded by dataset creation under the "emailOptions" key of the request
# config. Every field is optional; the integration test passes a plain dict
# with the same keys rather than an instance of this class.
class EmailOptions(BaseType):
"""
Email options
Args:
include_sections: Sections of the email to include after parsing (header, body, attachments)
unpack: Unpack an email and treat it as a multi-file Submission
preserve_body_whitespace: Preserve whitespace in the body of the email
"""

include_sections: Optional[IncludeSections]
unpack: Optional[bool]
preserve_body_whitespace: Optional[bool]


# Groups the OCR engine choice with the engine-specific option sets; the
# fields correspond to the "ocrEngine", "omnipageOptions" and
# "readapiOptions" keys built for the dataset-creation request.
# NOTE(review): unlike the other option types in this module, this class
# does not subclass BaseType - confirm that is intentional.
class OcrOptionsInput:
"""
Input options for OCR engine.
"""

ocr_engine: OcrEngine
omnipage_options: OmnipageOcrOptionsInput
readapi_options: ReadApiOcrOptionsInput

23 changes: 23 additions & 0 deletions tests/integration/queries/test_dataset.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -425,3 +425,26 @@ def test_bad_csv_create_dataset(indico):
assert dataset.status == "CREATING"
dataset = client.call(GetDatasetFileStatus(id=dataset.id))
assert all([f.status == "FAILED" for f in dataset.files])


@pytest.mark.ocr("readapi")
def test_create_with_email_options_readapi(indico):
    """
    Create an empty dataset using the READAPI engine with email options.

    The test makes no assertions on the result; it passes as long as the
    creation call completes without raising.
    """
    client = IndicoClient()

    # OCR settings are sent as a plain dict keyed like ReadApiOcrOptionsInput.
    ocr_settings: ReadApiOcrOptionsInput = {
        "auto_rotate": True,
        "single_column": False,
        "upscale_images": True,
        "languages": ["AUTO"],
    }

    # Keep every section of the email and unpack it.
    sections = {"header": True, "body": True, "attachments": True}
    email_settings = {"include_sections": sections, "unpack": True}

    # Timestamped name so repeated runs do not reuse a dataset name.
    request = CreateEmptyDataset(
        name=f"dataset-{int(time.time())}",
        ocr_engine=OcrEngine.READAPI,
        readapi_ocr_options=ocr_settings,
        email_options=email_settings,
    )
    dataset = client.call(request)

0 comments on commit 71d17a9

Please sign in to comment.