-
Notifications
You must be signed in to change notification settings - Fork 2.7k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Local paths in common voice #3736
Changes from 4 commits
5cabd27
193130e
bb8c730
e3a59c3
fa58a9c
c325024
5288e6f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -657,7 +657,9 @@ def _download_and_prepare(self, dl_manager, verify_infos, **prepare_split_kwargs | |
""" | ||
# Generating data for all splits | ||
split_dict = SplitDict(dataset_name=self.name) | ||
split_generators_kwargs = self._make_split_generators_kwargs(prepare_split_kwargs) | ||
split_generators_kwargs = self._make_split_generators_kwargs( | ||
{"dl_manager": dl_manager, **prepare_split_kwargs} | ||
) | ||
split_generators = self._split_generators(dl_manager, **split_generators_kwargs) | ||
|
||
# Checksums verification | ||
|
@@ -727,8 +729,12 @@ def _save_infos(self): | |
|
||
def _make_split_generators_kwargs(self, prepare_split_kwargs): | ||
"""Get kwargs for `self._split_generators()` from `prepare_split_kwargs`.""" | ||
del prepare_split_kwargs | ||
return {} | ||
split_generators_kwargs = {} | ||
split_generators_arg_names = inspect.signature(self._split_generators).parameters.keys() | ||
if "streaming" in split_generators_arg_names: | ||
streaming = isinstance(prepare_split_kwargs.get("dl_manager"), StreamingDownloadManager) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
I guess you need the […]
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Indeed, having this logic inside the […] `streaming = dl_manager.is_streaming` inside the dataset script |
||
split_generators_kwargs["streaming"] = streaming | ||
return split_generators_kwargs | ||
|
||
def as_dataset( | ||
self, split: Optional[Split] = None, run_post_process=True, ignore_verifications=False, in_memory=False | ||
|
@@ -892,7 +898,8 @@ def as_streaming_dataset( | |
data_dir=self.config.data_dir, | ||
) | ||
self._check_manual_download(dl_manager) | ||
splits_generators = {sg.name: sg for sg in self._split_generators(dl_manager)} | ||
split_generators_kwargs = self._make_split_generators_kwargs({"dl_manager": dl_manager}) | ||
splits_generators = {sg.name: sg for sg in self._split_generators(dl_manager, **split_generators_kwargs)} | ||
# By default, return all splits | ||
if split is None: | ||
splits_generator = splits_generators | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
small nit - I'd even pass the `streaming` flag here to make it super clear that they are two different modes, and maybe have both a `_generate_examples_streaming(...)` and a `_generate_examples_non_streaming(...)`