-
Notifications
You must be signed in to change notification settings - Fork 2.7k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[Librispeech] Add 'all' config #4184
Changes from 5 commits
9bfbc14
fe68645
189ee52
2f11a71
627c99f
31e67cb
3944a3b
5f699fa
2adb2d7
bce8221
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -18,6 +18,7 @@ | |
|
||
|
||
import datasets | ||
import os | ||
from datasets.tasks import AutomaticSpeechRecognition | ||
|
||
|
||
|
@@ -57,6 +58,7 @@ def map_to_array(batch): | |
_URL = "http://www.openslr.org/12" | ||
_DL_URL = "http://www.openslr.org/resources/12/" | ||
|
||
|
||
_DL_URLS = { | ||
"clean": { | ||
"dev": _DL_URL + "dev-clean.tar.gz", | ||
|
@@ -69,6 +71,15 @@ def map_to_array(batch): | |
"dev": _DL_URL + "dev-other.tar.gz", | ||
"train.500": _DL_URL + "train-other-500.tar.gz", | ||
}, | ||
"all": { | ||
"dev.clean": _DL_URL + "dev-clean.tar.gz", | ||
"dev.other": _DL_URL + "dev-other.tar.gz", | ||
"test.clean": _DL_URL + "test-clean.tar.gz", | ||
"test.other": _DL_URL + "test-other.tar.gz", | ||
"train.clean.100": _DL_URL + "train-clean-100.tar.gz", | ||
"train.clean.360": _DL_URL + "train-clean-360.tar.gz", | ||
"train.other.500": _DL_URL + "train-other-500.tar.gz", | ||
}, | ||
} | ||
|
||
|
||
|
@@ -91,9 +102,11 @@ class LibrispeechASR(datasets.GeneratorBasedBuilder): | |
"""Librispeech dataset.""" | ||
|
||
DEFAULT_WRITER_BATCH_SIZE = 256 | ||
DEFAULT_CONFIG_NAME = "all" | ||
BUILDER_CONFIGS = [ | ||
LibrispeechASRConfig(name="clean", description="'Clean' speech."), | ||
LibrispeechASRConfig(name="other", description="'Other', more challenging, speech."), | ||
LibrispeechASRConfig(name="all", description="Combined clean and other dataset."), | ||
] | ||
|
||
def _info(self): | ||
|
@@ -117,33 +130,132 @@ def _info(self): | |
|
||
def _split_generators(self, dl_manager): | ||
archive_path = dl_manager.download(_DL_URLS[self.config.name]) | ||
# (Optional) In non-streaming mode, we can extract the archive locally to have actual local audio files: | ||
local_extracted_archive = dl_manager.extract(archive_path) if not dl_manager.is_streaming else {} | ||
|
||
if self.config.name == "clean": | ||
train_splits = [ | ||
datasets.SplitGenerator( | ||
name="train.100", gen_kwargs={"files": dl_manager.iter_archive(archive_path["train.100"])} | ||
name="train.100", | ||
gen_kwargs={ | ||
"local_extracted_archive": local_extracted_archive.get("train.100"), | ||
"files": dl_manager.iter_archive(archive_path["train.100"]), | ||
}, | ||
), | ||
datasets.SplitGenerator( | ||
name="train.360", gen_kwargs={"files": dl_manager.iter_archive(archive_path["train.360"])} | ||
name="train.360", | ||
gen_kwargs={ | ||
"local_extracted_archive": local_extracted_archive.get("train.360"), | ||
"files": dl_manager.iter_archive(archive_path["train.360"]), | ||
}, | ||
), | ||
] | ||
dev_splits = [ | ||
datasets.SplitGenerator( | ||
name=datasets.Split.VALIDATION, | ||
gen_kwargs={ | ||
"local_extracted_archive": local_extracted_archive.get("dev"), | ||
"files": dl_manager.iter_archive(archive_path["dev"]), | ||
}, | ||
) | ||
] | ||
test_splits = [ | ||
datasets.SplitGenerator( | ||
name=datasets.Split.TEST, | ||
gen_kwargs={ | ||
"local_extracted_archive": local_extracted_archive.get("test"), | ||
"files": dl_manager.iter_archive(archive_path["test"]), | ||
}, | ||
) | ||
] | ||
elif self.config.name == "other": | ||
train_splits = [ | ||
datasets.SplitGenerator( | ||
name="train.500", gen_kwargs={"files": dl_manager.iter_archive(archive_path["train.500"])} | ||
name="train.500", | ||
gen_kwargs={ | ||
"local_extracted_archive": local_extracted_archive.get("train.500"), | ||
"files": dl_manager.iter_archive(archive_path["train.500"]), | ||
}, | ||
) | ||
] | ||
dev_splits = [ | ||
datasets.SplitGenerator( | ||
name=datasets.Split.VALIDATION, | ||
gen_kwargs={ | ||
"local_extracted_archive": local_extracted_archive.get("dev"), | ||
"files": dl_manager.iter_archive(archive_path["dev"]), | ||
}, | ||
) | ||
] | ||
test_splits = [ | ||
datasets.SplitGenerator( | ||
name=datasets.Split.TEST, | ||
gen_kwargs={ | ||
"local_extracted_archive": local_extracted_archive.get("test"), | ||
"files": dl_manager.iter_archive(archive_path["test"]), | ||
}, | ||
) | ||
] | ||
elif self.config.name == "all": | ||
train_splits = [ | ||
datasets.SplitGenerator( | ||
name="train.clean.100", | ||
gen_kwargs={ | ||
"local_extracted_archive": local_extracted_archive.get("train.clean.100"), | ||
"files": dl_manager.iter_archive(archive_path["train.clean.100"]), | ||
}, | ||
), | ||
datasets.SplitGenerator( | ||
name="train.clean.360", | ||
gen_kwargs={ | ||
"local_extracted_archive": local_extracted_archive.get("train.clean.360"), | ||
"files": dl_manager.iter_archive(archive_path["train.clean.360"]), | ||
}, | ||
), | ||
datasets.SplitGenerator( | ||
name="train.other.500", | ||
gen_kwargs={ | ||
"local_extracted_archive": local_extracted_archive.get("train.other.500"), | ||
"files": dl_manager.iter_archive(archive_path["train.other.500"]), | ||
}, | ||
), | ||
] | ||
dev_splits = [ | ||
datasets.SplitGenerator( | ||
name="validation.clean", | ||
gen_kwargs={ | ||
"local_extracted_archive": local_extracted_archive.get("validation.clean"), | ||
"files": dl_manager.iter_archive(archive_path["dev.clean"]), | ||
}, | ||
), | ||
datasets.SplitGenerator( | ||
name="validation.other", | ||
gen_kwargs={ | ||
"local_extracted_archive": local_extracted_archive.get("validation.other"), | ||
"files": dl_manager.iter_archive(archive_path["dev.other"]), | ||
}, | ||
), | ||
] | ||
test_splits = [ | ||
datasets.SplitGenerator( | ||
name="test.clean", | ||
gen_kwargs={ | ||
"local_extracted_archive": local_extracted_archive.get("test.clean"), | ||
"files": dl_manager.iter_archive(archive_path["test.clean"]), | ||
}, | ||
), | ||
datasets.SplitGenerator( | ||
name="test.other", | ||
gen_kwargs={ | ||
"local_extracted_archive": local_extracted_archive.get("test.other"), | ||
"files": dl_manager.iter_archive(archive_path["test.other"]), | ||
}, | ||
), | ||
] | ||
|
||
return train_splits + [ | ||
datasets.SplitGenerator( | ||
name=datasets.Split.VALIDATION, gen_kwargs={"files": dl_manager.iter_archive(archive_path["dev"])} | ||
), | ||
datasets.SplitGenerator( | ||
name=datasets.Split.TEST, gen_kwargs={"files": dl_manager.iter_archive(archive_path["test"])} | ||
), | ||
] | ||
return train_splits + dev_splits + test_splits | ||
|
||
def _generate_examples(self, files): | ||
def _generate_examples(self, files, local_extracted_archive): | ||
"""Generate examples from a LibriSpeech archive_path.""" | ||
key = 0 | ||
audio_data = {} | ||
|
@@ -159,6 +271,9 @@ def _generate_examples(self, files): | |
id_, transcript = line.split(" ", 1) | ||
audio_file = f"{id_}.flac" | ||
speaker_id, chapter_id = [int(el) for el in id_.split("-")[:2]] | ||
audio_file = ( | ||
os.path.join(local_extracted_archive, audio_file) if local_extracted_archive else None | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @lhoestq is the whole speech data now stored twice in non-streaming mode? Once as the original flac and once as an array of bytes? Should we change line 266: from There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Opening a new PR for this to change it internally in There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Will be fixed in #4187 :) |
||
) | ||
transcripts.append( | ||
{ | ||
"id": id_, | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment.
The reason will be displayed to describe this comment to others. Learn more.
Needed to have extracted files in non-streaming mode