[Librispeech] Add 'all' config #4184

Merged

Changes from 5 commits
139 changes: 127 additions & 12 deletions datasets/librispeech_asr/librispeech_asr.py
@@ -18,6 +18,7 @@


import os

import datasets
from datasets.tasks import AutomaticSpeechRecognition


@@ -57,6 +58,7 @@ def map_to_array(batch):
_URL = "http://www.openslr.org/12"
_DL_URL = "http://www.openslr.org/resources/12/"


_DL_URLS = {
    "clean": {
        "dev": _DL_URL + "dev-clean.tar.gz",
@@ -69,6 +71,15 @@ def map_to_array(batch):
        "dev": _DL_URL + "dev-other.tar.gz",
        "train.500": _DL_URL + "train-other-500.tar.gz",
    },
    "all": {
        "dev.clean": _DL_URL + "dev-clean.tar.gz",
        "dev.other": _DL_URL + "dev-other.tar.gz",
        "test.clean": _DL_URL + "test-clean.tar.gz",
        "test.other": _DL_URL + "test-other.tar.gz",
        "train.clean.100": _DL_URL + "train-clean-100.tar.gz",
        "train.clean.360": _DL_URL + "train-clean-360.tar.gz",
        "train.other.500": _DL_URL + "train-other-500.tar.gz",
    },
}
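As an aside (not part of the diff): `dl_manager.download` accepts a nested dict like this and returns the same structure with each URL replaced by a local path, so the keys above ("dev.clean", "train.other.500", ...) are also the keys used to look up the downloaded archives. A minimal sketch:

    # Illustrative sketch, not code from the PR:
    archive_path = dl_manager.download(_DL_URLS["all"])
    archive_path["dev.clean"]        # -> cached local path of dev-clean.tar.gz
    archive_path["train.other.500"]  # -> cached local path of train-other-500.tar.gz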


@@ -91,9 +102,11 @@ class LibrispeechASR(datasets.GeneratorBasedBuilder):
"""Librispeech dataset."""

DEFAULT_WRITER_BATCH_SIZE = 256
DEFAULT_CONFIG_NAME = "all"
BUILDER_CONFIGS = [
LibrispeechASRConfig(name="clean", description="'Clean' speech."),
LibrispeechASRConfig(name="other", description="'Other', more challenging, speech."),
LibrispeechASRConfig(name="all", description="Combined clean and other dataset."),
]
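Since "all" becomes the default config, users get the combined corpus without naming a config. A quick usage sketch (split names taken from the generators below; streaming avoids downloading the full archives up front):

    from datasets import load_dataset

    # "all" is now the default, so these two calls are equivalent:
    ds = load_dataset("librispeech_asr", "all", streaming=True)
    ds = load_dataset("librispeech_asr", streaming=True)

    # Splits exposed by the "all" config:
    #   train.clean.100, train.clean.360, train.other.500,
    #   validation.clean, validation.other, test.clean, test.other
    sample = next(iter(ds["train.clean.100"]))
    print(sample["text"])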

    def _info(self):
@@ -117,33 +130,132 @@ def _info(self):

    def _split_generators(self, dl_manager):
        archive_path = dl_manager.download(_DL_URLS[self.config.name])
        # (Optional) In non-streaming mode, we can extract the archive locally to have actual local audio files:
        local_extracted_archive = dl_manager.extract(archive_path) if not dl_manager.is_streaming else {}

        if self.config.name == "clean":
            train_splits = [
                datasets.SplitGenerator(
                    name="train.100",
                    gen_kwargs={
                        "local_extracted_archive": local_extracted_archive.get("train.100"),
                        "files": dl_manager.iter_archive(archive_path["train.100"]),
                    },
                ),
                datasets.SplitGenerator(
                    name="train.360",
                    gen_kwargs={
                        "local_extracted_archive": local_extracted_archive.get("train.360"),
                        "files": dl_manager.iter_archive(archive_path["train.360"]),
                    },
                ),
            ]
            dev_splits = [
                datasets.SplitGenerator(
                    name=datasets.Split.VALIDATION,
                    gen_kwargs={
                        "local_extracted_archive": local_extracted_archive.get("dev"),
                        "files": dl_manager.iter_archive(archive_path["dev"]),
                    },
                )
            ]
            test_splits = [
                datasets.SplitGenerator(
                    name=datasets.Split.TEST,
                    gen_kwargs={
                        "local_extracted_archive": local_extracted_archive.get("test"),
                        "files": dl_manager.iter_archive(archive_path["test"]),
                    },
                )
            ]
        elif self.config.name == "other":
            train_splits = [
                datasets.SplitGenerator(
                    name="train.500",
                    gen_kwargs={
                        "local_extracted_archive": local_extracted_archive.get("train.500"),
                        "files": dl_manager.iter_archive(archive_path["train.500"]),
                    },
                )
            ]
            dev_splits = [
                datasets.SplitGenerator(
                    name=datasets.Split.VALIDATION,
                    gen_kwargs={
                        "local_extracted_archive": local_extracted_archive.get("dev"),
                        "files": dl_manager.iter_archive(archive_path["dev"]),
                    },
                )
            ]
            test_splits = [
                datasets.SplitGenerator(
                    name=datasets.Split.TEST,
                    gen_kwargs={
                        "local_extracted_archive": local_extracted_archive.get("test"),
                        "files": dl_manager.iter_archive(archive_path["test"]),
                    },
                )
            ]
        elif self.config.name == "all":
            train_splits = [
                datasets.SplitGenerator(
                    name="train.clean.100",
                    gen_kwargs={
                        "local_extracted_archive": local_extracted_archive.get("train.clean.100"),
                        "files": dl_manager.iter_archive(archive_path["train.clean.100"]),
                    },
                ),
                datasets.SplitGenerator(
                    name="train.clean.360",
                    gen_kwargs={
                        "local_extracted_archive": local_extracted_archive.get("train.clean.360"),
                        "files": dl_manager.iter_archive(archive_path["train.clean.360"]),
                    },
                ),
                datasets.SplitGenerator(
                    name="train.other.500",
                    gen_kwargs={
                        "local_extracted_archive": local_extracted_archive.get("train.other.500"),
                        "files": dl_manager.iter_archive(archive_path["train.other.500"]),
                    },
                ),
            ]
            dev_splits = [
                datasets.SplitGenerator(
                    # Split names use "validation.*", but the lookup keys must follow
                    # _DL_URLS["all"], i.e. "dev.clean"/"dev.other".
                    name="validation.clean",
                    gen_kwargs={
                        "local_extracted_archive": local_extracted_archive.get("dev.clean"),
                        "files": dl_manager.iter_archive(archive_path["dev.clean"]),
                    },
                ),
                datasets.SplitGenerator(
                    name="validation.other",
                    gen_kwargs={
                        "local_extracted_archive": local_extracted_archive.get("dev.other"),
                        "files": dl_manager.iter_archive(archive_path["dev.other"]),
                    },
                ),
            ]
            test_splits = [
                datasets.SplitGenerator(
                    name="test.clean",
                    gen_kwargs={
                        "local_extracted_archive": local_extracted_archive.get("test.clean"),
                        "files": dl_manager.iter_archive(archive_path["test.clean"]),
                    },
                ),
                datasets.SplitGenerator(
                    name="test.other",
                    gen_kwargs={
                        "local_extracted_archive": local_extracted_archive.get("test.other"),
                        "files": dl_manager.iter_archive(archive_path["test.other"]),
                    },
                ),
            ]

        return train_splits + dev_splits + test_splits
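The generators above rely on two `dl_manager` behaviors worth spelling out. A minimal sketch, assuming we are inside `_split_generators` (not code from the diff): `iter_archive` walks a tarball without extracting it, and `extract` only runs in non-streaming mode.

    # Sketch: what _generate_examples receives via gen_kwargs.
    files = dl_manager.iter_archive(archive_path["dev.clean"])
    for path_in_archive, f in files:          # (name, file-like) pairs, in archive order
        if path_in_archive.endswith(".flac"):
            flac_bytes = f.read()             # works the same way when streaming
    # local_extracted_archive.get("dev.clean") is a local directory in
    # non-streaming mode (dl_manager.extract ran) and None when streaming,
    # which is why _generate_examples falls back to path=None below.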

    def _generate_examples(self, files, local_extracted_archive):
        """Generate examples from a LibriSpeech archive_path."""
        key = 0
        audio_data = {}

@@ -159,6 +271,9 @@ def _generate_examples(self, files):
                    id_, transcript = line.split(" ", 1)
                    audio_file = f"{id_}.flac"
                    speaker_id, chapter_id = [int(el) for el in id_.split("-")[:2]]
                    # Resolve to a real local path in non-streaming mode; when streaming,
                    # no archive was extracted, so the path stays None and only the bytes are used.
                    audio_file = (
                        os.path.join(local_extracted_archive, audio_file) if local_extracted_archive else None
                    )
                    transcripts.append(
                        {
                            "id": id_,

Review thread on the audio_file assignment above:

Contributor Author: Needed to have extracted files in non-streaming mode.

Contributor Author: @lhoestq is the whole speech data now stored twice in non-streaming mode, once as the original flac and once as an array of bytes? Should we change line 266 from audio_data[id_] = f.read() to just audio_data[id_] = f, and change the line audio = {"path": transcript["file"], "bytes": audio_data[transcript["id"]]}? At the moment it looks to me like both the bytes and the file are saved in non-streaming mode, which would create a huge memory overhead, no?

Contributor Author: Opening a new PR for this to change it internally in src/datasets ...

Member: Will be fixed in #4187 :)
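To make the concern concrete, a small sketch (mine, not code from the PR) of the two writes the reviewer is pointing at, using names from this script; #4187 tracks the internal fix:

    # Sketch of the pattern under discussion.
    # 1) While iterating the archive, every flac is read eagerly into memory:
    audio_data[id_] = f.read()  # raw bytes kept in a dict
    # 2) Later, each example is emitted with BOTH the bytes and the local path:
    audio = {"path": transcript["file"], "bytes": audio_data[transcript["id"]]}
    # In non-streaming mode "path" points at the extracted flac on disk, so the
    # same audio would exist twice (on disk and as bytes in the Arrow payload)
    # unless the library drops one of the two internally.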