diff --git a/TTS/encoder/utils/prepare_voxceleb.py b/TTS/encoder/utils/prepare_voxceleb.py index b93baf9e60..5a68c3075a 100644 --- a/TTS/encoder/utils/prepare_voxceleb.py +++ b/TTS/encoder/utils/prepare_voxceleb.py @@ -19,13 +19,13 @@ # pylint: disable=too-many-locals, too-many-statements, too-many-arguments, too-many-instance-attributes """ voxceleb 1 & 2 """ +import csv import hashlib import os import subprocess import sys import zipfile -import pandas import soundfile as sf from absl import logging @@ -185,8 +185,11 @@ def convert_audio_and_make_label(input_dir, subset, output_dir, output_file): # Write to CSV file which contains four columns: # "wav_filename", "wav_length_ms", "speaker_id", "speaker_name". csv_file_path = os.path.join(output_dir, output_file) - df = pandas.DataFrame(data=files, columns=["wav_filename", "wav_length_ms", "speaker_id", "speaker_name"]) - df.to_csv(csv_file_path, index=False, sep="\t") + with open(csv_file_path, "w", newline="", encoding="utf-8") as f: + writer = csv.writer(f, delimiter="\t") + writer.writerow(["wav_filename", "wav_length_ms", "speaker_id", "speaker_name"]) + for wav_file in files: + writer.writerow(wav_file) logging.info("Successfully generated csv file {}".format(csv_file_path)) diff --git a/TTS/tts/datasets/formatters.py b/TTS/tts/datasets/formatters.py index fbf6881f04..57a38dd8df 100644 --- a/TTS/tts/datasets/formatters.py +++ b/TTS/tts/datasets/formatters.py @@ -1,3 +1,4 @@ +import csv import os import re import xml.etree.ElementTree as ET @@ -5,7 +6,6 @@ from pathlib import Path from typing import List -import pandas as pd from tqdm import tqdm ######################## @@ -25,25 +25,27 @@ def cml_tts(root_path, meta_file, ignored_speakers=None): if len(line.split("|")) != num_cols: print(f" > Missing column in line {idx + 1} -> {line.strip()}") # load metadata - metadata = pd.read_csv(os.path.join(root_path, meta_file), sep="|") - assert all(x in metadata.columns for x in ["wav_filename", "transcript"]) - client_id = None if "client_id" in metadata.columns else "default" - emotion_name = None if "emotion_name" in metadata.columns else "neutral" + with open(Path(root_path) / meta_file, newline="", encoding="utf-8") as f: + reader = csv.DictReader(f, delimiter="|") + metadata = list(reader) + assert all(x in metadata[0] for x in ["wav_filename", "transcript"]) + client_id = None if "client_id" in metadata[0] else "default" + emotion_name = None if "emotion_name" in metadata[0] else "neutral" items = [] not_found_counter = 0 - for row in metadata.itertuples(): - if client_id is None and ignored_speakers is not None and row.client_id in ignored_speakers: + for row in metadata: + if client_id is None and ignored_speakers is not None and row["client_id"] in ignored_speakers: continue - audio_path = os.path.join(root_path, row.wav_filename) + audio_path = os.path.join(root_path, row["wav_filename"]) if not os.path.exists(audio_path): not_found_counter += 1 continue items.append( { - "text": row.transcript, + "text": row["transcript"], "audio_file": audio_path, - "speaker_name": client_id if client_id is not None else row.client_id, - "emotion_name": emotion_name if emotion_name is not None else row.emotion_name, + "speaker_name": client_id if client_id is not None else row["client_id"], + "emotion_name": emotion_name if emotion_name is not None else row["emotion_name"], "root_path": root_path, } ) @@ -63,25 +65,27 @@ def coqui(root_path, meta_file, ignored_speakers=None): if len(line.split("|")) != num_cols: print(f" > Missing column in line {idx + 1} -> {line.strip()}") # load metadata - metadata = pd.read_csv(os.path.join(root_path, meta_file), sep="|") - assert all(x in metadata.columns for x in ["audio_file", "text"]) - speaker_name = None if "speaker_name" in metadata.columns else "coqui" - emotion_name = None if "emotion_name" in metadata.columns else "neutral" + with open(Path(root_path) / meta_file, newline="", encoding="utf-8") as f: + reader = csv.DictReader(f, delimiter="|") + metadata = list(reader) + assert all(x in metadata[0] for x in ["audio_file", "text"]) + speaker_name = None if "speaker_name" in metadata[0] else "coqui" + emotion_name = None if "emotion_name" in metadata[0] else "neutral" items = [] not_found_counter = 0 - for row in metadata.itertuples(): - if speaker_name is None and ignored_speakers is not None and row.speaker_name in ignored_speakers: + for row in metadata: + if speaker_name is None and ignored_speakers is not None and row["speaker_name"] in ignored_speakers: continue - audio_path = os.path.join(root_path, row.audio_file) + audio_path = os.path.join(root_path, row["audio_file"]) if not os.path.exists(audio_path): not_found_counter += 1 continue items.append( { - "text": row.text, + "text": row["text"], "audio_file": audio_path, - "speaker_name": speaker_name if speaker_name is not None else row.speaker_name, - "emotion_name": emotion_name if emotion_name is not None else row.emotion_name, + "speaker_name": speaker_name if speaker_name is not None else row["speaker_name"], + "emotion_name": emotion_name if emotion_name is not None else row["emotion_name"], "root_path": root_path, } ) diff --git a/requirements.notebooks.txt b/requirements.notebooks.txt index 65d3f642c9..6b7e6e8956 100644 --- a/requirements.notebooks.txt +++ b/requirements.notebooks.txt @@ -1 +1,2 @@ -bokeh==1.4.0 \ No newline at end of file +bokeh==1.4.0 +pandas>=1.4,<2.0 diff --git a/requirements.txt b/requirements.txt index 2837c36e66..49ab1e3ac4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,8 +8,7 @@ torchaudio soundfile==0.12.* librosa==0.10.* scikit-learn==1.3.0 -numba==0.55.1;python_version<"3.9" -numba==0.57.0;python_version>="3.9" +numba==0.57.0 inflect==5.6.* tqdm==4.64.* anyascii==0.3.* @@ -23,7 +22,6 @@ flask==2.* pysbd==0.3.4 # deps for notebooks umap-learn==0.5.* -pandas>=1.4,<2.0 # deps for training matplotlib==3.7.* # coqui stack