From 9425c779a0bd8424aba65c83bfbb4885e0a2267b Mon Sep 17 00:00:00 2001
From: chenfeiyu
Date: Mon, 16 Aug 2021 10:01:51 +0800
Subject: [PATCH] 1. use relative paths in metadata.jsonl; 2. support
 key=value format to pass extra command line arguments to modify config
 values; 3. use a path relative to config.py to locate the default config.

---
 .../speedyspeech/baker/compute_statistics.py  |  5 +++++
 examples/speedyspeech/baker/config.py         |  5 ++++-
 examples/speedyspeech/baker/frontend.py       |  7 +++++--
 examples/speedyspeech/baker/inference.py      |  2 +-
 examples/speedyspeech/baker/normalize.py      |  6 +++++-
 examples/speedyspeech/baker/preprocess.py     |  8 ++++++--
 .../baker/speedyspeech_updater.py             |  1 -
 examples/speedyspeech/baker/synthesize.py     |  2 +-
 examples/speedyspeech/baker/synthesize_e2e.py |  2 +-
 examples/speedyspeech/baker/train.py          | 20 ++++++++++++++-----
 setup.py                                      |  5 +++--
 11 files changed, 46 insertions(+), 17 deletions(-)
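
Note on change 2: train.py now collects the arguments argparse does not
recognize (via parse_known_args()) and folds them into the yacs config as
key=value overrides. A minimal sketch of that mechanism, assuming a yacs
CfgNode; the config keys below are hypothetical, not the ones from
conf/default.yaml:

    from yacs.config import CfgNode

    config = CfgNode({"batch_size": 32, "model": CfgNode({"num_layers": 4})})

    # e.g. `python train.py ... batch_size=64 model.num_layers=6` leaves
    # these strings in `rest` after parse_known_args()
    rest = ["batch_size=64", "model.num_layers=6"]
    extra = []
    for item in rest:
        extra.extend(item.split("=", maxsplit=1))  # "k=v" -> ["k", "v"]
    config.merge_from_list(extra)  # yacs casts "64"/"6" to the existing types

    assert config.batch_size == 64
    assert config.model.num_layers == 6

Note that merge_from_list raises KeyError for keys that do not already exist
in the config, so typos in override names fail loudly rather than silently.
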
diff --git a/examples/speedyspeech/baker/compute_statistics.py b/examples/speedyspeech/baker/compute_statistics.py
index e145974..3d3dd5b 100644
--- a/examples/speedyspeech/baker/compute_statistics.py
+++ b/examples/speedyspeech/baker/compute_statistics.py
@@ -89,6 +89,11 @@ def main():
 
     with jsonlines.open(args.metadata, 'r') as reader:
         metadata = list(reader)
+
+    metadata_dir = Path(args.metadata).parent
+    for item in metadata:
+        item["feats"] = str(metadata_dir / item["feats"])
+
     dataset = DataTable(
         metadata,
         fields=[args.field_name],
diff --git a/examples/speedyspeech/baker/config.py b/examples/speedyspeech/baker/config.py
index f555791..3ca3121 100644
--- a/examples/speedyspeech/baker/config.py
+++ b/examples/speedyspeech/baker/config.py
@@ -14,6 +14,9 @@
 import yaml
 from yacs.config import CfgNode as Configuration
+from pathlib import Path
+
+config_path = (Path(__file__).parent / "conf" / "default.yaml").resolve()
 
-with open("conf/default.yaml", 'rt') as f:
+with open(config_path, 'rt') as f:
     _C = yaml.safe_load(f)
 
diff --git a/examples/speedyspeech/baker/frontend.py b/examples/speedyspeech/baker/frontend.py
index 5914436..e8869dd 100644
--- a/examples/speedyspeech/baker/frontend.py
+++ b/examples/speedyspeech/baker/frontend.py
@@ -13,6 +13,8 @@
 # limitations under the License.
 
 import re
+from pathlib import Path
+
 import numpy as np
 import paddle
 import pypinyin
@@ -22,10 +24,11 @@ import phkit
 phkit.initialize()
 from parakeet.frontend.vocab import Vocab
 
-with open("phones.txt", 'rt') as f:
+file_dir = Path(__file__).parent.resolve()
+with open(file_dir / "phones.txt", 'rt') as f:
     phones = [line.strip() for line in f.readlines()]
-with open("tones.txt", 'rt') as f:
+with open(file_dir / "tones.txt", 'rt') as f:
     tones = [line.strip() for line in f.readlines()]
 voc_phones = Vocab(phones, start_symbol=None, end_symbol=None)
 voc_tones = Vocab(tones, start_symbol=None, end_symbol=None)
 
diff --git a/examples/speedyspeech/baker/inference.py b/examples/speedyspeech/baker/inference.py
index 7e202a8..3bd4384 100644
--- a/examples/speedyspeech/baker/inference.py
+++ b/examples/speedyspeech/baker/inference.py
@@ -33,7 +33,7 @@ def main():
         help="text to synthesize, a 'utt_id sentence' pair per line")
     parser.add_argument("--output-dir", type=str, help="output dir")
 
-    args = parser.parse_args()
+    args, _ = parser.parse_known_args()
 
     speedyspeech_config = inference.Config(
         str(Path(args.inference_dir) / "speedyspeech.pdmodel"),
diff --git a/examples/speedyspeech/baker/normalize.py b/examples/speedyspeech/baker/normalize.py
index daa0a91..2d1b028 100644
--- a/examples/speedyspeech/baker/normalize.py
+++ b/examples/speedyspeech/baker/normalize.py
@@ -96,6 +96,10 @@ def main():
     # get dataset
     with jsonlines.open(args.metadata, 'r') as reader:
         metadata = list(reader)
+    metadata_dir = Path(args.metadata).parent
+    for item in metadata:
+        item["feats"] = str(metadata_dir / item["feats"])
+
     dataset = DataTable(metadata, converters={'feats': np.load, })
 
     logging.info(f"The number of files = {len(dataset)}.")
@@ -136,7 +140,7 @@
             'num_phones': item['num_phones'],
             'num_frames': item['num_frames'],
             'durations': item['durations'],
-            'feats': str(mel_path),
+            'feats': str(mel_path.relative_to(dumpdir)),
         })
     output_metadata.sort(key=itemgetter('utt_id'))
     output_metadata_path = Path(args.dumpdir) / "metadata.jsonl"
diff --git a/examples/speedyspeech/baker/preprocess.py b/examples/speedyspeech/baker/preprocess.py
index 1fda34f..0295dee 100644
--- a/examples/speedyspeech/baker/preprocess.py
+++ b/examples/speedyspeech/baker/preprocess.py
@@ -181,7 +181,7 @@ def process_sentence(config: Dict[str, Any],
         "num_phones": len(phones),
         "num_frames": num_frames,
         "durations": durations_frame,
-        "feats": str(mel_path.resolve()),  # use absolute path
+        "feats": mel_path,  # Path object
     }
     return record
 
@@ -212,8 +212,12 @@ def process_sentences(config,
             results.append(ft.result())
 
     results.sort(key=itemgetter("utt_id"))
-    with jsonlines.open(output_dir / "metadata.jsonl", 'w') as writer:
+    output_dir = Path(output_dir)
+    metadata_path = output_dir / "metadata.jsonl"
+    # NOTE: use relative path to the meta jsonlines file
+    with jsonlines.open(metadata_path, 'w') as writer:
         for item in results:
+            item["feats"] = str(item["feats"].relative_to(output_dir))
             writer.write(item)
     print("Done")
 
diff --git a/examples/speedyspeech/baker/speedyspeech_updater.py b/examples/speedyspeech/baker/speedyspeech_updater.py
index bc4d0f9..bbb65d7 100644
--- a/examples/speedyspeech/baker/speedyspeech_updater.py
+++ b/examples/speedyspeech/baker/speedyspeech_updater.py
@@ -70,7 +70,6 @@ class SpeedySpeechUpdater(StandardUpdater):
 
 class SpeedySpeechEvaluator(StandardEvaluator):
     def evaluate_core(self, batch):
-        print("fire")
         decoded, predicted_durations = self.model(
             text=batch["phones"],
             tones=batch["tones"],
diff --git a/examples/speedyspeech/baker/synthesize.py b/examples/speedyspeech/baker/synthesize.py
index c4b6cb3..0cddf73 100644
--- a/examples/speedyspeech/baker/synthesize.py
+++ b/examples/speedyspeech/baker/synthesize.py
@@ -150,7 +150,7 @@ def main():
         "--device", type=str, default="gpu", help="device type to use")
     parser.add_argument("--verbose", type=int, default=1, help="verbose")
 
-    args = parser.parse_args()
+    args, _ = parser.parse_known_args()
     with open(args.speedyspeech_config) as f:
         speedyspeech_config = CfgNode(yaml.safe_load(f))
     with open(args.pwg_config) as f:
diff --git a/examples/speedyspeech/baker/synthesize_e2e.py b/examples/speedyspeech/baker/synthesize_e2e.py
index af448c8..2795640 100644
--- a/examples/speedyspeech/baker/synthesize_e2e.py
+++ b/examples/speedyspeech/baker/synthesize_e2e.py
@@ -152,7 +152,7 @@ def main():
         "--device", type=str, default="gpu", help="device type to use")
     parser.add_argument("--verbose", type=int, default=1, help="verbose")
 
-    args = parser.parse_args()
+    args, _ = parser.parse_known_args()
     with open(args.speedyspeech_config) as f:
         speedyspeech_config = CfgNode(yaml.safe_load(f))
     with open(args.pwg_config) as f:
diff --git a/examples/speedyspeech/baker/train.py b/examples/speedyspeech/baker/train.py
index d51afec..e7fd4be 100644
--- a/examples/speedyspeech/baker/train.py
+++ b/examples/speedyspeech/baker/train.py
@@ -72,6 +72,10 @@ def train_sp(args, config):
     # construct dataset for training and validation
     with jsonlines.open(args.train_metadata, 'r') as reader:
         train_metadata = list(reader)
+    metadata_dir = Path(args.train_metadata).parent
+    for item in train_metadata:
+        item["feats"] = str(metadata_dir / item["feats"])
+
     train_dataset = DataTable(
         data=train_metadata,
         fields=[
@@ -80,6 +84,9 @@
         converters={"feats": np.load, }, )
     with jsonlines.open(args.dev_metadata, 'r') as reader:
         dev_metadata = list(reader)
+    metadata_dir = Path(args.dev_metadata).parent
+    for item in dev_metadata:
+        item["feats"] = str(metadata_dir / item["feats"])
     dev_dataset = DataTable(
         data=dev_metadata,
         fields=[
@@ -113,9 +120,6 @@
         num_workers=config.num_workers)
     print("dataloaders done!")
 
-    # batch = collate_baker_examples([train_dataset[i] for i in range(10)])
-    # # batch = collate_baker_examples([dev_dataset[i] for i in range(10)])
-    # import pdb; pdb.set_trace()
     model = SpeedySpeech(**config["model"])
     if world_size > 1:
         model = DataParallel(model)  # TODO, do not use vocab size from config
@@ -141,7 +145,7 @@
 
         trainer.extend(VisualDL(writer), trigger=(1, "iteration"))
     trainer.extend(
         Snapshot(max_size=config.num_snapshots), trigger=(1, 'epoch'))
-    print(trainer.extensions)
+    # print(trainer.extensions)
     trainer.run()
 
@@ -160,12 +164,18 @@ def main():
         "--nprocs", type=int, default=1, help="number of processes")
     parser.add_argument("--verbose", type=int, default=1, help="verbose")
 
-    args = parser.parse_args()
+    args, rest = parser.parse_known_args()
     if args.device == "cpu" and args.nprocs > 1:
         raise RuntimeError("Multiprocess training on CPU is not supported.")
     config = get_cfg_default()
     if args.config:
         config.merge_from_file(args.config)
+    if rest:
+        extra = []
+        # to support key=value format
+        for item in rest:
+            extra.extend(item.split("=", maxsplit=1))
+        config.merge_from_list(extra)
 
     print("========Args========")
     print(yaml.safe_dump(vars(args)))
diff --git a/setup.py b/setup.py
index b7cb4da..b241f97 100644
--- a/setup.py
+++ b/setup.py
@@ -64,17 +64,18 @@ setup_info = dict(
         'scipy',
         'pandas',
         'sox',
-        'soundfile',
+        'soundfile~=0.10',
         'g2p_en',
         'yacs',
         'visualdl',
         'pypinyin',
         'webrtcvad',
         'g2pM',
-        'praatio',
+        'praatio~=4.1',
         "h5py",
         "timer",
         'jsonlines',
+        "phkit",
     ],
     extras_require={'doc': ["sphinx", "sphinx-rtd-theme", "numpydoc"], },
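
-- 
A minimal sketch of the relative-path round trip introduced by change 1
(illustration only; the paths below are hypothetical). preprocess.py and
normalize.py write feature paths relative to metadata.jsonl, and the readers
(compute_statistics.py, normalize.py, train.py) join them back against the
metadata file's directory, so a dump directory stays valid when moved:

    from pathlib import Path

    dump_dir = Path("dump/train")
    mel_path = dump_dir / "feats" / "utt001.npy"

    # write side: store a path relative to metadata.jsonl
    feats_entry = str(mel_path.relative_to(dump_dir))   # "feats/utt001.npy"

    # read side: resolve against the metadata file's parent directory
    metadata_dir = Path("dump/train/metadata.jsonl").parent
    full_path = str(metadata_dir / feats_entry)         # "dump/train/feats/utt001.npy"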