From 9425c779a0bd8424aba65c83bfbb4885e0a2267b Mon Sep 17 00:00:00 2001
From: chenfeiyu
Date: Mon, 16 Aug 2021 10:01:51 +0800
Subject: [PATCH] 1. use relative paths in metadata.jsonl; 2. support
 key=value format to pass extra command line arguments to modify config
 values; 3. use a path relative to config.py to locate the default config.

---
 .../speedyspeech/baker/compute_statistics.py  |  5 +++++
 examples/speedyspeech/baker/config.py         |  5 ++++-
 examples/speedyspeech/baker/frontend.py       |  7 +++++--
 examples/speedyspeech/baker/inference.py      |  2 +-
 examples/speedyspeech/baker/normalize.py      |  6 +++++-
 examples/speedyspeech/baker/preprocess.py     |  8 ++++++--
 .../baker/speedyspeech_updater.py             |  1 -
 examples/speedyspeech/baker/synthesize.py     |  2 +-
 examples/speedyspeech/baker/synthesize_e2e.py |  2 +-
 examples/speedyspeech/baker/train.py          | 20 ++++++++++++++-----
 setup.py                                      |  5 +++--
 11 files changed, 46 insertions(+), 17 deletions(-)
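
Note on change 2: train.py now collects the arguments argparse does not
recognize (via parse_known_args()) and folds them into the yacs config as
key=value overrides. A minimal sketch of that mechanism, assuming a yacs
CfgNode; the config keys below are hypothetical, not the ones from
conf/default.yaml:

    from yacs.config import CfgNode

    config = CfgNode({"batch_size": 32, "model": CfgNode({"num_layers": 4})})

    # e.g. `python train.py ... batch_size=64 model.num_layers=6` leaves
    # these strings in `rest` after parse_known_args()
    rest = ["batch_size=64", "model.num_layers=6"]
    extra = []
    for item in rest:
        extra.extend(item.split("=", maxsplit=1))  # "k=v" -> ["k", "v"]
    config.merge_from_list(extra)  # yacs casts "64"/"6" to the existing types

    assert config.batch_size == 64
    assert config.model.num_layers == 6

Note that merge_from_list raises KeyError for keys that do not already exist
in the config, so typos in override names fail loudly rather than silently.
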
diff --git a/examples/speedyspeech/baker/compute_statistics.py b/examples/speedyspeech/baker/compute_statistics.py
index e145974..3d3dd5b 100644
--- a/examples/speedyspeech/baker/compute_statistics.py
+++ b/examples/speedyspeech/baker/compute_statistics.py
@@ -89,6 +89,11 @@ def main():
 
     with jsonlines.open(args.metadata, 'r') as reader:
         metadata = list(reader)
+
+    metadata_dir = Path(args.metadata).parent
+    for item in metadata:
+        item["feats"] = str(metadata_dir / item["feats"])
+
     dataset = DataTable(
         metadata,
         fields=[args.field_name],
diff --git a/examples/speedyspeech/baker/config.py b/examples/speedyspeech/baker/config.py
index f555791..3ca3121 100644
--- a/examples/speedyspeech/baker/config.py
+++ b/examples/speedyspeech/baker/config.py
@@ -14,6 +14,9 @@
 import yaml
 from yacs.config import CfgNode as Configuration
+from pathlib import Path
+
+config_path = (Path(__file__).parent / "conf" / "default.yaml").resolve()
 
-with open("conf/default.yaml", 'rt') as f:
+with open(config_path, 'rt') as f:
     _C = yaml.safe_load(f)
 
diff --git a/examples/speedyspeech/baker/frontend.py b/examples/speedyspeech/baker/frontend.py
index 5914436..e8869dd 100644
--- a/examples/speedyspeech/baker/frontend.py
+++ b/examples/speedyspeech/baker/frontend.py
@@ -13,6 +13,8 @@
 # limitations under the License.
 
 import re
+from pathlib import Path
+
 import numpy as np
 import paddle
 import pypinyin
@@ -22,10 +24,11 @@ import phkit
 phkit.initialize()
 from parakeet.frontend.vocab import Vocab
 
-with open("phones.txt", 'rt') as f:
+file_dir = Path(__file__).parent.resolve()
+with open(file_dir / "phones.txt", 'rt') as f:
     phones = [line.strip() for line in f.readlines()]
-with open("tones.txt", 'rt') as f:
+with open(file_dir / "tones.txt", 'rt') as f:
     tones = [line.strip() for line in f.readlines()]
 voc_phones = Vocab(phones, start_symbol=None, end_symbol=None)
 voc_tones = Vocab(tones, start_symbol=None, end_symbol=None)
 
diff --git a/examples/speedyspeech/baker/inference.py b/examples/speedyspeech/baker/inference.py
index 7e202a8..3bd4384 100644
--- a/examples/speedyspeech/baker/inference.py
+++ b/examples/speedyspeech/baker/inference.py
@@ -33,7 +33,7 @@ def main():
         help="text to synthesize, a 'utt_id sentence' pair per line")
     parser.add_argument("--output-dir", type=str, help="output dir")
 
-    args = parser.parse_args()
+    args, _ = parser.parse_known_args()
 
     speedyspeech_config = inference.Config(
         str(Path(args.inference_dir) / "speedyspeech.pdmodel"),
diff --git a/examples/speedyspeech/baker/normalize.py b/examples/speedyspeech/baker/normalize.py
index daa0a91..2d1b028 100644
--- a/examples/speedyspeech/baker/normalize.py
+++ b/examples/speedyspeech/baker/normalize.py
@@ -96,6 +96,10 @@ def main():
     # get dataset
     with jsonlines.open(args.metadata, 'r') as reader:
         metadata = list(reader)
+    metadata_dir = Path(args.metadata).parent
+    for item in metadata:
+        item["feats"] = str(metadata_dir / item["feats"])
+
     dataset = DataTable(metadata, converters={'feats': np.load, })
 
     logging.info(f"The number of files = {len(dataset)}.")
@@ -136,7 +140,7 @@
             'num_phones': item['num_phones'],
             'num_frames': item['num_frames'],
             'durations': item['durations'],
-            'feats': str(mel_path),
+            'feats': str(mel_path.relative_to(dumpdir)),
         })
     output_metadata.sort(key=itemgetter('utt_id'))
     output_metadata_path = Path(args.dumpdir) / "metadata.jsonl"
diff --git a/examples/speedyspeech/baker/preprocess.py b/examples/speedyspeech/baker/preprocess.py
index 1fda34f..0295dee 100644
--- a/examples/speedyspeech/baker/preprocess.py
+++ b/examples/speedyspeech/baker/preprocess.py
@@ -181,7 +181,7 @@ def process_sentence(config: Dict[str, Any],
         "num_phones": len(phones),
         "num_frames": num_frames,
         "durations": durations_frame,
-        "feats": str(mel_path.resolve()),  # use absolute path
+        "feats": mel_path,  # Path object
     }
     return record
 
@@ -212,8 +212,12 @@ def process_sentences(config,
             results.append(ft.result())
 
     results.sort(key=itemgetter("utt_id"))
-    with jsonlines.open(output_dir / "metadata.jsonl", 'w') as writer:
+    output_dir = Path(output_dir)
+    metadata_path = output_dir / "metadata.jsonl"
+    # NOTE: use relative path to the meta jsonlines file
+    with jsonlines.open(metadata_path, 'w') as writer:
         for item in results:
+            item["feats"] = str(item["feats"].relative_to(output_dir))
             writer.write(item)
     print("Done")
 
diff --git a/examples/speedyspeech/baker/speedyspeech_updater.py b/examples/speedyspeech/baker/speedyspeech_updater.py
index bc4d0f9..bbb65d7 100644
--- a/examples/speedyspeech/baker/speedyspeech_updater.py
+++ b/examples/speedyspeech/baker/speedyspeech_updater.py
@@ -70,7 +70,6 @@ class SpeedySpeechUpdater(StandardUpdater):
 
 class SpeedySpeechEvaluator(StandardEvaluator):
     def evaluate_core(self, batch):
-        print("fire")
         decoded, predicted_durations = self.model(
             text=batch["phones"],
             tones=batch["tones"],
diff --git a/examples/speedyspeech/baker/synthesize.py b/examples/speedyspeech/baker/synthesize.py
index c4b6cb3..0cddf73 100644
--- a/examples/speedyspeech/baker/synthesize.py
+++ b/examples/speedyspeech/baker/synthesize.py
@@ -150,7 +150,7 @@ def main():
         "--device", type=str, default="gpu", help="device type to use")
     parser.add_argument("--verbose", type=int, default=1, help="verbose")
 
-    args = parser.parse_args()
+    args, _ = parser.parse_known_args()
     with open(args.speedyspeech_config) as f:
         speedyspeech_config = CfgNode(yaml.safe_load(f))
     with open(args.pwg_config) as f:
diff --git a/examples/speedyspeech/baker/synthesize_e2e.py b/examples/speedyspeech/baker/synthesize_e2e.py
index af448c8..2795640 100644
--- a/examples/speedyspeech/baker/synthesize_e2e.py
+++ b/examples/speedyspeech/baker/synthesize_e2e.py
@@ -152,7 +152,7 @@ def main():
         "--device", type=str, default="gpu", help="device type to use")
     parser.add_argument("--verbose", type=int, default=1, help="verbose")
 
-    args = parser.parse_args()
+    args, _ = parser.parse_known_args()
     with open(args.speedyspeech_config) as f:
         speedyspeech_config = CfgNode(yaml.safe_load(f))
     with open(args.pwg_config) as f:
diff --git a/examples/speedyspeech/baker/train.py b/examples/speedyspeech/baker/train.py
index d51afec..e7fd4be 100644
--- a/examples/speedyspeech/baker/train.py
+++ b/examples/speedyspeech/baker/train.py
@@ -72,6 +72,10 @@ def train_sp(args, config):
     # construct dataset for training and validation
     with jsonlines.open(args.train_metadata, 'r') as reader:
         train_metadata = list(reader)
+    metadata_dir = Path(args.train_metadata).parent
+    for item in train_metadata:
+        item["feats"] = str(metadata_dir / item["feats"])
+
     train_dataset = DataTable(
         data=train_metadata,
         fields=[
@@ -80,6 +84,9 @@
         converters={"feats": np.load, }, )
     with jsonlines.open(args.dev_metadata, 'r') as reader:
         dev_metadata = list(reader)
+    metadata_dir = Path(args.dev_metadata).parent
+    for item in dev_metadata:
+        item["feats"] = str(metadata_dir / item["feats"])
     dev_dataset = DataTable(
         data=dev_metadata,
         fields=[
@@ -113,9 +120,6 @@
         num_workers=config.num_workers)
     print("dataloaders done!")
 
-    # batch = collate_baker_examples([train_dataset[i] for i in range(10)])
-    # # batch = collate_baker_examples([dev_dataset[i] for i in range(10)])
-    # import pdb; pdb.set_trace()
     model = SpeedySpeech(**config["model"])
     if world_size > 1:
         model = DataParallel(model)  # TODO, do not use vocab size from config
@@ -141,7 +145,7 @@
 
         trainer.extend(VisualDL(writer), trigger=(1, "iteration"))
     trainer.extend(
         Snapshot(max_size=config.num_snapshots), trigger=(1, 'epoch'))
-    print(trainer.extensions)
+    # print(trainer.extensions)
     trainer.run()
 
@@ -160,12 +164,18 @@ def main():
         "--nprocs", type=int, default=1, help="number of processes")
     parser.add_argument("--verbose", type=int, default=1, help="verbose")
 
-    args = parser.parse_args()
+    args, rest = parser.parse_known_args()
     if args.device == "cpu" and args.nprocs > 1:
         raise RuntimeError("Multiprocess training on CPU is not supported.")
     config = get_cfg_default()
     if args.config:
         config.merge_from_file(args.config)
+    if rest:
+        extra = []
+        # to support key=value format
+        for item in rest:
+            extra.extend(item.split("=", maxsplit=1))
+        config.merge_from_list(extra)
 
     print("========Args========")
     print(yaml.safe_dump(vars(args)))
diff --git a/setup.py b/setup.py
index b7cb4da..b241f97 100644
--- a/setup.py
+++ b/setup.py
@@ -64,17 +64,18 @@ setup_info = dict(
         'scipy',
         'pandas',
         'sox',
-        'soundfile',
+        'soundfile~=0.10',
         'g2p_en',
         'yacs',
         'visualdl',
         'pypinyin',
         'webrtcvad',
         'g2pM',
-        'praatio',
+        'praatio~=4.1',
         "h5py",
         "timer",
         'jsonlines',
+        "phkit",
     ],
     extras_require={'doc': ["sphinx", "sphinx-rtd-theme", "numpydoc"], },
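
-- 
A minimal sketch of the relative-path round trip introduced by change 1
(illustration only; the paths below are hypothetical). preprocess.py and
normalize.py write feature paths relative to metadata.jsonl, and the readers
(compute_statistics.py, normalize.py, train.py) join them back against the
metadata file's directory, so a dump directory stays valid when moved:

    from pathlib import Path

    dump_dir = Path("dump/train")
    mel_path = dump_dir / "feats" / "utt001.npy"

    # write side: store a path relative to metadata.jsonl
    feats_entry = str(mel_path.relative_to(dump_dir))   # "feats/utt001.npy"

    # read side: resolve against the metadata file's parent directory
    metadata_dir = Path("dump/train/metadata.jsonl").parent
    full_path = str(metadata_dir / feats_entry)         # "dump/train/feats/utt001.npy"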