minor fixes to refine code.

2021-07-01 16:14:55 +08:00 · 2021-07-01 16:14:55 +08:00 · 3ebed00c96
parent afe9d4a4f1
commit 3ebed00c96
4 changed files with 23 additions and 16 deletions
--- a/examples/parallelwave_gan/baker/batch_fn.py
+++ b/examples/parallelwave_gan/baker/batch_fn.py
@ -25,7 +25,7 @@ class Clip(object):
            batch_max_steps=20480,
            hop_size=256,
            aux_context_window=0, ):
-        """Initialize customized collater for PyTorch DataLoader.
+        """Initialize customized collater for DataLoader.

        Args:
            batch_max_steps (int): The maximum length of input signal in batch.
--- a/examples/parallelwave_gan/baker/compute_statistics.py
+++ b/examples/parallelwave_gan/baker/compute_statistics.py
@ -39,7 +39,9 @@ def main():
    parser.add_argument(
        "--metadata", type=str, help="json file with id and file paths ")
    parser.add_argument(
-        "--field-name", type=str, help="json file with id and file paths ")
+        "--field-name",
+        type=str,
+        help="name of the field to compute statistics for.")
    parser.add_argument(
        "--config", type=str, help="yaml format configuration file.")
    parser.add_argument(
--- a/examples/parallelwave_gan/baker/conf/default.yaml
+++ b/examples/parallelwave_gan/baker/conf/default.yaml
@ -18,9 +18,8 @@ fmax: 7600               # Maximum frequency in mel basis calculation.
 # global_gain_scale: 1.0   # Will be multiplied to all of waveform.
 trim_silence: false      # Whether to trim the start and end of silence.
 top_db: 60 # Need to tune carefully if the recording is not good.
-trim_frame_length: 2048    # Frame size in trimming.
-trim_hop_length: 512       # Hop size in trimming.
-# format: "npy"           # Feature file format. "npy" or "hdf5" is supported.
+trim_frame_length: 2048    # Frame size in trimming (in samples).
+trim_hop_length: 512       # Hop size in trimming (in samples).

 ###########################################################
 #         GENERATOR NETWORK ARCHITECTURE SETTING          #
@ -119,11 +118,11 @@ discriminator_train_start_steps: 100000 # Number of steps to start to train disc
 train_max_steps: 400000                 # Number of training steps.
 save_interval_steps: 5000               # Interval steps to save checkpoint.
 eval_interval_steps: 1000               # Interval steps to evaluate the network.
-log_interval_steps: 100                 # Interval steps to record the training log.
+

 ###########################################################
 #                     OTHER SETTING                       #
 ###########################################################
 num_save_intermediate_results: 4  # Number of results to be saved as intermediate results.
-num_snapshots: 10
-seed: 42
+num_snapshots: 10                 # Max number of snapshots to keep while training.
+seed: 42                          # Random seed for paddle, random, and np.random.
--- a/examples/parallelwave_gan/baker/preprocess.py
+++ b/examples/parallelwave_gan/baker/preprocess.py
@ -147,8 +147,11 @@ def process_sentence(config: Dict[str, Any],

    # adjust time to make num_samples == num_frames * hop_length
    num_frames = logmel.shape[1]
-    y = np.pad(y, (0, config.n_fft), mode="reflect")
-    y = y[:num_frames * config.hop_length]
+    if y.size < num_frames * config.hop_length:
+        y = np.pad(y, (0, num_frames * config.hop_length - y.size),
+                   mode="reflect")
+    else:
+        y = y[:num_frames * config.hop_length]
    num_sample = y.shape[0]

    mel_path = output_dir / (utt_id + "_feats.npy")
@ -241,13 +244,16 @@ def main():
        list((root_dir / "PhoneLabeling").rglob("*.interval")))

    # split data into 3 sections
-    train_wav_files = wav_files[:9800]
-    dev_wav_files = wav_files[9800:9900]
-    test_wav_files = wav_files[9900:]
+    num_train = 9800
+    num_dev = 100

-    train_alignment_files = alignment_files[:9800]
-    dev_alignment_files = alignment_files[9800:9900]
-    test_alignment_files = alignment_files[9900:]
+    train_wav_files = wav_files[:num_train]
+    dev_wav_files = wav_files[num_train:num_train + num_dev]
+    test_wav_files = wav_files[num_train + num_dev:]
+
+    train_alignment_files = alignment_files[:num_train]
+    dev_alignment_files = alignment_files[num_train:num_train + num_dev]
+    test_alignment_files = alignment_files[num_train + num_dev:]

    train_dump_dir = dumpdir / "train" / "raw"
    train_dump_dir.mkdir(parents=True, exist_ok=True)