From 7938a5f6a47b95a9c008ec9ea8fba4b68b157a61 Mon Sep 17 00:00:00 2001
From: chenfeiyu <chenfeiyu@baidu.com>
Date: Mon, 13 Jul 2020 15:19:52 +0800
Subject: [PATCH 1/2] add griffin lim as an alternative vocoder

---
 examples/deepvoice3/synthesize.py | 30 +++++++++++++++++++++++-------
 examples/deepvoice3/vocoder.py    | 14 +++++++++++---
 2 files changed, 34 insertions(+), 10 deletions(-)

diff --git a/examples/deepvoice3/synthesize.py b/examples/deepvoice3/synthesize.py
index 1fd1d95..9f0dda0 100644
--- a/examples/deepvoice3/synthesize.py
+++ b/examples/deepvoice3/synthesize.py
@@ -18,7 +18,7 @@ from parakeet.data import SliceDataset, DataCargo, PartialyRandomizedSimilarTime
 from parakeet.utils.io import save_parameters, load_parameters, add_yaml_config_to_args
 from parakeet.g2p import en
 
-from vocoder import WaveflowVocoder
+from vocoder import WaveflowVocoder, GriffinLimVocoder
 from train import create_model
 
 
@@ -26,8 +26,18 @@ def main(args, config):
     model = create_model(config)
     loaded_step = load_parameters(model, checkpoint_path=args.checkpoint)
     model.eval()
-    vocoder = WaveflowVocoder()
-    vocoder.model.eval()
+    if args.vocoder == "waveflow":
+        vocoder = WaveflowVocoder()
+        vocoder.model.eval()
+    elif args.vocoder == "griffin-lim":
+        vocoder = GriffinLimVocoder(
+            sharpening_factor=config["sharpening_factor"], 
+            sample_rate=config["sample_rate"],
+            n_fft=config["n_fft"],
+            win_length=config["win_length"],
+            hop_length=config["hop_length"])
+    else:
+        raise ValueError("Other vocoders are not supported.")
     
     if not os.path.exists(args.output):
         os.makedirs(args.output)
@@ -35,12 +45,12 @@ def main(args, config):
     with open(args.input, 'rt') as f:
         sentences = [line.strip() for line in f.readlines()]
     for i, sentence in enumerate(sentences):
-        wav = synthesize(config, model, vocoder, sentence, monotonic_layers)
+        wav = synthesize(args, config, model, vocoder, sentence, monotonic_layers)
         sf.write(os.path.join(args.output, "sentence{}.wav".format(i)),
                  wav, samplerate=config["sample_rate"])
 
 
-def synthesize(config, model, vocoder, sentence, monotonic_layers):
+def synthesize(args, config, model, vocoder, sentence, monotonic_layers):
     print("[synthesize] {}".format(sentence))
     text = en.text_to_sequence(sentence, p=1.0)
     text = np.expand_dims(np.array(text, dtype="int64"), 0)
@@ -58,11 +68,16 @@ def synthesize(config, model, vocoder, sentence, monotonic_layers):
             force_monotonic_attention=force_monotonic_attention, 
             window=(config["backward_step"], config["forward_step"]))
         decoded, refined, attentions = outputs
-        wav = vocoder(F.transpose(decoded, (0, 2, 1)))
-        wav_np = wav.numpy()[0]
+        if args.vocoder == "griffin-lim":
+            wav_np = vocoder(refined.numpy()[0].T)
+        else:
+            wav = vocoder(F.transpose(refined, (0, 2, 1)))
+            wav_np = wav.numpy()[0]
     return wav_np
 
 
+
+
 if __name__ == "__main__":
     import argparse
     from ruamel import yaml
@@ -72,6 +87,7 @@ if __name__ == "__main__":
     parser.add_argument("--output", type=str, required=True, help="path to save audio")
     parser.add_argument("--checkpoint", type=str, required=True, help="data path of the checkpoint")
     parser.add_argument("--monotonic_layers", type=str, required=True, help="monotonic decoder layer, index starts friom 1")
+    parser.add_argument("--vocoder", type=str, default="waveflow", choices=['griffin-lim', 'waveflow'], help="vocoder to use")
     args = parser.parse_args()
     with open(args.config, 'rt') as f:
         config = yaml.safe_load(f)
diff --git a/examples/deepvoice3/vocoder.py b/examples/deepvoice3/vocoder.py
index 1471260..5568394 100644
--- a/examples/deepvoice3/vocoder.py
+++ b/examples/deepvoice3/vocoder.py
@@ -31,13 +31,21 @@ class WaveflowVocoder(object):
         return audio
 
 class GriffinLimVocoder(object):
-    def __init__(self, sharpening_factor=1.4, win_length=1024, hop_length=256):
+    def __init__(self, sharpening_factor=1.4, sample_rate=22050, n_fft=1024, 
+                 win_length=1024, hop_length=256):
+        self.sample_rate = sample_rate
+        self.n_fft = n_fft
         self.sharpening_factor = sharpening_factor
         self.win_length = win_length
         self.hop_length = hop_length
 
-    def __call__(self, spec):
-        audio = librosa.core.griffinlim(np.exp(spec * self.sharpening_factor), 
+    def __call__(self, mel):
+        spec = librosa.feature.inverse.mel_to_stft(
+            np.exp(mel),
+            sr=self.sample_rate,
+            n_fft=self.n_fft,
+            fmin=0, fmax=8000.0, power=1.0)
+        audio = librosa.core.griffinlim(spec ** self.sharpening_factor, 
             win_length=self.win_length, hop_length=self.hop_length)
         return audio
 

From 8a5f9d75b6945b11f8050d20b460ef48cd2930c8 Mon Sep 17 00:00:00 2001
From: chenfeiyu <chenfeiyu@baidu.com>
Date: Tue, 14 Jul 2020 11:29:49 +0800
Subject: [PATCH 2/2] update README and command line help msg

---
 examples/deepvoice3/README.md     | 9 +++++++--
 examples/deepvoice3/sentences.txt | 5 -----
 examples/deepvoice3/synthesize.py | 2 +-
 3 files changed, 8 insertions(+), 8 deletions(-)
 delete mode 100644 examples/deepvoice3/sentences.txt

diff --git a/examples/deepvoice3/README.md b/examples/deepvoice3/README.md
index fdf3a46..4f939e1 100644
--- a/examples/deepvoice3/README.md
+++ b/examples/deepvoice3/README.md
@@ -112,6 +112,7 @@ tensorboard --logdir=runs/ --host=$HOSTNAME --port=8000
 usage: synthesize from a checkpoint [-h] --config CONFIG --input INPUT
                                     --output OUTPUT --checkpoint CHECKPOINT
                                     --monotonic_layers MONOTONIC_LAYERS
+                                    [--vocoder {griffin-lim,waveflow}]
 
 optional arguments:
   -h, --help            show this help message and exit
@@ -121,11 +122,14 @@ optional arguments:
   --checkpoint CHECKPOINT
                         data path of the checkpoint
   --monotonic_layers MONOTONIC_LAYERS
-                        monotonic decoder layer, index starts friom 1
+                        monotonic decoder layers' indices(start from 1)
+  --vocoder {griffin-lim,waveflow}
+                        vocoder to use
 ```
 
 `synthesize.py` is used to synthesize several sentences in a text file.
 `--monotonic_layers` is the index of the decoders layer that manifest monotonic diagonal attention. You can get monotonic layers by inspecting tensorboard logs. Mind that the index starts from 1. The layers that manifest monotonic diagonal attention are stable for a model during training and synthesizing, but differ among different runs. So once you get the indices of monotonic layers by inspecting tensorboard log, you can use them at synthesizing. Note that only decoder layers that show strong diagonal attention should be considerd.
+`--vocoder` is the vocoder to use. Currently supported values are "waveflow" and "griffin-lim". The default value is "waveflow".
 
 example code:
 
@@ -135,5 +139,6 @@ CUDA_VISIBLE_DEVICES=2 python synthesize.py \
     --input sentences.txt \
     --output outputs/ \
     --checkpoint runs/Jul07_09-39-34_instance-mqcyj27y-4/step-1320000 \
-    --monotonic_layers "5,6"
+    --monotonic_layers "5,6" \
+    --vocoder waveflow
 ```
diff --git a/examples/deepvoice3/sentences.txt b/examples/deepvoice3/sentences.txt
deleted file mode 100644
index 85e3c3a..0000000
--- a/examples/deepvoice3/sentences.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition
-in being comparatively modern.
-For although the Chinese took impressions from wood blocks engraved in relief for centuries before the woodcutters of the Netherlands, by a similar process
-produced the block books, which were the immediate predecessors of the true printed book,
-the invention of movable metal letters in the middle of the fifteenth century may justly be considered as the invention of the art of printing.
diff --git a/examples/deepvoice3/synthesize.py b/examples/deepvoice3/synthesize.py
index 9f0dda0..39089b2 100644
--- a/examples/deepvoice3/synthesize.py
+++ b/examples/deepvoice3/synthesize.py
@@ -86,7 +86,7 @@ if __name__ == "__main__":
     parser.add_argument("--input", type=str, required=True, help="text file to synthesize")
     parser.add_argument("--output", type=str, required=True, help="path to save audio")
     parser.add_argument("--checkpoint", type=str, required=True, help="data path of the checkpoint")
-    parser.add_argument("--monotonic_layers", type=str, required=True, help="monotonic decoder layer, index starts friom 1")
+    parser.add_argument("--monotonic_layers", type=str, required=True, help="monotonic decoder layers' indices(start from 1)")
     parser.add_argument("--vocoder", type=str, default="waveflow", choices=['griffin-lim', 'waveflow'], help="vocoder to use")
     args = parser.parse_args()
     with open(args.config, 'rt') as f: