add griffin lim as an alternative vocoder

2020-07-13 15:19:52 +08:00 · 2020-07-13 15:19:52 +08:00 · 7938a5f6a4
parent 282c36c2c1
commit 7938a5f6a4
2 changed files with 34 additions and 10 deletions
--- a/examples/deepvoice3/synthesize.py
+++ b/examples/deepvoice3/synthesize.py
@ -18,7 +18,7 @@ from parakeet.data import SliceDataset, DataCargo, PartialyRandomizedSimilarTime
 from parakeet.utils.io import save_parameters, load_parameters, add_yaml_config_to_args
 from parakeet.g2p import en

-from vocoder import WaveflowVocoder
+from vocoder import WaveflowVocoder, GriffinLimVocoder
 from train import create_model


@ -26,8 +26,18 @@ def main(args, config):
    model = create_model(config)
    loaded_step = load_parameters(model, checkpoint_path=args.checkpoint)
    model.eval()
-    vocoder = WaveflowVocoder()
-    vocoder.model.eval()
+    if args.vocoder == "waveflow":
+        vocoder = WaveflowVocoder()
+        vocoder.model.eval()
+    elif args.vocoder == "griffin-lim":
+        vocoder = GriffinLimVocoder(
+            sharpening_factor=config["sharpening_factor"], 
+            sample_rate=config["sample_rate"],
+            n_fft=config["n_fft"],
+            win_length=config["win_length"],
+            hop_length=config["hop_length"])
+    else:
+        raise ValueError("Other vocoders are not supported.")
    
    if not os.path.exists(args.output):
        os.makedirs(args.output)
@ -35,12 +45,12 @@ def main(args, config):
    with open(args.input, 'rt') as f:
        sentences = [line.strip() for line in f.readlines()]
    for i, sentence in enumerate(sentences):
-        wav = synthesize(config, model, vocoder, sentence, monotonic_layers)
+        wav = synthesize(args, config, model, vocoder, sentence, monotonic_layers)
        sf.write(os.path.join(args.output, "sentence{}.wav".format(i)),
                 wav, samplerate=config["sample_rate"])


-def synthesize(config, model, vocoder, sentence, monotonic_layers):
+def synthesize(args, config, model, vocoder, sentence, monotonic_layers):
    print("[synthesize] {}".format(sentence))
    text = en.text_to_sequence(sentence, p=1.0)
    text = np.expand_dims(np.array(text, dtype="int64"), 0)
@ -58,11 +68,16 @@ def synthesize(config, model, vocoder, sentence, monotonic_layers):
            force_monotonic_attention=force_monotonic_attention, 
            window=(config["backward_step"], config["forward_step"]))
        decoded, refined, attentions = outputs
-        wav = vocoder(F.transpose(decoded, (0, 2, 1)))
-        wav_np = wav.numpy()[0]
+        if args.vocoder == "griffin-lim":
+            wav_np = vocoder(refined.numpy()[0].T)
+        else:
+            wav = vocoder(F.transpose(refined, (0, 2, 1)))
+            wav_np = wav.numpy()[0]
    return wav_np


+
+
 if __name__ == "__main__":
    import argparse
    from ruamel import yaml
@ -72,6 +87,7 @@ if __name__ == "__main__":
    parser.add_argument("--output", type=str, required=True, help="path to save audio")
    parser.add_argument("--checkpoint", type=str, required=True, help="data path of the checkpoint")
    parser.add_argument("--monotonic_layers", type=str, required=True, help="monotonic decoder layer, index starts friom 1")
+    parser.add_argument("--vocoder", type=str, default="waveflow", choices=['griffin-lim', 'waveflow'], help="vocoder to use")
    args = parser.parse_args()
    with open(args.config, 'rt') as f:
        config = yaml.safe_load(f)
--- a/examples/deepvoice3/vocoder.py
+++ b/examples/deepvoice3/vocoder.py
@ -31,13 +31,21 @@ class WaveflowVocoder(object):
        return audio

 class GriffinLimVocoder(object):
-    def __init__(self, sharpening_factor=1.4, win_length=1024, hop_length=256):
+    def __init__(self, sharpening_factor=1.4, sample_rate=22050, n_fft=1024, 
+                 win_length=1024, hop_length=256):
+        self.sample_rate = sample_rate
+        self.n_fft = n_fft
        self.sharpening_factor = sharpening_factor
        self.win_length = win_length
        self.hop_length = hop_length

-    def __call__(self, spec):
-        audio = librosa.core.griffinlim(np.exp(spec * self.sharpening_factor), 
+    def __call__(self, mel):
+        spec = librosa.feature.inverse.mel_to_stft(
+            np.exp(mel),
+            sr=self.sample_rate,
+            n_fft=self.n_fft,
+            fmin=0, fmax=8000.0, power=1.0)
+        audio = librosa.core.griffinlim(spec ** self.sharpening_factor, 
            win_length=self.win_length, hop_length=self.hop_length)
        return audio