Add Griffin-Lim as an alternative vocoder
This commit is contained in:
parent
282c36c2c1
commit
7938a5f6a4
|
@ -18,7 +18,7 @@ from parakeet.data import SliceDataset, DataCargo, PartialyRandomizedSimilarTime
|
||||||
from parakeet.utils.io import save_parameters, load_parameters, add_yaml_config_to_args
|
from parakeet.utils.io import save_parameters, load_parameters, add_yaml_config_to_args
|
||||||
from parakeet.g2p import en
|
from parakeet.g2p import en
|
||||||
|
|
||||||
from vocoder import WaveflowVocoder
|
from vocoder import WaveflowVocoder, GriffinLimVocoder
|
||||||
from train import create_model
|
from train import create_model
|
||||||
|
|
||||||
|
|
||||||
|
@ -26,8 +26,18 @@ def main(args, config):
|
||||||
model = create_model(config)
|
model = create_model(config)
|
||||||
loaded_step = load_parameters(model, checkpoint_path=args.checkpoint)
|
loaded_step = load_parameters(model, checkpoint_path=args.checkpoint)
|
||||||
model.eval()
|
model.eval()
|
||||||
vocoder = WaveflowVocoder()
|
if args.vocoder == "waveflow":
|
||||||
vocoder.model.eval()
|
vocoder = WaveflowVocoder()
|
||||||
|
vocoder.model.eval()
|
||||||
|
elif args.vocoder == "griffin-lim":
|
||||||
|
vocoder = GriffinLimVocoder(
|
||||||
|
sharpening_factor=config["sharpening_factor"],
|
||||||
|
sample_rate=config["sample_rate"],
|
||||||
|
n_fft=config["n_fft"],
|
||||||
|
win_length=config["win_length"],
|
||||||
|
hop_length=config["hop_length"])
|
||||||
|
else:
|
||||||
|
raise ValueError("Other vocoders are not supported.")
|
||||||
|
|
||||||
if not os.path.exists(args.output):
|
if not os.path.exists(args.output):
|
||||||
os.makedirs(args.output)
|
os.makedirs(args.output)
|
||||||
|
@ -35,12 +45,12 @@ def main(args, config):
|
||||||
with open(args.input, 'rt') as f:
|
with open(args.input, 'rt') as f:
|
||||||
sentences = [line.strip() for line in f.readlines()]
|
sentences = [line.strip() for line in f.readlines()]
|
||||||
for i, sentence in enumerate(sentences):
|
for i, sentence in enumerate(sentences):
|
||||||
wav = synthesize(config, model, vocoder, sentence, monotonic_layers)
|
wav = synthesize(args, config, model, vocoder, sentence, monotonic_layers)
|
||||||
sf.write(os.path.join(args.output, "sentence{}.wav".format(i)),
|
sf.write(os.path.join(args.output, "sentence{}.wav".format(i)),
|
||||||
wav, samplerate=config["sample_rate"])
|
wav, samplerate=config["sample_rate"])
|
||||||
|
|
||||||
|
|
||||||
def synthesize(config, model, vocoder, sentence, monotonic_layers):
|
def synthesize(args, config, model, vocoder, sentence, monotonic_layers):
|
||||||
print("[synthesize] {}".format(sentence))
|
print("[synthesize] {}".format(sentence))
|
||||||
text = en.text_to_sequence(sentence, p=1.0)
|
text = en.text_to_sequence(sentence, p=1.0)
|
||||||
text = np.expand_dims(np.array(text, dtype="int64"), 0)
|
text = np.expand_dims(np.array(text, dtype="int64"), 0)
|
||||||
|
@ -58,11 +68,16 @@ def synthesize(config, model, vocoder, sentence, monotonic_layers):
|
||||||
force_monotonic_attention=force_monotonic_attention,
|
force_monotonic_attention=force_monotonic_attention,
|
||||||
window=(config["backward_step"], config["forward_step"]))
|
window=(config["backward_step"], config["forward_step"]))
|
||||||
decoded, refined, attentions = outputs
|
decoded, refined, attentions = outputs
|
||||||
wav = vocoder(F.transpose(decoded, (0, 2, 1)))
|
if args.vocoder == "griffin-lim":
|
||||||
wav_np = wav.numpy()[0]
|
wav_np = vocoder(refined.numpy()[0].T)
|
||||||
|
else:
|
||||||
|
wav = vocoder(F.transpose(refined, (0, 2, 1)))
|
||||||
|
wav_np = wav.numpy()[0]
|
||||||
return wav_np
|
return wav_np
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
import argparse
|
import argparse
|
||||||
from ruamel import yaml
|
from ruamel import yaml
|
||||||
|
@ -72,6 +87,7 @@ if __name__ == "__main__":
|
||||||
parser.add_argument("--output", type=str, required=True, help="path to save audio")
|
parser.add_argument("--output", type=str, required=True, help="path to save audio")
|
||||||
parser.add_argument("--checkpoint", type=str, required=True, help="data path of the checkpoint")
|
parser.add_argument("--checkpoint", type=str, required=True, help="data path of the checkpoint")
|
||||||
parser.add_argument("--monotonic_layers", type=str, required=True, help="monotonic decoder layer, index starts friom 1")
|
parser.add_argument("--monotonic_layers", type=str, required=True, help="monotonic decoder layer, index starts friom 1")
|
||||||
|
parser.add_argument("--vocoder", type=str, default="waveflow", choices=['griffin-lim', 'waveflow'], help="vocoder to use")
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
with open(args.config, 'rt') as f:
|
with open(args.config, 'rt') as f:
|
||||||
config = yaml.safe_load(f)
|
config = yaml.safe_load(f)
|
||||||
|
|
|
@ -31,13 +31,21 @@ class WaveflowVocoder(object):
|
||||||
return audio
|
return audio
|
||||||
|
|
||||||
class GriffinLimVocoder(object):
    """Vocoder that reconstructs audio from a log-mel spectrogram.

    The mel spectrogram is first mapped back to a linear-frequency magnitude
    spectrogram with ``librosa.feature.inverse.mel_to_stft`` and the phase is
    then estimated with the Griffin-Lim algorithm
    (``librosa.core.griffinlim``).

    Args:
        sharpening_factor: exponent applied to the linear magnitude
            spectrogram before phase reconstruction.
        sample_rate: sampling rate handed to the mel inversion as ``sr``.
        n_fft: FFT size handed to the mel inversion.
        win_length: window length handed to Griffin-Lim.
        hop_length: hop length handed to Griffin-Lim.
        fmin: lowest frequency of the mel filter bank used for inversion.
        fmax: highest frequency of the mel filter bank used for inversion.
            ``fmin``/``fmax`` default to the values that were previously
            hard-coded (0 and 8000.0); they should match the filter bank the
            acoustic model was trained with.
    """

    def __init__(self, sharpening_factor=1.4, sample_rate=22050, n_fft=1024,
                 win_length=1024, hop_length=256, fmin=0, fmax=8000.0):
        self.sample_rate = sample_rate
        self.n_fft = n_fft
        self.sharpening_factor = sharpening_factor
        self.win_length = win_length
        self.hop_length = hop_length
        self.fmin = fmin
        self.fmax = fmax

    def __call__(self, mel):
        """Synthesize a waveform from ``mel``.

        Args:
            mel: log-scale mel spectrogram (``np.exp`` is applied to it).
                NOTE(review): presumably shaped (n_mels, frames), as expected
                by ``mel_to_stft`` -- confirm against the caller.

        Returns:
            The audio returned by ``librosa.core.griffinlim``.
        """
        # Undo the log compression, then invert the mel filter bank.
        # power=1.0 tells librosa the input is a magnitude (not power) mel.
        spec = librosa.feature.inverse.mel_to_stft(
            np.exp(mel),
            sr=self.sample_rate,
            n_fft=self.n_fft,
            fmin=self.fmin,
            fmax=self.fmax,
            power=1.0)
        # Sharpen the linear magnitudes before estimating the phase.
        audio = librosa.core.griffinlim(
            spec ** self.sharpening_factor,
            win_length=self.win_length,
            hop_length=self.hop_length)
        return audio
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue