Parakeet/examples/deepvoice3/data.py

import numpy as np
import os
import csv
import pandas as pd

import paddle
from paddle import fluid
from paddle.fluid import dygraph as dg
from paddle.fluid.dataloader import Dataset, BatchSampler
from paddle.fluid.io import DataLoader

from parakeet.data import DatasetMixin, DataCargo, PartialyRandomizedSimilarTimeLengthSampler
from parakeet.g2p import en

class LJSpeech(DatasetMixin):
    """Dataset over a preprocessed LJSpeech directory.

    The directory ``root`` must contain a header-less, ``|``-separated
    ``metadata.csv`` whose four columns are read as ``num_frames``,
    ``spec_name``, ``mel_name`` and ``text``.  ``spec_name`` and ``mel_name``
    are paths, relative to ``root``, of arrays loadable with ``np.load``.
    """

    def __init__(self, root):
        """Read ``metadata.csv`` under ``root`` into an in-memory table."""
        column_names = ["num_frames", "spec_name", "mel_name", "text"]
        column_types = {
            "num_frames": np.int64,
            "spec_name": str,
            "mel_name": str,
            "text": str,
        }
        metadata_path = os.path.join(root, "metadata.csv")

        self._root = root
        # QUOTE_NONE: quote characters inside the transcript are kept as
        # literal text instead of being interpreted as field delimiters.
        self._table = pd.read_csv(
            metadata_path,
            sep="|",
            encoding="utf-8",
            quoting=csv.QUOTE_NONE,
            header=None,
            names=column_names,
            dtype=column_types,
        )

    def num_frames(self):
        """Return the ``num_frames`` column as a plain Python list."""
        frame_column = self._table["num_frames"]
        return frame_column.to_list()

    def get_example(self, i):
        """Load the ``i``-th example as ``(text, spec, mel, num_frames)``.

        spec (T_frame, C_spec)
        mel (T_frame, C_mel)
        """
        record = self._table.iloc[i]
        num_frames, spec_name, mel_name, text = record
        # Loaded lazily in order: the spectrogram first, then the mel.
        spec, mel = (
            np.load(os.path.join(self._root, file_name))
            for file_name in (spec_name, mel_name)
        )
        return (text, spec, mel, num_frames)

    def __len__(self):
        """Return the number of rows in the metadata table."""
        return len(self._table.index)

class DataCollector(object):
    """Collate a list of ``(text, spec, mel, num_frames)`` examples into a batch.

    ``p_pronunciation`` is stored and forwarded unchanged as the second
    argument of ``en.text_to_sequence`` for every text in the batch.
    """

    def __init__(self, p_pronunciation):
        self.p_pronunciation = p_pronunciation

    @staticmethod
    def _pad_time_axis(feature, target_frames):
        """Pad a 2-D array at the end of its first axis up to ``target_frames`` rows.

        Uses ``np.pad`` with its default mode; the second axis is left as is.
        """
        missing = target_frames - feature.shape[0]
        return np.pad(feature, [(0, missing), (0, 0)])

    def __call__(self, examples):
        """
        output shape and dtype
        (B, T_text) int64
        (B,) int64
        (B, T_frame, C_spec) float32
        (B, T_frame, C_mel) float32
        (B,) int64
        """
        # Frame counts come from the metadata (4th field of each example),
        # not from the loaded arrays themselves.
        frame_counts = np.array([item[3] for item in examples], dtype=np.int64)
        longest_frames = frame_counts.max()

        token_seqs, padded_specs, padded_mels = [], [], []
        for text, spec, mel, _ in examples:
            token_seqs.append(en.text_to_sequence(text, self.p_pronunciation))
            padded_specs.append(self._pad_time_axis(spec, longest_frames))
            padded_mels.append(self._pad_time_axis(mel, longest_frames))

        spec_batch = np.stack(padded_specs)
        mel_batch = np.stack(padded_mels)

        text_lengths = np.array([len(tokens) for tokens in token_seqs], dtype=np.int64)
        longest_text = text_lengths.max()
        # Right-pad every token sequence with 0 so the batch is rectangular.
        text_batch = np.array(
            [tokens + [0] * (longest_text - len(tokens)) for tokens in token_seqs],
            dtype=np.int64,
        )
        return text_batch, text_lengths, spec_batch, mel_batch, frame_counts

if __name__ == "__main__":
    # Smoke test: build the dataset, sampler and loader from a config file and
    # iterate over every batch once, showing a progress bar.
    import argparse
    import tqdm
    from ruamel import yaml

    parser = argparse.ArgumentParser(description="load the preprocessed ljspeech dataset")
    parser.add_argument("--config", type=str, required=True, help="config file")
    parser.add_argument("--input", type=str, required=True, help="data path of the original data")
    args = parser.parse_args()
    # Decode the config explicitly as UTF-8 so that parsing does not depend on
    # the platform's default locale encoding.
    with open(args.config, "rt", encoding="utf-8") as f:
        config = yaml.safe_load(f)

    print("========= Command Line Arguments ========")
    for k, v in vars(args).items():
        print("{}: {}".format(k, v))
    print("=========== Configurations ==============")
    for k in ["p_pronunciation", "batch_size"]:
        print("{}: {}".format(k, config[k]))

    ljspeech = LJSpeech(args.input)
    collate_fn = DataCollector(config["p_pronunciation"])

    dg.enable_dygraph(fluid.CPUPlace())
    sampler = PartialyRandomizedSimilarTimeLengthSampler(ljspeech.num_frames())
    cargo = DataCargo(ljspeech, collate_fn,
                      batch_size=config["batch_size"], sampler=sampler)
    loader = DataLoader\
           .from_generator(capacity=5, return_list=True)\
           .set_batch_generator(cargo)

    # Only the act of iterating matters here; the batches are discarded.
    for _ in tqdm.tqdm(enumerate(loader)):
        pass
dv3 reloaded, back to the origin 2020-07-10 20:22:43 +08:00			`import numpy as np`
add deepvoice3 model and example 2020-02-13 10:24:34 +08:00			`import os`
			`import csv`
			`import pandas as pd`

dv3 reloaded, back to the origin 2020-07-10 20:22:43 +08:00			`import paddle`
			`from paddle import fluid`
			`from paddle.fluid import dygraph as dg`
			`from paddle.fluid.dataloader import Dataset, BatchSampler`
			`from paddle.fluid.io import DataLoader`
add deepvoice3 model and example 2020-02-13 10:24:34 +08:00
dv3 reloaded, back to the origin 2020-07-10 20:22:43 +08:00			`from parakeet.data import DatasetMixin, DataCargo, PartialyRandomizedSimilarTimeLengthSampler`
			`from parakeet.g2p import en`
add deepvoice3 model and example 2020-02-13 10:24:34 +08:00
dv3 reloaded, back to the origin 2020-07-10 20:22:43 +08:00			`class LJSpeech(DatasetMixin):`
add deepvoice3 model and example 2020-02-13 10:24:34 +08:00			`def __init__(self, root):`
dv3 reloaded, back to the origin 2020-07-10 20:22:43 +08:00			`self._root = root`
add deepvoice3 model and example 2020-02-13 10:24:34 +08:00			`self._table = pd.read_csv(`
dv3 reloaded, back to the origin 2020-07-10 20:22:43 +08:00			`os.path.join(root, "metadata.csv"),`
			`sep="\|",`
			`encoding="utf-8",`
			`quoting=csv.QUOTE_NONE,`
			`header=None,`
			`names=["num_frames", "spec_name", "mel_name", "text"],`
			`dtype={"num_frames": np.int64, "spec_name": str, "mel_name":str, "text":str})`

			`def num_frames(self):`
			`return self._table["num_frames"].to_list()`
add deepvoice3 model and example 2020-02-13 10:24:34 +08:00
			`def get_example(self, i):`
dv3 reloaded, back to the origin 2020-07-10 20:22:43 +08:00			`"""`
			`spec (T_frame, C_spec)`
			`mel (T_frame, C_mel)`
			`"""`
			`num_frames, spec_name, mel_name, text = self._table.iloc[i]`
			`spec = np.load(os.path.join(self._root, spec_name))`
			`mel = np.load(os.path.join(self._root, mel_name))`
			`return (text, spec, mel, num_frames)`

add deepvoice3 model and example 2020-02-13 10:24:34 +08:00			`def __len__(self):`
			`return len(self._table)`

			`class DataCollector(object):`
dv3 reloaded, back to the origin 2020-07-10 20:22:43 +08:00			`def __init__(self, p_pronunciation):`
			`self.p_pronunciation = p_pronunciation`

add deepvoice3 model and example 2020-02-13 10:24:34 +08:00			`def __call__(self, examples):`
dv3 reloaded, back to the origin 2020-07-10 20:22:43 +08:00			`"""`
			`output shape and dtype`
			`(B, T_text) int64`
			`(B,) int64`
			`(B, T_frame, C_spec) float32`
			`(B, T_frame, C_mel) float32`
			`(B,) int64`
			`"""`
			`text_seqs = []`
			`specs = []`
			`mels = []`
			`num_frames = np.array([example[3] for example in examples], dtype=np.int64)`
			`max_frames = np.max(num_frames)`
add deepvoice3 model and example 2020-02-13 10:24:34 +08:00
			`for example in examples:`
dv3 reloaded, back to the origin 2020-07-10 20:22:43 +08:00			`text, spec, mel, _ = example`
			`text_seqs.append(en.text_to_sequence(text, self.p_pronunciation))`
			`# if max_frames - mel.shape[0] < 0:`
			`# import pdb; pdb.set_trace()`
			`specs.append(np.pad(spec, [(0, max_frames - spec.shape[0]), (0, 0)]))`
			`mels.append(np.pad(mel, [(0, max_frames - mel.shape[0]), (0, 0)]))`

			`specs = np.stack(specs)`
			`mels = np.stack(mels)`

			`text_lengths = np.array([len(seq) for seq in text_seqs], dtype=np.int64)`
			`max_length = np.max(text_lengths)`
			`text_seqs = np.array([seq + [0] * (max_length - len(seq)) for seq in text_seqs], dtype=np.int64)`
			`return text_seqs, text_lengths, specs, mels, num_frames`

			`if __name__ == "__main__":`
			`import argparse`
			`import tqdm`
			`import time`
			`from ruamel import yaml`

			`parser = argparse.ArgumentParser(description="load the preprocessed ljspeech dataset")`
			`parser.add_argument("--config", type=str, required=True, help="config file")`
			`parser.add_argument("--input", type=str, required=True, help="data path of the original data")`
			`args = parser.parse_args()`
			`with open(args.config, 'rt') as f:`
			`config = yaml.safe_load(f)`

			`print("========= Command Line Arguments ========")`
			`for k, v in vars(args).items():`
			`print("{}: {}".format(k, v))`
			`print("=========== Configurations ==============")`
			`for k in ["p_pronunciation", "batch_size"]:`
			`print("{}: {}".format(k, config[k]))`

			`ljspeech = LJSpeech(args.input)`
			`collate_fn = DataCollector(config["p_pronunciation"])`

			`dg.enable_dygraph(fluid.CPUPlace())`
			`sampler = PartialyRandomizedSimilarTimeLengthSampler(ljspeech.num_frames())`
			`cargo = DataCargo(ljspeech, collate_fn,`
			`batch_size=config["batch_size"], sampler=sampler)`
			`loader = DataLoader\`
			`.from_generator(capacity=5, return_list=True)\`
			`.set_batch_generator(cargo)`

			`for i, batch in tqdm.tqdm(enumerate(loader)):`
			`continue`