update statset and datacargo's design

2020-02-06 15:40:04 +08:00 · 2020-02-06 15:40:04 +08:00 · 837749a32c
parent 2ca5c810b8
commit 837749a32c
3 changed files with 220 additions and 36 deletions
--- a/parakeet/init.py
+++ b/parakeet/init.py
@ -1 +1,3 @@
 __version__ = "0.0.0"
 from . import data, g2p, models, modules, utils
--- a/parakeet/data/datacargo.py
+++ b/parakeet/data/datacargo.py
@ -1,10 +1,18 @@
 from .sampler import SequentialSampler, RandomSampler, BatchSampler
 class DataCargo(object):
-    def __init__(self, dataset, batch_size=1, sampler=None, 
+    def __init__(self,
-                 shuffle=False, batch_sampler=None, drop_last=False):
+                 dataset,
                 batch_fn=None,
                 batch_size=1,
                 sampler=None,
                 shuffle=False,
                 batch_sampler=None,
                 drop_last=False):
        self.dataset = dataset
-        
+        self.batch_fn = batch_fn or self.dataset._batch_examples
        if batch_sampler is not None:
            # auto_collation with custom batch_sampler
            if batch_size != 1 or shuffle or sampler is not None or drop_last:
@ -15,7 +23,8 @@ class DataCargo(object):
            drop_last = False
            shuffle = False
        elif batch_size is None:
-            raise ValueError('batch sampler is none. then batch size must not be none.')
+            raise ValueError(
                'batch sampler is none. then batch size must not be none.')
        elif sampler is None:
            if shuffle:
                sampler = RandomSampler(dataset)
@ -23,18 +32,20 @@ class DataCargo(object):
                sampler = SequentialSampler(dataset)
            # auto_collation without custom batch_sampler
            batch_sampler = BatchSampler(sampler, batch_size, drop_last)
        else:
            batch_sampler = BatchSampler(sampler, batch_size, drop_last)
        self.batch_size = batch_size
        self.drop_last = drop_last
        self.sampler = sampler
        self.batch_sampler = batch_sampler
-    
+
    def __iter__(self):
        return DataIterator(self)
    def __call__(self):
        return DataIterator(self)
-    
+
    @property
    def _auto_collation(self):
        # we will auto batching
@ -49,26 +60,30 @@ class DataCargo(object):
    def __len__(self):
        return len(self._index_sampler)
-    
+
 class DataIterator(object):
    def __init__(self, loader):
        self.loader = loader
        self._dataset = loader.dataset
-        
+
        self._batch_fn = loader.batch_fn
        self._index_sampler = loader._index_sampler
        self._sampler_iter = iter(self._index_sampler)
-    
+
    def __iter__(self):
        return self
-    
+
    def __next__(self):
-        index = self._next_index()  # may raise StopIteration, TODO(chenfeiyu): use dynamic batch size
+        index = self._next_index(
-        minibatch = [self._dataset[i] for i in index] # we can abstract it, too to use dynamic batch size
+        )  # may raise StopIteration, TODO(chenfeiyu): use dynamic batch size
-        minibatch = self._dataset._batch_examples(minibatch) # list[Example] -> Batch
+        minibatch = [self._dataset[i] for i in index
                     ]  # we can abstract it, too to use dynamic batch size
        minibatch = self._batch_fn(minibatch)  # list[Example] -> Batch
        return minibatch
-    
+
    def _next_index(self):
        return next(self._sampler_iter)
-    
+
    def __len__(self):
        return len(self._index_sampler)
--- a/parakeet/data/dataset.py
+++ b/parakeet/data/dataset.py
@ -1,24 +1,191 @@
-class Dataset(object):
+import six
-    def __init__(self):
+import numpy as np
-        pass
+
-    
+
-    def _load_metadata(self):
+class DatasetMixin(object):
-        raise NotImplementedError
+    """standard indexing interface for dataset."""
-    
+
    def _get_example(self):
        """return a Record (or Example, Instance according to your glossary)"""
        raise NotImplementedError
    def _batch_examples(self, minibatch):
        """get a list of examples, return a batch, whose structure is the same as an example"""
        raise NotImplementedError
    def _prepare_metadata(self):
        raise NotImplementedError
    def __getitem__(self, index):
-        raise NotImplementedError
+        if isinstance(index, slice):
-    
+            start, stop, step = index.indices(len(self))
-    def __iter__(self):
+            return [
                self.get_example(i)
                for i in six.moves.range(start, stop, step)
            ]
        elif isinstance(index, (list, np.ndarray)):
            return [self.get_example(i) for i in index]
        else:
            # assumes it an integer
            return self.get_example(index)
    def get_example(self, i):
        raise NotImplementedError
    def __len__(self):
        raise NotImplementedError
    def __iter__(self):
        for i in range(len(self)):
            yield self.get_example(i)
 class TransformDataset(DatasetMixin):
    """Transform a dataset to another with a transform."""
    def __init__(self, dataset, transform):
        self._dataset = dataset
        self._transform = transform
    def __len__(self):
        return len(self._dataset)
    def get_example(self, i):
        # CAUTION: only int is supported?
        # CAUTION: dataset support support __getitem__ and __len__
        in_data = self._dataset[i]
        return self._transform(in_data)
 class TupleDataset(object):
    def __init__(self, *datasets):
        if not datasets:
            raise ValueError("no datasets are given")
        length = len(datasets[0])
        for i, dataset in enumerate(datasets):
            if len(datasets) != length:
                raise ValueError(
                    "all the datasets should have the same length."
                    "dataset {} has a different length".format(i))
        self._datasets = datasets
        self._length = length
    def __getitem__(self, index):
        # SOA
        batches = [dataset[index] for dataset in self._datasets]
        if isinstance(index, slice):
            length = len(batches[0])
            # AOS
            return [
                tuple([batch[i] for batch in batches])
                for i in six.moves.range(length)
            ]
        else:
            return tuple(batches)
    def __len__(self):
        return self._length
 class DictDataset(object):
    def __init__(self, **datasets):
        if not datasets:
            raise ValueError("no datasets are given")
        length = None
        for key, dataset in six.iteritems(datasets):
            if length is None:
                length = len(dataset)
            elif len(datasets) != length:
                raise ValueError(
                    "all the datasets should have the same length."
                    "dataset {} has a different length".format(key))
        self._datasets = datasets
        self._length = length
    def __getitem__(self, index):
        batches = {
            key: dataset[index]
            for key, dataset in six.iteritems(self._datasets)
        }
        if isinstance(index, slice):
            length = len(six.next(six.itervalues(batches)))
            return [{key: batch[i]
                     for key, batch in six.iteritems(batches)}
                    for i in six.moves.range(length)]
        else:
            return batches
 class SliceDataset(DatasetMixin):
    def __init__(self, dataset, start, finish, order=None):
        if start < 0 or finish > len(dataset):
            raise ValueError("subset overruns the dataset.")
        self._dataset = dataset
        self._start = start
        self._finish = finish
        self._size = finish - start
        if order is not None and len(order) != len(dataset):
            raise ValueError(
                "order should have the same length as the dataset"
                "len(order) = {} which does not euqals len(dataset) = {} ".
                format(len(order), len(dataset)))
        self._order = order
    def len(self):
        return self._size
    def get_example(self, i):
        if i >= 0:
            if i >= self._size:
                raise IndexError('dataset index out of range')
            index = self._start + i
        else:
            if i < -self._size:
                raise IndexError('dataset index out of range')
            index = self._finish + i
        if self._order is not None:
            index = self._order[index]
        return self._dataset[index]
 class SubsetDataset(DatasetMixin):
    def __init__(self, dataset, indices):
        self._dataset = dataset
        if len(indices) > len(dataset):
            raise ValueError("subset's size larger that dataset's size!")
        self._indices = indices
        self._size = len(indices)
    def __len__(self):
        return self._size
    def get_example(self, i):
        index = self._indices[i]
        return self._dataset[index]
 class FilterDataset(DatasetMixin):
    def __init__(self, dataset, filter_fn):
        self._dataset = dataset
        self._indices = [
            i for i in range(len(dataset)) if filter_fn(dataset[i])
        ]
        self._size = len(self._indices)
    def __len__(self):
        return self._size
    def get_example(self, i):
        index = self._indices[i]
        return self._dataset[index]
 class ChainDataset(DatasetMixin):
    def __init__(self, *datasets):
        self._datasets = datasets
    def __len__(self):
        return sum(len(dataset) for dataset in self._datasets)
    def get_example(self, i):
        if i < 0:
            raise IndexError(
                "ChainDataset doesnot support negative indexing.")
        for dataset in self._datasets:
            if i < len(dataset):
                return dataset[i]
            i -= len(dataset)
        raise IndexError("dataset index out of range")
`@ -1 +1,3 @@`
	`__version__ = "0.0.0"`	`__version__ = "0.0.0"`

		`from . import data, g2p, models, modules, utils`