diff --git a/parakeet/frontend/normalizer/normalizer.py b/parakeet/frontend/normalizer/normalizer.py
index fe7d9f8..3da6d6f 100644
--- a/parakeet/frontend/normalizer/normalizer.py
+++ b/parakeet/frontend/normalizer/normalizer.py
@@ -29,4 +29,4 @@ def normalize(sentence):
     sentence = re.sub(r"[^ a-z'.,?!\-]", "", sentence)
     sentence = sentence.replace("i.e.", "that is")
     sentence = sentence.replace("e.g.", "for example")
-    return sentence.split()
+    return sentence
diff --git a/parakeet/frontend/phonectic.py b/parakeet/frontend/phonectic.py
index 2b41db5..6f0de1d 100644
--- a/parakeet/frontend/phonectic.py
+++ b/parakeet/frontend/phonectic.py
@@ -79,23 +79,14 @@ class EnglishCharacter(Phonetics):
         self.vocab = Vocab(self.graphemes + self.punctuations)
 
     def phoneticize(self, sentence):
-        start = self.vocab.start_symbol
-        end = self.vocab.end_symbol
-
-        words = ([] if start is None else [start]) \
-            + normalize(sentence) \
-            + ([] if end is None else [end])
+        words = normalize(sentence)
         return words
 
-    def numericalize(self, words):
-        ids = []
-        for word in words:
-            if word in self.vocab.stoi:
-                ids.append(self.vocab.lookup(word))
-                continue
-            for char in word:
-                if char in self.vocab.stoi:
-                    ids.append(self.vocab.lookup(char))
+    def numericalize(self, sentence):
+        ids = [
+            self.vocab.lookup(item) for item in sentence
+            if item in self.vocab.stoi
+        ]
         return ids
 
     def reverse(self, ids):