File size: 1,836 Bytes
4304c2f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import re

# Letter-name pronunciations used to spell an acronym out one character at a
# time. Keys are the 26 uppercase ASCII letters plus lowercase "s"; each value
# is a string of space-separated phoneme symbols (ARPAbet, per the table name).
# AcronymNormalizer indexes this table with every character of a matched
# acronym when its phoneme dictionary has no entry for the acronym as a whole.
_letter_to_arpabet = {
    "A": "EY1",
    "B": "B IY1",
    "C": "S IY1",
    "D": "D IY1",
    "E": "IY1",
    "F": "EH1 F",
    "G": "JH IY1",
    "H": "EY1 CH",
    "I": "AY1",
    "J": "JH EY1",
    "K": "K EY1",
    "L": "EH1 L",
    "M": "EH1 M",
    "N": "EH1 N",
    "O": "OW1",
    "P": "P IY1",
    "Q": "K Y UW1",
    "R": "AA1 R",
    "S": "EH1 S",
    "T": "T IY1",
    "U": "Y UW1",
    "V": "V IY1",
    "X": "EH1 K S",
    "Y": "W AY1",
    "W": "D AH1 B AH0 L Y UW0",
    "Z": "Z IY1",
    # Lowercase "s" is the only non-capital the acronym pattern below can
    # capture (its optional trailing `s?`, presumably a plural as in "ABCs").
    "s": "Z",
}

# Matches a run of two or more consecutive uppercase ASCII letters, optionally
# followed by a single lowercase "s". Group 0 includes the trailing "s";
# group 1 holds only the uppercase run.
# TODO: must ignore roman numerals -- the current pattern does not tell them
# apart from acronyms (for example "XIV" matches).
# Earlier variant that additionally matched dotted forms such as "U.S.A.":
# _acronym_re = re.compile(r'([A-Z][A-Z]+)s?|([A-Z]\.([A-Z]\.)+s?)')
_acronym_re = re.compile(r"([A-Z][A-Z]+)s?")


class AcronymNormalizer(object):
    """Replace acronyms in text with brace-delimited phoneme strings.

    Every substring matched by the module-level ``_acronym_re`` pattern is
    looked up in the supplied phoneme dictionary. Depending on the result the
    acronym is replaced by its single known pronunciation, spelled out letter
    by letter through ``_letter_to_arpabet``, or left as plain text.
    """

    def __init__(self, phoneme_dict):
        """Store the pronunciation dictionary used for acronym lookups.

        Args:
            phoneme_dict: Object exposing ``lookup(word)``. The code relies on
                it returning ``None`` for an unknown word and otherwise a
                sized, indexable collection of pronunciation strings
                (presumably a list -- confirm against the dictionary class).
        """
        self.phoneme_dict = phoneme_dict

    def normalize_acronyms(self, text):
        """Return ``text`` with every matched acronym expanded.

        For each match of ``_acronym_re``:

        * lookup returns ``None``: each character is replaced by its letter
          name wrapped in braces and the pieces are joined with spaces. A
          trailing lowercase "s" is folded into the preceding letter, so
          "ABCs" becomes "{EY1} {B IY1} {S IY1 Z}".
        * lookup returns exactly one pronunciation: that pronunciation is
          wrapped in braces.
        * anything else (zero or several pronunciations): the acronym is
          emitted as plain text with dots and whitespace removed.

        Args:
            text: Input string to scan for acronyms.

        Returns:
            The string with the replacements applied.

        Raises:
            KeyError: If a matched acronym contains a character that has no
                entry in ``_letter_to_arpabet`` and the dictionary lookup
                returned ``None``.
        """

        def _expand_acronym(match):
            # Drop dots so dotted forms ("U.S.") share the plain spelling.
            # A literal str.replace is used on purpose: the previous
            # re.sub("\.", ...) call relied on an invalid escape sequence in
            # a non-raw string, which newer interpreters warn about.
            acronym = match.group(0).replace(".", "")
            # Remove any whitespace the match may contain.
            acronym = "".join(acronym.split())

            pronunciations = self.phoneme_dict.lookup(acronym)

            if pronunciations is None:
                # Unknown word: spell it out, one braced group per character.
                phones = ["{" + _letter_to_arpabet[letter] + "}" for letter in acronym]
                # Temporary fix kept from the original implementation: merge
                # the trailing "s" sound into the previous letter's braces so
                # it is not emitted as a separate token. The length is
                # checked first so an empty list can never be indexed.
                if len(phones) > 1 and phones[-1] == "{Z}":
                    phones[-2:] = [phones[-2][:-1] + " Z}"]
                return " ".join(phones)

            if len(pronunciations) == 1:
                # Unambiguous dictionary entry: use it directly.
                return "{" + pronunciations[0] + "}"

            # Zero or several pronunciations: keep the word itself as text.
            return acronym

        return _acronym_re.sub(_expand_acronym, text)

    def __call__(self, text):
        """Make the instance callable; same as ``normalize_acronyms(text)``."""
        return self.normalize_acronyms(text)