|
import re
|
|
|
|
|
|
# Lookup table used to spell out common English abbreviations.
#
# Each entry is a ``(compiled_pattern, expansion)`` tuple. A pattern matches
# the abbreviation at a word boundary followed by a literal period, ignoring
# case, so "Dr." and "DR." both match. Because the trailing period is part of
# the pattern, it is consumed by the substitution, and a shorter abbreviation
# such as "dr" cannot match inside "drs." (the next character must be ".").
abbreviations_en = [
    (re.compile(r"\b%s\." % abbreviation, re.IGNORECASE), expansion)
    for abbreviation, expansion in [
        ("mrs", "missus"),  # was misspelled "misess"
        ("mr", "mister"),
        ("dr", "doctor"),
        ("st", "saint"),
        ("co", "company"),
        ("jr", "junior"),
        ("maj", "major"),
        ("gen", "general"),
        ("drs", "doctors"),
        ("rev", "reverend"),
        ("lt", "lieutenant"),
        ("hon", "honorable"),
        ("sgt", "sergeant"),
        ("capt", "captain"),
        ("esq", "esquire"),
        ("ltd", "limited"),
        ("col", "colonel"),
        ("ft", "fort"),
    ]
]
|
|
|
|
def expand_abbreviations(text, lang="en"):
    """Replace known abbreviations in *text* with their spelled-out form.

    Args:
        text: Input string to process.
        lang: Language code selecting the abbreviation table. Only ``"en"``
            is supported.

    Returns:
        ``text`` after applying every ``(pattern, expansion)`` pair of the
        selected table, in table order.

    Raises:
        NotImplementedError: If ``lang`` is anything other than ``"en"``.
    """
    if lang != "en":
        # Name the rejected language so the failure is diagnosable.
        raise NotImplementedError(
            "abbreviation expansion is not implemented for language %r" % (lang,)
        )

    # Entries are applied sequentially, each one on the output of the last.
    for pattern, expansion in abbreviations_en:
        text = pattern.sub(expansion, text)
    return text