Spaces:

BUT-FIT
/

EMMA_leaderboard

Running

Lakoc

Initial commit

605b3ec 8 days ago

26.7 kB

	import json
	import os
	import re
	from fractions import Fraction
	from typing import Iterator, List, Match, Optional, Union

	from more_itertools import windowed

	from .basic import remove_symbols_and_diacritics


	class EnglishNumberNormalizer:
	    """
	    Convert spelled-out English numbers into arabic numerals.

	    While doing so it:

	    - keeps suffixes such as `1960s`, `274th`, `32nd`
	    - turns currency words that follow a number into a symbol prefix,
	      e.g. `twenty dollars` -> `$20`
	    - turns `percent` / `per cent` after a number into a `%` suffix
	    - writes a standalone `1` / `1s` back as `one` / `ones`
	    - concatenates successive single-digit words: `one zero one` -> `101`
	    """

	    def __init__(self):
	        super().__init__()

	        self.zeros = {"o", "zero"}
	        self.ones = {
	            name: i
	            for i, name in enumerate(
	                [
	                    "one",
	                    "two",
	                    "three",
	                    "four",
	                    "five",
	                    "six",
	                    "seven",
	                    "eight",
	                    "nine",
	                    "ten",
	                    "eleven",
	                    "twelve",
	                    "thirteen",
	                    "fourteen",
	                    "fifteen",
	                    "sixteen",
	                    "seventeen",
	                    "eighteen",
	                    "nineteen",
	                ],
	                start=1,
	            )
	        }
	        self.ones_plural = {
	            "sixes" if name == "six" else name + "s": (value, "s")
	            for name, value in self.ones.items()
	        }
	        self.ones_ordinal = {
	            "zeroth": (0, "th"),
	            "first": (1, "st"),
	            "second": (2, "nd"),
	            "third": (3, "rd"),
	            "fifth": (5, "th"),
	            "twelfth": (12, "th"),
	            **{
	                name + ("h" if name.endswith("t") else "th"): (value, "th")
	                for name, value in self.ones.items()
	                if value > 3 and value != 5 and value != 12
	            },
	        }
	        # Merged word -> (value, suffix) table. It must be a dict because
	        # process_words() indexes it by word.
	        self.ones_suffixed = {**self.ones_plural, **self.ones_ordinal}

	        self.tens = {
	            "twenty": 20,
	            "thirty": 30,
	            "forty": 40,
	            "fifty": 50,
	            "sixty": 60,
	            "seventy": 70,
	            "eighty": 80,
	            "ninety": 90,
	        }
	        self.tens_plural = {
	            name.replace("y", "ies"): (value, "s") for name, value in self.tens.items()
	        }
	        self.tens_ordinal = {
	            name.replace("y", "ieth"): (value, "th")
	            for name, value in self.tens.items()
	        }
	        self.tens_suffixed = {**self.tens_plural, **self.tens_ordinal}

	        self.multipliers = {
	            "hundred": 100,
	            "thousand": 1_000,
	            "million": 1_000_000,
	            "billion": 1_000_000_000,
	            "trillion": 1_000_000_000_000,
	            "quadrillion": 1_000_000_000_000_000,
	            "quintillion": 1_000_000_000_000_000_000,
	            "sextillion": 1_000_000_000_000_000_000_000,
	            "septillion": 1_000_000_000_000_000_000_000_000,
	            "octillion": 1_000_000_000_000_000_000_000_000_000,
	            "nonillion": 1_000_000_000_000_000_000_000_000_000_000,
	            "decillion": 1_000_000_000_000_000_000_000_000_000_000_000,
	        }
	        self.multipliers_plural = {
	            name + "s": (value, "s") for name, value in self.multipliers.items()
	        }
	        self.multipliers_ordinal = {
	            name + "th": (value, "th") for name, value in self.multipliers.items()
	        }
	        self.multipliers_suffixed = {
	            **self.multipliers_plural,
	            **self.multipliers_ordinal,
	        }
	        # Set of words that may follow "point" (keys only, hence the unpacking).
	        self.decimals = {*self.ones, *self.tens, *self.zeros}

	        self.preceding_prefixers = {
	            "minus": "-",
	            "negative": "-",
	            "plus": "+",
	            "positive": "+",
	        }
	        self.following_prefixers = {
	            "pound": "£",
	            "pounds": "£",
	            "euro": "€",
	            "euros": "€",
	            "dollar": "$",
	            "dollars": "$",
	            "cent": "¢",
	            "cents": "¢",
	        }
	        self.prefixes = set(
	            list(self.preceding_prefixers.values())
	            + list(self.following_prefixers.values())
	        )
	        self.suffixers = {
	            "per": {"cent": "%"},
	            "percent": "%",
	        }
	        self.specials = {"and", "double", "triple", "point"}

	        # Every word that process_words() treats as number-related.
	        self.words = {
	            key
	            for mapping in [
	                self.zeros,
	                self.ones,
	                self.ones_suffixed,
	                self.tens,
	                self.tens_suffixed,
	                self.multipliers,
	                self.multipliers_suffixed,
	                self.preceding_prefixers,
	                self.following_prefixers,
	                self.suffixers,
	                self.specials,
	            ]
	            for key in mapping
	        }
	        self.literal_words = {"one", "ones"}

	    def process_words(self, words: List[str]) -> Iterator[str]:
	        """
	        Yield output tokens for `words`, folding number words into numerals.

	        `value` accumulates the number being built (an int while it can be
	        combined arithmetically, a str once digits are being concatenated) and
	        `prefix` holds a pending sign or currency symbol. Both are flushed by
	        `output()`.

	        Raises:
	            ValueError: if a token in `self.words` matches none of the branches.
	        """
	        prefix: Optional[str] = None
	        value: Optional[Union[str, int]] = None
	        skip = False

	        def to_fraction(s: str):
	            try:
	                return Fraction(s)
	            except ValueError:
	                return None

	        def output(result: Union[str, int]):
	            # Render a token with the pending prefix and reset the state.
	            nonlocal prefix, value
	            result = str(result)
	            if prefix is not None:
	                result = prefix + result
	            value = None
	            prefix = None
	            return result

	        if len(words) == 0:
	            return

	        # Width-3 sliding window (previous, current, next) over the words,
	        # padded with None at both ends.
	        padded = [None] + words + [None]
	        for prev, current, next_word in zip(padded, padded[1:], padded[2:]):
	            if skip:
	                skip = False
	                continue

	            next_is_numeric = next_word is not None and re.match(
	                r"^\d+(\.\d+)?$", next_word
	            )
	            has_prefix = current[0] in self.prefixes
	            current_without_prefix = current[1:] if has_prefix else current
	            if re.match(r"^\d+(\.\d+)?$", current_without_prefix):
	                # arabic numbers (potentially with signs and fractions)
	                f = to_fraction(current_without_prefix)
	                assert f is not None
	                if value is not None:
	                    if isinstance(value, str) and value.endswith("."):
	                        # concatenate decimals / ip address components
	                        value = str(value) + str(current)
	                        continue
	                    else:
	                        yield output(value)

	                prefix = current[0] if has_prefix else prefix
	                if f.denominator == 1:
	                    value = f.numerator  # store integers as int
	                else:
	                    value = current_without_prefix
	            elif current not in self.words:
	                # non-numeric words
	                if value is not None:
	                    yield output(value)
	                yield output(current)
	            elif current in self.zeros:
	                value = str(value or "") + "0"
	            elif current in self.ones:
	                ones = self.ones[current]

	                if value is None:
	                    value = ones
	                elif isinstance(value, str) or prev in self.ones:
	                    if prev in self.tens and ones < 10:
	                        # replace the last zero with the digit
	                        assert value[-1] == "0"
	                        value = value[:-1] + str(ones)
	                    else:
	                        value = str(value) + str(ones)
	                elif ones < 10:
	                    if value % 10 == 0:
	                        value += ones
	                    else:
	                        value = str(value) + str(ones)
	                else:  # eleven to nineteen
	                    if value % 100 == 0:
	                        value += ones
	                    else:
	                        value = str(value) + str(ones)
	            elif current in self.ones_suffixed:
	                # ordinal or plural; yield the number right away
	                ones, suffix = self.ones_suffixed[current]
	                if value is None:
	                    yield output(str(ones) + suffix)
	                elif isinstance(value, str) or prev in self.ones:
	                    if prev in self.tens and ones < 10:
	                        assert value[-1] == "0"
	                        yield output(value[:-1] + str(ones) + suffix)
	                    else:
	                        yield output(str(value) + str(ones) + suffix)
	                elif ones < 10:
	                    if value % 10 == 0:
	                        yield output(str(value + ones) + suffix)
	                    else:
	                        yield output(str(value) + str(ones) + suffix)
	                else:  # eleven to nineteen
	                    if value % 100 == 0:
	                        yield output(str(value + ones) + suffix)
	                    else:
	                        yield output(str(value) + str(ones) + suffix)
	                value = None
	            elif current in self.tens:
	                tens = self.tens[current]
	                if value is None:
	                    value = tens
	                elif isinstance(value, str):
	                    value = str(value) + str(tens)
	                else:
	                    if value % 100 == 0:
	                        value += tens
	                    else:
	                        value = str(value) + str(tens)
	            elif current in self.tens_suffixed:
	                # ordinal or plural; yield the number right away
	                tens, suffix = self.tens_suffixed[current]
	                if value is None:
	                    yield output(str(tens) + suffix)
	                elif isinstance(value, str):
	                    yield output(str(value) + str(tens) + suffix)
	                else:
	                    if value % 100 == 0:
	                        yield output(str(value + tens) + suffix)
	                    else:
	                        yield output(str(value) + str(tens) + suffix)
	            elif current in self.multipliers:
	                multiplier = self.multipliers[current]
	                if value is None:
	                    value = multiplier
	                elif isinstance(value, str) or value == 0:
	                    f = to_fraction(value)
	                    p = f * multiplier if f is not None else None
	                    if f is not None and p.denominator == 1:
	                        value = p.numerator
	                    else:
	                        yield output(value)
	                        value = multiplier
	                else:
	                    # multiply only the part below one thousand
	                    before = value // 1000 * 1000
	                    residual = value % 1000
	                    value = before + residual * multiplier
	            elif current in self.multipliers_suffixed:
	                multiplier, suffix = self.multipliers_suffixed[current]
	                if value is None:
	                    yield output(str(multiplier) + suffix)
	                elif isinstance(value, str):
	                    f = to_fraction(value)
	                    p = f * multiplier if f is not None else None
	                    if f is not None and p.denominator == 1:
	                        yield output(str(p.numerator) + suffix)
	                    else:
	                        yield output(value)
	                        yield output(str(multiplier) + suffix)
	                else:  # int
	                    before = value // 1000 * 1000
	                    residual = value % 1000
	                    value = before + residual * multiplier
	                    yield output(str(value) + suffix)
	                value = None
	            elif current in self.preceding_prefixers:
	                # apply prefix (positive, minus, etc.) if it precedes a number
	                if value is not None:
	                    yield output(value)

	                if next_word in self.words or next_is_numeric:
	                    prefix = self.preceding_prefixers[current]
	                else:
	                    yield output(current)
	            elif current in self.following_prefixers:
	                # apply prefix (dollars, cents, etc.) only after a number
	                if value is not None:
	                    prefix = self.following_prefixers[current]
	                    yield output(value)
	                else:
	                    yield output(current)
	            elif current in self.suffixers:
	                # apply suffix symbols (percent -> '%')
	                if value is not None:
	                    suffix = self.suffixers[current]
	                    if isinstance(suffix, dict):
	                        # two-word suffix such as "per cent"
	                        if next_word in suffix:
	                            yield output(str(value) + suffix[next_word])
	                            skip = True
	                        else:
	                            yield output(value)
	                            yield output(current)
	                    else:
	                        yield output(str(value) + suffix)
	                else:
	                    yield output(current)
	            elif current in self.specials:
	                if next_word not in self.words and not next_is_numeric:
	                    # apply special handling only if the next word can be numeric
	                    if value is not None:
	                        yield output(value)
	                    yield output(current)
	                elif current == "and":
	                    # ignore "and" after hundreds, thousands, etc.
	                    if prev not in self.multipliers:
	                        if value is not None:
	                            yield output(value)
	                        yield output(current)
	                elif current == "double" or current == "triple":
	                    if next_word in self.ones or next_word in self.zeros:
	                        repeats = 2 if current == "double" else 3
	                        ones = self.ones.get(next_word, 0)
	                        value = str(value or "") + str(ones) * repeats
	                        skip = True
	                    else:
	                        if value is not None:
	                            yield output(value)
	                        yield output(current)
	                elif current == "point":
	                    if next_word in self.decimals or next_is_numeric:
	                        value = str(value or "") + "."
	                else:
	                    # should all have been covered at this point
	                    raise ValueError(f"Unexpected token: {current}")
	            else:
	                # all should have been covered at this point
	                raise ValueError(f"Unexpected token: {current}")

	        if value is not None:
	            yield output(value)

	    def preprocess(self, s: str):
	        """Rewrite `s` so that process_words() can tokenize it on whitespace."""
	        # replace "<number> and a half" with "<number> point five"
	        results = []

	        segments = re.split(r"\band\s+a\s+half\b", s)
	        last_index = len(segments) - 1
	        for i, segment in enumerate(segments):
	            if segment.strip():
	                results.append(segment)
	            if i == last_index:
	                continue

	            # A split point follows this segment: decide how to restore it.
	            # An empty segment (nothing before "and a half") keeps the phrase
	            # verbatim instead of dropping it.
	            segment_words = segment.split()
	            last_word = segment_words[-1] if segment_words else None
	            if last_word in self.decimals or last_word in self.multipliers:
	                results.append("point five")
	            else:
	                results.append("and a half")

	        s = " ".join(results)

	        # put a space at number/letter boundary
	        s = re.sub(r"([a-z])([0-9])", r"\1 \2", s)
	        s = re.sub(r"([0-9])([a-z])", r"\1 \2", s)

	        # but remove spaces which could be a suffix
	        s = re.sub(r"([0-9])\s+(st|nd|rd|th|s)\b", r"\1\2", s)

	        return s

	    def postprocess(self, s: str):
	        """Merge currency/cent tokens and spell a standalone `1` as `one`."""

	        def combine_cents(m: Match):
	            try:
	                currency = m.group(1)
	                integer = m.group(2)
	                cents = int(m.group(3))
	                return f"{currency}{integer}.{cents:02d}"
	            except ValueError:
	                # leave the matched text untouched
	                return m.group(0)

	        def extract_cents(m: Match):
	            try:
	                return f"¢{int(m.group(1))}"
	            except ValueError:
	                # leave the matched text untouched
	                return m.group(0)

	        # apply currency postprocessing; "$2 and ¢7" -> "$2.07"
	        s = re.sub(r"([€£$])([0-9]+) (?:and )?¢([0-9]{1,2})\b", combine_cents, s)
	        s = re.sub(r"[€£$]0\.([0-9]{1,2})\b", extract_cents, s)

	        # write "one(s)" instead of "1(s)", just for the readability
	        s = re.sub(r"\b1(s?)\b", r"one\1", s)

	        return s

	    def __call__(self, s: str):
	        """Return `s` with spelled-out numbers converted to numerals."""
	        s = self.preprocess(s)
	        s = " ".join(word for word in self.process_words(s.split()) if word is not None)
	        s = self.postprocess(s)

	        return s


	class EnglishReverseNumberNormalizer(EnglishNumberNormalizer):
	    """
	    This is an approximate inverse of EnglishNumberNormalizer that converts arabic numerals
	    into spelled-out numbers.

	    Motivation: Whisper's original EnglishNumberNormalizer produces numerals that match Whisper's rich
	    token set, which many ASRs cannot output.
	    This class takes an alternative normalization approach, converting Whisper's numerals back to
	    spelled-out numbers. This ensures compatibility with the token sets of other ASR systems while
	    avoiding penalizing Whisper for outputting numerals.

	    Examples of cases handled:
	    - "365" -> "three hundred sixty five"
	    - "$20" -> "twenty dollars"
	    - "50%" -> "fifty percent"
	    - "12th" -> "twelfth", "12s" -> "twelves"
	    - "90th" -> "ninetieth", "90s" -> "nineties"
	    - The special cases of "70 000" -> "seventy thousand" but not larger numbers.

	    Caveats: this class takes care of the majority of cases, but it is not perfect.
	    - Only numerals within the 0-1000 range are handled.
	    - Minus/plus signs are not handled.
	    - There is inherent ambiguity e.g. "100" -> "one hundred" or "a hundred".
	    """

	    def __init__(self):
	        super().__init__()
	        # Reverse dictionaries
	        self.int_to_ones = {v: k for k, v in self.ones.items()}
	        self.int_to_tens = {v: k for k, v in self.tens.items()}

	        # 11th -> eleventh etc.
	        self.str_to_ones_suffixed = {
	            str(n) + s: k for k, (n, s) in self.ones_suffixed.items()
	        }
	        # 20s -> twenties etc.
	        self.str_to_tens_suffixed = {
	            str(n) + s: k for k, (n, s) in self.tens_suffixed.items()
	        }

	    def _number_to_words(self, w: str) -> str:
	        """Spell out a single whitespace-delimited token, or return it unchanged."""
	        # str.isdigit() alone also accepts characters such as "²" that int()
	        # rejects, so restrict the numeric path to ASCII digits.
	        if w.isascii() and w.isdigit():
	            num = int(w)
	            if w == '000':
	                return 'thousand'  # will work in case of "70 000" -> "seventy thousand"
	            if num == 0:
	                return "zero"
	            elif num == 100:
	                return "hundred"
	            elif 0 < num < 1000:
	                hundreds, remainder = divmod(num, 100)
	                tens, ones = divmod(remainder, 10)
	                h = [f"{self.int_to_ones[hundreds]} hundred"] if hundreds > 0 else []
	                if 0 < remainder <= 19:
	                    # one..nineteen are single words in int_to_ones
	                    t = [self.int_to_ones[remainder]]
	                    o = []
	                else:
	                    t = [self.int_to_tens[tens * 10]] if tens > 0 else []
	                    o = [self.int_to_ones[ones]] if ones > 0 else []
	                return " ".join(h + t + o)
	            elif num == 1000:
	                return "thousand"
	            else:
	                return w  # case not handled

	        # suffixed numbers
	        w = self.str_to_ones_suffixed.get(w, w)
	        w = self.str_to_tens_suffixed.get(w, w)
	        return w

	    def __call__(self, s: str):
	        """Return `s` with supported numerals replaced by spelled-out words."""
	        # "$x[.y]" -> "x[.y] dollars"
	        s = re.sub(r'\$(\d+(\.\d+)?)', r'\1 dollars', s)
	        # "x[.y]"% -> "x[.y] percent"
	        s = re.sub(r'(\d+(\.\d+)?)%', r'\1 percent', s)
	        # note this doesn't handle cases such as -x or +x.

	        return " ".join(self._number_to_words(w) for w in s.split())


	class EnglishSpellingNormalizer:
	    """
	    Applies British-American spelling mappings as listed in [1].

	    The word -> replacement mapping is loaded from a JSON file located next to
	    this module (`mapping_name` is joined onto this module's directory).

	    [1] https://www.tysto.com/uk-us-spelling-list.html
	    """

	    def __init__(self, mapping_name="english.json"):
	        mapping_path = os.path.join(os.path.dirname(__file__), mapping_name)
	        # Use a context manager so the file handle is closed deterministically.
	        with open(mapping_path, encoding="utf-8") as mapping_file:
	            self.mapping = json.load(mapping_file)

	    def __call__(self, s: str):
	        """Replace every whitespace-delimited word of `s` found in the mapping."""
	        return " ".join(self.mapping.get(word, word) for word in s.split())


	class EnglishTextNormalizer:
	    """
	    This is a modified version of the Whisper text normalizer designed to enhance compatibility
	    across various ASRs.

	    Key features:

	    1. Idempotency: output is unchanged with repeated application.
	    2. The original Whisper-tailored number normalization is replaced with one that is compatible with
	       other ASR systems, mapping numerals into spelled-out numbers.
	       See EnglishReverseNumberNormalizer for details and limitations.
	    3. Filler words are removed by default, similar to the original normalizer: ['hmm', 'uh', 'ah', 'eh'].
	       This is for compatibility with ASRs trained to ignore these.
	    4. Added normalization for some common words: okay -> ok, everyday -> every day etc.

	    Args:
	        standardize_numbers: apply EnglishNumberNormalizer (words -> numerals).
	            Must not be combined with `standardize_numbers_rev`.
	        standardize_numbers_rev: apply EnglishReverseNumberNormalizer
	            (numerals -> words).
	        remove_fillers: drop the filler words listed above from the output.
	    """

	    def __init__(self, standardize_numbers=False, standardize_numbers_rev=True, remove_fillers=True):
	        # Regex pattern -> replacement; applied in insertion order by __call__.
	        self.replacers = {
	            # common non verbal sounds are mapped to the similar ones
	            r"\b(hm+)\b|\b(mhm)\b|\b(mm+)\b|\b(m+h)\b|\b(hm+)\b|\b(um+)\b|\b(uhm+)\b": (  # noqa e501
	                "hmm"
	            ),
	            r"\b(a+h+)\b|\b(ha+)\b": "ah",
	            r"[!?.]+(?=$|\s)": "",  # Okay.. --> okay
	            r"\b(o+h+)\b|\b(h+o+)\b": "oh",
	            r"\b(u+h+)\b|\b(h+u+)\b|\b(h+u+h+)\b": "uh",
	            # common contractions
	            r"\b(wi\sfi)\b": "wifi",
	            r"\b(goin)\b": "going",
	            # anchored with \b so that "hi-fi" / "sci-fi" are left alone
	            r"\bwi-fi\b": "wifi",
	            r"\bwon't\b": "will not",
	            r"\bcan't\b": "can not",
	            r"\blet's\b": "let us",
	            r"\bain't\b": "aint",
	            r"\by'all\b": "you all",
	            r"\bwanna\b": "want to",
	            r"\bgotta\b": "got to",
	            r"\bgonna\b": "going to",
	            r"\bi'ma\b": "i am going to",
	            r"\bimma\b": "i am going to",
	            r"\bwoulda\b": "would have",
	            r"\bcoulda\b": "could have",
	            r"\bshoulda\b": "should have",
	            r"\bma'am\b": "madam",
	            r"\bokay\b": "ok",
	            r"\bsetup\b": "set up",
	            r"\beveryday\b": "every day",
	            # contractions in titles/prefixes
	            r"\bmr\b": "mister ",
	            r"\bmrs\b": "missus ",
	            r"\bst\b": "saint ",
	            r"\bdr\b": "doctor ",
	            r"\bprof\b": "professor ",
	            r"\bcapt\b": "captain ",
	            r"\bgov\b": "governor ",
	            r"\bald\b": "alderman ",
	            r"\bgen\b": "general ",
	            r"\bsen\b": "senator ",
	            r"\brep\b": "representative ",
	            r"\bpres\b": "president ",
	            r"\brev\b": "reverend ",
	            r"\bhon\b": "honorable ",
	            r"\basst\b": "assistant ",
	            r"\bassoc\b": "associate ",
	            r"\blt\b": "lieutenant ",
	            r"\bcol\b": "colonel ",
	            r"\bjr\b": "junior ",
	            r"\bsr\b": "senior ",
	            r"\besq\b": "esquire ",
	            r"'d been\b": " had been",
	            r"'s been\b": " has been",
	            r"'d gone\b": " had gone",
	            r"'s gone\b": " has gone",
	            r"'d done\b": " had done",
	            r"'s got\b": " has got",
	            # general contractions
	            r"n't\b": " not",
	            r"'re\b": " are",
	            r"'s\b": " is",
	            r"'d\b": " would",
	            r"'ll\b": " will",
	            r"'t\b": " not",
	            r"'ve\b": " have",
	            r"'m\b": " am",
	        }
	        if standardize_numbers:
	            self.standardize_numbers = EnglishNumberNormalizer()
	            # the two number normalizers are mutually exclusive
	            assert not standardize_numbers_rev
	        else:
	            self.standardize_numbers = None

	        if standardize_numbers_rev:
	            self.standardize_numbers_rev = EnglishReverseNumberNormalizer()
	        else:
	            self.standardize_numbers_rev = None

	        self.standardize_spellings = EnglishSpellingNormalizer()
	        self.pre_standardize_spellings = EnglishSpellingNormalizer("pre_english.json")

	        if remove_fillers:
	            self.fillers = ['hmm', 'uh', 'ah', 'eh']  # assumes replacers have been applied
	        else:
	            self.fillers = None

	    def __call__(self, s: str):
	        """Return the normalized, lower-cased form of `s`."""
	        s = s.lower()

	        # remove words between brackets
	        s = re.sub(r"[<\[][^>\]]*[>\]]", "", s)
	        # remove words between parenthesis
	        s = re.sub(r"\(([^)]+?)\)", "", s)
	        s = self.pre_standardize_spellings(s)
	        # when there's a space before an apostrophe
	        s = re.sub(r"\s+'", "'", s)

	        for pattern, replacement in self.replacers.items():
	            s = re.sub(pattern, replacement, s)

	        # remove commas between digits
	        s = re.sub(r"(\d),(\d)", r"\1\2", s)
	        # remove periods not followed by numbers
	        s = re.sub(r"\.([^0-9]|$)", r" \1", s)
	        # keep numeric symbols
	        s = remove_symbols_and_diacritics(s, keep=".%$¢€£")

	        if self.standardize_numbers is not None:
	            s = self.standardize_numbers(s)

	        if self.standardize_numbers_rev is not None:
	            s = self.standardize_numbers_rev(s)

	        s = self.standardize_spellings(s)
	        # now remove prefix/suffix symbols
	        # that are not preceded/followed by numbers
	        s = re.sub(r"[.$¢€£]([^0-9])", r" \1", s)
	        s = re.sub(r"([^0-9])%", r"\1 ", s)

	        # remove filler words
	        # motivation: these words are very common, yet hold little information in the majority of cases.
	        # some ASR systems may ignore them by convention and will be penalized unfairly.
	        if self.fillers:
	            s = re.sub(r'\b(' + '|'.join(self.fillers) + r')\b', "", s)

	        # replace any successive whitespaces with a space
	        s = re.sub(r"\s+", " ", s)

	        # remove leading and trailing whitespaces
	        s = re.sub(r"^\s+|\s+$", "", s)

	        return s