Spaces:
Sleeping
Sleeping
| import re | |
| _letters_and_numbers_re = re.compile( | |
| r"((?:[a-zA-Z]+[0-9]|[0-9]+[a-zA-Z])[a-zA-Z0-9']*)", re.IGNORECASE) | |
| _hardware_re = re.compile( | |
| '([0-9]+(?:[.,][0-9]+)?)(?:\s?)(tb|gb|mb|kb|ghz|mhz|khz|hz|mm)', re.IGNORECASE) | |
| _hardware_key = {'tb': 'terabyte', | |
| 'gb': 'gigabyte', | |
| 'mb': 'megabyte', | |
| 'kb': 'kilobyte', | |
| 'ghz': 'gigahertz', | |
| 'mhz': 'megahertz', | |
| 'khz': 'kilohertz', | |
| 'hz': 'hertz', | |
| 'mm': 'millimeter', | |
| 'cm': 'centimeter', | |
| 'km': 'kilometer'} | |
| _dimension_re = re.compile( | |
| r'\b(\d+(?:[,.]\d+)?\s*[xX]\s*\d+(?:[,.]\d+)?\s*[xX]\s*\d+(?:[,.]\d+)?(?:in|inch|m)?)\b|\b(\d+(?:[,.]\d+)?\s*[xX]\s*\d+(?:[,.]\d+)?(?:in|inch|m)?)\b') | |
| _dimension_key = {'m': 'meter', | |
| 'in': 'inch', | |
| 'inch': 'inch'} | |
| def _expand_letters_and_numbers(m): | |
| text = re.split(r'(\d+)', m.group(0)) | |
| # remove trailing space | |
| if text[-1] == '': | |
| text = text[:-1] | |
| elif text[0] == '': | |
| text = text[1:] | |
| # if not like 1920s, or AK47's , 20th, 1st, 2nd, 3rd, etc... | |
| if text[-1] in ("'s", "s", "th", "nd", "st", "rd") and text[-2].isdigit(): | |
| text[-2] = text[-2] + text[-1] | |
| text = text[:-1] | |
| # for combining digits 2 by 2 | |
| new_text = [] | |
| for i in range(len(text)): | |
| string = text[i] | |
| if string.isdigit() and len(string) < 5: | |
| # heuristics | |
| if len(string) > 2 and string[-2] == '0': | |
| if string[-1] == '0': | |
| string = [string] | |
| else: | |
| string = [string[:-2], string[-2], string[-1]] | |
| elif len(string) % 2 == 0: | |
| string = [string[i:i+2] for i in range(0, len(string), 2)] | |
| elif len(string) > 2: | |
| string = [string[0]] + [string[i:i+2] for i in range(1, len(string), 2)] | |
| new_text.extend(string) | |
| else: | |
| new_text.append(string) | |
| text = new_text | |
| text = " ".join(text) | |
| return text | |
def _expand_hardware(m):
    """Replace a unit abbreviation with its spoken name, pluralised if needed.

    Args:
        m: a regex match whose two groups are the quantity text and the unit
            abbreviation (any letter case).

    Returns:
        ``"<quantity> <unit name>"``, with an ``s`` appended to the unit name
        when the quantity is greater than 1 and the name does not end in
        ``z``.

    Raises:
        KeyError: if the lower-cased abbreviation is not in ``_hardware_key``.
    """
    amount, abbreviation = m.groups(0)
    unit_name = _hardware_key[abbreviation.lower()]

    # Names ending in "z" are never pluralised. Commas are stripped before
    # the numeric comparison, so "1,000" is read as 1000.
    stays_singular = (
        unit_name[-1] == 'z' or float(amount.replace(',', '')) <= 1
    )
    if stays_singular:
        return "{} {}".format(amount, unit_name)
    return "{} {}s".format(amount, unit_name)
def _expand_dimension(m):
    """Turn a matched dimension such as ``3x4in`` into ``3 by 4 inch``.

    Args:
        m: a regex match; the text of every group that took part in the match
            is concatenated and used as the dimension string.

    Returns:
        The dimension with every ``x``/``X`` separator (and any whitespace
        around it) replaced by ``" by "``. If the string ends in a key of
        ``_dimension_key`` directly after a digit, that suffix is replaced by
        a space plus the mapped unit name.
    """
    # groups(0) substitutes the integer 0 for groups that did not take part.
    text = "".join(group for group in m.groups(0) if group != 0)

    # Handle both letter cases and collapse the surrounding whitespace so the
    # result always has exactly one space on each side of "by".
    text = re.sub(r'\s*[xX]\s*', ' by ', text)

    # Try the longest suffix first so "inch" is not mistaken for a number
    # followed by some shorter unit; works for a suffix of any length.
    for suffix in sorted(_dimension_key, key=len, reverse=True):
        if not text.endswith(suffix):
            continue
        number_part = text[:-len(suffix)]
        if number_part[-1:].isdigit():
            return "{} {}".format(number_part, _dimension_key[suffix])
    return text
def normalize_letters_and_numbers(text):
    """Expand hardware units, dimensions and mixed letter/digit tokens.

    The three substitutions run in a fixed order: hardware units, then
    dimensions, then the generic letter/digit pass. Each pass operates on the
    output of the previous one.

    Args:
        text: the string to normalise.

    Returns:
        The string after all three substitutions have been applied.
    """
    passes = (
        (_hardware_re, _expand_hardware),
        (_dimension_re, _expand_dimension),
        (_letters_and_numbers_re, _expand_letters_and_numbers),
    )
    for pattern, expander in passes:
        text = pattern.sub(expander, text)
    return text