Spaces:
Runtime error
Runtime error
File size: 1,045 Bytes
b2fdf59 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 |
import re
import unicodedata
def clean_text(text: str) -> str:
    """Return *text* with markup and typographic noise stripped out.

    The steps run in this order:

    1. Remove anything that looks like an HTML tag (``<...>``).
    2. Remove parenthesised spans ``(...)``.
    3. Replace every word character followed by a run of ASCII capitals
       with a single ``.`` (see the review note in the body).
    4. Trim surrounding whitespace and drop all characters whose Unicode
       general category starts with ``C`` (control, format, ...). This
       includes ``\\n``, ``\\r`` and ``\\t``, which are removed rather than
       replaced by a space.
    5. Turn non-breaking spaces into plain spaces and collapse runs of
       spaces into one.
    6. Replace ``;`` with ``", and"``, delete ``:``, and remove the space
       in front of ``.`` and ``,``.
    7. Strip leading digits, dots, dashes and spaces.
    8. Collapse consecutive repetitions of the same word into one.

    Args:
        text: The raw input string, possibly containing HTML.

    Returns:
        The cleaned string.
    """
    # Non-greedy so that each tag is removed separately.
    cleaned = re.sub(r'<.*?>', '', text)
    cleaned = re.sub(r'\([^)]*\)', '', cleaned)

    # NOTE(review): the replacement discards both captured groups, so e.g.
    # "helloWorld" becomes "hell.orld". This looks like it was meant to split
    # sentences glued together by tag removal -- confirm the intent before
    # changing; the behaviour is kept as-is here.
    cleaned = re.sub(r"(\w)([A-Z]+)", r'.', cleaned)

    cleaned = cleaned.strip()
    cleaned = "".join(
        ch for ch in cleaned if unicodedata.category(ch)[0] != "C"
    )

    # Normalise non-breaking spaces *before* collapsing spaces and fixing
    # punctuation spacing, so they cannot reintroduce double spaces or hide
    # a " ." / " ," pair from the replacements below.
    cleaned = cleaned.replace("\xa0", " ")
    cleaned = re.sub(' +', ' ', cleaned)

    cleaned = cleaned.replace(";", ", and")
    cleaned = cleaned.replace(":", "")
    cleaned = cleaned.replace(" .", ".")
    cleaned = cleaned.replace(" ,", ",")

    # Remove list numbering / bullets at the very start of the text.
    cleaned = cleaned.lstrip('0123456789.- ')

    # Remove repeated consecutive words ("the the" -> "the").
    cleaned = re.sub(r'\b(\w+)( \1\b)+', r'\1', cleaned)
    return cleaned