File size: 1,045 Bytes
b2fdf59
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
import re
import unicodedata

def clean_text(text: str) -> str:
    """Return *text* with markup and noise removed.

    The steps are applied in this order:

    1. Remove anything that looks like an HTML tag (``<...>``, non-greedy,
       single line).
    2. Remove parenthesised spans ``(...)``.
    3. Replace a word character followed by a run of uppercase letters
       with ``.`` (see the NOTE in the body).
    4. Strip surrounding whitespace and drop every character whose Unicode
       general category starts with ``C`` (control, format, ...). Newlines
       and tabs fall into that category and are removed without leaving a
       space behind.
    5. Turn non-breaking spaces into regular spaces and collapse runs of
       spaces into one.
    6. Rewrite ``;`` as ``, and``, drop ``:``, and remove the space before
       ``.`` and ``,``.
    7. Strip leading digits, dots, hyphens and spaces (e.g. list numbering).
    8. Collapse consecutive repeated words ("the the" -> "the").

    Args:
        text: Raw input string, possibly containing HTML tags.

    Returns:
        The cleaned string.
    """
    # Strip HTML-like tags, then parenthesised asides.
    cleaned = re.sub(r'<.*?>', '', text)
    cleaned = re.sub(r'\([^)]*\)', '', cleaned)

    # NOTE(review): this replaces ANY word character followed by one or more
    # uppercase letters with a single '.', so "USA" becomes "." and "iPhone"
    # becomes ".hone". Kept byte-for-byte to preserve existing output;
    # confirm the intended behaviour before changing it.
    cleaned = re.sub(r"(\w)([A-Z]+)", r'.', cleaned)

    cleaned = cleaned.strip()

    # Drop control/format/unassigned characters (Unicode categories "C*").
    cleaned = "".join(
        ch for ch in cleaned if unicodedata.category(ch)[0] != "C"
    )

    # Normalise non-breaking spaces BEFORE collapsing spaces and tightening
    # punctuation, so they cannot reintroduce double spaces or " ." gaps.
    cleaned = cleaned.replace("\xa0", " ")
    cleaned = re.sub(' +', ' ', cleaned)

    cleaned = cleaned.replace(";", ", and")
    cleaned = cleaned.replace(":", "")
    cleaned = cleaned.replace(" .", ".")
    cleaned = cleaned.replace(" ,", ",")

    # Remove leading numbering such as "1. " or "12 - ".
    cleaned = cleaned.lstrip('0123456789.- ')

    # Collapse consecutive repeated words.
    cleaned = re.sub(r'\b(\w+)( \1\b)+', r'\1', cleaned)
    return cleaned