# reviews-insights / preprocessing.py
# analytics-jiten's picture
# Update preprocessing.py
# 25b96ed
import re
# Misspellings and fused words mapped to their corrections.
# Keys are matched case-sensitively.
_SPELLING_CORRECTIONS = {
    'pricethe': 'price the',
    'loudnessand': 'loudness and',
    'muffeled': 'muffled',
    'expidite': 'expedite',
    'suerb': 'superb',
    'eeplaces': 'ear pieces',
    'exilent': 'excellent',
    'worthable': 'worth able',
    'soundaverage': 'sound average',
    'bukd': 'build',
    'breliant': 'brilliant',
    'dvsvinyl': 'dvd vinyl',
    'qudoes': 'kudos',
    'extarnal': 'external',
    'heaten': 'heats',
    'iseent': 'is not',
    'worth the prize': 'worth the price',
    "laptop's": 'laptop',
    "laptop’s": 'laptop',
    "aslo": "also",
    "qulity": "quality",
    "qaulity": "quality",
    "sable": "cable",
}

# One alternation over every key (longest first), anchored on word boundaries
# so a key is only replaced when it stands alone. Without the anchors the
# 'sable' entry would rewrite "usable" to "ucable" and "disabled" to
# "dicabled".
_SPELLING_PATTERN = re.compile(
    r"\b(?:"
    + "|".join(
        re.escape(key)
        for key in sorted(_SPELLING_CORRECTIONS, key=len, reverse=True)
    )
    + r")\b"
)


def correct_spellings(phrase):
    """Return *phrase* with the known misspellings replaced.

    Each key of the corrections table is replaced by its value, but only
    where the key appears as a whole word (or whole phrase); occurrences
    embedded inside a longer word are left untouched. Matching is
    case-sensitive.

    Args:
        phrase: The text to correct.

    Returns:
        The corrected text.
    """
    return _SPELLING_PATTERN.sub(
        lambda match: _SPELLING_CORRECTIONS[match.group(0)], phrase
    )
# (compiled pattern, replacement) pairs, applied in this order. Every pattern
# ends with a word boundary so that an apostrophe opening a quoted word
# ("'sound'", "'decent'") is not mistaken for a contraction suffix.
_CONTRACTION_RULES = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        # specific: irregular forms must run before the generic "n't" rule
        (r"won['’]t\b", "will not"),
        (r"can['’]t\b", "can not"),
        # general
        (r"n['’]t\b", " not"),
        (r"['’]re\b", " are"),
        (r"['’]s\b", " is"),
        (r"['’]d\b", " would"),
        (r"['’]ll\b", " will"),
        (r"['’]t\b", " not"),
        (r"['’]ve\b", " have"),
        (r"['’]m\b", " am"),
    )
)


def undo_contractions(phrase):
    """Expand English contractions in *phrase*.

    Both the ASCII apostrophe (') and the typographic one (’) are
    recognised. Matching is case-sensitive and the patterns are lowercase.

    Note that every trailing "'s" is expanded to " is", so possessives
    ("john's") are expanded as well.

    Args:
        phrase: The text to expand.

    Returns:
        The text with contractions such as "won't", "they're" or "i've"
        written out in full.
    """
    for pattern, replacement in _CONTRACTION_RULES:
        phrase = pattern.sub(replacement, phrase)
    return phrase
# Pattern that preprocess_reviews removes from the lower-cased review text.
# It is currently the empty string, so that replacement step changes nothing.
# NOTE(review): presumably a real emoji pattern was intended here (lost or
# still to be written) — confirm.
emoji_regex = ''
def preprocess_reviews(reviews):
    """Add ``text`` and ``text_cleaned`` columns to *reviews* and return it.

    ``text`` is the ``title`` and ``review`` columns joined by ' . '.
    ``text_cleaned`` is a normalised version of that text: camelCase words
    are split, everything is lower-cased, the ``emoji_regex`` pattern is
    removed, newlines and runs of dots become '. ', brackets, colons and
    numbers are padded with spaces, and finally known misspellings are
    corrected and contractions are expanded.

    Args:
        reviews: DataFrame with ``title`` and ``review`` columns. It is
            modified in place.

    Returns:
        The same DataFrame, with the two columns added.
    """
    reviews['text'] = reviews['title'] + ' . ' + reviews['review']

    # The concatenation above is NaN whenever either part is missing. Fall
    # back to whichever part exists so that a review without a title (or a
    # title without a review) is still cleaned instead of becoming ''.
    raw_text = (
        reviews['text']
        .fillna(reviews['title'])
        .fillna(reviews['review'])
        .fillna('')
    )

    # `regex` is passed explicitly on every call: pandas 2.0 changed the
    # default to False, which would turn the patterns below into literal
    # (never-matching) strings.
    # NOTE(review): dots are normalised before numbers are padded, so a
    # decimal such as "4.5" ends up split into "4." and "5" — confirm
    # whether that is intended.
    reviews['text_cleaned'] = (
        raw_text
        .str.replace(r'([a-z]+)([A-Z])', r'\1 \2', regex=True)  # badProduct -> bad Product
        .str.lower()
        .str.replace(emoji_regex, '', regex=True)
        .str.replace('\n', '.', regex=False)
        .str.replace(r'\s*\.+\s*', '. ', regex=True)  # dots and spaces
        .str.replace(r'([\{\(\[\}\)\]])', r' \1 ', regex=True)  # spaces around brackets
        .str.replace(r'([:])', r' \1 ', regex=True)  # spaces around ':'
        .str.replace(r'(\d+\.?\d*)', r' \1 ', regex=True)  # spaces around numbers
        .apply(correct_spellings)
        .apply(undo_contractions)
    )
    return reviews