Spaces:
Sleeping
Sleeping
import re | |
# Known misspellings / run-together words mapped to their corrected form.
_SPELLING_CORRECTIONS = {
    'pricethe': 'price the',
    'loudnessand': 'loudness and',
    'muffeled': 'muffled',
    'expidite': 'expedite',
    'suerb': 'superb',
    'eeplaces': 'ear pieces',
    'exilent': 'excellent',
    'worthable': 'worth able',
    'soundaverage': 'sound average',
    'bukd': 'build',
    'breliant': 'brilliant',
    'dvsvinyl': 'dvd vinyl',
    'qudoes': 'kudos',
    'extarnal': 'external',
    'heaten': 'heats',
    'iseent': 'is not',
    'worth the prize': 'worth the price',
    "laptop's": 'laptop',
    # 'β' presumably stands in for a mis-decoded typographic apostrophe;
    # the key is kept as-is and the real U+2019 form is handled as well.
    # TODO confirm against the review data.
    "laptopβs": 'laptop',
    "laptop\u2019s": 'laptop',
    "aslo": "also",
    "qulity": "quality",
    "qaulity": "quality",
    "sable": "cable",
}

# One alternation over all keys, anchored so that a key only matches when it
# is not embedded in a longer word. Without the anchors, 'sable' would also
# rewrite 'usable' -> 'ucable' and 'disable' -> 'dicable'.
# Longest keys come first so a longer key always wins over a shorter one
# that happens to share its prefix.
_SPELLING_PATTERN = re.compile(
    r"(?<!\w)(?:"
    + "|".join(
        re.escape(key)
        for key in sorted(_SPELLING_CORRECTIONS, key=len, reverse=True)
    )
    + r")(?!\w)"
)


def correct_spellings(phrase):
    """Replace known misspellings in *phrase* with their corrected form.

    Only whole-word occurrences are replaced: ``'sable'`` becomes
    ``'cable'``, but ``'usable'`` and ``'disable'`` are left untouched.
    Matching is case-sensitive, and the text is scanned in a single pass,
    so replacement text is never re-examined for further corrections.

    Args:
        phrase: The text to correct.

    Returns:
        The text with every known misspelling replaced.
    """
    # A callable replacement keeps the corrected text literal (no backslash
    # or group-reference interpretation by ``re``).
    return _SPELLING_PATTERN.sub(
        lambda match: _SPELLING_CORRECTIONS[match.group(0)], phrase
    )
# Characters accepted as the apostrophe of a contraction: the ASCII quote and
# the typographic right single quotation mark (U+2019). 'β' is kept because
# the original patterns matched it -- presumably a mis-decoded U+2019 that
# occurs in the data; TODO confirm.
_APOSTROPHE = r"['\u2019β]"

# (compiled pattern, replacement) pairs, applied strictly in this order.
_CONTRACTION_RULES = tuple(
    (re.compile(prefix + _APOSTROPHE + suffix), expansion)
    for prefix, suffix, expansion in (
        # Specific: irregular forms must run before the general "n't" rule,
        # which would otherwise turn "won't" into "wo not".
        ("won", "t", "will not"),
        ("can", "t", "can not"),
        # General suffix rules. "n't" precedes the bare "'t" rule so the
        # "n" is consumed together with the apostrophe.
        ("n", "t", " not"),
        ("", "re", " are"),
        ("", "s", " is"),
        ("", "d", " would"),
        ("", "ll", " will"),
        ("", "t", " not"),
        ("", "ve", " have"),
        ("", "m", " am"),
    )
)


def undo_contractions(phrase):
    """Expand English contractions in *phrase* into their full words.

    Both the ASCII apostrophe and the typographic apostrophe (U+2019) are
    recognised, so ``"don't"`` and ``"don\u2019t"`` both become
    ``"do not"``. Matching is case-sensitive and not anchored to word
    boundaries; ``'s`` is always expanded to ``" is"``, including for
    possessives.

    Args:
        phrase: The text to expand.

    Returns:
        The text with contractions expanded.
    """
    for pattern, expansion in _CONTRACTION_RULES:
        phrase = pattern.sub(expansion, phrase)
    return phrase
# Regex removed from the review text by preprocess_reviews. It is empty in
# this file, so the emoji-removal step currently removes nothing.
# NOTE(review): the original pattern may have been lost -- confirm.
emoji_regex = ''


def preprocess_reviews(reviews):
    """Build the raw and cleaned text columns for a table of reviews.

    Adds two columns to *reviews* in place:

    * ``text``: ``title + ' . ' + review``.
    * ``text_cleaned``: ``text`` after camelCase splitting, lowercasing,
      emoji removal, punctuation/number spacing, spelling correction and
      contraction expansion.

    Rows where the concatenated ``text`` is missing (NaN) get an empty
    ``text_cleaned``.

    Every ``str.replace`` call states ``regex=`` explicitly. pandas 2.0
    changed the default to ``regex=False``, under which the pattern-based
    steps would be treated as literal text and silently do nothing.

    Args:
        reviews: A pandas DataFrame with string columns ``title`` and
            ``review``.

    Returns:
        The same ``reviews`` object, with the two columns added.
    """
    reviews['text'] = reviews['title'] + ' . ' + reviews['review']

    cleaned = reviews['text'].fillna('')

    # badProduct -> bad Product; has to happen before lowercasing.
    cleaned = cleaned.str.replace(r'([a-z]+)([A-Z])', r'\1 \2', regex=True)
    cleaned = cleaned.str.lower()

    # An empty pattern would match everywhere and replace nothing, so the
    # step is skipped entirely in that case.
    if emoji_regex:
        cleaned = cleaned.str.replace(emoji_regex, '', regex=True)

    # Line breaks are treated as sentence ends.
    cleaned = cleaned.str.replace('\n', '.', regex=False)

    regex_steps = (
        # Collapse runs of dots and surrounding whitespace into '. '.
        (r'\s*\.+\s*', '. '),
        # Put spaces around brackets, braces and parentheses.
        (r'([\{\(\[\}\)\]])', r' \1 '),
        # Put spaces around colons.
        (r'([:])', r' \1 '),
        # Put spaces around numbers. NOTE: the dot rule above has already
        # turned '4.5' into '4. 5', so decimals reach this rule split in two.
        (r'(\d+\.?\d*)', r' \1 '),
    )
    for pattern, replacement in regex_steps:
        cleaned = cleaned.str.replace(pattern, replacement, regex=True)

    cleaned = cleaned.apply(correct_spellings).apply(undo_contractions)

    reviews['text_cleaned'] = cleaned
    return reviews