import re

# Known misspellings / run-together words and their replacements.
# Keys are matched case-sensitively and only as whole words (see
# ``correct_spellings``).
_SPELLING_CORRECTIONS = {
    'pricethe': 'price the',
    'loudnessand': 'loudness and',
    'muffeled': 'muffled',
    'expidite': 'expedite',
    'suerb': 'superb',
    'eeplaces': 'ear pieces',
    'exilent': 'excellent',
    'worthable': 'worth able',
    'soundaverage': 'sound average',
    'bukd': 'build',
    'breliant': 'brilliant',
    'dvsvinyl': 'dvd vinyl',
    'qudoes': 'kudos',
    'extarnal': 'external',
    'heaten': 'heats',
    'iseent': 'is not',
    'worth the prize': 'worth the price',
    "laptop's": 'laptop',
    "laptop’s": 'laptop',
    "aslo": "also",
    "qulity": "quality",
    "qaulity": "quality",
    "sable": "cable",
}

# One alternation of all keys, longest first, anchored on word boundaries so
# that a key is never rewritten inside a longer valid word
# (e.g. 'sable' must not turn "usable" into "ucable").
_SPELLING_PATTERN = re.compile(
    r"\b(?:"
    + "|".join(
        re.escape(key)
        for key in sorted(_SPELLING_CORRECTIONS, key=len, reverse=True)
    )
    + r")\b"
)

# Ordered contraction rules: the specific forms must run before the general
# "n't" rule, otherwise "won't"/"can't" would become "wo not"/"ca not".
# The trailing \b keeps an opening quote followed by a word ("'super'",
# "'the'") from being mistaken for a contraction suffix.
_CONTRACTION_RULES = (
    # specific
    (re.compile(r"won['’]t\b"), "will not"),
    (re.compile(r"can['’]t\b"), "can not"),
    # general
    (re.compile(r"n['’]t\b"), " not"),
    (re.compile(r"['’]re\b"), " are"),
    (re.compile(r"['’]s\b"), " is"),
    (re.compile(r"['’]d\b"), " would"),
    (re.compile(r"['’]ll\b"), " will"),
    (re.compile(r"['’]t\b"), " not"),
    (re.compile(r"['’]ve\b"), " have"),
    (re.compile(r"['’]m\b"), " am"),
)


def correct_spellings(phrase):
    """Replace known misspellings in *phrase* with their corrections.

    Matching is case-sensitive and restricted to whole words, so entries
    never alter a longer word that merely contains them.

    Args:
        phrase: The text to correct.

    Returns:
        The text with every whole-word occurrence of a known misspelling
        replaced.
    """
    return _SPELLING_PATTERN.sub(
        lambda match: _SPELLING_CORRECTIONS[match.group(0)], phrase
    )


def undo_contractions(phrase):
    """Expand English contractions in *phrase* (e.g. "won't" -> "will not").

    Both the ASCII apostrophe and the typographic one (’) are recognised.
    Note that "'s" is always expanded to " is", including for possessives.

    Args:
        phrase: The text to expand.

    Returns:
        The text with contractions expanded.
    """
    for pattern, replacement in _CONTRACTION_RULES:
        phrase = pattern.sub(replacement, phrase)
    return phrase


# Pattern used to strip emoji. It is empty in this file, so the
# emoji-stripping step in ``preprocess_reviews`` is skipped.
emoji_regex = ''


def preprocess_reviews(reviews):
    """Add ``text`` and ``text_cleaned`` columns to *reviews*.

    ``text`` is ``title + ' . ' + review``. ``text_cleaned`` is that text
    lower-cased, with camelCase words split, dots/brackets/colons/numbers
    spaced out, known misspellings corrected and contractions expanded.

    Args:
        reviews: A pandas DataFrame with ``title`` and ``review`` columns.
            It is modified in place.

    Returns:
        The same DataFrame object, with the two columns added.
    """
    # NOTE(review): if either column is missing for a row, the concatenation
    # is NaN and the cleaned text ends up as the empty string.
    reviews['text'] = reviews['title'] + ' . ' + reviews['review']

    # regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a
    # literal string by default, which would silently disable these steps.
    cleaned = reviews['text'].fillna('')
    # badProduct -> bad Product
    cleaned = cleaned.str.replace(r'([a-z]+)([A-Z])', r'\1 \2', regex=True)
    cleaned = cleaned.str.lower()
    if emoji_regex:
        # An empty pattern removes nothing, so only run this when it is set.
        cleaned = cleaned.str.replace(emoji_regex, '', regex=True)
    cleaned = cleaned.str.replace('\n', '.', regex=False)
    # dots and spaces
    cleaned = cleaned.str.replace(r'\s*\.+\s*', '. ', regex=True)
    # spaces between parenthesis
    cleaned = cleaned.str.replace(r'([\{\(\[\}\)\]])', r' \1 ', regex=True)
    # spaces between :
    cleaned = cleaned.str.replace(r'([:])', r' \1 ', regex=True)
    # spaces between numbers
    cleaned = cleaned.str.replace(r'(\d+\.?\d*)', r' \1 ', regex=True)

    reviews['text_cleaned'] = (
        cleaned.apply(correct_spellings).apply(undo_contractions)
    )
    return reviews