Spaces:

analytics-jiten
/

reviews-insights

Sleeping

App Files Files Community

reviews-insights / preprocessing.py

analytics-jiten

Update preprocessing.py

25b96ed over 1 year ago

raw

history blame contribute delete

2.04 kB

	import re

	# Known misspellings / run-together words seen in reviews, mapped to their
	# corrected text. Keys are matched case-sensitively as written here.
	_SPELLING_CORRECTIONS = {
	    'pricethe': 'price the',
	    'loudnessand': 'loudness and',
	    'muffeled': 'muffled',
	    'expidite': 'expedite',
	    'suerb': 'superb',
	    'eeplaces': 'ear pieces',
	    'exilent': 'excellent',
	    'worthable': 'worth able',
	    'soundaverage': 'sound average',
	    'bukd': 'build',
	    'breliant': 'brilliant',
	    'dvsvinyl': 'dvd vinyl',
	    'qudoes': 'kudos',
	    'extarnal': 'external',
	    'heaten': 'heats',
	    'iseent': 'is not',
	    'worth the prize': 'worth the price',
	    "laptop's": 'laptop',
	    "laptop’s": 'laptop',
	    "aslo": "also",
	    "qulity": "quality",
	    "qaulity": "quality",
	    "sable": "cable",
	}

	# Single alternation over every key, anchored on word boundaries so that a
	# key is only corrected when it stands alone (e.g. "sable" must not rewrite
	# "usable" or "disable"). Longer keys come first so a longer entry always
	# wins over a shorter one that shares its prefix.
	_SPELLING_PATTERN = re.compile(
	    r"\b(?:"
	    + "|".join(
	        re.escape(key)
	        for key in sorted(_SPELLING_CORRECTIONS, key=len, reverse=True)
	    )
	    + r")\b"
	)


	def correct_spellings(phrase):
	    """Replace known misspellings in ``phrase`` with their corrections.

	    Only whole-word (or whole-phrase) occurrences of the entries in
	    ``_SPELLING_CORRECTIONS`` are replaced; matching is case-sensitive.

	    Args:
	        phrase: The text to correct.

	    Returns:
	        A new string with every listed misspelling replaced. Text that
	        contains none of the entries is returned unchanged.
	    """
	    return _SPELLING_PATTERN.sub(
	        lambda match: _SPELLING_CORRECTIONS[match.group(0)], phrase
	    )

	# Ordered (compiled pattern, replacement) rules. Order matters: the two
	# irregular forms must run before the generic "n't" rule, and "n't" must
	# run before the bare "'t" rule.
	# Suffix rules require a word character before the apostrophe and a word
	# boundary after the suffix, so an opening quote in front of a word
	# ("'sound'") is not mistaken for a contraction.
	_CONTRACTION_RULES = tuple(
	    (re.compile(pattern), replacement)
	    for pattern, replacement in (
	        # specific
	        (r"\bwon['’]t\b", "will not"),
	        (r"\bcan['’]t\b", "can not"),
	        # general
	        (r"n['’]t\b", " not"),
	        (r"(?<=\w)['’]re\b", " are"),
	        (r"(?<=\w)['’]s\b", " is"),
	        (r"(?<=\w)['’]d\b", " would"),
	        (r"(?<=\w)['’]ll\b", " will"),
	        (r"(?<=\w)['’]t\b", " not"),
	        (r"(?<=\w)['’]ve\b", " have"),
	        (r"(?<=\w)['’]m\b", " am"),
	    )
	)


	def undo_contractions(phrase):
	    """Expand English contractions in ``phrase`` into separate words.

	    Both the straight (') and the typographic (’) apostrophe are handled.
	    The expansion is heuristic: every "'s" becomes " is" and every "'d"
	    becomes " would", so possessives and "had" forms are expanded the same
	    way as in the rules above.

	    Args:
	        phrase: The text to expand.

	    Returns:
	        A new string with the contractions expanded.
	    """
	    for pattern, replacement in _CONTRACTION_RULES:
	        phrase = pattern.sub(replacement, phrase)
	    return phrase

	# Regex matching runs of emoji / pictographic symbols to strip from reviews.
	# NOTE(review): the pattern found here was an empty string, which matches
	# nothing useful; the ranges below are a reconstruction - confirm they
	# cover what the original pattern was meant to remove.
	emoji_regex = (
	    "["
	    "\U0001F1E6-\U0001F1FF"  # regional indicator symbols (flags)
	    "\U0001F300-\U0001FAFF"  # pictographs, emoticons, transport, extended symbols
	    "\u2600-\u27BF"          # miscellaneous symbols and dingbats
	    "\uFE0F"                 # variation selector-16 (emoji presentation)
	    "]+"
	)

	def preprocess_reviews(reviews):
	    """Add the combined and cleaned review text columns to ``reviews``.

	    Reads the ``title`` and ``review`` columns, writes the joined text to
	    ``text`` and the normalised text to ``text_cleaned``. The input object
	    is modified in place and also returned.

	    A row with only one of title/review missing keeps the part that is
	    present; a row with both missing gets a missing ``text`` and an empty
	    ``text_cleaned``.

	    Args:
	        reviews: Table with ``title`` and ``review`` string columns
	            (presumably a pandas DataFrame - the code uses the ``.str``,
	            ``.fillna`` and ``.apply`` Series API).

	    Returns:
	        The same ``reviews`` object with ``text`` and ``text_cleaned`` set.
	    """
	    title = reviews['title']
	    body = reviews['review']

	    # Concatenating with a missing value yields a missing result, which
	    # would throw away the side that is present. Fill each side first and
	    # only leave the row missing when there is no content at all.
	    has_content = title.notna() | body.notna()
	    reviews['text'] = (
	        title.fillna('') + ' . ' + body.fillna('')
	    ).where(has_content)

	    cleaned = reviews['text'].fillna('')

	    # ``regex`` is passed explicitly on every call: its default differs
	    # between pandas versions, and these patterns must be treated as regex.
	    # badProduct -> bad Product (has to run before lower-casing).
	    cleaned = cleaned.str.replace(r'([a-z]+)([A-Z])', r'\1 \2', regex=True)
	    cleaned = cleaned.str.lower()

	    # An empty pattern removes nothing, so skip the pass entirely.
	    if emoji_regex:
	        cleaned = cleaned.str.replace(emoji_regex, '', regex=True)

	    # Line breaks become sentence breaks (plain literal replacement).
	    cleaned = cleaned.str.replace('\n', '.', regex=False)

	    spacing_rules = (
	        (r'\s\.+\s', '. '),                 # dots and spaces
	        (r'([\{\(\[\}\)\]])', r' \1 '),     # spaces around brackets
	        (r'([:])', r' \1 '),                # spaces around ":"
	        (r'(\d+\.?\d*)', r' \1 '),          # spaces around numbers
	    )
	    for pattern, replacement in spacing_rules:
	        cleaned = cleaned.str.replace(pattern, replacement, regex=True)

	    reviews['text_cleaned'] = (
	        cleaned.apply(correct_spellings).apply(undo_contractions)
	    )

	    return reviews