File size: 1,727 Bytes
d4829ef
 
 
 
 
 
 
 
9e290f8
d4829ef
 
 
 
 
 
 
9e290f8
d4829ef
 
 
9e290f8
4d6d610
d4829ef
 
9e290f8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import string
import re
import joblib
from pydantic import BaseModel

# SCHEMA
class Schema(BaseModel):
	# Pydantic model with a single required string field.
	# NOTE(review): presumably the request body handed to movie_reviews(),
	# which reads `.text` from its argument -- confirm against the router.
	# (Comments are used instead of a docstring so the generated schema
	# description stays unchanged.)
	text: str
		
# Request Handler
def movie_reviews(req):
	"""Score the text carried by a request.

	Args:
		req: Any object exposing a ``text`` attribute.

	Returns:
		The value produced by ``predict`` for ``req.text``.
	"""
	return predict(req.text)

# PIPELINE
PIPELINE_PATH = "./src/movie_reviews/pipeline.pkl"

# Pipelines already deserialized in this process, keyed by file path.
# Loading the pickle is by far the most expensive step of a prediction, so it
# is done once per path instead of once per request.
_PIPELINE_CACHE = {}


def _get_pipeline(path):
	"""Return the pipeline stored at *path*, loading it on first use only.

	NOTE: ``joblib.load`` unpickles the file, which can execute arbitrary
	code. Only point this at artifacts you produced or otherwise trust.
	"""
	if path not in _PIPELINE_CACHE:
		_PIPELINE_CACHE[path] = joblib.load(path)
	return _PIPELINE_CACHE[path]


def predict(text):
	"""Clean *text* and return the pipeline's rounded class probabilities.

	Args:
		text: Raw input string; it is passed through ``preprocess`` first.

	Returns:
		The result of ``predict_proba`` on the single cleaned sample, rounded
		to 3 decimals and converted with ``tolist()``.
		NOTE(review): presumably a scikit-learn style pipeline, giving one
		row of per-class probabilities -- confirm against the saved artifact.
	"""
	cleaned = preprocess(text)
	# PIPELINE_PATH is read at call time so reassigning it still takes effect.
	pipeline = _get_pipeline(PIPELINE_PATH)
	return pipeline.predict_proba([cleaned]).round(3).tolist()

# Translation table that deletes every ASCII punctuation character.
_PUNCT_TABLE = str.maketrans("", "", string.punctuation)

# Chat abbreviations and their expansions. Every key MUST be upper-case:
# preprocess() upper-cases each token before looking it up, so a lower-case
# key could never match.
# NOTE(review): some expansions contain typos ("look good to me",
# "also know as", "as sone as possible"). They are kept byte-identical because
# the fitted pipeline may have been trained on them -- confirm before fixing.
_ABBREVIATIONS = {
	"AFAIK": "as far as I know",
	"IMO": "in my opinion",
	"IMHO": "in my humble opinion",
	"LGTM": "look good to me",
	"AKA": "also know as",
	"ASAP": "as sone as possible",
	"BTW": "by the way",
	"FAQ": "frequently asked questions",
	"DIY": "do it yourself",
	"DM": "direct message",
	"FYI": "for your information",
	"IC": "i see",
	"IOW": "in other words",
	"IIRC": "If I Remember Correctly",
	"ICYMI": "In case you missed it",
	"CUZ": "because",
	"COS": "because",
	"NV": "nevermind",
	"PLZ": "please",
}

_HTML_PATTERN = re.compile('<.*?>')
_URLS_PATTERN = re.compile(r'https?://\S+|www\.\S+')
_EMOJI_PATTERN = re.compile(
	"["
	"\U0001F600-\U0001F64F"  # emoticons
	"\U0001F300-\U0001F5FF"  # symbols & pictographs
	"\U0001F680-\U0001F6FF"  # transport & map symbols
	"\U0001F1E0-\U0001F1FF"  # flags (iOS)
	"]+",
	flags=re.UNICODE,
)
# A token is any maximal run of non-whitespace characters.
_TOKEN_PATTERN = re.compile(r'\S+')


def _expand_abbreviation(match):
	"""Return the expansion of the matched token, or the token unchanged."""
	token = match.group()
	return _ABBREVIATIONS.get(token.upper(), token)


def preprocess(text):
	"""Normalize raw text before it is handed to the model pipeline.

	Steps, in order: lower-case, strip HTML tags, strip URLs, delete ASCII
	punctuation, delete emojis in the listed Unicode ranges, then expand known
	chat abbreviations token by token.

	Args:
		text: The raw input string.

	Returns:
		The cleaned string. Whitespace is preserved exactly (removed spans are
		not replaced by anything), and abbreviation expansions are inserted
		verbatim, so a few of them contain capital letters.
	"""
	text = text.lower()
	text = _HTML_PATTERN.sub('', text)
	# URLs must go before punctuation removal, which would mangle "://" and ".".
	text = _URLS_PATTERN.sub('', text)
	text = text.translate(_PUNCT_TABLE)
	text = _EMOJI_PATTERN.sub('', text)
	# Substituting on \S+ (rather than splitting on " ") also catches
	# abbreviations that sit next to tabs or newlines, without touching the
	# original whitespace.
	return _TOKEN_PATTERN.sub(_expand_abbreviation, text)