File size: 1,802 Bytes
d4829ef
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e577af3
d4829ef
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import string
import re
import joblib
from pydantic import BaseModel

# SCHEMA
# Pydantic model with a single required string field.
# NOTE(review): presumably the request object handed to movie_reviews(), which
# reads `.text` from its argument — confirm against the caller/framework.
class Schema(BaseModel):
	text: str  # the text to classify
        
# Request Handler
def movie_reviews(req):
    """Handle one request by classifying the text it carries.

    Args:
        req: Any object exposing a ``text`` attribute holding the review.

    Returns:
        Whatever ``predict`` returns for ``req.text``.
    """
    return predict(req.text)

# PREPROCESSING
# Every ASCII punctuation character; preprocess() deletes these from the text.
punc = string.punctuation

# Chat/forum abbreviations and the phrase each one is expanded to.
#
# Keys MUST be upper-case: preprocess() looks tokens up with
# ``abbv.get(word.upper(), word)``, so a lower-case key can never match.
# ("icymi" and "nv" used to be lower-case and were therefore dead entries.)
#
# Fixed expansion typos: "look good to me", "also know as",
# "as sone as possible".
# NOTE(review): if the pickled pipeline was fit on text expanded with the old
# table, re-validate it — the tokens produced for these abbreviations changed.
abbv = {
    "AFAIK": "as far as I know",
    "IMO": "in my opinion",
    "IMHO": "in my humble opinion",
    "LGTM": "looks good to me",
    "AKA": "also known as",
    "ASAP": "as soon as possible",
    "BTW": "by the way",
    "FAQ": "frequently asked questions",
    "DIY": "do it yourself",
    "DM": "direct message",
    "FYI": "for your information",
    "IC": "i see",
    "IOW": "in other words",
    "IIRC": "If I Remember Correctly",
    "ICYMI": "In case you missed it",
    "CUZ": "because",
    "COS": "because",
    "NV": "nevermind",
    "PLZ": "please",
}

# Non-greedy match of anything between '<' and '>'. '.' does not match a
# newline here, so a tag broken across lines is not matched.
html_pattern = re.compile(r"<.*?>")

# http:// or https:// URLs, plus bare "www." hosts, up to the next whitespace.
urls_pattern = re.compile(r"https?://\S+|www\.\S+")

# Runs of code points from four emoji-related Unicode ranges.
emoji_pattern = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "]+",
    flags=re.UNICODE,
)

# PIPELINE
# Loaded once, as a side effect of importing this module. The path is relative,
# so it resolves against the process's current working directory rather than
# this file's location.
# NOTE(security): joblib.load unpickles the file, which can execute arbitrary
# code — only ever point this at a trusted artifact.
# NOTE(review): presumably a fitted scikit-learn Pipeline; this file only calls
# its .predict() method — confirm.
pipeline = joblib.load("./src/movie_reviews/pipeline.pkl")

def predict(text):
    """Classify one raw review and return fixed pseudo-scores for it.

    The text is cleaned with ``preprocess`` and handed to
    ``pipeline.predict`` as a single-item batch. The predicted label is used
    directly as a list index, so it has to be 0 or 1 for both slots to be
    filled.

    Args:
        text: The raw review string.

    Returns:
        A one-element list wrapping ``[score_0, score_1]``: the predicted
        label's slot holds 0.8 and the other slot holds 0.2. These values are
        hard-coded constants, not probabilities produced by the model.
    """
    label = pipeline.predict([preprocess(text)])[0]
    scores = [0, 0]
    # Targets are assigned left to right: predicted class first, then the other.
    scores[label], scores[1 - label] = 0.8, 0.2
    return [scores]

def preprocess(text):
    """Normalise raw review text before it is passed to the model pipeline.

    Steps, in order:
      1. lower-case the text;
      2. delete HTML tags (``html_pattern``);
      3. delete URLs (``urls_pattern``);
      4. delete every character in ``punc``;
      5. delete emoji (``emoji_pattern``);
      6. replace each whitespace-delimited token found in ``abbv`` with its
         expansion.

    Tokens are looked up by their upper-cased form, so only upper-case keys
    in ``abbv`` can match. Expansions are inserted verbatim and may contain
    capital letters even though the rest of the text is lower-case.

    Tokenisation uses any whitespace as a separator (previously only the space
    character), so an abbreviation next to a tab or newline is expanded too.
    All whitespace is preserved exactly; for text separated only by spaces the
    result is unchanged.

    NOTE(review): tags and URLs are replaced with the empty string, so words
    separated only by a tag (e.g. ``good<br />film``) are joined into a single
    token. Left as-is because the pickled pipeline may have been fit on text
    cleaned this way — confirm before changing.

    Args:
        text: The raw review string.

    Returns:
        The cleaned string.
    """
    text = text.lower()
    text = html_pattern.sub("", text)
    text = urls_pattern.sub("", text)
    text = text.translate(str.maketrans("", "", punc))
    text = emoji_pattern.sub("", text)

    def _expand(match):
        # Swap a token for its expansion when it is a known abbreviation.
        token = match.group()
        return abbv.get(token.upper(), token)

    # Rewrite tokens in place so the original whitespace is preserved.
    return re.sub(r"\S+", _expand, text)