# File size: 1,727 Bytes
# d4829ef 9e290f8 d4829ef 9e290f8 d4829ef 9e290f8 4d6d610 d4829ef 9e290f8 |
# 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 |
import string
import re
import joblib
from pydantic import BaseModel
# SCHEMA
class Schema(BaseModel):
    """Pydantic model with a single string field, ``text``.

    NOTE(review): presumably this is the request body handed to
    ``movie_reviews`` as ``req`` (which reads ``req.text``) -- verify
    against the caller that wires the endpoint.
    """

    # Raw text to be classified.
    text: str
# Request Handler
def movie_reviews(req):
    """Handle one request by running ``predict`` on its text.

    Args:
        req: Object exposing a ``text`` attribute holding the input string.

    Returns:
        Whatever ``predict`` returns for ``req.text``.
    """
    return predict(req.text)
# PIPELINE
PIPELINE_PATH = "./src/movie_reviews/pipeline.pkl"

# Pipelines already loaded from disk, keyed by the path they were read from.
# Keying on the path means a changed PIPELINE_PATH still triggers a fresh load.
_LOADED_PIPELINES = {}


def _get_pipeline():
    """Return the pipeline stored at ``PIPELINE_PATH``, loading it only once."""
    path = PIPELINE_PATH
    if path not in _LOADED_PIPELINES:
        # NOTE(security): joblib.load unpickles the file, which can execute
        # arbitrary code -- the pipeline file must come from a trusted source.
        _LOADED_PIPELINES[path] = joblib.load(path)
    return _LOADED_PIPELINES[path]


def predict(text):
    """Preprocess ``text`` and return the pipeline's class probabilities.

    The pipeline is read from disk on the first call and reused afterwards,
    instead of being deserialized again for every request.

    Args:
        text: Raw input string.

    Returns:
        The result of ``predict_proba`` for the single cleaned input, rounded
        to 3 decimals and converted with ``.tolist()``.
        NOTE(review): assumes ``predict_proba`` returns a NumPy-style array
        (it must support ``.round`` and ``.tolist``) -- confirm against the
        pickled pipeline.

    Raises:
        Any exception raised by ``joblib.load`` if the pipeline file cannot be
        read (e.g. it does not exist).
    """
    cleaned = preprocess(text)
    return _get_pipeline().predict_proba([cleaned]).round(3).tolist()
# Chat-style abbreviations and their expansions.  Every key is upper-case
# because preprocess() looks words up with ``word.upper()``; a lower-case key
# could never match.
# NOTE(review): the expansion strings are kept exactly as they were (spelling
# and casing included); confirm against the data the pipeline was fitted on
# before correcting them.
_ABBREVIATIONS = {
    "AFAIK": "as far as I know",
    "IMO": "in my opinion",
    "IMHO": "in my humble opinion",
    "LGTM": "look good to me",
    "AKA": "also know as",
    "ASAP": "as sone as possible",
    "BTW": "by the way",
    "FAQ": "frequently asked questions",
    "DIY": "do it yourself",
    "DM": "direct message",
    "FYI": "for your information",
    "IC": "i see",
    "IOW": "in other words",
    "IIRC": "If I Remember Correctly",
    "ICYMI": "In case you missed it",
    "CUZ": "because",
    "COS": "because",
    "NV": "nevermind",
    "PLZ": "please",
}

# Compiled once at import time rather than on every call.
_HTML_PATTERN = re.compile('<.*?>')
_URLS_PATTERN = re.compile(r'https?://\S+|www\.\S+')
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "]+",
    flags=re.UNICODE,
)
# Translation table that deletes every character in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def preprocess(text):
    """Normalize raw text before it is passed to the pipeline.

    Steps, in order: lowercase, strip HTML tags, strip URLs, delete ASCII
    punctuation, delete emojis in the ranges listed in ``_EMOJI_PATTERN``,
    then replace known abbreviations with their expansions.

    Args:
        text: Raw input string.

    Returns:
        The cleaned string.  Removed spans are deleted rather than replaced,
        so the spaces that surrounded them remain in the output.
    """
    text = text.lower()
    text = _HTML_PATTERN.sub('', text)
    text = _URLS_PATTERN.sub('', text)
    text = text.translate(_PUNCTUATION_TABLE)
    text = _EMOJI_PATTERN.sub('', text)
    # Split on single spaces (not arbitrary whitespace) so that joining with
    # " " reproduces the original spacing exactly.
    return " ".join(
        _ABBREVIATIONS.get(word.upper(), word) for word in text.split(" ")
    )