import os

# Point the Hugging Face caches at writable /tmp paths; this must happen
# before transformers is imported so the library picks them up
os.environ["TRANSFORMERS_CACHE"] = "/tmp/hf-cache"
os.environ["HF_HOME"] = "/tmp/hf-home"

import torch
from fastapi import FastAPI, Request
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
from scipy.special import softmax
import numpy as np

# ✅ Define app BEFORE any @app.post route decorators
app = FastAPI()

MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
model.eval()  # inference only: disable dropout

def preprocess(text):
    # Mask user handles and URLs, matching the preprocessing the
    # Twitter-RoBERTa model saw during training
    tokens = []
    for t in text.split():
        if t.startswith("@") and len(t) > 1:
            t = "@user"
        elif t.startswith("http"):
            t = "http"
        tokens.append(t)
    return " ".join(tokens)

@app.post("/analyze")
async def analyze(request: Request):
    data = await request.json()
    text = preprocess(data.get("text", ""))

    if not text.strip():
        return {"error": "Empty input"}

    # Token length check: tokenize once and reuse the encoding if it fits
    tokenized = tokenizer(text, return_tensors='pt', add_special_tokens=True)
    if tokenized.input_ids.shape[1] <= 512:
        with torch.no_grad():
            output = model(**tokenized)
        probs = softmax(output.logits[0].numpy())
    else:
        # Long input: split into ~500-word chunks (each truncated to the
        # 512-token window) and average the per-chunk probabilities
        max_words = 500
        words = text.split()
        chunks = [" ".join(words[i:i + max_words]) for i in range(0, len(words), max_words)]
        all_probs = []
        for chunk in chunks:
            encoded_input = tokenizer(chunk, return_tensors='pt', truncation=True, max_length=512)
            with torch.no_grad():
                output = model(**encoded_input)
            all_probs.append(softmax(output.logits[0].numpy()))
        probs = np.mean(all_probs, axis=0)

    # Define the fixed order
    fixed_order = ["negative", "neutral", "positive"]

    # Build the result using fixed order
    result = [
        {"label": label, "score": round(float(probs[config.label2id[label]]), 4)}
        for label in fixed_order
    ]

    return {"result": result}
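
# To try the endpoint locally (a sketch; assumes this file is saved as
# app.py and uvicorn is installed):
#
#   uvicorn app:app --host 0.0.0.0 --port 8000
#   curl -X POST http://localhost:8000/analyze \
#        -H "Content-Type: application/json" \
#        -d '{"text": "I love this!"}'
#
# which returns something like:
#   {"result": [{"label": "negative", "score": ...},
#               {"label": "neutral",  "score": ...},
#               {"label": "positive", "score": ...}]}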