Spaces:

service-internal
/

sentiment-analysis

Sleeping

App Files Files Community

service-internal committed 29 days ago

Commit

0191339

verified ·

1 Parent(s): 0a4ba60

Update main.py

Browse files

Files changed (1) hide show

main.py +46 -60

main.py CHANGED Viewed

@@ -1,63 +1,49 @@
-import os
-os.environ["TRANSFORMERS_CACHE"] = "/tmp/hf-cache"
-os.environ["HF_HOME"] = "/tmp/hf-home"
-from fastapi import FastAPI, Request
-from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
-from scipy.special import softmax
-import numpy as np
-app = FastAPI()
-MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
-# Load model and tokenizer
-tokenizer = AutoTokenizer.from_pretrained(MODEL)
-config = AutoConfig.from_pretrained(MODEL)
-model = AutoModelForSequenceClassification.from_pretrained(MODEL)
-# Preprocessing step for Twitter-style input
-def preprocess(text):
-    tokens = []
-    for t in text.split():
-        if t.startswith("@") and len(t) > 1:
-            t = "@user"
-        elif t.startswith("http"):
-            t = "http"
-        tokens.append(t)
-    return " ".join(tokens)
 @app.post("/analyze")
 async def analyze(request: Request):
     data = await request.json()
-    raw_text = data.get("text", "")
-    # Logging for debugging
-    print(f"Raw input: {raw_text}")
-    if not raw_text.strip():
-        return {"error": "Empty input text."}
-    text = preprocess(raw_text)
-    print(f"Preprocessed: {text}")
-    encoded_input = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
-    print(f"Encoded input: {encoded_input.input_ids}")
-    output = model(**encoded_input)
-    scores = output[0][0].detach().numpy()
-    probs = softmax(scores)
-    # Logging output
-    print(f"Raw scores: {scores}")
-    print(f"Softmax probs: {probs}")
-    result = [
-        {"label": config.id2label[i], "score": round(float(probs[i]), 4)}
-        for i in probs.argsort()[::-1]
-    ]
-    print(f"Result: {result}")
-    return {"result": result}

 @app.post("/analyze")
 async def analyze(request: Request):
     data = await request.json()
+    text = preprocess(data.get("text", ""))
+    if not text.strip():
+        return {"error": "Empty input"}
+    # Tokenize to check length without truncating
+    tokenized = tokenizer(text, return_tensors='pt', add_special_tokens=True)
+    num_tokens = tokenized.input_ids.shape[1]
+    if num_tokens <= 512:
+        # ✅ Use direct inference for short inputs
+        encoded_input = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
+        output = model(**encoded_input)
+        scores = output[0][0].detach().numpy()
+        probs = softmax(scores)
+        result = [
+            {"label": config.id2label[i], "score": round(float(probs[i]), 4)}
+            for i in probs.argsort()[::-1]
+        ]
+        return {"result": result}
+    else:
+        # ✅ Long input: Split into chunks of ~500 words
+        max_words = 500
+        words = text.split()
+        chunks = [" ".join(words[i:i + max_words]) for i in range(0, len(words), max_words)]
+        all_scores = []
+        for chunk in chunks:
+            encoded_input = tokenizer(chunk, return_tensors='pt', truncation=True, padding=True, max_length=512)
+            output = model(**encoded_input)
+            scores = output[0][0].detach().numpy()
+            probs = softmax(scores)
+            all_scores.append(probs)
+        # Average softmax scores
+        avg_scores = np.mean(all_scores, axis=0)
+        result = [
+            {"label": config.id2label[i], "score": round(float(avg_scores[i]), 4)}
+            for i in avg_scores.argsort()[::-1]
+        ]
+        return {"result": result}