Spaces:
Sleeping
Sleeping
async def analyze(request: Request):
    """Classify the text from the JSON request body.

    Expects a body like ``{"text": "..."}``. Inputs of at most 512 tokens
    are scored with a single forward pass; longer inputs are split into
    ~500-word chunks whose softmax distributions are averaged.

    Returns:
        ``{"error": ...}`` for empty input, otherwise ``{"result": [...]}``
        where the result is a list of ``{"label", "score"}`` dicts sorted
        by descending score.
    """
    data = await request.json()
    text = preprocess(data.get("text", ""))
    if not text.strip():
        return {"error": "Empty input"}

    # Tokenize once, untruncated, to measure the true token length.
    tokenized = tokenizer(text, return_tensors='pt', add_special_tokens=True)
    num_tokens = tokenized.input_ids.shape[1]

    if num_tokens <= 512:
        # Short input fits the model. Reuse the encoding produced by the
        # length check instead of tokenizing the same text a second time
        # (truncation/padding would be no-ops at this length anyway).
        return {"result": _ranked_labels(_infer_probs(tokenized))}

    # Long input: split into ~500-word chunks, score each chunk, and
    # average the softmax distributions.
    # NOTE(review): word-based chunking can still exceed 512 *tokens* per
    # chunk; truncation=True below guards against that, at the cost of
    # dropping the chunk's tail.
    max_words = 500
    words = text.split()
    chunks = [" ".join(words[i:i + max_words])
              for i in range(0, len(words), max_words)]
    all_scores = [
        _infer_probs(tokenizer(chunk, return_tensors='pt', truncation=True,
                               padding=True, max_length=512))
        for chunk in chunks
    ]
    avg_scores = np.mean(all_scores, axis=0)
    return {"result": _ranked_labels(avg_scores)}


def _infer_probs(encoded_input):
    """Run one forward pass and return softmax probabilities as a 1-D array."""
    output = model(**encoded_input)
    # First output tensor, first (only) sequence in the batch.
    scores = output[0][0].detach().numpy()
    return softmax(scores)


def _ranked_labels(probs):
    """Format a probability vector as label/score dicts, best score first."""
    return [
        {"label": config.id2label[i], "score": round(float(probs[i]), 4)}
        for i in probs.argsort()[::-1]
    ]