from fastapi import FastAPI, HTTPException
from onnxruntime import InferenceSession
from transformers import AutoTokenizer
import numpy as np

app = FastAPI()

# Initialize tokenizer (doesn't require PyTorch/TensorFlow)
tokenizer = AutoTokenizer.from_pretrained(
    "Xenova/multi-qa-mpnet-base-dot-v1",
    use_fast=True  # Uses the Rust tokenizer implementation
)

# Load ONNX model
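# (Assumption) model.onnx is a local copy of the model's ONNX export; for the Xenova
# repo it can typically be fetched from the Hub, e.g.:
#   from huggingface_hub import hf_hub_download
#   model_path = hf_hub_download("Xenova/multi-qa-mpnet-base-dot-v1", "onnx/model.onnx")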
session = InferenceSession("model.onnx")

@app.post("/api/predict")
async def predict(text: str):
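    # `text` is a plain str parameter, so FastAPI reads it from the query string (?text=...)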
    try:
        # Tokenize without framework dependencies
        inputs = tokenizer(
            text,
            return_tensors="np",  # Get NumPy arrays directly
            padding=True,
            truncation=True,
            max_length=32  # Match your model's expected input size
        )
        
        # Prepare ONNX inputs
        onnx_inputs = {
            "input_ids": inputs["input_ids"].astype(np.int64),
            "attention_mask": inputs["attention_mask"].astype(np.int64)
        }
        
        # Run inference
        outputs = session.run(None, onnx_inputs)
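        # outputs[0] is assumed to be the token-level last_hidden_state,
        # with shape (batch, seq_len, 768) for an mpnet-base model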
        
        # Convert to native Python types
        return {
            "embedding": outputs[0].astype(np.float32).tolist(),
            "tokens": tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
        }
        
    except Exception as e:
        raise HTTPException(status_code=400, detail=str(e))
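
if __name__ == "__main__":
    # Convenience entry point: a minimal sketch assuming uvicorn is installed
    # alongside FastAPI (it is not imported in the original setup above).
    # Once the server is up, the endpoint takes `text` as a query parameter, e.g.:
    #   curl -X POST "http://localhost:8000/api/predict?text=hello%20world"
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)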