from fastapi import FastAPI, HTTPException
from onnxruntime import InferenceSession
from transformers import AutoTokenizer
import numpy as np
import os

app = FastAPI()

# Initialize tokenizer (doesn't require PyTorch/TensorFlow)
tokenizer = AutoTokenizer.from_pretrained(
"Xenova/multi-qa-mpnet-base-dot-v1",
use_fast=True, # Uses Rust implementation
legacy=False
)

# Load ONNX model
session = InferenceSession("model.onnx")


@app.post("/api/predict")
async def predict(text: str):
    try:
        # Tokenize without framework dependencies
        inputs = tokenizer(
            text,
            return_tensors="np",  # Get NumPy arrays directly
            padding=True,
            truncation=True,
            max_length=32  # Match your model's expected input size
        )

        # Prepare ONNX inputs
        onnx_inputs = {
            "input_ids": inputs["input_ids"].astype(np.int64),
            "attention_mask": inputs["attention_mask"].astype(np.int64)
        }

        # Run inference
        outputs = session.run(None, onnx_inputs)

        # Convert to native Python types
        return {
            "embedding": outputs[0].astype(np.float32).tolist(),
            "tokens": tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
        }
    except Exception as e:
        raise HTTPException(status_code=400, detail=str(e))
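

# Usage sketch: one minimal way to launch and call this service, assuming the
# file is saved as `main.py` and served on the default port 8000.
#
#   uvicorn main:app --host 0.0.0.0 --port 8000
#
#   curl -X POST "http://localhost:8000/api/predict?text=hello%20world"
#
# Note: because `text` is declared as a plain `str` parameter, FastAPI reads it
# from the query string rather than from a JSON request body.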