Spaces:

usag1e
/

my-llm-endpoint-fresh

Runtime error

usag1e

Add accelerate to requirements

7ac3def 11 months ago

1.14 kB

	from fastapi import FastAPI, HTTPException
	from pydantic import BaseModel
	from transformers import AutoModelForCausalLM, AutoTokenizer
	import torch

	# Load the tokenizer and model once at import time; request handlers below
	# reuse these module-level objects.
	MODEL_NAME = "deepseek-ai/DeepSeek-V3-Base"  # Change to the model you want
	# Device that request tensors are moved to before generation.
	device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
	tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
	# device_map="auto" already places the weights on the available devices.
	# The loaded model must not be moved again with .to(...): Accelerate warns
	# about moving a dispatched model and raises if any module was offloaded.
	model = AutoModelForCausalLM.from_pretrained(
	    MODEL_NAME,
	    device_map="auto",
	    trust_remote_code=True,  # Allow execution of custom code
	    low_cpu_mem_usage=True,  # Ensures reduced memory usage
	)

	app = FastAPI()

	# Request body schema for POST /predict.
	class Query(BaseModel):
	    # Prompt text that is tokenized and passed to the model.
	    input_text: str

	@app.post("/predict")
	async def predict(query: Query):
	    """Generate text from ``query.input_text`` with the module-level model.

	    Returns:
	        A dict ``{"response": text}`` where ``text`` is the first generated
	        sequence decoded with special tokens skipped.

	    Raises:
	        HTTPException: status 400 when the input text is empty or contains
	            only whitespace.
	    """
	    input_text = query.input_text
	    # A whitespace-only prompt has no usable content, so reject it as empty.
	    if not input_text.strip():
	        raise HTTPException(status_code=400, detail="Input text cannot be empty.")

	    # Move the tensors to wherever the model actually lives, so this stays
	    # correct when the weights were placed by device_map.
	    encoded = tokenizer(input_text, return_tensors="pt").to(model.device)

	    # NOTE(review): generate() is a blocking call inside an async handler, so
	    # other requests wait while it runs; consider offloading it to a worker.
	    outputs = model.generate(
	        encoded["input_ids"],
	        # Pass the mask explicitly so generate() does not have to infer it.
	        attention_mask=encoded.get("attention_mask"),
	        max_new_tokens=50,
	        # temperature is only applied when sampling is enabled; without this
	        # flag decoding is greedy and the temperature is ignored.
	        do_sample=True,
	        temperature=0.7,
	    )
	    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
	    return {"response": response}