from fastapi import FastAPI
from pydantic import BaseModel
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import StreamingResponse
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Define model ID
# Note: Qwen2.5-VL is a vision-language checkpoint; it normally loads via
# Qwen2_5_VLForConditionalGeneration, while a text-only model such as
# "Qwen/Qwen2.5-7B-Instruct" is the usual fit for AutoModelForCausalLM.
model_id = "Qwen/Qwen2.5-VL-7B-Instruct"

# Download model and tokenizer locally
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",  # Use device_map="cpu" if you want to force CPU
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,  # GPU: float16, CPU: float32
    trust_remote_code=True,
)
model.eval()

# Initialize FastAPI
app = FastAPI()

# CORS settings
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Request model
class Question(BaseModel):
    question: str

# Generate response chunks
async def generate_response_chunks(prompt: str):
    try:
        # Define system prompt
        system_prompt = (
            "You are Orion, an AI assistant created by Abdullah Ali, "
            "who is very intelligent, 13 years old, and lives in Lahore."
        )
        # For Qwen instruct models, tokenizer.apply_chat_template is the more
        # idiomatic way to build the prompt; a plain "User:/Assistant:" string
        # works but may give weaker results.
        full_prompt = f"{system_prompt}\n\nUser: {prompt}\nAssistant:"

        # Tokenize input
        inputs = tokenizer(full_prompt, return_tensors="pt").to(model.device)

        # Generate output (blocking; runs to completion before anything is yielded)
        output_ids = model.generate(
            **inputs,
            max_new_tokens=512,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.1,
        )

        # Decode only the newly generated tokens (skip the prompt)
        output_text = tokenizer.decode(
            output_ids[0][inputs.input_ids.shape[-1]:],
            skip_special_tokens=True,
        )

        # Stream the finished text character by character
        # (not true token streaming; see the sketch at the end of the file)
        for letter in output_text:
            yield letter
    except Exception as e:
        yield f"Error occurred: {e}"

# API endpoint
@app.post("/ask")
async def ask(question: Question):
    return StreamingResponse(
        generate_response_chunks(question.question),
        media_type="text/plain",
    )
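
# ------------------------------------------------------------------
# Optional: true token-by-token streaming (a sketch, not wired into the
# endpoint above). generate_response_chunks waits for the full completion
# and only then yields it character by character; transformers'
# TextIteratorStreamer can instead emit text as tokens are generated.
# generate_response_stream below is a hypothetical alternative name; swap
# it into the /ask endpoint if you want incremental output.
from threading import Thread
from transformers import TextIteratorStreamer

async def generate_response_stream(prompt: str):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    # skip_prompt=True so only newly generated text is streamed back
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
    )
    # generate() blocks, so run it in a background thread and consume the
    # streamer as tokens arrive
    Thread(target=model.generate, kwargs=generation_kwargs).start()
    for text_chunk in streamer:
        yield text_chunk

# Example usage (assumes this file is saved as main.py):
#   uvicorn main:app --host 0.0.0.0 --port 8000
#   curl -N -X POST http://localhost:8000/ask \
#        -H "Content-Type: application/json" \
#        -d '{"question": "Hello, who are you?"}'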