from fastapi import FastAPI
from pydantic import BaseModel
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import StreamingResponse
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import os
import asyncio

# Set cache directories (note: transformers may read these env vars at import
# time, so the explicit cache_dir arguments below are what reliably take effect)
cache_dir = "/tmp/hf_home"
os.environ["HF_HOME"] = cache_dir
os.environ["TRANSFORMERS_CACHE"] = cache_dir
os.environ["HUGGINGFACE_HUB_CACHE"] = cache_dir
os.makedirs(cache_dir, exist_ok=True)
os.chmod(cache_dir, 0o777)

# Load model and tokenizer
model_name = "microsoft/DialoGPT-small"
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir=cache_dir)

# Set pad token if not defined
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Initialize FastAPI
app = FastAPI()

# Enable CORS (note: browsers reject a wildcard origin combined with
# credentials; list explicit origins if the client needs to send cookies)
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

class Question(BaseModel):
    question: str

# DialoGPT is not instruction-tuned, so this "system prompt" is treated as
# ordinary dialogue text rather than an instruction the model will follow
SYSTEM_PROMPT = "You are a helpful, professional, and highly persuasive sales assistant..."

# Shared across all requests; fine for a single-user demo, but concurrent
# clients will see each other's conversation history
chat_history_ids = None

async def generate_response_chunks(prompt: str):
    global chat_history_ids
    # Combine system prompt and user input
    input_text = SYSTEM_PROMPT + "\nUser: " + prompt + "\nBot:"
    new_input_ids = tokenizer.encode(input_text, return_tensors='pt').to(device)
    # Create attention mask (handle case where pad_token_id might be None)
    attention_mask = torch.ones_like(new_input_ids)
    if chat_history_ids is not None:
        input_ids = torch.cat([chat_history_ids, new_input_ids], dim=-1)
        attention_mask = torch.cat([
            torch.ones_like(chat_history_ids),
            attention_mask
        ], dim=-1)
    else:
        input_ids = new_input_ids
    # Generate response
    output_ids = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_new_tokens=200,
        do_sample=True,
        top_p=0.9,
        temperature=0.7,
        pad_token_id=tokenizer.eos_token_id
    )
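    # Note: model.generate() is a blocking call, so it stalls the event loop
    # while it runs. A minimal sketch of one fix (an assumption, not part of
    # the original code) is to offload it to a worker thread:
    #
    #   output_ids = await asyncio.to_thread(
    #       model.generate,
    #       input_ids,
    #       attention_mask=attention_mask,
    #       max_new_tokens=200,
    #       do_sample=True,
    #       top_p=0.9,
    #       temperature=0.7,
    #       pad_token_id=tokenizer.eos_token_id,
    #   )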
    # Update chat history (grows without bound; DialoGPT's context window is
    # 1024 tokens, so long sessions will eventually overflow it)
    chat_history_ids = output_ids
    # Decode only the new tokens
    response = tokenizer.decode(output_ids[:, input_ids.shape[-1]:][0], skip_special_tokens=True)
    # Stream the response word by word (generation has already finished at
    # this point, so this simulates streaming rather than streaming live)
    for word in response.split():
        yield word + " "
        await asyncio.sleep(0.03)

# The route decorator was missing, so this endpoint was never registered;
# the "/ask" path is an assumption, adjust it to match your client
@app.post("/ask")
async def ask(question: Question):
    return StreamingResponse(
        generate_response_chunks(question.question),
        media_type="text/plain"
    )
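
# --- Example client (a sketch, run as a separate script) ---
# Assumes the "/ask" route above and a server reachable at
# http://localhost:7860 (the usual Spaces port); both the path and the
# host/port are assumptions, so adjust them to your deployment.
#
#   import requests
#
#   with requests.post(
#       "http://localhost:7860/ask",
#       json={"question": "Why should I buy this laptop?"},
#       stream=True,
#   ) as resp:
#       resp.raise_for_status()
#       # Print chunks as they arrive from the StreamingResponse
#       for chunk in resp.iter_content(chunk_size=None, decode_unicode=True):
#           print(chunk, end="", flush=True)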