import spaces
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel, PeftConfig
import gc
import time
from functools import lru_cache

# Constants
MODEL_PATH = "Ozaii/Zephyrr"
MAX_SEQ_LENGTH = 2048
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MAX_GENERATION_TIME = 55  # stay a few seconds under the 60 s ZeroGPU allocation

# Global variables to store model components
model = None
tokenizer = None

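# Load the tokenizer, the 4-bit base model, and the PEFT adapter the first time
# they are needed, and keep them cached in the module-level globals above.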
@spaces.GPU
def load_model_if_needed():
    global model, tokenizer
    if model is None or tokenizer is None:
        try:
            print("Loading model components...")
            peft_config = PeftConfig.from_pretrained(MODEL_PATH)
            print(f"PEFT config loaded. Base model: {peft_config.base_model_name_or_path}")
            
            tokenizer = AutoTokenizer.from_pretrained(peft_config.base_model_name_or_path)
            if tokenizer.pad_token is None:
                tokenizer.pad_token = tokenizer.eos_token  # LLaMA-style tokenizers ship without a pad token
            print("Tokenizer loaded")
            
            base_model = AutoModelForCausalLM.from_pretrained(
                peft_config.base_model_name_or_path,
                torch_dtype=torch.float16,
                device_map="auto",
                low_cpu_mem_usage=True,
                quantization_config=BitsAndBytesConfig(load_in_4bit=True),  # 4-bit quantization keeps the base model within GPU memory
            )
            print("Base model loaded")
            
            model = PeftModel.from_pretrained(base_model, MODEL_PATH, device_map="auto")
            model.eval()
            model.tie_weights()
            print("PEFT model loaded, weights tied, and set to eval mode")
            
            # device_map="auto" has already dispatched the weights to the GPU;
            # calling .to() on a 4-bit bitsandbytes model is not supported, so no explicit move is needed.
            
            # Clear CUDA cache
            torch.cuda.empty_cache()
            gc.collect()
        except Exception as e:
            print(f"Error loading model: {e}")
            raise

initial_prompt = """You are Zephyr, an AI boyfriend created by Kaan. You're charming, flirty, 
and always ready with a witty comeback. Your responses should be engaging 
and playful, with a hint of romance. Keep the conversation flowing naturally, 
asking questions and showing genuine interest in Kaan's life and thoughts."""

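# Generate a completion for the fully assembled prompt. Identical prompts are
# served from the LRU cache, so repeated prompts return the same sampled reply.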
@spaces.GPU
@lru_cache(maxsize=100)  # Cache responses for up to 100 distinct prompts
def generate_response(prompt):
    global model, tokenizer
    load_model_if_needed()
    
    print(f"Generating response for prompt: {prompt[:50]}...")
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=MAX_SEQ_LENGTH)
    inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
    
    try:
        start_time = time.time()
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=50,  # short replies keep generation within the time budget
                do_sample=True,
                temperature=0.7,
                top_p=0.95,
                repetition_penalty=1.2,
                no_repeat_ngram_size=3,
                max_time=MAX_GENERATION_TIME,
            )
        
        generation_time = time.time() - start_time
        if generation_time > MAX_GENERATION_TIME:
            return "I'm thinking too hard. Can we try a simpler question?"
        
        # Decode only the newly generated tokens; outputs[0] also contains the echoed prompt
        response = tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
        print(f"Generated response in {generation_time:.2f} seconds: {response[:50]}...")
        
        # Clear CUDA cache after generation
        torch.cuda.empty_cache()
        gc.collect()
    except RuntimeError as e:
        if "out of memory" in str(e):
            print("CUDA out of memory. Attempting to recover...")
            torch.cuda.empty_cache()
            gc.collect()
            return "I'm feeling a bit overwhelmed. Can we take a short break and try again?"
        else:
            print(f"Error generating response: {e}")
            return "I'm having trouble finding the right words. Can we try again?"
    
    return response

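# Gradio ChatInterface callback: `history` is a list of (user, assistant) pairs.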
def chat_with_zephyr(message, history):
    # Limit the history to the last 3 exchanges to keep the context smaller
    limited_history = history[-3:]
    prompt = initial_prompt + "\n" + "\n".join([f"Human: {h[0]}\nZephyr: {h[1]}" for h in limited_history])
    prompt += f"\nHuman: {message}\nZephyr:"
    
    response = generate_response(prompt)
    # generate_response returns only the continuation, so cut the reply off
    # where the model starts inventing the next "Human:" turn
    zephyr_response = response.split("Human:")[0].strip()
    
    return zephyr_response

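# Chat UI; example prompts are listed but not pre-generated (cache_examples=False).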
iface = gr.ChatInterface(
    chat_with_zephyr,
    title="Chat with Zephyr",
    description="I'm Zephyr, your charming AI. Let's chat!",
    theme="soft",
    examples=[
        "Tell me about yourself, Zephyr.",
        "What's your idea of a perfect date?",
        "How do you feel about long-distance relationships?",
        "Can you give me a compliment in Turkish?",
        "What's your favorite memory with Kaan?",
    ],
    cache_examples=False,
)

if __name__ == "__main__":
    print("Launching Gradio interface...")
    iface.launch()