Spaces:

eyad-silx
/

Quasar

Runtime error

File size: 7,216 Bytes

import streamlit as st
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import time
from concurrent.futures import ThreadPoolExecutor, TimeoutError
import logging

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

@st.cache_resource
def load_model():
    """Load the causal language model and its tokenizer.

    The function is wrapped in ``st.cache_resource`` so the load is meant to
    happen once rather than on every Streamlit rerun.

    8-bit quantization is only requested when CUDA is available.
    NOTE(review): bitsandbytes 8-bit loading normally needs a GPU, so asking
    for it on a CPU-only host is expected to fail -- confirm against the
    deployed bitsandbytes/transformers versions.

    Returns:
        tuple: ``(model, tokenizer)`` on success, ``(None, None)`` if loading
        failed (the error is logged and shown in the UI).
    """
    model_id = "NousResearch/Llama-3.2-1B"
    try:
        logger.info("Starting model loading...")

        has_cuda = torch.cuda.is_available()
        load_kwargs = {
            "device_map": "auto",        # Automatically handle device placement
            "low_cpu_mem_usage": True,
            "torch_dtype": torch.float16 if has_cuda else torch.float32,
        }
        if has_cuda:
            load_kwargs["load_in_8bit"] = True  # Quantize only where supported

        # st.spinner is a context manager: it only renders while entered.
        with st.spinner("Loading model... This may take a few minutes"):
            model = AutoModelForCausalLM.from_pretrained(model_id, **load_kwargs)
            tokenizer = AutoTokenizer.from_pretrained(model_id)

        # Set up padding token; keep model config in sync with the tokenizer
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
            model.config.pad_token_id = tokenizer.pad_token_id

        logger.info("Model loaded successfully")
        return model, tokenizer
    except Exception as e:
        # NOTE(review): the (None, None) result is presumably cached as well,
        # so a failed load likely persists until the cache is cleared.
        logger.exception("Error loading model: %s", e)
        st.error(f"Error loading model: {str(e)}")
        return None, None

def check_for_repetition(text, threshold=3):
    """Report whether any phrase of ``threshold`` words occurs more than twice.

    The text is split on whitespace and every run of ``threshold`` consecutive
    words (including the final one) is counted as a word tuple. Comparing
    whole words means that differing whitespace between words (newlines,
    double spaces) does not hide a repeat, and a phrase is never matched
    inside longer words. Overlapping occurrences are counted.

    Args:
        text: The generated text to inspect.
        threshold: Number of consecutive words that make up a phrase.

    Returns:
        True if some phrase appears at three or more positions, else False.
        Text with fewer than ``threshold`` words is never repetitive.
    """
    words = text.split()
    if len(words) < threshold:
        return False

    occurrences = {}
    # "+ 1" so the last window of words is examined as well.
    for start in range(len(words) - threshold + 1):
        phrase = tuple(words[start:start + threshold])
        occurrences[phrase] = occurrences.get(phrase, 0) + 1
        if occurrences[phrase] > 2:  # Phrase appears more than twice
            return True
    return False

def generate_response_with_timeout(model, tokenizer, prompt, timeout_seconds=30):
    """Generate a sampled reply to ``prompt`` with a time budget.

    The prompt is tokenized (truncated to 256 tokens), up to 100 new tokens
    are sampled, and only the newly generated tokens are decoded, so the
    reply never contains the prompt even when decoding does not reproduce
    the prompt text exactly.

    Args:
        model: Causal LM exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching ``model``.
        prompt: User text to respond to.
        timeout_seconds: Time budget handed to ``generate`` as ``max_time``;
            generation stops after the step in progress once it is exceeded.
            ``None`` disables the budget.

    Returns:
        str: The reply, a fixed apology when the reply is detected as
        repetitive, or an ``"Error generating response: ..."`` message if
        anything raised.
    """
    try:
        # Prepare the input
        inputs = tokenizer(
            prompt,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=256  # Reduced for CPU
        ).to(model.device)
        prompt_token_count = inputs["input_ids"].shape[-1]

        start_time = time.time()

        with torch.no_grad():
            outputs = model.generate(
                inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                # Limits apply to NEW tokens only; length limits that include
                # the prompt leave no room to answer once the prompt is long.
                max_new_tokens=100,
                min_new_tokens=20,
                max_time=timeout_seconds,  # Bound the work done in the worker
                num_return_sequences=1,
                do_sample=True,
                temperature=0.8,
                top_p=0.92,
                top_k=40,
                repetition_penalty=1.5,  # Increased repetition penalty
                no_repeat_ngram_size=3,  # Prevent 3-gram repetitions
                pad_token_id=tokenizer.pad_token_id,
            )

        generation_time = time.time() - start_time
        logger.info(f"Response generated in {generation_time:.2f} seconds")

        # Drop the echoed prompt by position instead of by string matching.
        new_tokens = outputs[0][prompt_token_count:]
        response = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

        # Repetitive output is replaced by a fallback message (no retry).
        if check_for_repetition(response):
            logger.warning("Detected repetition, returning fallback message")
            return "I apologize, but I'm having trouble generating a coherent response. Could you try rephrasing your question?"

        return response

    except Exception as e:
        logger.exception("Error in generation: %s", e)
        return f"Error generating response: {str(e)}"

# Page config
st.set_page_config(page_title="Chat with Quasar-32B", layout="wide")

# Sidebar: runtime diagnostics followed by the generation settings
with st.sidebar:
    st.write("### System Information")
    st.write("Model: Quasar-32B")

    # Report the device in use and how much memory is currently held
    device = "CPU"
    if torch.cuda.is_available():
        device = "GPU"
    st.write(f"Running on: {device}")

    if device == "GPU":
        gpu_name = torch.cuda.get_device_name(0)
        gpu_mb = torch.cuda.memory_allocated() / 1024**2
        st.write(f"GPU: {gpu_name}")
        st.write(f"Memory Usage: {gpu_mb:.2f} MB")
    else:
        import psutil
        rss_mb = psutil.Process().memory_info().rss / 1024**2
        st.write(f"CPU Memory Usage: {rss_mb:.2f} MB")
        st.write("⚠️ Running on CPU - Responses may be slow")

    # Model settings: seed defaults once, then bind each value to its slider.
    # NOTE: these values are stored in session state only; the generation
    # call further down does not read them.
    st.write("### Model Settings")
    for setting_name, setting_default in (("temperature", 0.8), ("max_length", 100)):
        if setting_name not in st.session_state:
            st.session_state[setting_name] = setting_default

    st.session_state.temperature = st.slider(
        "Temperature",
        min_value=0.1,
        max_value=1.0,
        value=st.session_state.temperature,
    )
    st.session_state.max_length = st.slider(
        "Max Length",
        min_value=50,
        max_value=200,
        value=st.session_state.max_length,
    )

st.title("Chat with Quasar-32B")

# Chat history lives in session state so it survives Streamlit reruns
if "messages" not in st.session_state:
    st.session_state["messages"] = []

# Load model and tokenizer (either may be None if loading failed)
model, tokenizer = load_model()

# Chat interface
st.write("### Chat")
chat_container = st.container()

# Replay every stored message, in order, inside the chat container
with chat_container:
    for entry in st.session_state["messages"]:
        role, content = entry["role"], entry["content"]
        with st.chat_message(role):
            st.write(content)

# User input
if prompt := st.chat_input("Type your message here"):
    # Add user message to chat history
    st.session_state.messages.append({"role": "user", "content": prompt})

    # Display user message
    with chat_container:
        with st.chat_message("user"):
            st.write(prompt)

    # Generate and display assistant response
    if model is not None and tokenizer is not None:
        with st.chat_message("assistant"):
            # The executor is managed by hand: leaving a `with` block waits
            # for the worker to finish, which would make the timeout below
            # block until generation completes anyway.
            executor = ThreadPoolExecutor(max_workers=1)
            try:
                with st.spinner("Generating response... (timeout: 30s)"):
                    future = executor.submit(
                        generate_response_with_timeout,
                        model,
                        tokenizer,
                        prompt
                    )
                    response = future.result(timeout=30)

                st.write(response)
                st.session_state.messages.append({"role": "assistant", "content": response})

            except TimeoutError:
                error_msg = "Response generation timed out. The model might be overloaded."
                st.error(error_msg)
                logger.error(error_msg)
            except Exception as e:
                error_msg = f"Error generating response: {str(e)}"
                st.error(error_msg)
                logger.error(error_msg)
            finally:
                # Return immediately; a still-running worker finishes in the
                # background instead of holding up the UI.
                executor.shutdown(wait=False)
    else:
        st.error("Model failed to load. Please check your configuration.")

# Add a button to clear chat history
if st.button("Clear Chat History"):
    st.session_state.messages = []
    # Prefer st.rerun; fall back to st.experimental_rerun on older Streamlit
    # releases that do not provide it yet.
    _rerun = getattr(st, "rerun", None) or st.experimental_rerun
    _rerun()