import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

device = "cuda" if torch.cuda.is_available() else "cpu"

model_id = "thrishala/mental_health_chatbot"

try:
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map="cpu",
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,
        max_memory={"cpu": "15GB"},
        offload_folder="offload",
    )
    model.to(device)

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokenizer.model_max_length = 512  # Cap the maximum prompt length

    # The text-generation pipeline was too slow, so we call the model
    # directly in generate_text() instead of using pipe().
    # pipe = pipeline(
    #     "text-generation",
    #     model=model,
    #     tokenizer=tokenizer,
    #     torch_dtype=torch.float16,
    #     num_return_sequences=1,
    #     do_sample=False,
    #     truncation=True,
    #     max_new_tokens=128,
    # )
except Exception as e:
    print(f"Error loading model: {e}")
    exit()


def generate_text(prompt, max_new_tokens=128):
    # Move the input to the same device as the model
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():  # Disable gradients during inference
        output = model.generate(
            input_ids=input_ids,
            max_new_tokens=max_new_tokens,
            do_sample=False,  # Greedy decoding; set True for sampling
            eos_token_id=tokenizer.eos_token_id,  # Stop generation at the EOS token
        )

    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

    # Keep only the new assistant response: drop the echoed prompt and
    # anything after the next "User:" turn.
    bot_response = generated_text[len(prompt):].split("User:")[0].strip()
    return bot_response


def respond(
    message,
    history,
    system_message,
    max_tokens,
):
    # Construct the prompt with clear turn separation
    prompt = f"{system_message}\n"
    for user_msg, bot_msg in history:
        prompt += f"User: {user_msg}\nAssistant: {bot_msg}\n"
    prompt += f"User: {message}\nAssistant:"

    try:
        # Previously generated with pipe(); replaced by generate_text() for speed.
        bot_response = generate_text(prompt, max_tokens)
        yield bot_response
    except Exception as e:
        print(f"Error during generation: {e}")
        yield "An error occurred."


demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(
            value="You are a friendly and helpful mental health chatbot.",
            label="System message",
        ),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
    ],
)

if __name__ == "__main__":
    demo.launch()