import gradio as gr
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TextIteratorStreamer,
)
import threading
import time

# -----------------------------------------------------------------------------
# 1. MODEL LOADING
# -----------------------------------------------------------------------------
# In this advanced example, we'll instantiate the model directly (instead of using pipeline).
# We'll do streaming outputs via TextIteratorStreamer.

MODEL_NAME = "microsoft/phi-4"  # Replace with an actual HF model if phi-4 is unavailable
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=torch.float16, device_map="auto")
except:
    # Fallback if model is not found or large. Here we default to a smaller model
    MODEL_NAME = "gpt2"
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(DEVICE)

model.eval()


# -----------------------------------------------------------------------------
# 2. CONVERSATION / PROMPTS
# -----------------------------------------------------------------------------
# We'll keep track of conversation using a list of dictionaries:
# [
#   {"role": "system",    "content": "..."},
#   {"role": "developer", "content": "..."},
#   {"role": "user",      "content": "User message"},
#   {"role": "assistant", "content": "Assistant answer"},
#   ...
# ]
# 
# We’ll also build in a mock retrieval system that merges knowledge snippets 
# into the final prompt if the user chooses to do so.

DEFAULT_SYSTEM_PROMPT = (
    "You are Philos, an advanced AI system created by ACC (Algorithmic Computer-generated Consciousness). "
    "Answer user queries accurately, thoroughly, and helpfully. Keep your responses relevant and correct."
)

DEFAULT_DEVELOPER_PROMPT = (
    "Ensure that you respond in a style that is professional, clear, and approachable. "
    "Include reasoning steps if needed, but keep them concise."
)

# A small dictionary to emulate knowledge retrieval
# In real scenarios, you might use an actual vector DB + retrieval method
MOCK_KB = {
    "python": "Python is a high-level, interpreted programming language famous for its readability and flexibility.",
    "accelerate library": "The accelerate library by HF helps in distributed training and inference.",
    "phi-4 architecture": "phi-4 is a 14B-parameter, decoder-only Transformer with a 16K context window.",
}

def retrieve_knowledge(user_query):
    # Simple naive approach: check keywords in user query
    # Return a knowledge snippet if found
    matches = []
    for keyword, snippet in MOCK_KB.items():
        if keyword.lower() in user_query.lower():
            matches.append(snippet)
    return matches

# -----------------------------------------------------------------------------
# 3. HELPER: Build the prompt from conversation
# -----------------------------------------------------------------------------
def build_prompt(conversation):
    """
    Convert conversation (list of role/content dicts) into a single text prompt 
    that the model can process. We adopt a simple format:
        system, developer, user, assistant, ...
    """
    prompt = ""
    for msg in conversation:
        if msg["role"] == "system":
            prompt += f"[System]\n{msg['content']}\n"
        elif msg["role"] == "developer":
            prompt += f"[Developer]\n{msg['content']}\n"
        elif msg["role"] == "user":
            prompt += f"[User]\n{msg['content']}\n"
        else:  # assistant
            prompt += f"[Assistant]\n{msg['content']}\n"
    prompt += "[Assistant]\n"  # We end with an assistant role so model can continue
    return prompt


# -----------------------------------------------------------------------------
# 4. STREAMING GENERATION
# -----------------------------------------------------------------------------
def generate_tokens_stream(prompt, temperature=0.7, top_p=0.9, max_new_tokens=128):
    """
    Uses TextIteratorStreamer to yield tokens one by one (or in small chunks).
    """
    streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(DEVICE)

    generation_kwargs = dict(
        input_ids=input_ids,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        top_p=top_p,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
    )

    # We'll run generation in a background thread, streaming tokens
    thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    # Stream tokens
    partial_text = ""
    for new_token in streamer:
        partial_text += new_token
        yield partial_text

    thread.join()


# -----------------------------------------------------------------------------
# 5. MAIN CHAT FUNCTION
# -----------------------------------------------------------------------------
def advanced_chat(user_msg, conversation, system_prompt, dev_prompt, retrieve_flg, temperature, top_p):
    """
    - Update conversation with the user's message
    - Optionally retrieve knowledge and incorporate into the system or developer prompt
    - Build the final prompt
    - Stream the assistant's reply
    """
    # If user message is empty
    if not user_msg.strip():
        yield "Please enter a message."
        return

    # 1) Construct or update system/dev prompts
    system_message = {"role": "system", "content": system_prompt}
    developer_message = {"role": "developer", "content": dev_prompt}

    # 2) Insert or replace system/dev in the conversation
    #    We'll assume the first system/dev messages are at the start of conversation
    #    or add them if not present
    filtered = [msg for msg in conversation if msg["role"] not in ["system", "developer"]]
    conversation = [system_message, developer_message] + filtered

    # 3) Append user's message
    conversation.append({"role": "user", "content": user_msg})

    # 4) Retrieve knowledge if user toggled "Include knowledge retrieval"
    if retrieve_flg:
        knowledge_snippets = retrieve_knowledge(user_msg)
        if knowledge_snippets:
            # We can just append them to developer or system content for simplicity
            knowledge_text = "\n".join(["[Knowledge] " + s for s in knowledge_snippets])
            conversation[1]["content"] += f"\n\n[Additional Knowledge]\n{knowledge_text}"

    # 5) Build final prompt
    prompt = build_prompt(conversation)

    # 6) Stream the assistant’s response
    partial_response = ""
    for partial_text in generate_tokens_stream(prompt, temperature, top_p):
        partial_response = partial_text
        yield partial_text  # Send partial tokens to Gradio for display

    # 7) Now that generation is complete, append final assistant message
    conversation.append({"role": "assistant", "content": partial_response})


# -----------------------------------------------------------------------------
# 6. BUILD GRADIO INTERFACE
# -----------------------------------------------------------------------------
def build_ui():
    with gr.Blocks(title="PhilosBeta-Advanced", css="#chatbot{height:550px} .overflow-y-auto{max-height:550px}") as demo:

        gr.Markdown("# **PhilosBeta: Advanced Demo**")
        gr.Markdown(
            "An example of multi-turn conversation with streaming responses, "
            "optional retrieval, and custom system/developer prompts."
        )

        # State to store the conversation as a list of role/content dicts
        conversation_state = gr.State([])

        # TEXT ELEMENTS
        with gr.Row():
            with gr.Column():
                system_prompt_box = gr.Textbox(
                    label="System Prompt",
                    value=DEFAULT_SYSTEM_PROMPT,
                    lines=3
                )
                developer_prompt_box = gr.Textbox(
                    label="Developer Prompt",
                    value=DEFAULT_DEVELOPER_PROMPT,
                    lines=3
                )
            with gr.Column():
                retrieve_flag = gr.Checkbox(label="Include Knowledge Retrieval", value=False)
                temperature_slider = gr.Slider(0.0, 2.0, 0.7, step=0.1, label="Temperature")
                top_p_slider = gr.Slider(0.0, 1.0, 0.9, step=0.05, label="Top-p")
                max_tokens_info = gr.Markdown("Max new tokens = 128 (fixed in code).")

        # MAIN CHAT UI
        chatbox = gr.Chatbot(label="Philos Conversation", elem_id="chatbot").style(height=500)
        user_input = gr.Textbox(
            label="Your Message",
            placeholder="Type here...",
            lines=3
        )
        send_btn = gr.Button("Send", variant="primary")

        # ---------------------------------------------------------------------
        # ACTION: Handle user input
        # ---------------------------------------------------------------------
        def user_send(
            user_text, conversation, sys_prompt, dev_prompt, retrieve_flg, temperature, top_p
        ):
            """
            This function calls advanced_chat() and streams tokens back to update the Chatbot UI.
            """
            # We'll create a generator to update the Chatbot in real-time
            message_stream = advanced_chat(
                user_msg=user_text,
                conversation=conversation,
                system_prompt=sys_prompt,
                dev_prompt=dev_prompt,
                retrieve_flg=retrieve_flg,
                temperature=temperature,
                top_p=top_p
            )
            return message_stream, conversation

        # Gradio can handle generator outputs for streaming. 
        # We map the streamed tokens to the Chatbot component in real-time.
        chatbox_stream = gr.Chatbot.update()
        send_btn.click(
            fn=user_send,
            inputs=[
                user_input,
                conversation_state,
                system_prompt_box,
                developer_prompt_box,
                retrieve_flag,
                temperature_slider,
                top_p_slider,
            ],
            outputs=[chatbox_stream, conversation_state],
        )

        # We also let the user press Enter to send messages
        user_input.submit(
            fn=user_send,
            inputs=[
                user_input,
                conversation_state,
                system_prompt_box,
                developer_prompt_box,
                retrieve_flag,
                temperature_slider,
                top_p_slider,
            ],
            outputs=[chatbox_stream, conversation_state],
        )

    return demo


# -----------------------------------------------------------------------------
# 7. LAUNCH
# -----------------------------------------------------------------------------
if __name__ == "__main__":
    ui = build_ui()
    ui.launch()