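# Gradio demo for jsbeaudry/makandal-v2 with a Qwen3-style thinking/answer
# split. Assumed dependencies (not pinned here): torch, transformers, gradio.
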
import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load model and tokenizer once at startup
model_name = "jsbeaudry/makandal-v2"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto"
)
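
# Note: torch.float16 assumes GPU inference; on a CPU-only machine,
# torch_dtype="auto" (or omitting the argument) is a safer default.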

# Qwen3-style models emit a "</think>" token separating the hidden reasoning
# trace from the final answer; its id is used below to split the output.
think_token_id = tokenizer.convert_tokens_to_ids("</think>")

def generate_response(prompt):
    # Format input for chat template
    messages = [{"role": "user", "content": prompt}]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    
    # Tokenize
    model_inputs = tokenizer([text], return_tensors="pt")
    model_inputs = {k: v.to(model.device) for k, v in model_inputs.items()}

    # Generate
    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=2048,
        do_sample=True,
        temperature=0.7,
        top_p=0.9
    )

    output_ids = generated_ids[0][len(model_inputs["input_ids"][0]):].tolist()

    # Locate the last "</think>" token by searching the reversed output;
    # `index` then points just past that token in forward order.
    try:
        index = len(output_ids) - output_ids[::-1].index(think_token_id)
    except ValueError:
        index = 0

    thinking_content = tokenizer.decode(output_ids[:index], skip_special_tokens=True).strip("\n")
    content = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")

    return thinking_content, content

# Gradio Interface
demo = gr.Interface(
    fn=generate_response,
    inputs=gr.Textbox(label="Your Prompt", placeholder="Ask something..."),
    outputs=[
        gr.Textbox(label="Thinking Content"),
        gr.Textbox(label="Final Response")
    ],
    title="Qwen3 Thinking Chatbot",
    description="Ask a question and get both the thinking trace and final answer from Qwen3-0.6B."
)

if __name__ == "__main__":
    demo.launch()
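
# Example programmatic use, bypassing the Gradio UI (illustrative prompt):
#
#   thinking, answer = generate_response("What is the capital of Haiti?")
#   print("THINKING:", thinking)
#   print("ANSWER:", answer)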





# import gradio as gr
# from transformers import AutoTokenizer, AutoModelForCausalLM
# import torch

# # Load model and tokenizer
# tokenizer = AutoTokenizer.from_pretrained("jsbeaudry/makandal-v2")
# model = AutoModelForCausalLM.from_pretrained("jsbeaudry/makandal-v2")

# # Set device
# device = "cuda" if torch.cuda.is_available() else "cpu"
# model.to(device)

# # Generation function
# def generate_text(prompt):
#     inputs = tokenizer(prompt, return_tensors="pt", padding=True).to(device)
#     output = model.generate(
#         **inputs,
#         max_new_tokens=30,
#         do_sample=True,
#         repetition_penalty=1.2,
#         no_repeat_ngram_size=3,
#         temperature=0.9,
#         top_k=40,
#         top_p=0.85,
#         pad_token_id=tokenizer.pad_token_id,
#         eos_token_id=tokenizer.eos_token_id
#     )
#     return tokenizer.decode(output[0], skip_special_tokens=True)

# # Gradio interface
# iface = gr.Interface(
#     fn=generate_text,
#     inputs=gr.Textbox(lines=2, placeholder="Write a topic or a phrase..."),
#     outputs="text",
#     title="Makandal Text Generator",
#     description="Ekri yon fraz oswa mo kle pou jenere tèks ak modèl Makandal la. Modèl sa fèt espesyalman pou kontèks Ayiti."
# )

# if __name__ == "__main__":
#     iface.launch()