import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
from threading import Thread
from unsloth import FastLanguageModel

# Load model and tokenizer once at startup
model_name = "jsbeaudry/makandal-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto"
)

# Prepare model for inference
FastLanguageModel.for_inference(model)

# ID of the "</think>" token that closes the model's thinking block
# (the tag was stripped from the original source; "</think>" is assumed
# from the Qwen-style chat template used below)
think_token_id = tokenizer.convert_tokens_to_ids("</think>")


def generate_response_stream(prompt):
    """Generator function that yields streaming responses"""
    # Format input for chat template
    messages = [{"role": "user", "content": prompt}]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False
    )

    # Tokenize
    model_inputs = tokenizer([text], return_tensors="pt")
    model_inputs = {k: v.to(model.device) for k, v in model_inputs.items()}

    # Setup streamer
    text_streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    # Generation parameters
    generation_kwargs = dict(
        **model_inputs,
        streamer=text_streamer,
        max_new_tokens=100,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        use_cache=True,
    )

    # Start generation in a separate thread
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    # Stream the response
    full_response = ""
    thinking_content = ""
    content = ""

    for new_text in text_streamer:
        full_response += new_text

        # Check if we've hit the think token
        if "</think>" in full_response:
            parts = full_response.split("</think>", 1)
            thinking_content = parts[0].strip()
            content = parts[1].strip() if len(parts) > 1 else ""
            yield thinking_content, content
        else:
            # If no think token yet, everything is thinking content
            thinking_content = full_response.strip()
            yield thinking_content, content

    # Final yield with complete response
    if "</think>" in full_response:
        parts = full_response.split("</think>", 1)
        thinking_content = parts[0].strip()
        content = parts[1].strip() if len(parts) > 1 else ""
    else:
        # If no think token found, treat everything as content
        thinking_content = ""
        content = full_response.strip()

    yield thinking_content, content


def generate_response_interface(prompt):
    """Interface function for Gradio that handles streaming"""
    for thinking, content in generate_response_stream(prompt):
        yield thinking, content
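
# Example (assumed usage, for a quick local smoke test without launching the
# Gradio UI; the prompt string below is only an illustration):
#
#     for thinking, content in generate_response_stream("Ki jan yo fè diri ak pwa?"):
#         print(thinking, "|", content)
#
# Each iteration yields the partial (thinking_content, content) pair as new
# tokens arrive from the background generation thread.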

# Gradio Interface with streaming
demo = gr.Interface(
    fn=generate_response_interface,
    inputs=gr.Textbox(lines=2, placeholder="Ekri yon sijè oswa yon fraz..."),
    outputs=[
        gr.Textbox(label="Thinking Content", interactive=False),
        gr.Textbox(label="Respons", interactive=False)
    ],
    title="Makandal Text Generator (Streaming)",
    description="Ekri yon fraz oswa mo kle pou jenere tèks ak modèl Makandal la. Modèl sa fèt espesyalman pou kontèks Ayiti.",
    live=False  # Set to True if you want real-time updates as user types
)

if __name__ == "__main__":
    demo.launch()


# import torch
# import gradio as gr
# from transformers import AutoTokenizer, AutoModelForCausalLM

# # Load model and tokenizer once at startup
# model_name = "jsbeaudry/makandal-v2"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     torch_dtype=torch.float16,
#     device_map="auto"
# )

# think_token_id = tokenizer.convert_tokens_to_ids("</think>")

# def generate_response(prompt):
#     # Format input for chat template
#     messages = [{"role": "user", "content": prompt}]
#     text = tokenizer.apply_chat_template(
#         messages,
#         tokenize=False,
#         add_generation_prompt=True,
#         enable_thinking=False
#     )

#     # Tokenize
#     model_inputs = tokenizer([text], return_tensors="pt")
#     model_inputs = {k: v.to(model.device) for k, v in model_inputs.items()}

#     # Generate
#     generated_ids = model.generate(
#         **model_inputs,
#         max_new_tokens=100,
#         do_sample=True,
#         temperature=0.7,
#         top_p=0.9
#     )
#     output_ids = generated_ids[0][len(model_inputs["input_ids"][0]):].tolist()

#     try:
#         index = len(output_ids) - output_ids[::-1].index(think_token_id)
#     except ValueError:
#         index = 0

#     thinking_content = tokenizer.decode(output_ids[:index], skip_special_tokens=True).strip("\n")
#     content = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")

#     return thinking_content, content

# # Gradio Interface
# demo = gr.Interface(
#     fn=generate_response,
#     inputs=gr.Textbox(lines=2, placeholder="Ekri yon sijè oswa yon fraz..."),
#     outputs=[
#         # gr.Textbox(label="Thinking Content"),
#         gr.Textbox(label="Respons")
#     ],
#     title="Makandal Text Generator",
#     description="Ekri yon fraz oswa mo kle pou jenere tèks ak modèl Makandal la. Modèl sa fèt espesyalman pou kontèks Ayiti."
# )

# if __name__ == "__main__":
#     demo.launch()


# import gradio as gr
# from transformers import AutoTokenizer, AutoModelForCausalLM
# import torch

# # Load model and tokenizer
# tokenizer = AutoTokenizer.from_pretrained("jsbeaudry/makandal-v2")
# model = AutoModelForCausalLM.from_pretrained("jsbeaudry/makandal-v2")

# # Set device
# device = "cuda" if torch.cuda.is_available() else "cpu"
# model.to(device)

# # Generation function
# def generate_text(prompt):
#     inputs = tokenizer(prompt, return_tensors="pt", padding=True).to(device)
#     output = model.generate(
#         **inputs,
#         max_new_tokens=30,
#         do_sample=True,
#         repetition_penalty=1.2,
#         no_repeat_ngram_size=3,
#         temperature=0.9,
#         top_k=40,
#         top_p=0.85,
#         pad_token_id=tokenizer.pad_token_id,
#         eos_token_id=tokenizer.eos_token_id
#     )
#     return tokenizer.decode(output[0], skip_special_tokens=True)

# # Gradio interface
# iface = gr.Interface(
#     fn=generate_text,
#     inputs=gr.Textbox(lines=2, placeholder="Ekri yon sijè oswa yon fraz..."),
#     outputs="text",
#     title="Makandal Text Generator",
#     description="Ekri yon fraz oswa mo kle pou jenere tèks ak modèl Makandal la. Modèl sa fèt espesyalman pou kontèks Ayiti."
# )

# if __name__ == "__main__":
#     iface.launch()