import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
from threading import Thread

# Load model and tokenizer once at startup
model_name = "jsbeaudry/makandal-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto"
)

# Token id marking the end of the model's "thinking" block.
# Assumption: the original token string ("</think>") was lost to HTML
# stripping; passing an empty string would just return the unk-token id.
think_token_id = tokenizer.convert_tokens_to_ids("</think>")


def generate_response_stream(prompt):
    # Format input for the chat template
    messages = [{"role": "user", "content": prompt}]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False
    )

    # Tokenize and move tensors to the model's device
    model_inputs = tokenizer([text], return_tensors="pt")
    model_inputs = {k: v.to(model.device) for k, v in model_inputs.items()}

    # Create a streamer that yields decoded text as tokens are generated
    text_streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    # Generation parameters
    generation_kwargs = dict(
        **model_inputs,
        streamer=text_streamer,
        max_new_tokens=100,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        use_cache=True,
    )

    # Start generation in a separate thread so we can consume the streamer here
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    # Stream the response, yielding the accumulated text for Gradio to display
    partial_response = ""
    for new_text in text_streamer:
        partial_response += new_text
        yield partial_response

    # Wait for the generation thread to complete
    thread.join()


# Gradio interface with streaming output
# (UI strings are Haitian Creole: placeholder "Write a topic or a sentence...";
#  description "Write a sentence or keywords to generate text with the Makandal
#  model. This model is built specifically for the Haitian context.")
demo = gr.Interface(
    fn=generate_response_stream,
    inputs=gr.Textbox(lines=2, placeholder="Ekri yon sijè oswa yon fraz..."),
    outputs=gr.Textbox(label="Respons"),
    title="Makandal Text Generator (Streaming)",
    description="Ekri yon fraz oswa mo kle pou jenere tèks ak modèl Makandal la. Modèl sa fèt espesyalman pou kontèks Ayiti.",
    live=False  # Set to False to prevent auto-triggering on every keystroke
)

if __name__ == "__main__":
    demo.launch()


# --- Earlier non-streaming version (kept for reference) ---
# import torch
# import gradio as gr
# from transformers import AutoTokenizer, AutoModelForCausalLM

# # Load model and tokenizer once at startup
# model_name = "jsbeaudry/makandal-v2"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     torch_dtype=torch.float16,
#     device_map="auto"
# )

# think_token_id = tokenizer.convert_tokens_to_ids("</think>")

# def generate_response(prompt):
#     # Format input for the chat template
#     messages = [{"role": "user", "content": prompt}]
#     text = tokenizer.apply_chat_template(
#         messages,
#         tokenize=False,
#         add_generation_prompt=True,
#         enable_thinking=False
#     )

#     # Tokenize
#     model_inputs = tokenizer([text], return_tensors="pt")
#     model_inputs = {k: v.to(model.device) for k, v in model_inputs.items()}

#     # Generate
#     generated_ids = model.generate(
#         **model_inputs,
#         max_new_tokens=100,
#         do_sample=True,
#         temperature=0.7,
#         top_p=0.9
#     )
#     output_ids = generated_ids[0][len(model_inputs["input_ids"][0]):].tolist()

#     # Split the output at the last </think> token, if present
#     try:
#         index = len(output_ids) - output_ids[::-1].index(think_token_id)
#     except ValueError:
#         index = 0

#     thinking_content = tokenizer.decode(output_ids[:index], skip_special_tokens=True).strip("\n")
#     content = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")

#     return thinking_content, content

# # Gradio Interface
# # Note: generate_response returns two values, but only one output box is
# # active below; the "Thinking Content" box would need to be re-enabled.
# demo = gr.Interface(
#     fn=generate_response,
#     inputs=gr.Textbox(lines=2, placeholder="Ekri yon sijè oswa yon fraz..."),
#     outputs=[
#         # gr.Textbox(label="Thinking Content"),
#         gr.Textbox(label="Respons")
#     ],
#     title="Makandal Text Generator",
#     description="Ekri yon fraz oswa mo kle pou jenere tèks ak modèl Makandal la. Modèl sa fèt espesyalman pou kontèks Ayiti."
# )

# if __name__ == "__main__":
#     demo.launch()


# --- Original minimal version (kept for reference) ---
# import gradio as gr
# from transformers import AutoTokenizer, AutoModelForCausalLM
# import torch

# # Load model and tokenizer
# tokenizer = AutoTokenizer.from_pretrained("jsbeaudry/makandal-v2")
# model = AutoModelForCausalLM.from_pretrained("jsbeaudry/makandal-v2")

# # Set device
# device = "cuda" if torch.cuda.is_available() else "cpu"
# model.to(device)

# # Generation function
# def generate_text(prompt):
#     inputs = tokenizer(prompt, return_tensors="pt", padding=True).to(device)
#     output = model.generate(
#         **inputs,
#         max_new_tokens=30,
#         do_sample=True,
#         repetition_penalty=1.2,
#         no_repeat_ngram_size=3,
#         temperature=0.9,
#         top_k=40,
#         top_p=0.85,
#         pad_token_id=tokenizer.pad_token_id,
#         eos_token_id=tokenizer.eos_token_id
#     )
#     return tokenizer.decode(output[0], skip_special_tokens=True)

# # Gradio interface
# iface = gr.Interface(
#     fn=generate_text,
#     inputs=gr.Textbox(lines=2, placeholder="Ekri yon sijè oswa yon fraz..."),
#     outputs="text",
#     title="Makandal Text Generator",
#     description="Ekri yon fraz oswa mo kle pou jenere tèks ak modèl Makandal la. Modèl sa fèt espesyalman pou kontèks Ayiti."
# )

# if __name__ == "__main__":
#     iface.launch()
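

# Example (assumed usage, not part of the original script): the streaming
# function is a plain Python generator, so it can also be consumed outside
# the Gradio UI. The prompt below is a hypothetical Creole input.
#
#     for partial in generate_response_stream("Ki sa Makandal te fè?"):
#         print(partial, end="\r", flush=True)
#     print()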