import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
from threading import Thread
# Load model and tokenizer once at startup
model_name = "jsbeaudry/makandal-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto"
)
# Token id marking the end of the model's "thinking" block; only used by the
# archived non-streaming variant kept below for reference.
think_token_id = tokenizer.convert_tokens_to_ids("</think>")
def generate_response_stream(prompt):
    # Format input for chat template
    messages = [{"role": "user", "content": prompt}]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False
    )

    # Tokenize and move tensors to the model's device
    model_inputs = tokenizer([text], return_tensors="pt")
    model_inputs = {k: v.to(model.device) for k, v in model_inputs.items()}

    # Streamer yields decoded text chunks as they are generated
    text_streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    # Generation parameters
    generation_kwargs = dict(
        **model_inputs,
        streamer=text_streamer,
        max_new_tokens=100,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        use_cache=True,
    )

    # Run generation in a background thread so the streamer can be consumed here
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    # Yield the accumulated text so Gradio updates the output box incrementally
    partial_response = ""
    for new_text in text_streamer:
        partial_response += new_text
        yield partial_response

    # Wait for the generation thread to finish
    thread.join()
# Gradio interface with streaming output.
# UI strings are Haitian Creole: the placeholder means "Write a topic or a
# sentence...", and the description means "Write a sentence or keyword to
# generate text with the Makandal model. This model is made especially for
# the Haitian context."
demo = gr.Interface(
    fn=generate_response_stream,
    inputs=gr.Textbox(lines=2, placeholder="Ekri yon sijè oswa yon fraz..."),
    outputs=gr.Textbox(label="Respons"),
    title="Makandal Text Generator (Streaming)",
    description="Ekri yon fraz oswa mo kle pou jenere tèks ak modèl Makandal la. Modèl sa fèt espesyalman pou kontèks Ayiti.",
    live=False  # Keep False so generation runs on submit, not on every keystroke
)
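
# A small safeguard (assumption: this Space may run on Gradio 3.x): on 3.x the
# request queue is what lets generator outputs stream to the UI; on 4.x the
# queue is enabled by default, so this call is harmless there.
demo.queue()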
if __name__ == "__main__":
    demo.launch()
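
# ---------------------------------------------------------------------------
# Archived: earlier non-streaming version, kept commented out for reference.
# ---------------------------------------------------------------------------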
# import torch
# import gradio as gr
# from transformers import AutoTokenizer, AutoModelForCausalLM
#
# # Load model and tokenizer once at startup
# model_name = "jsbeaudry/makandal-v2"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     torch_dtype=torch.float16,
#     device_map="auto"
# )
#
# think_token_id = tokenizer.convert_tokens_to_ids("</think>")
#
# def generate_response(prompt):
#     # Format input for chat template
#     messages = [{"role": "user", "content": prompt}]
#     text = tokenizer.apply_chat_template(
#         messages,
#         tokenize=False,
#         add_generation_prompt=True,
#         enable_thinking=False
#     )
#
#     # Tokenize
#     model_inputs = tokenizer([text], return_tensors="pt")
#     model_inputs = {k: v.to(model.device) for k, v in model_inputs.items()}
#
#     # Generate
#     generated_ids = model.generate(
#         **model_inputs,
#         max_new_tokens=100,
#         do_sample=True,
#         temperature=0.7,
#         top_p=0.9
#     )
#
#     # Split the output at the last </think> token: everything before it is
#     # "thinking" content, everything after is the final answer.
#     output_ids = generated_ids[0][len(model_inputs["input_ids"][0]):].tolist()
#     try:
#         index = len(output_ids) - output_ids[::-1].index(think_token_id)
#     except ValueError:
#         index = 0
#     thinking_content = tokenizer.decode(output_ids[:index], skip_special_tokens=True).strip("\n")
#     content = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")
#     # NOTE: two values are returned, so the "Thinking Content" textbox below
#     # must be uncommented for this interface to work as-is.
#     return thinking_content, content
#
# # Gradio Interface
# demo = gr.Interface(
#     fn=generate_response,
#     inputs=gr.Textbox(lines=2, placeholder="Ekri yon sijè oswa yon fraz..."),
#     outputs=[
#         # gr.Textbox(label="Thinking Content"),
#         gr.Textbox(label="Respons")
#     ],
#     title="Makandal Text Generator",
#     description="Ekri yon fraz oswa mo kle pou jenere tèks ak modèl Makandal la. Modèl sa fèt espesyalman pou kontèks Ayiti."
# )
#
# if __name__ == "__main__":
#     demo.launch()
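
# ---------------------------------------------------------------------------
# Archived: original minimal version (no chat template, no streaming), kept
# commented out for reference.
# ---------------------------------------------------------------------------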
# import gradio as gr
# from transformers import AutoTokenizer, AutoModelForCausalLM
# import torch
#
# # Load model and tokenizer
# tokenizer = AutoTokenizer.from_pretrained("jsbeaudry/makandal-v2")
# model = AutoModelForCausalLM.from_pretrained("jsbeaudry/makandal-v2")
#
# # Set device
# device = "cuda" if torch.cuda.is_available() else "cpu"
# model.to(device)
#
# # Generation function
# def generate_text(prompt):
#     inputs = tokenizer(prompt, return_tensors="pt", padding=True).to(device)
#     output = model.generate(
#         **inputs,
#         max_new_tokens=30,
#         do_sample=True,
#         repetition_penalty=1.2,
#         no_repeat_ngram_size=3,
#         temperature=0.9,
#         top_k=40,
#         top_p=0.85,
#         pad_token_id=tokenizer.pad_token_id,
#         eos_token_id=tokenizer.eos_token_id
#     )
#     return tokenizer.decode(output[0], skip_special_tokens=True)
#
# # Gradio interface
# iface = gr.Interface(
#     fn=generate_text,
#     inputs=gr.Textbox(lines=2, placeholder="Ekri yon sijè oswa yon fraz..."),
#     outputs="text",
#     title="Makandal Text Generator",
#     description="Ekri yon fraz oswa mo kle pou jenere tèks ak modèl Makandal la. Modèl sa fèt espesyalman pou kontèks Ayiti."
# )
#
# if __name__ == "__main__":
#     iface.launch()