import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
from threading import Thread

# Load model and tokenizer once at startup
model_name = "jsbeaudry/makandal-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto"
)
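# Token id for the "</think>" marker; unused in the streaming path below
# (only the commented-out non-streaming variant further down references it).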
think_token_id = tokenizer.convert_tokens_to_ids("</think>")


def generate_response_stream(prompt):
    # Format input for chat template
    messages = [{"role": "user", "content": prompt}]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False
    )

    # Tokenize
    model_inputs = tokenizer([text], return_tensors="pt")
    model_inputs = {k: v.to(model.device) for k, v in model_inputs.items()}

    # Create streamer
    text_streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    # Generation parameters
    generation_kwargs = dict(
        **model_inputs,
        streamer=text_streamer,
        max_new_tokens=100,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        use_cache=True,
    )

    # Start generation in a separate thread
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    # Stream the response
    partial_response = ""
    for new_text in text_streamer:
        partial_response += new_text
        yield partial_response

    # Wait for thread to complete
    thread.join()


# Gradio Interface with streaming
demo = gr.Interface(
    fn=generate_response_stream,
    inputs=gr.Textbox(lines=2, placeholder="Ekri yon sijè oswa yon fraz..."),
    outputs=gr.Textbox(label="Respons"),
    title="Makandal Text Generator (Streaming)",
    description="Ekri yon fraz oswa mo kle pou jenere tèks ak modèl Makandal la. Modèl sa fèt espesyalman pou kontèks Ayiti.",
    live=False  # Set to False to prevent auto-triggering
)

if __name__ == "__main__":
    demo.launch()
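
# --- Earlier non-streaming version (also separates the "</think>" reasoning from the answer); kept commented out for reference ---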
# import torch
# import gradio as gr
# from transformers import AutoTokenizer, AutoModelForCausalLM

# # Load model and tokenizer once at startup
# model_name = "jsbeaudry/makandal-v2"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     torch_dtype=torch.float16,
#     device_map="auto"
# )

# think_token_id = tokenizer.convert_tokens_to_ids("</think>")

# def generate_response(prompt):
#     # Format input for chat template
#     messages = [{"role": "user", "content": prompt}]
#     text = tokenizer.apply_chat_template(
#         messages,
#         tokenize=False,
#         add_generation_prompt=True,
#         enable_thinking=False
#     )

#     # Tokenize
#     model_inputs = tokenizer([text], return_tensors="pt")
#     model_inputs = {k: v.to(model.device) for k, v in model_inputs.items()}

#     # Generate
#     generated_ids = model.generate(
#         **model_inputs,
#         max_new_tokens=100,
#         do_sample=True,
#         temperature=0.7,
#         top_p=0.9
#     )
#     output_ids = generated_ids[0][len(model_inputs["input_ids"][0]):].tolist()

#     # Split the output at the last "</think>" token, if present
#     try:
#         index = len(output_ids) - output_ids[::-1].index(think_token_id)
#     except ValueError:
#         index = 0

#     thinking_content = tokenizer.decode(output_ids[:index], skip_special_tokens=True).strip("\n")
#     content = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")
#     return thinking_content, content

# # Gradio Interface
# demo = gr.Interface(
#     fn=generate_response,
#     inputs=gr.Textbox(lines=2, placeholder="Ekri yon sijè oswa yon fraz..."),
#     outputs=[
#         # gr.Textbox(label="Thinking Content"),
#         gr.Textbox(label="Respons")
#     ],
#     title="Makandal Text Generator",
#     description="Ekri yon fraz oswa mo kle pou jenere tèks ak modèl Makandal la. Modèl sa fèt espesyalman pou kontèks Ayiti."
# )

# if __name__ == "__main__":
#     demo.launch()
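
# --- Earlier minimal version: plain generation without the chat template; kept commented out for reference ---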
# import gradio as gr
# from transformers import AutoTokenizer, AutoModelForCausalLM
# import torch

# # Load model and tokenizer
# tokenizer = AutoTokenizer.from_pretrained("jsbeaudry/makandal-v2")
# model = AutoModelForCausalLM.from_pretrained("jsbeaudry/makandal-v2")

# # Set device
# device = "cuda" if torch.cuda.is_available() else "cpu"
# model.to(device)

# # Generation function
# def generate_text(prompt):
#     inputs = tokenizer(prompt, return_tensors="pt", padding=True).to(device)
#     output = model.generate(
#         **inputs,
#         max_new_tokens=30,
#         do_sample=True,
#         repetition_penalty=1.2,
#         no_repeat_ngram_size=3,
#         temperature=0.9,
#         top_k=40,
#         top_p=0.85,
#         pad_token_id=tokenizer.pad_token_id,
#         eos_token_id=tokenizer.eos_token_id
#     )
#     return tokenizer.decode(output[0], skip_special_tokens=True)

# # Gradio interface
# iface = gr.Interface(
#     fn=generate_text,
#     inputs=gr.Textbox(lines=2, placeholder="Ekri yon sijè oswa yon fraz..."),
#     outputs="text",
#     title="Makandal Text Generator",
#     description="Ekri yon fraz oswa mo kle pou jenere tèks ak modèl Makandal la. Modèl sa fèt espesyalman pou kontèks Ayiti."
# )

# if __name__ == "__main__":
#     iface.launch()