jsbeaudry committed · verified
Commit f63cdc4 · 1 Parent(s): 1c612b1

Update app.py

Files changed (1):
  1. app.py +141 -25
app.py CHANGED

@@ -1,10 +1,11 @@
 import torch
 import gradio as gr
-from transformers import AutoTokenizer, AutoModelForCausalLM
+from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
+from threading import Thread
+from unsloth import FastLanguageModel
 
 # Load model and tokenizer once at startup
 model_name = "jsbeaudry/makandal-v2"
-
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 model = AutoModelForCausalLM.from_pretrained(
     model_name,
@@ -12,9 +13,13 @@ model = AutoModelForCausalLM.from_pretrained(
     device_map="auto"
 )
 
+# Prepare model for inference
+FastLanguageModel.for_inference(model)
+
 think_token_id = tokenizer.convert_tokens_to_ids("</think>")
 
-def generate_response(prompt):
+def generate_response_stream(prompt):
+    """Generator function that yields streaming responses"""
     # Format input for chat template
     messages = [{"role": "user", "content": prompt}]
     text = tokenizer.apply_chat_template(
@@ -22,44 +27,77 @@ def generate_response(prompt):
         tokenize=False,
         add_generation_prompt=True,
         enable_thinking=False
-
     )
 
     # Tokenize
     model_inputs = tokenizer([text], return_tensors="pt")
     model_inputs = {k: v.to(model.device) for k, v in model_inputs.items()}
-
-    # Generate
-    generated_ids = model.generate(
+
+    # Setup streamer
+    text_streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+
+    # Generation parameters
+    generation_kwargs = dict(
         **model_inputs,
+        streamer=text_streamer,
         max_new_tokens=100,
         do_sample=True,
         temperature=0.7,
-        top_p=0.9
+        top_p=0.9,
+        use_cache=True,
     )
 
-    output_ids = generated_ids[0][len(model_inputs["input_ids"][0]):].tolist()
-
-    try:
-        index = len(output_ids) - output_ids[::-1].index(think_token_id)
-    except ValueError:
-        index = 0
-
-    thinking_content = tokenizer.decode(output_ids[:index], skip_special_tokens=True).strip("\n")
-    content = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")
-
-    return thinking_content, content
+    # Start generation in a separate thread
+    thread = Thread(target=model.generate, kwargs=generation_kwargs)
+    thread.start()
+
+    # Stream the response
+    full_response = ""
+    thinking_content = ""
+    content = ""
+
+    for new_text in text_streamer:
+        full_response += new_text
+
+        # Check if we've hit the think token
+        if "</think>" in full_response:
+            parts = full_response.split("</think>", 1)
+            thinking_content = parts[0].strip()
+            content = parts[1].strip() if len(parts) > 1 else ""
+            yield thinking_content, content
+        else:
+            # If no think token yet, everything is thinking content
+            thinking_content = full_response.strip()
+            yield thinking_content, content
+
+    # Final yield with complete response
+    if "</think>" in full_response:
+        parts = full_response.split("</think>", 1)
+        thinking_content = parts[0].strip()
+        content = parts[1].strip() if len(parts) > 1 else ""
+    else:
+        # If no think token found, treat everything as content
+        thinking_content = ""
+        content = full_response.strip()
+
+    yield thinking_content, content
+
+def generate_response_interface(prompt):
+    """Interface function for Gradio that handles streaming"""
+    for thinking, content in generate_response_stream(prompt):
+        yield thinking, content
 
-# Gradio Interface
+# Gradio Interface with streaming
 demo = gr.Interface(
-    fn=generate_response,
+    fn=generate_response_interface,
     inputs=gr.Textbox(lines=2, placeholder="Ekri yon sijè oswa yon fraz..."),
     outputs=[
-        # gr.Textbox(label="Thinking Content"),
-        gr.Textbox(label="Respons")
+        gr.Textbox(label="Thinking Content", interactive=False),
+        gr.Textbox(label="Respons", interactive=False)
     ],
-    title="Makandal Text Generator",
-    description="Ekri yon fraz oswa mo kle pou jenere tèks ak modèl Makandal la. Modèl sa fèt espesyalman pou kontèks Ayiti."
+    title="Makandal Text Generator (Streaming)",
+    description="Ekri yon fraz oswa mo kle pou jenere tèks ak modèl Makandal la. Modèl sa fèt espesyalman pou kontèks Ayiti.",
+    live=False  # Set to True if you want real-time updates as user types
 )
 
 if __name__ == "__main__":
@@ -69,6 +107,84 @@ if __name__ == "__main__":
 
 
 
+
+
+# import torch
+# import gradio as gr
+# from transformers import AutoTokenizer, AutoModelForCausalLM
+
+# # Load model and tokenizer once at startup
+# model_name = "jsbeaudry/makandal-v2"
+
+# tokenizer = AutoTokenizer.from_pretrained(model_name)
+# model = AutoModelForCausalLM.from_pretrained(
+#     model_name,
+#     torch_dtype=torch.float16,
+#     device_map="auto"
+# )
+
+# think_token_id = tokenizer.convert_tokens_to_ids("</think>")
+
+# def generate_response(prompt):
+#     # Format input for chat template
+#     messages = [{"role": "user", "content": prompt}]
+#     text = tokenizer.apply_chat_template(
+#         messages,
+#         tokenize=False,
+#         add_generation_prompt=True,
+#         enable_thinking=False
+#     )
+
+#     # Tokenize
+#     model_inputs = tokenizer([text], return_tensors="pt")
+#     model_inputs = {k: v.to(model.device) for k, v in model_inputs.items()}
+
+#     # Generate
+#     generated_ids = model.generate(
+#         **model_inputs,
+#         max_new_tokens=100,
+#         do_sample=True,
+#         temperature=0.7,
+#         top_p=0.9
+#     )
+
+#     output_ids = generated_ids[0][len(model_inputs["input_ids"][0]):].tolist()
+
+#     try:
+#         index = len(output_ids) - output_ids[::-1].index(think_token_id)
+#     except ValueError:
+#         index = 0
+
+#     thinking_content = tokenizer.decode(output_ids[:index], skip_special_tokens=True).strip("\n")
+#     content = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")
+
+#     return thinking_content, content
+
+# # Gradio Interface
+# demo = gr.Interface(
+#     fn=generate_response,
+#     inputs=gr.Textbox(lines=2, placeholder="Ekri yon sijè oswa yon fraz..."),
+#     outputs=[
+#         # gr.Textbox(label="Thinking Content"),
+#         gr.Textbox(label="Respons")
+#     ],
+#     title="Makandal Text Generator",
+#     description="Ekri yon fraz oswa mo kle pou jenere tèks ak modèl Makandal la. Modèl sa fèt espesyalman pou kontèks Ayiti."
+# )
+
+# if __name__ == "__main__":
+#     demo.launch()
+
 # import gradio as gr
 # from transformers import AutoTokenizer, AutoModelForCausalLM
 # import torch
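
Note on the streaming pattern this commit adopts: model.generate() blocks until
the last token is produced, so the new code runs it on a worker thread and lets
the main thread iterate over a TextIteratorStreamer, which yields decoded text
chunks as they arrive. Below is a minimal sketch of that pattern in isolation,
using the model name from this repo; the prompt string is illustrative, and the
Gradio and unsloth pieces are omitted.

    import torch
    from threading import Thread
    from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer

    model_name = "jsbeaudry/makandal-v2"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name, torch_dtype=torch.float16, device_map="auto"
    )

    # skip_prompt=True keeps the echoed prompt out of the stream;
    # skip_special_tokens=True strips special tokens from the decoded chunks.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    # Illustrative prompt (Haitian Creole: "Who is Makandal?")
    inputs = tokenizer(["Ki sa Makandal ye?"], return_tensors="pt").to(model.device)

    # generate() blocks until completion, so it runs on a worker thread while
    # the main thread consumes text chunks from the streamer as they arrive.
    thread = Thread(
        target=model.generate,
        kwargs=dict(**inputs, streamer=streamer, max_new_tokens=100),
    )
    thread.start()
    for chunk in streamer:
        print(chunk, end="", flush=True)
    thread.join()

On the Gradio side, generate_response_interface is a generator, so gr.Interface
streams each yielded (thinking, content) pair to the two output textboxes as it
is produced instead of waiting for the full completion.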