jsbeaudry committed
Commit 594db6a · verified · 1 Parent(s): 904b1f3

Update app.py

Files changed (1):
  app.py +9 -53
app.py CHANGED
@@ -1,10 +1,8 @@
-from unsloth import FastLanguageModel
 import torch
 import gradio as gr
 from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
 from threading import Thread
 
-
 # Load model and tokenizer once at startup
 model_name = "jsbeaudry/makandal-v2"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
@@ -14,13 +12,9 @@ model = AutoModelForCausalLM.from_pretrained(
     device_map="auto"
 )
 
-# Prepare model for inference
-FastLanguageModel.for_inference(model)
-
 think_token_id = tokenizer.convert_tokens_to_ids("</think>")
 
 def generate_response_stream(prompt):
-    """Generator function that yields streaming responses"""
     # Format input for chat template
     messages = [{"role": "user", "content": prompt}]
     text = tokenizer.apply_chat_template(
@@ -34,7 +28,7 @@ def generate_response_stream(prompt):
     model_inputs = tokenizer([text], return_tensors="pt")
     model_inputs = {k: v.to(model.device) for k, v in model_inputs.items()}
 
-    # Setup streamer
+    # Create streamer
     text_streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
 
     # Generation parameters
@@ -53,52 +47,22 @@ def generate_response_stream(prompt):
     thread.start()
 
     # Stream the response
-    full_response = ""
-    thinking_content = ""
-    content = ""
-
+    partial_response = ""
     for new_text in text_streamer:
-        full_response += new_text
-
-        # Check if we've hit the think token
-        if "</think>" in full_response:
-            parts = full_response.split("</think>", 1)
-            thinking_content = parts[0].strip()
-            content = parts[1].strip() if len(parts) > 1 else ""
-            yield thinking_content, content
-        else:
-            # If no think token yet, everything is thinking content
-            thinking_content = full_response.strip()
-            yield thinking_content, content
-
-    # Final yield with complete response
-    if "</think>" in full_response:
-        parts = full_response.split("</think>", 1)
-        thinking_content = parts[0].strip()
-        content = parts[1].strip() if len(parts) > 1 else ""
-    else:
-        # If no think token found, treat everything as content
-        thinking_content = ""
-        content = full_response.strip()
+        partial_response += new_text
+        yield partial_response
 
-    yield thinking_content, content
-
-def generate_response_interface(prompt):
-    """Interface function for Gradio that handles streaming"""
-    for thinking, content in generate_response_stream(prompt):
-        yield thinking, content
+    # Wait for thread to complete
+    thread.join()
 
 # Gradio Interface with streaming
 demo = gr.Interface(
-    fn=generate_response_interface,
+    fn=generate_response_stream,
    inputs=gr.Textbox(lines=2, placeholder="Ekri yon sijè oswa yon fraz..."),
-    outputs=[
-        gr.Textbox(label="Thinking Content", interactive=False),
-        gr.Textbox(label="Respons", interactive=False)
-    ],
+    outputs=gr.Textbox(label="Respons"),
     title="Makandal Text Generator (Streaming)",
     description="Ekri yon fraz oswa mo kle pou jenere tèks ak modèl Makandal la. Modèl sa fèt espesyalman pou kontèks Ayiti.",
-    live=False  # Set to True if you want real-time updates as user types
+    live=False  # Set to False to prevent auto-triggering
 )
 
 if __name__ == "__main__":
@@ -107,14 +71,6 @@ if __name__ == "__main__":
 
 
 
-
-
-
-
-
-
-
-
 # import torch
 # import gradio as gr
 # from transformers import AutoTokenizer, AutoModelForCausalLM
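
Net effect of the commit: the unsloth / FastLanguageModel dependency, the "</think>" splitting logic, and the separate "Thinking Content" output box are dropped, and the Gradio handler now streams one accumulated response string from generate_response_stream. For reference, below is a minimal, self-contained sketch of the TextIteratorStreamer pattern the new app.py relies on. The model name, chat-template call, and interface wiring come from the diff; the dtype, max_new_tokens, and other generation settings are placeholders, since the unchanged "Generation parameters" block is not visible in this diff.

# Sketch: background-thread generation feeding a Gradio streaming handler.
# Generation settings below are illustrative placeholders.
import torch
import gradio as gr
from threading import Thread
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer

model_name = "jsbeaudry/makandal-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")

def generate_response_stream(prompt):
    # Apply the model's chat template to the user prompt
    messages = [{"role": "user", "content": prompt}]
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    model_inputs = tokenizer([text], return_tensors="pt")
    model_inputs = {k: v.to(model.device) for k, v in model_inputs.items()}

    # The streamer emits decoded text chunks as tokens are produced
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    # generate() blocks, so it runs on a worker thread while this
    # generator consumes the streamer
    thread = Thread(
        target=model.generate,
        kwargs=dict(**model_inputs, streamer=streamer, max_new_tokens=512),  # placeholder length
    )
    thread.start()

    # gr.Interface redraws the output textbox with every yielded value,
    # so yield the accumulated text rather than only the newest chunk
    partial_response = ""
    for new_text in streamer:
        partial_response += new_text
        yield partial_response

    thread.join()  # make sure the worker has finished before returning

demo = gr.Interface(
    fn=generate_response_stream,
    inputs=gr.Textbox(lines=2, placeholder="Ekri yon sijè oswa yon fraz..."),
    outputs=gr.Textbox(label="Respons"),
)

if __name__ == "__main__":
    demo.launch()

With live=False, generation runs only when the user submits the prompt, which matches the "prevent auto-triggering" comment in the committed code.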