vikhyatk committed
Commit e05052e · verified · Parent: 42586be

Update app.py

Files changed (1): app.py (+81, −31)
app.py CHANGED
@@ -1,5 +1,32 @@
-import spaces
+try:
+    import spaces
+
+    IN_SPACES = True
+except ImportError:
+    from functools import wraps
+    import inspect
+
+    class spaces:
+        @staticmethod
+        def GPU(duration):
+            def decorator(func):
+                @wraps(func)  # Preserves the original function's metadata
+                def wrapper(*args, **kwargs):
+                    if inspect.isgeneratorfunction(func):
+                        # If the decorated function is a generator, yield from it
+                        yield from func(*args, **kwargs)
+                    else:
+                        # For regular functions, just return the result
+                        return func(*args, **kwargs)
+
+                return wrapper
+
+            return decorator
+
+    IN_SPACES = False
+
 import torch
+from queue import Queue
 import os
 import gradio as gr
 from threading import Thread
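A note on the shim above (observation, not part of the commit): because `wrapper` contains a `yield from` in one branch, Python treats `wrapper` itself as a generator function no matter what `func` is, so a plain function wrapped by this shim returns a generator and its `return` value is swallowed as `StopIteration`. A minimal alternative sketch that branches at decoration time instead:

```python
# Sketch only: a no-op stand-in for spaces.GPU that keeps plain functions
# plain and generator functions generators by branching when the decorator runs.
from functools import wraps
import inspect


class spaces:
    @staticmethod
    def GPU(duration):
        def decorator(func):
            if inspect.isgeneratorfunction(func):
                @wraps(func)
                def wrapper(*args, **kwargs):
                    yield from func(*args, **kwargs)
            else:
                @wraps(func)
                def wrapper(*args, **kwargs):
                    return func(*args, **kwargs)
            return wrapper
        return decorator
```

The distinction matters because Gradio checks whether an event handler is a generator function to decide between streaming and one-shot updates.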
@@ -11,24 +38,24 @@ from transformers import (
 from PIL import ImageDraw
 from torchvision.transforms.v2 import Resize
 
-import subprocess
+if IN_SPACES:
+    import subprocess
 
-subprocess.run(
-    "pip install flash-attn --no-build-isolation",
-    env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
-    shell=True,
-)
+    subprocess.run(
+        "pip install flash-attn --no-build-isolation",
+        env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
+        shell=True,
+    )
 
 auth_token = os.environ.get("TOKEN_FROM_SECRET") or True
 tokenizer = AutoTokenizer.from_pretrained("vikhyatk/moondream2")
 moondream = AutoModelForCausalLM.from_pretrained(
     "vikhyatk/moondream-next",
-    revision="591ff5569240caf61126be6b080ff5c9370b87d4",
     trust_remote_code=True,
     torch_dtype=torch.float16,
     device_map={"": "cuda"},
     attn_implementation="flash_attention_2",
-    token=auth_token,
+    token=auth_token if IN_SPACES else None,
 )
 moondream.eval()
 
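With the flash-attn install now gated on `IN_SPACES`, a local run still passes `attn_implementation="flash_attention_2"` to `from_pretrained`, which fails if flash-attn is absent. A hypothetical guard (not in the commit, and assuming the model's remote code also accepts `"sdpa"`):

```python
# Hypothetical local fallback: use flash-attn only when it is importable,
# otherwise fall back to PyTorch's built-in SDPA attention.
# Reuses auth_token / IN_SPACES defined earlier in app.py.
import torch
from transformers import AutoModelForCausalLM

try:
    import flash_attn  # noqa: F401  (presence check only)

    attn_impl = "flash_attention_2"
except ImportError:
    attn_impl = "sdpa"

moondream = AutoModelForCausalLM.from_pretrained(
    "vikhyatk/moondream-next",
    trust_remote_code=True,
    torch_dtype=torch.float16,
    device_map={"": "cuda"},
    attn_implementation=attn_impl,
    token=auth_token if IN_SPACES else None,
)
```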
@@ -36,17 +63,20 @@ moondream.eval()
 @spaces.GPU(duration=10)
 def answer_question(img, prompt):
     if img is None:
-        yield ""
+        yield "", ""
         return
 
     image_embeds = moondream.encode_image(img)
     streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
+    queue = Queue()
     thread = Thread(
         target=moondream.answer_question,
         kwargs={
             "image_embeds": image_embeds,
             "question": prompt,
             "tokenizer": tokenizer,
+            "allow_cot": True,
+            "result_queue": queue,
             "streamer": streamer,
         },
     )
@@ -55,7 +85,11 @@ def answer_question(img, prompt):
     buffer = ""
     for new_text in streamer:
         buffer += new_text
-        yield buffer.strip()
+        yield buffer.strip(), "Thinking..."
+
+    answer = queue.get()
+    # yield answer["answer"], answer["thought"]
+    yield answer["answer"], ""
 
 
 @spaces.GPU(duration=10)
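The handler above pairs two channels: a `TextIteratorStreamer` for token-by-token text and a `Queue` that the model thread fills with the final structured answer (via the new `result_queue` kwarg) once chain-of-thought decoding finishes. A self-contained toy of the same thread-plus-two-queues pattern, with the hypothetical `fake_generate` standing in for `moondream.answer_question`:

```python
# Toy version of the streaming pattern: a worker thread streams text
# fragments through one queue, then deposits the final structured answer
# on a second queue that the generator blocks on at the end.
from queue import Queue
from threading import Thread


def fake_generate(chunks, result):
    # Stand-in for moondream.answer_question.
    for piece in ["The ", "cat ", "sits."]:
        chunks.put(piece)
    chunks.put(None)  # end-of-stream sentinel
    result.put({"answer": "The cat sits.", "thought": "..."})


def answer():
    chunks, result = Queue(), Queue()
    Thread(target=fake_generate, args=(chunks, result)).start()
    buffer = ""
    while (piece := chunks.get()) is not None:
        buffer += piece
        yield buffer.strip(), "Thinking..."  # partial text for the UI
    final = result.get()
    yield final["answer"], ""  # final answer replaces the streamed text


for text, status in answer():
    print(status, text)
```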
@@ -84,6 +118,10 @@ def caption(img, mode):
 
 @spaces.GPU(duration=10)
 def detect(img, object):
+    if img is None:
+        yield "", gr.update(visible=False, value=None)
+        return
+
     w, h = img.size
     if w > 768 or h > 768:
         img = Resize(768)(img)
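One caveat on the pre-existing resize guard shown as context above: with an int argument, torchvision's `Resize(768)` matches the *shorter* image edge to 768 (it also accepts a `max_size` keyword to bound the longer edge), so a long, thin image that trips the `w > 768 or h > 768` check can actually be upscaled. If the intent is to cap the longer side, a minimal PIL sketch (an assumption about intent, not what the code does):

```python
# Assumption about intent, not the commit's behavior: bound the longer
# side at 768 px instead of matching the shorter side to it.
from PIL import Image


def cap_longer_side(img: Image.Image, limit: int = 768) -> Image.Image:
    w, h = img.size
    if max(w, h) <= limit:
        return img
    scale = limit / max(w, h)
    return img.resize((round(w * scale), round(h * scale)))
```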
@@ -97,7 +135,7 @@ def detect(img, object):
         width=3,
     )
 
-    return gr.update(visible=True, value=img)
+    yield f"{len(objs)} detected", gr.update(visible=True, value=img)
 
 
 js = """
@@ -173,22 +211,27 @@ js = """
 
     // Dark mode colors
     var darkColors = {
+        /*
         1: '#4a5788', // Deep blue-grey
         2: '#4c5a8d',
         3: '#4e5d92',
         4: '#506097',
         5: '#52639c' // Brighter blue-grey
+        */
+        1: 'rgb(16, 20, 32)',
+        2: 'rgb(21, 25, 39)',
+        3: 'rgb(26, 30, 46)',
+        4: 'rgb(31, 35, 53)',
+        5: 'rgb(36, 40, 60)'
     };
 
     return isDarkMode ? darkColors[age] : lightColors[age];
 }
 
 function draw() {
-    // var isDarkMode = document.body.classList.contains('dark');
-    var isDarkMode = false;
-    ctx.fillStyle = isDarkMode ? '#333' : '#f0f0f0';
+    var isDarkMode = document.body.classList.contains('dark');
+    ctx.fillStyle = isDarkMode ? '#0b0f19' : '#f0f0f0';
     ctx.fillRect(0, 0, canvas.width, canvas.height);
-
     for (var i = 0; i < cols; i++) {
         for (var j = 0; j < rows; j++) {
             if (grid[i][j]) {
@@ -220,6 +263,10 @@ css = """
     font-size: 1.4rem !important;
 }
 
+.chain-of-thought span p {
+    opacity: 0.7 !important;
+}
+
 #life-canvas {
     position: fixed;
     top: 0;
@@ -262,9 +309,9 @@ with gr.Blocks(title="moondream vl (new)", css=css, js=js) as demo:
             )
             submit = gr.Button("Submit")
             img = gr.Image(type="pil", label="Upload an Image")
-            submit.click(answer_question, [img, prompt], output)
-            prompt.submit(answer_question, [img, prompt], output)
-            img.change(answer_question, [img, prompt], output)
+            submit.click(answer_question, [img, prompt], [output, thought])
+            prompt.submit(answer_question, [img, prompt], [output, thought])
+            img.change(answer_question, [img, prompt], [output, thought])
         elif mode == "Caption":
             with gr.Group():
                 with gr.Row():
@@ -278,7 +325,7 @@ with gr.Blocks(title="moondream vl (new)", css=css, js=js) as demo:
             img = gr.Image(type="pil", label="Upload an Image")
             submit.click(caption, [img, caption_mode], output)
             img.change(caption, [img, caption_mode], output)
-        else:
+        elif mode == "Detect":
             with gr.Group():
                 with gr.Row():
                     prompt = gr.Textbox(
@@ -288,18 +335,21 @@ with gr.Blocks(title="moondream vl (new)", css=css, js=js) as demo:
             )
             submit = gr.Button("Submit")
             img = gr.Image(type="pil", label="Upload an Image")
-            submit.click(detect, [img, prompt], ann)
-            prompt.submit(detect, [img, prompt], ann)
-            img.change(detect, [img, prompt], ann)
+            submit.click(detect, [img, prompt], [thought, ann])
+            prompt.submit(detect, [img, prompt], [thought, ann])
+            img.change(detect, [img, prompt], [thought, ann])
+        else:
+            gr.Markdown("Coming soon!")
 
     with gr.Column():
-        output = gr.Markdown(
-            label="Response",
-            elem_classes=["output-text"],
-        )
-        ann = gr.Image(visible=False, show_label=False)
-
-        mode_radio.change(lambda: "", [], output)
-        mode_radio.change(lambda: gr.update(visible=False, value=None), [], ann)
+        thought = gr.Markdown(elem_classes=["chain-of-thought"])
+        output = gr.Markdown(label="Response", elem_classes=["output-text"])
+        ann = gr.Image(visible=False)
+
+        mode_radio.change(
+            lambda: ("", "", gr.update(visible=False, value=None)),
+            [],
+            [output, thought, ann],
+        )
 
 demo.queue().launch()
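The event wiring relies on a Gradio pattern worth spelling out: when a handler is a generator and the event has a list of outputs, each yielded tuple updates the listed components positionally on every step, which is how `answer_question` drives `output` and `thought` together. A minimal runnable sketch of just that pattern, with a toy handler in place of the app's:

```python
# Toy handler (not the app's): a generator yielding an (output, thought)
# tuple per step; Gradio applies each tuple element to the matching
# component in the outputs list on every yield.
import time

import gradio as gr


def stream(prompt):
    buffer = ""
    for word in prompt.split():
        buffer += word + " "
        time.sleep(0.1)
        yield buffer.strip(), "Thinking..."
    yield buffer.strip(), ""  # clear the status line when done


with gr.Blocks() as demo:
    prompt = gr.Textbox(label="Prompt")
    thought = gr.Markdown()
    output = gr.Markdown()
    prompt.submit(stream, [prompt], [output, thought])

demo.queue().launch()
```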
 