cyrus28214 committed (unverified)
Commit 29bc91e · 1 Parent(s): 80cd182
Files changed (2)
  1. app.py +62 -50
  2. requirements.txt +3 -2
app.py CHANGED
@@ -1,43 +1,58 @@
 import gradio as gr
-import torch
-from PIL import Image
-from threading import Thread
-from transformers import AutoProcessor, AutoModelForVision2Seq, TextIteratorStreamer
+from transformers import AutoProcessor, AutoModelForImageTextToText, TextIteratorStreamer
 from transformers.image_utils import load_image
+from threading import Thread
+import torch
 import spaces
 
-device = "cuda" if torch.cuda.is_available() else "cpu"
-torch_dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
-
-MODEL_ID = "HuggingFaceTB/SmolVLM-256M-Instruct"
-processor = AutoProcessor.from_pretrained(MODEL_ID)
-model = AutoModelForVision2Seq.from_pretrained(
+MODEL_ID = "TIGER-Lab/VL-Rethinker-7B"
+processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
+model = AutoModelForImageTextToText.from_pretrained(
     MODEL_ID,
-    torch_dtype=torch_dtype,
-    trust_remote_code=True
-).to(device)
-
+    trust_remote_code=True,
+    torch_dtype=torch.bfloat16
+).to("cuda").eval()
 
 @spaces.GPU
-def respond(
-    message,
-    history: list[tuple[str, str]],
-    system_message,
-    max_tokens,
-    temperature,
-    top_p,
-):
-    text = message["text"]
-    files = message["files"]
+def model_inference(input_dict, history):
+    text = input_dict["text"]
+    files = input_dict["files"]
+
+    """
+    Create chat history
+    Example history value:
+    [
+        [('pixel.png',), None],
+        ['ignore this image. just say "hi" and nothing else', 'Hi!'],
+        ['just say "hi" and nothing else', 'Hi!']
+    ]
+    """
     all_images = []
     current_message_images = []
+    messages = []
 
-    messages = [{"role": "system", "content": system_message}]
+    for val in history:
+        if val[0]:
+            if isinstance(val[0], str):
+                messages.append({
+                    "role": "user",
+                    "content": [
+                        *[{"type": "image", "image": image} for image in current_message_images],
+                        {"type": "text", "text": val[0]},
+                    ],
+                })
+                current_message_images = []
 
-    print(message)
-    print(history)
+            else:
+                # Load images. These will be appended to the first user text message that comes after
+                current_message_images = [load_image(image) for image in val[0]]
+                all_images += current_message_images
 
+        if val[1]:
+            messages.append({"role": "assistant", "content": val[1]})
+
     current_message_images = [load_image(image) for image in files]
+    all_images += current_message_images
     messages.append({
         "role": "user",
         "content": [
@@ -45,14 +60,20 @@ def respond(
             {"type": "text", "text": text},
         ],
     })
+
+    #print(messages)
+
+    """
+    Generate and stream text
+    """
     prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     inputs = processor(
         text=[prompt],
-        images=current_message_images if current_message_images else None,
+        images=all_images if all_images else None,
        return_tensors="pt",
         padding=True,
-    ).to(device, torch_dtype)
-
+    ).to("cuda")
+
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024)
 
@@ -64,24 +85,15 @@
         buffer += new_text
         yield buffer
 
-demo = gr.ChatInterface(
-    respond,
-    multimodal=True,
-    additional_inputs=[
-        gr.Textbox(value="You are a helpful and friendly multimodal assistant. You can analyze images and answer questions about them.", label="System message"),
-        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(
-            minimum=0.1,
-            maximum=1.0,
-            value=0.95,
-            step=0.05,
-            label="Top-p (nucleus sampling)",
-        ),
-    ],
-    title="Chatbot",
-    description="Ask me anything or upload an image. This version uses AutoModel and AutoProcessor directly.",
-)
+with gr.Blocks() as demo:
+    gr.ChatInterface(
+        fn=model_inference,
+        description="# **VL-Rethinker-7B**",
+        fill_height=True,
+        textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"], file_count="multiple"),
+        stop_btn="Stop Generation",
+        multimodal=True,
+        cache_examples=False,
+    )
 
-if __name__ == "__main__":
-    demo.launch()
+demo.launch(debug=True)
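For reference, the new history-parsing loop in model_inference folds Gradio's multimodal history (image uploads arrive as tuples, text turns as strings) into the chat-template messages list. Below is a minimal standalone sketch of that conversion, not part of the commit: it reuses the example history value from the docstring and substitutes bare filenames for the PIL images that load_image() would return, so it runs with no files or model.

# Standalone sketch of the history -> messages conversion (illustrative only).
# Filenames stand in for the images load_image() would produce in the Space.
history = [
    [("pixel.png",), None],  # image-only turn, no assistant reply yet
    ['ignore this image. just say "hi" and nothing else', "Hi!"],
    ['just say "hi" and nothing else', "Hi!"],
]

messages = []
pending_images = []  # images waiting to be attached to the next text turn
for user_part, assistant_part in history:
    if user_part:
        if isinstance(user_part, str):
            messages.append({
                "role": "user",
                "content": [
                    *[{"type": "image", "image": img} for img in pending_images],
                    {"type": "text", "text": user_part},
                ],
            })
            pending_images = []
        else:
            # A tuple of uploaded files: hold them for the next text message.
            pending_images = list(user_part)
    if assistant_part:
        messages.append({"role": "assistant", "content": assistant_part})

# messages now alternates user/assistant turns, with pixel.png attached to
# the first text message that followed its upload.
print(messages)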
 
requirements.txt CHANGED
@@ -1,2 +1,3 @@
-huggingface_hub==0.33.0
-transformers==4.52.4
+huggingface_hub
+transformers
+torchvision