mrfakename committed (verified)
Commit 1ed1b9e · 1 Parent(s): baa5da7

Update app.py

Files changed (1):
  1. app.py (+108 -66)
app.py CHANGED
@@ -1,70 +1,112 @@
- from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
- from qwen_vl_utils import process_vision_info
- import spaces
  import gradio as gr
- # Default: Load the model on the available device(s)
- model = Qwen2VLForConditionalGeneration.from_pretrained(
-     "OS-Copilot/OS-Atlas-Base-7B", torch_dtype="auto", device_map="auto"
- )
- processor = AutoProcessor.from_pretrained("OS-Copilot/OS-Atlas-Base-7B")
- @spaces.GPU
- def run(image, message):
-     messages = [
-         {
-             "role": "user",
-             "content": [
-                 {
-                     "type": "image",
-                     "image": image,
-                 },
-                 {"type": "text", "text": message},
-             ],
-         }
-     ]
-
-
-     # Preparation for inference
-     text = processor.apply_chat_template(
-         messages, tokenize=False, add_generation_prompt=True
-     )
-     image_inputs, video_inputs = process_vision_info(messages)
-     inputs = processor(
-         text=[text],
-         images=image_inputs,
-         videos=video_inputs,
-         padding=True,
-         return_tensors="pt",
-     )
-     inputs = inputs.to("cuda")
-
-     # Inference: Generation of the output
-     generated_ids = model.generate(**inputs, max_new_tokens=128)
-
-     generated_ids_trimmed = [
-         out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
-     ]

-     output_text = processor.batch_decode(
-         generated_ids_trimmed, skip_special_tokens=False, clean_up_tokenization_spaces=False
      )
-     return output_text
-     # <|object_ref_start|>language switch<|object_ref_end|><|box_start|>(576,12),(592,42)<|box_end|><|im_end|>

- with gr.Blocks() as demo:
-     gr.Markdown("# Unofficial OS-Atlas demo")
-     image = gr.Image(label="Image", type="filepath")
-     text = gr.Textbox(label="Prompt")
-     btn = gr.Button("Generate", variant="primary")
-     output = gr.Textbox(interactive=False)
-     btn.click(run, inputs=[image, text], outputs=output)
-     examples = gr.Examples([
-         [
-             'examples/web_6f93090a-81f6-489e-bb35-1a2838b18c01.png',
-             'In this UI screenshot, what is the position of the element corresponding to the command "switch language of current page" (with bbox)?'
-         ],
-         [
-             'examples/web_dfacd48d-d2c2-492f-b94c-41e6a34ea99f.png',
-             'In this UI screenshot, what is the position of the top button (with bbox)?'
-         ]
-     ], inputs=[image, text])
- demo.queue().launch()
  import gradio as gr
+ import torch
+ from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
+ from threading import Thread
+ import spaces
+
+ class ChatInterface:
+     def __init__(self, model_name="deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"):
+         self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+         self.model = AutoModelForCausalLM.from_pretrained(
+             model_name,
+             torch_dtype=torch.float16,
+             device_map="auto"
+         )
+
+     def format_chat_prompt(self, message, history, system_message):
+         messages = [{"role": "system", "content": system_message}]
+
+         for user_msg, assistant_msg in history:
+             if user_msg:
+                 messages.append({"role": "user", "content": user_msg})
+             if assistant_msg:
+                 messages.append({"role": "assistant", "content": assistant_msg})
+
+         messages.append({"role": "user", "content": message})
+
+         # Format messages according to model's expected chat template
+         prompt = self.tokenizer.apply_chat_template(
+             messages,
+             tokenize=False,
+             add_generation_prompt=True
+         )
+         return prompt
+
+     @spaces.GPU
+     def generate_response(
+         self,
+         message,
+         history,
+         system_message,
+         max_tokens,
+         temperature,
+         top_p,
+     ):
+         prompt = self.format_chat_prompt(message, history, system_message)
+         inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
+
+         # Setup streamer
+         streamer = TextIteratorStreamer(
+             self.tokenizer,
+             timeout=10.0,
+             skip_prompt=True,
+             skip_special_tokens=True
+         )
+
+         # Generate in a separate thread to enable streaming
+         generation_kwargs = dict(
+             **inputs,  # unpack input_ids/attention_mask; passing inputs=inputs would hand generate() the whole BatchEncoding
+             streamer=streamer,
+             max_new_tokens=max_tokens,
+             temperature=temperature,
+             top_p=top_p,
+             do_sample=True,
+         )
+
+         thread = Thread(target=self.model.generate, kwargs=generation_kwargs)
+         thread.start()
+
+         # Stream the response
+         response = ""
+         for new_text in streamer:
+             response += new_text
+             yield response
+
+ def create_demo():
+     chat_interface = ChatInterface()

+     demo = gr.ChatInterface(
+         chat_interface.generate_response,
+         additional_inputs=[
+             gr.Textbox(
+                 value="You are a friendly Chatbot.",
+                 label="System message"
+             ),
+             gr.Slider(
+                 minimum=1,
+                 maximum=2048,
+                 value=512,
+                 step=1,
+                 label="Max new tokens"
+             ),
+             gr.Slider(
+                 minimum=0.1,
+                 maximum=4.0,
+                 value=0.7,
+                 step=0.1,
+                 label="Temperature"
+             ),
+             gr.Slider(
+                 minimum=0.1,
+                 maximum=1.0,
+                 value=0.95,
+                 step=0.05,
+                 label="Top-p (nucleus sampling)"
+             ),
+         ],
      )
+     return demo

+ if __name__ == "__main__":
+     demo = create_demo()
+     demo.launch()
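
Note: the new app.py streams tokens by running model.generate in a background thread and reading decoded text from a TextIteratorStreamer. Below is a minimal standalone sketch of that pattern, not part of the commit; it assumes the same DeepSeek-R1-Distill-Qwen-1.5B checkpoint and a local torch/transformers install, and trims the generation options for brevity.

# Standalone sketch (illustrative only) of the threaded streaming pattern used above.
import torch
from threading import Thread
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer

model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")

# Build a prompt with the model's chat template, as format_chat_prompt does.
prompt = tokenizer.apply_chat_template(
    [{"role": "user", "content": "Hello!"}],
    tokenize=False,
    add_generation_prompt=True,
)
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# generate() runs in a background thread and pushes decoded text into the
# streamer, so the caller can consume partial responses as they arrive.
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
Thread(target=model.generate, kwargs=dict(**inputs, streamer=streamer, max_new_tokens=128)).start()

for chunk in streamer:
    print(chunk, end="", flush=True)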