qqwjq1981 committed
Commit 16a689c · verified · 1 Parent(s): e3e1200

Update app.py

Files changed (1):
  app.py +24 -42
app.py CHANGED
@@ -1,45 +1,27 @@
-
+# app.py
 import gradio as gr
-from TTS.api import TTS
-import tempfile
-
-# Load XTTS model
-tts = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2")
-
-def clone_voice(text, speaker_wav):
-    if speaker_wav is None:
-        return None, "Please upload a reference audio file."
-
-    # Save uploaded audio
-    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
-        tmp.write(speaker_wav.read())
-        speaker_path = tmp.name
-
-    output_path = "cloned_output.wav"
-
-    # Generate audio using XTTS
-    tts.tts_to_file(
-        text=text,
-        speaker_wav=speaker_path,
-        language="zh",
-        file_path=output_path
-    )
-
-    return output_path, "Voice cloning completed successfully."
+from transformers import AutoModelForVision2Seq, AutoProcessor
+import torch
+from PIL import Image
+
+# Load Qwen-VL model and processor
+model_id = "Qwen/Qwen-VL-Chat"
+processor = AutoProcessor.from_pretrained(model_id)
+model = AutoModelForVision2Seq.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto")
+
+# Inference function
+def ocr_with_qwen(image):
+    prompt = "<|im_start|>system\nYou are a helpful assistant. Extract all text from the image and output only the text.<|im_end|>\n<|im_start|>user\n"
+    inputs = processor(images=image, text=prompt, return_tensors="pt").to(model.device)
+    outputs = model.generate(**inputs, max_new_tokens=512)
+    result = processor.batch_decode(outputs, skip_special_tokens=True)[0]
+    return result.strip()
 
 # Gradio UI
-demo = gr.Interface(
-    fn=clone_voice,
-    inputs=[
-        gr.Textbox(label="Enter Chinese Text"),
-        gr.Audio(label="Upload Speaker Audio (.wav)", type="file")
-    ],
-    outputs=[
-        gr.Audio(label="Cloned Output Audio"),
-        gr.Textbox(label="Status")
-    ],
-    title="XTTS Voice Cloning Demo",
-    description="Upload reference audio and enter Chinese text to generate speech in cloned voice."
-)
-
-demo.launch()
+gr.Interface(
+    fn=ocr_with_qwen,
+    inputs=gr.Image(type="pil", label="Upload Image (test.jpg)"),
+    outputs=gr.Textbox(label="Extracted Text"),
+    title="OCR with Qwen2.5-VL",
+    description="Upload an image to extract text using Qwen-VL model."
+).launch()
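
Note: as committed, the new app.py is unlikely to run. Qwen/Qwen-VL-Chat ships custom modeling code, so it must be loaded with trust_remote_code=True via AutoModelForCausalLM (AutoModelForVision2Seq does not cover it); the hand-built <|im_start|> prompt never closes the user turn and never references the uploaded image; batch_decode over the full generate() output echoes the prompt back into the result; and the UI title says "Qwen2.5-VL" although the code loads Qwen-VL-Chat. Below is a minimal sketch of the same demo written against the model's documented chat API. The prompt wording and labels are carried over from the commit; the from_list_format/model.chat calls and the type="filepath" input are assumptions taken from the Qwen-VL-Chat model card, not from this repo.

# app.py — minimal sketch, assuming Qwen/Qwen-VL-Chat's custom (trust_remote_code) API
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "Qwen/Qwen-VL-Chat"

# Qwen-VL-Chat ships custom modeling code, so trust_remote_code=True is
# required and the model is loaded as a causal LM, not a Vision2Seq model.
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
).eval()

def ocr_with_qwen(image_path):
    # from_list_format interleaves image references and text into the chat
    # format the model expects; the {"image": ...} entry takes a path or URL.
    query = tokenizer.from_list_format([
        {"image": image_path},
        {"text": "Extract all text from the image and output only the text."},
    ])
    # model.chat builds the prompt and decodes internally, so the returned
    # string contains only the model's reply, not the echoed prompt.
    response, _ = model.chat(tokenizer, query=query, history=None)
    return response.strip()

# Gradio UI
gr.Interface(
    fn=ocr_with_qwen,
    inputs=gr.Image(type="filepath", label="Upload Image"),
    outputs=gr.Textbox(label="Extracted Text"),
    title="OCR with Qwen-VL-Chat",
    description="Upload an image to extract text using the Qwen-VL-Chat model.",
).launch()

Passing type="filepath" makes Gradio hand the function a path on disk, which is what from_list_format expects; with type="pil", as in the commit, the image would first have to be written to a temporary file.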