qqwjq1981 committed
Commit 16a689c · verified · 1 Parent(s): e3e1200

Update app.py

Files changed (1):
  app.py +24 -42
app.py CHANGED
@@ -1,45 +1,27 @@
-
+# app.py
 import gradio as gr
-from TTS.api import TTS
-import tempfile
-
-# Load XTTS model
-tts = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2")
-
-def clone_voice(text, speaker_wav):
-    if speaker_wav is None:
-        return None, "Please upload a reference audio file."
-
-    # Save uploaded audio
-    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
-        tmp.write(speaker_wav.read())
-        speaker_path = tmp.name
-
-    output_path = "cloned_output.wav"
-
-    # Generate audio using XTTS
-    tts.tts_to_file(
-        text=text,
-        speaker_wav=speaker_path,
-        language="zh",
-        file_path=output_path
-    )
-
-    return output_path, "Voice cloning completed successfully."
+from transformers import AutoModelForVision2Seq, AutoProcessor
+import torch
+from PIL import Image
+
+# Load Qwen-VL model and processor
+model_id = "Qwen/Qwen-VL-Chat"
+processor = AutoProcessor.from_pretrained(model_id)
+model = AutoModelForVision2Seq.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto")
+
+# Inference function
+def ocr_with_qwen(image):
+    prompt = "<|im_start|>system\nYou are a helpful assistant. Extract all text from the image and output only the text.<|im_end|>\n<|im_start|>user\n"
+    inputs = processor(images=image, text=prompt, return_tensors="pt").to(model.device)
+    outputs = model.generate(**inputs, max_new_tokens=512)
+    result = processor.batch_decode(outputs, skip_special_tokens=True)[0]
+    return result.strip()
 
 # Gradio UI
-demo = gr.Interface(
-    fn=clone_voice,
-    inputs=[
-        gr.Textbox(label="Enter Chinese Text"),
-        gr.Audio(label="Upload Speaker Audio (.wav)", type="file")
-    ],
-    outputs=[
-        gr.Audio(label="Cloned Output Audio"),
-        gr.Textbox(label="Status")
-    ],
-    title="XTTS Voice Cloning Demo",
-    description="Upload reference audio and enter Chinese text to generate speech in cloned voice."
-)
-
-demo.launch()
+gr.Interface(
+    fn=ocr_with_qwen,
+    inputs=gr.Image(type="pil", label="Upload Image (test.jpg)"),
+    outputs=gr.Textbox(label="Extracted Text"),
+    title="OCR with Qwen2.5-VL",
+    description="Upload an image to extract text using Qwen-VL model."
+).launch()
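
Note: as committed, the new app.py is unlikely to run. Qwen/Qwen-VL-Chat ships custom modeling code, so it must be loaded with trust_remote_code=True via AutoModelForCausalLM (AutoModelForVision2Seq does not cover it); the hand-built <|im_start|> prompt never closes the user turn and never references the uploaded image; batch_decode over the full generate() output echoes the prompt back into the result; and the UI title says "Qwen2.5-VL" although the code loads Qwen-VL-Chat. Below is a minimal sketch of the same demo written against the model's documented chat API. The prompt wording and labels are carried over from the commit; the from_list_format/model.chat calls and the type="filepath" input are assumptions taken from the Qwen-VL-Chat model card, not from this repo.

# app.py — minimal sketch, assuming Qwen/Qwen-VL-Chat's custom (trust_remote_code) API
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "Qwen/Qwen-VL-Chat"

# Qwen-VL-Chat ships custom modeling code, so trust_remote_code=True is
# required and the model is loaded as a causal LM, not a Vision2Seq model.
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
).eval()

def ocr_with_qwen(image_path):
    # from_list_format interleaves image references and text into the chat
    # format the model expects; the {"image": ...} entry takes a path or URL.
    query = tokenizer.from_list_format([
        {"image": image_path},
        {"text": "Extract all text from the image and output only the text."},
    ])
    # model.chat builds the prompt and decodes internally, so the returned
    # string contains only the model's reply, not the echoed prompt.
    response, _ = model.chat(tokenizer, query=query, history=None)
    return response.strip()

# Gradio UI
gr.Interface(
    fn=ocr_with_qwen,
    inputs=gr.Image(type="filepath", label="Upload Image"),
    outputs=gr.Textbox(label="Extracted Text"),
    title="OCR with Qwen-VL-Chat",
    description="Upload an image to extract text using the Qwen-VL-Chat model.",
).launch()

Passing type="filepath" makes Gradio hand the function a path on disk, which is what from_list_format expects; with type="pil", as in the commit, the image would first have to be written to a temporary file.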