preston-cell committed
Commit eda8103 · verified · 1 parent: f6399dc

Update app.py

Files changed (1): app.py (+5 -13)
app.py CHANGED
@@ -19,13 +19,8 @@ torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
 # Load image captioning model (BLIP)
 caption_model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
 
-# Load TTS model (SpeechT5) and static speaker embedding
+# Load TTS model (SpeechT5)
 synthesiser = pipeline("text-to-speech", model="microsoft/speecht5_tts")
-SPEAKER_EMBEDDING_URL = "https://huggingface.co/microsoft/speecht5_tts/resolve/main/speaker_embeddings/spkemb_female.pt"
-response = requests.get(SPEAKER_EMBEDDING_URL)
-buffer = io.BytesIO(response.content)
-speaker_embedding = torch.load(buffer)
-speaker_embedding = speaker_embedding.unsqueeze(0)
 
 # Load Florence-2-base for OCR
 ocr_model = AutoModelForCausalLM.from_pretrained("microsoft/Florence-2-base", torch_dtype=torch_dtype, trust_remote_code=True).to(device)
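Note on the removed block: the SpeechT5 examples in transformers pin a voice with an xvector from the Matthijs/cmu-arctic-xvectors dataset rather than a .pt file fetched from the model repo. A minimal sketch of that documented route, in case a fixed voice is wanted again (the split and index come from the transformers docs, not from anything this Space ships):

from datasets import load_dataset
import torch

# xvector speaker embeddings used in the SpeechT5 documentation examples
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embedding = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)  # shape (1, 512)

Passed back in via forward_params={"speaker_embeddings": speaker_embedding}, this restores a stable voice without the brittle URL download.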
@@ -76,11 +71,8 @@ def process_image(image):
 
     context = doge_tokenizer.decode(output_ids[0], skip_special_tokens=True)
 
-    # Text-to-speech
-    speech = synthesiser(
-        context,
-        forward_params={"speaker_embeddings": speaker_embedding}
-    )
+    # Text-to-speech (no speaker embedding required)
+    speech = synthesiser(context)
 
     audio = np.array(speech["audio"])
     rate = speech["sampling_rate"]
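The pipeline returns a dict with "audio" (a float array) and "sampling_rate", which the unchanged lines above hand to Gradio. To sanity-check the synthesis outside the app, a small sketch, assuming the no-embedding call works as this commit expects and that soundfile is installed (neither is guaranteed by the Space's requirements):

import numpy as np
import soundfile as sf

speech = synthesiser("A short test sentence.")  # same call shape as the new code
sf.write("speech.wav", np.array(speech["audio"]), samplerate=speech["sampling_rate"])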
@@ -100,8 +92,8 @@ iface = gr.Interface(
         gr.Textbox(label="Extracted Text (OCR)"),
         gr.Textbox(label="Generated Context")
     ],
-    title="SeeSay Contextualizer with Doge-320M & Florence-2-base",
+    title="SeeSay",
     description="Upload an image to generate a caption, extract text, convert context to speech, and understand the image using Doge-320M."
 )
 
-iface.launch(share=True)
+iface.launch(share=True)
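On the re-added last line: when an app runs inside a Hugging Face Space it is already served at a public URL and Gradio does not create share links there, so in that environment the plain form below should behave the same (an observation about Gradio's Spaces behavior, not something this commit asserts):

iface.launch()  # on Spaces, equivalent; share=True only matters for exposing a local run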
 