Update app.py
app.py
CHANGED
@@ -4,13 +4,13 @@ from transformers import (
     AutoProcessor,
     AutoModelForCausalLM,
     AutoTokenizer,
-    GenerationConfig,
-    TextStreamer
+    GenerationConfig
 )
 import torch
 import numpy as np
 from PIL import Image
 import requests
+import io
 
 # Device setup
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
@@ -21,16 +21,17 @@ caption_model = pipeline("image-to-text", model="Salesforce/blip-image-captionin
 
 # Load TTS model (SpeechT5) and static speaker embedding
 synthesiser = pipeline("text-to-speech", model="microsoft/speecht5_tts")
-
 SPEAKER_EMBEDDING_URL = "https://huggingface.co/microsoft/speecht5_tts/resolve/main/speaker_embeddings/spkemb_female.pt"
-
+response = requests.get(SPEAKER_EMBEDDING_URL)
+buffer = io.BytesIO(response.content)
+speaker_embedding = torch.load(buffer)
 speaker_embedding = speaker_embedding.unsqueeze(0)
 
 # Load Florence-2-base for OCR
 ocr_model = AutoModelForCausalLM.from_pretrained("microsoft/Florence-2-base", torch_dtype=torch_dtype, trust_remote_code=True).to(device)
 ocr_processor = AutoProcessor.from_pretrained("microsoft/Florence-2-base", trust_remote_code=True)
 
-# Load Doge-320M-Instruct
+# Load Doge-320M-Instruct for contextual reasoning
 doge_tokenizer = AutoTokenizer.from_pretrained("SmallDoge/Doge-320M-Instruct")
 doge_model = AutoModelForCausalLM.from_pretrained("SmallDoge/Doge-320M-Instruct", trust_remote_code=True).to(device)
 
@@ -45,10 +46,10 @@ generation_config = GenerationConfig(
 
 def process_image(image):
     try:
-        #
+        # Captioning
         caption = caption_model(image)[0]['generated_text']
 
-        #
+        # OCR
         inputs = ocr_processor(text="<OCR>", images=image, return_tensors="pt").to(device, torch_dtype)
         generated_ids = ocr_model.generate(
             input_ids=inputs["input_ids"],
@@ -59,7 +60,7 @@ def process_image(image):
         )
         extracted_text = ocr_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
 
-        #
+        # Context generation
         prompt = f"Determine the context of this image based on the caption and extracted text. Caption: {caption}. Extracted text: {extracted_text}. Context:"
         conversation = [{"role": "user", "content": prompt}]
         doge_inputs = doge_tokenizer.apply_chat_template(
@@ -75,7 +76,7 @@ def process_image(image):
 
         context = doge_tokenizer.decode(output_ids[0], skip_special_tokens=True)
 
-        #
+        # Text-to-speech
        speech = synthesiser(
             context,
             forward_params={"speaker_embeddings": speaker_embedding}
@@ -89,7 +90,7 @@ def process_image(image):
     except Exception as e:
         return None, f"Error: {str(e)}", "", ""
 
-# Gradio
+# Gradio UI
 iface = gr.Interface(
     fn=process_image,
     inputs=gr.Image(type='pil', label="Upload an Image"),
@@ -100,7 +101,7 @@ iface = gr.Interface(
         gr.Textbox(label="Generated Context")
     ],
     title="SeeSay Contextualizer with Doge-320M & Florence-2-base",
-    description="Upload an image to generate a caption, extract text,
+    description="Upload an image to generate a caption, extract text, convert context to speech, and understand the image using Doge-320M."
 )
 
-iface.launch(share=True)
+iface.launch(share=True)
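Editor's notes on the change follow. The download of spkemb_female.pt is the heart of this commit. A minimal sketch of a slightly hardened version is below, with the x-vector fallback used in the Hugging Face SpeechT5 example; the timeout, raise_for_status(), map_location, and the Matthijs/cmu-arctic-xvectors fallback are editorial suggestions, not part of the committed code.

# Hedged sketch, not part of the commit: hardened download of the speaker
# embedding, with the dataset-based fallback from the SpeechT5 documentation.
import io
import requests
import torch

SPEAKER_EMBEDDING_URL = "https://huggingface.co/microsoft/speecht5_tts/resolve/main/speaker_embeddings/spkemb_female.pt"

try:
    response = requests.get(SPEAKER_EMBEDDING_URL, timeout=30)
    response.raise_for_status()  # fail loudly on a 404 instead of torch.load erroring later
    speaker_embedding = torch.load(io.BytesIO(response.content), map_location="cpu")
except Exception:
    # Fallback used in the Hugging Face SpeechT5 example: a 512-dim x-vector
    from datasets import load_dataset
    ds = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    speaker_embedding = torch.tensor(ds[7306]["xvector"])

speaker_embedding = speaker_embedding.unsqueeze(0)  # shape (1, 512) expected by SpeechT5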
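The OCR hunk only shows part of the ocr_model.generate() call. For reference, a sketch of the usage pattern from the Florence-2-base model card is below; it reuses the ocr_processor, ocr_model, device, and torch_dtype names from this file, and the pixel_values / post_process_generation steps are the model card's pattern, not necessarily what the unchanged parts of this Space do.

# Hedged sketch following the Florence-2-base model card, not this Space's exact code.
def run_ocr(image):
    # The OCR task prompt needs both input_ids and pixel_values
    inputs = ocr_processor(text="<OCR>", images=image, return_tensors="pt").to(device, torch_dtype)
    generated_ids = ocr_model.generate(
        input_ids=inputs["input_ids"],
        pixel_values=inputs["pixel_values"],
        max_new_tokens=1024,
        num_beams=3,
    )
    # The model card decodes with special tokens kept, then post-processes per task
    generated_text = ocr_processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
    parsed = ocr_processor.post_process_generation(
        generated_text, task="<OCR>", image_size=(image.width, image.height)
    )
    return parsed["<OCR>"]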
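Finally, the diff does not show how process_image returns the synthesised audio. A hedged sketch of the usual way a SpeechT5 pipeline result is handed to a gr.Audio output is below; the helper name and the (sample_rate, waveform) ordering are assumptions, not code from this commit.

# Hedged sketch: converting the text-to-speech pipeline output into the
# (sample_rate, waveform) tuple that gr.Audio accepts.
def synthesise_for_gradio(text: str):
    speech = synthesiser(text, forward_params={"speaker_embeddings": speaker_embedding})
    # The pipeline returns {"audio": np.ndarray, "sampling_rate": int}
    return speech["sampling_rate"], speech["audio"]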
|