preston-cell committed
Commit f6399dc · verified · 1 Parent(s): 5f7e64b

Update app.py

Files changed (1)
  1. app.py +13 -12
app.py CHANGED
@@ -4,13 +4,13 @@ from transformers import (
     AutoProcessor,
     AutoModelForCausalLM,
     AutoTokenizer,
-    GenerationConfig,
-    TextStreamer
+    GenerationConfig
 )
 import torch
 import numpy as np
 from PIL import Image
 import requests
+import io
 
 # Device setup
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
@@ -21,16 +21,17 @@ caption_model = pipeline("image-to-text", model="Salesforce/blip-image-captionin
 
 # Load TTS model (SpeechT5) and static speaker embedding
 synthesiser = pipeline("text-to-speech", model="microsoft/speecht5_tts")
-
 SPEAKER_EMBEDDING_URL = "https://huggingface.co/microsoft/speecht5_tts/resolve/main/speaker_embeddings/spkemb_female.pt"
-speaker_embedding = torch.load(requests.get(SPEAKER_EMBEDDING_URL, stream=True).raw)
+response = requests.get(SPEAKER_EMBEDDING_URL)
+buffer = io.BytesIO(response.content)
+speaker_embedding = torch.load(buffer)
 speaker_embedding = speaker_embedding.unsqueeze(0)
 
 # Load Florence-2-base for OCR
 ocr_model = AutoModelForCausalLM.from_pretrained("microsoft/Florence-2-base", torch_dtype=torch_dtype, trust_remote_code=True).to(device)
 ocr_processor = AutoProcessor.from_pretrained("microsoft/Florence-2-base", trust_remote_code=True)
 
-# Load Doge-320M-Instruct model for contextual response generation
+# Load Doge-320M-Instruct for contextual reasoning
 doge_tokenizer = AutoTokenizer.from_pretrained("SmallDoge/Doge-320M-Instruct")
 doge_model = AutoModelForCausalLM.from_pretrained("SmallDoge/Doge-320M-Instruct", trust_remote_code=True).to(device)
 
@@ -45,10 +46,10 @@ generation_config = GenerationConfig(
 
 def process_image(image):
     try:
-        # Generate caption
+        # Captioning
        caption = caption_model(image)[0]['generated_text']
 
-        # Extract text using Florence-2
+        # OCR
        inputs = ocr_processor(text="<OCR>", images=image, return_tensors="pt").to(device, torch_dtype)
        generated_ids = ocr_model.generate(
            input_ids=inputs["input_ids"],
@@ -59,7 +60,7 @@ def process_image(image):
        )
        extracted_text = ocr_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
 
-        # Generate context using Doge model
+        # Context generation
        prompt = f"Determine the context of this image based on the caption and extracted text. Caption: {caption}. Extracted text: {extracted_text}. Context:"
        conversation = [{"role": "user", "content": prompt}]
        doge_inputs = doge_tokenizer.apply_chat_template(
@@ -75,7 +76,7 @@ def process_image(image):
 
        context = doge_tokenizer.decode(output_ids[0], skip_special_tokens=True)
 
-        # Convert context to speech
+        # Text-to-speech
        speech = synthesiser(
            context,
            forward_params={"speaker_embeddings": speaker_embedding}
@@ -89,7 +90,7 @@ def process_image(image):
    except Exception as e:
        return None, f"Error: {str(e)}", "", ""
 
-# Gradio Interface
+# Gradio UI
 iface = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type='pil', label="Upload an Image"),
@@ -100,7 +101,7 @@ iface = gr.Interface(
        gr.Textbox(label="Generated Context")
    ],
    title="SeeSay Contextualizer with Doge-320M & Florence-2-base",
-    description="Upload an image to generate a caption, extract text, create speech from context, and generate the context using Doge."
+    description="Upload an image to generate a caption, extract text, convert context to speech, and understand the image using Doge-320M."
 )
 
-iface.launch(share=True)
+iface.launch(share=True)
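
For reference, a minimal standalone sketch of the speaker-embedding loading path this commit switches to: download the full file into an in-memory buffer instead of handing the streamed raw response to torch.load, which expects a seekable file-like object. The URL is taken from app.py; the 1x512 x-vector shape noted at the end is an assumption, not something the app checks.

import io
import requests
import torch

# URL as used in app.py in this commit
SPEAKER_EMBEDDING_URL = "https://huggingface.co/microsoft/speecht5_tts/resolve/main/speaker_embeddings/spkemb_female.pt"

# Download fully, then load from an in-memory buffer; torch.load needs a
# seekable object, which a streamed requests response is not.
response = requests.get(SPEAKER_EMBEDDING_URL)
response.raise_for_status()
speaker_embedding = torch.load(io.BytesIO(response.content))

# Add a batch dimension, as app.py does before passing it to SpeechT5.
speaker_embedding = speaker_embedding.unsqueeze(0)
print(speaker_embedding.shape)  # expected: torch.Size([1, 512]) (assumption)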