preston-cell committed
Commit 29f7833 · verified · 1 Parent(s): 68bf04e

Update app.py

Files changed (1)
  1. app.py +39 -41
app.py CHANGED
@@ -1,86 +1,83 @@
 import gradio as gr
+import torch
+import numpy as np
 from transformers import (
     pipeline,
-    AutoProcessor,
     AutoModelForCausalLM,
+    AutoProcessor,
     AutoTokenizer,
     GenerationConfig,
-    TextStreamer,
-    set_seed
 )
-import torch
-import numpy as np
-import requests
-import io
 from datasets import load_dataset
+from PIL import Image
 
-# Set device
+# Set device and dtype
 device = "cuda" if torch.cuda.is_available() else "cpu"
 torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
-set_seed(42)
+print(f"Device set to use {device}")
 
-# Image Captioning (BLIP)
-caption_model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base", device=0 if torch.cuda.is_available() else -1)
+# Load image captioning model
+caption_model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base", device=device)
 
-# Text-to-Speech (SpeechT5)
-synthesiser = pipeline("text-to-speech", model="microsoft/speecht5_tts", device=0 if torch.cuda.is_available() else -1)
+# Load text-to-speech model
+synthesiser = pipeline("text-to-speech", model="microsoft/speecht5_tts")
 
-# OCR using Florence-2-base
-ocr_model = AutoModelForCausalLM.from_pretrained("microsoft/Florence-2-base", torch_dtype=torch_dtype, trust_remote_code=True).to(device)
+# Load Florence-2-base for OCR
+ocr_model = AutoModelForCausalLM.from_pretrained(
+    "microsoft/Florence-2-base", torch_dtype=torch_dtype, trust_remote_code=True
+).to(device)
 ocr_processor = AutoProcessor.from_pretrained("microsoft/Florence-2-base", trust_remote_code=True)
 
-# Load SmallDoge for context generation
-doge_tokenizer = AutoTokenizer.from_pretrained("SmallDoge/Doge-320M-Instruct")
+# Load Doge model for context generation
+doge_tokenizer = AutoTokenizer.from_pretrained("SmallDoge/Doge-320M-Instruct", trust_remote_code=True)
 doge_model = AutoModelForCausalLM.from_pretrained("SmallDoge/Doge-320M-Instruct", trust_remote_code=True).to(device)
-doge_config = GenerationConfig(
+generation_config = GenerationConfig(
     max_new_tokens=100,
     use_cache=True,
     do_sample=True,
     temperature=0.8,
     top_p=0.9,
-    repetition_penalty=1.0
+    repetition_penalty=1.0,
 )
 
-# Load speaker embedding (600-dim)
-SPEAKER_EMBEDDING_URL = "https://huggingface.co/datasets/Matthijs/cmu-arctic-xvectors/resolve/main/spkemb/fn0012.npy"
-response = requests.get(SPEAKER_EMBEDDING_URL)
-buffer = io.BytesIO(response.content)
-speaker_embedding = torch.tensor(np.load(buffer, allow_pickle=True)).unsqueeze(0) # Shape: [1, 600]
+# Load speaker embedding
+embedding_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+speaker_embedding = torch.tensor(embedding_dataset[7306]["xvector"]).unsqueeze(0)
 if speaker_embedding.shape[1] < 600:
     raise ValueError("No suitable speaker embedding of at least 600 dimensions found.")
 
-# Main function
+
 def process_image(image):
     try:
-        # Generate caption
+        # Generate image caption
         caption = caption_model(image)[0]['generated_text']
 
-        # OCR extraction
+        # Run OCR using Florence-2-base
         inputs = ocr_processor(text="<OCR>", images=image, return_tensors="pt").to(device, torch_dtype)
         generated_ids = ocr_model.generate(
             input_ids=inputs["input_ids"],
             pixel_values=inputs["pixel_values"],
-            max_new_tokens=512,
+            max_new_tokens=1024,
             do_sample=False,
-            num_beams=3
+            num_beams=3,
         )
         extracted_text = ocr_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
 
-        # Generate context using SmallDoge
-        prompt = f"Determine the context of this image based on the caption and extracted text.\nCaption: {caption}\nExtracted text: {extracted_text}\nContext:"
+        # Generate context using Doge model
+        prompt = f"Determine the context of this image based on the caption and extracted text. Caption: {caption}. Extracted text: {extracted_text}. Context:"
         conversation = [{"role": "user", "content": prompt}]
-        inputs = doge_tokenizer.apply_chat_template(
+        doge_inputs = doge_tokenizer.apply_chat_template(
             conversation=conversation, tokenize=True, return_tensors="pt"
         ).to(device)
 
-        outputs = doge_model.generate(inputs, generation_config=doge_config)
-        context = doge_tokenizer.decode(outputs[0], skip_special_tokens=True)
-
-        # Text-to-Speech
-        speech = synthesiser(
-            context,
-            forward_params={"speaker_embeddings": speaker_embedding}
+        doge_outputs = doge_model.generate(
+            input_ids=doge_inputs,
+            generation_config=generation_config,
         )
+        context = doge_tokenizer.decode(doge_outputs[0], skip_special_tokens=True)
+
+        # Generate speech from context
+        speech = synthesiser(context, forward_params={"speaker_embeddings": speaker_embedding})
         audio = np.array(speech["audio"])
         rate = speech["sampling_rate"]
 
@@ -89,6 +86,7 @@ def process_image(image):
     except Exception as e:
         return None, f"Error: {str(e)}", "", ""
 
+
 # Gradio UI
 iface = gr.Interface(
     fn=process_image,
@@ -99,8 +97,8 @@ iface = gr.Interface(
         gr.Textbox(label="Extracted Text (OCR)"),
         gr.Textbox(label="Generated Context")
     ],
-    title="SeeSay Contextualizer",
-    description="Upload an image to generate a caption, extract text with Florence-2-base, contextualize with Doge-320M-Instruct, and hear it with SpeechT5."
+    title="SeeSay Contextualizer with Doge and Florence-2",
+    description="Upload an image to generate a caption, extract text, determine the context using Doge, and convert context to speech."
 )
 
 iface.launch(share=True)
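
For reference, a minimal standalone sketch (not part of this commit) of the speaker-embedding and text-to-speech path that the update switches to. It mirrors the calls in the new app.py and assumes torch, datasets, and transformers are installed; the test sentence is arbitrary.

import torch
from datasets import load_dataset
from transformers import pipeline

# Same TTS pipeline and xvector dataset used in the updated app.py
synthesiser = pipeline("text-to-speech", model="microsoft/speecht5_tts")
embedding_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embedding = torch.tensor(embedding_dataset[7306]["xvector"]).unsqueeze(0)

# Synthesise a short test phrase; the result is a dict with "audio" (numpy array) and "sampling_rate"
speech = synthesiser("Hello from SeeSay.", forward_params={"speaker_embeddings": speaker_embedding})
print(speech["sampling_rate"], len(speech["audio"]))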