preston-cell committed on
Commit 1b524ac · verified · 1 Parent(s): 29f7833

Update app.py

Files changed (1)
  1. app.py +41 -42
app.py CHANGED
@@ -1,58 +1,61 @@
  import gradio as gr
- import torch
- import numpy as np
  from transformers import (
-     pipeline,
-     AutoModelForCausalLM,
-     AutoProcessor,
-     AutoTokenizer,
-     GenerationConfig,
  )
  from datasets import load_dataset
  from PIL import Image

- # Set device and dtype
- device = "cuda" if torch.cuda.is_available() else "cpu"
  torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
- print(f"Device set to use {device}")
-
- # Load image captioning model
- caption_model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base", device=device)

- # Load text-to-speech model
- synthesiser = pipeline("text-to-speech", model="microsoft/speecht5_tts")

- # Load Florence-2-base for OCR
  ocr_model = AutoModelForCausalLM.from_pretrained(
-     "microsoft/Florence-2-base", torch_dtype=torch_dtype, trust_remote_code=True
  ).to(device)
  ocr_processor = AutoProcessor.from_pretrained("microsoft/Florence-2-base", trust_remote_code=True)

- # Load Doge model for context generation
- doge_tokenizer = AutoTokenizer.from_pretrained("SmallDoge/Doge-320M-Instruct", trust_remote_code=True)
- doge_model = AutoModelForCausalLM.from_pretrained("SmallDoge/Doge-320M-Instruct", trust_remote_code=True).to(device)
  generation_config = GenerationConfig(
      max_new_tokens=100,
      use_cache=True,
      do_sample=True,
      temperature=0.8,
      top_p=0.9,
-     repetition_penalty=1.0,
  )

- # Load speaker embedding
- embedding_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
- speaker_embedding = torch.tensor(embedding_dataset[7306]["xvector"]).unsqueeze(0)
- if speaker_embedding.shape[1] < 600:
-     raise ValueError("No suitable speaker embedding of at least 600 dimensions found.")

  def process_image(image):
      try:
-         # Generate image caption
          caption = caption_model(image)[0]['generated_text']

-         # Run OCR using Florence-2-base
          inputs = ocr_processor(text="<OCR>", images=image, return_tensors="pt").to(device, torch_dtype)
          generated_ids = ocr_model.generate(
              input_ids=inputs["input_ids"],
@@ -63,21 +66,18 @@ def process_image(image):
          )
          extracted_text = ocr_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

-         # Generate context using Doge model
          prompt = f"Determine the context of this image based on the caption and extracted text. Caption: {caption}. Extracted text: {extracted_text}. Context:"
          conversation = [{"role": "user", "content": prompt}]
-         doge_inputs = doge_tokenizer.apply_chat_template(
-             conversation=conversation, tokenize=True, return_tensors="pt"
-         ).to(device)
-
-         doge_outputs = doge_model.generate(
-             input_ids=doge_inputs,
              generation_config=generation_config,
          )
-         context = doge_tokenizer.decode(doge_outputs[0], skip_special_tokens=True)

-         # Generate speech from context
-         speech = synthesiser(context, forward_params={"speaker_embeddings": speaker_embedding})
          audio = np.array(speech["audio"])
          rate = speech["sampling_rate"]

@@ -86,8 +86,7 @@ def process_image(image):
      except Exception as e:
          return None, f"Error: {str(e)}", "", ""

-
- # Gradio UI
  iface = gr.Interface(
      fn=process_image,
      inputs=gr.Image(type='pil', label="Upload an Image"),
@@ -97,8 +96,8 @@ iface = gr.Interface(
          gr.Textbox(label="Extracted Text (OCR)"),
          gr.Textbox(label="Generated Context")
      ],
-     title="SeeSay Contextualizer with Doge and Florence-2",
-     description="Upload an image to generate a caption, extract text, determine the context using Doge, and convert context to speech."
  )

  iface.launch(share=True)
 
@@ -1,58 +1,61 @@
  import gradio as gr
  from transformers import (
+     pipeline, AutoModelForCausalLM, AutoProcessor, AutoTokenizer,
+     GenerationConfig, TextStreamer
  )
  from datasets import load_dataset
+ import torch
+ import numpy as np
  from PIL import Image

+ # Device and dtype setup
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
  torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

+ # Caption model (BLIP)
+ caption_model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")

+ # Florence-2-base model for OCR
  ocr_model = AutoModelForCausalLM.from_pretrained(
+     "microsoft/Florence-2-base",
+     torch_dtype=torch_dtype,
+     trust_remote_code=True
  ).to(device)
  ocr_processor = AutoProcessor.from_pretrained("microsoft/Florence-2-base", trust_remote_code=True)

+ # Doge model for context generation
+ context_tokenizer = AutoTokenizer.from_pretrained("SmallDoge/Doge-320M-Instruct")
+ context_model = AutoModelForCausalLM.from_pretrained("SmallDoge/Doge-320M-Instruct", trust_remote_code=True).to(device)
  generation_config = GenerationConfig(
      max_new_tokens=100,
      use_cache=True,
      do_sample=True,
      temperature=0.8,
      top_p=0.9,
+     repetition_penalty=1.0
  )
+ streamer = TextStreamer(tokenizer=context_tokenizer, skip_prompt=True)

+ # SpeechT5 for TTS
+ tts = pipeline("text-to-speech", model="microsoft/speecht5_tts")

+ # Load valid 600-dim speaker embedding
+ speaker_embedding = None
+ embedding_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+ for item in embedding_dataset:
+     vec = torch.tensor(item["xvector"])
+     if vec.shape[0] == 600:
+         speaker_embedding = vec.unsqueeze(0)
+         break
+ if speaker_embedding is None:
+     raise ValueError("No suitable speaker embedding of 600 dimensions found.")

  def process_image(image):
      try:
+         # Generate caption
          caption = caption_model(image)[0]['generated_text']

+         # Extract text using Florence-2
          inputs = ocr_processor(text="<OCR>", images=image, return_tensors="pt").to(device, torch_dtype)
          generated_ids = ocr_model.generate(
              input_ids=inputs["input_ids"],
@@ -63,21 +66,18 @@ def process_image(image):
          )
          extracted_text = ocr_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

+         # Generate context using Doge
          prompt = f"Determine the context of this image based on the caption and extracted text. Caption: {caption}. Extracted text: {extracted_text}. Context:"
          conversation = [{"role": "user", "content": prompt}]
+         context_inputs = context_tokenizer.apply_chat_template(conversation=conversation, tokenize=True, return_tensors="pt").to(device)
+         output = context_model.generate(
+             context_inputs,
              generation_config=generation_config,
          )
+         context = context_tokenizer.decode(output[0], skip_special_tokens=True)

+         # Convert context to speech
+         speech = tts(context, forward_params={"speaker_embeddings": speaker_embedding})
          audio = np.array(speech["audio"])
          rate = speech["sampling_rate"]

@@ -86,8 +86,7 @@ def process_image(image):
      except Exception as e:
          return None, f"Error: {str(e)}", "", ""

+ # Gradio Interface
  iface = gr.Interface(
      fn=process_image,
      inputs=gr.Image(type='pil', label="Upload an Image"),
@@ -97,8 +96,8 @@ iface = gr.Interface(
      gr.Textbox(label="Extracted Text (OCR)"),
      gr.Textbox(label="Generated Context")
      ],
+     title="SeeSay Contextualizer",
+     description="Upload an image to generate a caption, extract text, generate context with Doge, and convert to speech."
  )

  iface.launch(share=True)
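Both versions of the speaker-embedding setup hinge on the dimensionality of the Matthijs/cmu-arctic-xvectors vectors: the previous code took index 7306 and asserted at least 600 dimensions, while the new loop only keeps a vector whose length is exactly 600. A minimal sketch for checking that assumption before launching the Space is below; it is not part of the commit and reuses only the dataset ID, split, and index that already appear in this diff.

# Sketch: inspect the xvector dimensionality that the new filter depends on.
import torch
from datasets import load_dataset

embedding_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
vec = torch.tensor(embedding_dataset[7306]["xvector"])  # index used by the previous version
print(vec.shape)  # the new loop accepts an embedding only if its length is exactly 600

If no entry has that exact length, speaker_embedding remains None and the ValueError raised at startup is the expected outcome.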