Bils committed on
Commit 0def226 · verified · 1 Parent(s): 60ed470

Update app.py

Files changed (1)
  1. app.py +22 -9
app.py CHANGED
@@ -8,31 +8,39 @@ from scipy.io.wavfile import write
 from diffusers import DiffusionPipeline
 from transformers import pipeline
 from pathlib import Path
+from PIL import Image  # <-- ADDED THIS IMPORT
+import io  # <-- ADDED THIS IMPORT
 
 load_dotenv()
 hf_token = os.getenv("HF_TKN")
 
 device_id = 0 if torch.cuda.is_available() else -1
 
+# Correctly initialize the modern, reliable captioning pipeline
 captioning_pipeline = pipeline(
     "image-to-text",
-    model="Salesforce/blip-image-captioning-large", #
+    model="Salesforce/blip-image-captioning-large",
     device=device_id
 )
 
+# Initialize the audio pipeline
 pipe = DiffusionPipeline.from_pretrained(
     "cvssp/audioldm2",
     use_auth_token=hf_token
 )
 
+
+# === THIS IS THE CORRECTED FUNCTION ===
 @spaces.GPU(duration=120)
-def analyze_image_with_free_model(image_file):
+def analyze_image_with_free_model(image_file_bytes):
     try:
-        with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as temp_file:
-            temp_file.write(image_file)
-            temp_image_path = temp_file.name
+        # No more temp files!
+        # Open the image data directly from memory using Pillow
+        image = Image.open(io.BytesIO(image_file_bytes))
 
-        results = captioning_pipeline(temp_image_path)
+        # Pass the Pillow Image object directly to the pipeline. This is the robust method.
+        results = captioning_pipeline(image)
+
         if not results or not isinstance(results, list):
             return "Error: Could not generate caption.", True
 
@@ -42,8 +50,10 @@ def analyze_image_with_free_model(image_file):
         return caption, False
 
     except Exception as e:
+        print(f"ERROR in analyze_image_with_free_model: {e}")  # Print error to logs
         return f"Error analyzing image: {e}", True
 
+
 @spaces.GPU(duration=120)
 def get_audioldm_from_caption(caption):
     try:
@@ -64,6 +74,7 @@ def get_audioldm_from_caption(caption):
         print(f"Error generating audio from caption: {e}")
         return None
 
+# --- Gradio UI (No changes needed here) ---
 css = """
 #col-container{
     margin: 0 auto;
@@ -116,9 +127,11 @@ with gr.Blocks(css=css) as demo:
     This app is a testament to the creative possibilities that emerge when technology meets art.
     Enjoy exploring the auditory landscape of your images!
     """)
-
-    def update_caption(image_file):
-        description, _ = analyze_image_with_free_model(image_file)
+
+    # --- Gradio event handlers (I've updated the function called here) ---
+    def update_caption(image_file_bytes):
+        # We pass the bytes from the uploader directly to our corrected function
+        description, _ = analyze_image_with_free_model(image_file_bytes)
         return description
 
     def generate_sound(description):
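
For reference, a minimal end-to-end sketch of the corrected flow outside Gradio. The model IDs come from the diff above; everything else is an assumption: the input file example.jpg and output file output.wav are placeholders, and the AudioLDM2 call parameters (num_inference_steps, audio_length_in_s) and 16 kHz sample rate are illustrative, since the body of get_audioldm_from_caption is not shown in this diff.

import io

import torch
from PIL import Image
from diffusers import DiffusionPipeline
from scipy.io.wavfile import write
from transformers import pipeline

device_id = 0 if torch.cuda.is_available() else -1

captioning_pipeline = pipeline(
    "image-to-text",
    model="Salesforce/blip-image-captioning-large",
    device=device_id,
)
pipe = DiffusionPipeline.from_pretrained("cvssp/audioldm2")

# Read raw bytes, as the updated update_caption handler expects from the uploader
with open("example.jpg", "rb") as f:
    image_bytes = f.read()

# Decode in memory instead of via a temp file (the core change in this commit)
image = Image.open(io.BytesIO(image_bytes))
results = captioning_pipeline(image)
caption = results[0]["generated_text"]  # image-to-text pipelines return [{"generated_text": ...}]
print(caption)

# Turn the caption into audio; these parameters are placeholders, not the app's values
audio = pipe(prompt=caption, num_inference_steps=50, audio_length_in_s=5.0).audios[0]
write("output.wav", 16000, audio)  # AudioLDM2 emits 16 kHz mono float32

Note that the bytes-based handler only works if the Gradio input component is configured to deliver raw bytes to update_caption; depending on how the component is set up elsewhere in app.py, it may deliver a file path or a PIL image instead.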