Spaces:

Xalphinions
/

watermelon2

Sleeping

App Files Community

Xalphinions committed on Apr 6

Commit

945fdb4

verified ·

1 Parent(s): 83d5b74

Upload folder using huggingface_hub

Browse files

Files changed (1) hide show

app.py +282 -150

app.py CHANGED Viewed

@@ -78,165 +78,297 @@ def app_process_audio_data(waveform, sample_rate):
 # Similarly for images, but let's import the original one
 from preprocess import process_image_data
-# Define prediction function
-def predict_sweetness(audio, image, model_path):
-    """Predict sweetness of a watermelon from audio and image input"""
-    try:
-        # Now check CUDA availability inside the GPU-decorated function
-        if torch.cuda.is_available():
-            device = torch.device("cuda")
-            print(f"\033[92mINFO\033[0m: CUDA is available. Using device: {device}")
-        else:
-            device = torch.device("cpu")
-            print(f"\033[92mINFO\033[0m: CUDA is not available. Using device: {device}")
-        # Load model inside the function to ensure it's on the correct device
-        model = WatermelonModel().to(device)
-        model.load_state_dict(torch.load(model_path, map_location=device))
-        model.eval()
-        print(f"\033[92mINFO\033[0m: Loaded model from {model_path}")
-        # Debug information about input types
-        print(f"\033[92mDEBUG\033[0m: Audio input type: {type(audio)}")
-        print(f"\033[92mDEBUG\033[0m: Audio input shape/length: {len(audio)}")
-        print(f"\033[92mDEBUG\033[0m: Image input type: {type(image)}")
-        if isinstance(image, np.ndarray):
-            print(f"\033[92mDEBUG\033[0m: Image input shape: {image.shape}")
-        # Handle different audio input formats
-        if isinstance(audio, tuple) and len(audio) == 2:
-            # Standard Gradio format: (sample_rate, audio_data)
-            sample_rate, audio_data = audio
-            print(f"\033[92mDEBUG\033[0m: Audio sample rate: {sample_rate}")
-            print(f"\033[92mDEBUG\033[0m: Audio data shape: {audio_data.shape}")
-        elif isinstance(audio, tuple) and len(audio) > 2:
-            # Sometimes Gradio returns (sample_rate, audio_data, other_info...)
-            sample_rate, audio_data = audio[0], audio[-1]
-            print(f"\033[92mDEBUG\033[0m: Audio sample rate: {sample_rate}")
-            print(f"\033[92mDEBUG\033[0m: Audio data shape: {audio_data.shape}")
-        elif isinstance(audio, str):
-            # Direct path to audio file
-            audio_data, sample_rate = torchaudio.load(audio)
-            print(f"\033[92mDEBUG\033[0m: Loaded audio from path with shape: {audio_data.shape}")
-        else:
-            return f"Error: Unsupported audio format. Got {type(audio)}"
-        # Create a temporary file path for the audio and image
-        temp_dir = "temp"
-        os.makedirs(temp_dir, exist_ok=True)
-        temp_audio_path = os.path.join(temp_dir, "temp_audio.wav")
-        temp_image_path = os.path.join(temp_dir, "temp_image.jpg")
-        # Import necessary libraries
-        from PIL import Image
-        # Audio handling - direct processing from the data in memory
-        if isinstance(audio_data, np.ndarray):
-            # Convert numpy array to tensor
-            print(f"\033[92mDEBUG\033[0m: Converting numpy audio with shape {audio_data.shape} to tensor")
-            audio_tensor = torch.tensor(audio_data).float()
-            # Handle different audio dimensions
-            if audio_data.ndim == 1:
-                # Single channel audio
-                audio_tensor = audio_tensor.unsqueeze(0)
-            elif audio_data.ndim == 2:
-                # Ensure channels are first dimension
-                if audio_data.shape[0] > audio_data.shape[1]:
-                    # More rows than columns, probably (samples, channels)
-                    audio_tensor = torch.tensor(audio_data.T).float()
-        else:
-            # Already a tensor
-            audio_tensor = audio_data.float()
-        print(f"\033[92mDEBUG\033[0m: Audio tensor shape before processing: {audio_tensor.shape}")
-        # Skip saving/loading and process directly
-        mfcc = app_process_audio_data(audio_tensor, sample_rate)
-        print(f"\033[92mDEBUG\033[0m: MFCC tensor shape after processing: {mfcc.shape if mfcc is not None else None}")
-        # Image handling
-        if isinstance(image, np.ndarray):
-            print(f"\033[92mDEBUG\033[0m: Converting numpy image with shape {image.shape} to PIL")
-            pil_image = Image.fromarray(image)
-            pil_image.save(temp_image_path)
-            print(f"\033[92mDEBUG\033[0m: Saved image to {temp_image_path}")
-        elif isinstance(image, str):
-            # If image is already a path
-            temp_image_path = image
-            print(f"\033[92mDEBUG\033[0m: Using provided image path: {temp_image_path}")
-        else:
-            return f"Error: Unsupported image format. Got {type(image)}"
-        # Process image
-        print(f"\033[92mDEBUG\033[0m: Loading and preprocessing image from {temp_image_path}")
-        image_tensor = torchvision.io.read_image(temp_image_path)
-        print(f"\033[92mDEBUG\033[0m: Loaded image shape: {image_tensor.shape}")
-        image_tensor = image_tensor.float()
-        processed_image = process_image_data(image_tensor)
-        print(f"\033[92mDEBUG\033[0m: Processed image shape: {processed_image.shape if processed_image is not None else None}")
-        # Add batch dimension for inference and move to device
-        if mfcc is not None:
-            mfcc = mfcc.unsqueeze(0).to(device)
-            print(f"\033[92mDEBUG\033[0m: Final MFCC shape with batch dimension: {mfcc.shape}")
-        if processed_image is not None:
-            processed_image = processed_image.unsqueeze(0).to(device)
-            print(f"\033[92mDEBUG\033[0m: Final image shape with batch dimension: {processed_image.shape}")
-        # Run inference
-        print(f"\033[92mDEBUG\033[0m: Running inference on device: {device}")
-        if mfcc is not None and processed_image is not None:
-            with torch.no_grad():
-                sweetness = model(mfcc, processed_image)
-                print(f"\033[92mDEBUG\033[0m: Prediction successful: {sweetness.item()}")
-        else:
-            return "Error: Failed to process inputs. Please check the debug logs."
-        # Format the result
-        if sweetness is not None:
-            result = f"Predicted Sweetness: {sweetness.item():.2f}/13"
-            # Add a qualitative description
-            if sweetness.item() < 9:
-                result += "\n\nThis watermelon is not very sweet. You might want to choose another one."
-            elif sweetness.item() < 10:
-                result += "\n\nThis watermelon has moderate sweetness."
-            elif sweetness.item() < 11:
-                result += "\n\nThis watermelon is sweet! A good choice."
             else:
-                result += "\n\nThis watermelon is very sweet! Excellent choice!"
-            return result
-        else:
-            return "Error: Could not predict sweetness. Please try again with different inputs."
-    except Exception as e:
-        import traceback
-        error_msg = f"Error: {str(e)}\n\n"
-        error_msg += traceback.format_exc()
-        print(f"\033[91mERR!\033[0m: {error_msg}")
-        return error_msg
-# Apply GPU decorator if available in Gradio Spaces environment
-if HAS_SPACES:
-    predict_sweetness_gpu = spaces.GPU(predict_sweetness)
-    print("\033[92mINFO\033[0m: GPU optimization enabled for prediction function")
 else:
-    predict_sweetness_gpu = predict_sweetness
 def create_app(model_path):
     """Create and launch the Gradio interface"""
     # Define the prediction function with model path
     def predict_fn(audio, image):
-        if HAS_SPACES:
-            # Use GPU-optimized function if available
-            return predict_sweetness_gpu(audio, image, model_path)
-        else:
-            # Use regular function otherwise
-            return predict_sweetness(audio, image, model_path)
     # Create Gradio interface
     with gr.Blocks(title="Watermelon Sweetness Predictor", theme=gr.themes.Soft()) as interface:

 # Similarly for images, but let's import the original one
 from preprocess import process_image_data
+# Apply GPU decorator directly to the function if available
+if HAS_SPACES:
+    # Using the decorator directly on the function definition
+    @spaces.GPU
+    def predict_sweetness(audio, image, model_path):
+        """Function with GPU acceleration"""
+        try:
+            # Now check CUDA availability inside the GPU-decorated function
+            if torch.cuda.is_available():
+                device = torch.device("cuda")
+                print(f"\033[92mINFO\033[0m: CUDA is available. Using device: {device}")
+            else:
+                device = torch.device("cpu")
+                print(f"\033[92mINFO\033[0m: CUDA is not available. Using device: {device}")
+            # Load model inside the function to ensure it's on the correct device
+            model = WatermelonModel().to(device)
+            model.load_state_dict(torch.load(model_path, map_location=device))
+            model.eval()
+            print(f"\033[92mINFO\033[0m: Loaded model from {model_path}")
+            # Debug information about input types
+            print(f"\033[92mDEBUG\033[0m: Audio input type: {type(audio)}")
+            print(f"\033[92mDEBUG\033[0m: Audio input shape/length: {len(audio)}")
+            print(f"\033[92mDEBUG\033[0m: Image input type: {type(image)}")
+            if isinstance(image, np.ndarray):
+                print(f"\033[92mDEBUG\033[0m: Image input shape: {image.shape}")
+            # Handle different audio input formats
+            if isinstance(audio, tuple) and len(audio) == 2:
+                # Standard Gradio format: (sample_rate, audio_data)
+                sample_rate, audio_data = audio
+                print(f"\033[92mDEBUG\033[0m: Audio sample rate: {sample_rate}")
+                print(f"\033[92mDEBUG\033[0m: Audio data shape: {audio_data.shape}")
+            elif isinstance(audio, tuple) and len(audio) > 2:
+                # Sometimes Gradio returns (sample_rate, audio_data, other_info...)
+                sample_rate, audio_data = audio[0], audio[-1]
+                print(f"\033[92mDEBUG\033[0m: Audio sample rate: {sample_rate}")
+                print(f"\033[92mDEBUG\033[0m: Audio data shape: {audio_data.shape}")
+            elif isinstance(audio, str):
+                # Direct path to audio file
+                audio_data, sample_rate = torchaudio.load(audio)
+                print(f"\033[92mDEBUG\033[0m: Loaded audio from path with shape: {audio_data.shape}")
             else:
+                return f"Error: Unsupported audio format. Got {type(audio)}"
+            # Create a temporary file path for the audio and image
+            temp_dir = "temp"
+            os.makedirs(temp_dir, exist_ok=True)
+            temp_audio_path = os.path.join(temp_dir, "temp_audio.wav")
+            temp_image_path = os.path.join(temp_dir, "temp_image.jpg")
+            # Import necessary libraries
+            from PIL import Image
+            # Audio handling - direct processing from the data in memory
+            if isinstance(audio_data, np.ndarray):
+                # Convert numpy array to tensor
+                print(f"\033[92mDEBUG\033[0m: Converting numpy audio with shape {audio_data.shape} to tensor")
+                audio_tensor = torch.tensor(audio_data).float()
+                # Handle different audio dimensions
+                if audio_data.ndim == 1:
+                    # Single channel audio
+                    audio_tensor = audio_tensor.unsqueeze(0)
+                elif audio_data.ndim == 2:
+                    # Ensure channels are first dimension
+                    if audio_data.shape[0] > audio_data.shape[1]:
+                        # More rows than columns, probably (samples, channels)
+                        audio_tensor = torch.tensor(audio_data.T).float()
+            else:
+                # Already a tensor
+                audio_tensor = audio_data.float()
+            print(f"\033[92mDEBUG\033[0m: Audio tensor shape before processing: {audio_tensor.shape}")
+            # Skip saving/loading and process directly
+            mfcc = app_process_audio_data(audio_tensor, sample_rate)
+            print(f"\033[92mDEBUG\033[0m: MFCC tensor shape after processing: {mfcc.shape if mfcc is not None else None}")
+            # Image handling
+            if isinstance(image, np.ndarray):
+                print(f"\033[92mDEBUG\033[0m: Converting numpy image with shape {image.shape} to PIL")
+                pil_image = Image.fromarray(image)
+                pil_image.save(temp_image_path)
+                print(f"\033[92mDEBUG\033[0m: Saved image to {temp_image_path}")
+            elif isinstance(image, str):
+                # If image is already a path
+                temp_image_path = image
+                print(f"\033[92mDEBUG\033[0m: Using provided image path: {temp_image_path}")
+            else:
+                return f"Error: Unsupported image format. Got {type(image)}"
+            # Process image
+            print(f"\033[92mDEBUG\033[0m: Loading and preprocessing image from {temp_image_path}")
+            image_tensor = torchvision.io.read_image(temp_image_path)
+            print(f"\033[92mDEBUG\033[0m: Loaded image shape: {image_tensor.shape}")
+            image_tensor = image_tensor.float()
+            processed_image = process_image_data(image_tensor)
+            print(f"\033[92mDEBUG\033[0m: Processed image shape: {processed_image.shape if processed_image is not None else None}")
+            # Add batch dimension for inference and move to device
+            if mfcc is not None:
+                mfcc = mfcc.unsqueeze(0).to(device)
+                print(f"\033[92mDEBUG\033[0m: Final MFCC shape with batch dimension: {mfcc.shape}")
+            if processed_image is not None:
+                processed_image = processed_image.unsqueeze(0).to(device)
+                print(f"\033[92mDEBUG\033[0m: Final image shape with batch dimension: {processed_image.shape}")
+            # Run inference
+            print(f"\033[92mDEBUG\033[0m: Running inference on device: {device}")
+            if mfcc is not None and processed_image is not None:
+                with torch.no_grad():
+                    sweetness = model(mfcc, processed_image)
+                    print(f"\033[92mDEBUG\033[0m: Prediction successful: {sweetness.item()}")
+            else:
+                return "Error: Failed to process inputs. Please check the debug logs."
+            # Format the result
+            if sweetness is not None:
+                result = f"Predicted Sweetness: {sweetness.item():.2f}/13"
+                # Add a qualitative description
+                if sweetness.item() < 9:
+                    result += "\n\nThis watermelon is not very sweet. You might want to choose another one."
+                elif sweetness.item() < 10:
+                    result += "\n\nThis watermelon has moderate sweetness."
+                elif sweetness.item() < 11:
+                    result += "\n\nThis watermelon is sweet! A good choice."
+                else:
+                    result += "\n\nThis watermelon is very sweet! Excellent choice!"
+                return result
+            else:
+                return "Error: Could not predict sweetness. Please try again with different inputs."
+        except Exception as e:
+            import traceback
+            error_msg = f"Error: {str(e)}\n\n"
+            error_msg += traceback.format_exc()
+            print(f"\033[91mERR!\033[0m: {error_msg}")
+            return error_msg
+    print("\033[92mINFO\033[0m: GPU-accelerated prediction function created with @spaces.GPU decorator")
 else:
+    # Regular version without GPU decorator for non-Spaces environments
+    def predict_sweetness(audio, image, model_path):
+        """Predict sweetness of a watermelon from audio and image input"""
+        try:
+            # Check for device - will be CPU in this case
+            device = torch.device("cpu")
+            print(f"\033[92mINFO\033[0m: Using device: {device}")
+            # Load model inside the function
+            model = WatermelonModel().to(device)
+            model.load_state_dict(torch.load(model_path, map_location=device))
+            model.eval()
+            print(f"\033[92mINFO\033[0m: Loaded model from {model_path}")
+            # Rest of function identical - processing code
+            # Debug information about input types
+            print(f"\033[92mDEBUG\033[0m: Audio input type: {type(audio)}")
+            print(f"\033[92mDEBUG\033[0m: Audio input shape/length: {len(audio)}")
+            print(f"\033[92mDEBUG\033[0m: Image input type: {type(image)}")
+            if isinstance(image, np.ndarray):
+                print(f"\033[92mDEBUG\033[0m: Image input shape: {image.shape}")
+            # Handle different audio input formats
+            if isinstance(audio, tuple) and len(audio) == 2:
+                # Standard Gradio format: (sample_rate, audio_data)
+                sample_rate, audio_data = audio
+                print(f"\033[92mDEBUG\033[0m: Audio sample rate: {sample_rate}")
+                print(f"\033[92mDEBUG\033[0m: Audio data shape: {audio_data.shape}")
+            elif isinstance(audio, tuple) and len(audio) > 2:
+                # Sometimes Gradio returns (sample_rate, audio_data, other_info...)
+                sample_rate, audio_data = audio[0], audio[-1]
+                print(f"\033[92mDEBUG\033[0m: Audio sample rate: {sample_rate}")
+                print(f"\033[92mDEBUG\033[0m: Audio data shape: {audio_data.shape}")
+            elif isinstance(audio, str):
+                # Direct path to audio file
+                audio_data, sample_rate = torchaudio.load(audio)
+                print(f"\033[92mDEBUG\033[0m: Loaded audio from path with shape: {audio_data.shape}")
+            else:
+                return f"Error: Unsupported audio format. Got {type(audio)}"
+            # Create a temporary file path for the audio and image
+            temp_dir = "temp"
+            os.makedirs(temp_dir, exist_ok=True)
+            temp_audio_path = os.path.join(temp_dir, "temp_audio.wav")
+            temp_image_path = os.path.join(temp_dir, "temp_image.jpg")
+            # Import necessary libraries
+            from PIL import Image
+            # Audio handling - direct processing from the data in memory
+            if isinstance(audio_data, np.ndarray):
+                # Convert numpy array to tensor
+                print(f"\033[92mDEBUG\033[0m: Converting numpy audio with shape {audio_data.shape} to tensor")
+                audio_tensor = torch.tensor(audio_data).float()
+                # Handle different audio dimensions
+                if audio_data.ndim == 1:
+                    # Single channel audio
+                    audio_tensor = audio_tensor.unsqueeze(0)
+                elif audio_data.ndim == 2:
+                    # Ensure channels are first dimension
+                    if audio_data.shape[0] > audio_data.shape[1]:
+                        # More rows than columns, probably (samples, channels)
+                        audio_tensor = torch.tensor(audio_data.T).float()
+            else:
+                # Already a tensor
+                audio_tensor = audio_data.float()
+            print(f"\033[92mDEBUG\033[0m: Audio tensor shape before processing: {audio_tensor.shape}")
+            # Skip saving/loading and process directly
+            mfcc = app_process_audio_data(audio_tensor, sample_rate)
+            print(f"\033[92mDEBUG\033[0m: MFCC tensor shape after processing: {mfcc.shape if mfcc is not None else None}")
+            # Image handling
+            if isinstance(image, np.ndarray):
+                print(f"\033[92mDEBUG\033[0m: Converting numpy image with shape {image.shape} to PIL")
+                pil_image = Image.fromarray(image)
+                pil_image.save(temp_image_path)
+                print(f"\033[92mDEBUG\033[0m: Saved image to {temp_image_path}")
+            elif isinstance(image, str):
+                # If image is already a path
+                temp_image_path = image
+                print(f"\033[92mDEBUG\033[0m: Using provided image path: {temp_image_path}")
+            else:
+                return f"Error: Unsupported image format. Got {type(image)}"
+            # Process image
+            print(f"\033[92mDEBUG\033[0m: Loading and preprocessing image from {temp_image_path}")
+            image_tensor = torchvision.io.read_image(temp_image_path)
+            print(f"\033[92mDEBUG\033[0m: Loaded image shape: {image_tensor.shape}")
+            image_tensor = image_tensor.float()
+            processed_image = process_image_data(image_tensor)
+            print(f"\033[92mDEBUG\033[0m: Processed image shape: {processed_image.shape if processed_image is not None else None}")
+            # Add batch dimension for inference and move to device
+            if mfcc is not None:
+                mfcc = mfcc.unsqueeze(0).to(device)
+                print(f"\033[92mDEBUG\033[0m: Final MFCC shape with batch dimension: {mfcc.shape}")
+            if processed_image is not None:
+                processed_image = processed_image.unsqueeze(0).to(device)
+                print(f"\033[92mDEBUG\033[0m: Final image shape with batch dimension: {processed_image.shape}")
+            # Run inference
+            print(f"\033[92mDEBUG\033[0m: Running inference on device: {device}")
+            if mfcc is not None and processed_image is not None:
+                with torch.no_grad():
+                    sweetness = model(mfcc, processed_image)
+                    print(f"\033[92mDEBUG\033[0m: Prediction successful: {sweetness.item()}")
+            else:
+                return "Error: Failed to process inputs. Please check the debug logs."
+            # Format the result
+            if sweetness is not None:
+                result = f"Predicted Sweetness: {sweetness.item():.2f}/13"
+                # Add a qualitative description
+                if sweetness.item() < 9:
+                    result += "\n\nThis watermelon is not very sweet. You might want to choose another one."
+                elif sweetness.item() < 10:
+                    result += "\n\nThis watermelon has moderate sweetness."
+                elif sweetness.item() < 11:
+                    result += "\n\nThis watermelon is sweet! A good choice."
+                else:
+                    result += "\n\nThis watermelon is very sweet! Excellent choice!"
+                return result
+            else:
+                return "Error: Could not predict sweetness. Please try again with different inputs."
+        except Exception as e:
+            import traceback
+            error_msg = f"Error: {str(e)}\n\n"
+            error_msg += traceback.format_exc()
+            print(f"\033[91mERR!\033[0m: {error_msg}")
+            return error_msg
 def create_app(model_path):
     """Create and launch the Gradio interface"""
     # Define the prediction function with model path
     def predict_fn(audio, image):
+        return predict_sweetness(audio, image, model_path)
     # Create Gradio interface
     with gr.Blocks(title="Watermelon Sweetness Predictor", theme=gr.themes.Soft()) as interface: