Spaces:

hevold
/

transkribering

Sleeping

App Files Files Community

hevold committed on Oct 29

Commit

d0351bc

verified ·

1 Parent(s): c3ffa44

Create app.py

Browse files

Files changed (1) hide show

app.py +103 -0

app.py ADDED Viewed

	@@ -0,0 +1,103 @@

+import gradio as gr
+from transformers import pipeline
+import numpy as np
+import os
+import shutil
+import tempfile
+# Install ffmpeg and pydub for audio extraction from video if needed
+!apt-get update -qq && apt-get install -qq -y ffmpeg
+!pip install pydub -q
+from pydub import AudioSegment
+# Initialize the transcription pipeline with a multilingual model.
+# Note: openai/whisper-large-v3 is a very large model and might cause OutOfMemoryError.
+# This try block covers the rest of the script (pipeline load, handler definition,
+# interface construction and launch); any exception is reported by the except at the end.
+try:
+    print("👂 Loading multilingual transcription pipeline with openai/whisper-large-v3...")
+    # Module-level pipeline object; the upload handler below calls it with an audio file path.
+    transcriber = pipeline(
+        "automatic-speech-recognition",
+        model="openai/whisper-large-v3",
+        return_timestamps=True, # Needed for long audio
+        device_map="auto" # Automatically chooses device
+    )
+    print("✅ Multilingual transcription pipeline loaded")
+    # Function to handle file upload, extract audio if necessary, and transcribe
+    def handle_upload_and_transcribe(file_obj):
+        """Handles uploaded file (audio or video), extracts audio, and transcribes."""
+        if file_obj is None:
+            return "Please upload an audio or video file."
+        input_path = file_obj # file_obj is already the file path string
+        output_audio_path = None
+        temp_dir = None # Initialize temp_dir to None
+        try:
+            # Check if the file is likely a video based on extension (a simple heuristic)
+            video_extensions = ['.mp4', '.avi', '.mov', '.mkv', '.webm']
+            is_video = any(input_path.lower().endswith(ext) for ext in video_extensions)
+            if is_video:
+                print(f"🎬 Detected video file: {input_path}. Extracting audio...")
+                # Use pydub and ffmpeg to extract audio
+                audio = AudioSegment.from_file(input_path)
+                # Create a temporary file for the extracted audio
+                temp_dir = tempfile.mkdtemp()
+                output_audio_path = os.path.join(temp_dir, "extracted_audio.wav")
+                audio.export(output_audio_path, format="wav")
+                print(f"🔊 Audio extracted to: {output_audio_path}")
+                audio_source_path = output_audio_path
+            else:
+                # Assume it's an audio file, use the original path
+                print(f"🎵 Detected audio file: {input_path}. Using directly for transcription.")
+                audio_source_path = input_path
+            # Now transcribe the audio source path
+            print(f" transcribe {audio_source_path}...")
+            transcription = transcriber(audio_source_path)
+            # Clean up temporary directory if audio was extracted and temp_dir was created
+            if temp_dir and os.path.exists(temp_dir):
+                 shutil.rmtree(temp_dir)
+                 print(f"🗑️ Cleaned up temporary directory {temp_dir}")
+            # The output format depends on return_timestamps. If True, it's a dict with 'text'.
+            if isinstance(transcription, dict) and 'text' in transcription:
+                 return transcription['text']
+            elif isinstance(transcription, list) and transcription:
+                 # Handle cases where output might be a list of dicts (e.g., without timestamps)
+                 return transcription[0].get('text', str(transcription)) # Return text from first item or string representation
+            else:
+                 return str(transcription) # Return string representation if format is unexpected
+        except Exception as e:
+            # Clean up temporary directory in case of error during transcription
+            if temp_dir and os.path.exists(temp_dir):
+                 shutil.rmtree(temp_dir)
+                 print(f"🗑️ Cleaned up temporary directory {temp_dir} after error")
+            return f"❌ Processing or Transcription failed: {e}"
+    # Create the Gradio interface
+    print("🚀 Creating Gradio interface...")
+    # The input is gr.Audio with type="filepath", so the handler receives the upload as a
+    # path string. gr.File was considered for broader input type support.
+    # NOTE(review): whether gr.Audio accepts video uploads depends on Gradio and the
+    # installed ffmpeg — confirm before relying on the video branch of the handler.
+    interface = gr.Interface(
+        fn=handle_upload_and_transcribe,
+        inputs=gr.Audio(type="filepath", label="Upload Audio or Video File"),
+        outputs=gr.Textbox(label="Transcription"),
+        title="Multilingual Audio/Video Transcription",
+        description="Upload an audio (.mp3, .wav, .m4a, etc.) or video (.mp4, .avi, etc.) file to get its transcription."
+    )
+    # Launch the interface
+    print("Starting Gradio interface...")
+    interface.launch(debug=True) # Set debug=True for more detailed error messages
+except Exception as e:
+    print(f"❌ Error initializing the transcription pipeline or Gradio interface: {e}")
+    print("Please check the model name and available resources.")
+    display({"error": f"Initialization failed: {e}"})