hevold committed on
Commit
d0351bc
·
verified ·
1 Parent(s): c3ffa44

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +103 -0
app.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Gradio app that transcribes uploaded audio or video files with Whisper.

ffmpeg and pydub must be provided by the runtime environment rather than
installed from inside this script: the notebook-style ``!apt-get`` / ``!pip``
shell escapes are not valid Python and make a plain ``app.py`` fail with a
SyntaxError. On Hugging Face Spaces, for example, list ``ffmpeg`` in
``packages.txt`` and ``pydub`` in ``requirements.txt``.
"""

import os
import shutil
import tempfile

import gradio as gr
import numpy as np  # kept from the original file; not referenced below
from pydub import AudioSegment
from transformers import pipeline

# File extensions treated as video (a simple heuristic on the path suffix).
VIDEO_EXTENSIONS = (".mp4", ".avi", ".mov", ".mkv", ".webm")

# The speech-recognition pipeline; assigned by main() once the model loads.
transcriber = None


def _extract_text(transcription):
    """Return the transcript text from a pipeline result.

    The result is expected to be a dict with a ``'text'`` key; a non-empty
    list whose first item is such a dict is also accepted. Anything else is
    returned as its string representation so the user still sees something.
    """
    if isinstance(transcription, dict) and "text" in transcription:
        return transcription["text"]
    if isinstance(transcription, list) and transcription:
        first = transcription[0]
        # Guard the .get() call: the first item is not guaranteed to be a dict.
        if isinstance(first, dict):
            return first.get("text", str(transcription))
    return str(transcription)


def _extract_audio_to_wav(video_path, out_dir):
    """Decode *video_path* with pydub/ffmpeg and write its audio as WAV in *out_dir*."""
    audio = AudioSegment.from_file(video_path)
    wav_path = os.path.join(out_dir, "extracted_audio.wav")
    audio.export(wav_path, format="wav")
    return wav_path


def _transcribe(audio_path):
    """Run the module-level ``transcriber`` on *audio_path* and return the text."""
    print(f" transcribe {audio_path}...")
    return _extract_text(transcriber(audio_path))


def handle_upload_and_transcribe(file_obj):
    """Transcribe an uploaded audio or video file.

    Args:
        file_obj: Path of the uploaded file as a string, or ``None`` when
            nothing was uploaded.

    Returns:
        The transcription text, a prompt to upload a file when *file_obj* is
        ``None``, or a message starting with "❌" describing the failure.
        Exceptions are reported in the returned string instead of being
        raised, so the UI always has text to show.
    """
    if file_obj is None:
        return "Please upload an audio or video file."

    input_path = file_obj  # file_obj is already the file path string

    try:
        if input_path.lower().endswith(VIDEO_EXTENSIONS):
            print(f"🎬 Detected video file: {input_path}. Extracting audio...")
            # The context manager removes the directory on success and on
            # error alike, replacing the duplicated manual cleanup.
            with tempfile.TemporaryDirectory() as temp_dir:
                wav_path = _extract_audio_to_wav(input_path, temp_dir)
                print(f"🔊 Audio extracted to: {wav_path}")
                return _transcribe(wav_path)

        # Not a known video extension: assume audio and use the path as is.
        print(f"🎵 Detected audio file: {input_path}. Using directly for transcription.")
        return _transcribe(input_path)
    except Exception as e:
        return f"❌ Processing or Transcription failed: {e}"


def main():
    """Load the Whisper pipeline, build the Gradio interface and launch it."""
    global transcriber

    if shutil.which("ffmpeg") is None:
        print("⚠️ ffmpeg was not found on PATH; video and compressed audio files may fail to decode.")

    try:
        # Note: openai/whisper-large-v3 is a very large model and might cause
        # an out-of-memory error on small machines.
        print("👂 Loading multilingual transcription pipeline with openai/whisper-large-v3...")
        transcriber = pipeline(
            "automatic-speech-recognition",
            model="openai/whisper-large-v3",
            return_timestamps=True,  # Needed for long audio
            device_map="auto",  # Automatically chooses device
        )
        print("✅ Multilingual transcription pipeline loaded")

        print("🚀 Creating Gradio interface...")
        # gr.Audio with type="filepath" hands the handler a path string.
        interface = gr.Interface(
            fn=handle_upload_and_transcribe,
            inputs=gr.Audio(type="filepath", label="Upload Audio or Video File"),
            outputs=gr.Textbox(label="Transcription"),
            title="Multilingual Audio/Video Transcription",
            description="Upload an audio (.mp3, .wav, .m4a, etc.) or video (.mp4, .avi, etc.) file to get its transcription.",
        )

        print("Starting Gradio interface...")
        interface.launch(debug=True)  # debug=True for more detailed error messages
    except Exception as e:
        print(f"❌ Error initializing the transcription pipeline or Gradio interface: {e}")
        print("Please check the model name and available resources.")
        # display() only exists inside IPython; exit with a failure status instead.
        raise SystemExit(1) from e


if __name__ == "__main__":
    main()