MuhammadFarhanAslam committed on
Commit
5632b5a
·
verified ·
1 Parent(s): 1310a47

Upload folder using huggingface_hub

Browse files
Files changed (2) hide show
  1. README.md +3 -9
  2. app.py +281 -0
README.md CHANGED
@@ -1,12 +1,6 @@
1
  ---
2
- title: AI-Powered Speech-to-Text Transcriber
3
- emoji: 🏆
4
- colorFrom: purple
5
- colorTo: gray
6
- sdk: gradio
7
- sdk_version: 5.33.0
8
  app_file: app.py
9
- pinned: false
 
10
  ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: AI-Powered_Speech-to-Text_Transcriber
 
 
 
 
 
3
  app_file: app.py
4
+ sdk: gradio
5
+ sdk_version: 5.31.0
6
  ---
 
 
app.py ADDED
@@ -0,0 +1,281 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # app.py
3
+
4
# NOTE: "!pip install ..." is IPython/Jupyter shell-magic syntax and is a
# SyntaxError in a plain Python script. The packages this app needs
# (gradio, transformers, soundfile) must be installed by the environment,
# e.g. listed in a requirements.txt next to app.py.

import os

import gradio as gr
import soundfile as sf
from transformers import pipeline

# Speech-recognition pipeline, created once at import time and reused by
# every transcription request.
asr = pipeline(
    task="automatic-speech-recognition",
    model="distil-whisper/distil-small.en",
)
15
+
16
def transcribe_speech(audio_filepath):
    """Transcribe the audio file at ``audio_filepath`` to text.

    Args:
        audio_filepath: Path to an audio file that ``soundfile`` can read,
            or ``None`` when no audio was provided.

    Returns:
        The ``'text'`` entry of the ASR pipeline result, or an empty string
        when ``audio_filepath`` is ``None``.
    """
    if audio_filepath is None:
        # Warn the user and stop here: continuing would pass None to
        # sf.read() and fail instead of showing a clean message.
        gr.Warning('No audio found. Please try again!')
        return ""

    # sf.read returns the sample array and its sampling rate in Hz.
    audio, sr = sf.read(audio_filepath)

    # Multi-channel files come back as a 2-D (frames, channels) array, while
    # the ASR pipeline expects a single-channel 1-D waveform; average the
    # channels down to mono.
    if audio.ndim > 1:
        audio = audio.mean(axis=1)

    # The pipeline needs the sampling rate alongside the raw samples so it
    # can interpret (and, if needed, resample) the waveform.
    result = asr({"array": audio, "sampling_rate": sr})

    return result['text']
56
+
57
+
58
# Interface that records from the microphone and shows the transcription.
mic_transcribe = gr.Interface(
    fn=transcribe_speech,
    inputs=gr.Audio(
        sources="microphone",
        type="filepath",  # transcribe_speech receives a file path
        label="🎤 Speak into your microphone",
    ),
    outputs=gr.Textbox(
        label="📝 Transcription Result",
        lines=4,  # room for longer transcriptions
        placeholder="Your transcribed text will appear here...",
    ),
    # `allow_flagging` is deprecated in Gradio 5; `flagging_mode` replaces it.
    flagging_mode="never",
    description="Record your voice directly using your device's microphone. Get an instant transcription.",
)
73
+
74
+
75
# Interface that accepts an uploaded audio file and shows the transcription.
file_transcribe = gr.Interface(
    fn=transcribe_speech,
    inputs=gr.Audio(
        sources="upload",  # input comes from a file upload
        type="filepath",  # transcribe_speech receives a file path
        label="📁 Upload an Audio File",
    ),
    outputs=gr.Textbox(
        label="📝 Transcription Result",
        lines=4,
        placeholder="Upload an audio file (e.g., .wav, .mp3) to get its transcription.",
    ),
    # `allow_flagging` is deprecated in Gradio 5; `flagging_mode` replaces it.
    flagging_mode="never",
    description="Upload an audio file for transcription.",
)
90
+
91
+
92
# Stylesheet passed to gr.Blocks(css=...).
# NOTE(review): selectors such as .gr-box, .gr-button, .gr-audio-player and
# .tab-nav look like legacy Gradio class names - confirm they still match the
# DOM produced by the pinned Gradio version.
custom_css = """
/* Arial is a system font and is not served by Google Fonts, so no web-font
   import is needed; the font-family rule below is sufficient. */

/* Apply Arial to ALL text elements by default within the Gradio container */
.gradio-container, body, button, input, select, textarea, div, p, span, h1, h2, h3, h4, h5, h6 {
    font-family: 'Arial', sans-serif !important;
}

/* Overall container styling */
.gradio-container {
    max-width: 900px; /* Limit overall width for better readability */
    margin: 30px auto; /* Center the app on the page */
    padding: 30px;
    border-radius: 15px; /* Rounded corners for a softer look */
    box-shadow: 0 8px 25px rgba(0, 0, 0, 0.1); /* Subtle shadow for depth */
    background-color: #ffffff; /* White background for the main content area */
}

/* Titles and Headers */
h1 {
    color: #34495e; /* Darker blue-grey for main title */
    text-align: center;
    font-size: 2.5em; /* Larger main title */
    margin-bottom: 10px;
    font-weight: 700; /* Bold */
}

h3 {
    color: #5d6d7e; /* Slightly lighter blue-grey for subtitle */
    text-align: center;
    font-size: 1.2em;
    margin-top: 0;
    margin-bottom: 25px;
}

p {
    text-align: center;
    color: #7f8c8d; /* Muted grey for descriptions */
    font-size: 0.95em;
    margin-bottom: 20px;
}

/* Tabbed Interface Styling */
.tabs {
    border-radius: 10px;
    overflow: hidden; /* Ensures rounded corners on tabs */
    margin-bottom: 20px;
}

.tab-nav button {
    background-color: #ecf0f1; /* Light grey for inactive tabs */
    color: #34495e; /* Dark text for inactive tabs */
    font-weight: bold;
    padding: 12px 20px;
    border-radius: 8px 8px 0 0;
    margin-right: 5px; /* Small space between tabs */
    transition: all 0.3s ease;
}

.tab-nav button.selected {
    background-color: #4a90e2; /* Vibrant blue for active tab */
    color: white; /* White text for active tab */
    box-shadow: 0 4px 10px rgba(74, 144, 226, 0.3); /* Subtle shadow for active tab */
}

/* Input and Output Component Styling (General) */
.gr-box {
    border-radius: 10px; /* Rounded corners for input/output boxes */
    border: 1px solid #dfe6e9; /* Light border */
    box-shadow: 0 2px 8px rgba(0, 0, 0, 0.05); /* Very subtle shadow */
    padding: 20px;
    background-color: #fcfcfc; /* Slightly off-white background */
}

/* Labels within components (e.g., "Upload Audio File", "Transcription Result") */
.label {
    font-weight: bold;
    color: #2c3e50; /* Dark text for labels */
    font-size: 1.1em;
    margin-bottom: 8px;
}

/* Buttons (Clear, Submit) */
.gr-button {
    background-color: #4a90e2 !important; /* Primary blue for actions */
    color: white !important;
    border: none !important;
    border-radius: 8px !important; /* Rounded buttons */
    padding: 12px 25px !important;
    font-weight: bold !important;
    transition: background-color 0.3s ease, box-shadow 0.3s ease !important;
    margin: 5px; /* Spacing between buttons */
}

.gr-button:hover {
    background-color: #3a7bd2 !important; /* Darker blue on hover */
    box-shadow: 0 4px 15px rgba(74, 144, 226, 0.4) !important;
}

/* Clear button specific */
.gr-button.secondary {
    background-color: #e0e6eb !important; /* Lighter grey for clear */
    color: #34495e !important;
}
.gr-button.secondary:hover {
    background-color: #d1d8df !important;
    box-shadow: none !important;
}

/* Textbox specific */
textarea {
    border-radius: 8px !important;
    border: 1px solid #bdc3c7 !important;
    padding: 10px !important;
    resize: vertical; /* Allow vertical resizing */
}

/* Audio component player */
.gr-audio-player {
    border-radius: 8px;
    background-color: #f0f0f0;
    padding: 10px;
}

/* Footer styling */
hr {
    border: none;
    border-top: 1px solid #e0e0e0;
    margin-top: 30px;
    margin-bottom: 15px;
}

.footer-text {
    font-size: 0.85em;
    color: #a0a0a0;
    text-align: center;
}
"""
233
+
234
# --- 6. Main Gradio App using Blocks for layout and styling ---
# Top-level Blocks container carrying the base theme and the custom stylesheet.
demo = gr.Blocks(
    theme=gr.themes.Soft(),
    css=custom_css,
)

# Lay out the page: header, the two transcription tabs, then a footer.
with demo:
    # Title and description; fonts come from the global CSS, colours are inline.
    gr.Markdown(
        """
        <center>
        <h1 style="color: #4A90E2;">
        🎙️ AI-Powered Speech-to-Text Transcriber 📝
        </h1>
        <h3 style="color: #6C7A89;">
        Developed by Muhammad Farhan Aslam.
        </h3>
        <h3 style="color: #6C7A89;">
        Convert spoken words into accurate text with ease and precision.
        </h3>
        <p style="color: #8C9CA7; font-size: 1.05em;">
        Effortlessly transcribe audio from your microphone or by uploading a file.
        This application leverages advanced AI to provide clear and reliable transcriptions.
        </p>
        </center>
        """
    )

    # One tab per input mode; the file-upload tab is listed first.
    gr.TabbedInterface(
        [file_transcribe, mic_transcribe],
        ["📁 Transcribe Audio File", "🎤 Transcribe from Microphone"],
    )

    # Footer with credits.
    gr.Markdown(
        """
        <hr>
        <p class="footer-text">
        Built with ❤️ and Gradio on Hugging Face Transformers.
        </p>
        """
    )

# Start the server when the file is run as a script. Without a launch call
# the script builds the UI and exits without serving it.
if __name__ == "__main__":
    demo.launch()