camparchimedes committed
Commit b98f4ad · verified · 1 Parent(s): 820ab43

Update app.py

Files changed (1): app.py (+30, -86)
app.py CHANGED
@@ -5,89 +5,57 @@ import nltk
 nltk.download('punkt')
 from nltk.tokenize import sent_tokenize
 
-
 import gradio as gr
 import warnings
 import torch
-from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoProcessor, AutoModelForSpeechSeq2Seq # pipeline
+from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
 from pydub import AudioSegment
-import soundfile as sf
-import numpy as np
 from fpdf import FPDF
 from PIL import Image
 import time
 import os
-# import spaces
 
 warnings.filterwarnings("ignore")
 
-# HF_AUTH_TOKEN = os.getenv('HF_AUTH_TOKEN')
-
-processor = AutoProcessor.from_pretrained("NbAiLabBeta/nb-whisper-large-semantic")
-model = AutoModelForSpeechSeq2Seq.from_pretrained("NbAiLabBeta/nb-whisper-large-semantic")
-
-
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 torch_dtype = torch.float32
-model.to(device)
 
+# Initialize the ASR pipeline
+pipe = pipeline("automatic-speech-recognition", model="NbAiLabBeta/nb-whisper-large-semantic", device=device, torch_dtype=torch.float32)
+
+# Function to convert m4a files to wav
 def convert_to_wav(audio_file):
     audio = AudioSegment.from_file(audio_file, format="m4a")
     wav_file = "temp.wav"
     audio.export(wav_file, format="wav")
     return wav_file
 
-
-# @spaces.GPU(queue=True)
-
-def transcribe_audio(audio_file, batch_size=4):
-    start_time = time.time()
-
+# Transcription function using the ASR pipeline
+def transcribe_audio(audio_file):
     if audio_file.endswith(".m4a"):
         audio_file = convert_to_wav(audio_file)
 
-    audio_input, sample_rate = sf.read(audio_file)
-    chunk_size = 16000 * 30
-    chunks = [audio_input[i:i + chunk_size] for i in range(0, len(audio_input), chunk_size)]
-
-    transcription = ""
-
-    for i in range(0, len(chunks), batch_size):
-        batch_chunks = chunks[i:i + batch_size]
-        inputs = processor(batch_chunks, sampling_rate=16000, return_tensors="pt", padding=True)
-        inputs = inputs.to(device)
-
-        attention_mask = inputs.attention_mask.to(device) if 'attention_mask' in inputs else None
-
-        with torch.no_grad():
-            output = model.generate(
-                inputs.input_features,
-                max_length=2048,
-                num_beams=8,
-                attention_mask=attention_mask,
-                pad_token_id=processor.tokenizer.pad_token_id,
-                eos_token_id=processor.tokenizer.eos_token_id
-            )
-
-        transcription += " ".join(processor.batch_decode(output, skip_special_tokens=True)) + " "
+    start_time = time.time()
+
+    with torch.no_grad():
+        output = pipe(audio_file, chunk_length_s=30, generate_kwargs={"num_beams": 8, "task": "transcribe", "language": "no"})
 
+    transcription = output["text"]
     end_time = time.time()
-    transcription_time = end_time - start_time
+
+    output_time = end_time - start_time
     word_count = len(transcription.split())
 
-    result = f"Transcription: {transcription.strip()}\n\nTime taken: {transcription_time:.2f} seconds\nNumber of words: {word_count}"
+    result = f"Transcription: {transcription.strip()}\n\nTime taken: {output_time:.2f} seconds\nNumber of words: {word_count}"
 
     return transcription.strip(), result
 
-
-# summarization model
+# Summarization model setup
 summarization_tokenizer = AutoTokenizer.from_pretrained("t5-base")
 summarization_model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
-
-# t5-base to device
 summarization_model.to(device)
 
-# Graph-based summarization|TextRank
+# Graph-based summarization (TextRank)
 def summarize_text(text):
     sentences = sent_tokenize(text)
     if len(sentences) == 0:
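
For reference, the removed soundfile/generate loop above is superseded by the ASR pipeline, which performs the 30-second chunking internally. A minimal sketch of exercising the new path outside Gradio, using the same model ID and generation settings as the diff (the file name sample.wav is a placeholder):

    from transformers import pipeline

    # Same settings as the new app.py: chunk_length_s=30 lets the pipeline split
    # long audio into 30 s windows, replacing the manual chunk-and-batch loop.
    asr = pipeline("automatic-speech-recognition", model="NbAiLabBeta/nb-whisper-large-semantic")
    out = asr("sample.wav", chunk_length_s=30,
              generate_kwargs={"num_beams": 8, "task": "transcribe", "language": "no"})
    print(out["text"])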
@@ -102,48 +70,29 @@ def summarize_text(text):
 
     ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
 
-    # Select top N sentences (e.g., 3 sentences for the summary)
     top_n = 3
     summary = " ".join([s for _, s in ranked_sentences[:top_n]])
     return summary
 
-# HTML syntax for imagery
-image_html = """
-<div style="text-align: center;">
-    <img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/raw/main/picture.png" alt="Banner" width="85%" height="auto">
-</div>
-"""
-
+# Save transcription and summary to PDF
 def save_to_pdf(transcription, summary):
     pdf = FPDF()
     pdf.add_page()
     pdf.set_font("Arial", size=12)
 
-    # include transcription
-    pdf.multi_cell(0, 10, "Transcription:\n" + transcription)
+    if transcription:
+        pdf.multi_cell(0, 10, "Transcription:\n" + transcription)
 
-    # paragraph space
     pdf.ln(10)
 
-    # include summary
-    pdf.multi_cell(0, 10, "Summary:\n" + summary)
+    if summary:
+        pdf.multi_cell(0, 10, "Summary:\n" + summary)
 
     pdf_output_path = "transcription_summary.pdf"
     pdf.output(pdf_output_path)
     return pdf_output_path
 
-# Gradio UI
-iface = gr.Interface(
-    fn=transcribe_audio,
-    inputs=gr.Audio(type="filepath"),
-    outputs="text",
-    title="Audio Transcription App",
-    description="Upload an audio file to get the transcription",
-    theme="default",
-    live=False
-)
-
-# Gradio UI
+# Gradio Interface setup
 iface = gr.Blocks()
 
 with iface:
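
For reference, the new if-guards in save_to_pdf make each PDF section optional, so one helper can serve the transcription-only, summary-only, and combined download buttons. A minimal usage sketch, assuming the function from app.py is in scope (the strings are placeholders):

    # Summary-only PDF: the empty transcription is skipped by the new guard.
    pdf_path = save_to_pdf(transcription="", summary="Kort sammendrag av lydfilen.")
    print(pdf_path)  # transcription_summary.pdf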
@@ -151,26 +100,24 @@ with iface:
     gr.Markdown("# Vi har nå muligheten til å oversette lydfiler til norsk skrift.")
 
     with gr.Tabs():
-
-        # First Tab: Transcription
+        # Transcription Tab
        with gr.TabItem("Transcription"):
             audio_input = gr.Audio(type="filepath")
-            batch_size_input = gr.Slider(minimum=7, maximum=16, step=1, label="Batch Size")
             transcription_output = gr.Textbox(label="Transcription | nb-whisper-large-semantic")
             result_output = gr.Textbox(label="Time taken and Number of words")
             transcribe_button = gr.Button("Transcribe")
 
-            def transcribe(audio_file, batch_size):
-                transcription, result = transcribe_audio(audio_file, batch_size)
+            def transcribe(audio_file):
+                transcription, result = transcribe_audio(audio_file)
                 return transcription, result
 
             transcribe_button.click(
                 fn=transcribe,
-                inputs=[audio_input, batch_size_input],
+                inputs=[audio_input],
                 outputs=[transcription_output, result_output]
             )
 
-        # Second Tab: Summary
+        # Summary Tab
         with gr.TabItem("Summary"):
             summary_output = gr.Textbox(label="Summary | TextRank, graph-based")
             summarize_button = gr.Button("Summarize")
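
For reference, the Summary tab runs summarize_text on the transcription produced in the first tab (wired up in the next hunk), so the TextRank ranking can be sanity-checked on its own, assuming app.py's definitions are loaded (the sentences are placeholders):

    # TextRank keeps the top_n = 3 highest-ranked sentences, so a four-sentence
    # input should come back as its three best-scoring sentences.
    print(summarize_text("Første setning her. Andre setning her. Tredje setning her. Fjerde setning her."))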
@@ -183,11 +130,11 @@ with iface:
 
             summarize_button.click(
                 fn=summarize,
-                inputs=[transcription_output],  # Use the transcription from the first tab
+                inputs=[transcription_output],
                 outputs=summary_output
             )
 
-        # Third Tab: PDF Download Options
+        # PDF Download Tab
         with gr.TabItem("Download PDF"):
             pdf_transcription_only = gr.Button("Download PDF with Transcription Only")
             pdf_summary_only = gr.Button("Download PDF with Summary Only")
@@ -224,8 +171,5 @@ with iface:
                 outputs=[pdf_output_both]
             )
 
-
-# run
+# Run the Gradio interface
 iface.launch(share=True, debug=True)
-
-
 