camparchimedes committed
Commit d2774a4 · verified · 1 Parent(s): fe8ea39

Update app.py

Files changed (1)
  1. app.py +40 -61
app.py CHANGED
@@ -9,7 +9,7 @@ from nltk.tokenize import sent_tokenize
 import gradio as gr
 import warnings
 import torch
-from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM, AutoProcessor, AutoModelForSpeechSeq2Seq
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoProcessor, AutoModelForSpeechSeq2Seq  # pipeline
 from pydub import AudioSegment
 import soundfile as sf
 import numpy as np
@@ -21,43 +21,16 @@ import spaces
 
 warnings.filterwarnings("ignore")
 
-HF_AUTH_TOKEN = os.getenv('HF_AUTH_TOKEN')
+# HF_AUTH_TOKEN = os.getenv('HF_AUTH_TOKEN')
 
-
-model = AutoModelForSpeechSeq2Seq.from_pretrained("NbAiLabBeta/nb-whisper-large-semantic")
 processor = AutoProcessor.from_pretrained("NbAiLabBeta/nb-whisper-large-semantic")
+model = AutoModelForSpeechSeq2Seq.from_pretrained("NbAiLabBeta/nb-whisper-large-semantic")
+model.to(device)
 
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 #torch_dtype = torch.float32
 model.to(device)
 
-#asr = pipeline("automatic-speech-recognition", model=model, processor=processor.tokenizer, device=device, torch_dtype=torch.float32)
-pipe = pipeline("automatic-speech-recognition", model="NbAiLabBeta/nb-whisper-large-semantic", device=device, torch_dtype=torch.float32)
-
-
-def transcribe_audio(audio_file):
-    with torch.no_grad():
-        output = pipe(audio_file, chunk_length_s=28, generate_kwargs={"num_beams": 8, "task": "transcribe", "language": "no"})
-    return output["text"]
-
-# Gradio UI
-iface = gr.Interface(
-    fn=transcribe_audio,
-    inputs=gr.Audio(type="filepath"),
-    outputs="text",
-    title="Audio Transcription App",
-    description="Upload an audio file to get the transcription",
-    theme="default",
-    live=False
-)
-
-# summarization model
-summarization_tokenizer = AutoTokenizer.from_pretrained("t5-base")
-summarization_model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
-
-# t5-base to device
-summarization_model.to(device)
-
 def convert_to_wav(audio_file):
     audio = AudioSegment.from_file(audio_file, format="m4a")
     wav_file = "temp.wav"
@@ -65,47 +38,37 @@ def convert_to_wav(audio_file):
     return wav_file
 
 
-# Configure_is__not good enough
-#if processor.tokenizer.pad_token_id is None:
-#processor.tokenizer.pad_token_id = processor.tokenizer.eos_token_id + 1
-
-# Sanity check
-#assert processor.tokenizer.pad_token_id != processor.tokenizer.eos_token_id, \
-#"pad_token_id and eos_token_id must be distinct..and they is not"
-
-
 @spaces.GPU(queue=True)
-# transcription
+
 def transcribe_audio(audio_file, batch_size=4):
     start_time = time.time()
+
     if audio_file.endswith(".m4a"):
         audio_file = convert_to_wav(audio_file)
-
+
     audio_input, sample_rate = sf.read(audio_file)
-    chunk_size = 16000 * 30
+    chunk_size = 16000 * 30
     chunks = [audio_input[i:i + chunk_size] for i in range(0, len(audio_input), chunk_size)]
 
     transcription = ""
+
    for i in range(0, len(chunks), batch_size):
        batch_chunks = chunks[i:i + batch_size]
        inputs = processor(batch_chunks, sampling_rate=16000, return_tensors="pt", padding=True)
        inputs = inputs.to(device)
-
+
        attention_mask = inputs.attention_mask.to(device) if 'attention_mask' in inputs else None
-
+
        with torch.no_grad():
            output = model.generate(
                inputs.input_features,
-                max_length=2048,
+                max_length=2048,
                num_beams=8,
-                task="transcribe",
                attention_mask=attention_mask,
-                language="no",
-                **encoded_input, pad_token_id=tokenizer.eos_token_id,
-                # pad_token_id=processor.tokenizer.pad_token_id,
-                # eos_token_id=processor.tokenizer.eos_token_id
+                pad_token_id=processor.tokenizer.pad_token_id,
+                eos_token_id=processor.tokenizer.eos_token_id
            )
-
+
        transcription += " ".join(processor.batch_decode(output, skip_special_tokens=True)) + " "
 
    end_time = time.time()
@@ -113,18 +76,23 @@ def transcribe_audio(audio_file, batch_size=4):
    word_count = len(transcription.split())
 
    result = f"Transcription: {transcription.strip()}\n\nTime taken: {transcription_time:.2f} seconds\nNumber of words: {word_count}"
-
+
    return transcription.strip(), result
 
 
+# summarization model
+summarization_tokenizer = AutoTokenizer.from_pretrained("t5-base")
+summarization_model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
 
+# t5-base to device
+summarization_model.to(device)
 
 # Graph-based summarization|TextRank
 def summarize_text(text):
    sentences = sent_tokenize(text)
    if len(sentences) == 0:
        return ""
-
+
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(sentences)
    similarity_matrix = cosine_similarity(tfidf_matrix)
@@ -142,7 +110,7 @@ def summarize_text(text):
 # HTML syntax for imagery
 image_html = """
 <div style="text-align: center;">
-    <img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/raw/main/picture2.png" alt="Banner" width="85%" height="auto">
+    <img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/raw/main/picture.png" alt="Banner" width="85%" height="auto">
 </div>
 """
 
@@ -150,29 +118,40 @@ def save_to_pdf(transcription, summary):
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)
-
+
    # include transcription
    pdf.multi_cell(0, 10, "Transcription:\n" + transcription)
-
+
    # paragraph space
    pdf.ln(10)
-
+
    # include summary
    pdf.multi_cell(0, 10, "Summary:\n" + summary)
-
+
    pdf_output_path = "transcription_summary.pdf"
    pdf.output(pdf_output_path)
    return pdf_output_path
 
+# Gradio UI
+iface = gr.Interface(
+    fn=transcribe_audio,
+    inputs=gr.Audio(type="filepath"),
+    outputs="text",
+    title="Audio Transcription App",
+    description="Upload an audio file to get the transcription",
+    theme="default",
+    live=False
+)
+
 # Gradio UI
 iface = gr.Blocks()
 
 with iface:
    gr.HTML(image_html)
    gr.Markdown("# Vi har nå muligheten til å oversette lydfiler til norsk skrift.")
-
+
    with gr.Tabs():
-
+
        # First Tab: Transcription
        with gr.TabItem("Transcription"):
            audio_input = gr.Audio(type="filepath")
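
For reference, a minimal local sketch (not part of the commit) of how the model loading and generation settings touched by this diff can be exercised end to end. The model ID, processor, and the num_beams / pad_token_id / eos_token_id arguments come from the diff; the sample.wav file name, the single-clip (non-batched) flow, and defining device before calling model.to(device) are assumptions made for the sketch.

# Hypothetical smoke test; assumes the Space's dependencies are installed
# and sample.wav is a short 16 kHz mono clip.
import torch
import soundfile as sf
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq

# Define the device before moving the model onto it.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
processor = AutoProcessor.from_pretrained("NbAiLabBeta/nb-whisper-large-semantic")
model = AutoModelForSpeechSeq2Seq.from_pretrained("NbAiLabBeta/nb-whisper-large-semantic")
model.to(device)

audio, sample_rate = sf.read("sample.wav")  # hypothetical input file
inputs = processor(audio, sampling_rate=16000, return_tensors="pt").to(device)

with torch.no_grad():
    output = model.generate(
        inputs.input_features,
        num_beams=8,
        pad_token_id=processor.tokenizer.pad_token_id,
        eos_token_id=processor.tokenizer.eos_token_id,
    )
print(processor.batch_decode(output, skip_special_tokens=True)[0])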