Update app.py

app.py CHANGED
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #---------------------------------------------------------------------------------------------------------------------------------------------
+import spaces
 import gradio as gr
 from PIL import Image
 from pydub import AudioSegment
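Note on the `+import spaces` line: this commit converts the Space to ZeroGPU, where `spaces` is imported before any CUDA work happens and GPU-bound functions are wrapped in `@spaces.GPU()` so a GPU is attached only for the duration of the call. A minimal sketch of that pattern (the function body is illustrative, not taken from this diff):

```python
import spaces  # must be imported before CUDA is initialized on ZeroGPU Spaces
import torch
from transformers import pipeline

pipe = pipeline("automatic-speech-recognition", model="NbAiLab/nb-whisper-large")

@spaces.GPU()  # a GPU is attached only while this function runs
def transcribe(audio_path):
    return pipe(audio_path)["text"]
```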
@@ -40,10 +41,9 @@ import spacy
 import networkx as nx
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
-#---------------------------------------------------------------------------------------------------------------------------------------------
 warnings.filterwarnings("ignore")
 
-
+# ------------header section------------
 HEADER_INFO = """
 # WEB APP ✨| Norwegian WHISPER Model
 Switch Work [Transkribering av lydfiler til norsk skrift]
@@ -54,14 +54,12 @@ SIDEBAR_INFO = f"""
 <img src="{LOGO}" style="width: 100%; height: auto;"/>
 </div>
 """
-# Check if CUDA is available
-if not torch.cuda.is_available():
-    raise RuntimeError("CUDA not available. Go look for a GPU.")
 
-#
+# ------------transcribe section------------
 dtype = torch.bfloat16
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
+@spaces.GPU()
 def convert_to_wav(filepath):
     _, file_ending = os.path.splitext(f'{filepath}')
     audio_file = filepath.replace(file_ending, ".wav")
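(`torch.bfloat32` in the original commit is not a valid torch dtype and would raise at import; `torch.bfloat16` is the closest valid name and is used above.) The hunk also cuts off the body of `convert_to_wav`. A plausible completion with pydub, assuming Whisper's preferred 16 kHz mono output; the resampling is an assumption, since the diff only shows the path handling:

```python
def convert_to_wav(filepath):
    _, file_ending = os.path.splitext(f'{filepath}')
    audio_file = filepath.replace(file_ending, ".wav")
    # Decode whatever container/codec pydub's ffmpeg backend supports,
    # then resample to 16 kHz mono, which is what Whisper models expect.
    sound = AudioSegment.from_file(filepath)
    sound = sound.set_frame_rate(16000).set_channels(1)
    sound.export(audio_file, format="wav")
    return audio_file
```

Since pydub decoding is CPU-only, the `@spaces.GPU()` decorator added to this function looks unnecessary; decorating only the transcription function would reserve GPU time where it is actually used.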
@@ -70,6 +68,7 @@ def convert_to_wav(filepath):
 
 pipe = pipeline("automatic-speech-recognition", model="NbAiLab/nb-whisper-large", chunk_length_s=30, torch_dtype=dtype, device=device)
 
+@spaces.GPU()
 def transcribe_audio(audio_file, batch_size=16):
     start_time = time.time()
 
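(The original commit called `.to(device)` on the pipeline object; `transformers.Pipeline` has no `.to()` method, which matches the "Build error" status, so the `device=` keyword is used above instead.) The body of `transcribe_audio` is elided past `start_time`. A sketch of how the chunked pipeline is typically invoked; `return_timestamps` and the `"text"` output key are standard ASR-pipeline behavior, but the exact call and the `system_info` string here are assumptions:

```python
@spaces.GPU()
def transcribe_audio(audio_file, batch_size=16):
    start_time = time.time()
    # chunk_length_s=30 was set when the pipeline was built; batch_size
    # controls how many 30 s chunks are decoded in parallel on the GPU.
    outputs = pipe(audio_file, batch_size=batch_size, return_timestamps=True)
    text = outputs["text"]
    system_info = f"Transcribed in {time.time() - start_time:.1f}s on {device}"
    return text.strip(), system_info
```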
@@ -100,9 +99,9 @@ def transcribe_audio(audio_file, batch_size=16):
     """
 
     return text.strip(), system_info
-#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
 
-#
+# ------------summary section------------
+@spaces.GPU()
 def clean_text(text):
     text = re.sub(r'https?:\/\/.*[\r\n]*', '', text)
     text = re.sub(r'[^\w\s]', '', text)
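For reference, the two substitutions first strip URLs (plus any trailing newline), then drop every character that is not a word character or whitespace:

```python
clean_text("Se https://nb.no/whisper\nHei, verden!")
# -> 'Se Hei verden'
```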
@@ -113,6 +112,7 @@ nlp = spacy.blank("nb")  # 'nb' ==> codename = Norwegian Bokmål
 nlp.add_pipe('sentencizer')
 spacy_stop_words = spacy.lang.nb.stop_words.STOP_WORDS
 
+@spaces.GPU()
 def preprocess_text(text):
     # Process the text with SpaCy
     doc = nlp(text)
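`spacy.blank("nb")` builds a tokenizer-only Norwegian Bokmål pipeline, so the `sentencizer` component is what makes `doc.sents` available. A quick check of the setup:

```python
import spacy

nlp = spacy.blank("nb")
nlp.add_pipe('sentencizer')  # rule-based sentence boundaries on punctuation
doc = nlp("Dette er en setning. Dette er en til.")
print([sent.text for sent in doc.sents])
# ['Dette er en setning.', 'Dette er en til.']
```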
@@ -123,6 +123,7 @@ def preprocess_text(text):
     return ' '.join(words)
 
 # Summarize w/T5 model
+@spaces.GPU()
 def summarize_text(text):
     preprocessed_text = preprocess_text(text)
     inputs = summarization_tokenizer(preprocessed_text, max_length=1024, return_tensors="pt", truncation=True)
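`summarization_tokenizer` and `summarization_model` are used here but never defined in the visible hunks. A hedged sketch of the seq2seq setup the `# Summarize w/T5 model` comment implies; the checkpoint name is a placeholder, not taken from this diff:

```python
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Placeholder checkpoint: the actual model used by this Space is not shown in the diff.
SUMMARIZATION_CHECKPOINT = "t5-base"
summarization_tokenizer = AutoTokenizer.from_pretrained(SUMMARIZATION_CHECKPOINT)
summarization_model = AutoModelForSeq2SeqLM.from_pretrained(SUMMARIZATION_CHECKPOINT)
```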
@@ -130,7 +131,6 @@ def summarize_text(text):
     summary_ids = summarization_model.generate(inputs.input_ids, num_beams=5, max_length=150, early_stopping=True)
     return summarization_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
 
-# Builds similarity matrix
 def build_similarity_matrix(sentences, stop_words):
     similarity_matrix = nx.Graph()
     for i, tokens_a in enumerate(sentences):
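The loop body of `build_similarity_matrix` is cut off. Given that the PageRank tab labels itself "token-based similarity", a plausible completion weights each edge by the token overlap between sentence pairs, with stop words excluded; this is a reconstruction, not the committed code:

```python
def build_similarity_matrix(sentences, stop_words):
    similarity_matrix = nx.Graph()
    for i, tokens_a in enumerate(sentences):
        for j, tokens_b in enumerate(sentences):
            if i == j:
                continue
            # Edge weight = number of shared non-stop-word tokens.
            words_a = {t.text.lower() for t in tokens_a if t.text.lower() not in stop_words}
            words_b = {t.text.lower() for t in tokens_b if t.text.lower() not in stop_words}
            similarity_matrix.add_edge(i, j, weight=len(words_a & words_b))
    return similarity_matrix
```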
@@ -142,11 +142,10 @@ def build_similarity_matrix(sentences, stop_words):
 
 # PageRank
 def graph_based_summary(text, num_paragraphs=3):
-
     doc = nlp(text)
     sentences = [sent.text for sent in doc.sents]
     if len(sentences) < num_paragraphs:
-        return ' '.join(sentences)
+        return ' '.join(sentences)
 
     sentence_tokens = [nlp(sent) for sent in sentences]
     stop_words = spacy_stop_words
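The rest of `graph_based_summary` is not shown; presumably it scores sentences with `nx.pagerank` on the token-overlap graph and keeps the top `num_paragraphs`. A sketch of the full function under that assumption:

```python
def graph_based_summary(text, num_paragraphs=3):
    doc = nlp(text)
    sentences = [sent.text for sent in doc.sents]
    if len(sentences) < num_paragraphs:
        return ' '.join(sentences)
    sentence_tokens = [nlp(sent) for sent in sentences]
    stop_words = spacy_stop_words
    # Score sentences by PageRank centrality on the token-overlap graph,
    # then emit the top scorers in their original document order.
    similarity_matrix = build_similarity_matrix(sentence_tokens, stop_words)
    scores = nx.pagerank(similarity_matrix, weight='weight')
    top = sorted(scores, key=scores.get, reverse=True)[:num_paragraphs]
    return ' '.join(sentences[i] for i in sorted(top))
```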
@@ -183,7 +182,7 @@ def text_rank_summary(text, num_paragraphs=3):
     doc = nlp(text)
     sentences = [sent.text for sent in doc.sents]
     if len(sentences) < num_paragraphs:
-        return ' '.join(sentences)
+        return ' '.join(sentences)
 
     stop_words = spacy_stop_words
     vectorizer = TfidfVectorizer(stop_words=list(stop_words))
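`text_rank_summary` also continues past the hunk; the imports at the top (TfidfVectorizer, cosine_similarity, networkx) suggest the classic TextRank recipe. A sketch of the likely remainder, assuming `nx.from_numpy_array` (NetworkX ≥ 2.6):

```python
def text_rank_summary(text, num_paragraphs=3):
    doc = nlp(text)
    sentences = [sent.text for sent in doc.sents]
    if len(sentences) < num_paragraphs:
        return ' '.join(sentences)
    stop_words = spacy_stop_words
    vectorizer = TfidfVectorizer(stop_words=list(stop_words))
    tfidf = vectorizer.fit_transform(sentences)
    sim_matrix = cosine_similarity(tfidf)    # pairwise sentence similarity
    graph = nx.from_numpy_array(sim_matrix)  # weighted graph over sentences
    scores = nx.pagerank(graph)
    top = sorted(scores, key=scores.get, reverse=True)[:num_paragraphs]
    return ' '.join(sentences[i] for i in sorted(top))
```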
@@ -227,7 +226,6 @@ with iface:
     text_input_graph = gr.Textbox(label="Input Text", placeholder="txt2summarize")
     summary_output_graph = gr.Textbox(label="PageRank | token-based similarity")
 
-    # Displaying AiLab and Bjørn explanations
     gr.Markdown("""
     **token-based**: similarity-matrix edge weights represent token overlap;
     sentences are ranked by their centrality in the graph (good with dense inter-sentence relationships)
@@ -245,7 +243,6 @@ with iface:
     text_input_lex = gr.Textbox(label="Input Text", placeholder="txt2summarize")
     summary_output_lex = gr.Textbox(label="LexRank | cosine similarity")
 
-    # Displaying AiLab and Bjørn explanations
     gr.Markdown("""
     **semantic**: TF-IDF vectorization with a cosine-similarity matrix, ranked by eigenvector centrality
     (good for sparse graph structures with thresholding)
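For the LexRank tab, the blurb above describes thresholded cosine similarity ranked by eigenvector centrality. A minimal sketch of that idea; the 0.1 threshold is an illustrative assumption:

```python
import numpy as np

def lexrank_scores(sim_matrix, threshold=0.1):
    # Drop weak edges so the graph stays sparse, then rank sentences by
    # eigenvector centrality on what remains.
    adjacency = np.where(sim_matrix > threshold, sim_matrix, 0.0)
    graph = nx.from_numpy_array(adjacency)
    return nx.eigenvector_centrality_numpy(graph, weight='weight')
```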
@@ -263,7 +260,6 @@ with iface:
     text_input_text_rank = gr.Textbox(label="Input Text", placeholder="txt2summarize")
     summary_output_text_rank = gr.Textbox(label="TextRank | lexical similarity")
 
-    # Displaying AiLab and Bjørn explanations
     gr.Markdown("""
     **sentence**: graph with weighted edges based on lexical similarity (i.e. word overlap / sentence similarity)
     """)
@@ -288,4 +284,4 @@ with iface:
     pdf_summary_only.click(fn=lambda summary: save_to_pdf("", summary), inputs=[summary_output_graph, summary_output_lex, summary_output_text_rank], outputs=[pdf_output])  # Includes all summary outputs
     pdf_both.click(fn=lambda text, summary: save_to_pdf(text, summary), inputs=[text_output, summary_output_graph], outputs=[pdf_output])  # Defaulting to Graph-based summary
 
-
+iface.launch(share=True, debug=True)
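`save_to_pdf` is wired to the buttons but never defined in the visible hunks. A hypothetical implementation with fpdf2; the section headings and output path are illustrative, not from the commit:

```python
from fpdf import FPDF

def save_to_pdf(text, summary):
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Helvetica", size=11)
    if text:
        pdf.multi_cell(0, 8, "Transcription:\n" + text)
        pdf.ln(4)
    if summary:
        pdf.multi_cell(0, 8, "Summary:\n" + summary)
    output_path = "transcription.pdf"  # returned so Gradio can offer it for download
    pdf.output(output_path)
    return output_path
```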