Update app.py
app.py CHANGED
@@ -1,4 +1,4 @@
-import spaces
+import spaces
 import os
 import gradio as gr
 import trafilatura
@@ -25,12 +25,11 @@ import tempfile
 nltk.download("punkt")
 nltk.download("punkt_tab")
 
-
-stanza.
-nlp = stanza.Pipeline("en", processors="tokenize,ner", use_gpu=False)  # Disable GPU for Hugging Face Spaces
+stanza.download("en")
+nlp = stanza.Pipeline("en", processors="tokenize,ner", use_gpu=False)
 
-
-kokoro_tts = KPipeline(lang_code='a', device="cpu")
+
+kokoro_tts = KPipeline(lang_code='a', device="cpu")
 
 # Supported TTS Languages
 SUPPORTED_TTS_LANGUAGES = {
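Note on the hunk above: the stray, incomplete `stanza.` line is what this commit repairs; `stanza.download("en")` must run before `stanza.Pipeline` so the English models exist on disk. A minimal sketch of that pattern, using only the stanza package:

    import stanza

    stanza.download("en")  # fetch the English models once; cached afterwards
    nlp = stanza.Pipeline("en", processors="tokenize,ner", use_gpu=False)  # CPU-only, as in this Space
    doc = nlp("Barack Obama was born in Hawaii.")
    print([(ent.text, ent.type) for ent in doc.ents])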
@@ -55,12 +54,12 @@ model = BartForConditionalGeneration.from_pretrained(model_name)
 # Initialize GLINER model
 gliner_model = GLiNER.from_pretrained("urchade/gliner_base")
 
-
+
 def fetch_and_display_content(url):
     """Fetch and extract text from a given URL (HTML or PDF)."""
     if url.endswith(".pdf") or "pdf" in url:
         converter = MarkItDown()
-
+
         text = converter.convert(url).text_content
     else:
         downloaded = trafilatura.fetch_url(url)
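For context on the branch above: MarkItDown handles PDF URLs (`.convert(url).text_content` is the exact call in the diff), while HTML goes through trafilatura. A hedged sketch of the HTML path, since the hunk cuts off after fetch_url; the trafilatura.extract call is an assumption based on the library's usual fetch/extract pairing:

    import trafilatura

    downloaded = trafilatura.fetch_url("https://example.com/article")  # raw page, or None on failure
    text = trafilatura.extract(downloaded) if downloaded else None     # main-content extraction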
@@ -72,7 +71,7 @@ def fetch_and_display_content(url):
     metadata["Detected Language"] = detected_lang.upper()
     return cleaned_text, metadata, detected_lang, gr.update(visible=True), gr.update(visible=True), gr.update(visible=True), gr.update(visible=True), gr.update(visible=True)
 
-
+
 def extract_and_clean_text(data):
 
     metadata_dict = {}
@@ -84,71 +83,61 @@ def extract_and_clean_text(data):
         metadata_raw = metadata_pattern.group(1).strip()
         data = data[metadata_pattern.end():].strip()  # Remove metadata from text
 
-
+
         metadata_lines = metadata_raw.split("\n")
         for line in metadata_lines:
-            if ": " in line:
+            if ": " in line:
                 key, value = line.split(": ", 1)  # Split at first ": "
 
-
+
                 if value.startswith("[") and value.endswith("]"):
                     try:
-                        value = json.loads(value)
+                        value = json.loads(value)
                     except json.JSONDecodeError:
-                        pass
+                        pass
 
-                metadata_dict[key.strip()] = value.strip()
+                metadata_dict[key.strip()] = value.strip()
 
     #Step 2: Remove everything before the "Abstract" section
     def remove_text_before_abstract(text):
         """Removes all text before the first occurrence of 'Abstract'."""
-        abstract_pattern = re.compile(r"(?i)\babstract\b")
+        abstract_pattern = re.compile(r"(?i)\babstract\b")
         match = abstract_pattern.search(text)
 
         if match:
-            return text[match.start():]
-        return text
+            return text[match.start():]
+        return text
 
     data = remove_text_before_abstract(data)
 
     # Step 3: Clean the extracted text
     def clean_text(text):
-
+
         text = re.sub(r'\[\d+\]', '', text)
-
-        # Remove URLs (both direct links and markdown-style links)
-        text = re.sub(r'http[s]?://\S+', '', text) # Direct links
-        text = re.sub(r'\[.*?\]\(http[s]?://\S+\)', '', text) # Markdown links
-
-        # Remove markdown-style headings and special characters (#, ##, *, etc.)
-        #text = re.sub(r'^\s*#+\s*', '', text, flags=re.MULTILINE) # Remove headings
-        #text = re.sub(r'[*_`]', '', text) # Remove bold/italic/monospace markers
 
-
+        text = re.sub(r'http[s]?://\S+', '', text)
+        text = re.sub(r'\[.*?\]\(http[s]?://\S+\)', '', text)
+
+
         patterns = [r'References\b.*', r'Bibliography\b.*', r'External Links\b.*', r'COMMENTS\b.*']
         for pattern in patterns:
             text = re.sub(pattern, '', text, flags=re.IGNORECASE | re.DOTALL)
 
-        # Remove extra whitespace and newlines
         text = re.sub(r'\n\s*\n+', '\n\n', text).strip()
 
         return text
-
-    #cleaned_text = clean_text(data)
-
-    #return metadata_dict, cleaned_text
     return metadata_dict, clean_text(data)
 
 ### 3️⃣ Language Detection
 def detect_language(text):
-
+
     try:
         lang = detect(text)
         return lang if lang in SUPPORTED_TTS_LANGUAGES else "en"  # Default to English if not supported
     except:
-        return "en"
+        return "en"
 
-
+#Not using this one below. Using Gliner
 def extract_entities_with_stanza(text, chunk_size=1000):
     """Splits text into chunks, runs Stanza NER, and combines results."""
     sentences = sent_tokenize(text)
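A quick worked example of the cleanup that survives this hunk (citation markers, bare URLs, and a trailing References section stripped; the markdown-heading rules are the ones the commit deletes):

    import re

    sample = "Results [12] are at https://example.org\n\n\nReferences\n[1] ..."
    sample = re.sub(r'\[\d+\]', '', sample)               # drop [n] citation markers
    sample = re.sub(r'http[s]?://\S+', '', sample)        # drop bare URLs
    sample = re.sub(r'References\b.*', '', sample, flags=re.IGNORECASE | re.DOTALL)
    sample = re.sub(r'\n\s*\n+', '\n\n', sample).strip()  # collapse blank runs
    print(sample)  # -> "Results  are at"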
@@ -156,7 +145,7 @@ def extract_entities_with_stanza(text, chunk_size=1000):
     current_chunk = []
     current_length = 0
 
-
+
     for sentence in sentences:
         if current_length + len(sentence) > chunk_size:
             chunks.append(" ".join(current_chunk))
@@ -169,7 +158,7 @@ def extract_entities_with_stanza(text, chunk_size=1000):
     if current_chunk:
         chunks.append(" ".join(current_chunk))
 
-
+
     entities = []
     for chunk in chunks:
         doc = nlp(chunk)
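This hunk and the previous one only touch blank lines inside extract_entities_with_stanza. For readability, here is a sketch of the greedy sentence-packing loop they sit in, reconstructed from the visible context (names from the diff; the counter-reset details are assumptions):

    from nltk.tokenize import sent_tokenize  # requires nltk.download("punkt"), done at the top of app.py

    def chunk_sentences(text, chunk_size=1000):
        chunks, current_chunk, current_length = [], [], 0
        for sentence in sent_tokenize(text):
            if current_length + len(sentence) > chunk_size:  # flush before overflowing
                chunks.append(" ".join(current_chunk))
                current_chunk, current_length = [], 0
            current_chunk.append(sentence)
            current_length += len(sentence)
        if current_chunk:  # flush the tail
            chunks.append(" ".join(current_chunk))
        return chunks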
@@ -181,25 +170,24 @@
     return entities
 
 def generate_wordcloud(text):
-
+
     if not text:
         return None
 
-
+
     wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
 
-
+
     plt.figure(figsize=(10, 5))
     plt.imshow(wordcloud, interpolation='bilinear')
     plt.axis('off')
 
-    # Save the plot to a BytesIO object
     buf = io.BytesIO()
     plt.savefig(buf, format='png', bbox_inches='tight', pad_inches=0)
     buf.seek(0)
     plt.close()
 
-
+
     image = Image.open(buf)
     return image
 
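The wordcloud hunk keeps the standard matplotlib-to-PIL round trip: render the figure into an in-memory PNG, then reopen it as an image object Gradio can display. Just that round trip, in isolation:

    import io
    import matplotlib.pyplot as plt
    from PIL import Image

    plt.plot([0, 1], [0, 1])
    buf = io.BytesIO()
    plt.savefig(buf, format='png', bbox_inches='tight', pad_inches=0)  # PNG bytes into the buffer
    buf.seek(0)   # rewind so Image.open reads from the start
    plt.close()   # free the figure
    image = Image.open(buf)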
@@ -207,7 +195,7 @@ def generate_wordcloud(text):
 @spaces.GPU(duration=1000)
 def generate_audio_kokoro(text, lang, selected_voice):
     """Generate speech using KokoroTTS for supported languages."""
-    global kokoro_tts
+    global kokoro_tts
     if os.path.exists(f"audio_{lang}.wav"):
         os.remove(f"audio_{lang}.wav")
 
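The decorator kept in this hunk is the ZeroGPU hook: @spaces.GPU requests a GPU only while the decorated function runs, and duration (seconds, as I read the spaces package) widens that window for long TTS jobs. Shape of the usage:

    import spaces

    @spaces.GPU(duration=1000)  # hold the GPU slot for up to 1000 s
    def generate_audio(text):
        return text  # placeholder body; the real function runs Kokoro TTS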
@@ -215,7 +203,7 @@ def generate_audio_kokoro(text, lang, selected_voice):
     #generator = kokoro_tts(text, voice="bm_george", speed=1, split_pattern=r'\n+')
     generator = kokoro_tts(text, voice=selected_voice, speed=1, split_pattern=r'\n+')
 
-
+
     audio_data_list = [audio for _, _, audio in generator]
     full_audio = np.concatenate(audio_data_list)
 
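Kokoro's KPipeline call returns a generator of (graphemes, phonemes, audio) triples, which is why the comprehension unpacks three values and keeps only the audio before concatenating. A hedged end-to-end sketch; the soundfile write, the filename, and the 24 kHz rate are assumptions not shown in this diff:

    import numpy as np
    import soundfile as sf
    from kokoro import KPipeline

    kokoro_tts = KPipeline(lang_code='a', device="cpu")
    generator = kokoro_tts("Hello.\nSecond paragraph.", voice="bm_george", speed=1, split_pattern=r'\n+')
    audio_data_list = [audio for _, _, audio in generator]  # waveform from each triple
    full_audio = np.concatenate(audio_data_list)
    sf.write("audio_en.wav", full_audio, 24000)  # app.py deletes audio_{lang}.wav above, so presumably rewrites it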
@@ -250,13 +238,12 @@ def split_text_with_optimized_overlap(text, max_tokens=1024, overlap_tokens=25):
         chunks.append(previous_chunk_text + " " + " ".join(current_chunk))
     return chunks
 def summarize_text(text, max_input_tokens=1024, max_output_tokens=200):
-
-    #inputs = tokenizer.encode("summarize: " + text, return_tensors="pt", max_length=max_input_tokens, truncation=True).to(device)
+
     inputs = tokenizer.encode("summarize: " + text, return_tensors="pt", max_length=max_input_tokens, truncation=True)
     summary_ids = model.generate(inputs, max_length=max_output_tokens, min_length=50, length_penalty=2.0, num_beams=4, early_stopping=True)
     return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
 def hierarchical_summarization(text):
-
+
     chunks = split_text_with_optimized_overlap(text)
 
     chunk_summaries = [summarize_text(chunk) for chunk in chunks]
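Both functions above implement map-then-reduce summarization: BART summarizes each overlapping chunk, and the chunk summaries are then combined. The diff stops at the per-chunk pass, so the combine step below is a sketch under that assumption:

    def hierarchical_summarization(text):
        chunks = split_text_with_optimized_overlap(text)
        chunk_summaries = [summarize_text(chunk) for chunk in chunks]
        combined = " ".join(chunk_summaries)
        # re-summarize only when there was more than one chunk to merge
        return summarize_text(combined) if len(chunk_summaries) > 1 else combined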
@@ -267,22 +254,19 @@ def extract_entities_with_gliner(text, default_entity_types, custom_entity_types
     """
     Extract entities using GLINER with default and custom entity types.
     """
-
+
     entity_types = default_entity_types.split(",") + [etype.strip() for etype in custom_entity_types.split(",") if custom_entity_types]
 
-    # Remove duplicates and empty strings
     entity_types = list(set([etype.strip() for etype in entity_types if etype.strip()]))
-
-    # Use GLINER to extract entities
+
     entities = gliner_model.predict_entities(text, entity_types)
-
-    # Format entities for display
+
     formatted_entities = "\n".join([f"{i+1}: {ent['text']} --> {ent['label']}" for i, ent in enumerate(entities)])
     return formatted_entities
 
 ### 5️⃣ Main Processing Function
 def process_url(url):
-
+
     content = fetch_content(url)
     metadata,cleaned_text = extract_and_clean_text(content)
     detected_lang = detect_language(cleaned_text)
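The GLiNER calls in this hunk are the library's standard two-step API: load once with from_pretrained, then predict_entities(text, labels) with free-form label strings. Minimal usage sketch (the labels here are illustrative):

    from gliner import GLiNER

    model = GLiNER.from_pretrained("urchade/gliner_base")
    entities = model.predict_entities(
        "Marie Curie won the Nobel Prize in 1903.",
        ["person", "award", "date"],  # labels are arbitrary strings, not a fixed tag set
    )
    for ent in entities:
        print(ent["text"], "-->", ent["label"])  # same dict keys the formatting line above uses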
@@ -297,7 +281,7 @@ with gr.Blocks() as demo:
     url_input = gr.Textbox(label="Enter URL", placeholder="https://example.com/article")
 
     voice_selection = gr.Dropdown(AVAILABLE_VOICES, label="Select Voice", value="bm_george")
-    tts_option = gr.Radio(["Summary
+    tts_option = gr.Radio(["TTS based on Summary", "TTS based on Raw Data"], value="TTS based on Summary", label="Select TTS Source")
     with gr.Row():
         process_text_button = gr.Button("Fetch Text & Detect Language",scale = 1)
         process_audio_button = gr.Button("Generate Audio", visible=False,scale = 1)
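Why this one-line change matters: gr.Radio hands the selected label string to the click handler as-is, so the comparison in the audio lambda (two hunks below) has to match it character for character; the commit renames both sides to "TTS based on Summary" in lockstep.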
@@ -343,13 +327,12 @@ with gr.Blocks() as demo:
         show_progress=True
     )
 
-
-
+
     process_audio_button.click(
         lambda text, summary, lang, voice, tts_choice: (
             None, # Clear previous audio
             generate_audio_kokoro(
-                summary if tts_choice == "Summary
+                summary if tts_choice == "TTS based on Summary" else text, lang, voice
             )
         ),
         inputs=[extracted_text, summary_output, detected_lang, voice_selection, tts_option],
@@ -359,9 +342,9 @@ with gr.Blocks() as demo:
 
 
     process_ner_button.click(
-
+
         extract_entities_with_gliner,
-
+
         inputs=[extracted_text, default_entity_types, custom_entity_types],
         outputs=[ner_output]
     )
|