Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -19,6 +19,7 @@ from wordcloud import WordCloud
|
|
19 |
import matplotlib.pyplot as plt
|
20 |
from PIL import Image
|
21 |
import io
|
|
|
22 |
|
23 |
nltk.download("punkt")
|
24 |
nltk.download("punkt_tab")
|
@@ -50,6 +51,9 @@ model_name = "facebook/bart-large-cnn"
|
|
50 |
tokenizer = BartTokenizer.from_pretrained(model_name)
|
51 |
model = BartForConditionalGeneration.from_pretrained(model_name)
|
52 |
|
|
|
|
|
|
|
53 |
### 1️⃣ Fetch and Extract Content (Runs Immediately)
|
54 |
def fetch_and_display_content(url):
|
55 |
"""Fetch and extract text from a given URL (HTML or PDF)."""
|
@@ -262,6 +266,23 @@ def hierarchical_summarization(text):
|
|
262 |
final_summary = " ".join(chunk_summaries)
|
263 |
return final_summary
|
264 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
265 |
### 5️⃣ Main Processing Function
|
266 |
def process_url(url):
|
267 |
"""Processes the URL, extracts text, detects language, and converts to audio."""
|
@@ -289,7 +310,7 @@ with gr.Blocks() as demo:
|
|
289 |
with gr.Row():
|
290 |
extracted_text = gr.Textbox(label="Extracted Content", visible=False, interactive=False, lines=15)
|
291 |
metadata_output = gr.JSON(label="Article Metadata", visible=False) # Displays metadata
|
292 |
-
|
293 |
|
294 |
|
295 |
detected_lang = gr.Textbox(label="Detected Language", visible=False)
|
@@ -297,6 +318,10 @@ with gr.Blocks() as demo:
|
|
297 |
full_audio_output = gr.Audio(label="Generated Audio", visible=True)
|
298 |
ner_output = gr.Textbox(label="Extracted Entities", visible=True, interactive=False)
|
299 |
#wordcloud_output = gr.Image(label="Word Cloud", visible=True)
|
|
|
|
|
|
|
|
|
300 |
|
301 |
# Step 1: Fetch Text & Detect Language First
|
302 |
process_text_button.click(
|
@@ -324,8 +349,10 @@ with gr.Blocks() as demo:
|
|
324 |
)
|
325 |
|
326 |
process_ner_button.click(
|
327 |
-
extract_entities_with_stanza,
|
328 |
-
|
|
|
|
|
329 |
outputs=[ner_output]
|
330 |
)
|
331 |
|
|
|
19 |
import matplotlib.pyplot as plt
|
20 |
from PIL import Image
|
21 |
import io
|
22 |
+
from gliner import GLiNER
|
23 |
|
24 |
nltk.download("punkt")
|
25 |
nltk.download("punkt_tab")
|
|
|
51 |
tokenizer = BartTokenizer.from_pretrained(model_name)
|
52 |
model = BartForConditionalGeneration.from_pretrained(model_name)
|
53 |
|
54 |
+
# Initialize the GLiNER model
|
55 |
+
gliner_model = GLiNER.from_pretrained("urchade/gliner_base")
|
56 |
+
|
57 |
### 1️⃣ Fetch and Extract Content (Runs Immediately)
|
58 |
def fetch_and_display_content(url):
|
59 |
"""Fetch and extract text from a given URL (HTML or PDF)."""
|
|
|
266 |
final_summary = " ".join(chunk_summaries)
|
267 |
return final_summary
|
268 |
|
269 |
+
def extract_entities_with_gliner(text, default_entity_types, custom_entity_types):
    """Extract named entities from ``text`` with the module-level GLiNER model.

    Args:
        text: Text to scan for entities.
        default_entity_types: Comma-separated entity labels.
        custom_entity_types: Comma-separated additional labels; may be empty
            or None.

    Returns:
        One line per detected entity, formatted as
        ``"<n>: <entity text> --> <label>"`` and joined with newlines.
        An empty string when there is no text, no usable label, or no
        entity was found.
    """
    # Treat a None field like an empty one so that .split() can never fail.
    raw_labels = f"{default_entity_types or ''},{custom_entity_types or ''}".split(",")

    # dict.fromkeys removes duplicates while keeping the order in which the
    # labels were written, so the label list handed to the model is stable
    # across runs (a set would shuffle it).
    entity_types = list(
        dict.fromkeys(label.strip() for label in raw_labels if label.strip())
    )

    # Nothing to analyse or nothing to look for: skip the model call.
    if not text or not text.strip() or not entity_types:
        return ""

    entities = gliner_model.predict_entities(text, entity_types)

    # Number the entities from 1 for display.
    return "\n".join(
        f"{index}: {entity['text']} --> {entity['label']}"
        for index, entity in enumerate(entities, start=1)
    )
|
285 |
+
|
286 |
### 5️⃣ Main Processing Function
|
287 |
def process_url(url):
|
288 |
"""Processes the URL, extracts text, detects language, and converts to audio."""
|
|
|
310 |
with gr.Row():
|
311 |
extracted_text = gr.Textbox(label="Extracted Content", visible=False, interactive=False, lines=15)
|
312 |
metadata_output = gr.JSON(label="Article Metadata", visible=False) # Displays metadata
|
313 |
+
wordcloud_output = gr.Image(label="Word Cloud", visible=True)
|
314 |
|
315 |
|
316 |
detected_lang = gr.Textbox(label="Detected Language", visible=False)
|
|
|
318 |
full_audio_output = gr.Audio(label="Generated Audio", visible=True)
|
319 |
ner_output = gr.Textbox(label="Extracted Entities", visible=True, interactive=False)
|
320 |
#wordcloud_output = gr.Image(label="Word Cloud", visible=True)
|
321 |
+
|
322 |
+
# Add textboxes for default and custom entity types
|
323 |
+
default_entity_types = gr.Textbox(label="Default Entity Types", value="PERSON, Organization, location, Date, PRODUCT, EVENT", interactive=True)
|
324 |
+
custom_entity_types = gr.Textbox(label="Custom Entity Types", placeholder="Enter additional entity types (comma-separated)", interactive=True)
|
325 |
|
326 |
# Step 1: Fetch Text & Detect Language First
|
327 |
process_text_button.click(
|
|
|
349 |
)
|
350 |
|
351 |
# Run GLiNER entity extraction on the extracted text, using the entity types
# from the default and custom textboxes (replaces the earlier Stanza extractor).
process_ner_button.click(
    extract_entities_with_gliner,
    inputs=[extracted_text, default_entity_types, custom_entity_types],
    outputs=[ner_output],
)
|
358 |
|