Spaces:

PuristanLabs1
/

VocalWeb

Sleeping

App Files Files Community

PuristanLabs1 committed on Mar 2

Commit

877bcf1

verified ·

1 Parent(s): bd8a12c

Update app.py

Browse files

Files changed (1) hide show

app.py +30 -3

app.py CHANGED Viewed

@@ -19,6 +19,7 @@ from wordcloud import WordCloud
 import matplotlib.pyplot as plt
 from PIL import Image
 import io
 nltk.download("punkt")
 nltk.download("punkt_tab")
@@ -50,6 +51,9 @@ model_name = "facebook/bart-large-cnn"
 tokenizer = BartTokenizer.from_pretrained(model_name)
 model = BartForConditionalGeneration.from_pretrained(model_name)
 ### 1️⃣ Fetch and Extract Content (Runs Immediately)
 def fetch_and_display_content(url):
     """Fetch and extract text from a given URL (HTML or PDF)."""
@@ -262,6 +266,23 @@ def hierarchical_summarization(text):
     final_summary = " ".join(chunk_summaries)
     return final_summary
 ### 5️⃣ Main Processing Function
 def process_url(url):
     """Processes the URL, extracts text, detects language, and converts to audio."""
@@ -289,7 +310,7 @@ with gr.Blocks() as demo:
     with gr.Row():
         extracted_text = gr.Textbox(label="Extracted Content", visible=False, interactive=False, lines=15)
         metadata_output = gr.JSON(label="Article Metadata", visible=False)  # Displays metadata
-        wordcloud_output = gr.Image(label="Word Cloud", visible=True)
     detected_lang = gr.Textbox(label="Detected Language", visible=False)
@@ -297,6 +318,10 @@ with gr.Blocks() as demo:
     full_audio_output = gr.Audio(label="Generated Audio", visible=True)
     ner_output = gr.Textbox(label="Extracted Entities", visible=True, interactive=False)
     #wordcloud_output = gr.Image(label="Word Cloud", visible=True)
     # Step 1: Fetch Text & Detect Language First
     process_text_button.click(
@@ -324,8 +349,10 @@ with gr.Blocks() as demo:
     )
     process_ner_button.click(
-        extract_entities_with_stanza,
-        inputs=[extracted_text],
         outputs=[ner_output]
     )

 import matplotlib.pyplot as plt
 from PIL import Image
 import io
+from gliner import GLiNER
 nltk.download("punkt")
 nltk.download("punkt_tab")
 tokenizer = BartTokenizer.from_pretrained(model_name)
 model = BartForConditionalGeneration.from_pretrained(model_name)
+# Initialize GLINER model
+gliner_model = GLiNER.from_pretrained("urchade/gliner_base")
 ### 1️⃣ Fetch and Extract Content (Runs Immediately)
 def fetch_and_display_content(url):
     """Fetch and extract text from a given URL (HTML or PDF)."""
     final_summary = " ".join(chunk_summaries)
     return final_summary
+def _parse_entity_types(default_entity_types, custom_entity_types):
+    """Merge two comma-separated label strings into an ordered, de-duplicated list."""
+    # Treat a missing value (None) like an empty string so .split never fails.
+    combined = f"{default_entity_types or ''},{custom_entity_types or ''}"
+    labels = (label.strip() for label in combined.split(","))
+    # dict.fromkeys drops duplicates while keeping first-seen order, so the
+    # label list handed to the model is stable between runs (a set is not).
+    return list(dict.fromkeys(label for label in labels if label))
+
+
+def extract_entities_with_gliner(text, default_entity_types, custom_entity_types):
+    """
+    Extract entities using GLINER with default and custom entity types.
+
+    Args:
+        text: The text to scan for entities.
+        default_entity_types: Comma-separated entity labels (may be empty or None).
+        custom_entity_types: Additional comma-separated labels (may be empty or None).
+
+    Returns:
+        One line per entity in the form "<n>: <text> --> <label>", numbered
+        from 1, or an empty string when there is no text or no label to search for.
+    """
+    entity_types = _parse_entity_types(default_entity_types, custom_entity_types)
+    # Nothing to analyse or nothing to look for: skip the model call entirely.
+    if not text or not text.strip() or not entity_types:
+        return ""
+    # Use GLINER to extract entities
+    entities = gliner_model.predict_entities(text, entity_types)
+    # Format entities for display
+    return "\n".join(
+        f"{index}: {ent['text']} --> {ent['label']}"
+        for index, ent in enumerate(entities, start=1)
+    )
 ### 5️⃣ Main Processing Function
 def process_url(url):
     """Processes the URL, extracts text, detects language, and converts to audio."""
     with gr.Row():
         extracted_text = gr.Textbox(label="Extracted Content", visible=False, interactive=False, lines=15)
         metadata_output = gr.JSON(label="Article Metadata", visible=False)  # Displays metadata
+        # Indented with spaces like its siblings; tab indentation here raises TabError.
+        wordcloud_output = gr.Image(label="Word Cloud", visible=True)
     detected_lang = gr.Textbox(label="Detected Language", visible=False)
     full_audio_output = gr.Audio(label="Generated Audio", visible=True)
     ner_output = gr.Textbox(label="Extracted Entities", visible=True, interactive=False)
     #wordcloud_output = gr.Image(label="Word Cloud", visible=True)
+	#Add textboxes for default and custom entity types
+    default_entity_types = gr.Textbox(label="Default Entity Types", value="PERSON, Organization, location, Date, PRODUCT, EVENT", interactive=True)
+    custom_entity_types = gr.Textbox(label="Custom Entity Types", placeholder="Enter additional entity types (comma-separated)", interactive=True)
     # Step 1: Fetch Text & Detect Language First
     process_text_button.click(
     )
     process_ner_button.click(
+        # GLiNER handler replaces extract_entities_with_stanza, which took only
+        # inputs=[extracted_text]; the two entity-type textboxes are passed as well.
+        extract_entities_with_gliner,
+        inputs=[extracted_text, default_entity_types, custom_entity_types],
         outputs=[ner_output]
     )