PuristanLabs1 committed on
Commit
877bcf1
·
verified ·
1 Parent(s): bd8a12c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +30 -3
app.py CHANGED
@@ -19,6 +19,7 @@ from wordcloud import WordCloud
19
  import matplotlib.pyplot as plt
20
  from PIL import Image
21
  import io
 
22
 
23
  nltk.download("punkt")
24
  nltk.download("punkt_tab")
@@ -50,6 +51,9 @@ model_name = "facebook/bart-large-cnn"
50
  tokenizer = BartTokenizer.from_pretrained(model_name)
51
  model = BartForConditionalGeneration.from_pretrained(model_name)
52
 
 
 
 
53
  ### 1️⃣ Fetch and Extract Content (Runs Immediately)
54
  def fetch_and_display_content(url):
55
  """Fetch and extract text from a given URL (HTML or PDF)."""
@@ -262,6 +266,23 @@ def hierarchical_summarization(text):
262
  final_summary = " ".join(chunk_summaries)
263
  return final_summary
264
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
265
  ### 5️⃣ Main Processing Function
266
  def process_url(url):
267
  """Processes the URL, extracts text, detects language, and converts to audio."""
@@ -289,7 +310,7 @@ with gr.Blocks() as demo:
289
  with gr.Row():
290
  extracted_text = gr.Textbox(label="Extracted Content", visible=False, interactive=False, lines=15)
291
  metadata_output = gr.JSON(label="Article Metadata", visible=False) # Displays metadata
292
- wordcloud_output = gr.Image(label="Word Cloud", visible=True)
293
 
294
 
295
  detected_lang = gr.Textbox(label="Detected Language", visible=False)
@@ -297,6 +318,10 @@ with gr.Blocks() as demo:
297
  full_audio_output = gr.Audio(label="Generated Audio", visible=True)
298
  ner_output = gr.Textbox(label="Extracted Entities", visible=True, interactive=False)
299
  #wordcloud_output = gr.Image(label="Word Cloud", visible=True)
 
 
 
 
300
 
301
  # Step 1: Fetch Text & Detect Language First
302
  process_text_button.click(
@@ -324,8 +349,10 @@ with gr.Blocks() as demo:
324
  )
325
 
326
  process_ner_button.click(
327
- extract_entities_with_stanza,
328
- inputs=[extracted_text],
 
 
329
  outputs=[ner_output]
330
  )
331
 
 
19
  import matplotlib.pyplot as plt
20
  from PIL import Image
21
  import io
22
+ from gliner import GLiNER
23
 
24
  nltk.download("punkt")
25
  nltk.download("punkt_tab")
 
51
  tokenizer = BartTokenizer.from_pretrained(model_name)
52
  model = BartForConditionalGeneration.from_pretrained(model_name)
53
 
54
+ # Initialize GLINER model
55
+ gliner_model = GLiNER.from_pretrained("urchade/gliner_base")
56
+
57
  ### 1️⃣ Fetch and Extract Content (Runs Immediately)
58
  def fetch_and_display_content(url):
59
  """Fetch and extract text from a given URL (HTML or PDF)."""
 
266
  final_summary = " ".join(chunk_summaries)
267
  return final_summary
268
 
269
def extract_entities_with_gliner(text, default_entity_types, custom_entity_types):
    """Extract entities from ``text`` with GLiNER and format them for display.

    Args:
        text: The text to analyse.
        default_entity_types: Comma-separated entity labels.
        custom_entity_types: Optional comma-separated extra labels. May be
            empty or ``None``.

    Returns:
        One line per entity in the form ``"<n>: <entity text> --> <label>"``,
        numbered from 1 and joined with newlines. An empty string is returned
        when there is no text, no usable label, or no entity was found.
    """
    # Nothing to analyse: skip the model call entirely.
    if not text or not text.strip():
        return ""

    # Treat a missing textbox value (None) like an empty one so .split() is
    # never called on None.
    raw_labels = f"{default_entity_types or ''},{custom_entity_types or ''}".split(",")

    # dict.fromkeys drops duplicates while keeping the order the labels were
    # typed in, so the label list is the same on every run (a set is not).
    entity_types = list(
        dict.fromkeys(label.strip() for label in raw_labels if label.strip())
    )
    if not entity_types:
        return ""

    entities = gliner_model.predict_entities(text, entity_types)

    return "\n".join(
        f"{index}: {ent['text']} --> {ent['label']}"
        for index, ent in enumerate(entities, start=1)
    )
285
+
286
  ### 5️⃣ Main Processing Function
287
  def process_url(url):
288
  """Processes the URL, extracts text, detects language, and converts to audio."""
 
310
  with gr.Row():
311
  extracted_text = gr.Textbox(label="Extracted Content", visible=False, interactive=False, lines=15)
312
  metadata_output = gr.JSON(label="Article Metadata", visible=False) # Displays metadata
313
+ wordcloud_output = gr.Image(label="Word Cloud", visible=True)
314
 
315
 
316
  detected_lang = gr.Textbox(label="Detected Language", visible=False)
 
318
  full_audio_output = gr.Audio(label="Generated Audio", visible=True)
319
  ner_output = gr.Textbox(label="Extracted Entities", visible=True, interactive=False)
320
  #wordcloud_output = gr.Image(label="Word Cloud", visible=True)
321
+
322
+ #Add textboxes for default and custom entity types
323
+ default_entity_types = gr.Textbox(label="Default Entity Types", value="PERSON, Organization, location, Date, PRODUCT, EVENT", interactive=True)
324
+ custom_entity_types = gr.Textbox(label="Custom Entity Types", placeholder="Enter additional entity types (comma-separated)", interactive=True)
325
 
326
  # Step 1: Fetch Text & Detect Language First
327
  process_text_button.click(
 
349
  )
350
 
351
  process_ner_button.click(
352
+ #extract_entities_with_stanza,
353
+ extract_entities_with_gliner
354
+ #inputs=[extracted_text],
355
+ inputs=[extracted_text, default_entity_types, custom_entity_types],
356
  outputs=[ner_output]
357
  )
358