PuristanLabs1 committed
Commit 0097003 · verified · 1 Parent(s): ea1861f

Update app.py

Files changed (1)
  1. app.py +44 -61
app.py CHANGED
@@ -1,4 +1,4 @@
- import spaces # Import spaces first to avoid CUDA initialization issues
+ import spaces
  import os
  import gradio as gr
  import trafilatura
@@ -25,12 +25,11 @@ import tempfile
  nltk.download("punkt")
  nltk.download("punkt_tab")

- # Load Stanza's NER model
- stanza.download("en") # Load English pipeline (can be changed for other languages)
- nlp = stanza.Pipeline("en", processors="tokenize,ner", use_gpu=False) # Disable GPU for Hugging Face Spaces
+ stanza.download("en")
+ nlp = stanza.Pipeline("en", processors="tokenize,ner", use_gpu=False)

- # Initialize KokoroTTS with default English
- kokoro_tts = KPipeline(lang_code='a', device="cpu") # Load initially on CPU
+
+ kokoro_tts = KPipeline(lang_code='a', device="cpu")

  # Supported TTS Languages
  SUPPORTED_TTS_LANGUAGES = {
@@ -55,12 +54,12 @@ model = BartForConditionalGeneration.from_pretrained(model_name)
  # Initialize GLINER model
  gliner_model = GLiNER.from_pretrained("urchade/gliner_base")

- ### 1️⃣ Fetch and Extract Content (Runs Immediately)
+
  def fetch_and_display_content(url):
  """Fetch and extract text from a given URL (HTML or PDF)."""
  if url.endswith(".pdf") or "pdf" in url:
  converter = MarkItDown()
- #result = converter.convert(source)
+
  text = converter.convert(url).text_content
  else:
  downloaded = trafilatura.fetch_url(url)
@@ -72,7 +71,7 @@ def fetch_and_display_content(url):
  metadata["Detected Language"] = detected_lang.upper()
  return cleaned_text, metadata, detected_lang, gr.update(visible=True), gr.update(visible=True), gr.update(visible=True), gr.update(visible=True), gr.update(visible=True)

- ### 2️⃣ Cleaning Function
+
  def extract_and_clean_text(data):

  metadata_dict = {}
@@ -84,71 +83,61 @@ def extract_and_clean_text(data):
  metadata_raw = metadata_pattern.group(1).strip()
  data = data[metadata_pattern.end():].strip() # Remove metadata from text

- # Convert metadata into dictionary format manually (since YAML isn't reliable)
+
  metadata_lines = metadata_raw.split("\n")
  for line in metadata_lines:
- if ": " in line: # Only process lines with key-value pairs
+ if ": " in line:
  key, value = line.split(": ", 1) # Split at first ": "

- # Convert lists (wrapped in square brackets) into Python lists
+
  if value.startswith("[") and value.endswith("]"):
  try:
- value = json.loads(value) # Convert to list
+ value = json.loads(value)
  except json.JSONDecodeError:
- pass # If JSON parsing fails, keep it as a string
+ pass

- metadata_dict[key.strip()] = value.strip() # Store cleaned key-value pair
+ metadata_dict[key.strip()] = value.strip()

  #Step 2: Remove everything before the "Abstract" section
  def remove_text_before_abstract(text):
  """Removes all text before the first occurrence of 'Abstract'."""
- abstract_pattern = re.compile(r"(?i)\babstract\b") # Case-insensitive search
+ abstract_pattern = re.compile(r"(?i)\babstract\b")
  match = abstract_pattern.search(text)

  if match:
- return text[match.start():] # Keep text from "Abstract" onwards
- return text # If "Abstract" is not found, return the full text
+ return text[match.start():]
+ return text

  data = remove_text_before_abstract(data)

  # Step 3: Clean the extracted text
  def clean_text(text):
- # Remove inline citations like [2][4]
+
  text = re.sub(r'\[\d+\]', '', text)
-
- # Remove URLs (both direct links and markdown-style links)
- text = re.sub(r'http[s]?://\S+', '', text) # Direct links
- text = re.sub(r'\[.*?\]\(http[s]?://\S+\)', '', text) # Markdown links
-
- # Remove markdown-style headings and special characters (#, ##, *, etc.)
- #text = re.sub(r'^\s*#+\s*', '', text, flags=re.MULTILINE) # Remove headings
- #text = re.sub(r'[*_`]', '', text) # Remove bold/italic/monospace markers

- # Remove References, Bibliography, External Links, and Comments sections
+ text = re.sub(r'http[s]?://\S+', '', text)
+ text = re.sub(r'\[.*?\]\(http[s]?://\S+\)', '', text)
+
+
  patterns = [r'References\b.*', r'Bibliography\b.*', r'External Links\b.*', r'COMMENTS\b.*']
  for pattern in patterns:
  text = re.sub(pattern, '', text, flags=re.IGNORECASE | re.DOTALL)

- # Remove extra whitespace and newlines
  text = re.sub(r'\n\s*\n+', '\n\n', text).strip()

  return text
-
- #cleaned_text = clean_text(data)
-
- #return metadata_dict, cleaned_text
  return metadata_dict, clean_text(data)

  ### 3️⃣ Language Detection
  def detect_language(text):
- """Detects the language of extracted text."""
+
  try:
  lang = detect(text)
  return lang if lang in SUPPORTED_TTS_LANGUAGES else "en" # Default to English if not supported
  except:
- return "en" # Default to English if detection fails
+ return "en"

- ### 2️⃣ Named Entity Recognition (NER) Using Stanza
+ #Not using this one below. Using Gliner
  def extract_entities_with_stanza(text, chunk_size=1000):
  """Splits text into chunks, runs Stanza NER, and combines results."""
  sentences = sent_tokenize(text)
@@ -156,7 +145,7 @@ def extract_entities_with_stanza(text, chunk_size=1000):
  current_chunk = []
  current_length = 0

- # Split text into manageable chunks
+
  for sentence in sentences:
  if current_length + len(sentence) > chunk_size:
  chunks.append(" ".join(current_chunk))
@@ -169,7 +158,7 @@ def extract_entities_with_stanza(text, chunk_size=1000):
  if current_chunk:
  chunks.append(" ".join(current_chunk))

- # Process each chunk separately with Stanza
+
  entities = []
  for chunk in chunks:
  doc = nlp(chunk)
@@ -181,25 +170,24 @@ def extract_entities_with_stanza(text, chunk_size=1000):
  return entities

  def generate_wordcloud(text):
- """Generate a word cloud from the given text."""
+
  if not text:
  return None

- # Generate word cloud
+
  wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)

- # Convert word cloud to PIL image
+
  plt.figure(figsize=(10, 5))
  plt.imshow(wordcloud, interpolation='bilinear')
  plt.axis('off')

- # Save the plot to a BytesIO object
  buf = io.BytesIO()
  plt.savefig(buf, format='png', bbox_inches='tight', pad_inches=0)
  buf.seek(0)
  plt.close()

- # Convert to PIL image
+
  image = Image.open(buf)
  return image

@@ -207,7 +195,7 @@ def generate_wordcloud(text):
  @spaces.GPU(duration=1000)
  def generate_audio_kokoro(text, lang, selected_voice):
  """Generate speech using KokoroTTS for supported languages."""
- global kokoro_tts # Access the preloaded model
+ global kokoro_tts
  if os.path.exists(f"audio_{lang}.wav"):
  os.remove(f"audio_{lang}.wav")

@@ -215,7 +203,7 @@ def generate_audio_kokoro(text, lang, selected_voice):
  #generator = kokoro_tts(text, voice="bm_george", speed=1, split_pattern=r'\n+')
  generator = kokoro_tts(text, voice=selected_voice, speed=1, split_pattern=r'\n+')

- # Generate and collect audio data
+
  audio_data_list = [audio for _, _, audio in generator]
  full_audio = np.concatenate(audio_data_list)

@@ -250,13 +238,12 @@ def split_text_with_optimized_overlap(text, max_tokens=1024, overlap_tokens=25):
  chunks.append(previous_chunk_text + " " + " ".join(current_chunk))
  return chunks
  def summarize_text(text, max_input_tokens=1024, max_output_tokens=200):
- """Generates summary for a given chunk of text."""
- #inputs = tokenizer.encode("summarize: " + text, return_tensors="pt", max_length=max_input_tokens, truncation=True).to(device)
+
  inputs = tokenizer.encode("summarize: " + text, return_tensors="pt", max_length=max_input_tokens, truncation=True)
  summary_ids = model.generate(inputs, max_length=max_output_tokens, min_length=50, length_penalty=2.0, num_beams=4, early_stopping=True)
  return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
  def hierarchical_summarization(text):
- """Summarizes text in chunks."""
+
  chunks = split_text_with_optimized_overlap(text)

  chunk_summaries = [summarize_text(chunk) for chunk in chunks]
@@ -267,22 +254,19 @@ def extract_entities_with_gliner(text, default_entity_types, custom_entity_types
  """
  Extract entities using GLINER with default and custom entity types.
  """
- # Combine default and custom entity types
+
  entity_types = default_entity_types.split(",") + [etype.strip() for etype in custom_entity_types.split(",") if custom_entity_types]

- # Remove duplicates and empty strings
  entity_types = list(set([etype.strip() for etype in entity_types if etype.strip()]))
-
- # Use GLINER to extract entities
+
  entities = gliner_model.predict_entities(text, entity_types)
-
- # Format entities for display
+
  formatted_entities = "\n".join([f"{i+1}: {ent['text']} --> {ent['label']}" for i, ent in enumerate(entities)])
  return formatted_entities

  ### 5️⃣ Main Processing Function
  def process_url(url):
- """Processes the URL, extracts text, detects language, and converts to audio."""
+
  content = fetch_content(url)
  metadata,cleaned_text = extract_and_clean_text(content)
  detected_lang = detect_language(cleaned_text)
@@ -297,7 +281,7 @@ with gr.Blocks() as demo:
  url_input = gr.Textbox(label="Enter URL", placeholder="https://example.com/article")

  voice_selection = gr.Dropdown(AVAILABLE_VOICES, label="Select Voice", value="bm_george")
- tts_option = gr.Radio(["Summary Audio", "Raw Data Audio"], value="Summary Audio", label="Select TTS Source")
+ tts_option = gr.Radio(["TTS based on Summary", "TTS based on Raw Data"], value="TTS based on Summary", label="Select TTS Source")
  with gr.Row():
  process_text_button = gr.Button("Fetch Text & Detect Language",scale = 1)
  process_audio_button = gr.Button("Generate Audio", visible=False,scale = 1)
@@ -343,13 +327,12 @@ with gr.Blocks() as demo:
  show_progress=True
  )

- #process_summary_button.click(hierarchical_summarization, inputs=[extracted_text], outputs=[summary_output])
-
+
  process_audio_button.click(
  lambda text, summary, lang, voice, tts_choice: (
  None, # Clear previous audio
  generate_audio_kokoro(
- summary if tts_choice == "Summary Audio" else text, lang, voice
+ summary if tts_choice == "TTS based on Summary" else text, lang, voice
  )
  ),
  inputs=[extracted_text, summary_output, detected_lang, voice_selection, tts_option],
@@ -359,9 +342,9 @@ with gr.Blocks() as demo:


  process_ner_button.click(
- #extract_entities_with_stanza,
+
  extract_entities_with_gliner,
- #inputs=[extracted_text],
+
  inputs=[extracted_text, default_entity_types, custom_entity_types],
  outputs=[ner_output]
  )
 