PuristanLabs1 committed
Commit 0097003 · verified · 1 Parent(s): ea1861f

Update app.py

Files changed (1)
  1. app.py +44 -61
app.py CHANGED
@@ -1,4 +1,4 @@
- import spaces # Import spaces first to avoid CUDA initialization issues
+ import spaces
  import os
  import gradio as gr
  import trafilatura
@@ -25,12 +25,11 @@ import tempfile
  nltk.download("punkt")
  nltk.download("punkt_tab")

- # Load Stanza's NER model
- stanza.download("en") # Load English pipeline (can be changed for other languages)
- nlp = stanza.Pipeline("en", processors="tokenize,ner", use_gpu=False) # Disable GPU for Hugging Face Spaces
+ stanza.download("en")
+ nlp = stanza.Pipeline("en", processors="tokenize,ner", use_gpu=False)

- # Initialize KokoroTTS with default English
- kokoro_tts = KPipeline(lang_code='a', device="cpu") # Load initially on CPU
+
+ kokoro_tts = KPipeline(lang_code='a', device="cpu")

  # Supported TTS Languages
  SUPPORTED_TTS_LANGUAGES = {
@@ -55,12 +54,12 @@ model = BartForConditionalGeneration.from_pretrained(model_name)
  # Initialize GLINER model
  gliner_model = GLiNER.from_pretrained("urchade/gliner_base")

- ### 1️⃣ Fetch and Extract Content (Runs Immediately)
+
  def fetch_and_display_content(url):
  """Fetch and extract text from a given URL (HTML or PDF)."""
  if url.endswith(".pdf") or "pdf" in url:
  converter = MarkItDown()
- #result = converter.convert(source)
+
  text = converter.convert(url).text_content
  else:
  downloaded = trafilatura.fetch_url(url)
@@ -72,7 +71,7 @@ def fetch_and_display_content(url):
  metadata["Detected Language"] = detected_lang.upper()
  return cleaned_text, metadata, detected_lang, gr.update(visible=True), gr.update(visible=True), gr.update(visible=True), gr.update(visible=True), gr.update(visible=True)

- ### 2️⃣ Cleaning Function
+
  def extract_and_clean_text(data):

  metadata_dict = {}
@@ -84,71 +83,61 @@ def extract_and_clean_text(data):
  metadata_raw = metadata_pattern.group(1).strip()
  data = data[metadata_pattern.end():].strip() # Remove metadata from text

- # Convert metadata into dictionary format manually (since YAML isn't reliable)
+
  metadata_lines = metadata_raw.split("\n")
  for line in metadata_lines:
- if ": " in line: # Only process lines with key-value pairs
+ if ": " in line:
  key, value = line.split(": ", 1) # Split at first ": "

- # Convert lists (wrapped in square brackets) into Python lists
+
  if value.startswith("[") and value.endswith("]"):
  try:
- value = json.loads(value) # Convert to list
+ value = json.loads(value)
  except json.JSONDecodeError:
- pass # If JSON parsing fails, keep it as a string
+ pass

- metadata_dict[key.strip()] = value.strip() # Store cleaned key-value pair
+ metadata_dict[key.strip()] = value.strip()

  #Step 2: Remove everything before the "Abstract" section
  def remove_text_before_abstract(text):
  """Removes all text before the first occurrence of 'Abstract'."""
- abstract_pattern = re.compile(r"(?i)\babstract\b") # Case-insensitive search
+ abstract_pattern = re.compile(r"(?i)\babstract\b")
  match = abstract_pattern.search(text)

  if match:
- return text[match.start():] # Keep text from "Abstract" onwards
- return text # If "Abstract" is not found, return the full text
+ return text[match.start():]
+ return text

  data = remove_text_before_abstract(data)

  # Step 3: Clean the extracted text
  def clean_text(text):
- # Remove inline citations like [2][4]
+
  text = re.sub(r'\[\d+\]', '', text)
-
- # Remove URLs (both direct links and markdown-style links)
- text = re.sub(r'http[s]?://\S+', '', text) # Direct links
- text = re.sub(r'\[.*?\]\(http[s]?://\S+\)', '', text) # Markdown links
-
- # Remove markdown-style headings and special characters (#, ##, *, etc.)
- #text = re.sub(r'^\s*#+\s*', '', text, flags=re.MULTILINE) # Remove headings
- #text = re.sub(r'[*_`]', '', text) # Remove bold/italic/monospace markers

- # Remove References, Bibliography, External Links, and Comments sections
+ text = re.sub(r'http[s]?://\S+', '', text)
+ text = re.sub(r'\[.*?\]\(http[s]?://\S+\)', '', text)
+
+
  patterns = [r'References\b.*', r'Bibliography\b.*', r'External Links\b.*', r'COMMENTS\b.*']
  for pattern in patterns:
  text = re.sub(pattern, '', text, flags=re.IGNORECASE | re.DOTALL)

- # Remove extra whitespace and newlines
  text = re.sub(r'\n\s*\n+', '\n\n', text).strip()

  return text
-
- #cleaned_text = clean_text(data)
-
- #return metadata_dict, cleaned_text
  return metadata_dict, clean_text(data)

  ### 3️⃣ Language Detection
  def detect_language(text):
- """Detects the language of extracted text."""
+
  try:
  lang = detect(text)
  return lang if lang in SUPPORTED_TTS_LANGUAGES else "en" # Default to English if not supported
  except:
- return "en" # Default to English if detection fails
+ return "en"

- ### 2️⃣ Named Entity Recognition (NER) Using Stanza
+ #Not using this one below. Using Gliner
  def extract_entities_with_stanza(text, chunk_size=1000):
  """Splits text into chunks, runs Stanza NER, and combines results."""
  sentences = sent_tokenize(text)
@@ -156,7 +145,7 @@ def extract_entities_with_stanza(text, chunk_size=1000):
  current_chunk = []
  current_length = 0

- # Split text into manageable chunks
+
  for sentence in sentences:
  if current_length + len(sentence) > chunk_size:
  chunks.append(" ".join(current_chunk))
@@ -169,7 +158,7 @@ def extract_entities_with_stanza(text, chunk_size=1000):
  if current_chunk:
  chunks.append(" ".join(current_chunk))

- # Process each chunk separately with Stanza
+
  entities = []
  for chunk in chunks:
  doc = nlp(chunk)
@@ -181,25 +170,24 @@ def extract_entities_with_stanza(text, chunk_size=1000):
  return entities

  def generate_wordcloud(text):
- """Generate a word cloud from the given text."""
+
  if not text:
  return None

- # Generate word cloud
+
  wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)

- # Convert word cloud to PIL image
+
  plt.figure(figsize=(10, 5))
  plt.imshow(wordcloud, interpolation='bilinear')
  plt.axis('off')

- # Save the plot to a BytesIO object
  buf = io.BytesIO()
  plt.savefig(buf, format='png', bbox_inches='tight', pad_inches=0)
  buf.seek(0)
  plt.close()

- # Convert to PIL image
+
  image = Image.open(buf)
  return image

@@ -207,7 +195,7 @@ def generate_wordcloud(text):
  @spaces.GPU(duration=1000)
  def generate_audio_kokoro(text, lang, selected_voice):
  """Generate speech using KokoroTTS for supported languages."""
- global kokoro_tts # Access the preloaded model
+ global kokoro_tts
  if os.path.exists(f"audio_{lang}.wav"):
  os.remove(f"audio_{lang}.wav")

@@ -215,7 +203,7 @@ def generate_audio_kokoro(text, lang, selected_voice):
  #generator = kokoro_tts(text, voice="bm_george", speed=1, split_pattern=r'\n+')
  generator = kokoro_tts(text, voice=selected_voice, speed=1, split_pattern=r'\n+')

- # Generate and collect audio data
+
  audio_data_list = [audio for _, _, audio in generator]
  full_audio = np.concatenate(audio_data_list)

@@ -250,13 +238,12 @@ def split_text_with_optimized_overlap(text, max_tokens=1024, overlap_tokens=25):
  chunks.append(previous_chunk_text + " " + " ".join(current_chunk))
  return chunks
  def summarize_text(text, max_input_tokens=1024, max_output_tokens=200):
- """Generates summary for a given chunk of text."""
- #inputs = tokenizer.encode("summarize: " + text, return_tensors="pt", max_length=max_input_tokens, truncation=True).to(device)
+
  inputs = tokenizer.encode("summarize: " + text, return_tensors="pt", max_length=max_input_tokens, truncation=True)
  summary_ids = model.generate(inputs, max_length=max_output_tokens, min_length=50, length_penalty=2.0, num_beams=4, early_stopping=True)
  return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
  def hierarchical_summarization(text):
- """Summarizes text in chunks."""
+
  chunks = split_text_with_optimized_overlap(text)

  chunk_summaries = [summarize_text(chunk) for chunk in chunks]
@@ -267,22 +254,19 @@ def extract_entities_with_gliner(text, default_entity_types, custom_entity_types
  """
  Extract entities using GLINER with default and custom entity types.
  """
- # Combine default and custom entity types
+
  entity_types = default_entity_types.split(",") + [etype.strip() for etype in custom_entity_types.split(",") if custom_entity_types]

- # Remove duplicates and empty strings
  entity_types = list(set([etype.strip() for etype in entity_types if etype.strip()]))
-
- # Use GLINER to extract entities
+
  entities = gliner_model.predict_entities(text, entity_types)
-
- # Format entities for display
+
  formatted_entities = "\n".join([f"{i+1}: {ent['text']} --> {ent['label']}" for i, ent in enumerate(entities)])
  return formatted_entities

  ### 5️⃣ Main Processing Function
  def process_url(url):
- """Processes the URL, extracts text, detects language, and converts to audio."""
+
  content = fetch_content(url)
  metadata,cleaned_text = extract_and_clean_text(content)
  detected_lang = detect_language(cleaned_text)
@@ -297,7 +281,7 @@ with gr.Blocks() as demo:
  url_input = gr.Textbox(label="Enter URL", placeholder="https://example.com/article")

  voice_selection = gr.Dropdown(AVAILABLE_VOICES, label="Select Voice", value="bm_george")
- tts_option = gr.Radio(["Summary Audio", "Raw Data Audio"], value="Summary Audio", label="Select TTS Source")
+ tts_option = gr.Radio(["TTS based on Summary", "TTS based on Raw Data"], value="TTS based on Summary", label="Select TTS Source")
  with gr.Row():
  process_text_button = gr.Button("Fetch Text & Detect Language",scale = 1)
  process_audio_button = gr.Button("Generate Audio", visible=False,scale = 1)
@@ -343,13 +327,12 @@ with gr.Blocks() as demo:
  show_progress=True
  )

- #process_summary_button.click(hierarchical_summarization, inputs=[extracted_text], outputs=[summary_output])
-
+
  process_audio_button.click(
  lambda text, summary, lang, voice, tts_choice: (
  None, # Clear previous audio
  generate_audio_kokoro(
- summary if tts_choice == "Summary Audio" else text, lang, voice
+ summary if tts_choice == "TTS based on Summary" else text, lang, voice
  )
  ),
  inputs=[extracted_text, summary_output, detected_lang, voice_selection, tts_option],
@@ -359,9 +342,9 @@ with gr.Blocks() as demo:


  process_ner_button.click(
- #extract_entities_with_stanza,
+
  extract_entities_with_gliner,
- #inputs=[extracted_text],
+
  inputs=[extracted_text, default_entity_types, custom_entity_types],
  outputs=[ner_output]
  )
 