PuristanLabs1 committed
Commit 392490b · verified · 1 Parent(s): b097fde

Update app.py

Files changed (1):
  1. app.py (+154 −45)

app.py CHANGED
@@ -19,17 +19,18 @@ from wordcloud import WordCloud
 import matplotlib.pyplot as plt
 from PIL import Image
 import io
+import requests
 from gliner import GLiNER
 import tempfile
 
 nltk.download("punkt")
 nltk.download("punkt_tab")
 
-stanza.download("en")
-nlp = stanza.Pipeline("en", processors="tokenize,ner", use_gpu=False)
+# Automatically select device based on hardware availability
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+print(f"✅ Using {DEVICE.upper()} for TTS and Summarization")
 
-
-kokoro_tts = KPipeline(lang_code='a', device="cpu")
+kokoro_tts = KPipeline(lang_code='a', device=DEVICE)
 
 # Supported TTS Languages
 SUPPORTED_TTS_LANGUAGES = {
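Review note: the new `DEVICE` selection assumes `torch` is already imported further up in app.py (the import is not part of this diff). If it is not, a minimal guarded sketch like this keeps the CPU path working:

```python
# Minimal sketch, assuming only that PyTorch may be absent:
# fall back to CPU when torch cannot be imported.
try:
    import torch
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
except ImportError:
    DEVICE = "cpu"
print(f"✅ Using {DEVICE.upper()} for TTS and Summarization")
```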
@@ -48,28 +49,73 @@ AVAILABLE_VOICES = [
 
 # Load BART Large CNN Model for Summarization
 model_name = "facebook/bart-large-cnn"
-tokenizer = BartTokenizer.from_pretrained(model_name)
-model = BartForConditionalGeneration.from_pretrained(model_name)
+
+
+try:
+    tokenizer = BartTokenizer.from_pretrained(model_name, cache_dir=os.path.join(os.getcwd(), ".cache"))
+    model = BartForConditionalGeneration.from_pretrained(model_name, cache_dir=os.path.join(os.getcwd(), ".cache")).to(DEVICE)
+
+except Exception as e:
+    raise RuntimeError(f"Error loading BART model: {e}")
 
 # Initialize GLINER model
 gliner_model = GLiNER.from_pretrained("urchade/gliner_base")
 
-
+def is_pdf_url(url):
+    """Robustly detects PDF files via URL patterns and Content-Type headers."""
+    # URL Pattern Check
+    if url.endswith(".pdf") or "pdf" in url.lower():
+        return True
+
+    # Check Content-Type Header (for URLs without '.pdf')
+    try:
+        response = requests.head(url, timeout=10)
+        content_type = response.headers.get('Content-Type', '')
+        if 'application/pdf' in content_type:
+            return True
+    except requests.RequestException:
+        pass  # Ignore errors in Content-Type check
+
+    return False
+
 def fetch_and_display_content(url):
-    """Fetch and extract text from a given URL (HTML or PDF)."""
-    if url.endswith(".pdf") or "pdf" in url:
-        converter = MarkItDown()
-
-        text = converter.convert(url).text_content
+    """
+    Fetch and extract text from a given URL (HTML or PDF).
+    Extract metadata, clean text, and detect language.
+    """
+
+    downloaded = trafilatura.fetch_url(url)
+    if not downloaded:
+        raise ValueError(f"❌ Failed to fetch content from URL: {url}")
+
+    if is_pdf_url(url):
+        converter = MarkItDown(enable_plugins=False)
+        try:
+            text = converter.convert(url).text_content
+        except Exception as e:
+            raise RuntimeError(f"❌ Error converting PDF with MarkItDown: {e}")
     else:
-        downloaded = trafilatura.fetch_url(url)
-        text = extract(downloaded, output_format="markdown", with_metadata=True, include_tables=False, include_links=False, include_formatting=True, include_comments=False)  # without metadata extraction
+        text = extract(downloaded, output_format="markdown", with_metadata=True, include_tables=False, include_links=False, include_formatting=True, include_comments=False)
+
+    if not text or len(text.strip()) == 0:
+        raise ValueError("❌ No content found in the extracted data.")
+
     metadata, cleaned_text = extract_and_clean_text(text)
     detected_lang = detect_language(cleaned_text)
 
     # Add detected language to metadata
    metadata["Detected Language"] = detected_lang.upper()
-    return cleaned_text, metadata, detected_lang, gr.update(visible=True), gr.update(visible=True), gr.update(visible=True), gr.update(visible=True), gr.update(visible=True)
+
+    return (
+        cleaned_text,
+        metadata,
+        detected_lang,
+        gr.update(visible=True),  # Show Word Cloud
+        gr.update(visible=True),  # Show Process Audio Button
+        gr.update(visible=True),  # Show Process NER Button
+        gr.update(visible=True),  # Show Extracted Text
+        gr.update(visible=True)   # Show Metadata Output
+    )
 
 
 def extract_and_clean_text(data):
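Review note: probing with `requests.head` is a sensible first check, but some servers reject HEAD requests or omit the Content-Type header, in which case `is_pdf_url` silently falls through to `False`. Note also that `"pdf" in url.lower()` matches any URL merely containing the substring (e.g. an HTML page under /pdf-guide/), so the header check never runs for those. A hedged fallback sketch (the helper name `looks_like_pdf` is hypothetical, not part of this commit) that sniffs the `%PDF` magic bytes with a streamed GET:

```python
import requests

def looks_like_pdf(url, timeout=10):
    """Hypothetical fallback: fetch only the first bytes and check the %PDF magic number."""
    try:
        with requests.get(url, stream=True, timeout=timeout) as resp:
            first_bytes = next(resp.iter_content(chunk_size=5), b"")
            return first_bytes.startswith(b"%PDF")
    except requests.RequestException:
        return False
```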
@@ -171,13 +217,11 @@ def extract_entities_with_stanza(text, chunk_size=1000):
 
 def generate_wordcloud(text):
 
-    if not text:
-        return None
-
-
+    if not text.strip():
+        raise ValueError("❌ Text is empty or invalid for WordCloud generation.")
+
     wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
-
-
+
     plt.figure(figsize=(10, 5))
     plt.imshow(wordcloud, interpolation='bilinear')
     plt.axis('off')
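Review note: raising a bare `ValueError` inside a Gradio callback surfaces as a generic error in the UI. If a friendlier message is wanted, Gradio's own exception type can be raised instead; a minimal sketch, assuming `gradio` is imported as `gr`:

```python
import gradio as gr
from wordcloud import WordCloud

def generate_wordcloud(text):
    # Raise a user-facing Gradio error instead of a bare ValueError.
    if not text or not text.strip():
        raise gr.Error("Text is empty or invalid for WordCloud generation.")
    return WordCloud(width=800, height=400, background_color='white').generate(text)
```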
@@ -196,11 +240,8 @@ def generate_wordcloud(text):
 def generate_audio_kokoro(text, lang, selected_voice):
     """Generate speech using KokoroTTS for supported languages."""
     global kokoro_tts
-    if os.path.exists(f"audio_{lang}.wav"):
-        os.remove(f"audio_{lang}.wav")
 
     lang_code = SUPPORTED_TTS_LANGUAGES.get(lang, "a")  # Default to English
-    #generator = kokoro_tts(text, voice="bm_george", speed=1, split_pattern=r'\n+')
     generator = kokoro_tts(text, voice=selected_voice, speed=1, split_pattern=r'\n+')
 
 
@@ -242,26 +283,90 @@ def summarize_text(text, max_input_tokens=1024, max_output_tokens=200):
     inputs = tokenizer.encode("summarize: " + text, return_tensors="pt", max_length=max_input_tokens, truncation=True)
     summary_ids = model.generate(inputs, max_length=max_output_tokens, min_length=50, length_penalty=2.0, num_beams=4, early_stopping=True)
     return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
+
 def hierarchical_summarization(text):
-
+    """Performs hierarchical summarization by chunking content first."""
+    print(f"✅ Summarization will run on: {DEVICE.upper()}")
+
+    if len(text) > 10000:
+        print("⚠️ Warning: Large input text detected. Summarization may take longer than usual.")
+
     chunks = split_text_with_optimized_overlap(text)
+    # Tokenize the input cleaned text
+    encoded_inputs = tokenizer(
+        ["summarize: " + chunk for chunk in chunks],
+        return_tensors="pt",
+        padding=True,
+        truncation=True,
+        max_length=1024
+    ).to(DEVICE)
+
+    # Generate the summary
+    summary_ids = model.generate(
+        encoded_inputs["input_ids"],
+        max_length=200,
+        min_length=50,
+        length_penalty=2.0,
+        num_beams=4,
+        early_stopping=True
+    )
 
-    chunk_summaries = [summarize_text(chunk) for chunk in chunks]
+    # Decode the summary generated in the step above
+    chunk_summaries = [tokenizer.decode(ids, skip_special_tokens=True) for ids in summary_ids]
     final_summary = " ".join(chunk_summaries)
-    return final_summary
+    return final_summary
 
-def extract_entities_with_gliner(text, default_entity_types, custom_entity_types):
+def chunk_text_with_overlap(text, max_tokens=500, overlap_tokens=50):
+    """Splits text into overlapping chunks for large document processing."""
+    sentences = re.split(r'(?<=[.!?])\s+', text)  # Split on sentence boundaries
+    chunks = []
+    current_chunk = []
+    current_length = 0
+    previous_chunk_text = ""
+
+    for sentence in sentences:
+        token_length = len(sentence.split())
+        if current_length + token_length > max_tokens:
+            chunks.append(previous_chunk_text + " " + " ".join(current_chunk))
+            previous_chunk_text = " ".join(current_chunk)[-overlap_tokens:]
+            current_chunk = [sentence]
+            current_length = token_length
+        else:
+            current_chunk.append(sentence)
+            current_length += token_length
+
+    if current_chunk:
+        chunks.append(previous_chunk_text + " " + " ".join(current_chunk))
+
+    return chunks
+
+def extract_entities_with_gliner(text, default_entity_types, custom_entity_types, batch_size=4):
     """
-    Extract entities using GLINER with default and custom entity types.
+    Extract entities using GLINER with efficient chunking, sliding window, and batching.
     """
-
-    entity_types = default_entity_types.split(",") + [etype.strip() for etype in custom_entity_types.split(",") if custom_entity_types]
-
+    # Entity types preparation
+    entity_types = default_entity_types.split(",") + [
+        etype.strip() for etype in custom_entity_types.split(",") if custom_entity_types
+    ]
     entity_types = list(set([etype.strip() for etype in entity_types if etype.strip()]))
-
-    entities = gliner_model.predict_entities(text, entity_types)
-
-    formatted_entities = "\n".join([f"{i+1}: {ent['text']} --> {ent['label']}" for i, ent in enumerate(entities)])
+
+    # Chunk the text to avoid overflow
+    chunks = chunk_text_with_overlap(text)
+
+    # Process each chunk individually for improved stability
+    all_entities = []
+    for i, chunk in enumerate(chunks):
+        try:
+            entities = gliner_model.predict_entities(chunk, entity_types)
+            all_entities.extend(entities)
+        except Exception as e:
+            print(f"⚠️ Error processing chunk {i}: {e}")
+
+    # Format the results
+    formatted_entities = "\n".join(
+        [f"{i+1}: {ent['text']} --> {ent['label']}" for i, ent in enumerate(all_entities)]
+    )
+
     return formatted_entities
 
 ### 5️⃣ Main Processing Function
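Review note: two behaviors in this hunk are worth flagging. First, `model.generate` now receives every chunk in a single padded batch, which is fast but can exhaust memory on very long documents. Second, the unchanged `summarize_text` above still builds its input tensor on CPU, so if it is still called anywhere it would need a matching `.to(DEVICE)` now that the model may live on GPU. A small illustrative sketch (synthetic input only) of how the new `chunk_text_with_overlap` behaves:

```python
# Synthetic text: 50 copies of three short sentences.
sample = "First sentence. Second one! A third? " * 50
pieces = chunk_text_with_overlap(sample, max_tokens=30, overlap_tokens=10)
print(len(pieces))      # several overlapping chunks
print(pieces[1][:40])   # each chunk begins with a tail of the previous one
# Caveat: overlap_tokens slices the last N *characters* of the previous
# chunk (" ".join(...)[-overlap_tokens:]), not whole tokens, so the
# overlap is approximate.
```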
@@ -298,9 +403,7 @@ with gr.Blocks() as demo:
     summary_output = gr.Textbox(label="Summary", visible=True, interactive=False)
     full_audio_output = gr.Audio(label="Generated Audio", visible=True)
     ner_output = gr.Textbox(label="Extracted Entities", visible=True, interactive=False)
-
-
-
+
     default_entity_types = gr.Textbox(label="Default Entity Types", value="PERSON, Organization, location, Date, PRODUCT, EVENT", interactive=True)
     custom_entity_types = gr.Textbox(label="Custom Entity Types", placeholder="Enter additional entity types (comma-separated)", interactive=True)
 
@@ -320,14 +423,20 @@ with gr.Blocks() as demo:
         show_progress=True
     )
 
+    # Step 3: Summarization (Generate Summary Before Enabling TTS Button)
+    def generate_summary_and_enable_tts(text):
+        summary = hierarchical_summarization(text)
+        return summary, gr.update(visible=True)  # Enable the TTS button only after the summary is generated
+
+    # Summarization
     extracted_text.change(
-        hierarchical_summarization,
+        generate_summary_and_enable_tts,
         inputs=[extracted_text],
-        outputs=[summary_output],
+        outputs=[summary_output, process_audio_button],
         show_progress=True
     )
 
-
+    # Audio Generation
     process_audio_button.click(
         lambda text, summary, lang, voice, tts_choice: (
             None,  # Clear previous audio
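Review note: the new callback reveals the button with `gr.update(visible=True)`; if the intent is to keep the button visible but disabled until the summary exists, `gr.update(interactive=...)` is the more usual Gradio pattern. A minimal sketch, assuming the same component names:

```python
def generate_summary_and_enable_tts(text):
    summary = hierarchical_summarization(text)
    # Enable (rather than reveal) the TTS button once the summary exists.
    return summary, gr.update(interactive=True)
```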
@@ -340,13 +449,13 @@ with gr.Blocks() as demo:
         show_progress=True
     )
 
-
+    # NER Extraction
     process_ner_button.click(
 
         extract_entities_with_gliner,
 
-        inputs=[extracted_text, default_entity_types, custom_entity_types],
+        inputs=[extracted_text, default_entity_types, custom_entity_types],
         outputs=[ner_output]
     )
 
-demo.launch()
+demo.launch(share=True)
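Review note: `share=True` also exposes a temporary public Gradio link for the app. For a quick smoke test of the new helpers from a REPL, without going through the UI, something along these lines works (the URL is a hypothetical placeholder):

```python
url = "https://example.com/sample-article"  # hypothetical test URL
text, metadata, lang, *_ = fetch_and_display_content(url)
print(metadata)
print(hierarchical_summarization(text)[:300])
print(extract_entities_with_gliner(text, "PERSON, Organization, location", ""))
```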
 