# Source: Hugging Face Space "mithra99/Doc_Summarizer" (revision d82600f, file size 15,070 bytes).
# Space status shown on the page: Runtime error.

from typing import List, Dict, Any
import asyncio
import re
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
import requests
import os

# The top-level imports of transformers and torch have been removed; the
# original lines are kept below, commented out, for reference.
# from transformers import AutoTokenizer, AutoModelForCausalLM
# import torch

def clean_markdown_formatting(text: str) -> str:
    """
    Clean markdown formatting from text and convert to plain text

    Handles headers, bold/italic emphasis, inline code, links, bracketed
    text, strikethrough, blockquotes and horizontal rules, then normalizes
    whitespace.

    Args:
        text: Text that may contain markdown formatting

    Returns:
        Cleaned plain text without markdown. Falsy input (empty string or
        None) is returned unchanged.
    """
    if not text:
        return text

    # Remove horizontal rules (---, ***, ___) BEFORE the emphasis rules:
    # otherwise the italic patterns consume two of the three markers
    # (e.g. "***" -> "*") and the rule line is never recognized.
    text = re.sub(r'^[-*_]{3,}$', '', text, flags=re.MULTILINE)

    # Remove markdown headers (# ## ### etc.)
    text = re.sub(r'^#{1,6}\s+', '', text, flags=re.MULTILINE)

    # Remove bold formatting (**text** or __text__)
    text = re.sub(r'\*\*(.*?)\*\*', r'\1', text)
    text = re.sub(r'__(.*?)__', r'\1', text)

    # Remove italic formatting (*text* or _text_)
    text = re.sub(r'\*(.*?)\*', r'\1', text)
    # Underscore emphasis must not start or end inside a word, so that
    # identifiers such as my_var_name keep their underscores.
    text = re.sub(r'(?<!\w)_(.+?)_(?!\w)', r'\1', text)

    # Remove code formatting (`text`)
    text = re.sub(r'`(.*?)`', r'\1', text)

    # Remove links [text](url) -> text
    text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', text)

    # Remove remaining square brackets [text] -> text
    text = re.sub(r'\[([^\]]+)\]', r'\1', text)

    # Remove strikethrough ~~text~~
    text = re.sub(r'~~(.*?)~~', r'\1', text)

    # Remove blockquotes (> text)
    text = re.sub(r'^>\s+', '', text, flags=re.MULTILINE)

    # Clean up extra whitespace
    text = re.sub(r'\n\s*\n\s*\n', '\n\n', text)  # Collapse 3+ line breaks to one blank line
    text = re.sub(r' +', ' ', text)  # Collapse runs of spaces
    text = re.sub(r'\n +', '\n', text)  # Drop leading spaces after line breaks

    return text.strip()

# Chat-completions endpoint that every summarization request is POSTed to.
MISTRAL_API_URL = "https://api.mistral.ai/v1/chat/completions"

# API key sent as the Bearer token; read once from the environment at import
# time and defaulting to an empty string when the variable is absent.
MISTRAL_API_KEY = os.environ.get("MISTRAL_API_KEY", "")
if MISTRAL_API_KEY == "":
    # Only warn here: the module still imports without a key.
    print("[WARNING] MISTRAL_API_KEY environment variable is not set. API calls will fail.")

class DocumentSummarizer:
    """
    Summarize documents by splitting them into chunks and sending each piece
    to the Mistral chat-completions API.

    Documents estimated at more than 15 pages (500 words per page) are
    processed with hierarchical summarization; shorter documents are
    summarized chunk by chunk and then condensed into one final summary.
    """

    def __init__(self, chunk_size=1200, chunk_overlap=200, llm_model=None):
        """
        Initialize the document summarizer

        Args:
            chunk_size: Maximum size of each text chunk, measured with len()
            chunk_overlap: Overlap between consecutive chunks
            llm_model: Optional model instance. Accepted so that callers which
                pass one keep working; it is stored but not used, because
                summaries are generated through the Mistral HTTP API.
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.llm_model = llm_model
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            separators=["\n\n", "\n", ". ", "! ", "? ", " ", ""]
        )

    def classify_document_size(self, text: str) -> Dict[str, Any]:
        """
        Classify document as small or large based on content length

        Args:
            text: Document text content

        Returns:
            Dict with the keys "is_large", "word_count", "page_estimate"
            and "classification"
        """
        words = len(text.split())
        pages_estimate = words // 500  # Rough estimate: 500 words per page
        is_large = pages_estimate > 15

        return {
            "is_large": is_large,
            "word_count": words,
            "page_estimate": pages_estimate,
            "classification": "Large Document" if is_large else "Small Document"
        }

    def create_chunks(self, text: str) -> List["Document"]:
        """
        Create text chunks using RecursiveCharacterTextSplitter

        Args:
            text: Document text content

        Returns:
            List of Document chunks, each tagged with its zero-based
            "chunk_id" in the metadata
        """
        chunks = self.text_splitter.split_text(text)
        return [Document(page_content=chunk, metadata={"chunk_id": i})
                for i, chunk in enumerate(chunks)]

    def _truncate_text_for_model(self, text: str, max_tokens: int = 4000) -> str:
        """
        Truncate text to fit within model context limits

        No tokenizer is involved: the text is cut to max_tokens * 4
        characters (presumably a rough 4-characters-per-token estimate).

        Args:
            text: Text to truncate
            max_tokens: Token budget used to derive the character limit

        Returns:
            Truncated text
        """
        return text[:max_tokens * 4]

    def call_mistral_api(self, prompt: str) -> str:
        """
        Send a single-message chat request to the Mistral API (blocking)

        Args:
            prompt: Text sent as the only user message

        Returns:
            Stripped content of the first returned choice, or a fixed
            "[Error: ...]" placeholder string when the request or the
            response parsing fails (failures are logged, never raised)
        """
        headers = {
            "Authorization": f"Bearer {MISTRAL_API_KEY}",
            "Content-Type": "application/json"
        }
        data = {
            "model": "mistral-medium",
            "messages": [
                {"role": "user", "content": prompt}
            ],
            "max_tokens": 500,
            "temperature": 0.3,
            "top_p": 0.8
        }
        try:
            response = requests.post(MISTRAL_API_URL, headers=headers, json=data, timeout=60)
            response.raise_for_status()
            result = response.json()
            return result["choices"][0]["message"]["content"].strip()
        except Exception as e:
            # Deliberately best-effort: any network, HTTP or payload-shape
            # problem is reported and replaced by a placeholder summary.
            print(f"[WARNING] Error calling Mistral API: {e}")
            return "[Error: Unable to generate summary with Mistral AI API.]"

    async def _call_mistral_api_async(self, prompt: str) -> str:
        """
        Run the blocking call_mistral_api in the default executor so the
        event loop is not stalled while the HTTP request is in flight.
        """
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, self.call_mistral_api, prompt)

    async def generate_chunk_summary(self, chunk: "Document") -> str:
        """
        Generate summary for a single chunk using the Mistral API

        Args:
            chunk: Document chunk to summarize

        Returns:
            Summary text for the chunk with markdown formatting removed
        """
        truncated_content = self._truncate_text_for_model(chunk.page_content, max_tokens=3000)
        prompt = f"""You are an expert document summarizer. Create comprehensive summaries that capture key information from text chunks. Provide summaries in plain text format without markdown formatting.\n\nText to summarize:\n{truncated_content}\n\nSummary:"""
        response = await self._call_mistral_api_async(prompt)
        return clean_markdown_formatting(response)

    def _simulate_chunk_summary(self, text: str) -> str:
        """
        Simulate chunk summary generation (fallback when LLM not available)

        Texts with fewer than 30 words or at most 2 sentences are returned
        unchanged. Otherwise the result is built from the first, middle and
        last sentence (more than 4 sentences) or the first two sentences.

        Args:
            text: Text to summarize

        Returns:
            Simulated summary
        """
        if len(text.split()) < 30:
            return text

        # Naive sentence split on ". "
        sentences = text.split('. ')
        if len(sentences) <= 2:
            return text

        if len(sentences) > 4:
            # Introduction, a key point from the middle, and the conclusion
            summary_sentences = [
                sentences[0],
                sentences[len(sentences) // 2],
                sentences[-1],
            ]
        else:
            # For shorter text, take first 2 sentences
            summary_sentences = sentences[:2]

        summary = '. '.join(summary_sentences)
        return summary + ('.' if not summary.endswith('.') else '')

    async def summarize_small_document(self, chunks: List["Document"]) -> str:
        """
        Summarize small documents (≤15 pages) by summarizing all chunks and combining

        Args:
            chunks: List of document chunks

        Returns:
            Combined document summary
        """
        print(f"Processing small document with {len(chunks)} chunks...")

        # Generate summaries for all chunks, one request at a time
        chunk_summaries = []
        for i, chunk in enumerate(chunks):
            print(f"Summarizing chunk {i+1}/{len(chunks)}...")
            chunk_summaries.append(await self.generate_chunk_summary(chunk))

        # Condense the concatenated chunk summaries into one final summary
        combined_summary = " ".join(chunk_summaries)
        return await self.generate_final_summary(combined_summary, "small")

    async def summarize_large_document(self, chunks: List["Document"]) -> str:
        """
        Summarize large documents (>15 pages) using hierarchical summarization

        Args:
            chunks: List of document chunks

        Returns:
            Hierarchical document summary
        """
        print(f"Processing large document with {len(chunks)} chunks using hierarchical summarization...")

        # Step 1: Generate chunk-level summaries
        chunk_summaries = []
        for i, chunk in enumerate(chunks):
            print(f"Generating chunk summary {i+1}/{len(chunks)}...")
            chunk_summaries.append(await self.generate_chunk_summary(chunk))

        # Step 2: Group summaries into sections (for very large documents)
        if len(chunk_summaries) > 50:
            section_summaries = await self._create_section_summaries(chunk_summaries)
        else:
            # NOTE(review): without grouping, step 3 summarizes every chunk
            # summary a second time, one request each. Kept as-is to preserve
            # the existing output; confirm whether this is intended.
            section_summaries = chunk_summaries

        # Step 3: Generate section-level summaries
        section_level_summaries = []
        for i, section in enumerate(section_summaries):
            print(f"Generating section summary {i+1}/{len(section_summaries)}...")
            # A section is a list of summaries when grouped, a plain string otherwise
            combined_section = " ".join(section) if isinstance(section, list) else section
            section_summary = await self.generate_chunk_summary(
                Document(page_content=combined_section, metadata={"section_id": i})
            )
            section_level_summaries.append(section_summary)

        # Step 4: Generate final hierarchical summary
        final_combined = " ".join(section_level_summaries)
        return await self.generate_final_summary(final_combined, "large")

    async def _create_section_summaries(self, chunk_summaries: List[str]) -> List[List[str]]:
        """
        Group chunk summaries into sections for very large documents

        Args:
            chunk_summaries: List of chunk summaries

        Returns:
            List of sections (each section is a list of consecutive chunk summaries)
        """
        # At least 10 summaries per section, which yields roughly 10 sections
        section_size = max(10, len(chunk_summaries) // 10)
        return [
            chunk_summaries[start:start + section_size]
            for start in range(0, len(chunk_summaries), section_size)
        ]

    async def generate_final_summary(self, combined_text: str, doc_type: str) -> str:
        """
        Generate final summary from combined text using the Mistral API

        Args:
            combined_text: Combined text to summarize (sent without truncation)
            doc_type: Type of document (small/large); currently not used in the prompt

        Returns:
            Final document summary with markdown formatting removed
        """
        prompt = f"""You are an expert document summarizer. Create a final summary for the following combined text. Provide a comprehensive, plain text summary.\n\nText:\n{combined_text}\n\nFinal Summary:"""
        response = await self._call_mistral_api_async(prompt)
        return clean_markdown_formatting(response)

    def _simulate_final_summary(self, combined_text: str, doc_type: str) -> str:
        """
        Simulate final summary generation (fallback when LLM not available)

        Args:
            combined_text: Combined text to summarize
            doc_type: Type of document; "small" selects three sentences,
                any other value selects four

        Returns:
            Simulated final summary (the input unchanged when it has at most 3 sentences)
        """
        # Naive sentence split on ". "
        sentences = combined_text.split('. ')
        count = len(sentences)

        if count <= 3:
            return combined_text

        if doc_type == "small":
            if count <= 5:
                summary_sentences = sentences
            else:
                # Introduction, key point from the middle, conclusion
                summary_sentences = [sentences[0], sentences[count // 2], sentences[-1]]
        else:
            if count <= 6:
                summary_sentences = sentences
            else:
                # Introduction, key points at the 1/4 and 3/4 marks, conclusion
                summary_sentences = [
                    sentences[0],
                    sentences[count // 4],
                    sentences[(3 * count) // 4],
                    sentences[-1],
                ]

        summary = '. '.join(summary_sentences)
        return summary + ('.' if not summary.endswith('.') else '')

    async def summarize_document(self, text: str) -> Dict[str, Any]:
        """
        Main method to summarize a document

        Args:
            text: Document text content

        Returns:
            Dict with the keys "summary", "classification", "word_count",
            "page_estimate", "chunk_count" and "processing_method"
        """
        # Classify document size
        classification = self.classify_document_size(text)

        # Create chunks
        chunks = self.create_chunks(text)

        # Generate summary based on document size
        if not chunks:
            # Nothing to summarize: skip the API instead of asking the model
            # to summarize empty text.
            summary = ""
            processing_method = "Chunk-wise Summarization"
        elif classification["is_large"]:
            summary = await self.summarize_large_document(chunks)
            processing_method = "Hierarchical Summarization"
        else:
            summary = await self.summarize_small_document(chunks)
            processing_method = "Chunk-wise Summarization"

        return {
            "summary": summary,
            "classification": classification["classification"],
            "word_count": classification["word_count"],
            "page_estimate": classification["page_estimate"],
            "chunk_count": len(chunks),
            "processing_method": processing_method
        }

async def summarize_text(text: str, llm_model=None) -> Dict[str, Any]:
    """
    Convenience function to summarize text

    Args:
        text: Text to summarize
        llm_model: Optional LLM model instance. Kept in the signature for
            existing callers; it is not used by this function.

    Returns:
        Dict with summary results
    """
    # llm_model is deliberately not forwarded to the constructor: summaries
    # are produced through the Mistral HTTP API, so no local model is needed.
    summarizer = DocumentSummarizer()
    return await summarizer.summarize_document(text)