# Doc_Summarizer / app / summarizer.py
# Author: mithra99 — "Add RAG Document Summarizer application" (commit d82600f)
from typing import List, Dict, Any
import asyncio
import re
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
import requests
import os
# Remove top-level import of transformers and torch
# from transformers import AutoTokenizer, AutoModelForCausalLM
# import torch
def clean_markdown_formatting(text: str) -> str:
    """Strip common Markdown syntax from ``text`` and return plain text.

    Removes headers, bold/italic markers, inline code, links, strikethrough,
    blockquote prefixes and horizontal rules, then normalizes whitespace.

    Args:
        text: Text that may contain markdown formatting.

    Returns:
        Cleaned plain text without markdown; falsy input is returned as-is.
    """
    if not text:
        return text
    # (pattern, replacement, flags) applied strictly in order — bold (**/__)
    # must be stripped before italic (*/_) so the two-char markers are not
    # half-consumed by the single-char patterns.
    rules = (
        (r'^#{1,6}\s+', '', re.MULTILINE),       # headers: # .. ######
        (r'\*\*(.*?)\*\*', r'\1', 0),            # bold **text**
        (r'__(.*?)__', r'\1', 0),                # bold __text__
        (r'\*(.*?)\*', r'\1', 0),                # italic *text*
        (r'_(.*?)_', r'\1', 0),                  # italic _text_
        (r'`(.*?)`', r'\1', 0),                  # inline code `text`
        (r'\[([^\]]+)\]\([^)]+\)', r'\1', 0),    # links [text](url) -> text
        (r'\[([^\]]+)\]', r'\1', 0),             # bare brackets [text] -> text
        (r'~~(.*?)~~', r'\1', 0),                # strikethrough ~~text~~
        (r'^>\s+', '', re.MULTILINE),            # blockquote prefix "> "
        (r'^[-*_]{3,}$', '', re.MULTILINE),      # horizontal rules ---/***/___
        (r'\n\s*\n\s*\n', '\n\n', 0),            # collapse 3+ blank lines
        (r' +', ' ', 0),                         # collapse runs of spaces
        (r'\n +', '\n', 0),                      # drop indent after newlines
    )
    for pattern, replacement, flags in rules:
        text = re.sub(pattern, replacement, text, flags=flags)
    return text.strip()
# Mistral API configuration, resolved once at import time. A missing key is
# tolerated so the module stays importable; call_mistral_api will then fail
# per-request and return an error string instead of a summary.
MISTRAL_API_KEY = os.getenv("MISTRAL_API_KEY", "")
if not MISTRAL_API_KEY:
    print("[WARNING] MISTRAL_API_KEY environment variable is not set. API calls will fail.")
# Chat-completions endpoint used by DocumentSummarizer.call_mistral_api.
MISTRAL_API_URL = "https://api.mistral.ai/v1/chat/completions"
class DocumentSummarizer:
    """Summarize documents of arbitrary length via the Mistral chat API.

    Documents are split into overlapping character chunks. Small documents
    (<=15 estimated pages) get a chunk-wise summarize-and-combine pass;
    large documents get an extra hierarchical (chunk -> section -> final)
    pass so each individual API call stays within context limits.
    """

    def __init__(self, chunk_size=1200, chunk_overlap=200, llm_model=None):
        """
        Initialize the document summarizer.

        Args:
            chunk_size: Character size of text chunks for processing
            chunk_overlap: Character overlap between consecutive chunks
            llm_model: Optional local LLM instance. Accepted for backward
                compatibility with existing callers (e.g. ``summarize_text``
                forwarded this keyword, which previously raised TypeError)
                but unused — summaries are produced via the Mistral API.
        """
        self.llm_model = llm_model  # kept for interface compatibility; not used
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            separators=["\n\n", "\n", ". ", "! ", "? ", " ", ""]
        )

    def classify_document_size(self, text: str) -> Dict[str, Any]:
        """
        Classify a document as small or large based on its word count.

        Args:
            text: Document text content

        Returns:
            Dict with keys ``is_large``, ``word_count``, ``page_estimate``
            and ``classification``.
        """
        words = len(text.split())
        pages_estimate = words // 500  # rough heuristic: ~500 words per page
        is_large = pages_estimate > 15
        return {
            "is_large": is_large,
            "word_count": words,
            "page_estimate": pages_estimate,
            "classification": "Large Document" if is_large else "Small Document"
        }

    def create_chunks(self, text: str) -> "List[Document]":
        """
        Split ``text`` into overlapping chunks.

        Args:
            text: Document text content

        Returns:
            List of langchain ``Document`` chunks, each carrying its
            ``chunk_id`` in metadata.
        """
        chunks = self.text_splitter.split_text(text)
        return [Document(page_content=chunk, metadata={"chunk_id": i})
                for i, chunk in enumerate(chunks)]

    def _truncate_text_for_model(self, text: str, max_tokens: int = 4000) -> str:
        """
        Truncate ``text`` to fit within model context limits.

        Uses the crude ~4-characters-per-token heuristic; no tokenizer is
        loaded, so this is an approximate safeguard, not an exact cut.

        Args:
            text: Text to truncate
            max_tokens: Approximate maximum number of tokens allowed

        Returns:
            Truncated text
        """
        return text[:max_tokens * 4]

    def call_mistral_api(self, prompt: str) -> str:
        """
        Send ``prompt`` to the Mistral chat-completions endpoint.

        Returns:
            The model's reply (stripped), or a bracketed error string when
            the request fails or the payload is malformed — callers then
            degrade gracefully instead of raising.
        """
        headers = {
            "Authorization": f"Bearer {MISTRAL_API_KEY}",
            "Content-Type": "application/json"
        }
        data = {
            "model": "mistral-medium",
            "messages": [
                {"role": "user", "content": prompt}
            ],
            "max_tokens": 500,
            "temperature": 0.3,
            "top_p": 0.8
        }
        try:
            response = requests.post(MISTRAL_API_URL, headers=headers, json=data, timeout=60)
            response.raise_for_status()
            result = response.json()
            return result["choices"][0]["message"]["content"].strip()
        # Narrowed from bare ``Exception``: RequestException covers transport
        # and HTTP errors, ValueError covers bad JSON, KeyError/IndexError an
        # unexpected response shape.
        except (requests.RequestException, ValueError, KeyError, IndexError) as e:
            print(f"[WARNING] Error calling Mistral API: {e}")
            return "[Error: Unable to generate summary with Mistral AI API.]"

    async def generate_chunk_summary(self, chunk: "Document") -> str:
        """
        Generate a summary for a single chunk via the Mistral API.

        Args:
            chunk: Document chunk to summarize

        Returns:
            Plain-text summary of the chunk (markdown stripped)
        """
        truncated_content = self._truncate_text_for_model(chunk.page_content, max_tokens=3000)
        prompt = f"""You are an expert document summarizer. Create comprehensive summaries that capture key information from text chunks. Provide summaries in plain text format without markdown formatting.\n\nText to summarize:\n{truncated_content}\n\nSummary:"""
        response = self.call_mistral_api(prompt)
        return clean_markdown_formatting(response)

    def _simulate_chunk_summary(self, text: str) -> str:
        """
        Simulate chunk summary generation (fallback when no LLM is available).

        Args:
            text: Text to summarize

        Returns:
            Heuristic extractive summary (first/middle/last sentences)
        """
        words = text.split()
        # Very short text is returned unchanged.
        if len(words) < 30:
            return text
        sentences = text.split('. ')
        if len(sentences) <= 2:
            return text
        if len(sentences) > 4:
            # Take first, middle and last sentence for intro/key point/conclusion.
            summary_sentences = [sentences[0]]
            middle_idx = len(sentences) // 2
            summary_sentences.append(sentences[middle_idx])
            summary_sentences.append(sentences[-1])
        else:
            # For shorter text, take the first 2 sentences.
            summary_sentences = sentences[:2]
        summary = '. '.join(summary_sentences)
        return summary + ('.' if not summary.endswith('.') else '')

    async def summarize_small_document(self, chunks: "List[Document]") -> str:
        """
        Summarize small documents (<=15 pages): summarize every chunk, then
        condense the concatenated chunk summaries into one final summary.

        Args:
            chunks: List of document chunks

        Returns:
            Combined document summary
        """
        print(f"Processing small document with {len(chunks)} chunks...")
        chunk_summaries = []
        for i, chunk in enumerate(chunks):
            print(f"Summarizing chunk {i+1}/{len(chunks)}...")
            summary = await self.generate_chunk_summary(chunk)
            chunk_summaries.append(summary)
        combined_summary = " ".join(chunk_summaries)
        final_summary = await self.generate_final_summary(combined_summary, "small")
        return final_summary

    async def summarize_large_document(self, chunks: "List[Document]") -> str:
        """
        Summarize large documents (>15 pages) hierarchically:
        chunk summaries -> (optional) section grouping -> section summaries
        -> final summary.

        Args:
            chunks: List of document chunks

        Returns:
            Hierarchical document summary
        """
        print(f"Processing large document with {len(chunks)} chunks using hierarchical summarization...")
        # Step 1: chunk-level summaries.
        chunk_summaries = []
        for i, chunk in enumerate(chunks):
            print(f"Generating chunk summary {i+1}/{len(chunks)}...")
            summary = await self.generate_chunk_summary(chunk)
            chunk_summaries.append(summary)
        # Step 2: group into sections only for very large documents.
        if len(chunk_summaries) > 50:
            section_summaries = await self._create_section_summaries(chunk_summaries)
        else:
            section_summaries = chunk_summaries
        # Step 3: section-level summaries (a "section" may be a single string
        # or a list of chunk summaries, depending on step 2).
        section_level_summaries = []
        for i, section in enumerate(section_summaries):
            print(f"Generating section summary {i+1}/{len(section_summaries)}...")
            if isinstance(section, list):
                combined_section = " ".join(section)
            else:
                combined_section = section
            section_summary = await self.generate_chunk_summary(
                Document(page_content=combined_section, metadata={"section_id": i})
            )
            section_level_summaries.append(section_summary)
        # Step 4: final hierarchical summary.
        final_combined = " ".join(section_level_summaries)
        final_summary = await self.generate_final_summary(final_combined, "large")
        return final_summary

    async def _create_section_summaries(self, chunk_summaries: List[str]) -> List[List[str]]:
        """
        Group chunk summaries into ~10 sections for very large documents.

        Args:
            chunk_summaries: List of chunk summaries

        Returns:
            List of sections, each a list of chunk summaries
        """
        section_size = max(10, len(chunk_summaries) // 10)  # aim for ~10 sections
        sections = []
        for i in range(0, len(chunk_summaries), section_size):
            section = chunk_summaries[i:i + section_size]
            sections.append(section)
        return sections

    async def generate_final_summary(self, combined_text: str, doc_type: str) -> str:
        """
        Generate the final summary from combined text via the Mistral API.

        Args:
            combined_text: Combined text to summarize
            doc_type: Type of document ("small"/"large"); currently unused by
                the prompt but kept for interface stability

        Returns:
            Final document summary (markdown stripped)
        """
        prompt = f"""You are an expert document summarizer. Create a final summary for the following combined text. Provide a comprehensive, plain text summary.\n\nText:\n{combined_text}\n\nFinal Summary:"""
        response = self.call_mistral_api(prompt)
        return clean_markdown_formatting(response)

    def _simulate_final_summary(self, combined_text: str, doc_type: str) -> str:
        """
        Simulate final summary generation (fallback when no LLM is available).

        Args:
            combined_text: Combined text to summarize
            doc_type: Type of document ("small"/"large")

        Returns:
            Heuristic extractive final summary
        """
        sentences = combined_text.split('. ')
        if len(sentences) <= 3:
            return combined_text
        if doc_type == "small":
            if len(sentences) <= 5:
                summary_sentences = sentences
            else:
                # Introduction, key point, and conclusion for small docs.
                summary_sentences = [sentences[0]]
                middle_idx = len(sentences) // 2
                summary_sentences.append(sentences[middle_idx])
                summary_sentences.append(sentences[-1])
        else:
            if len(sentences) <= 6:
                summary_sentences = sentences
            else:
                # Introduction, two key points from different quarters, conclusion.
                summary_sentences = [sentences[0]]
                quarter_idx = len(sentences) // 4
                three_quarter_idx = (3 * len(sentences)) // 4
                summary_sentences.append(sentences[quarter_idx])
                summary_sentences.append(sentences[three_quarter_idx])
                summary_sentences.append(sentences[-1])
        summary = '. '.join(summary_sentences)
        return summary + ('.' if not summary.endswith('.') else '')

    async def summarize_document(self, text: str) -> Dict[str, Any]:
        """
        Summarize a document end-to-end: classify, chunk, summarize.

        Args:
            text: Document text content

        Returns:
            Dict with ``summary``, ``classification``, ``word_count``,
            ``page_estimate``, ``chunk_count`` and ``processing_method``.
        """
        classification = self.classify_document_size(text)
        chunks = self.create_chunks(text)
        if classification["is_large"]:
            summary = await self.summarize_large_document(chunks)
            processing_method = "Hierarchical Summarization"
        else:
            summary = await self.summarize_small_document(chunks)
            processing_method = "Chunk-wise Summarization"
        return {
            "summary": summary,
            "classification": classification["classification"],
            "word_count": classification["word_count"],
            "page_estimate": classification["page_estimate"],
            "chunk_count": len(chunks),
            "processing_method": processing_method
        }
async def summarize_text(text: str, llm_model=None) -> Dict[str, Any]:
    """
    Convenience function to summarize text.

    Args:
        text: Text to summarize
        llm_model: Optional LLM model instance — accepted for backward
            compatibility but unused; summarization goes through the
            Mistral API.

    Returns:
        Dict with summary results (summary, classification, word_count,
        page_estimate, chunk_count, processing_method)
    """
    # Bug fix: ``DocumentSummarizer.__init__`` takes no ``llm_model``
    # keyword, so forwarding it raised TypeError on every call.
    summarizer = DocumentSummarizer()
    return await summarizer.summarize_document(text)