#!/usr/bin/env python
"""
Simple benchmark for efficient-context's semantic deduplication.

Builds a deliberately repetitive document, feeds it to a ContextManager
configured with several deduplication thresholds, and logs word counts and
timings for each threshold.
"""

import logging
import time
import sys

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
)
logger = logging.getLogger(__name__)
logger.info("Simple deduplication benchmark starting")

# Import the library. The benchmark cannot do anything useful without it, so a
# failed import is logged and the process exits with a non-zero status.
try:
    from efficient_context import ContextManager
    from efficient_context.compression import SemanticDeduplicator
    from efficient_context.chunking import SemanticChunker
    from efficient_context.retrieval import CPUOptimizedRetriever
    logger.info("Successfully imported efficient_context")
except ImportError as e:
    logger.error(f"Failed to import efficient_context: {e}")
    sys.exit(1)


def create_repetitive_document() -> str:
    """Create a document with deliberate repetition.

    Returns:
        The climate paragraph followed by the energy paragraph, with that
        pair repeated three times. Each paragraph restates one idea in five
        differently worded sentences.
    """
    # Base paragraphs with repetitive content
    climate_paragraph = """
    Climate change is a significant alteration in weather patterns over extended periods.
    Global warming is the long-term heating of Earth's climate system due to human activities.
    Rising global temperatures are causing substantial changes in our environment and ecosystems.
    The warming of the planet is leading to significant transformations in weather patterns.
    Human activities are causing Earth's temperature to increase, resulting in climate changes.
    """

    energy_paragraph = """
    Renewable energy comes from sources that are naturally replenishing but flow-limited.
    Clean energy is derived from natural processes that are constantly replenished.
    Sustainable power is generated from resources that won't deplete over time.
    Green energy utilizes sources that don't produce pollution when generating power.
    Alternative energy refers to sources that are an alternative to fossil fuel.
    """

    # Repeat the paragraphs to create a more repetitive document
    document = (climate_paragraph + energy_paragraph) * 3
    return document


def main() -> None:
    """Run the benchmark.

    For each threshold a fresh ContextManager is built, the test document is
    added, and a context is generated for a fixed query. The elapsed time of
    both calls and the word-count ratio of context to document are logged.
    """
    # Create the test document
    document = create_repetitive_document()

    # The document never changes, so its word count is computed once and
    # reused for every threshold.
    original_size = len(document.split())
    logger.info(f"Document size: {original_size} words")

    # Test with different thresholds
    thresholds = [0.7, 0.8, 0.85, 0.9, 0.95]
    query = "Tell me about climate change and renewable energy"

    for threshold in thresholds:
        logger.info(f"\nTesting with threshold: {threshold}")

        # Create a context manager with the current threshold
        context_manager = ContextManager(
            compressor=SemanticDeduplicator(threshold=threshold),
            chunker=SemanticChunker(chunk_size=100),
            retriever=CPUOptimizedRetriever(embedding_model="lightweight"),
        )

        # Process the document. perf_counter() is used for intervals because
        # it is monotonic and high-resolution, unlike the wall clock.
        start_time = time.perf_counter()
        context_manager.add_document(document)
        processing_time = time.perf_counter() - start_time

        # Generate context with a query
        start_time = time.perf_counter()
        context = context_manager.generate_context(query)
        query_time = time.perf_counter() - start_time

        # Report results
        # NOTE(review): assumes generate_context() returns a str - confirm.
        context_size = len(context.split())
        # Context words divided by document words; falls back to 1.0 when the
        # document has no words, to avoid dividing by zero.
        compression_ratio = context_size / original_size if original_size > 0 else 1.0

        logger.info(f"Results for threshold {threshold}:")
        logger.info(f" - Original size: {original_size} words")
        logger.info(f" - Context size: {context_size} words")
        logger.info(f" - Compression ratio: {compression_ratio:.2f}")
        logger.info(f" - Processing time: {processing_time:.4f} seconds")
        logger.info(f" - Query time: {query_time:.4f} seconds")


if __name__ == "__main__":
    main()