efficient-context / examples /simple_dedup_benchmark.py
biswanath2.roul
Initial commit
e4d5155
#!/usr/bin/env python
"""
Simple benchmark for efficient-context's semantic deduplication.
"""
import logging
import time
import sys
# Configure logging first so that every later step, including a failed
# import, is reported with a timestamp and level.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
logger.info("Simple deduplication benchmark starting")

# Bring in the library under test; the benchmark cannot run without it,
# so an import failure is logged and the script exits with status 1.
try:
    from efficient_context import ContextManager
    from efficient_context.compression import SemanticDeduplicator
    from efficient_context.chunking import SemanticChunker
    from efficient_context.retrieval import CPUOptimizedRetriever
except ImportError as exc:
    logger.error(f"Failed to import efficient_context: {exc}")
    sys.exit(1)
else:
    logger.info("Successfully imported efficient_context")
def create_repetitive_document(repeats: int = 3) -> str:
    """Create a document with deliberate repetition.

    Two base paragraphs (one about climate, one about energy), each made of
    sentences that overlap heavily in meaning, are concatenated and the
    pair is then repeated ``repeats`` times.

    Args:
        repeats: Number of times the climate + energy pair is repeated.
            Defaults to 3, the value that was previously hard-coded.
            Zero or a negative value produces an empty string.

    Returns:
        The repeated text as a single string.
    """
    # Base paragraphs with repetitive content. The text is kept flush-left
    # so the literals carry no leading indentation into the document.
    climate_paragraph = """
Climate change is a significant alteration in weather patterns over extended periods.
Global warming is the long-term heating of Earth's climate system due to human activities.
Rising global temperatures are causing substantial changes in our environment and ecosystems.
The warming of the planet is leading to significant transformations in weather patterns.
Human activities are causing Earth's temperature to increase, resulting in climate changes.
"""
    energy_paragraph = """
Renewable energy comes from sources that are naturally replenishing but flow-limited.
Clean energy is derived from natural processes that are constantly replenished.
Sustainable power is generated from resources that won't deplete over time.
Green energy utilizes sources that don't produce pollution when generating power.
Alternative energy refers to sources that are an alternative to fossil fuel.
"""
    # Repeat the paragraph pair to create a more repetitive document
    return (climate_paragraph + energy_paragraph) * repeats
def main():
    """Run the benchmark.

    For each similarity threshold, a fresh ``ContextManager`` is built with
    a ``SemanticDeduplicator`` at that threshold. The repetitive test
    document is added, a context is generated for a fixed query, and the
    word counts, their ratio and both timings are logged.
    """
    # Create the test document
    document = create_repetitive_document()

    # The document and the query are the same for every threshold, so the
    # values derived from them are computed once, outside the loop.
    original_size = len(document.split())
    logger.info(f"Document size: {original_size} words")
    query = "Tell me about climate change and renewable energy"

    # Test with different thresholds
    thresholds = [0.7, 0.8, 0.85, 0.9, 0.95]
    for threshold in thresholds:
        logger.info(f"\nTesting with threshold: {threshold}")

        # Create a context manager with the current threshold
        context_manager = ContextManager(
            compressor=SemanticDeduplicator(threshold=threshold),
            chunker=SemanticChunker(chunk_size=100),
            retriever=CPUOptimizedRetriever(embedding_model="lightweight")
        )

        # Process the document. perf_counter() is used for the timings
        # because it is monotonic and high-resolution, unlike time.time(),
        # which can jump when the system clock is adjusted.
        start_time = time.perf_counter()
        context_manager.add_document(document)
        processing_time = time.perf_counter() - start_time

        # Generate context with a query
        start_time = time.perf_counter()
        context = context_manager.generate_context(query)
        query_time = time.perf_counter() - start_time

        # Report results. The ratio is context words over original words,
        # so a smaller number means a shorter generated context.
        context_size = len(context.split())
        compression_ratio = context_size / original_size if original_size > 0 else 1.0
        logger.info(f"Results for threshold {threshold}:")
        logger.info(f" - Original size: {original_size} words")
        logger.info(f" - Context size: {context_size} words")
        logger.info(f" - Compression ratio: {compression_ratio:.2f}")
        logger.info(f" - Processing time: {processing_time:.4f} seconds")
        logger.info(f" - Query time: {query_time:.4f} seconds")
# Run the benchmark only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()