efficient-context / examples /dedup_eval.py
biswanath2.roul
Initial commit
e4d5155
#!/usr/bin/env python
"""
Advanced test for efficient-context's deduplication capabilities
"""
import time
import logging
# Set up logging: configure the root logger at INFO level with a
# "timestamp - logger name - level - message" record format, then create
# the module-level logger used by the functions below.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
# Import the library
from efficient_context import ContextManager
from efficient_context.compression import SemanticDeduplicator
from efficient_context.chunking import SemanticChunker
from efficient_context.retrieval import CPUOptimizedRetriever
def create_repetitive_document(repeats: int = 3) -> str:
    """Create a document with highly repetitive semantic content.

    The document is built from thirteen one-sentence paragraphs: five
    paraphrases about climate change, five paraphrases about renewable
    energy, and three unrelated sentences. The whole set is then repeated,
    so exact duplicates appear alongside the paraphrases.

    Args:
        repeats: Number of times the paragraph set is repeated. Defaults
            to 3, matching the previously hard-coded value. Values below
            1 produce an empty string.

    Returns:
        A single string with the paragraphs separated by a blank line.
    """
    # Same meaning expressed differently: climate change
    climate_variations = [
        "Climate change is a significant alteration in global weather patterns over extended periods.",
        "Global warming refers to the long-term increase in Earth's average temperature.",
        "The climate crisis is causing significant shifts in temperature and precipitation patterns worldwide.",
        "Rising global temperatures lead to fundamental changes in our planet's climate systems.",
        "Human-induced warming of the Earth's atmosphere is resulting in climate destabilization.",
    ]

    # Same meaning expressed differently: renewable energy
    energy_variations = [
        "Renewable energy comes from natural sources that are constantly replenished.",
        "Clean energy technologies harness power from sustainable, non-depleting resources.",
        "Green power is generated from environmentally friendly, renewable sources.",
        "Sustainable energy is derived from resources that don't run out over time.",
        "Alternative energy refers to power sources that are alternatives to fossil fuels.",
    ]

    # Unrelated sentences that are not paraphrases of anything above
    unique_content = [
        "Machine learning algorithms require significant computational resources to train effectively.",
        "Biodiversity loss is accelerating at an unprecedented rate due to human activities.",
        "Quantum computing may revolutionize cryptography and computational chemistry.",
    ]

    paragraphs = [*climate_variations, *energy_variations, *unique_content]

    # Repeat the paragraph set to make the document longer and more repetitive
    return "\n\n".join(paragraphs * repeats)
def run_deduplication_test():
    """Run the semantic deduplication evaluation across several thresholds.

    For each threshold a fresh ContextManager is built with a
    SemanticDeduplicator (using that threshold), a SemanticChunker
    (chunk_size=200) and a CPUOptimizedRetriever ("lightweight" embedding
    model). The repetitive sample document is added, a context is generated
    for a fixed query, and the word counts, context/original size ratio,
    chunk count, timings and a context preview are logged at INFO level.

    Returns:
        None. All results are reported through the module logger.
    """
    logger.info("Running semantic deduplication test")

    # Create a highly repetitive document
    document = create_repetitive_document()
    # The document is the same for every threshold, so count its words once.
    original_size = len(document.split())
    logger.info(f"Document size: {original_size} words")

    # The query is also identical for every run.
    query = "Explain the relationship between climate change and renewable energy"

    # Test with different threshold values
    thresholds = [0.7, 0.8, 0.85, 0.9, 0.95]
    for threshold in thresholds:
        logger.info(f"\nTesting threshold: {threshold}")

        # Create context manager with current threshold
        cm = ContextManager(
            compressor=SemanticDeduplicator(threshold=threshold),
            chunker=SemanticChunker(chunk_size=200),
            retriever=CPUOptimizedRetriever(embedding_model="lightweight")
        )

        # Add document and measure processing time. perf_counter() is a
        # monotonic, high-resolution clock intended for measuring intervals;
        # time.time() can jump if the system clock is adjusted.
        start = time.perf_counter()
        cm.add_document(document)
        processing_time = time.perf_counter() - start

        # Generate context for a relevant query
        start = time.perf_counter()
        context = cm.generate_context(query)
        query_time = time.perf_counter() - start

        # Calculate metrics (ratio of generated context size to original size)
        context_size = len(context.split())
        compression_ratio = context_size / original_size

        # Report results
        logger.info(f"Results for threshold {threshold}:")
        logger.info(f" - Original document: {original_size} words")
        logger.info(f" - Context generated: {context_size} words")
        logger.info(f" - Compression ratio: {compression_ratio:.2f}")
        logger.info(f" - Chunks created: {len(cm.chunks)}")
        logger.info(f" - Processing time: {processing_time:.4f} seconds")
        logger.info(f" - Query time: {query_time:.4f} seconds")

        # Print a preview of the context
        logger.info(f" - Context preview: {context[:150]}...")
if __name__ == "__main__":
    # Script entry point: run the evaluation and report the outcome on stdout.
    print("Starting deduplication evaluation...")
    try:
        run_deduplication_test()
    except Exception as e:
        # Top-level boundary: show the error and full traceback, then exit
        # with a non-zero status so shells and CI can detect the failure
        # (previously the script exited with status 0 even after an error).
        print(f"Error during evaluation: {e}")
        import traceback
        traceback.print_exc()
        raise SystemExit(1)
    else:
        print("Evaluation completed successfully")