efficient-context / examples /dedup_eval.py

biswanath2.roul

Initial commit

e4d5155 10 days ago

4.79 kB

	#!/usr/bin/env python
	"""
	Advanced test for efficient-context's deduplication capabilities
	"""

	import time
	import logging

	# Configure the root logger for this script (INFO level, timestamped
	# records) and create the module-level logger used by the functions below.
	logging.basicConfig(
	    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
	    level=logging.INFO,
	)
	logger = logging.getLogger(__name__)

	# Import the library
	from efficient_context import ContextManager
	from efficient_context.compression import SemanticDeduplicator
	from efficient_context.chunking import SemanticChunker
	from efficient_context.retrieval import CPUOptimizedRetriever

	def create_repetitive_document(repeats: int = 3) -> str:
	    """Create a document with highly repetitive semantic content.

	    The document is built from 13 one-sentence paragraphs: five rephrasings
	    of a climate-change statement, five rephrasings of a renewable-energy
	    statement, and three unrelated sentences. That paragraph list is then
	    repeated ``repeats`` times so the text contains exact duplicates as well
	    as paraphrases.

	    Args:
	        repeats: How many times the 13-paragraph sequence is repeated.
	            Defaults to 3. A value of 0 or less yields an empty string.

	    Returns:
	        The paragraphs joined by blank lines (``"\\n\\n"``).
	    """
	    # Same idea (climate change) expressed five different ways.
	    climate_variations = [
	        "Climate change is a significant alteration in global weather patterns over extended periods.",
	        "Global warming refers to the long-term increase in Earth's average temperature.",
	        "The climate crisis is causing significant shifts in temperature and precipitation patterns worldwide.",
	        "Rising global temperatures lead to fundamental changes in our planet's climate systems.",
	        "Human-induced warming of the Earth's atmosphere is resulting in climate destabilization.",
	    ]

	    # Same idea (renewable energy) expressed five different ways.
	    energy_variations = [
	        "Renewable energy comes from natural sources that are constantly replenished.",
	        "Clean energy technologies harness power from sustainable, non-depleting resources.",
	        "Green power is generated from environmentally friendly, renewable sources.",
	        "Sustainable energy is derived from resources that don't run out over time.",
	        "Alternative energy refers to power sources that are alternatives to fossil fuels.",
	    ]

	    # Unrelated sentences that have no paraphrase elsewhere in the list.
	    unique_content = [
	        "Machine learning algorithms require significant computational resources to train effectively.",
	        "Biodiversity loss is accelerating at an unprecedented rate due to human activities.",
	        "Quantum computing may revolutionize cryptography and computational chemistry.",
	    ]

	    paragraphs = [*climate_variations, *energy_variations, *unique_content]

	    # Repeating the whole list makes the document longer and adds verbatim
	    # duplicates on top of the paraphrases.
	    return "\n\n".join(paragraphs * repeats)

	def run_deduplication_test() -> None:
	    """Run the deduplication pipeline at several thresholds and log metrics.

	    For each threshold a fresh ``ContextManager`` is built with a
	    ``SemanticDeduplicator`` at that threshold, a ``SemanticChunker`` and a
	    ``CPUOptimizedRetriever``. The repetitive test document is added, a
	    context is generated for a fixed query, and the following are logged:
	    word counts of the document and of the generated context, their ratio
	    (context words / document words), the number of chunks held by the
	    manager, and the elapsed time of the add and query steps.

	    Returns:
	        None. All results are reported through the module logger.
	    """
	    logger.info("Running semantic deduplication test")

	    # Create a highly repetitive document. It is identical for every
	    # threshold, so its word count is computed once up front.
	    document = create_repetitive_document()
	    original_size = len(document.split())
	    logger.info(f"Document size: {original_size} words")

	    # The query is the same for every run; only the threshold varies.
	    query = "Explain the relationship between climate change and renewable energy"
	    thresholds = [0.7, 0.8, 0.85, 0.9, 0.95]

	    for threshold in thresholds:
	        logger.info(f"\nTesting threshold: {threshold}")

	        # Build a new manager per threshold so runs do not share state.
	        cm = ContextManager(
	            compressor=SemanticDeduplicator(threshold=threshold),
	            chunker=SemanticChunker(chunk_size=200),
	            retriever=CPUOptimizedRetriever(embedding_model="lightweight"),
	        )

	        # perf_counter() is a monotonic, high-resolution clock intended for
	        # measuring short durations; time.time() can jump if the system
	        # clock is adjusted.
	        start = time.perf_counter()
	        cm.add_document(document)
	        processing_time = time.perf_counter() - start

	        start = time.perf_counter()
	        context = cm.generate_context(query)
	        query_time = time.perf_counter() - start

	        # Word-level size of the generated context relative to the input.
	        context_size = len(context.split())
	        compression_ratio = context_size / original_size

	        # Report results
	        logger.info(f"Results for threshold {threshold}:")
	        logger.info(f" - Original document: {original_size} words")
	        logger.info(f" - Context generated: {context_size} words")
	        logger.info(f" - Compression ratio: {compression_ratio:.2f}")
	        logger.info(f" - Chunks created: {len(cm.chunks)}")
	        logger.info(f" - Processing time: {processing_time:.4f} seconds")
	        logger.info(f" - Query time: {query_time:.4f} seconds")

	        # Only the first 150 characters are shown to keep the log readable.
	        logger.info(f" - Context preview: {context[:150]}...")

	if __name__ == "__main__":
	    # Script entry point: run the evaluation and report the outcome on stdout.
	    try:
	        print("Starting deduplication evaluation...")
	        run_deduplication_test()
	        print("Evaluation completed successfully")
	    except Exception as e:
	        print(f"Error during evaluation: {e}")
	        import traceback
	        traceback.print_exc()
	        # Exit with a non-zero status so shells and CI jobs can detect the
	        # failure; without this the script would report success (status 0).
	        raise SystemExit(1)