efficient-context / examples /simple_dedup_benchmark.py

biswanath2.roul

Initial commit

e4d5155 10 days ago

3.81 kB

	#!/usr/bin/env python
	"""
	Simple benchmark for efficient-context's semantic deduplication.
	"""

	import logging
	import time
	import sys

	# Configure logging once for the whole script: every record is printed with
	# its timestamp, logger name and severity level at INFO and above.
	logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
	logger = logging.getLogger(__name__)
	logger.info("Simple deduplication benchmark starting")

	# Import the library under test. The try body holds only the imports so that
	# nothing but a missing/broken efficient_context installation can trigger the
	# ImportError handler; the success message lives in the else clause.
	try:
	    from efficient_context import ContextManager
	    from efficient_context.compression import SemanticDeduplicator
	    from efficient_context.chunking import SemanticChunker
	    from efficient_context.retrieval import CPUOptimizedRetriever
	except ImportError as e:
	    # Without the library there is nothing to benchmark: report and stop
	    # with a non-zero exit status.
	    logger.error(f"Failed to import efficient_context: {e}")
	    sys.exit(1)
	else:
	    logger.info("Successfully imported efficient_context")

	def create_repetitive_document(repetitions: int = 3) -> str:
	    """Create a document with deliberate repetition.

	    Two paragraphs (one about climate, one about energy) each restate the
	    same idea in five differently worded sentences. The pair is then
	    concatenated ``repetitions`` times, so the result contains both
	    near-duplicate sentences and exact duplicate paragraphs.

	    Args:
	        repetitions: How many times the climate+energy pair is repeated.
	            Defaults to 3, the value the benchmark has always used. Zero
	            or a negative number yields an empty string.

	    Returns:
	        The assembled document as a single string.
	    """
	    # Base paragraphs with repetitive content. The literal text (including
	    # its leading whitespace) is kept exactly as written.
	    climate_paragraph = """
	Climate change is a significant alteration in weather patterns over extended periods.
	Global warming is the long-term heating of Earth's climate system due to human activities.
	Rising global temperatures are causing substantial changes in our environment and ecosystems.
	The warming of the planet is leading to significant transformations in weather patterns.
	Human activities are causing Earth's temperature to increase, resulting in climate changes.
	"""

	    energy_paragraph = """
	Renewable energy comes from sources that are naturally replenishing but flow-limited.
	Clean energy is derived from natural processes that are constantly replenished.
	Sustainable power is generated from resources that won't deplete over time.
	Green energy utilizes sources that don't produce pollution when generating power.
	Alternative energy refers to sources that are an alternative to fossil fuel.
	"""

	    # Repeat the paragraph pair to create a more repetitive document.
	    return (climate_paragraph + energy_paragraph) * repetitions

	def main():
	    """Run the benchmark.

	    Builds the repetitive test document once, then for each threshold in a
	    fixed list creates a fresh ContextManager (SemanticDeduplicator with that
	    threshold, SemanticChunker with chunk_size=100, CPUOptimizedRetriever with
	    the "lightweight" embedding model), adds the document, generates a context
	    for a fixed query, and logs word counts, the context/original word ratio
	    and the elapsed time of both steps.

	    Returns:
	        None. All results are reported through the module logger.
	    """
	    # Create the test document; its word count does not depend on the
	    # threshold, so it is computed once instead of on every iteration.
	    document = create_repetitive_document()
	    original_size = len(document.split())
	    logger.info(f"Document size: {original_size} words")

	    # Test with different thresholds
	    thresholds = [0.7, 0.8, 0.85, 0.9, 0.95]
	    # The same query is used for every run so the results are comparable.
	    query = "Tell me about climate change and renewable energy"

	    for threshold in thresholds:
	        logger.info(f"\nTesting with threshold: {threshold}")

	        # Create a context manager with the current threshold
	        context_manager = ContextManager(
	            compressor=SemanticDeduplicator(threshold=threshold),
	            chunker=SemanticChunker(chunk_size=100),
	            retriever=CPUOptimizedRetriever(embedding_model="lightweight"),
	        )

	        # Process the document. perf_counter() is monotonic and
	        # high-resolution, so the measured interval cannot be distorted by
	        # system clock adjustments the way time.time() differences can.
	        start_time = time.perf_counter()
	        context_manager.add_document(document)
	        processing_time = time.perf_counter() - start_time

	        # Generate context with a query
	        start_time = time.perf_counter()
	        context = context_manager.generate_context(query)
	        query_time = time.perf_counter() - start_time

	        # Report results. The ratio is context words over original words;
	        # an empty document falls back to 1.0 to avoid dividing by zero.
	        # NOTE(review): assumes generate_context returns a str - confirm.
	        context_size = len(context.split())
	        compression_ratio = context_size / original_size if original_size > 0 else 1.0

	        logger.info(f"Results for threshold {threshold}:")
	        logger.info(f" - Original size: {original_size} words")
	        logger.info(f" - Context size: {context_size} words")
	        logger.info(f" - Compression ratio: {compression_ratio:.2f}")
	        logger.info(f" - Processing time: {processing_time:.4f} seconds")
	        logger.info(f" - Query time: {query_time:.4f} seconds")

	# Run the benchmark only when this file is executed as a script, not when it
	# is imported as a module.
	if __name__ == "__main__":
	    main()