|
|
|
""" |
|
Simple benchmark for efficient-context's semantic deduplication. |
|
""" |
|
|
|
import logging |
|
import time |
|
import sys |
|
|
|
|
|
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') |
|
logger = logging.getLogger(__name__) |
|
logger.info("Simple deduplication benchmark starting") |
|
|
|
|
|
try: |
|
from efficient_context import ContextManager |
|
from efficient_context.compression import SemanticDeduplicator |
|
from efficient_context.chunking import SemanticChunker |
|
from efficient_context.retrieval import CPUOptimizedRetriever |
|
logger.info("Successfully imported efficient_context") |
|
except ImportError as e: |
|
logger.error(f"Failed to import efficient_context: {e}") |
|
sys.exit(1) |
|
|
|
def create_repetitive_document(): |
|
"""Create a document with deliberate repetition""" |
|
|
|
climate_paragraph = """ |
|
Climate change is a significant alteration in weather patterns over extended periods. |
|
Global warming is the long-term heating of Earth's climate system due to human activities. |
|
Rising global temperatures are causing substantial changes in our environment and ecosystems. |
|
The warming of the planet is leading to significant transformations in weather patterns. |
|
Human activities are causing Earth's temperature to increase, resulting in climate changes. |
|
""" |
|
|
|
energy_paragraph = """ |
|
Renewable energy comes from sources that are naturally replenishing but flow-limited. |
|
Clean energy is derived from natural processes that are constantly replenished. |
|
Sustainable power is generated from resources that won't deplete over time. |
|
Green energy utilizes sources that don't produce pollution when generating power. |
|
Alternative energy refers to sources that are an alternative to fossil fuel. |
|
""" |
|
|
|
|
|
document = (climate_paragraph + energy_paragraph) * 3 |
|
return document |
|
|
|
def main(): |
|
"""Run the benchmark""" |
|
|
|
document = create_repetitive_document() |
|
logger.info(f"Document size: {len(document.split())} words") |
|
|
|
|
|
thresholds = [0.7, 0.8, 0.85, 0.9, 0.95] |
|
|
|
for threshold in thresholds: |
|
logger.info(f"\nTesting with threshold: {threshold}") |
|
|
|
|
|
context_manager = ContextManager( |
|
compressor=SemanticDeduplicator(threshold=threshold), |
|
chunker=SemanticChunker(chunk_size=100), |
|
retriever=CPUOptimizedRetriever(embedding_model="lightweight") |
|
) |
|
|
|
|
|
start_time = time.time() |
|
doc_id = context_manager.add_document(document) |
|
processing_time = time.time() - start_time |
|
|
|
|
|
query = "Tell me about climate change and renewable energy" |
|
start_time = time.time() |
|
context = context_manager.generate_context(query) |
|
query_time = time.time() - start_time |
|
|
|
|
|
original_size = len(document.split()) |
|
context_size = len(context.split()) |
|
compression_ratio = context_size / original_size if original_size > 0 else 1.0 |
|
|
|
logger.info(f"Results for threshold {threshold}:") |
|
logger.info(f" - Original size: {original_size} words") |
|
logger.info(f" - Context size: {context_size} words") |
|
logger.info(f" - Compression ratio: {compression_ratio:.2f}") |
|
logger.info(f" - Processing time: {processing_time:.4f} seconds") |
|
logger.info(f" - Query time: {query_time:.4f} seconds") |
|
|
|
if __name__ == "__main__": |
|
main() |
|
|