|
|
|
""" |
|
Advanced test for efficient-context's deduplication capabilities |
|
""" |
|
|
|
import time |
|
import logging |
|
|
|
|
|
# Configure the root logger: INFO level, each record prefixed with the
# timestamp, logger name and level name.
# NOTE(review): this runs before the efficient_context imports below --
# presumably so that anything logged while those modules are imported is
# already formatted; confirm before moving it under the imports.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# Module-level logger used by the functions in this script.
logger = logging.getLogger(__name__)
|
|
|
|
|
from efficient_context import ContextManager |
|
from efficient_context.compression import SemanticDeduplicator |
|
from efficient_context.chunking import SemanticChunker |
|
from efficient_context.retrieval import CPUOptimizedRetriever |
|
|
|
def create_repetitive_document(repetitions: int = 3) -> str:
    """Create a document with highly repetitive semantic content.

    The document is assembled from three groups of sentences: five
    paraphrases about climate change, five paraphrases about renewable
    energy, and three sentences on unrelated topics. That 13-sentence
    sequence is then repeated ``repetitions`` times, so the result contains
    exact duplicates in addition to the paraphrased near-duplicates.

    Args:
        repetitions: Number of times the 13-sentence sequence is repeated.
            Must be at least 1. Defaults to 3.

    Returns:
        All sentences joined into one string, separated by blank lines.

    Raises:
        ValueError: If ``repetitions`` is less than 1.
    """
    if repetitions < 1:
        raise ValueError(f"repetitions must be at least 1, got {repetitions}")

    paragraphs = [
        # Paraphrases of the same idea: climate change.
        "Climate change is a significant alteration in global weather patterns over extended periods.",
        "Global warming refers to the long-term increase in Earth's average temperature.",
        "The climate crisis is causing significant shifts in temperature and precipitation patterns worldwide.",
        "Rising global temperatures lead to fundamental changes in our planet's climate systems.",
        "Human-induced warming of the Earth's atmosphere is resulting in climate destabilization.",
        # Paraphrases of the same idea: renewable energy.
        "Renewable energy comes from natural sources that are constantly replenished.",
        "Clean energy technologies harness power from sustainable, non-depleting resources.",
        "Green power is generated from environmentally friendly, renewable sources.",
        "Sustainable energy is derived from resources that don't run out over time.",
        "Alternative energy refers to power sources that are alternatives to fossil fuels.",
        # Sentences on unrelated topics.
        "Machine learning algorithms require significant computational resources to train effectively.",
        "Biodiversity loss is accelerating at an unprecedented rate due to human activities.",
        "Quantum computing may revolutionize cryptography and computational chemistry.",
    ]

    # Repeating the whole sequence adds verbatim duplicates on top of the
    # paraphrases above.
    return "\n\n".join(paragraphs * repetitions)
|
|
|
def run_deduplication_test() -> None:
    """Test the semantic deduplication capabilities.

    Builds the repetitive sample document once. Then, for each similarity
    threshold, it creates a fresh ``ContextManager`` (wired with a
    ``SemanticDeduplicator``, a ``SemanticChunker`` and a
    ``CPUOptimizedRetriever``), adds the document, generates a context for a
    fixed query, and logs the word counts, their ratio, the number of chunks
    and the elapsed times.

    Returns:
        None. All results are reported through the module logger.
    """
    logger.info("Running semantic deduplication test")

    document = create_repetitive_document()
    # The document is the same for every threshold, so count its words once.
    original_size = len(document.split())
    logger.info("Document size: %d words", original_size)

    thresholds = [0.7, 0.8, 0.85, 0.9, 0.95]
    query = "Explain the relationship between climate change and renewable energy"

    for threshold in thresholds:
        logger.info("\nTesting threshold: %s", threshold)

        # A new manager per threshold keeps the runs independent.
        cm = ContextManager(
            compressor=SemanticDeduplicator(threshold=threshold),
            chunker=SemanticChunker(chunk_size=200),
            retriever=CPUOptimizedRetriever(embedding_model="lightweight"),
        )

        # perf_counter() is monotonic and high resolution, so the measured
        # intervals are not distorted by system clock adjustments.
        start = time.perf_counter()
        cm.add_document(document)
        processing_time = time.perf_counter() - start

        start = time.perf_counter()
        context = cm.generate_context(query)
        query_time = time.perf_counter() - start

        context_size = len(context.split())
        # Guard the division so an empty document reports 0.00 rather than
        # raising ZeroDivisionError.
        compression_ratio = context_size / original_size if original_size else 0.0

        logger.info("Results for threshold %s:", threshold)
        logger.info(" - Original document: %d words", original_size)
        logger.info(" - Context generated: %d words", context_size)
        logger.info(" - Compression ratio: %.2f", compression_ratio)
        logger.info(" - Chunks created: %d", len(cm.chunks))
        logger.info(" - Processing time: %.4f seconds", processing_time)
        logger.info(" - Query time: %.4f seconds", query_time)

        logger.info(" - Context preview: %s...", context[:150])
|
|
|
if __name__ == "__main__":
    # Script entry point: run the evaluation and report the outcome on stdout.
    try:
        print("Starting deduplication evaluation...")
        run_deduplication_test()
    except Exception as e:
        print(f"Error during evaluation: {e}")

        import traceback

        traceback.print_exc()
        # Exit non-zero so shells and CI can tell that the evaluation failed;
        # without this the process would report success after an error.
        raise SystemExit(1) from e
    else:
        print("Evaluation completed successfully")
|
|