File size: 4,790 Bytes
e4d5155
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#!/usr/bin/env python
"""
Advanced test for efficient-context's deduplication capabilities
"""

import time
import logging

# Configure the root logger once at import time (INFO and above, timestamped
# records) and create the module-level logger used by the functions below.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Import the library
from efficient_context import ContextManager
from efficient_context.compression import SemanticDeduplicator
from efficient_context.chunking import SemanticChunker
from efficient_context.retrieval import CPUOptimizedRetriever

def create_repetitive_document():
    """Build a sample text whose sentences largely restate the same ideas.

    The text is assembled from two groups of paraphrases (climate change and
    renewable energy, five sentences each) followed by three unrelated
    sentences. The resulting 13-sentence sequence is repeated three times and
    the sentences are separated by blank lines.

    Returns:
        str: The 39 sentences joined with ``"\\n\\n"``.
    """
    # Five different phrasings of the climate-change idea
    climate_sentences = (
        "Climate change is a significant alteration in global weather patterns over extended periods.",
        "Global warming refers to the long-term increase in Earth's average temperature.",
        "The climate crisis is causing significant shifts in temperature and precipitation patterns worldwide.",
        "Rising global temperatures lead to fundamental changes in our planet's climate systems.",
        "Human-induced warming of the Earth's atmosphere is resulting in climate destabilization.",
    )

    # Five different phrasings of the renewable-energy idea
    energy_sentences = (
        "Renewable energy comes from natural sources that are constantly replenished.",
        "Clean energy technologies harness power from sustainable, non-depleting resources.",
        "Green power is generated from environmentally friendly, renewable sources.",
        "Sustainable energy is derived from resources that don't run out over time.",
        "Alternative energy refers to power sources that are alternatives to fossil fuels.",
    )

    # Sentences on unrelated topics, each appearing once per pass
    standalone_sentences = (
        "Machine learning algorithms require significant computational resources to train effectively.",
        "Biodiversity loss is accelerating at an unprecedented rate due to human activities.",
        "Quantum computing may revolutionize cryptography and computational chemistry.",
    )

    single_pass = [*climate_sentences, *energy_sentences, *standalone_sentences]

    # Three back-to-back passes make the text longer and add exact duplicates
    repeat_count = 3
    return "\n\n".join(single_pass * repeat_count)

def run_deduplication_test():
    """Run the deduplication evaluation over a range of thresholds.

    The repetitive sample document is built once. For every threshold a new
    ``ContextManager`` is created with a ``SemanticDeduplicator`` configured
    for that threshold, the document is added, and context is generated for
    a fixed query. For each run the function logs the document and context
    word counts, their ratio (context words / document words), the number of
    chunks held by the manager, both elapsed times, and the first 150
    characters of the context.

    Returns:
        None. All results are reported through the module logger.
    """
    logger.info("Running semantic deduplication test")

    # Create a highly repetitive document
    document = create_repetitive_document()

    # The document is identical for every threshold, so count its words once
    original_size = len(document.split())
    logger.info(f"Document size: {original_size} words")

    # The same query is used for every run so results stay comparable
    query = "Explain the relationship between climate change and renewable energy"

    # Test with different threshold values
    thresholds = [0.7, 0.8, 0.85, 0.9, 0.95]

    for threshold in thresholds:
        logger.info(f"\nTesting threshold: {threshold}")

        # Create context manager with current threshold
        cm = ContextManager(
            compressor=SemanticDeduplicator(threshold=threshold),
            chunker=SemanticChunker(chunk_size=200),
            retriever=CPUOptimizedRetriever(embedding_model="lightweight")
        )

        # Add document and measure processing time. perf_counter() is a
        # monotonic, high-resolution clock, so the interval cannot be skewed
        # by system clock adjustments the way time.time() differences can.
        start = time.perf_counter()
        cm.add_document(document)
        processing_time = time.perf_counter() - start

        # Generate context for a relevant query
        start = time.perf_counter()
        context = cm.generate_context(query)
        query_time = time.perf_counter() - start

        # Calculate metrics
        context_size = len(context.split())
        compression_ratio = context_size / original_size

        # Report results
        logger.info(f"Results for threshold {threshold}:")
        logger.info(f"  - Original document: {original_size} words")
        logger.info(f"  - Context generated: {context_size} words")
        logger.info(f"  - Compression ratio: {compression_ratio:.2f}")
        logger.info(f"  - Chunks created: {len(cm.chunks)}")
        logger.info(f"  - Processing time: {processing_time:.4f} seconds")
        logger.info(f"  - Query time: {query_time:.4f} seconds")

        # Print a preview of the context
        logger.info(f"  - Context preview: {context[:150]}...")

if __name__ == "__main__":
    # Script entry point: run the evaluation and report the outcome on stdout.
    try:
        print("Starting deduplication evaluation...")
        run_deduplication_test()
    except Exception as e:
        print(f"Error during evaluation: {e}")
        import traceback
        traceback.print_exc()
        # Exit with a non-zero status so shells and CI see the failure;
        # previously the script reported the error but still exited with 0.
        raise SystemExit(1)
    else:
        print("Evaluation completed successfully")