File size: 7,874 Bytes
e4d5155
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
"""
Benchmarking script for efficient-context performance.
"""

import logging
import time
import argparse
import random
import string
import psutil
import os
import gc
from typing import List, Dict, Any

from efficient_context import ContextManager
from efficient_context.compression import SemanticDeduplicator
from efficient_context.chunking import SemanticChunker
from efficient_context.retrieval import CPUOptimizedRetriever

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def generate_random_text(words: int = 1000, paragraphs: int = 5) -> str:
    """
    Generate pseudo-random, vaguely energy-themed text for benchmarking.

    Args:
        words: Approximate total number of words to generate
        paragraphs: Number of paragraphs to split the text into

    Returns:
        text: Generated random text, paragraphs separated by blank lines
    """
    # Common English words plus domain-specific terms make the output look
    # enough like real prose to exercise semantic chunking/deduplication.
    common_words = [
        "the", "be", "to", "of", "and", "a", "in", "that", "have", "I", 
        "it", "for", "not", "on", "with", "he", "as", "you", "do", "at",
        "this", "but", "his", "by", "from", "they", "we", "say", "her", "she",
        "or", "an", "will", "my", "one", "all", "would", "there", "their", "what",
        "so", "up", "out", "if", "about", "who", "get", "which", "go", "me",
        "renewable", "energy", "climate", "wind", "solar", "power", "change", "global",
        "sustainable", "resources", "efficiency", "emissions", "carbon", "technology"
    ]
    
    # Guard degenerate inputs: at least one paragraph, and at least one word
    # per paragraph. Without this, words < paragraphs made
    # words_per_paragraph == 0 and paragraph_words[-1] raised IndexError.
    paragraphs = max(1, paragraphs)
    words_per_paragraph = max(1, words // paragraphs)
    
    result = []
    for _ in range(paragraphs):
        paragraph_words = []
        for j in range(words_per_paragraph):
            # ~10% gibberish words for variety; the rest from the word list.
            if random.random() < 0.1:
                word = ''.join(random.choice(string.ascii_lowercase) for _ in range(random.randint(3, 10)))
            else:
                word = random.choice(common_words)
                
            # Capitalize the first word of each sentence.
            if j == 0 or paragraph_words[-1].endswith('.'):
                word = word.capitalize()
                
            # Sprinkle in sentence-ending periods and occasional commas.
            if j > 0 and j % random.randint(8, 15) == 0:
                word += '.'
            elif random.random() < 0.05:
                word += ','
                
            paragraph_words.append(word)
        
        # Ensure every paragraph ends with a period.
        if not paragraph_words[-1].endswith('.'):
            paragraph_words[-1] += '.'
            
        result.append(' '.join(paragraph_words))
    
    return '\n\n'.join(result)

def get_memory_usage() -> Dict[str, Any]:
    """
    Report the current process's memory footprint.

    Returns:
        stats: Dict with "rss" and "vms" usage, both in megabytes
    """
    bytes_per_mb = 1024 * 1024
    info = psutil.Process(os.getpid()).memory_info()
    
    return {
        "rss": info.rss / bytes_per_mb,
        "vms": info.vms / bytes_per_mb
    }

def run_benchmark(
    num_documents: int = 10,
    words_per_document: int = 1000,
    num_queries: int = 5
) -> None:
    """
    Run a benchmark of efficient-context performance.

    Measures document-ingestion time and memory growth, then per-query
    context-generation time, and logs a summary at the end.

    Args:
        num_documents: Number of documents to process
        words_per_document: Number of words per document
        num_queries: Number of queries to run
    """
    logger.info(f"Starting benchmark with {num_documents} documents, {words_per_document} words each")
    
    # Initialize context manager
    context_manager = ContextManager(
        compressor=SemanticDeduplicator(threshold=0.85),
        chunker=SemanticChunker(chunk_size=256),
        retriever=CPUOptimizedRetriever(embedding_model="lightweight")
    )
    
    # Generate documents
    logger.info("Generating random documents...")
    documents = []
    for i in range(num_documents):
        content = generate_random_text(words=words_per_document, paragraphs=5)
        documents.append({
            "content": content,
            "metadata": {"id": f"doc-{i}", "source": "benchmark"}
        })
    
    # Measure document processing (wall time + RSS growth)
    logger.info("Adding documents to context manager...")
    start_mem = get_memory_usage()
    start_time = time.time()
    
    document_ids = context_manager.add_documents(documents)
    
    end_time = time.time()
    end_mem = get_memory_usage()
    
    processing_time = end_time - start_time
    memory_increase = end_mem["rss"] - start_mem["rss"]
    # Guard against num_documents == 0 so the per-document average
    # doesn't raise ZeroDivisionError.
    per_document_time = processing_time / num_documents if num_documents else 0.0
    
    logger.info("Document processing:")
    logger.info(f"  - Time: {processing_time:.2f} seconds")
    logger.info(f"  - Average per document: {per_document_time:.4f} seconds")
    logger.info(f"  - Memory usage increase: {memory_increase:.2f} MB")
    logger.info(f"  - Total chunks created: {len(context_manager.chunks)}")
    
    # Generate random queries
    logger.info("Generating context for queries...")
    queries = [
        f"Explain {random.choice(['renewable', 'sustainable', 'clean', 'alternative'])} energy",
        f"What are the {random.choice(['benefits', 'advantages', 'impacts', 'effects'])} of {random.choice(['solar', 'wind', 'hydro', 'geothermal'])} power?",
        f"How does {random.choice(['climate change', 'global warming', 'carbon emissions', 'greenhouse gases'])} affect the environment?",
        f"Discuss the {random.choice(['future', 'potential', 'limitations', 'challenges'])} of renewable energy",
        f"What is the {random.choice(['relationship', 'connection', 'link', 'correlation'])} between energy consumption and climate change?"
    ]
    
    # Ensure we have enough queries
    while len(queries) < num_queries:
        queries.append(f"Tell me about {random.choice(['energy', 'climate', 'sustainability', 'emissions'])}")
    
    # Select the requested number of queries
    selected_queries = random.sample(queries, min(num_queries, len(queries)))
    
    # Measure query processing
    total_query_time = 0
    total_query_tokens = 0
    
    for i, query in enumerate(selected_queries):
        # Clear some memory and cache before each query so one query's
        # garbage doesn't skew the next query's timing.
        gc.collect()
        
        start_time = time.time()
        context = context_manager.generate_context(query)
        query_time = time.time() - start_time
        context_tokens = len(context.split())
        
        total_query_time += query_time
        total_query_tokens += context_tokens
        
        logger.info(f"Query {i+1}: '{query}'")
        logger.info(f"  - Time: {query_time:.4f} seconds")
        logger.info(f"  - Context size: {context_tokens} tokens")
    
    # Average over the queries actually executed; guard against zero so
    # num_queries == 0 doesn't raise ZeroDivisionError.
    queries_run = len(selected_queries)
    avg_query_time = total_query_time / queries_run if queries_run else 0.0
    avg_tokens = total_query_tokens / queries_run if queries_run else 0.0
    
    logger.info("\nBenchmark Summary:")
    logger.info(f"  - Documents processed: {num_documents} ({words_per_document} words each)")
    logger.info(f"  - Queries executed: {queries_run}")
    logger.info(f"  - Document processing time: {processing_time:.2f} seconds ({per_document_time:.4f}s per document)")
    logger.info(f"  - Average query time: {avg_query_time:.4f} seconds")
    logger.info(f"  - Average context size: {avg_tokens:.1f} tokens")
    logger.info(f"  - Final memory usage: {get_memory_usage()['rss']:.2f} MB")

def main():
    """Entry point: parse CLI flags and kick off the benchmark run."""
    parser = argparse.ArgumentParser(description="Benchmark efficient-context performance")
    
    # Table-driven flag registration: (flag, default, help text).
    for flag, default, help_text in (
        ("--documents", 10, "Number of documents to process"),
        ("--words", 1000, "Words per document"),
        ("--queries", 5, "Number of queries to run"),
    ):
        parser.add_argument(flag, type=int, default=default, help=help_text)
    
    options = parser.parse_args()
    
    run_benchmark(
        num_documents=options.documents,
        words_per_document=options.words,
        num_queries=options.queries
    )

if __name__ == "__main__":
    main()