"""
Benchmarking script for efficient-context performance.
"""
import argparse
import gc
import logging
import os
import random
import string
import time
from typing import Any, Dict

import psutil

from efficient_context import ContextManager
from efficient_context.chunking import SemanticChunker
from efficient_context.compression import SemanticDeduplicator
from efficient_context.retrieval import CPUOptimizedRetriever

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def generate_random_text(words: int = 1000, paragraphs: int = 5) -> str:
    """
    Generate random text for benchmarking.

    Args:
        words: Number of words to generate
        paragraphs: Number of paragraphs to split the text into

    Returns:
        text: Generated random text
    """
    # List of common words for more realistic text
    common_words = [
        "the", "be", "to", "of", "and", "a", "in", "that", "have", "I",
        "it", "for", "not", "on", "with", "he", "as", "you", "do", "at",
        "this", "but", "his", "by", "from", "they", "we", "say", "her", "she",
        "or", "an", "will", "my", "one", "all", "would", "there", "their", "what",
        "so", "up", "out", "if", "about", "who", "get", "which", "go", "me",
        "renewable", "energy", "climate", "wind", "solar", "power", "change", "global",
        "sustainable", "resources", "efficiency", "emissions", "carbon", "technology"
    ]

    # Generate paragraphs (guard against zero-length paragraphs when words < paragraphs)
    result = []
    words_per_paragraph = max(1, words // paragraphs)
    for i in range(paragraphs):
        paragraph_words = []
        for j in range(words_per_paragraph):
            # Occasionally add a random word for variety
            if random.random() < 0.1:
                word = ''.join(random.choice(string.ascii_lowercase) for _ in range(random.randint(3, 10)))
            else:
                word = random.choice(common_words)

            # Capitalize the first word of each sentence
            if j == 0 or paragraph_words[-1].endswith('.'):
                word = word.capitalize()

            # Add punctuation occasionally
            if j > 0 and j % random.randint(8, 15) == 0:
                word += '.'
            elif random.random() < 0.05:
                word += ','

            paragraph_words.append(word)

        # Ensure the paragraph ends with a period
        if not paragraph_words[-1].endswith('.'):
            paragraph_words[-1] += '.'

        result.append(' '.join(paragraph_words))

    return '\n\n'.join(result)
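# Illustrative sanity check (not part of the benchmark run): paragraphs are joined
# with blank lines, so splitting on "\n\n" recovers the paragraph count.
#   sample = generate_random_text(words=200, paragraphs=2)
#   assert len(sample.split("\n\n")) == 2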
def get_memory_usage() -> Dict[str, Any]:
    """
    Get current memory usage of this process.

    Returns:
        stats: Memory usage statistics in MB
    """
    process = psutil.Process(os.getpid())
    memory_info = process.memory_info()

    return {
        "rss": memory_info.rss / (1024 * 1024),  # resident set size, MB
        "vms": memory_info.vms / (1024 * 1024)   # virtual memory size, MB
    }
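# The returned dict holds floats in megabytes, e.g. {"rss": 85.3, "vms": 412.7}
# (illustrative numbers only).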
def run_benchmark(
    num_documents: int = 10,
    words_per_document: int = 1000,
    num_queries: int = 5
) -> None:
    """
    Run a benchmark of efficient-context performance.

    Args:
        num_documents: Number of documents to process
        words_per_document: Number of words per document
        num_queries: Number of queries to run
    """
    logger.info(f"Starting benchmark with {num_documents} documents, {words_per_document} words each")

    # Initialize context manager
    context_manager = ContextManager(
        compressor=SemanticDeduplicator(threshold=0.85),
        chunker=SemanticChunker(chunk_size=256),
        retriever=CPUOptimizedRetriever(embedding_model="lightweight")
    )
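    # Rough intent of each component (inferred from the constructor arguments above,
    # not from verified library internals): the deduplicator drops near-duplicate
    # content above a 0.85 similarity threshold, the chunker targets ~256-token
    # semantic chunks, and the retriever uses a lightweight embedding model suited
    # to CPU-only environments.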
    # Generate documents
    logger.info("Generating random documents...")
    documents = []
    for i in range(num_documents):
        content = generate_random_text(words=words_per_document, paragraphs=5)
        documents.append({
            "content": content,
            "metadata": {"id": f"doc-{i}", "source": "benchmark"}
        })

    # Measure document processing
    logger.info("Adding documents to context manager...")
    start_mem = get_memory_usage()
    start_time = time.time()

    document_ids = context_manager.add_documents(documents)

    end_time = time.time()
    end_mem = get_memory_usage()

    processing_time = end_time - start_time
    memory_increase = end_mem["rss"] - start_mem["rss"]

    logger.info("Document processing:")
    logger.info(f" - Time: {processing_time:.2f} seconds")
    logger.info(f" - Average per document: {processing_time / num_documents:.4f} seconds")
    logger.info(f" - Memory usage increase: {memory_increase:.2f} MB")
    logger.info(f" - Total chunks created: {len(context_manager.chunks)}")
    # Generate random queries
    logger.info("Generating context for queries...")
    queries = [
        f"Explain {random.choice(['renewable', 'sustainable', 'clean', 'alternative'])} energy",
        f"What are the {random.choice(['benefits', 'advantages', 'impacts', 'effects'])} of {random.choice(['solar', 'wind', 'hydro', 'geothermal'])} power?",
        f"How does {random.choice(['climate change', 'global warming', 'carbon emissions', 'greenhouse gases'])} affect the environment?",
        f"Discuss the {random.choice(['future', 'potential', 'limitations', 'challenges'])} of renewable energy",
        f"What is the {random.choice(['relationship', 'connection', 'link', 'correlation'])} between energy consumption and climate change?"
    ]

    # Ensure we have enough queries
    while len(queries) < num_queries:
        queries.append(f"Tell me about {random.choice(['energy', 'climate', 'sustainability', 'emissions'])}")

    # Select the requested number of queries
    selected_queries = random.sample(queries, min(num_queries, len(queries)))
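    # Note: the while-loop above pads `queries` until it has at least num_queries
    # entries, so random.sample returns exactly num_queries queries here.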
    # Measure query processing
    total_query_time = 0
    total_query_tokens = 0

    for i, query in enumerate(selected_queries):
        # Clear some memory and cache before each query
        gc.collect()

        start_time = time.time()
        context = context_manager.generate_context(query)
        query_time = time.time() - start_time

        context_tokens = len(context.split())
        total_query_time += query_time
        total_query_tokens += context_tokens

        logger.info(f"Query {i+1}: '{query}'")
        logger.info(f" - Time: {query_time:.4f} seconds")
        logger.info(f" - Context size: {context_tokens} tokens")

    # Average over the queries actually executed
    avg_query_time = total_query_time / len(selected_queries)
    avg_tokens = total_query_tokens / len(selected_queries)

    logger.info("\nBenchmark Summary:")
    logger.info(f" - Documents processed: {num_documents} ({words_per_document} words each)")
    logger.info(f" - Queries executed: {len(selected_queries)}")
    logger.info(f" - Document processing time: {processing_time:.2f} seconds ({processing_time / num_documents:.4f}s per document)")
    logger.info(f" - Average query time: {avg_query_time:.4f} seconds")
    logger.info(f" - Average context size: {avg_tokens:.1f} tokens")
    logger.info(f" - Final memory usage: {get_memory_usage()['rss']:.2f} MB")
def main():
    """Main function for the benchmark script."""
    parser = argparse.ArgumentParser(description="Benchmark efficient-context performance")
    parser.add_argument("--documents", type=int, default=10, help="Number of documents to process")
    parser.add_argument("--words", type=int, default=1000, help="Words per document")
    parser.add_argument("--queries", type=int, default=5, help="Number of queries to run")

    args = parser.parse_args()

    run_benchmark(
        num_documents=args.documents,
        words_per_document=args.words,
        num_queries=args.queries
    )


if __name__ == "__main__":
    main()