"""
Benchmarking script for efficient-context performance.
"""

import argparse
import gc
import logging
import os
import random
import string
import time
from typing import Any, Dict

import psutil

from efficient_context import ContextManager
from efficient_context.compression import SemanticDeduplicator
from efficient_context.chunking import SemanticChunker
from efficient_context.retrieval import CPUOptimizedRetriever

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def generate_random_text(words: int = 1000, paragraphs: int = 5) -> str:
    """
    Generate random text for benchmarking.

    Args:
        words: Number of words to generate
        paragraphs: Number of paragraphs to split the text into

    Returns:
        text: Generated random text
    """
    common_words = [
        "the", "be", "to", "of", "and", "a", "in", "that", "have", "I",
        "it", "for", "not", "on", "with", "he", "as", "you", "do", "at",
        "this", "but", "his", "by", "from", "they", "we", "say", "her", "she",
        "or", "an", "will", "my", "one", "all", "would", "there", "their", "what",
        "so", "up", "out", "if", "about", "who", "get", "which", "go", "me",
        "renewable", "energy", "climate", "wind", "solar", "power", "change", "global",
        "sustainable", "resources", "efficiency", "emissions", "carbon", "technology"
    ]

    result = []
    words_per_paragraph = words // paragraphs

    for i in range(paragraphs):
        paragraph_words = []
        for j in range(words_per_paragraph):
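            # Occasionally invent a random lowercase word; otherwise draw from the common-word pool.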
            if random.random() < 0.1:
                word = ''.join(random.choice(string.ascii_lowercase) for _ in range(random.randint(3, 10)))
            else:
                word = random.choice(common_words)
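            # Capitalize the first word of a paragraph or the start of a new sentence.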
            if j == 0 or paragraph_words[-1].endswith('.'):
                word = word.capitalize()
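            # Randomly end a sentence with a period or add an occasional comma.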
            if j > 0 and j % random.randint(8, 15) == 0:
                word += '.'
            elif random.random() < 0.05:
                word += ','

            paragraph_words.append(word)
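        # Make sure the paragraph ends with a period.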
        if paragraph_words and not paragraph_words[-1].endswith('.'):
            paragraph_words[-1] += '.'

        result.append(' '.join(paragraph_words))

    return '\n\n'.join(result)


def get_memory_usage() -> Dict[str, Any]:
    """
    Get current memory usage.

    Returns:
        stats: Memory usage statistics in MB
    """
    process = psutil.Process(os.getpid())
    memory_info = process.memory_info()
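    # Convert resident (rss) and virtual (vms) memory from bytes to MB.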
    return {
        "rss": memory_info.rss / (1024 * 1024),
        "vms": memory_info.vms / (1024 * 1024)
    }


def run_benchmark(
    num_documents: int = 10,
    words_per_document: int = 1000,
    num_queries: int = 5
) -> None:
    """
    Run a benchmark of efficient-context performance.

    Args:
        num_documents: Number of documents to process
        words_per_document: Number of words per document
        num_queries: Number of queries to run
    """
    logger.info(f"Starting benchmark with {num_documents} documents, {words_per_document} words each")
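    # Initialize the context manager with its compression, chunking, and retrieval components.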
    context_manager = ContextManager(
        compressor=SemanticDeduplicator(threshold=0.85),
        chunker=SemanticChunker(chunk_size=256),
        retriever=CPUOptimizedRetriever(embedding_model="lightweight")
    )
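    # Generate synthetic documents to feed into the context manager.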
    logger.info("Generating random documents...")
    documents = []
    for i in range(num_documents):
        content = generate_random_text(words=words_per_document, paragraphs=5)
        documents.append({
            "content": content,
            "metadata": {"id": f"doc-{i}", "source": "benchmark"}
        })
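    # Measure wall-clock time and memory growth while ingesting the documents.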
    logger.info("Adding documents to context manager...")
    start_mem = get_memory_usage()
    start_time = time.time()

    document_ids = context_manager.add_documents(documents)

    end_time = time.time()
    end_mem = get_memory_usage()

    processing_time = end_time - start_time
    memory_increase = end_mem["rss"] - start_mem["rss"]

    logger.info("Document processing:")
    logger.info(f" - Time: {processing_time:.2f} seconds")
    logger.info(f" - Average per document: {processing_time / num_documents:.4f} seconds")
    logger.info(f" - Memory usage increase: {memory_increase:.2f} MB")
    logger.info(f" - Total chunks created: {len(context_manager.chunks)}")
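    # Build a pool of benchmark queries on the same energy/climate topics as the generated documents.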
    logger.info("Generating context for queries...")
    queries = [
        f"Explain {random.choice(['renewable', 'sustainable', 'clean', 'alternative'])} energy",
        f"What are the {random.choice(['benefits', 'advantages', 'impacts', 'effects'])} of {random.choice(['solar', 'wind', 'hydro', 'geothermal'])} power?",
        f"How does {random.choice(['climate change', 'global warming', 'carbon emissions', 'greenhouse gases'])} affect the environment?",
        f"Discuss the {random.choice(['future', 'potential', 'limitations', 'challenges'])} of renewable energy",
        f"What is the {random.choice(['relationship', 'connection', 'link', 'correlation'])} between energy consumption and climate change?"
    ]
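    # Pad the pool if more queries were requested than are predefined.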
    while len(queries) < num_queries:
        queries.append(f"Tell me about {random.choice(['energy', 'climate', 'sustainability', 'emissions'])}")
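    # Sample the requested number of queries from the pool.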
    selected_queries = random.sample(queries, min(num_queries, len(queries)))

    total_query_time = 0
    total_query_tokens = 0

    for i, query in enumerate(selected_queries):
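        # Run garbage collection before timing each query to reduce noise from earlier allocations.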
        gc.collect()

        start_time = time.time()
        context = context_manager.generate_context(query)
        query_time = time.time() - start_time
        context_tokens = len(context.split())

        total_query_time += query_time
        total_query_tokens += context_tokens

        logger.info(f"Query {i+1}: '{query}'")
        logger.info(f" - Time: {query_time:.4f} seconds")
        logger.info(f" - Context size: {context_tokens} tokens")

    avg_query_time = total_query_time / num_queries
    avg_tokens = total_query_tokens / num_queries
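    # Report aggregate results for the whole run.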
    logger.info("\nBenchmark Summary:")
    logger.info(f" - Documents processed: {num_documents} ({words_per_document} words each)")
    logger.info(f" - Queries executed: {num_queries}")
    logger.info(f" - Document processing time: {processing_time:.2f} seconds ({processing_time / num_documents:.4f}s per document)")
    logger.info(f" - Average query time: {avg_query_time:.4f} seconds")
    logger.info(f" - Average context size: {avg_tokens:.1f} tokens")
    logger.info(f" - Final memory usage: {get_memory_usage()['rss']:.2f} MB")


def main():
    """Main function for the benchmark script."""
    parser = argparse.ArgumentParser(description="Benchmark efficient-context performance")
    parser.add_argument("--documents", type=int, default=10, help="Number of documents to process")
    parser.add_argument("--words", type=int, default=1000, help="Words per document")
    parser.add_argument("--queries", type=int, default=5, help="Number of queries to run")

    args = parser.parse_args()

    run_benchmark(
        num_documents=args.documents,
        words_per_document=args.words,
        num_queries=args.queries
    )


if __name__ == "__main__":
    main()