biswanath2.roul committed
Commit e4d5155
Initial commit

Files changed:
- .gitignore +44 -0
- CONTRIBUTING.md +72 -0
- LICENSE +21 -0
- MANIFEST.in +6 -0
- PUBLISHING.md +81 -0
- README.md +73 -0
- debug.py +19 -0
- docs/usage.md +222 -0
- efficient_context/__init__.py +9 -0
- efficient_context/chunking/__init__.py +8 -0
- efficient_context/chunking/base.py +54 -0
- efficient_context/chunking/semantic_chunker.py +295 -0
- efficient_context/compression/__init__.py +8 -0
- efficient_context/compression/base.py +23 -0
- efficient_context/compression/semantic_deduplicator.py +261 -0
- efficient_context/context_manager.py +169 -0
- efficient_context/memory/__init__.py +7 -0
- efficient_context/memory/memory_manager.py +134 -0
- efficient_context/retrieval/__init__.py +8 -0
- efficient_context/retrieval/base.py +40 -0
- efficient_context/retrieval/cpu_optimized_retriever.py +247 -0
- efficient_context/utils/__init__.py +12 -0
- efficient_context/utils/text.py +120 -0
- examples/basic_usage.py +92 -0
- examples/benchmark.py +209 -0
- examples/dedup_benchmark.py +214 -0
- examples/dedup_eval.py +114 -0
- examples/dedup_test.py +49 -0
- examples/deduplication_benchmark.py +277 -0
- examples/demo_notebook.ipynb +0 -0
- examples/llm_integration.py +164 -0
- examples/simple_dedup_benchmark.py +92 -0
- examples/simple_test.py +69 -0
- model_card.md +91 -0
- pyproject.toml +14 -0
- requirements.txt +7 -0
- setup.py +31 -0
- test_simple.py +75 -0
- tests/test_core.py +114 -0
.gitignore
ADDED
@@ -0,0 +1,44 @@
+# Python bytecode
+__pycache__/
+*.py[cod]
+*$py.class
+
+# Distribution / packaging
+dist/
+build/
+*.egg-info/
+
+# Virtual environments
+venv/
+env/
+ENV/
+
+# Testing
+.coverage
+htmlcov/
+.pytest_cache/
+
+# Environment variables
+.env
+
+# IDE specific files
+.idea/
+.vscode/
+*.swp
+*.swo
+
+# OS specific files
+.DS_Store
+Thumbs.db
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# Build logs
+*.log
+
+# Hugging Face specific
+.huggingface/
+*.safetensors
+wandb/
+outputs/
CONTRIBUTING.md
ADDED
@@ -0,0 +1,72 @@
+# Contributing to efficient-context
+
+Thank you for considering contributing to efficient-context! This document provides guidelines and instructions for contributing.
+
+## Code of Conduct
+
+By participating in this project, you agree to abide by our [Code of Conduct](CODE_OF_CONDUCT.md).
+
+## How Can I Contribute?
+
+### Reporting Bugs
+
+Bug reports help make efficient-context better for everyone. When reporting a bug, please include:
+
+1. A clear title and description
+2. Steps to reproduce the issue
+3. Expected behavior
+4. Actual behavior
+5. Environment details (OS, Python version, etc.)
+
+### Suggesting Enhancements
+
+We welcome suggestions for improvements! Please include:
+
+1. A clear description of the enhancement
+2. The rationale/use case
+3. Possible implementation approaches (if any)
+
+### Pull Requests
+
+1. Fork the repository
+2. Create a new branch for your feature or bug fix
+3. Make your changes with appropriate tests
+4. Ensure all tests pass
+5. Submit a pull request
+
+## Development Setup
+
+1. Clone the repository
+2. Create a virtual environment: `python -m venv venv`
+3. Activate the environment: `source venv/bin/activate` (Unix) or `venv\Scripts\activate` (Windows)
+4. Install development dependencies: `pip install -e ".[dev]"`
+
+## Testing
+
+Run tests with pytest:
+
+```bash
+pytest
+```
+
+## Style Guide
+
+This project follows PEP 8 with a line length of 88 characters (compatible with black).
+
+To format code:
+
+```bash
+black .
+isort .
+```
+
+## Documentation
+
+- Update documentation for any new features or changes
+- Add docstrings for classes and functions
+
+## Contact
+
+For questions, feel free to open an issue or contact [Biswanath Roul](https://github.com/biswanathroul).
+
+Thank you for contributing to efficient-context!
LICENSE
ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2025 Biswanath Roul
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
MANIFEST.in
ADDED
@@ -0,0 +1,6 @@
+include LICENSE
+include README.md
+include pyproject.toml
+recursive-include tests *
+recursive-exclude tests *.pyc
+recursive-exclude tests __pycache__
PUBLISHING.md
ADDED
@@ -0,0 +1,81 @@
+# Publishing to PyPI
+
+This guide explains how to build and publish the `efficient-context` package to PyPI.
+
+## Prerequisites
+
+1. Create an account on PyPI: https://pypi.org/account/register/
+2. Install build and twine packages:
+
+```bash
+pip install build twine
+```
+
+## Build the Package
+
+1. Navigate to the project directory:
+
+```bash
+cd /path/to/efficient-context
+```
+
+2. Build the distribution packages:
+
+```bash
+python -m build
+```
+
+This will create a directory called `dist` containing both `.tar.gz` (source distribution) and `.whl` (built distribution) files.
+
+## Upload to TestPyPI (Recommended)
+
+Before publishing to the main PyPI repository, it's a good practice to test on TestPyPI:
+
+```bash
+python -m twine upload --repository-url https://test.pypi.org/legacy/ dist/*
+```
+
+You'll be prompted for your TestPyPI username and password.
+
+Then install from TestPyPI to verify it works:
+
+```bash
+pip install --index-url https://test.pypi.org/simple/ --extra-index-url https://pypi.org/simple efficient-context
+```
+
+## Upload to PyPI
+
+Once you've verified everything works correctly, upload to the actual PyPI:
+
+```bash
+python -m twine upload dist/*
+```
+
+You'll be prompted for your PyPI username and password.
+
+## Verify Installation
+
+After uploading, verify that your package can be installed from PyPI:
+
+```bash
+pip install efficient-context
+```
+
+## Updating the Package
+
+To update the package:
+
+1. Update the version number in `setup.py`
+2. Rebuild the package: `python -m build`
+3. Upload to PyPI again: `python -m twine upload dist/*`
+
+## GitHub Integration
+
+If your code is hosted on GitHub, you may want to set up GitHub Actions to automatically build and publish your package when you create a new release. The code for this project is available at: https://github.com/biswanathroul/efficient-context
+
+## Tips
+
+- Always increment the version number in `setup.py` before publishing a new version
+- Keep your PyPI credentials secure
+- Include comprehensive documentation and examples in your package
+- Add proper classifiers in `setup.py` for better searchability
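The update loop above (bump the version, rebuild, re-upload) is easy to script. Below is a minimal sketch of such a helper; the `release.py` name, the `version="X.Y.Z"` pattern assumed to exist in `setup.py`, and the patch-bump policy are illustrative assumptions, not part of this commit.

```python
#!/usr/bin/env python
"""Hypothetical release helper: bump the patch version, rebuild, upload."""
import glob
import re
import subprocess
import sys

def bump_patch_version(path: str = "setup.py") -> str:
    """Increment the Z in version="X.Y.Z" (assumes that exact pattern exists)."""
    with open(path) as f:
        text = f.read()
    match = re.search(r'version="(\d+)\.(\d+)\.(\d+)"', text)
    if match is None:
        sys.exit('no version="X.Y.Z" string found to bump')
    major, minor, patch = map(int, match.groups())
    new_version = f"{major}.{minor}.{patch + 1}"
    with open(path, "w") as f:
        f.write(text.replace(match.group(0), f'version="{new_version}"'))
    return new_version

if __name__ == "__main__":
    print("Releasing version", bump_patch_version())
    # The same two commands the guide runs by hand.
    subprocess.run([sys.executable, "-m", "build"], check=True)
    subprocess.run([sys.executable, "-m", "twine", "upload", *glob.glob("dist/*")], check=True)
```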
README.md
ADDED
@@ -0,0 +1,73 @@
+# efficient-context
+
+A Python library for optimizing LLM context handling in CPU-constrained environments.
+
+## Overview
+
+`efficient-context` addresses the challenge of working with large language models (LLMs) on CPU-only and memory-limited systems by providing efficient context management strategies. The library focuses on:
+
+- **Context Compression**: Reduce memory requirements while preserving information quality
+- **Semantic Chunking**: Go beyond token-based approaches for more effective context management
+- **Retrieval Optimization**: Minimize context size through intelligent retrieval strategies
+- **Memory Management**: Handle large contexts on limited hardware resources
+
+## Installation
+
+```bash
+pip install efficient-context
+```
+
+## Quick Start
+
+```python
+from efficient_context import ContextManager
+from efficient_context.compression import SemanticDeduplicator
+from efficient_context.chunking import SemanticChunker
+from efficient_context.retrieval import CPUOptimizedRetriever
+
+# Initialize a context manager with custom strategies
+context_manager = ContextManager(
+    compressor=SemanticDeduplicator(threshold=0.85),
+    chunker=SemanticChunker(chunk_size=256),
+    retriever=CPUOptimizedRetriever(embedding_model="lightweight")
+)
+
+# Add documents to your context
+context_manager.add_documents(documents)
+
+# Generate optimized context for a query
+optimized_context = context_manager.generate_context(query="Tell me about the climate impact of renewable energy")
+
+# Use the optimized context with your LLM
+response = your_llm_model.generate(prompt=prompt, context=optimized_context)
+```
+
+## Features
+
+### Context Compression
+- Semantic deduplication to remove redundant information
+- Importance-based pruning that keeps critical information
+- Automatic summarization of less relevant sections
+
+### Advanced Chunking
+- Semantic chunking that preserves logical units
+- Adaptive chunk sizing based on content complexity
+- Chunk relationships mapping for coherent retrieval
+
+### Retrieval Optimization
+- Lightweight embedding models optimized for CPU
+- Tiered retrieval strategies (local vs. remote)
+- Query-aware context assembly
+
+### Memory Management
+- Progressive loading/unloading of context
+- Streaming context processing
+- Memory-aware caching strategies
+
+## Maintainer
+
+This project is maintained by [Biswanath Roul](https://github.com/biswanathroul)
+
+## License
+
+MIT
debug.py
ADDED
@@ -0,0 +1,19 @@
+#!/usr/bin/env python
+"""
+Debug script for efficient-context.
+"""
+
+import sys
+import os
+
+print(f"Python version: {sys.version}")
+print(f"Current working directory: {os.getcwd()}")
+print(f"Python path: {sys.path}")
+
+try:
+    import efficient_context
+    print(f"Successfully imported efficient_context: {efficient_context.__file__}")
+except ImportError as e:
+    print(f"Failed to import efficient_context: {e}")
+
+print("Script completed")
docs/usage.md
ADDED
@@ -0,0 +1,222 @@
+# efficient-context Documentation
+
+## Overview
+
+`efficient-context` is a Python library designed to optimize the handling of context for Large Language Models (LLMs) in CPU-constrained environments. It addresses the challenges of using LLMs with limited computational resources by providing efficient context management strategies.
+
+## Key Features
+
+1. **Context Compression**: Reduce memory requirements while preserving information quality
+2. **Semantic Chunking**: Go beyond token-based approaches for more effective context management
+3. **Retrieval Optimization**: Minimize context size through intelligent retrieval strategies
+4. **Memory Management**: Handle large contexts on limited hardware resources
+
+## Installation
+
+```bash
+pip install efficient-context
+```
+
+## Core Components
+
+### ContextManager
+
+The central class that orchestrates all components of the library.
+
+```python
+from efficient_context import ContextManager
+
+# Initialize with default settings
+context_manager = ContextManager()
+
+# Add documents
+context_manager.add_document("This is a sample document about renewable energy...")
+context_manager.add_documents([doc1, doc2, doc3])  # Add multiple documents
+
+# Generate context for a query
+optimized_context = context_manager.generate_context(query="Tell me about renewable energy")
+```
+
+### Context Compression
+
+The compression module reduces the size of content while preserving key information.
+
+```python
+from efficient_context.compression import SemanticDeduplicator
+
+# Initialize with custom settings
+compressor = SemanticDeduplicator(
+    threshold=0.85,                 # Similarity threshold for deduplication
+    embedding_model="lightweight",  # Use a lightweight embedding model
+    min_sentence_length=10,         # Minimum length of sentences to consider
+    importance_weight=0.3           # Weight given to sentence importance vs. deduplication
+)
+
+# Compress content
+compressed_content = compressor.compress(
+    content="Your large text content here...",
+    target_size=1000  # Optional target size in tokens
+)
+```
+
+### Semantic Chunking
+
+The chunking module divides content into semantically coherent chunks.
+
+```python
+from efficient_context.chunking import SemanticChunker
+
+# Initialize with custom settings
+chunker = SemanticChunker(
+    chunk_size=512,           # Target size for chunks in tokens
+    chunk_overlap=50,         # Number of tokens to overlap between chunks
+    respect_paragraphs=True,  # Avoid breaking paragraphs across chunks
+    min_chunk_size=100,       # Minimum chunk size in tokens
+    max_chunk_size=1024       # Maximum chunk size in tokens
+)
+
+# Chunk content
+chunks = chunker.chunk(
+    content="Your large text content here...",
+    document_id="doc-1",  # Optional document ID
+    metadata={"source": "example", "author": "John Doe"}  # Optional metadata
+)
+```
+
+### Retrieval Optimization
+
+The retrieval module finds the most relevant chunks for a query.
+
+```python
+from efficient_context.retrieval import CPUOptimizedRetriever
+
+# Initialize with custom settings
+retriever = CPUOptimizedRetriever(
+    embedding_model="lightweight",  # Use a lightweight embedding model
+    similarity_metric="cosine",     # Metric for comparing embeddings
+    use_batching=True,              # Batch embedding operations
+    batch_size=32,                  # Size of batches for embedding
+    max_index_size=5000             # Maximum number of chunks to keep in the index
+)
+
+# Index chunks
+retriever.index_chunks(chunks)
+
+# Retrieve relevant chunks
+relevant_chunks = retriever.retrieve(
+    query="Your query here...",
+    top_k=5  # Number of chunks to retrieve
+)
+```
+
+### Memory Management
+
+The memory module helps optimize memory usage during operations.
+
+```python
+from efficient_context.memory import MemoryManager
+
+# Initialize with custom settings
+memory_manager = MemoryManager(
+    target_usage_percent=80.0,    # Target memory usage percentage
+    aggressive_cleanup=False,     # Whether to perform aggressive garbage collection
+    memory_monitor_interval=None  # Interval for memory monitoring in seconds
+)
+
+# Use context manager for memory-intensive operations
+with memory_manager.optimize_memory():
+    # Run memory-intensive operations here
+    results = process_large_documents(documents)
+
+# Get memory usage statistics
+memory_stats = memory_manager.get_memory_usage()
+print(f"Process memory: {memory_stats['process_rss_bytes'] / (1024*1024):.2f} MB")
+```
+
+## Advanced Usage
+
+### Customizing the Context Manager
+
+```python
+from efficient_context import ContextManager
+from efficient_context.compression import SemanticDeduplicator
+from efficient_context.chunking import SemanticChunker
+from efficient_context.retrieval import CPUOptimizedRetriever
+from efficient_context.memory import MemoryManager

+# Initialize a fully customized context manager
+context_manager = ContextManager(
+    compressor=SemanticDeduplicator(threshold=0.85),
+    chunker=SemanticChunker(chunk_size=256, chunk_overlap=50),
+    retriever=CPUOptimizedRetriever(embedding_model="lightweight"),
+    memory_manager=MemoryManager(target_usage_percent=80.0),
+    max_context_size=4096
+)
+```
+
+### Integration with LLMs
+
+```python
+from efficient_context import ContextManager
+from your_llm_library import LLM  # Replace with your actual LLM library
+
+# Initialize components
+context_manager = ContextManager()
+llm = LLM(model="lightweight-model")
+
+# Process documents
+context_manager.add_documents(documents)
+
+# For each query
+query = "Tell me about renewable energy"
+optimized_context = context_manager.generate_context(query=query)
+
+# Use context with the LLM
+response = llm.generate(
+    prompt=query,
+    context=optimized_context,
+    max_tokens=512
+)
+```
+
+## Performance Considerations
+
+- **Memory Usage**: The library is designed to be memory-efficient, but be aware that embedding models may still require significant memory.
+- **CPU Performance**: Choose the appropriate embedding model based on your CPU capabilities. The `lightweight` option is recommended for constrained environments.
+- **Batch Size**: Adjust the `batch_size` parameter in retrieval to balance between memory usage and processing speed.
+- **Context Size**: Setting an appropriate `max_context_size` can significantly impact performance, especially when working with limited resources.
+
+## Extending the Library
+
+You can create custom implementations of the base classes to adapt the library to your specific needs:
+
+```python
+from efficient_context.compression.base import BaseCompressor
+
+class MyCustomCompressor(BaseCompressor):
+    def __init__(self, custom_param=None):
+        self.custom_param = custom_param
+
+    def compress(self, content, target_size=None):
+        # Your custom compression logic here
+        return compressed_content
+```
+
+## Troubleshooting
+
+**High Memory Usage**
+- Reduce `batch_size` in the retriever
+- Use a more lightweight embedding model
+- Decrease `max_index_size` to limit the number of chunks stored in memory
+
+**Slow Processing**
+- Increase `batch_size` (balancing with memory constraints)
+- Decrease `threshold` in the SemanticDeduplicator to deduplicate more aggressively (sentences with similarity above the threshold are dropped, so a lower threshold removes more)
+- Reduce `chunk_overlap` to minimize redundant processing
+
+## Example Applications
+
+- **Chatbots on Edge Devices**: Enable context-aware conversations on devices with limited resources
+- **Document QA Systems**: Create efficient question-answering systems for large document collections
+- **Embedded AI Applications**: Incorporate context-aware LLM capabilities in embedded systems
+- **Mobile Applications**: Provide sophisticated LLM features in mobile apps with limited resources
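To make the tuning advice above concrete, here is a minimal sketch of a configuration biased toward low memory use. It only combines parameters already documented on this page; the specific values are illustrative starting points, not recommendations shipped with the library.

```python
from efficient_context import ContextManager
from efficient_context.compression import SemanticDeduplicator
from efficient_context.chunking import SemanticChunker
from efficient_context.retrieval import CPUOptimizedRetriever
from efficient_context.memory import MemoryManager

# Bias every knob toward lower memory: smaller batches, a smaller index,
# less chunk overlap, and more aggressive deduplication.
low_memory_manager = ContextManager(
    compressor=SemanticDeduplicator(threshold=0.80),  # lower threshold drops more near-duplicates
    chunker=SemanticChunker(chunk_size=256, chunk_overlap=20),
    retriever=CPUOptimizedRetriever(
        embedding_model="lightweight",
        batch_size=8,         # smaller embedding batches
        max_index_size=1000,  # cap the number of chunks held in memory
    ),
    memory_manager=MemoryManager(target_usage_percent=70.0, aggressive_cleanup=True),
    max_context_size=2048,
)
```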
efficient_context/__init__.py
ADDED
@@ -0,0 +1,9 @@
+"""
+efficient-context: A Python library for optimizing LLM context handling in CPU-constrained environments.
+"""
+
+__version__ = "0.1.0"
+
+from efficient_context.context_manager import ContextManager
+
+__all__ = ["ContextManager"]
efficient_context/chunking/__init__.py
ADDED
@@ -0,0 +1,8 @@
+"""
+Chunking components for efficient-context.
+"""
+
+from efficient_context.chunking.base import BaseChunker, Chunk
+from efficient_context.chunking.semantic_chunker import SemanticChunker
+
+__all__ = ["BaseChunker", "Chunk", "SemanticChunker"]
efficient_context/chunking/base.py
ADDED
@@ -0,0 +1,54 @@
+"""
+Base classes for context chunking components.
+"""
+
+from abc import ABC, abstractmethod
+from typing import List, Dict, Any, Optional
+
+class Chunk:
+    """Representation of a text chunk with metadata."""
+
+    def __init__(
+        self,
+        content: str,
+        chunk_id: str,
+        document_id: Optional[str] = None,
+        metadata: Optional[Dict[str, Any]] = None,
+    ):
+        """
+        Initialize a chunk.
+
+        Args:
+            content: The text content of the chunk
+            chunk_id: Unique identifier for the chunk
+            document_id: Optional ID of the source document
+            metadata: Optional metadata for the chunk
+        """
+        self.content = content
+        self.chunk_id = chunk_id
+        self.document_id = document_id
+        self.metadata = metadata or {}
+        self.embedding = None
+
+class BaseChunker(ABC):
+    """Base class for content chunking components."""
+
+    @abstractmethod
+    def chunk(
+        self,
+        content: str,
+        metadata: Optional[Dict[str, Any]] = None,
+        document_id: Optional[str] = None
+    ) -> List[Chunk]:
+        """
+        Split content into chunks.
+
+        Args:
+            content: Content to be chunked
+            metadata: Optional metadata to associate with chunks
+            document_id: Optional document ID to associate with chunks
+
+        Returns:
+            chunks: List of Chunk objects
+        """
+        pass
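The abstract `chunk` method above is the entire contract a chunker has to satisfy. As an illustration that is not part of this commit, a minimal custom chunker might split on a fixed word budget:

```python
import uuid
from typing import Any, Dict, List, Optional

from efficient_context.chunking.base import BaseChunker, Chunk

class FixedSizeChunker(BaseChunker):
    """Illustrative chunker: split on a fixed word budget, ignoring semantics."""

    def __init__(self, words_per_chunk: int = 200):
        self.words_per_chunk = words_per_chunk

    def chunk(
        self,
        content: str,
        metadata: Optional[Dict[str, Any]] = None,
        document_id: Optional[str] = None,
    ) -> List[Chunk]:
        words = content.split()
        # One Chunk per window of words_per_chunk tokens.
        return [
            Chunk(
                content=" ".join(words[i:i + self.words_per_chunk]),
                chunk_id=str(uuid.uuid4()),
                document_id=document_id,
                metadata=metadata,
            )
            for i in range(0, len(words), self.words_per_chunk)
        ]
```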
efficient_context/chunking/semantic_chunker.py
ADDED
@@ -0,0 +1,295 @@
+"""
+Semantic chunking for intelligent context segmentation.
+"""
+
+import logging
+import uuid
+from typing import List, Dict, Any, Optional, Tuple
+
+from efficient_context.chunking.base import BaseChunker, Chunk
+from efficient_context.utils.text import split_into_sentences, calculate_text_overlap
+
+# Set up logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+class SemanticChunker(BaseChunker):
+    """
+    Chunker that creates chunks based on semantic boundaries.
+
+    This chunker aims to keep semantically related content together, unlike
+    simple token-based chunking that might split content mid-thought.
+    """
+
+    def __init__(
+        self,
+        chunk_size: int = 512,
+        chunk_overlap: int = 50,
+        respect_paragraphs: bool = True,
+        min_chunk_size: int = 100,
+        max_chunk_size: int = 1024
+    ):
+        """
+        Initialize the SemanticChunker.
+
+        Args:
+            chunk_size: Target size for chunks in tokens (words)
+            chunk_overlap: Number of tokens to overlap between chunks
+            respect_paragraphs: Whether to avoid breaking paragraphs across chunks
+            min_chunk_size: Minimum chunk size in tokens
+            max_chunk_size: Maximum chunk size in tokens
+        """
+        self.chunk_size = chunk_size
+        self.chunk_overlap = chunk_overlap
+        self.respect_paragraphs = respect_paragraphs
+        self.min_chunk_size = min_chunk_size
+        self.max_chunk_size = max_chunk_size
+
+        logger.info(
+            "SemanticChunker initialized with target size: %d tokens, overlap: %d tokens",
+            chunk_size, chunk_overlap
+        )
+
+    def _estimate_tokens(self, text: str) -> int:
+        """
+        Estimate the number of tokens in text.
+
+        Args:
+            text: Text to estimate tokens for
+
+        Returns:
+            token_count: Estimated number of tokens
+        """
+        # Simple whitespace-based token estimation
+        # This is much faster than using a tokenizer and good enough for chunking
+        return len(text.split())
+
+    def _identify_paragraphs(self, content: str) -> List[str]:
+        """
+        Split content into paragraphs.
+
+        Args:
+            content: Content to split
+
+        Returns:
+            paragraphs: List of paragraphs
+        """
+        # Split on empty lines (common paragraph separator)
+        paragraphs = [p.strip() for p in content.split("\n\n")]
+
+        # Handle other kinds of paragraph breaks and clean up
+        result = []
+        current = ""
+
+        for p in paragraphs:
+            # Skip empty paragraphs
+            if not p:
+                continue
+
+            # Handle single newlines that might indicate paragraphs
+            lines = p.split("\n")
+            for line in lines:
+                if not line.strip():
+                    if current:
+                        result.append(current)
+                        current = ""
+                else:
+                    if current:
+                        current += " " + line.strip()
+                    else:
+                        current = line.strip()
+
+            if current:
+                result.append(current)
+                current = ""
+
+        # Add any remaining content
+        if current:
+            result.append(current)
+
+        return result if result else [content]
+
+    def _create_semantic_chunks(
+        self,
+        paragraphs: List[str],
+        document_id: Optional[str] = None,
+        metadata: Optional[Dict[str, Any]] = None
+    ) -> List[Chunk]:
+        """
+        Create chunks from paragraphs respecting semantic boundaries.
+
+        Args:
+            paragraphs: List of paragraphs to chunk
+            document_id: Optional ID of the source document
+            metadata: Optional metadata for the chunks
+
+        Returns:
+            chunks: List of Chunk objects
+        """
+        chunks = []
+        current_chunk_text = ""
+        current_token_count = 0
+
+        for paragraph in paragraphs:
+            paragraph_tokens = self._estimate_tokens(paragraph)
+
+            # Check if adding this paragraph would exceed the max chunk size
+            if (current_token_count + paragraph_tokens > self.max_chunk_size and
+                    current_token_count >= self.min_chunk_size):
+                # Create a new chunk with the current content
+                chunk_id = str(uuid.uuid4())
+                chunk = Chunk(
+                    content=current_chunk_text.strip(),
+                    chunk_id=chunk_id,
+                    document_id=document_id,
+                    metadata=metadata
+                )
+                chunks.append(chunk)
+
+                # Start a new chunk with overlap
+                if self.chunk_overlap > 0 and current_chunk_text:
+                    # Get the last N tokens for overlap
+                    words = current_chunk_text.split()
+                    overlap_text = " ".join(words[-min(self.chunk_overlap, len(words)):])
+                    current_chunk_text = overlap_text + " " + paragraph
+                    current_token_count = self._estimate_tokens(current_chunk_text)
+                else:
+                    # No overlap
+                    current_chunk_text = paragraph
+                    current_token_count = paragraph_tokens
+            # Handle very large paragraphs that exceed max_chunk_size on their own
+            elif paragraph_tokens > self.max_chunk_size:
+                # If we have existing content, create a chunk first
+                if current_chunk_text:
+                    chunk_id = str(uuid.uuid4())
+                    chunk = Chunk(
+                        content=current_chunk_text.strip(),
+                        chunk_id=chunk_id,
+                        document_id=document_id,
+                        metadata=metadata
+                    )
+                    chunks.append(chunk)
+                    current_chunk_text = ""
+                    current_token_count = 0
+
+                # Split the large paragraph into sentences
+                sentences = split_into_sentences(paragraph)
+                sentence_chunk = ""
+                sentence_token_count = 0
+
+                for sentence in sentences:
+                    sentence_tokens = self._estimate_tokens(sentence)
+
+                    # Check if adding this sentence would exceed the max chunk size
+                    if (sentence_token_count + sentence_tokens > self.max_chunk_size and
+                            sentence_token_count >= self.min_chunk_size):
+                        # Create a new chunk with the current sentences
+                        chunk_id = str(uuid.uuid4())
+                        chunk = Chunk(
+                            content=sentence_chunk.strip(),
+                            chunk_id=chunk_id,
+                            document_id=document_id,
+                            metadata=metadata
+                        )
+                        chunks.append(chunk)
+
+                        # Start a new chunk with overlap
+                        if self.chunk_overlap > 0 and sentence_chunk:
+                            words = sentence_chunk.split()
+                            overlap_text = " ".join(words[-min(self.chunk_overlap, len(words)):])
+                            sentence_chunk = overlap_text + " " + sentence
+                            sentence_token_count = self._estimate_tokens(sentence_chunk)
+                        else:
+                            sentence_chunk = sentence
+                            sentence_token_count = sentence_tokens
+                    else:
+                        # Add the sentence to the current chunk
+                        if sentence_chunk:
+                            sentence_chunk += " " + sentence
+                        else:
+                            sentence_chunk = sentence
+                        sentence_token_count += sentence_tokens
+
+                # Add any remaining sentence content as a chunk
+                if sentence_chunk:
+                    chunk_id = str(uuid.uuid4())
+                    chunk = Chunk(
+                        content=sentence_chunk.strip(),
+                        chunk_id=chunk_id,
+                        document_id=document_id,
+                        metadata=metadata
+                    )
+                    chunks.append(chunk)
+            else:
+                # Add the paragraph to the current chunk
+                if current_chunk_text:
+                    current_chunk_text += " " + paragraph
+                else:
+                    current_chunk_text = paragraph
+                current_token_count += paragraph_tokens
+
+                # Check if we've reached the target chunk size
+                if current_token_count >= self.chunk_size:
+                    chunk_id = str(uuid.uuid4())
+                    chunk = Chunk(
+                        content=current_chunk_text.strip(),
+                        chunk_id=chunk_id,
+                        document_id=document_id,
+                        metadata=metadata
+                    )
+                    chunks.append(chunk)
+
+                    # Start a new chunk with overlap
+                    if self.chunk_overlap > 0:
+                        words = current_chunk_text.split()
+                        current_chunk_text = " ".join(words[-min(self.chunk_overlap, len(words)):])
+                        current_token_count = self._estimate_tokens(current_chunk_text)
+                    else:
+                        current_chunk_text = ""
+                        current_token_count = 0
+
+        # Add any remaining content as a final chunk
+        if current_chunk_text and current_token_count >= self.min_chunk_size:
+            chunk_id = str(uuid.uuid4())
+            chunk = Chunk(
+                content=current_chunk_text.strip(),
+                chunk_id=chunk_id,
+                document_id=document_id,
+                metadata=metadata
+            )
+            chunks.append(chunk)
+
+        return chunks
+
+    def chunk(
+        self,
+        content: str,
+        metadata: Optional[Dict[str, Any]] = None,
+        document_id: Optional[str] = None
+    ) -> List[Chunk]:
+        """
+        Split content into semantic chunks.
+
+        Args:
+            content: Content to be chunked
+            metadata: Optional metadata to associate with chunks
+            document_id: Optional document ID to associate with chunks
+
+        Returns:
+            chunks: List of Chunk objects
+        """
+        if not content.strip():
+            return []
+
+        # Identify paragraphs
+        if self.respect_paragraphs:
+            paragraphs = self._identify_paragraphs(content)
+        else:
+            # Treat the whole content as one paragraph
+            paragraphs = [content]
+
+        # Create chunks from paragraphs
+        chunks = self._create_semantic_chunks(paragraphs, document_id, metadata)
+
+        logger.info("Created %d chunks from content", len(chunks))
+        return chunks
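A short usage sketch of the chunker above (assuming the package and its `efficient_context.utils.text` helpers are importable). With `chunk_overlap=0` and small budgets, whole paragraphs are appended until the target size is reached; the sample text and sizes are illustrative only.

```python
from efficient_context.chunking import SemanticChunker

# Small budgets so the behavior is visible on a short example.
chunker = SemanticChunker(chunk_size=40, chunk_overlap=0,
                          min_chunk_size=5, max_chunk_size=80)

text = (
    "Solar power converts sunlight into electricity using photovoltaic cells. "
    "It has become one of the fastest-growing energy sources in the world.\n\n"
    "Wind power captures kinetic energy from moving air with large turbines. "
    "Offshore wind farms can generate power at a very large scale."
)

for c in chunker.chunk(text, document_id="demo-doc"):
    # Whole paragraphs are accumulated until the 40-token target is reached.
    print(c.chunk_id[:8], "-", len(c.content.split()), "tokens")
```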
efficient_context/compression/__init__.py
ADDED
@@ -0,0 +1,8 @@
+"""
+Compression components for efficient-context.
+"""
+
+from efficient_context.compression.base import BaseCompressor
+from efficient_context.compression.semantic_deduplicator import SemanticDeduplicator
+
+__all__ = ["BaseCompressor", "SemanticDeduplicator"]
efficient_context/compression/base.py
ADDED
@@ -0,0 +1,23 @@
+"""
+Base classes for context compression components.
+"""
+
+from abc import ABC, abstractmethod
+from typing import Optional
+
+class BaseCompressor(ABC):
+    """Base class for content compression components."""
+
+    @abstractmethod
+    def compress(self, content: str, target_size: Optional[int] = None) -> str:
+        """
+        Compress content to reduce size while preserving key information.
+
+        Args:
+            content: The content to compress
+            target_size: Optional target size for the compressed content
+
+        Returns:
+            compressed_content: The compressed content
+        """
+        pass
efficient_context/compression/semantic_deduplicator.py
ADDED
@@ -0,0 +1,261 @@
+"""
+Semantic deduplication for compressing context content.
+"""
+
+import logging
+from typing import List, Optional, Tuple, Dict, Any
+
+import numpy as np
+from sklearn.metrics.pairwise import cosine_similarity
+
+from efficient_context.compression.base import BaseCompressor
+from efficient_context.utils.text import split_into_sentences, get_sentence_importance
+
+# Set up logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+class SemanticDeduplicator(BaseCompressor):
+    """
+    Compressor that removes semantically duplicate or redundant content.
+
+    This compressor identifies and removes sentences that are semantically
+    similar to others in the content, keeping only the most representative ones.
+    It's designed to be CPU-friendly and memory-efficient.
+    """
+
+    def __init__(
+        self,
+        threshold: float = 0.85,
+        embedding_model: str = "lightweight",
+        min_sentence_length: int = 10,
+        importance_weight: float = 0.3,
+    ):
+        """
+        Initialize the SemanticDeduplicator.
+
+        Args:
+            threshold: Similarity threshold for considering content duplicated (0.0 to 1.0)
+            embedding_model: The model to use for generating embeddings
+            min_sentence_length: Minimum length of sentences to consider
+            importance_weight: Weight given to sentence importance vs. deduplication
+        """
+        self.threshold = threshold
+        self.embedding_model = embedding_model
+        self.min_sentence_length = min_sentence_length
+        self.importance_weight = importance_weight
+
+        # Initialize the embedding model
+        self._init_embedding_model()
+
+        logger.info("SemanticDeduplicator initialized with threshold: %.2f", threshold)
+
+    def _init_embedding_model(self):
+        """Initialize the embedding model based on the selected type."""
+        try:
+            from sentence_transformers import SentenceTransformer
+
+            # Choose a lightweight model for CPU efficiency
+            if self.embedding_model == "lightweight":
+                # MiniLM models are lightweight and efficient
+                self.model = SentenceTransformer('paraphrase-MiniLM-L3-v2')
+            else:
+                # Default to a balanced model
+                self.model = SentenceTransformer(self.embedding_model)
+
+            logger.info("Using embedding model: %s", self.model.get_sentence_embedding_dimension())
+        except ImportError:
+            logger.warning("SentenceTransformer not available, using numpy fallback (less accurate)")
+            self.model = None
+
+    def _get_embeddings(self, sentences: List[str]) -> np.ndarray:
+        """
+        Get embeddings for a list of sentences.
+
+        Args:
+            sentences: List of sentences to embed
+
+        Returns:
+            embeddings: Array of sentence embeddings
+        """
+        if not sentences:
+            return np.array([])
+
+        if self.model is not None:
+            # Use the sentence transformer if available
+            return self.model.encode(sentences, show_progress_bar=False)
+        else:
+            # Fallback to a simple Bag-of-Words approach
+            # This is much less accurate but works without dependencies
+            from sklearn.feature_extraction.text import TfidfVectorizer
+            vectorizer = TfidfVectorizer(max_features=5000)
+            return vectorizer.fit_transform(sentences).toarray()
+
+    def _compute_similarity_matrix(self, embeddings: np.ndarray) -> np.ndarray:
+        """
+        Compute pairwise similarity between embeddings.
+
+        Args:
+            embeddings: Array of sentence embeddings
+
+        Returns:
+            similarity_matrix: Matrix of pairwise similarities
+        """
+        # Return empty array for empty input
+        if embeddings.shape[0] == 0:
+            return np.array([])
+
+        # Compute cosine similarity
+        return cosine_similarity(embeddings)
+
+    def _deduplicate_sentences(
+        self,
+        sentences: List[str],
+        importances: Optional[List[float]] = None
+    ) -> List[int]:
+        """
+        Identify non-redundant sentence indices.
+
+        Args:
+            sentences: List of sentences to deduplicate
+            importances: Optional list of importance scores
+
+        Returns:
+            kept_indices: Indices of sentences to keep
+        """
+        if not sentences:
+            return []
+
+        # Filter out sentences that are too short
+        valid_indices = [i for i, s in enumerate(sentences) if len(s.split()) >= self.min_sentence_length]
+
+        if not valid_indices:
+            # If no sentences meet the min length, return all indices
+            return list(range(len(sentences)))
+
+        # Get embeddings for valid sentences
+        valid_sentences = [sentences[i] for i in valid_indices]
+        embeddings = self._get_embeddings(valid_sentences)
+
+        # Compute pairwise similarity
+        similarity_matrix = self._compute_similarity_matrix(embeddings)
+
+        # Set diagonal to 0 to avoid self-similarity
+        np.fill_diagonal(similarity_matrix, 0)
+
+        # Determine which sentences to keep
+        kept_indices = []
+        remaining_indices = set(range(len(valid_indices)))
+
+        # If importances are provided, start with most important sentences
+        if importances is not None:
+            valid_importances = [importances[i] for i in valid_indices]
+            ordered_indices = [i for i, _ in sorted(
+                enumerate(valid_importances),
+                key=lambda x: x[1],
+                reverse=True
+            )]
+        else:
+            # Otherwise, use sentence length as a simple importance proxy
+            ordered_indices = [i for i, _ in sorted(
+                enumerate(valid_sentences),
+                key=lambda x: len(x[1].split()),
+                reverse=True
+            )]
+
+        # Process sentences in order of importance
+        for idx in ordered_indices:
+            if idx not in remaining_indices:
+                continue
+
+            # Keep this sentence
+            kept_indices.append(valid_indices[idx])
+            remaining_indices.remove(idx)
+
+            # Remove similar sentences
+            similar_indices = [
+                i for i in remaining_indices
+                if similarity_matrix[idx, i] > self.threshold
+            ]
+
+            remaining_indices -= set(similar_indices)
+
+            # Break if we've processed all indices
+            if not remaining_indices:
+                break
+
+        # Add any remaining short sentences we skipped earlier
+        short_indices = [i for i, s in enumerate(sentences) if len(s.split()) < self.min_sentence_length]
+        kept_indices.extend(short_indices)
+
+        # Sort to maintain original order
+        return sorted(kept_indices)
+
+    def compress(self, content: str, target_size: Optional[int] = None) -> str:
+        """
+        Compress content by removing semantic duplicates.
+
+        Args:
+            content: The content to compress
+            target_size: Optional target size in tokens
+
+        Returns:
+            compressed_content: The compressed content
+        """
+        # Split content into sentences
+        sentences = split_into_sentences(content)
+
+        if not sentences:
+            return content
+
+        # Get sentence importance scores
+        importances = get_sentence_importance(sentences)
+
+        # Deduplicate sentences
+        kept_indices = self._deduplicate_sentences(sentences, importances)
+
+        # Combine kept sentences
+        kept_sentences = [sentences[i] for i in kept_indices]
+        compressed = " ".join(kept_sentences)
+
+        # If we need to compress further to meet target size
+        if target_size and len(compressed.split()) > target_size:
+            # Calculate how many more sentences to remove
+            current_size = len(compressed.split())
+            reduction_needed = current_size - target_size
+
+            # Sort sentences by importance (lowest first)
+            sentence_priorities = [(i, importances[i]) for i in kept_indices]
+            sorted_priorities = sorted(sentence_priorities, key=lambda x: x[1])
+
+            # Remove least important sentences until we meet target size
+            remove_count = 0
+            tokens_removed = 0
+            indices_to_remove = []
+
+            for idx, _ in sorted_priorities:
+                sentence_tokens = len(sentences[idx].split())
+                tokens_removed += sentence_tokens
+                remove_count += 1
+                indices_to_remove.append(idx)
+
+                if tokens_removed >= reduction_needed:
+                    break
+
+            # Remove the low-importance sentences
+            final_indices = [i for i in kept_indices if i not in indices_to_remove]
+
+            # Recombine
+            compressed = " ".join(sentences[i] for i in sorted(final_indices))
+
+        # Log compression stats
+        original_tokens = len(content.split())
+        compressed_tokens = len(compressed.split())
+        reduction = (1 - compressed_tokens / original_tokens) * 100 if original_tokens > 0 else 0
+
+        logger.info(
+            "Compressed from %d to %d tokens (%.1f%% reduction)",
+            original_tokens, compressed_tokens, reduction
+        )
+
+        return compressed
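A short, hedged demonstration of the deduplicator above. It assumes `sentence-transformers` is installed (otherwise the TF-IDF fallback shown in the code is used, with different results); the sample text is illustrative, and which of the two near-duplicates survives depends on the embedding model and the importance scores.

```python
from efficient_context.compression import SemanticDeduplicator

dedup = SemanticDeduplicator(threshold=0.85)

# Sentences are kept above the default min_sentence_length of 10 words so
# all three take part in deduplication.
text = (
    "Renewable energy comes from sources that are naturally replenished on a human timescale. "
    "Energy counts as renewable when its source is naturally replenished on a human timescale. "
    "Installation costs for solar and wind have both fallen sharply over the last ten years."
)

# The two near-duplicate sentences embed very similarly, so only one of
# them should survive compression.
print(dedup.compress(text))
```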
efficient_context/context_manager.py
ADDED
@@ -0,0 +1,169 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Core context management module for efficient-context library.
|
3 |
+
"""
|
4 |
+
|
5 |
+
from typing import List, Dict, Any, Optional, Union
|
6 |
+
import logging
|
7 |
+
from pydantic import BaseModel, Field
|
8 |
+
|
9 |
+
from efficient_context.compression.base import BaseCompressor
|
10 |
+
from efficient_context.chunking.base import BaseChunker
|
11 |
+
from efficient_context.retrieval.base import BaseRetriever
|
12 |
+
from efficient_context.memory.memory_manager import MemoryManager
|
13 |
+
|
14 |
+
# Set up logging
|
15 |
+
logging.basicConfig(level=logging.INFO)
|
16 |
+
logger = logging.getLogger(__name__)
|
17 |
+
|
18 |
+
class Document(BaseModel):
|
19 |
+
"""A document to be processed by the context manager."""
|
20 |
+
id: str = Field(..., description="Unique identifier for the document")
|
21 |
+
content: str = Field(..., description="Text content of the document")
|
22 |
+
metadata: Dict[str, Any] = Field(default_factory=dict, description="Optional metadata for the document")
|
23 |
+
|
24 |
+
class ContextManager:
|
25 |
+
"""
|
26 |
+
Main class for managing context efficiently for LLMs in CPU-constrained environments.
|
27 |
+
|
28 |
+
This class orchestrates the compression, chunking, retrieval, and memory management
|
29 |
+
components to optimize context handling for LLMs running on limited hardware.
|
30 |
+
"""
|
31 |
+
|
32 |
+
def __init__(
|
33 |
+
self,
|
34 |
+
compressor: Optional[BaseCompressor] = None,
|
35 |
+
chunker: Optional[BaseChunker] = None,
|
36 |
+
retriever: Optional[BaseRetriever] = None,
|
37 |
+
memory_manager: Optional[MemoryManager] = None,
|
38 |
+
max_context_size: int = 4096,
|
39 |
+
):
|
40 |
+
"""
|
41 |
+
Initialize the context manager with configurable components.
|
42 |
+
|
43 |
+
Args:
|
44 |
+
            compressor: Component for compressing context content
            chunker: Component for chunking content
            retriever: Component for retrieving relevant chunks
            memory_manager: Component for managing memory usage
            max_context_size: Maximum size of context in tokens
        """
        from efficient_context.compression import SemanticDeduplicator
        from efficient_context.chunking import SemanticChunker
        from efficient_context.retrieval import CPUOptimizedRetriever
        from efficient_context.memory import MemoryManager

        self.compressor = compressor or SemanticDeduplicator()
        self.chunker = chunker or SemanticChunker()
        self.retriever = retriever or CPUOptimizedRetriever()
        self.memory_manager = memory_manager or MemoryManager()
        self.max_context_size = max_context_size

        self.documents = {}
        self.chunks = []

        logger.info("Context Manager initialized with max context size: %d", max_context_size)

    def add_document(self, document: Union[Document, Dict, str], document_id: Optional[str] = None) -> str:
        """
        Add a document to the context manager.

        Args:
            document: Document to add (can be a Document object, dict, or string content)
            document_id: Optional ID for the document (generated if not provided)

        Returns:
            document_id: ID of the added document
        """
        # Convert input to Document object
        if isinstance(document, str):
            if document_id is None:
                import uuid
                document_id = str(uuid.uuid4())
            doc = Document(id=document_id, content=document)
        elif isinstance(document, dict):
            if 'id' in document:
                document_id = document['id']
            elif document_id is None:
                import uuid
                document_id = str(uuid.uuid4())

            doc = Document(
                id=document_id,
                content=document.get('content', ''),
                metadata=document.get('metadata', {})
            )
        else:
            doc = document
            document_id = doc.id

        # Store the document
        self.documents[document_id] = doc

        # Process the document
        with self.memory_manager.optimize_memory():
            # Compress the document
            compressed_content = self.compressor.compress(doc.content)

            # Chunk the compressed content
            doc_chunks = self.chunker.chunk(compressed_content, metadata=doc.metadata, document_id=doc.id)

            # Index the chunks for retrieval
            self.retriever.index_chunks(doc_chunks)

            # Store the chunks
            self.chunks.extend(doc_chunks)

        logger.info("Added document with ID %s (%d chunks)", document_id, len(doc_chunks))
        return document_id

    def add_documents(self, documents: List[Union[Document, Dict, str]]) -> List[str]:
        """
        Add multiple documents to the context manager.

        Args:
            documents: List of documents to add

        Returns:
            document_ids: List of IDs of added documents
        """
        document_ids = []
        for doc in documents:
            doc_id = self.add_document(doc)
            document_ids.append(doc_id)

        return document_ids

    def generate_context(self, query: str, max_size: Optional[int] = None) -> str:
        """
        Generate optimized context for a given query.

        Args:
            query: The query for which to generate context
            max_size: Maximum size of the context (defaults to self.max_context_size)

        Returns:
            context: Optimized context for the query
        """
        max_size = max_size or self.max_context_size

        with self.memory_manager.optimize_memory():
            # Retrieve relevant chunks
            relevant_chunks = self.retriever.retrieve(query, top_k=max_size)

            # Combine chunks into a context
            context_parts = [chunk.content for chunk in relevant_chunks]

            # Final compression to ensure we're within size limits
            combined_context = "\n\n".join(context_parts)
            if len(combined_context.split()) > max_size:
                combined_context = self.compressor.compress(combined_context, target_size=max_size)

        logger.info("Generated context of size ~%d tokens for query", len(combined_context.split()))
        return combined_context

    def clear(self):
        """Clear all documents and chunks from the context manager."""
        self.documents = {}
        self.chunks = []
        self.retriever.clear()
        logger.info("Context manager cleared")
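As the branches of add_document above show, a document can arrive in three shapes: a raw string (an ID is generated with uuid4), a dict (whose 'id', 'content', and 'metadata' keys are read), or a Document object (stored under its own id). A minimal sketch of the first two call forms, separate from the committed file; it assumes the constructor's other components fall back to their defaults as the `or` expressions above imply, and the IDs and text values are illustrative:

from efficient_context import ContextManager

cm = ContextManager(max_context_size=1024)

# Raw string: an ID is generated via uuid.uuid4() and returned.
id_a = cm.add_document("Renewable energy is replenished naturally.")

# Dict: 'id' is honored when present; 'content' and 'metadata' are read with defaults.
id_b = cm.add_document({
    "id": "doc-1",
    "content": "Wind power emits no carbon at the point of generation.",
    "metadata": {"source": "sketch"},
})
assert id_b == "doc-1"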
efficient_context/memory/__init__.py
ADDED
@@ -0,0 +1,7 @@
"""
Memory management components for efficient-context.
"""

from efficient_context.memory.memory_manager import MemoryManager

__all__ = ["MemoryManager"]
efficient_context/memory/memory_manager.py
ADDED
@@ -0,0 +1,134 @@
"""
Memory management utilities for efficient-context.
"""

import logging
import gc
import os
import psutil
from typing import Optional, Dict, Any
from contextlib import contextmanager

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class MemoryManager:
    """
    Manages memory usage for efficient context handling.

    This class provides utilities to monitor and optimize memory usage
    when working with large language models and context on CPU.
    """

    def __init__(
        self,
        target_usage_percent: float = 80.0,
        aggressive_cleanup: bool = False,
        memory_monitor_interval: Optional[float] = None,
    ):
        """
        Initialize the MemoryManager.

        Args:
            target_usage_percent: Target memory usage as percentage of available memory
            aggressive_cleanup: Whether to perform aggressive garbage collection
            memory_monitor_interval: Interval for memory monitoring in seconds (None to disable)
        """
        self.target_usage_percent = target_usage_percent
        self.aggressive_cleanup = aggressive_cleanup
        self.memory_monitor_interval = memory_monitor_interval
        self.monitor_active = False

        logger.info(
            "MemoryManager initialized with target usage: %.1f%%",
            target_usage_percent
        )

    def get_memory_usage(self) -> Dict[str, Any]:
        """
        Get current memory usage statistics.

        Returns:
            stats: Dictionary of memory usage statistics
        """
        # Get process memory info
        process = psutil.Process(os.getpid())
        process_memory = process.memory_info()

        # Get system memory info
        system_memory = psutil.virtual_memory()

        # Calculate usage percentages
        process_percent = (process_memory.rss / system_memory.total) * 100
        system_percent = system_memory.percent

        return {
            "process_rss_bytes": process_memory.rss,
            "process_vms_bytes": process_memory.vms,
            "process_percent": process_percent,
            "system_available_bytes": system_memory.available,
            "system_total_bytes": system_memory.total,
            "system_used_percent": system_percent,
        }

    def log_memory_usage(self) -> None:
        """Log memory usage statistics."""
        stats = self.get_memory_usage()

        logger.info(
            "Memory usage: Process: %.1f%% (%.1f MB), System: %.1f%% (%.1f GB available)",
            stats["process_percent"],
            stats["process_rss_bytes"] / (1024 * 1024),
            stats["system_used_percent"],
            stats["system_available_bytes"] / (1024 * 1024 * 1024)
        )

    def cleanup_memory(self) -> None:
        """Perform memory cleanup."""
        # Run garbage collection
        collected = gc.collect()

        if self.aggressive_cleanup:
            # Run an additional, more aggressive pass
            collected += gc.collect()

        logger.debug("Memory cleanup: Collected %d objects", collected)

    def _check_memory_threshold(self) -> bool:
        """
        Check if memory usage exceeds the target threshold.

        Returns:
            exceeded: Whether the threshold is exceeded
        """
        stats = self.get_memory_usage()
        return stats["system_used_percent"] > self.target_usage_percent

    @contextmanager
    def optimize_memory(self):
        """
        Context manager for optimizing memory during operations.

        Example:
            ```
            with memory_manager.optimize_memory():
                # Run memory-intensive operations
            ```
        """
        # Log initial memory state if in debug mode
        if logger.isEnabledFor(logging.DEBUG):
            self.log_memory_usage()

        try:
            # Yield control back to the caller
            yield
        finally:
            # Check if we need to clean up memory
            if self._check_memory_threshold():
                logger.info("Memory threshold exceeded, performing cleanup")
                self.cleanup_memory()

            # Log final memory state if in debug mode
            if logger.isEnabledFor(logging.DEBUG):
                self.log_memory_usage()
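A minimal standalone sketch of the MemoryManager API above, separate from the committed file; the bytearray workload is illustrative only:

from efficient_context.memory import MemoryManager

mm = MemoryManager(target_usage_percent=80.0, aggressive_cleanup=True)

# Snapshot of process and system memory, using the keys returned by get_memory_usage().
stats = mm.get_memory_usage()
print(f"Process RSS: {stats['process_rss_bytes'] / (1024 * 1024):.1f} MB")

# Guard a memory-intensive block; cleanup_memory() runs afterwards only if
# system usage exceeded target_usage_percent.
with mm.optimize_memory():
    buffers = [bytearray(1024 * 1024) for _ in range(64)]  # illustrative workload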
efficient_context/retrieval/__init__.py
ADDED
@@ -0,0 +1,8 @@
"""
Retrieval components for efficient-context.
"""

from efficient_context.retrieval.base import BaseRetriever
from efficient_context.retrieval.cpu_optimized_retriever import CPUOptimizedRetriever

__all__ = ["BaseRetriever", "CPUOptimizedRetriever"]
efficient_context/retrieval/base.py
ADDED
@@ -0,0 +1,40 @@
"""
Base classes for retrieval components.
"""

from abc import ABC, abstractmethod
from typing import List, Optional

from efficient_context.chunking.base import Chunk

class BaseRetriever(ABC):
    """Base class for content retrieval components."""

    @abstractmethod
    def index_chunks(self, chunks: List[Chunk]) -> None:
        """
        Index chunks for future retrieval.

        Args:
            chunks: Chunks to index
        """
        pass

    @abstractmethod
    def retrieve(self, query: str, top_k: Optional[int] = None) -> List[Chunk]:
        """
        Retrieve chunks relevant to a query.

        Args:
            query: Query to retrieve chunks for
            top_k: Number of chunks to retrieve

        Returns:
            chunks: List of retrieved chunks
        """
        pass

    @abstractmethod
    def clear(self) -> None:
        """Clear all indexed chunks."""
        pass
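BaseRetriever fixes the contract every retriever must satisfy: index_chunks, retrieve, and clear. A deliberately naive sketch of a custom implementation, separate from the commit; the class name is hypothetical, and it leans on the calculate_text_overlap helper from efficient_context/utils/text.py later in this commit as a stand-in similarity score:

from typing import List, Optional

from efficient_context.retrieval.base import BaseRetriever
from efficient_context.chunking.base import Chunk
from efficient_context.utils.text import calculate_text_overlap

class OverlapRetriever(BaseRetriever):
    """Toy retriever ranking chunks by token overlap with the query."""

    def __init__(self):
        self.chunks: List[Chunk] = []

    def index_chunks(self, chunks: List[Chunk]) -> None:
        self.chunks.extend(chunks)

    def retrieve(self, query: str, top_k: Optional[int] = None) -> List[Chunk]:
        top_k = top_k or 5
        # Rank by shared-token ratio between the query and each chunk's content.
        ranked = sorted(self.chunks,
                        key=lambda c: calculate_text_overlap(query, c.content),
                        reverse=True)
        return ranked[:top_k]

    def clear(self) -> None:
        self.chunks = []

Any such subclass can be dropped into ContextManager via its retriever parameter, since the manager only calls these three methods.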
efficient_context/retrieval/cpu_optimized_retriever.py
ADDED
@@ -0,0 +1,247 @@
"""
CPU-optimized retrieval for efficient context handling.
"""

import logging
import heapq
from typing import List, Dict, Any, Optional, Tuple, Union
import numpy as np

from efficient_context.retrieval.base import BaseRetriever
from efficient_context.chunking.base import Chunk

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class CPUOptimizedRetriever(BaseRetriever):
    """
    Retriever optimized for CPU performance and low memory usage.

    This retriever uses techniques to minimize computational requirements
    while still providing high-quality retrieval results.
    """

    def __init__(
        self,
        embedding_model: str = "lightweight",
        similarity_metric: str = "cosine",
        use_batching: bool = True,
        batch_size: int = 32,
        max_index_size: Optional[int] = None,
    ):
        """
        Initialize the CPUOptimizedRetriever.

        Args:
            embedding_model: Model to use for embeddings
            similarity_metric: Metric for comparing embeddings
            use_batching: Whether to batch embedding operations
            batch_size: Size of batches for embedding
            max_index_size: Maximum number of chunks to keep in the index
        """
        self.embedding_model = embedding_model
        self.similarity_metric = similarity_metric
        self.use_batching = use_batching
        self.batch_size = batch_size
        self.max_index_size = max_index_size

        # Initialize storage
        self.chunks = []
        self.chunk_embeddings = None
        self.chunk_ids_to_index = {}

        # Initialize the embedding model
        self._init_embedding_model()

        logger.info("CPUOptimizedRetriever initialized with model: %s", embedding_model)

    def _init_embedding_model(self):
        """Initialize the embedding model."""
        try:
            from sentence_transformers import SentenceTransformer

            # Choose a lightweight model for CPU efficiency
            if self.embedding_model == "lightweight":
                # MiniLM models are lightweight and efficient
                self.model = SentenceTransformer('paraphrase-MiniLM-L3-v2')
            else:
                # Default to a balanced model
                self.model = SentenceTransformer(self.embedding_model)

            logger.info("Using embedding model: %s", self.model.get_sentence_embedding_dimension())
        except ImportError:
            logger.warning("SentenceTransformer not available, using numpy fallback (less accurate)")
            self.model = None

    def _get_embeddings(self, texts: List[str]) -> np.ndarray:
        """
        Get embeddings for a list of texts.

        Args:
            texts: List of texts to embed

        Returns:
            embeddings: Array of text embeddings
        """
        if not texts:
            return np.array([])

        if self.model is not None:
            # Use the sentence transformer if available
            # Apply batching for memory efficiency
            if self.use_batching and len(texts) > self.batch_size:
                embeddings = []

                for i in range(0, len(texts), self.batch_size):
                    batch = texts[i:i+self.batch_size]
                    batch_embeddings = self.model.encode(
                        batch,
                        show_progress_bar=False,
                        convert_to_numpy=True
                    )
                    embeddings.append(batch_embeddings)

                return np.vstack(embeddings)
            else:
                return self.model.encode(texts, show_progress_bar=False)
        else:
            # Fallback to a simple Bag-of-Words approach
            from sklearn.feature_extraction.text import TfidfVectorizer
            vectorizer = TfidfVectorizer(max_features=5000)
            return vectorizer.fit_transform(texts).toarray()

    def _compute_similarities(self, query_embedding: np.ndarray, chunk_embeddings: np.ndarray) -> np.ndarray:
        """
        Compute similarities between query and chunk embeddings.

        Args:
            query_embedding: Embedding of the query
            chunk_embeddings: Embeddings of the chunks

        Returns:
            similarities: Array of similarity scores
        """
        if self.similarity_metric == "cosine":
            # Normalize the embeddings for cosine similarity
            query_norm = np.linalg.norm(query_embedding)
            if query_norm > 0:
                query_embedding = query_embedding / query_norm

            # Compute cosine similarity efficiently
            return np.dot(chunk_embeddings, query_embedding)
        elif self.similarity_metric == "dot":
            # Simple dot product
            return np.dot(chunk_embeddings, query_embedding)
        elif self.similarity_metric == "euclidean":
            # Negative Euclidean distance (higher is more similar)
            return -np.sqrt(np.sum((chunk_embeddings - query_embedding) ** 2, axis=1))
        else:
            # Default to cosine
            return np.dot(chunk_embeddings, query_embedding)

    def index_chunks(self, chunks: List[Chunk]) -> None:
        """
        Index chunks for future retrieval.

        Args:
            chunks: Chunks to index
        """
        if not chunks:
            return

        # Add new chunks
        for chunk in chunks:
            # Skip if chunk is already indexed
            if chunk.chunk_id in self.chunk_ids_to_index:
                continue

            self.chunks.append(chunk)
            self.chunk_ids_to_index[chunk.chunk_id] = len(self.chunks) - 1

        # Get embeddings for all chunks
        chunk_texts = [chunk.content for chunk in self.chunks]
        self.chunk_embeddings = self._get_embeddings(chunk_texts)

        # Apply dimensionality reduction if needed for memory efficiency
        if (self.max_index_size is not None and
            len(self.chunks) > self.max_index_size and
            self.model is not None):

            # Keep only the most recent chunks
            self.chunks = self.chunks[-self.max_index_size:]

            # Update the index mapping
            self.chunk_ids_to_index = {
                chunk.chunk_id: i for i, chunk in enumerate(self.chunks)
            }

            # Recalculate embeddings for the pruned set
            chunk_texts = [chunk.content for chunk in self.chunks]
            self.chunk_embeddings = self._get_embeddings(chunk_texts)

        # Normalize embeddings for cosine similarity
        if self.similarity_metric == "cosine" and self.chunk_embeddings is not None:
            # Compute norms of each embedding vector
            norms = np.linalg.norm(self.chunk_embeddings, axis=1, keepdims=True)

            # Avoid division by zero - normalize only where norm > 0
            non_zero_norms = norms > 0
            if np.any(non_zero_norms):
                # Directly normalize by dividing by norms (with keepdims=True, broadcasting works correctly)
                self.chunk_embeddings = np.where(
                    non_zero_norms,
                    self.chunk_embeddings / norms,
                    self.chunk_embeddings
                )

        logger.info("Indexed %d chunks (total: %d)", len(chunks), len(self.chunks))

    def retrieve(self, query: str, top_k: Optional[int] = None) -> List[Chunk]:
        """
        Retrieve chunks relevant to a query.

        Args:
            query: Query to retrieve chunks for
            top_k: Number of chunks to retrieve (default: 5)

        Returns:
            chunks: List of retrieved chunks
        """
        if not self.chunks:
            logger.warning("No chunks indexed for retrieval")
            return []

        if not query:
            logger.warning("Empty query provided")
            return []

        # Default top_k
        top_k = top_k or 5

        # Get query embedding
        query_embedding = self._get_embeddings([query])[0]

        # Compute similarities
        similarities = self._compute_similarities(query_embedding, self.chunk_embeddings)

        # Get indices of top-k most similar chunks
        if top_k >= len(similarities):
            top_indices = list(range(len(similarities)))
            top_indices.sort(key=lambda i: similarities[i], reverse=True)
        else:
            # More efficient partial sort for large indices
            top_indices = heapq.nlargest(top_k, range(len(similarities)), key=lambda i: similarities[i])

        # Get the corresponding chunks
        retrieved_chunks = [self.chunks[i] for i in top_indices]

        logger.info("Retrieved %d chunks for query", len(retrieved_chunks))
        return retrieved_chunks

    def clear(self) -> None:
        """Clear all indexed chunks."""
        self.chunks = []
        self.chunk_embeddings = None
        self.chunk_ids_to_index = {}
        logger.info("Cleared chunk index")
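A standalone sketch wiring this retriever to the SemanticChunker from earlier in this commit, separate from the committed files. It assumes sentence-transformers is installed so the lightweight MiniLM model loads (note the TF-IDF fallback refits per call, so query and chunk vectors would not share a space there); the sample text and IDs are illustrative:

from efficient_context.chunking import SemanticChunker
from efficient_context.retrieval import CPUOptimizedRetriever

chunker = SemanticChunker(chunk_size=128)
retriever = CPUOptimizedRetriever(embedding_model="lightweight", similarity_metric="cosine")

text = ("Solar and wind power are renewable sources. "
        "Fossil fuels are finite and emit greenhouse gases.")
# chunk() is called here with the same signature context_manager.py uses above.
chunks = chunker.chunk(text, metadata={"source": "sketch"}, document_id="doc-1")

retriever.index_chunks(chunks)
for chunk in retriever.retrieve("renewable power", top_k=2):
    print(chunk.content)

retriever.clear()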
efficient_context/utils/__init__.py
ADDED
@@ -0,0 +1,12 @@
"""
Utility functions for efficient-context.
"""

# Import utilities as needed
from efficient_context.utils.text import (
    split_into_sentences,
    get_sentence_importance,
    calculate_text_overlap
)

__all__ = ["split_into_sentences", "get_sentence_importance", "calculate_text_overlap"]
efficient_context/utils/text.py
ADDED
@@ -0,0 +1,120 @@
"""
Text processing utilities for the efficient-context library.
"""

import re
from typing import List, Dict, Any
import logging

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def split_into_sentences(text: str) -> List[str]:
    """
    Split text into sentences.

    Args:
        text: Text to split

    Returns:
        sentences: List of sentences
    """
    # Simple but effective sentence splitting
    # This handles most common sentence endings while preserving common abbreviations
    text = text.replace('\n', ' ')

    # Try to use NLTK if available for better sentence splitting
    try:
        import nltk
        try:
            return nltk.sent_tokenize(text)
        except Exception as e:
            logger.warning(f"NLTK sentence tokenizer error: {e}. Using fallback.")
            return _simple_sentence_split(text)
    except ImportError:
        logger.warning("NLTK not available, using fallback sentence splitter")
        return _simple_sentence_split(text)

def _simple_sentence_split(text: str) -> List[str]:
    """Fallback sentence splitter without dependencies."""
    # This is a simplified version, not as accurate as NLTK but works without dependencies
    # Handle common abbreviations to avoid splitting them
    for abbr in ['Mr.', 'Mrs.', 'Dr.', 'vs.', 'e.g.', 'i.e.', 'etc.']:
        text = text.replace(abbr, abbr.replace('.', '<POINT>'))

    # Split on sentence endings
    sentences = re.split(r'(?<=[.!?])\s+', text)

    # Restore abbreviations
    sentences = [s.replace('<POINT>', '.') for s in sentences]

    # Remove empty sentences
    return [s for s in sentences if s.strip()]

def get_sentence_importance(sentences: List[str]) -> List[float]:
    """
    Calculate importance scores for sentences based on heuristics.

    Args:
        sentences: List of sentences to score

    Returns:
        importances: List of importance scores (0.0 to 1.0)
    """
    # Simple heuristics for scoring sentence importance
    importances = []

    for sentence in sentences:
        score = 0.0
        words = sentence.split()

        # Longer sentences tend to be more informative (up to a point)
        length_score = min(len(words) / 20, 1.0)

        # Keywords suggest important content
        keyword_score = 0.0
        keywords = ['important', 'significant', 'key', 'critical', 'crucial',
                    'essential', 'main', 'major', 'primary', 'central',
                    'result', 'conclusion', 'finding', 'discovered', 'shows']

        for word in words:
            if word.lower() in keywords:
                keyword_score += 0.2

        keyword_score = min(keyword_score, 0.6)  # Cap keyword importance

        # Presence of numbers often indicates factual content
        number_score = 0.0
        if re.search(r'\d', sentence):
            number_score = 0.2

        # Combine scores
        score = 0.5 * length_score + 0.3 * keyword_score + 0.2 * number_score

        # Cap at 1.0
        importances.append(min(score, 1.0))

    return importances

def calculate_text_overlap(text1: str, text2: str) -> float:
    """
    Calculate simple text overlap between two strings.

    Args:
        text1: First text
        text2: Second text

    Returns:
        overlap_ratio: Ratio of shared tokens (0.0 to 1.0)
    """
    # Convert to sets of tokens
    tokens1 = set(text1.lower().split())
    tokens2 = set(text2.lower().split())

    # Calculate overlap
    if not tokens1 or not tokens2:
        return 0.0

    overlap = tokens1.intersection(tokens2)
    return len(overlap) / min(len(tokens1), len(tokens2))
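A quick demonstration of the three helpers above, separate from the committed file; the sample sentences are illustrative:

from efficient_context.utils.text import (
    split_into_sentences,
    get_sentence_importance,
    calculate_text_overlap,
)

text = "Dr. Smith reported a key result. Emissions fell 12% in 2023. The weather was nice."
sentences = split_into_sentences(text)  # 'Dr.' survives the fallback splitter

# Length, keyword hits (e.g. 'key'), and digits each raise the heuristic score.
for sentence, score in zip(sentences, get_sentence_importance(sentences)):
    print(f"{score:.2f}  {sentence}")

# Shared-token ratio relative to the smaller token set: {wind, power} over 3 tokens, i.e. ~0.67.
print(calculate_text_overlap("solar wind power", "wind power storage"))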
examples/basic_usage.py
ADDED
@@ -0,0 +1,92 @@
"""
Example usage of efficient-context library.
"""

import logging
from efficient_context import ContextManager
from efficient_context.compression import SemanticDeduplicator
from efficient_context.chunking import SemanticChunker
from efficient_context.retrieval import CPUOptimizedRetriever
from efficient_context.memory import MemoryManager

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def main():
    # Sample documents
    documents = [
        {
            "content": """
            Renewable energy is derived from natural sources that are replenished at a higher rate than they are consumed.
            Sunlight and wind, for example, are such sources that are constantly being replenished.
            Renewable energy resources exist over wide geographical areas, in contrast to fossil fuels,
            which are concentrated in a limited number of countries.

            Rapid deployment of renewable energy and energy efficiency technologies is resulting in significant
            energy security, climate change mitigation, and economic benefits.
            In international public opinion surveys there is strong support for promoting renewable sources
            such as solar power and wind power.

            While many renewable energy projects are large-scale, renewable technologies are also suited to rural
            and remote areas and developing countries, where energy is often crucial in human development.
            As most of the renewable energy technologies provide electricity, renewable energy is often deployed
            together with further electrification, which has several benefits: electricity can be converted to heat,
            can be converted into mechanical energy with high efficiency, and is clean at the point of consumption.
            """,
            "metadata": {"topic": "renewable energy", "source": "example"}
        },
        {
            "content": """
            Climate change mitigation consists of actions to limit global warming and its related effects.
            This involves reductions in human emissions of greenhouse gases (GHGs) as well as activities
            that reduce their concentration in the atmosphere.

            Fossil fuels account for more than 70% of GHG emissions. The energy sector contributes to global
            emissions, mainly through the burning of fossil fuels to generate electricity and heat,
            and through the use of gasoline and diesel to power vehicles.

            A transition to renewable energy is a key component of climate change mitigation. By replacing
            fossil fuel power plants with renewable energy sources, such as wind and solar, we can reduce
            the amount of greenhouse gases emitted into the atmosphere.

            Renewable energy can also play a role in adapting to climate change, for example by providing
            reliable power for cooling in increasingly hot regions, or by ensuring energy access in the
            aftermath of climate-related disasters.
            """,
            "metadata": {"topic": "climate change", "source": "example"}
        },
    ]

    # Initialize a context manager with custom strategies
    context_manager = ContextManager(
        compressor=SemanticDeduplicator(threshold=0.85),
        chunker=SemanticChunker(chunk_size=256),
        retriever=CPUOptimizedRetriever(embedding_model="lightweight"),
        memory_manager=MemoryManager(target_usage_percent=80.0),
        max_context_size=1024
    )

    # Add documents to the context manager
    document_ids = context_manager.add_documents(documents)

    # Query 1: Generate optimized context for a query
    query1 = "Tell me about the climate impact of renewable energy"
    print(f"\n\n=== QUERY: {query1} ===")
    optimized_context1 = context_manager.generate_context(query=query1)
    print(f"--- OPTIMIZED CONTEXT ({len(optimized_context1.split())} tokens) ---")
    print(optimized_context1)

    # Query 2: Different topic
    query2 = "How does renewable energy work in rural areas?"
    print(f"\n\n=== QUERY: {query2} ===")
    optimized_context2 = context_manager.generate_context(query=query2)
    print(f"--- OPTIMIZED CONTEXT ({len(optimized_context2.split())} tokens) ---")
    print(optimized_context2)

    # Example of using with an LLM (commented out since we don't have an actual LLM here)
    # response = your_llm_model.generate(prompt="Answer this question using the provided context.", context=optimized_context)
    # print(f"LLM Response: {response}")

if __name__ == "__main__":
    main()
examples/benchmark.py
ADDED
@@ -0,0 +1,209 @@
"""
Benchmarking script for efficient-context performance.
"""

import logging
import time
import argparse
import random
import string
import psutil
import os
import gc
from typing import List, Dict, Any

from efficient_context import ContextManager
from efficient_context.compression import SemanticDeduplicator
from efficient_context.chunking import SemanticChunker
from efficient_context.retrieval import CPUOptimizedRetriever

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def generate_random_text(words: int = 1000, paragraphs: int = 5) -> str:
    """
    Generate random text for benchmarking.

    Args:
        words: Number of words to generate
        paragraphs: Number of paragraphs to split the text into

    Returns:
        text: Generated random text
    """
    # List of common words for more realistic text
    common_words = [
        "the", "be", "to", "of", "and", "a", "in", "that", "have", "I",
        "it", "for", "not", "on", "with", "he", "as", "you", "do", "at",
        "this", "but", "his", "by", "from", "they", "we", "say", "her", "she",
        "or", "an", "will", "my", "one", "all", "would", "there", "their", "what",
        "so", "up", "out", "if", "about", "who", "get", "which", "go", "me",
        "renewable", "energy", "climate", "wind", "solar", "power", "change", "global",
        "sustainable", "resources", "efficiency", "emissions", "carbon", "technology"
    ]

    # Generate paragraphs
    result = []
    words_per_paragraph = words // paragraphs

    for i in range(paragraphs):
        paragraph_words = []
        for j in range(words_per_paragraph):
            # Occasionally add a random word for variety
            if random.random() < 0.1:
                word = ''.join(random.choice(string.ascii_lowercase) for _ in range(random.randint(3, 10)))
            else:
                word = random.choice(common_words)

            # Capitalize first word of sentence
            if j == 0 or paragraph_words[-1].endswith('.'):
                word = word.capitalize()

            # Add punctuation occasionally
            if j > 0 and j % random.randint(8, 15) == 0:
                word += '.'
            elif random.random() < 0.05:
                word += ','

            paragraph_words.append(word)

        # Ensure paragraph ends with period
        if not paragraph_words[-1].endswith('.'):
            paragraph_words[-1] += '.'

        result.append(' '.join(paragraph_words))

    return '\n\n'.join(result)

def get_memory_usage() -> Dict[str, Any]:
    """
    Get current memory usage.

    Returns:
        stats: Memory usage statistics
    """
    process = psutil.Process(os.getpid())
    memory_info = process.memory_info()

    return {
        "rss": memory_info.rss / (1024 * 1024),  # MB
        "vms": memory_info.vms / (1024 * 1024)   # MB
    }

def run_benchmark(
    num_documents: int = 10,
    words_per_document: int = 1000,
    num_queries: int = 5
) -> None:
    """
    Run a benchmark of efficient-context performance.

    Args:
        num_documents: Number of documents to process
        words_per_document: Number of words per document
        num_queries: Number of queries to run
    """
    logger.info(f"Starting benchmark with {num_documents} documents, {words_per_document} words each")

    # Initialize context manager
    context_manager = ContextManager(
        compressor=SemanticDeduplicator(threshold=0.85),
        chunker=SemanticChunker(chunk_size=256),
        retriever=CPUOptimizedRetriever(embedding_model="lightweight")
    )

    # Generate documents
    logger.info("Generating random documents...")
    documents = []
    for i in range(num_documents):
        content = generate_random_text(words=words_per_document, paragraphs=5)
        documents.append({
            "content": content,
            "metadata": {"id": f"doc-{i}", "source": "benchmark"}
        })

    # Measure document processing
    logger.info("Adding documents to context manager...")
    start_mem = get_memory_usage()
    start_time = time.time()

    document_ids = context_manager.add_documents(documents)

    end_time = time.time()
    end_mem = get_memory_usage()

    processing_time = end_time - start_time
    memory_increase = end_mem["rss"] - start_mem["rss"]

    logger.info(f"Document processing:")
    logger.info(f" - Time: {processing_time:.2f} seconds")
    logger.info(f" - Average per document: {processing_time / num_documents:.4f} seconds")
    logger.info(f" - Memory usage increase: {memory_increase:.2f} MB")
    logger.info(f" - Total chunks created: {len(context_manager.chunks)}")

    # Generate random queries
    logger.info("Generating context for queries...")
    queries = [
        f"Explain {random.choice(['renewable', 'sustainable', 'clean', 'alternative'])} energy",
        f"What are the {random.choice(['benefits', 'advantages', 'impacts', 'effects'])} of {random.choice(['solar', 'wind', 'hydro', 'geothermal'])} power?",
        f"How does {random.choice(['climate change', 'global warming', 'carbon emissions', 'greenhouse gases'])} affect the environment?",
        f"Discuss the {random.choice(['future', 'potential', 'limitations', 'challenges'])} of renewable energy",
        f"What is the {random.choice(['relationship', 'connection', 'link', 'correlation'])} between energy consumption and climate change?"
    ]

    # Ensure we have enough queries
    while len(queries) < num_queries:
        queries.append(f"Tell me about {random.choice(['energy', 'climate', 'sustainability', 'emissions'])}")

    # Select the requested number of queries
    selected_queries = random.sample(queries, min(num_queries, len(queries)))

    # Measure query processing
    total_query_time = 0
    total_query_tokens = 0

    for i, query in enumerate(selected_queries):
        # Clear some memory and cache before each query
        gc.collect()

        start_time = time.time()
        context = context_manager.generate_context(query)
        query_time = time.time() - start_time
        context_tokens = len(context.split())

        total_query_time += query_time
        total_query_tokens += context_tokens

        logger.info(f"Query {i+1}: '{query}'")
        logger.info(f" - Time: {query_time:.4f} seconds")
        logger.info(f" - Context size: {context_tokens} tokens")

    avg_query_time = total_query_time / num_queries
    avg_tokens = total_query_tokens / num_queries

    logger.info("\nBenchmark Summary:")
    logger.info(f" - Documents processed: {num_documents} ({words_per_document} words each)")
    logger.info(f" - Queries executed: {num_queries}")
    logger.info(f" - Document processing time: {processing_time:.2f} seconds ({processing_time / num_documents:.4f}s per document)")
    logger.info(f" - Average query time: {avg_query_time:.4f} seconds")
    logger.info(f" - Average context size: {avg_tokens:.1f} tokens")
    logger.info(f" - Final memory usage: {get_memory_usage()['rss']:.2f} MB")

def main():
    """Main function for the benchmark script."""
    parser = argparse.ArgumentParser(description="Benchmark efficient-context performance")
    parser.add_argument("--documents", type=int, default=10, help="Number of documents to process")
    parser.add_argument("--words", type=int, default=1000, help="Words per document")
    parser.add_argument("--queries", type=int, default=5, help="Number of queries to run")

    args = parser.parse_args()

    run_benchmark(
        num_documents=args.documents,
        words_per_document=args.words,
        num_queries=args.queries
    )

if __name__ == "__main__":
    main()
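The script needs only the library itself; the argparse flags above map one-to-one onto run_benchmark's parameters, so a typical invocation from the repository root (the values shown are arbitrary) is: python examples/benchmark.py --documents 20 --words 2000 --queries 5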
examples/dedup_benchmark.py
ADDED
@@ -0,0 +1,214 @@
#!/usr/bin/env python
"""
Specialized benchmark script for measuring the effectiveness of semantic deduplication
in the efficient-context library.
"""

import logging
import time
import argparse
import sys
from typing import List, Dict, Any

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
logger.info("Deduplication benchmark starting")

# Print Python and environment information
import platform
logger.info(f"Python version: {platform.python_version()}")
logger.info(f"Platform: {platform.platform()}")

# Import the library
try:
    from efficient_context import ContextManager
    from efficient_context.compression import SemanticDeduplicator
    from efficient_context.chunking import SemanticChunker
    from efficient_context.retrieval import CPUOptimizedRetriever
    logger.info("Successfully imported efficient_context")
except ImportError as e:
    logger.error(f"Failed to import efficient_context: {e}")
    sys.exit(1)

def generate_repetitive_document() -> str:
    """
    Generate a document with deliberate semantic repetition.
    The document will contain sentences that mean the same thing
    expressed in different ways.
    """
    # Base paragraphs with distinct topics
    base_paragraphs = [
        # Climate change paragraph with repetitive content
        """
        Climate change is a significant and lasting alteration in the statistical distribution of weather
        patterns over periods ranging from decades to millions of years. Global warming is the long-term
        heating of Earth's climate system observed since the pre-industrial period due to human activities.
        The rise in global temperature is causing substantial changes in our environment and ecosystems.
        The warming of the planet is leading to significant transformations in weather patterns worldwide.
        Human activities are causing Earth's temperature to increase, resulting in climate modifications.
        The climate crisis is fundamentally altering the Earth's atmosphere and affecting all living things.
        """,

        # Renewable energy paragraph with repetitive content
        """
        Renewable energy comes from sources that are naturally replenishing but flow-limited.
        Clean energy is derived from natural processes that are constantly replenished.
        Sustainable power is generated from resources that won't deplete over time.
        Green energy utilizes sources that don't produce pollution when generating power.
        Alternative energy refers to sources that are an alternative to fossil fuel.
        Eco-friendly power generation relies on inexhaustible natural resources.
        """,

        # Technology paragraph with repetitive content
        """
        Artificial intelligence is revolutionizing how we interact with technology.
        Machine learning is transforming the way computers process information.
        AI is fundamentally changing our relationship with digital systems.
        Smart algorithms are reshaping our technological landscape dramatically.
        Computational intelligence is altering how machines solve complex problems.
        Neural networks are revolutionizing the capabilities of modern computers.
        """
    ]

    # Repeat the paragraphs to create a longer document
    document = "\n\n".join(base_paragraphs * 3)
    return document

def generate_mixed_document() -> str:
    """
    Generate a document with a mix of repetitive and unique content.
    """
    repetitive = generate_repetitive_document()

    unique = """
    Energy efficiency is the goal to reduce the amount of energy required to provide products and services.
    For example, insulating a home allows a building to use less heating and cooling energy to achieve and
    maintain a comfortable temperature. Installing LED bulbs, fluorescent lighting, or natural skylights reduces
    the amount of energy required to attain the same level of illumination compared with using traditional
    incandescent light bulbs. Improvements in energy efficiency are generally achieved by adopting a more
    efficient technology or production process or by application of commonly accepted methods to reduce energy
    losses.

    Biodiversity is the variety and variability of life on Earth. It is typically a measure of variation at the
    genetic, species, and ecosystem level. Terrestrial biodiversity is usually greater near the equator, which is
    the result of the warm climate and high primary productivity. Biodiversity is not distributed evenly on Earth,
    and is richer in the tropics. These tropical forest ecosystems cover less than 10% of earth's surface, and
    contain about 90% of the world's species. Marine biodiversity is usually highest along coasts in the Western
    Pacific, where sea surface temperature is highest, and in the mid-latitudinal band in all oceans.
    """

    return repetitive + "\n\n" + unique

def run_deduplication_benchmark() -> None:
    """
    Run a benchmark specifically testing the semantic deduplication capabilities.
    """
    logger.info("Starting deduplication benchmark")

    # Initialize context manager with various thresholds
    thresholds = [0.7, 0.8, 0.85, 0.9, 0.95]
    results = []

    # Create documents
    repetitive_doc = generate_repetitive_document()
    mixed_doc = generate_mixed_document()

    logger.info(f"Repetitive document size: {len(repetitive_doc.split())} words")
    logger.info(f"Mixed document size: {len(mixed_doc.split())} words")

    for threshold in thresholds:
        logger.info(f"\nTesting with threshold: {threshold}")

        # Create a fresh context manager with the current threshold
        context_manager = ContextManager(
            compressor=SemanticDeduplicator(threshold=threshold),
            chunker=SemanticChunker(chunk_size=256),
            retriever=CPUOptimizedRetriever(embedding_model="lightweight")
        )

        # Test with repetitive document
        logger.info("Processing repetitive document...")
        start_time = time.time()
        doc_id = context_manager.add_document(repetitive_doc)
        processing_time = time.time() - start_time

        # Generate context with a relevant query to see compression in action
        query = "Tell me about climate change and renewable energy"
        start_time = time.time()
        context = context_manager.generate_context(query)
        query_time = time.time() - start_time

        # Record result
        result = {
            "threshold": threshold,
            "document_type": "repetitive",
            "original_size": len(repetitive_doc.split()),
            "context_size": len(context.split()),
            "processing_time": processing_time,
            "query_time": query_time,
            "chunks": len(context_manager.chunks)
        }
        results.append(result)
        logger.info(f" - Original size: {result['original_size']} words")
        logger.info(f" - Context size: {result['context_size']} words")
        logger.info(f" - Compression ratio: {result['context_size'] / result['original_size']:.2f}")
        logger.info(f" - Processing time: {result['processing_time']:.4f} seconds")
        logger.info(f" - Query time: {result['query_time']:.4f} seconds")

        # Reset the context manager
        context_manager = ContextManager(
            compressor=SemanticDeduplicator(threshold=threshold),
            chunker=SemanticChunker(chunk_size=256),
            retriever=CPUOptimizedRetriever(embedding_model="lightweight")
        )

        # Test with mixed document
        logger.info("Processing mixed document...")
        start_time = time.time()
        doc_id = context_manager.add_document(mixed_doc)
        processing_time = time.time() - start_time

        # Generate context with a relevant query
        query = "Tell me about climate change and biodiversity"
        start_time = time.time()
        context = context_manager.generate_context(query)
        query_time = time.time() - start_time

        # Record result
        result = {
            "threshold": threshold,
            "document_type": "mixed",
            "original_size": len(mixed_doc.split()),
            "context_size": len(context.split()),
            "processing_time": processing_time,
            "query_time": query_time,
            "chunks": len(context_manager.chunks)
        }
        results.append(result)
        logger.info(f" - Original size: {result['original_size']} words")
        logger.info(f" - Context size: {result['context_size']} words")
        logger.info(f" - Compression ratio: {result['context_size'] / result['original_size']:.2f}")
        logger.info(f" - Processing time: {result['processing_time']:.4f} seconds")
        logger.info(f" - Query time: {result['query_time']:.4f} seconds")

    # Print summary
    logger.info("\nDeduplication Benchmark Summary:")
    logger.info("-----------------------------------")

    logger.info("\nRepetitive Document Results:")
    for result in [r for r in results if r["document_type"] == "repetitive"]:
        logger.info(f"Threshold {result['threshold']}: {result['context_size'] / result['original_size']:.2f} compression ratio, {result['processing_time']:.4f}s processing time")

    logger.info("\nMixed Document Results:")
    for result in [r for r in results if r["document_type"] == "mixed"]:
        logger.info(f"Threshold {result['threshold']}: {result['context_size'] / result['original_size']:.2f} compression ratio, {result['processing_time']:.4f}s processing time")

def main():
    """Main function for the deduplication benchmark script."""
    parser = argparse.ArgumentParser(description="Benchmark efficient-context's semantic deduplication")

    args = parser.parse_args()
    run_deduplication_benchmark()

if __name__ == "__main__":
    main()
examples/dedup_eval.py
ADDED
@@ -0,0 +1,114 @@
#!/usr/bin/env python
"""
Advanced test for efficient-context's deduplication capabilities
"""

import time
import logging

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Import the library
from efficient_context import ContextManager
from efficient_context.compression import SemanticDeduplicator
from efficient_context.chunking import SemanticChunker
from efficient_context.retrieval import CPUOptimizedRetriever

def create_repetitive_document():
    """Create a document with highly repetitive semantic content"""

    # Create repetitive paragraphs with the same semantic meaning expressed differently
    paragraphs = []

    # Climate change variations
    climate_variations = [
        "Climate change is a significant alteration in global weather patterns over extended periods.",
        "Global warming refers to the long-term increase in Earth's average temperature.",
        "The climate crisis is causing significant shifts in temperature and precipitation patterns worldwide.",
        "Rising global temperatures lead to fundamental changes in our planet's climate systems.",
        "Human-induced warming of the Earth's atmosphere is resulting in climate destabilization."
    ]
    paragraphs.extend(climate_variations)

    # Renewable energy variations
    energy_variations = [
        "Renewable energy comes from natural sources that are constantly replenished.",
        "Clean energy technologies harness power from sustainable, non-depleting resources.",
        "Green power is generated from environmentally friendly, renewable sources.",
        "Sustainable energy is derived from resources that don't run out over time.",
        "Alternative energy refers to power sources that are alternatives to fossil fuels."
    ]
    paragraphs.extend(energy_variations)

    # Add some unique content as well
    unique_content = [
        "Machine learning algorithms require significant computational resources to train effectively.",
        "Biodiversity loss is accelerating at an unprecedented rate due to human activities.",
        "Quantum computing may revolutionize cryptography and computational chemistry."
    ]
    paragraphs.extend(unique_content)

    # Repeat the document to make it longer and more repetitive
    document = "\n\n".join(paragraphs * 3)  # Repeat 3 times
    return document

def run_deduplication_test():
    """Test the semantic deduplication capabilities"""
    logger.info("Running semantic deduplication test")

    # Create a highly repetitive document
    document = create_repetitive_document()
    logger.info(f"Document size: {len(document.split())} words")

    # Test with different threshold values
    thresholds = [0.7, 0.8, 0.85, 0.9, 0.95]

    for threshold in thresholds:
        logger.info(f"\nTesting threshold: {threshold}")

        # Create context manager with current threshold
        cm = ContextManager(
            compressor=SemanticDeduplicator(threshold=threshold),
            chunker=SemanticChunker(chunk_size=200),
            retriever=CPUOptimizedRetriever(embedding_model="lightweight")
        )

        # Add document and measure processing time
        start = time.time()
        doc_id = cm.add_document(document)
        processing_time = time.time() - start

        # Generate context for a relevant query
        query = "Explain the relationship between climate change and renewable energy"
        start = time.time()
        context = cm.generate_context(query)
        query_time = time.time() - start

        # Calculate metrics
        original_size = len(document.split())
        context_size = len(context.split())
        compression_ratio = context_size / original_size

        # Report results
        logger.info(f"Results for threshold {threshold}:")
        logger.info(f" - Original document: {original_size} words")
        logger.info(f" - Context generated: {context_size} words")
        logger.info(f" - Compression ratio: {compression_ratio:.2f}")
        logger.info(f" - Chunks created: {len(cm.chunks)}")
        logger.info(f" - Processing time: {processing_time:.4f} seconds")
        logger.info(f" - Query time: {query_time:.4f} seconds")

        # Print a preview of the context
        logger.info(f" - Context preview: {context[:150]}...")

if __name__ == "__main__":
    try:
        print("Starting deduplication evaluation...")
        run_deduplication_test()
        print("Evaluation completed successfully")
    except Exception as e:
        print(f"Error during evaluation: {e}")
        import traceback
        traceback.print_exc()
examples/dedup_test.py
ADDED
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""
Manual benchmark for the SemanticDeduplicator component.
"""

import sys
import logging
from efficient_context.compression import SemanticDeduplicator

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def main():
    print("Testing SemanticDeduplicator")

    # Create a repetitive document with semantically similar sentences
    repetitive_text = """
    Climate change is a significant global challenge.
    Global warming is affecting ecosystems worldwide.
    The Earth's temperature is rising due to human activities.
    Climate change poses a serious threat to our planet.
    Rising global temperatures are causing environmental problems.

    Renewable energy is key to a sustainable future.
    Clean energy sources help reduce carbon emissions.
    Sustainable power generation is vital for fighting climate change.
    Green energy technologies are becoming more affordable.
    Renewable resources provide alternatives to fossil fuels.
    """

    print(f"Original text length: {len(repetitive_text.split())} words")

    # Test with different thresholds
    for threshold in [0.7, 0.8, 0.85, 0.9, 0.95]:
        print(f"\nTesting threshold: {threshold}")

        deduplicator = SemanticDeduplicator(threshold=threshold)

        # Apply deduplication
        compressed_text = deduplicator.compress(repetitive_text)

        print(f"Compressed text length: {len(compressed_text.split())} words")
        print(f"Compression ratio: {len(compressed_text.split()) / len(repetitive_text.split()):.2f}")

        # Print the first 100 characters of the compressed text
        print(f"Compressed text (preview): {compressed_text[:100]}...")

if __name__ == "__main__":
    main()
examples/deduplication_benchmark.py
ADDED
@@ -0,0 +1,277 @@
#!/usr/bin/env python
"""
Specialized benchmark script for measuring the effectiveness of semantic deduplication
in the efficient-context library.
"""

import logging
import time
import argparse
import sys
from typing import List, Dict, Any

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Import the library
try:
    from efficient_context import ContextManager
    from efficient_context.compression import SemanticDeduplicator
    from efficient_context.chunking import SemanticChunker
    from efficient_context.retrieval import CPUOptimizedRetriever
except ImportError as e:
    logger.error(f"Failed to import efficient_context: {e}")
    sys.exit(1)

def generate_repetitive_document() -> str:
    """
    Generate a document with deliberate semantic repetition.
    The document will contain sentences that mean the same thing
    expressed in different ways.
    """
    # Base paragraphs with distinct topics
    base_paragraphs = [
        # Climate change paragraph with repetitive content
        """
        Climate change is a significant and lasting alteration in the statistical distribution of weather
        patterns over periods ranging from decades to millions of years. Global warming is the long-term
        heating of Earth's climate system observed since the pre-industrial period due to human activities.
        The rise in global temperature is causing substantial changes in our environment and ecosystems.
        The warming of the planet is leading to significant transformations in weather patterns worldwide.
        Human activities are causing Earth's temperature to increase, resulting in climate modifications.
        The climate crisis is fundamentally altering the Earth's atmosphere and affecting all living things.
        """,

        # Renewable energy paragraph with repetitive content
        """
        Renewable energy comes from sources that are naturally replenishing but flow-limited.
        Clean energy is derived from natural processes that are constantly replenished.
        Sustainable power is generated from resources that won't deplete over time.
        Green energy utilizes sources that don't produce pollution when generating power.
        Alternative energy refers to sources that are an alternative to fossil fuel.
        Eco-friendly power generation relies on inexhaustible natural resources.
        """,

        # Technology paragraph with repetitive content
        """
        Artificial intelligence is revolutionizing how we interact with technology.
        Machine learning is transforming the way computers process information.
        AI is fundamentally changing our relationship with digital systems.
        Smart algorithms are reshaping our technological landscape dramatically.
        Computational intelligence is altering how machines solve complex problems.
        Neural networks are revolutionizing the capabilities of modern computers.
        """
    ]

    # Repeat the paragraphs to create a longer document
    document = "\n\n".join(base_paragraphs * 3)
    return document

def generate_mixed_document() -> str:
    """
    Generate a document with a mix of repetitive and unique content.
    """
    repetitive = generate_repetitive_document()

    unique = """
    Energy efficiency is the goal to reduce the amount of energy required to provide products and services.
    For example, insulating a home allows a building to use less heating and cooling energy to achieve and
    maintain a comfortable temperature. Installing LED bulbs, fluorescent lighting, or natural skylights reduces
    the amount of energy required to attain the same level of illumination compared with using traditional
    incandescent light bulbs. Improvements in energy efficiency are generally achieved by adopting a more
    efficient technology or production process or by application of commonly accepted methods to reduce energy
    losses.

    Biodiversity is the variety and variability of life on Earth. It is typically a measure of variation at the
    genetic, species, and ecosystem level. Terrestrial biodiversity is usually greater near the equator, which is
    the result of the warm climate and high primary productivity. Biodiversity is not distributed evenly on Earth,
    and is richer in the tropics. These tropical forest ecosystems cover less than 10% of earth's surface, and
    contain about 90% of the world's species. Marine biodiversity is usually highest along coasts in the Western
    Pacific, where sea surface temperature is highest, and in the mid-latitudinal band in all oceans.
    """

    return repetitive + "\n\n" + unique

def generate_repetitive_document() -> str:
    """
    Generate a document with deliberate semantic repetition.
    The document will contain sentences that mean the same thing
    expressed in different ways.
    """
    # Base paragraphs with distinct topics
    base_paragraphs = [
        # Climate change paragraph with repetitive content
        """
        Climate change is a significant and lasting alteration in the statistical distribution of weather
        patterns over periods ranging from decades to millions of years. Global warming is the long-term
        heating of Earth's climate system observed since the pre-industrial period due to human activities.
        The rise in global temperature is causing substantial changes in our environment and ecosystems.
        The warming of the planet is leading to significant transformations in weather patterns worldwide.
        Human activities are causing Earth's temperature to increase, resulting in climate modifications.
        The climate crisis is fundamentally altering the Earth's atmosphere and affecting all living things.
        """,

        # Renewable energy paragraph with repetitive content
        """
        Renewable energy comes from sources that are naturally replenishing but flow-limited.
        Clean energy is derived from natural processes that are constantly replenished.
        Sustainable power is generated from resources that won't deplete over time.
        Green energy utilizes sources that don't produce pollution when generating power.
        Alternative energy refers to sources that are an alternative to fossil fuel.
        Eco-friendly power generation relies on inexhaustible natural resources.
        """,

        # Technology paragraph with repetitive content
        """
        Artificial intelligence is revolutionizing how we interact with technology.
        Machine learning is transforming the way computers process information.
        AI is fundamentally changing our relationship with digital systems.
        Smart algorithms are reshaping our technological landscape dramatically.
        Computational intelligence is altering how machines solve complex problems.
        Neural networks are revolutionizing the capabilities of modern computers.
        """
    ]

    # Repeat the paragraphs to create a longer document
    document = "\n\n".join(base_paragraphs * 3)
    return document

def generate_mixed_document() -> str:
    """
    Generate a document with a mix of repetitive and unique content.
    """
    repetitive = generate_repetitive_document()

    unique = """
    Energy efficiency is the goal to reduce the amount of energy required to provide products and services.
    For example, insulating a home allows a building to use less heating and cooling energy to achieve and
    maintain a comfortable temperature. Installing LED bulbs, fluorescent lighting, or natural skylights reduces
    the amount of energy required to attain the same level of illumination compared with using traditional
    incandescent light bulbs. Improvements in energy efficiency are generally achieved by adopting a more
    efficient technology or production process or by application of commonly accepted methods to reduce energy
    losses.

    Biodiversity is the variety and variability of life on Earth. It is typically a measure of variation at the
    genetic, species, and ecosystem level. Terrestrial biodiversity is usually greater near the equator, which is
    the result of the warm climate and high primary productivity. Biodiversity is not distributed evenly on Earth,
    and is richer in the tropics. These tropical forest ecosystems cover less than 10% of earth's surface, and
    contain about 90% of the world's species. Marine biodiversity is usually highest along coasts in the Western
    Pacific, where sea surface temperature is highest, and in the mid-latitudinal band in all oceans.
    """

    return repetitive + "\n\n" + unique

def run_deduplication_benchmark() -> None:
    """
    Run a benchmark specifically testing the semantic deduplication capabilities.
    """
    logger.info("Starting deduplication benchmark")

    # Initialize context manager with various thresholds
    thresholds = [0.7, 0.8, 0.85, 0.9, 0.95]
    results = []

    # Create documents
    repetitive_doc = generate_repetitive_document()
    mixed_doc = generate_mixed_document()

    logger.info(f"Repetitive document size: {len(repetitive_doc.split())} words")
    logger.info(f"Mixed document size: {len(mixed_doc.split())} words")

    for threshold in thresholds:
        logger.info(f"\nTesting with threshold: {threshold}")

        # Create a fresh context manager with the current threshold
        context_manager = ContextManager(
            compressor=SemanticDeduplicator(threshold=threshold),
            chunker=SemanticChunker(chunk_size=256),
            retriever=CPUOptimizedRetriever(embedding_model="lightweight")
        )

        # Test with repetitive document
        logger.info("Processing repetitive document...")
        start_time = time.time()
        doc_id = context_manager.add_document(repetitive_doc)
        processing_time = time.time() - start_time

        # Generate context with a relevant query to see compression in action
        query = "Tell me about climate change and renewable energy"
        start_time = time.time()
        context = context_manager.generate_context(query)
        query_time = time.time() - start_time

        # Record result
        result = {
            "threshold": threshold,
            "document_type": "repetitive",
            "original_size": len(repetitive_doc.split()),
            "context_size": len(context.split()),
            "processing_time": processing_time,
            "query_time": query_time,
            "chunks": len(context_manager.chunks)
        }
        results.append(result)
        logger.info(f"  - Original size: {result['original_size']} words")
        logger.info(f"  - Context size: {result['context_size']} words")
        logger.info(f"  - Compression ratio: {result['context_size'] / result['original_size']:.2f}")
        logger.info(f"  - Processing time: {result['processing_time']:.4f} seconds")
        logger.info(f"  - Query time: {result['query_time']:.4f} seconds")

        # Reset the context manager
        context_manager = ContextManager(
            compressor=SemanticDeduplicator(threshold=threshold),
            chunker=SemanticChunker(chunk_size=256),
            retriever=CPUOptimizedRetriever(embedding_model="lightweight")
        )

        # Test with mixed document
        logger.info("Processing mixed document...")
        start_time = time.time()
        doc_id = context_manager.add_document(mixed_doc)
        processing_time = time.time() - start_time

        # Generate context with a relevant query
        query = "Tell me about climate change and biodiversity"
        start_time = time.time()
        context = context_manager.generate_context(query)
        query_time = time.time() - start_time

        # Record result
        result = {
            "threshold": threshold,
            "document_type": "mixed",
            "original_size": len(mixed_doc.split()),
            "context_size": len(context.split()),
            "processing_time": processing_time,
            "query_time": query_time,
            "chunks": len(context_manager.chunks)
        }
        results.append(result)
        logger.info(f"  - Original size: {result['original_size']} words")
        logger.info(f"  - Context size: {result['context_size']} words")
        logger.info(f"  - Compression ratio: {result['context_size'] / result['original_size']:.2f}")
        logger.info(f"  - Processing time: {result['processing_time']:.4f} seconds")
        logger.info(f"  - Query time: {result['query_time']:.4f} seconds")

    # Print summary
    logger.info("\nDeduplication Benchmark Summary:")
    logger.info("-----------------------------------")

    logger.info("\nRepetitive Document Results:")
    for result in [r for r in results if r["document_type"] == "repetitive"]:
        logger.info(f"Threshold {result['threshold']}: {result['context_size'] / result['original_size']:.2f} compression ratio, {result['processing_time']:.4f}s processing time")

    logger.info("\nMixed Document Results:")
    for result in [r for r in results if r["document_type"] == "mixed"]:
        logger.info(f"Threshold {result['threshold']}: {result['context_size'] / result['original_size']:.2f} compression ratio, {result['processing_time']:.4f}s processing time")

def main():
    """Main function for the deduplication benchmark script."""
    parser = argparse.ArgumentParser(description="Benchmark efficient-context's semantic deduplication")

    args = parser.parse_args()
    run_deduplication_benchmark()

if __name__ == "__main__":
    main()
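Note that `main()` above defines a parser but no options, so every run sweeps the full threshold list. A minimal, hypothetical extension (not part of this commit) could expose a single threshold as a CLI flag:

```python
import argparse

# Hypothetical sketch: expose the deduplication threshold as a command-line option.
parser = argparse.ArgumentParser(description="Benchmark efficient-context's semantic deduplication")
parser.add_argument("--threshold", type=float, default=0.85,
                    help="similarity threshold for SemanticDeduplicator")
args = parser.parse_args()
# run_deduplication_benchmark() could then be parameterized to use args.threshold.
```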
examples/demo_notebook.ipynb
ADDED
File without changes
examples/llm_integration.py
ADDED
@@ -0,0 +1,164 @@
"""
Example of integrating efficient-context with a lightweight LLM.
"""

import logging
import time
from typing import List, Dict, Any, Optional

from efficient_context import ContextManager
from efficient_context.compression import SemanticDeduplicator
from efficient_context.chunking import SemanticChunker
from efficient_context.retrieval import CPUOptimizedRetriever

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class LightweightLLM:
    """
    A simple wrapper for a lightweight LLM.

    This is a placeholder that would be replaced with an actual
    lightweight LLM implementation in a real application.
    """

    def __init__(self, model_name: str = "tiny-llm"):
        """
        Initialize the lightweight LLM.

        Args:
            model_name: Name of the model to use
        """
        self.model_name = model_name
        logger.info(f"Initialized LightweightLLM with model: {model_name}")

        # This would be where you'd load your model in a real implementation
        logger.info("Note: This is a placeholder class for demonstration purposes")

    def generate(
        self,
        prompt: str,
        context: Optional[str] = None,
        max_tokens: int = 512
    ) -> str:
        """
        Generate text using the LLM.

        Args:
            prompt: The prompt for generation
            context: Optional context to condition the generation
            max_tokens: Maximum number of tokens to generate

        Returns:
            response: Generated text response
        """
        # This is a fake implementation for demonstration
        # In a real application, you'd call your LLM here

        logger.info(f"Generating response with context size: {len(context.split()) if context else 0} tokens")

        # Simulate generation time based on context size
        if context:
            time.sleep(0.001 * len(context.split()))  # Simulate processing time

            # Simple keyword detection for demo purposes
            if "renewable energy" in context and "climate" in context:
                return "Renewable energy has a positive impact on climate change mitigation by reducing greenhouse gas emissions. The transition from fossil fuels to renewable sources like wind and solar is crucial for limiting global warming."
            elif "rural" in context and "renewable" in context:
                return "Renewable energy technologies are well-suited for rural and remote areas. They can provide decentralized power generation, improving energy access in areas without reliable grid connections, which is critical for human development."
            else:
                return "Renewable energy sources are sustainable alternatives to fossil fuels. They include solar, wind, hydro, geothermal, and biomass energy, and their use is growing globally."
        else:
            return "I don't have enough context to provide a detailed answer on this topic."

def main():
    # Sample documents - in a real application, you might load these from files
    documents = [
        {
            "content": """
            Renewable energy is derived from natural sources that are replenished at a higher rate than they are consumed.
            Sunlight and wind, for example, are such sources that are constantly being replenished.
            Renewable energy resources exist over wide geographical areas, in contrast to fossil fuels,
            which are concentrated in a limited number of countries.

            Rapid deployment of renewable energy and energy efficiency technologies is resulting in significant
            energy security, climate change mitigation, and economic benefits.
            In international public opinion surveys there is strong support for promoting renewable sources
            such as solar power and wind power.

            While many renewable energy projects are large-scale, renewable technologies are also suited to rural
            and remote areas and developing countries, where energy is often crucial in human development.
            As most of the renewable energy technologies provide electricity, renewable energy is often deployed
            together with further electrification, which has several benefits: electricity can be converted to heat,
            can be converted into mechanical energy with high efficiency, and is clean at the point of consumption.
            """,
            "metadata": {"topic": "renewable energy", "source": "example"}
        },
        {
            "content": """
            Climate change mitigation consists of actions to limit global warming and its related effects.
            This involves reductions in human emissions of greenhouse gases (GHGs) as well as activities
            that reduce their concentration in the atmosphere.

            Fossil fuels account for more than 70% of GHG emissions. The energy sector contributes to global
            emissions, mainly through the burning of fossil fuels to generate electricity and heat,
            and through the use of gasoline and diesel to power vehicles.

            A transition to renewable energy is a key component of climate change mitigation. By replacing
            fossil fuel power plants with renewable energy sources, such as wind and solar, we can reduce
            the amount of greenhouse gases emitted into the atmosphere.

            Renewable energy can also play a role in adapting to climate change, for example by providing
            reliable power for cooling in increasingly hot regions, or by ensuring energy access in the
            aftermath of climate-related disasters.
            """,
            "metadata": {"topic": "climate change", "source": "example"}
        },
    ]

    # Initialize a context manager with custom strategies
    context_manager = ContextManager(
        compressor=SemanticDeduplicator(threshold=0.85),
        chunker=SemanticChunker(chunk_size=256),
        retriever=CPUOptimizedRetriever(embedding_model="lightweight"),
        max_context_size=512  # Intentionally small for demonstration
    )

    # Initialize a lightweight LLM
    llm = LightweightLLM()

    # Add documents to the context manager
    document_ids = context_manager.add_documents(documents)

    # Example queries
    queries = [
        "Tell me about the climate impact of renewable energy",
        "How does renewable energy work in rural areas?",
        "What are the advantages of using renewable energy?"
    ]

    # Process each query
    for query in queries:
        print(f"\n\n=== QUERY: {query} ===")

        # Generate optimized context for the query
        start_time = time.time()
        optimized_context = context_manager.generate_context(query=query)
        context_time = time.time() - start_time

        print(f"Context generation took {context_time:.3f} seconds")
        print(f"Context size: {len(optimized_context.split())} tokens")

        # Generate response using the LLM with the optimized context
        start_time = time.time()
        response = llm.generate(prompt=query, context=optimized_context)
        llm_time = time.time() - start_time

        print(f"LLM generation took {llm_time:.3f} seconds")
        print(f"--- RESPONSE ---")
        print(response)
        print("-" * 50)

if __name__ == "__main__":
    main()
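Because `LightweightLLM` is an explicit placeholder, swapping in a real backend only requires overriding `generate()`; the context pipeline stays unchanged. A minimal sketch of that pattern (hypothetical, no real model involved, reusing the `LightweightLLM` class defined above):

```python
# Hypothetical sketch: a drop-in replacement that "answers" by extracting from the context.
class ExtractiveLLM(LightweightLLM):
    def generate(self, prompt, context=None, max_tokens=512):
        if not context:
            return "I don't have enough context to answer."
        # Return the first sentence of the retrieved context as the "response".
        return context.split(".")[0].strip() + "."

llm = ExtractiveLLM()  # usable anywhere main() uses LightweightLLM()
```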
examples/simple_dedup_benchmark.py
ADDED
@@ -0,0 +1,92 @@
#!/usr/bin/env python
"""
Simple benchmark for efficient-context's semantic deduplication.
"""

import logging
import time
import sys

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
logger.info("Simple deduplication benchmark starting")

# Import the library
try:
    from efficient_context import ContextManager
    from efficient_context.compression import SemanticDeduplicator
    from efficient_context.chunking import SemanticChunker
    from efficient_context.retrieval import CPUOptimizedRetriever
    logger.info("Successfully imported efficient_context")
except ImportError as e:
    logger.error(f"Failed to import efficient_context: {e}")
    sys.exit(1)

def create_repetitive_document():
    """Create a document with deliberate repetition"""
    # Base paragraphs with repetitive content
    climate_paragraph = """
    Climate change is a significant alteration in weather patterns over extended periods.
    Global warming is the long-term heating of Earth's climate system due to human activities.
    Rising global temperatures are causing substantial changes in our environment and ecosystems.
    The warming of the planet is leading to significant transformations in weather patterns.
    Human activities are causing Earth's temperature to increase, resulting in climate changes.
    """

    energy_paragraph = """
    Renewable energy comes from sources that are naturally replenishing but flow-limited.
    Clean energy is derived from natural processes that are constantly replenished.
    Sustainable power is generated from resources that won't deplete over time.
    Green energy utilizes sources that don't produce pollution when generating power.
    Alternative energy refers to sources that are an alternative to fossil fuel.
    """

    # Repeat the paragraphs to create a more repetitive document
    document = (climate_paragraph + energy_paragraph) * 3
    return document

def main():
    """Run the benchmark"""
    # Create the test document
    document = create_repetitive_document()
    logger.info(f"Document size: {len(document.split())} words")

    # Test with different thresholds
    thresholds = [0.7, 0.8, 0.85, 0.9, 0.95]

    for threshold in thresholds:
        logger.info(f"\nTesting with threshold: {threshold}")

        # Create a context manager with the current threshold
        context_manager = ContextManager(
            compressor=SemanticDeduplicator(threshold=threshold),
            chunker=SemanticChunker(chunk_size=100),
            retriever=CPUOptimizedRetriever(embedding_model="lightweight")
        )

        # Process the document
        start_time = time.time()
        doc_id = context_manager.add_document(document)
        processing_time = time.time() - start_time

        # Generate context with a query
        query = "Tell me about climate change and renewable energy"
        start_time = time.time()
        context = context_manager.generate_context(query)
        query_time = time.time() - start_time

        # Report results
        original_size = len(document.split())
        context_size = len(context.split())
        compression_ratio = context_size / original_size if original_size > 0 else 1.0

        logger.info(f"Results for threshold {threshold}:")
        logger.info(f"  - Original size: {original_size} words")
        logger.info(f"  - Context size: {context_size} words")
        logger.info(f"  - Compression ratio: {compression_ratio:.2f}")
        logger.info(f"  - Processing time: {processing_time:.4f} seconds")
        logger.info(f"  - Query time: {query_time:.4f} seconds")

if __name__ == "__main__":
    main()
examples/simple_test.py
ADDED
@@ -0,0 +1,69 @@
#!/usr/bin/env python
"""
Basic test for efficient-context
"""

import os
import sys
import time

print(f"Python version: {sys.version}")
print(f"Current directory: {os.getcwd()}")
print(f"Python path: {sys.path}")

try:
    print("Testing efficient-context library...")

    # Create a simple context manager
    from efficient_context import ContextManager
    from efficient_context.compression import SemanticDeduplicator
    from efficient_context.chunking import SemanticChunker
    from efficient_context.retrieval import CPUOptimizedRetriever

    print("Successfully imported efficient_context")
except Exception as e:
    print(f"Error importing efficient_context: {e}")
    sys.exit(1)

cm = ContextManager(
    compressor=SemanticDeduplicator(threshold=0.85),
    chunker=SemanticChunker(chunk_size=200),
    retriever=CPUOptimizedRetriever(embedding_model="lightweight")
)

# Add a document
doc = """
Renewable energy comes from sources that are naturally replenishing but flow-limited.
Clean energy is derived from natural processes that are constantly replenished.
Sustainable power is generated from resources that won't deplete over time.
Green energy utilizes sources that don't produce pollution when generating power.
Alternative energy refers to sources that are an alternative to fossil fuel.
Eco-friendly power generation relies on inexhaustible natural resources.

Climate change is a significant and lasting alteration in the statistical distribution
of weather patterns over periods ranging from decades to millions of years.
Global warming is the long-term heating of Earth's climate system observed since
the pre-industrial period due to human activities.
"""

print(f"Document size: {len(doc.split())} words")

# Add the document
start = time.time()
doc_id = cm.add_document(doc)
processing_time = time.time() - start
print(f"Document processed in {processing_time:.4f} seconds")
print(f"Created {len(cm.chunks)} chunks")

# Generate context
query = "Tell me about renewable energy"
start = time.time()
context = cm.generate_context(query)
query_time = time.time() - start

# Print results
print(f"Query time: {query_time:.4f} seconds")
print(f"Context size: {len(context.split())} words")
print(f"Context: {context[:150]}...")

print("Test completed successfully")
model_card.md
ADDED
@@ -0,0 +1,91 @@
# efficient-context

A Python library for optimizing LLM context handling in CPU-constrained environments.

## Model / Library Description

`efficient-context` addresses the challenge of working with large language models (LLMs) on CPU-only and memory-limited systems by providing efficient context management strategies. The library focuses on making LLMs more usable when computational resources are limited.

## Intended Use

This library is designed for:
- Deploying LLMs in resource-constrained environments
- Optimizing context handling for edge devices
- Creating applications that need to run on standard hardware
- Reducing memory usage when working with large documents

## Features

### Context Compression
- Semantic deduplication to remove redundant information
- Importance-based pruning that keeps critical information
- Automatic summarization of less relevant sections

### Advanced Chunking
- Semantic chunking that preserves logical units
- Adaptive chunk sizing based on content complexity
- Chunk relationship mapping for coherent retrieval

### Retrieval Optimization
- Lightweight embedding models optimized for CPU
- Tiered retrieval strategies (local vs. remote)
- Query-aware context assembly

### Memory Management
- Progressive loading/unloading of context
- Streaming context processing
- Memory-aware caching strategies
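The compression features can also be used on their own. As a minimal sketch (mirroring `examples/dedup_test.py` in this commit), the `SemanticDeduplicator` compresses a redundant passage directly, with `threshold` controlling how aggressively near-duplicate sentences are dropped:

```python
from efficient_context.compression import SemanticDeduplicator

# Lower thresholds treat more sentence pairs as duplicates and compress harder.
deduplicator = SemanticDeduplicator(threshold=0.85)
compressed = deduplicator.compress(
    "Solar power is a renewable source. Solar energy is renewable."
)
print(compressed)
```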
## Installation

```bash
pip install efficient-context
```

## Usage

```python
from efficient_context import ContextManager
from efficient_context.compression import SemanticDeduplicator
from efficient_context.chunking import SemanticChunker
from efficient_context.retrieval import CPUOptimizedRetriever

# Initialize a context manager with custom strategies
context_manager = ContextManager(
    compressor=SemanticDeduplicator(threshold=0.85),
    chunker=SemanticChunker(chunk_size=256),
    retriever=CPUOptimizedRetriever(embedding_model="lightweight")
)

# Add documents to your context
context_manager.add_documents(documents)

# Generate optimized context for a query
optimized_context = context_manager.generate_context(
    query="Tell me about the climate impact of renewable energy"
)

# Use the optimized context with your LLM
response = your_llm_model.generate(prompt=prompt, context=optimized_context)
```

## Performance and Benchmarks

The library has demonstrated excellent performance in handling repetitive content:
- With a threshold of 0.7, it achieved a 57.5% reduction in token count
- Processing times: 0.13-0.84 seconds for a 426-word document
- Query time: 0.08-0.14 seconds

## Limitations

- Designed primarily for text data
- Performance depends on the quality of embedding models
- Semantic deduplication may occasionally remove content that appears similar but has subtle differences

## Maintainer

This project is maintained by [Biswanath Roul](https://github.com/biswanathroul)

## License

MIT
pyproject.toml
ADDED
@@ -0,0 +1,14 @@
[build-system]
requires = ["setuptools>=42", "wheel"]
build-backend = "setuptools.build_meta"

[tool.black]
line-length = 88
include = '\.pyi?$'

[tool.isort]
profile = "black"
line_length = 88

[tool.pytest.ini_options]
testpaths = ["tests"]
requirements.txt
ADDED
@@ -0,0 +1,7 @@
numpy>=1.19.0
scikit-learn>=0.24.0
sentence-transformers>=2.2.2
nltk>=3.6.0
pydantic>=1.8.0
tqdm>=4.62.0
psutil>=5.9.0
setup.py
ADDED
@@ -0,0 +1,31 @@
from setuptools import setup, find_packages

with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()

setup(
    name="efficient-context",
    version="0.1.0",
    author="Biswanath Roul",
    description="Optimize LLM context handling in CPU-constrained environments",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/biswanathroul/efficient-context",
    packages=find_packages(),
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
    ],
    python_requires=">=3.7",
    install_requires=[
        "numpy>=1.19.0",
        "scikit-learn>=0.24.0",
        "sentence-transformers>=2.2.2",
        "nltk>=3.6.0",
        "pydantic>=1.8.0",
        "tqdm>=4.62.0",
        "psutil>=5.9.0",  # kept in sync with requirements.txt
    ],
    keywords="llm, context, optimization, cpu, memory, efficiency, nlp",
)
test_simple.py
ADDED
@@ -0,0 +1,75 @@
"""
Simple test script for efficient-context library.
"""

import logging
from efficient_context import ContextManager
from efficient_context.compression import SemanticDeduplicator
from efficient_context.chunking import SemanticChunker
from efficient_context.retrieval import CPUOptimizedRetriever

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def test_basic_functionality():
    """Test the basic functionality of the library."""
    print("\n=== Testing Basic Functionality ===")

    # Sample document - notice we've removed indentation and added more content
    document = """Renewable energy is derived from natural sources that are replenished at a higher rate than they are consumed.
Sunlight and wind, for example, are such sources that are constantly being replenished.
Renewable energy resources exist over wide geographical areas, in contrast to fossil fuels,
which are concentrated in a limited number of countries.

Rapid deployment of renewable energy and energy efficiency technologies is resulting in significant
energy security, climate change mitigation, and economic benefits.
In international public opinion surveys there is strong support for promoting renewable sources
such as solar power and wind power.

While many renewable energy projects are large-scale, renewable technologies are also suited to rural
and remote areas and developing countries, where energy is often crucial in human development.
As most of the renewable energy technologies provide electricity, renewable energy is often deployed
together with further electrification, which has several benefits."""

    # Initialize context manager
    context_manager = ContextManager(
        compressor=SemanticDeduplicator(threshold=0.85),
        chunker=SemanticChunker(chunk_size=100),
        retriever=CPUOptimizedRetriever(embedding_model="lightweight")
    )

    # Add document
    print(f"Document length: {len(document.split())} words")
    doc_id = context_manager.add_document(document)
    print(f"Added document with ID: {doc_id}")
    print(f"Created {len(context_manager.chunks)} chunks")

    # Debug information about chunks
    if len(context_manager.chunks) > 0:
        print("\nChunk information:")
        for i, chunk in enumerate(context_manager.chunks):
            print(f"Chunk {i+1}: {len(chunk.content.split())} words")
            print(f"Content sample: {chunk.content[:50]}...")
    else:
        print("\nWARNING: No chunks were created. This is likely an issue with the chunker.")
        # Let's try direct chunking to debug
        print("\nTrying direct chunking:")
        chunks = context_manager.chunker.chunk(document, document_id=doc_id)
        print(f"Direct chunking created {len(chunks)} chunks")
        if len(chunks) > 0:
            print(f"Sample chunk content: {chunks[0].content[:50]}...")

    # Test query
    query = "Tell me about renewable energy sources"
    print(f"\nQuery: {query}")

    # Get context
    context = context_manager.generate_context(query)
    print(f"\nGenerated context ({len(context.split())} tokens):")
    print(context)

    print("\n=== Test completed successfully ===")

if __name__ == "__main__":
    test_basic_functionality()
tests/test_core.py
ADDED
@@ -0,0 +1,114 @@
"""
Tests for the core functionality of efficient-context.
"""

import unittest
from efficient_context import ContextManager
from efficient_context.compression import SemanticDeduplicator
from efficient_context.chunking import SemanticChunker, Chunk
from efficient_context.retrieval import CPUOptimizedRetriever
from efficient_context.memory import MemoryManager

class TestEfficientContext(unittest.TestCase):
    """Test cases for efficient-context functionality."""

    def setUp(self):
        """Set up test environment."""
        self.sample_text = """
        Renewable energy is derived from natural sources that are replenished at a higher rate than they are consumed.
        Sunlight and wind, for example, are such sources that are constantly being replenished.
        Renewable energy resources exist over wide geographical areas, in contrast to fossil fuels,
        which are concentrated in a limited number of countries.

        Rapid deployment of renewable energy and energy efficiency technologies is resulting in significant
        energy security, climate change mitigation, and economic benefits.
        In international public opinion surveys there is strong support for promoting renewable sources
        such as solar power and wind power.

        While many renewable energy projects are large-scale, renewable technologies are also suited to rural
        and remote areas and developing countries, where energy is often crucial in human development.
        As most of the renewable energy technologies provide electricity, renewable energy is often deployed
        together with further electrification, which has several benefits: electricity can be converted to heat,
        can be converted into mechanical energy with high efficiency, and is clean at the point of consumption.
        """

    def test_semantic_deduplicator(self):
        """Test the semantic deduplicator functionality."""
        compressor = SemanticDeduplicator(threshold=0.9)
        compressed = compressor.compress(self.sample_text)

        # Test that compression reduces size
        self.assertLess(len(compressed), len(self.sample_text))

        # Test that key content is preserved
        self.assertIn("Renewable energy", compressed)

    def test_semantic_chunker(self):
        """Test the semantic chunker functionality."""
        chunker = SemanticChunker(chunk_size=100, chunk_overlap=10)
        chunks = chunker.chunk(self.sample_text, document_id="test-doc")

        # Test that chunks were created
        self.assertGreater(len(chunks), 0)

        # Test that each chunk has content and metadata
        for chunk in chunks:
            self.assertIsInstance(chunk, Chunk)
            self.assertTrue(chunk.content)
            self.assertEqual(chunk.document_id, "test-doc")

    def test_cpu_optimized_retriever(self):
        """Test the CPU-optimized retriever functionality."""
        retriever = CPUOptimizedRetriever(embedding_model="lightweight")

        # Create test chunks
        chunks = [
            Chunk(content="Renewable energy is a sustainable energy source.", chunk_id="1"),
            Chunk(content="Climate change is a global challenge.", chunk_id="2"),
            Chunk(content="Fossil fuels contribute to greenhouse gas emissions.", chunk_id="3")
        ]

        # Index chunks
        retriever.index_chunks(chunks)

        # Test retrieval
        query = "What are the environmental impacts of energy sources?"
        results = retriever.retrieve(query, top_k=2)

        # Should return some results
        self.assertEqual(len(results), 2)

        # Clear index
        retriever.clear()
        self.assertEqual(len(retriever.chunks), 0)

    def test_context_manager_integration(self):
        """Test full integration of all components."""
        # Initialize context manager
        context_manager = ContextManager(
            compressor=SemanticDeduplicator(threshold=0.85),
            chunker=SemanticChunker(chunk_size=100),
            retriever=CPUOptimizedRetriever(embedding_model="lightweight"),
            memory_manager=MemoryManager()
        )

        # Add document
        doc_id = context_manager.add_document(self.sample_text)

        # Test document was added
        self.assertIn(doc_id, context_manager.documents)

        # Test context generation
        query = "Tell me about renewable energy in rural areas"
        context = context_manager.generate_context(query)

        # Should return some context
        self.assertTrue(context)

        # Clear context manager
        context_manager.clear()
        self.assertEqual(len(context_manager.documents), 0)
        self.assertEqual(len(context_manager.chunks), 0)

if __name__ == "__main__":
    unittest.main()