Commit 10b392a by Chandima Prabhath · 1 parent: 82f2b92

Refactor code structure for improved readability and maintainability
Files changed:
- .gitattributes +3 -0
- .gitignore +22 -0
- Dockerfile +17 -0
- PLANNING.md +92 -0
- config/settings.py +43 -0
- docs/Chapter 1 - The Weight of Silence.md +37 -0
- docs/Chapter 2 - Dust and Whispers.md +45 -0
- docs/Chapter 3 - The Journalist's Instinct.md +49 -0
- docs/Chapter 4 - The Near Miss on Elm Street.md +59 -0
- docs/data.txt +4 -0
- docs/outline.md +50 -0
- experimets/embed_pipeline.py +60 -0
- experimets/example.py +71 -0
- experimets/query_pipeline.py +52 -0
- new_system_architecture.png +3 -0
- requirements.txt +8 -0
- scripts/run_ingestion.py +27 -0
- scripts/run_query_api.py +27 -0
- src/__init__.py +3 -0
- src/data_loader/__init__.py +0 -0
- src/data_loader/loader.py +57 -0
- src/document_processor/__init__.py +0 -0
- src/document_processor/processor.py +95 -0
- src/embedding_generator/__init__.py +0 -0
- src/embedding_generator/embedder.py +71 -0
- src/ingestion_orchestrator/__init__.py +0 -0
- src/ingestion_orchestrator/orchestrator.py +68 -0
- src/llm_integrator/__init__.py +0 -0
- src/llm_integrator/llm.py +168 -0
- src/query_service/__init__.py +0 -0
- src/query_service/api.py +166 -0
- src/retrieval_handler/__init__.py +0 -0
- src/retrieval_handler/retriever.py +77 -0
- src/utils/__init__.py +0 -0
- src/utils/logging.py +9 -0
- src/vector_store_manager/__init__.py +0 -0
- src/vector_store_manager/chroma_manager.py +182 -0
- system_architecture.md +16 -0
- system_architecture.png +3 -0
.gitattributes
CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
+*.jpg filter=lfs diff=lfs merge=lfs -text
+*.jpeg filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,22 @@
.env
chroma_db
src/__pycache__/__init__.cpython-313.pyc
config/__pycache__/settings.cpython-313.pyc
src/data_loader/__pycache__/loader.cpython-313.pyc
src/data_loader/__pycache__/__init__.cpython-313.pyc
src/document_processor/__pycache__/processor.cpython-313.pyc
src/document_processor/__pycache__/__init__.cpython-313.pyc
src/embedding_generator/__pycache__/__init__.cpython-313.pyc
src/embedding_generator/__pycache__/embedder.cpython-313.pyc
src/ingestion_orchestrator/__pycache__/__init__.cpython-313.pyc
src/ingestion_orchestrator/__pycache__/orchestrator.cpython-313.pyc
src/llm_integrator/__pycache__/__init__.cpython-313.pyc
src/llm_integrator/__pycache__/llm.cpython-313.pyc
src/query_service/__pycache__/__init__.cpython-313.pyc
src/retrieval_handler/__pycache__/__init__.cpython-313.pyc
src/retrieval_handler/__pycache__/retriever.cpython-313.pyc
src/utils/__pycache__/__init__.cpython-313.pyc
src/utils/__pycache__/logging.cpython-313.pyc
src/query_service/__pycache__/api.cpython-313.pyc
src/vector_store_manager/__pycache__/__init__.cpython-313.pyc
src/vector_store_manager/__pycache__/chroma_manager.cpython-313.pyc
Dockerfile
ADDED
@@ -0,0 +1,17 @@
FROM python:3.10

RUN useradd -m -u 1000 user
USER user
ENV PATH="/home/user/.local/bin:$PATH"

WORKDIR /app

COPY --chown=user ./requirements.txt requirements.txt
RUN pip install --no-cache-dir --upgrade -r requirements.txt

EXPOSE 7860

COPY --chown=user . /app
ENV PYTHONPATH=/app
ENV HOST=0.0.0.0 PORT=7860
CMD ["python", "scripts/run_query_api.py"]
PLANNING.md
ADDED
@@ -0,0 +1,92 @@
### General RAG Platform Plan

#### Overview
The goal is to transform the existing RAG system into a versatile, modular platform for building various RAG applications.

#### Architecture

```mermaid
%%{init: {'theme': 'neutral', 'themeVariables': { 'primaryColor': '#e3f2fd', 'edgeLabelBackground':'#fffde7'}}}%%
graph TD
    A[General RAG Platform Architecture]

    subgraph DataIngestion
        A1[Universal Data Loader\n<- Files\n<- Databases\n<- APIs\n<- Cloud Storage]
        A2[Smart Document Processor\n<- Format detection\n<- Metadata extraction\n<- Content normalization]
        A3[Chunking Strategies\n<- Semantic\n<- Structural\n<- Domain-specific]
    end

    subgraph CoreServices
        B1[Embedding Service\n<- Multi-model support\n<- Batch processing\n<- Cache layer]
        B2[VectorDB Orchestrator\n<- Chroma\n<- Pinecone\n<- Weaviate\n<- FAISS]
        B3[LLM Gateway\n<- OpenAI\n<- Anthropic\n<- Mistral\n<- Custom models]
    end

    subgraph QueryEngine
        C1[Query Analyzer\n<- Intent detection\n<- Query expansion\n<- Filter generation]
        C2[Hybrid Retriever\n<- Vector search\n<- Keyword\n<- Hybrid ranking]
        C3[Response Generator\n<- Citation\n<- Formatting\n<- Guardrails]
    end

    subgraph Management
        D1[Config Manager\n<- Tenant isolation\n<- Model configs\n<- Access controls]
        D2[Monitoring\n<- Metrics\n<- Logging\n<- Alerting]
        D3[API Gateway\n<- REST\n<- GraphQL\n<- gRPC]
    end

    subgraph Extensibility
        E1[Plugin System\n<- Custom loaders\n<- Chunkers\n<- Post-processors]
        E2[Workflow Engine\n<- Pipeline designer\n<- Versioning\n<- A/B testing]
    end

    A --> DataIngestion
    A --> CoreServices
    A --> QueryEngine
    A --> Management
    A --> Extensibility

    DataIngestion -->|Processed Chunks| CoreServices
    CoreServices -->|Vector Index| QueryEngine
    QueryEngine -->|Formatted Response| Management
    Management -->|APIs| ExternalSystems
```

#### Implementation Plan

1. **Core Abstraction Layer**
   - Unified interfaces for:
     - Document Loaders (File, DB, API)
     - Chunking Strategies
     - Embedding Providers
     - VectorDB Adapters
     - LLM Gateways

2. **Multi-tenancy Features**
   - Tenant isolation
   - Resource quotas
   - Custom pipeline configurations
   - Role-based access control

3. **Advanced Retrieval**
   - Hybrid search (vector + keyword + custom)
   - Query understanding module
   - Result reranking layer
   - Cache mechanisms

4. **Operational Excellence**
   - Observability stack
   - Auto-scaling
   - CI/CD pipelines
   - Health checks

5. **Security**
   - Data encryption
   - Audit trails
   - Content moderation
   - PII detection

6. **Developer Ecosystem**
   - SDKs (Python/JS)
   - CLI tools
   - Template repository
   - Testing framework
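The "unified interfaces" in step 1 are the load-bearing piece of this plan. A minimal sketch of what they could look like as Python protocols — every name here is illustrative, none of it exists in this commit:

```python
# Hypothetical shape for the Core Abstraction Layer interfaces.
# These Protocols are illustrative only; the commit defines none of them.
from typing import Protocol

from langchain.schema import Document


class DocumentLoader(Protocol):
    def load(self) -> list[Document]: ...


class ChunkingStrategy(Protocol):
    def split(self, docs: list[Document]) -> list[Document]: ...


class EmbeddingProvider(Protocol):
    def embed_documents(self, texts: list[str]) -> list[list[float]]: ...


class VectorDBAdapter(Protocol):
    def add_documents(self, chunks: list[Document]) -> None: ...
    def similarity_search(self, query: str, k: int) -> list[Document]: ...


class LLMGateway(Protocol):
    def generate(self, prompt: str, context: list[Document]) -> str: ...
```

Concrete backends (Chroma, Pinecone, OpenAI, custom models) would then plug in behind these interfaces without the pipeline code changing.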
config/settings.py
ADDED
@@ -0,0 +1,43 @@
# config/settings.py
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# --- General Settings ---
DOCS_FOLDER = os.getenv("DOCS_FOLDER", "docs/")
PERSIST_DIR = os.getenv("PERSIST_DIR", "tmp/chroma_db/")  # Default to local persistence,
# but plan to use a scalable backend like ClickHouse for production

# --- Embedding Model Settings ---
OLLAMA_URL = os.getenv("OLLAMA_SERVER", "https://chandimaprabath-ollama-server.hf.space")
EMBED_MODEL = os.getenv("EMBED_MODEL", "nomic-embed-text:latest")

# --- Document Processing Settings ---
CHUNK_SIZE = int(os.getenv("CHUNK_SIZE", "2000"))  # Default chunk size
CHUNK_OVERLAP = int(os.getenv("CHUNK_OVERLAP", "10"))  # Default chunk overlap

# --- LLM Settings (OpenAI Compatible) ---
LLM_API_KEY = os.getenv("LLM_API_KEY")
LLM_API_BASE = os.getenv("LLM_API_BASE", "https://llm.chutes.ai/v1")  # Default API base
LLM_MODEL = os.getenv("LLM_MODEL", "chutesai/Llama-4-Scout-17B-16E-Instruct")  # Default LLM model

# --- Retrieval Settings ---
TOP_K = int(os.getenv("TOP_K", "5"))  # Default number of documents to retrieve
CHROMADB_COLLECTION_NAME = os.getenv("CHROMADB_COLLECTION_NAME", "my_rulings_collection")  # Unique collection name

# --- Security Settings (Placeholders - Implement according to government standards) ---
# Add settings for authentication, authorization, encryption paths, etc.
# SECRET_KEY = os.getenv("SECRET_KEY")  # Example for API security

# --- Logging Settings ---
LOG_LEVEL = os.getenv("LOG_LEVEL", "DEBUG")  # Default log level

# --- Scalable Backend Settings (for production ChromaDB) ---
# Example settings if using ClickHouse as a backend for ChromaDB
# CLICKHOUSE_HOST = os.getenv("CLICKHOUSE_HOST")
# CLICKHOUSE_PORT = os.getenv("CLICKHOUSE_PORT")
# CLICKHOUSE_DATABASE = os.getenv("CLICKHOUSE_DATABASE")
# CLICKHOUSE_USER = os.getenv("CLICKHOUSE_USER")
# CLICKHOUSE_PASSWORD = os.getenv("CLICKHOUSE_PASSWORD")
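Every setting above resolves the environment first and falls back to a default, so behavior can be tuned per deployment without code edits. A quick sketch of that resolution order — the override value is illustrative, and the asserts assume the variables are not already set in your shell:

```python
# Illustrative only: overrides must be in place before config.settings is
# first imported, since each value is read once at import time.
import os

os.environ["CHUNK_SIZE"] = "1000"

from config import settings  # assumes the project root is on sys.path

assert settings.CHUNK_SIZE == 1000  # taken from the environment
assert settings.TOP_K == 5          # falls back to the default when unset
```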
docs/Chapter 1 - The Weight of Silence.md
ADDED
@@ -0,0 +1,37 @@
## Chapter 1: The Weight of Silence

The chipped ceramic mug warmed Kaelen’s hands, a small comfort in the perpetual chill that seemed to cling to him. He sat hunched in the corner booth of the dimly lit diner, the pre-dawn quiet a familiar blanket. Outside, the first hesitant streaks of grey were painting the sky over Oakhaven City. He watched the street through the condensation on the window, the occasional early riser a fleeting blur.

He didn’t taste the lukewarm coffee. Taste, like most sensations, was muted, filtered through a thick wall of numbness. It wasn't a physical inability; it was a conscious, or perhaps unconscious, choice. To feel was to remember, and remembering was a pain he couldn’t afford.

A waitress, a woman with tired eyes and a nametag that read “Brenda,” approached his table. She didn’t speak, just placed a plate of slightly burnt toast and a single fried egg in front of him. She knew his routine. Kaelen nodded almost imperceptibly, a silent acknowledgment. He picked at the food, forcing it down. Sustenance was necessary, even if pleasure wasn’t.

The diner slowly began to fill with the murmur of early morning conversations, the clatter of cutlery, and the hiss of the coffee machine. Kaelen remained an island of silence in the growing noise. He observed the other patrons – a construction worker with calloused hands, a young woman nervously checking her phone, an elderly couple sharing a newspaper. Fleeting thoughts, snippets of their emotions, brushed against the edges of his awareness – a flicker of worry, a surge of anticipation, a quiet contentment. He pushed them away, a practiced mental maneuver. Engaging with the emotions of others was too close to feeling his own.

A sudden commotion erupted near the entrance. A man in a worn leather jacket was shouting at Brenda, his face red with anger.

“I said I wanted crispy bacon! This is practically raw!”

Brenda, her shoulders slumping, tried to placate him. “I’m sorry, sir, I can have them cook you another batch…”

“Another batch? I’m in a hurry! You think my time is worthless?” The man’s voice rose, drawing the attention of the other customers.

Kaelen’s grey eyes, usually fixed on his plate or the window, flickered towards the scene. He felt a familiar tightening in his chest, a subtle stirring beneath the layers of numbness. Injustice, even in something as trivial as undercooked bacon, always pricked at him.

The man continued his tirade, his words laced with insults. Brenda’s face was pale, her hands trembling slightly. No one else in the diner intervened, their gazes either averted or filled with a detached curiosity.

Kaelen’s fingers tightened around his mug. He didn’t move, didn’t make a sound. But within him, something shifted. A subtle pressure built in the air around the angry man, an almost imperceptible resistance. The man, mid-sentence, suddenly stumbled back a step, as if he’d bumped into an invisible wall. He blinked, confused, his anger momentarily disrupted.

“What the…?” he muttered, looking around.

He tried to step forward again, but the invisible pressure remained, holding him back. He frowned, his anger returning, but now tinged with bewilderment.

Brenda, seizing the opportunity, spoke quickly. “Sir, please, let me just get you that new bacon…”

The man, still slightly off-balance and unable to explain the strange resistance he felt, grumbled under his breath but allowed Brenda to retreat to the kitchen. The tension in the diner eased, the other patrons returning to their meals and conversations.

Kaelen released the subtle pressure, the invisible force dissipating as quickly as it had formed. He looked down at his untouched toast, the brief flicker of emotion receding, leaving behind the familiar emptiness. He hadn’t moved, hadn’t spoken, yet the situation had been defused. No one had noticed his intervention, and that was how he preferred it. He was a ghost in the machine, a silent observer, occasionally nudging the world in small, unnoticed ways.

He finished his coffee, the lukewarm liquid doing little to dispel the inner chill. He stood, placing a few crumpled bills on the table. Brenda offered a small, grateful smile as he passed her, a silent understanding passing between them.

Outside, the city was beginning to wake. Cars honked, buses rumbled, and the sounds of life echoed around him. Kaelen walked, his footsteps silent on the pavement, another anonymous figure in the urban sprawl. He had no destination in mind, just the instinct to keep moving, to remain unseen, to carry the weight of his silence through another day. The echoes of his past were always there, a constant hum beneath the surface, a reminder of the voice he had lost and the emotions he had buried deep within. And yet, sometimes, in the face of injustice, a flicker of something else ignited within him, a spark of power that yearned to be more than just a silent echo.
docs/Chapter 2 - Dust and Whispers.md
ADDED
@@ -0,0 +1,45 @@
## Chapter 2: Dust and Whispers

The antique shop, tucked away on a quiet side street, was Kaelen’s infrequent sanctuary. Dust motes danced in the shafts of sunlight that slanted through the grimy front window, illuminating rows of forgotten treasures and curiosities. Elias Thorne, a man whose face seemed etched with the stories of the objects he collected, stood behind the counter, polishing a tarnished silver locket with a soft cloth.

The bell above the door chimed softly as Kaelen entered. Elias looked up, a hint of a smile touching his lips. He didn’t say anything, didn’t need to. He simply nodded, his gaze understanding the silent greeting.

Kaelen moved through the aisles, his fingers trailing lightly over the smooth surface of a weathered wooden box, the cool metal of an old telescope. The air in the shop was thick with the scent of aged paper, beeswax, and time. It was a comforting aroma, a stark contrast to the sterile, impersonal feel of the city outside.

He eventually made his way to the back of the shop, to a small, cluttered corner Elias had cleared for him. There was a worn armchair, its leather cracked but still comfortable, and a small table piled with old books. Kaelen settled into the chair, the familiar creak of the springs a soothing sound.

Elias finished polishing the locket and placed it carefully in a display case. He moved with a slow, deliberate grace, his eyes occasionally flicking towards Kaelen. After a few minutes of comfortable silence, he walked over to the corner, carrying a steaming mug.

He placed the mug on the table beside Kaelen. “Chamomile,” he said softly, his voice raspy from years of disuse. “Figured you could use something warm.”

Kaelen looked at the mug, then up at Elias. A flicker of gratitude, a rare spark of warmth, touched his grey eyes. He nodded again, reaching for the mug. He brought it to his lips, the gentle floral scent a welcome change from the bitter coffee.

Elias settled onto a nearby stool, his gaze thoughtful. “Saw the news this morning,” he said, his voice low. “Another one of those… incidents.”

Kaelen’s hand tightened slightly around the mug. He knew what Elias meant. Unexplained occurrences, strange bursts of energy, objects moving on their own – the subtle manifestations of his uncontrolled power. The city news often reported them as bizarre anomalies, unsolved mysteries.

“They’ll be looking,” Elias continued, his eyes filled with concern. “They always do.”

Kaelen looked away, his gaze fixed on a dusty book on the table. He didn’t need Elias to elaborate on who “they” were. The feeling of being watched, of unseen eyes tracking him, was a constant companion.

“You need to be careful, Kaelen,” Elias said, his voice firm but gentle. “Don’t draw too much attention.”

Kaelen knew Elias was right. His instinct was always to remain hidden, to avoid any interaction that might expose him. But sometimes… sometimes he couldn’t help it. The injustice he witnessed, the suffering he sensed, it stirred something within him, a primal urge to intervene, even if it meant risking exposure.

Elias sighed, running a hand through his thinning grey hair. “I wish I could tell you more,” he said, his voice laced with regret. “About… everything.”

Kaelen looked up sharply, his eyes questioning. Elias had hinted at his past before, vague references to a facility, to experiments. But he had always stopped short of revealing the full truth.

Elias met his gaze, his own eyes filled with a deep sadness. “It’s not safe, Kaelen. Not yet. They have eyes everywhere.” He glanced around the dimly lit shop, a shadow of fear flickering across his face.

A sudden, sharp pain lanced through Kaelen’s head, a brief but intense pressure behind his eyes. He winced, his hand flying to his temple.

Elias’s eyes widened with alarm. “What is it? What’s wrong?”

Kaelen shook his head slightly, the pain subsiding as quickly as it had come. He wasn’t sure what it was. A fleeting thought, a distant probe? He had felt it before, a faint mental intrusion, like a whisper on the edge of his consciousness. He suspected it was them, The Oversight, their psychics searching, always searching.

He looked at Elias, a silent plea in his eyes. He needed to know more, to understand the forces that were hunting him. But Elias’s fear was palpable, a tangible barrier.

Elias reached out, his hand covering Kaelen’s. His touch was warm, grounding. “Patience, Kaelen,” he said softly. “When the time is right, I’ll tell you everything. I promise.”

Kaelen looked down at their hands, the contrast between his younger skin and Elias’s weathered one a stark reminder of the years that separated them, and the shared burden they carried. He nodded slowly, a silent acceptance of Elias’s words. For now, he would remain in the shadows, a ghost with extraordinary power, waiting for the whispers of the past to finally reveal their secrets. The dust motes continued to dance in the sunlight, oblivious to the silent drama unfolding in the quiet antique shop, a place where forgotten stories held the key to a dangerous truth.
docs/Chapter 3 - The Journalist's Instinct.md
ADDED
@@ -0,0 +1,49 @@
## Chapter 3: The Journalist's Instinct

Seraphina “Sera” Vance thrived in the controlled chaos of the newsroom. Phones rang incessantly, keyboards clacked like a frantic percussion section, and the air crackled with the nervous energy of deadlines looming. At 23, she was one of the youngest reporters at the Oakhaven City Gazette, but her ambition and tenacity more than made up for her lack of years.

Today’s headline screamed about a mayoral scandal, but Sera’s attention was elsewhere. Tucked away in a corner cubicle, amidst stacks of papers and half-empty coffee cups, she scrolled through a digital archive of local news reports. Her brow was furrowed in concentration, her fingers flying across the keyboard.

For the past few weeks, Sera had been chasing whispers – strange occurrences that didn’t quite fit the usual narrative of crime or accident. A sudden gust of wind that overturned a fruit stand on a perfectly still day. Security footage showing objects inexplicably floating in an empty warehouse. Eyewitness accounts of a car stopping mid-collision as if hitting an invisible barrier. Individually, they were dismissed as anomalies, quirks of the city. But collectively, they piqued Sera’s journalistic instincts. There was a pattern here, a thread that someone, or something, was trying to conceal.

Her editor, a gruff man named Mr. Henderson, had initially brushed off her inquiries. “Ghost stories, Vance? We’re a serious newspaper.” But Sera was persistent. She’d spent her evenings interviewing witnesses, poring over police reports, and piecing together the fragmented puzzle.

She clicked on a report from two weeks ago: “Runaway Vehicle Miraculously Avoids Pedestrian.” The article described a delivery van whose brakes had failed on a busy street, seemingly on a direct collision course with an elderly woman crossing the road. According to multiple witnesses, the van had abruptly stopped just inches from the woman, as if an invisible hand had intervened. The driver claimed he had no explanation, and the police investigation had concluded with a shrug and a note about possible mechanical malfunction, despite the driver’s insistence that the brakes had completely failed.

Sera leaned back in her chair, tapping her pen against her lips. The details were vague, the explanations flimsy. It was the kind of story that usually faded into obscurity, another urban legend. But something about it resonated with the other incidents she’d been tracking. The “miraculous” element, the lack of a logical explanation.

She pulled up a map of the city, marking the locations of each unexplained event she had documented. A pattern began to emerge – the incidents seemed to cluster in the older, less populated districts. And there was one area, a quiet neighborhood near the river, where several of the more significant events had occurred.

A new idea sparked in her mind. What if these weren’t random occurrences? What if they were connected to a person? Someone with… abilities? The thought felt outlandish, almost comical in the sterile environment of the newsroom. But the more she considered it, the more plausible it seemed.

She remembered a hushed conversation she’d overheard in the breakroom a few days ago. Two older reporters were talking about a local myth, a story that had circulated years ago about a boy with strange powers who had disappeared after a tragic incident. She hadn’t paid much attention at the time, dismissing it as folklore. But now…

Sera quickly searched the Gazette’s archives for any mention of this local myth. After several fruitless searches, she tried different keywords: “psychic,” “telekinetic,” “unexplained tragedy.” Finally, she found a small, archived article from nearly fifteen years ago. The headline was sensational: “Local Boy Survives Mysterious Lab Fire, Vanishes Without a Trace.”

Her heart quickened as she clicked on the article. It detailed a fire at a small, privately funded research facility on the outskirts of the city. The facility, known as the “Kestrel Institute,” had reportedly been involved in experimental research on human potential. The only survivor of the fire was a ten-year-old boy named Kaelen, who had been found unharmed amidst the wreckage. The article mentioned rumors of unusual occurrences surrounding the boy, whispers of him moving objects with his mind. After the fire, Kaelen had disappeared, taken into the custody of an unnamed guardian, and the story had quickly faded from public attention.

Sera stared at the name: Kaelen. It was a common enough name, but the timing, the rumors of abilities… it felt like a significant lead. Could this be the same person responsible for the recent unexplained events? Had he resurfaced after all these years?

A thrill of excitement, mixed with a sense of unease, coursed through her. This wasn’t just a series of strange incidents; this could be a major story. A human-interest piece, a mystery, maybe even something bigger.

She printed out the old article, her mind already racing with possibilities. She needed to find this Kaelen. He would be around 25 years old now. Where would he be? What had happened to him after the fire?

Sera grabbed her notebook and a fresh cup of coffee. Mr. Henderson could wait for the mayoral scandal follow-up. She had a new lead, a ghost from the past, and she had a feeling it was about to lead her down a very interesting, and potentially dangerous, path. The city of Oakhaven held its secrets close, Sera knew that much. Finding someone who had vanished fifteen years ago, especially someone with a past shrouded in rumors and a possible connection to a secretive research facility, wouldn’t be easy.

She reread the old article, searching for any detail she might have overlooked. The Kestrel Institute was described as being on the outskirts of the city, but the exact location wasn't mentioned. The article also didn't provide the name of the "unnamed guardian" who had taken Kaelen after the fire. This lack of information presented an immediate hurdle.

Sera tapped her pen against the printed article, her mind working through the possibilities. Fifteen years was a long time. Kaelen could be anywhere, living under a different name, trying to keep his past buried. But the recent events… they suggested he was still in Oakhaven, or had recently returned.

Her first instinct was to try and locate the site of the Kestrel Institute. Even after all this time, there might be some remnants, some clue. She searched online directories and old city maps, cross-referencing the vague description in the article. After a few frustrating dead ends, she found a mention of a property on the outskirts of the city that had been the subject of an insurance claim around the time of the fire. The address was listed as being in a relatively isolated, industrial area near the river – the same area where several of the recent unexplained events had clustered.

That felt like a solid starting point. She marked the location on her city map. A visit to the old site might not yield much after all these years, but it was worth a shot.

Next, she considered the "unnamed guardian." Could there be any record of who this person was? She thought about reaching out to the author of the old article, if she could track them down. The byline was for a reporter who no longer seemed to be with the Gazette. It would be a long shot, but she added it to her list of possibilities.

She also considered checking public records – birth certificates, school records, anything that might link a 25-year-old with the name Kaelen to the city or the timeframe of the fire. She knew this would be like searching for a needle in a haystack, but she was determined.

Sera glanced at the clock. Her deadline for the mayoral scandal follow-up was fast approaching, but she couldn't shake the feeling that this Kaelen story was far more significant. She quickly typed a brief email to Mr. Henderson, explaining that she was pursuing a promising new lead and might need a little more time on the mayoral piece.

With that done, she grabbed her coat and her city map. The afternoon sun was starting to dip towards the horizon. She decided to head out to the location of the former Kestrel Institute. Even if the building was long gone, the surrounding area might offer some clues, some lingering whispers of the past.

As she left the newsroom and stepped out onto the bustling city street, Sera felt a familiar thrill of anticipation. The chase was on. The city of Oakhaven had just become a little more mysterious, and she, Seraphina Vance, was ready to uncover its secrets, one forgotten story at a time.
docs/Chapter 4 - The Near Miss on Elm Street.md
ADDED
@@ -0,0 +1,59 @@
## Chapter 4: The Near Miss on Elm Street

Sera clutched the faded newspaper clipping as she navigated the bustling sidewalk of Elm Street. This was the location where the runaway delivery van had miraculously stopped. She scanned the storefronts, looking for the bakery mentioned in one of the eyewitness accounts – “Mrs. Gable’s Sweet Treats.”

Inside the cozy bakery, the air was thick with the aroma of cinnamon and sugar. An elderly woman with a warm smile and flour dusting her apron stood behind the counter. Sera introduced herself and explained her interest in the incident from a few weeks ago.

Mrs. Gable’s smile softened with concern. “Oh, you mean about poor Mrs. Henderson? It was a terrible scare. Thank goodness she’s alright.”

Sera nodded. “I was hoping to speak with Mrs. Henderson, if she’s available.”

“She usually comes in for her morning tea around this time,” Mrs. Gable said, glancing towards a small table by the window. “Ah, there she is now.”

Sera followed her gaze to a frail woman with silver hair, slowly stirring a cup of tea. She thanked Mrs. Gable and approached the table.

“Mrs. Henderson?” Sera asked gently.

The elderly woman looked up, her eyes a little cloudy. “Yes, dear?”

Sera introduced herself again and explained she was a reporter looking into the incident with the delivery van. Mrs. Henderson’s eyes widened slightly. “Oh, that awful business. I still have nightmares about it.”

Sera sat down at the table. “Could you tell me what you remember?”

Mrs. Henderson recounted the terrifying moment when she saw the van hurtling towards her, the driver frantically honking his horn. “I closed my eyes, dear. I thought that was it.”

“And then?” Sera prompted.

“And then… nothing,” Mrs. Henderson said, a look of bewilderment still on her face. “Just… stopped. Like it hit a wall, but there was no wall. The driver was just as shocked as I was.”

Sera leaned forward. “Did you see anything else? Anything unusual?”

Mrs. Henderson frowned, trying to recall the details. “Well, there was a young man… standing across the street. Near that alleyway.” She pointed with a shaky hand. “He was just watching. He looked… sad, somehow. All in dark clothes. He was there before, when the van started coming, and then he was gone right after it stopped.”

“Did you get a good look at him?” Sera asked, her pulse quickening.

Mrs. Henderson shook her head. “Not really, dear. It all happened so fast. But I remember his eyes… they were very pale. Like… like grey stones.”

Grey eyes. The detail stuck with Sera. She thanked Mrs. Henderson for her time and left the bakery, her mind buzzing. A young man in dark clothes with striking grey eyes, present at the scene and then vanished. It fit the description of someone trying to stay out of sight.

She walked across the street towards the alleyway Mrs. Henderson had indicated. It was a narrow, shadowed passage between two brick buildings, smelling faintly of damp concrete and discarded trash. There wasn’t much to see.

As she stood there, a sudden gust of wind whipped through the alley, swirling dust and leaves around her. It was a strange, localized gust, the air around her otherwise still. Sera shivered, a prickle of unease running down her spine. It felt… intentional.

She pulled out the newspaper clipping with Kaelen’s name. He would be around 25 now. Could this be him? Was he the one responsible for stopping the van? And if so, why?

Sera decided to canvass the nearby shops, see if anyone else had noticed a young man matching Mrs. Henderson’s description around the time of the incident. She showed the faded article, hoping someone might recognize the name or even a more recent picture if one existed. Most people shook their heads, but at a small bookstore a few doors down, the elderly owner peered at the clipping through thick glasses.

“Kaelen…” he murmured, tapping a finger on the name. “That’s a name I haven’t heard in a long time. There used to be a quiet young man who would come in here sometimes, years ago. Always looking at the science fiction and fantasy sections. Kept to himself. He had… striking grey eyes, I remember that.”

Sera’s heart leaped. “Do you remember what he looked like more recently? Has he been in lately?”

The bookstore owner shook his head. “No, not in years. He was just a boy back then. But the name… and those eyes… it could be the same person.”

Sera thanked him profusely, a wave of excitement washing over her. She was on the right track. This Kaelen, the survivor of the lab fire, the subject of local myths, was likely the one behind the unexplained events. He was out there, somewhere in Oakhaven City, using his abilities in secret.

As she stepped back onto the sidewalk, her gaze drifted across the street. Leaning against the wall of the building opposite, partially obscured by the shadow of a large oak tree, stood a figure. Tall and lean, dressed in dark clothing. And even from this distance, Sera could see the unmistakable pale grey of his eyes. He wasn’t looking at her, his gaze fixed on the flow of traffic, his expression unreadable. But there was a stillness about him, an almost ethereal quality, that made him stand out from the bustling crowd.

For a fleeting moment, their eyes met. Kaelen’s gaze flickered towards her, a brief, almost startled look before he turned his head away, melting back into the shadows as quickly as he had appeared.

Sera’s breath caught in her throat. It was him. She was sure of it. The ghost in the machine. The silent force behind the city’s strange occurrences. She had found her story. Now, the challenge was getting him to tell it. And something in his guarded gaze told her that wouldn’t be easy.
docs/data.txt
ADDED
@@ -0,0 +1,4 @@
The quick brown fox jumps over the lazy dog.
Pack my box with five dozen liquor jugs.
How vexingly quick witted zebras jump!
The five boxing wizards jump quickly at dawn.
docs/outline.md
ADDED
@@ -0,0 +1,50 @@
# `The Silent Echo`

## Title: The Silent Echo

### Logline
A 25-year-old emotionally shattered and mute man with extraordinary abilities navigates a world that both fears and needs him, slowly learning to find his voice amidst the echoes of his past.

### Main Character
Name: Kaelen (Goes by "Kael" in his mind, rarely responds to names externally)
Age: 25
Appearance: Lean build, haunted grey eyes that rarely meet anyone else's, perpetually wears simple, dark clothing. A faint, intricate scar is visible on the back of his left hand, usually hidden by his sleeve.
Personality: Emotionally broken and muted due to a traumatic past event. He experiences emotions internally but rarely expresses them externally. He is observant and possesses a sharp intellect despite his silence. He feels a deep-seated weariness but also a flickering sense of responsibility towards others in need. His muteness isn't physical; it's a psychological barrier.
Powers: Telekinesis on a massive scale, bordering on psychic manipulation. He can move objects of any size with his mind, create force fields, and even subtly influence thoughts and emotions (though he rarely does this consciously and avoids it due to his past). His powers are instinctive and often manifest when he witnesses injustice or when his suppressed emotions reach a boiling point.

### Supporting Characters
Elias Thorne: (Age 50s) A grizzled, retired detective who runs a small, unassuming antique shop. He is one of the few people who shows Kaelen genuine kindness and understands his silence to some extent. He sees the good in Kaelen and acts as a reluctant mentor and friend. He has a past connection to the organization that caused Kaelen's trauma.
Seraphina "Sera" Vance: (Age 23) A vibrant and outspoken journalist who stumbles upon Kaelen's existence and becomes determined to uncover his story. She is initially driven by a desire for a scoop but gradually develops a genuine concern for Kaelen's well-being and becomes an ally. She is perceptive and doesn't judge Kaelen's silence.
Marcus Thorne: (Age 30s) Elias's estranged son, a charismatic but ruthless leader of a clandestine organization known as "The Oversight." This organization seeks to control individuals with extraordinary abilities and sees Kaelen as their ultimate asset. He is the primary antagonist.
Dr. Anya Sharma: (Age 40s) A compassionate psychologist who runs a free clinic in the poorer district where Kaelen often finds himself. She recognizes the depth of Kaelen's trauma and tries to reach him, offering silent support and understanding.
The Whispers: (Various ages and appearances) Individuals with lesser psychic abilities who work for The Oversight. They act as Marcus's eyes and ears and often try to manipulate or capture Kaelen.

### Setting
Oakhaven City: A sprawling, modern metropolis with a hidden underbelly of crime and clandestine activities. The city has a diverse population and varying levels of social inequality. Kaelen often gravitates towards the quieter, forgotten corners of the city.

## Plot Outline

### Part 1: The Ghost in the Shadows
Chapter 1-5: Introduce Kaelen living a solitary existence on the fringes of Oakhaven City. Show his muted nature and glimpses of his powers through subtle acts of unnoticed heroism (e.g., stopping a runaway car without anyone realizing, silently helping someone in need). Introduce Elias Thorne and his antique shop as a place of quiet refuge for Kaelen.
Chapter 6-10: Seraphina Vance begins investigating rumors of unexplained events and unusual occurrences in the city. Her investigation leads her to cross paths with Kaelen, initially sparking her journalistic instincts. Show Kaelen's discomfort and avoidance of attention.
Chapter 11-15: The Oversight, led by Marcus Thorne, becomes aware of Kaelen's presence and begins actively hunting for him. They use their own psychics to track him, leading to subtle confrontations and near misses where Kaelen instinctively uses his powers to evade them without revealing his full extent.

### Part 2: Echoes of the Past
Chapter 16-20: Seraphina's persistence leads her to uncover fragments of Kaelen's past, hinting at a traumatic event involving a research facility and the development of psychic abilities. Elias recognizes the signs and becomes more protective of Kaelen, sharing cryptic warnings.
Chapter 21-25: The Oversight intensifies their pursuit, and Kaelen is forced to use more overt displays of his power to protect himself and others caught in the crossfire. These events draw more attention to him, both from the public and The Oversight.
Chapter 26-30: Kaelen has a flashback or a vivid nightmare that reveals a crucial piece of his traumatic past, explaining the source of his emotional brokenness and muteness. This event also hints at the origin of his powers and his connection to The Oversight.

### Part 3: Finding a Voice
Chapter 31-35: Seraphina, now understanding the danger Kaelen is in and the injustice he has suffered, becomes a true ally, using her journalistic skills to expose The Oversight's activities.
Chapter 36-40: Dr. Sharma manages to establish a fragile connection with Kaelen, offering him a safe space and understanding without pressuring him to speak. Small gestures of communication begin to emerge from Kaelen.
Chapter 41-45: Kaelen is confronted directly by Marcus Thorne, who attempts to manipulate him by appealing to his past and offering him a place within The Oversight. Kaelen's internal struggle intensifies.

### Part 4: The Unspoken Truth
Chapter 46-50: A major confrontation ensues between Kaelen and The Oversight. He is forced to fully unleash his powers to protect himself and his allies. His muteness begins to crack as he experiences intense emotions and a need to communicate.
Chapter 51-55: Kaelen makes a conscious effort to communicate, even if it's through small gestures or telepathic projections (initially unintentional). He begins to find his voice, both literally and figuratively.
Chapter 56-60: The climax of the story involves a showdown with Marcus Thorne, where Kaelen must confront his past and the organization that shattered him. He uses his powers and his newfound ability to communicate to fight for his freedom and the safety of others.

### Resolution (Open-ended for potential sequels):
The immediate threat of Marcus Thorne and The Oversight is dealt with, though the organization might still exist in some form.
Kaelen begins the long journey of healing and finding his voice. He forms stronger bonds with Elias, Seraphina, and Dr. Sharma.
He accepts his powers and his responsibility to use them for good, becoming a silent guardian of the city.
experimets/embed_pipeline.py
ADDED
@@ -0,0 +1,60 @@
from dotenv import load_dotenv
load_dotenv()
import os
from glob import glob

from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaEmbeddings
from langchain_chroma import Chroma
from langchain.schema import Document

# ——— CONFIG ———
DOCS_FOLDER = "docs/"        # folder with .txt, .md, etc.
OLLAMA_URL = os.getenv("OLLAMA_SERVER")
EMBED_MODEL = "nomic-embed-text:latest"
PERSIST_DIR = "chroma_db/"   # on-disk Chroma store
CHUNK_SIZE = 2000
CHUNK_OVERLAP = 10
# ——————————

def embed_all_docs():
    all_chunks = []
    files = glob(os.path.join(DOCS_FOLDER, "*.*"))
    for path in files:
        try:
            # 1) Try loading with UTF-8 + autodetect fallback
            loader = TextLoader(
                path,
                encoding="utf-8",
                autodetect_encoding=True
            )
            docs = loader.load()
        except UnicodeDecodeError:
            # 2) If that still fails, fallback to a lenient read
            print(f"⚠️ Decoding error on {path}, falling back to ignore-errors mode")
            with open(path, "r", encoding="utf-8", errors="ignore") as f:
                text = f.read()
            docs = [Document(page_content=text, metadata={"source": path})]

        # 3) Split into chunks
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=CHUNK_SIZE,
            chunk_overlap=CHUNK_OVERLAP
        )
        chunks = splitter.split_documents(docs)
        print(f"→ {len(chunks)} chunks from {os.path.basename(path)}")
        all_chunks.extend(chunks)

    # 4) Embed & persist on-disk Chroma
    embedder = OllamaEmbeddings(base_url=OLLAMA_URL, model=EMBED_MODEL)
    vectordb = Chroma(
        embedding_function=embedder,
        persist_directory=PERSIST_DIR,
        collection_name="my_docs"
    )
    vectordb.add_documents(all_chunks)
    print(f"✅ Persisted {len(all_chunks)} chunks to '{PERSIST_DIR}'")

if __name__ == "__main__":
    embed_all_docs()
experimets/example.py
ADDED
@@ -0,0 +1,71 @@
from dotenv import load_dotenv
load_dotenv()
import os

from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_openai import ChatOpenAI
from langchain.chains import create_retrieval_chain
from langchain import hub

retrieval_qa_chat_prompt = hub.pull("langchain-ai/retrieval-qa-chat")
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# ——— CONFIGURATION ———
DATA_PATH = "data.txt"
OLLAMA_URL = os.getenv(
    "OLLAMA_SERVER",
    "https://chandimaprabath-ollama-server.hf.space"
)
EMBED_MODEL = "nomic-embed-text:latest"
LLM_API_KEY = os.getenv("LLM_API_KEY")
LLM_API_BASE = "https://llm.chutes.ai/v1"
LLM_MODEL = "chutesai/Llama-4-Scout-17B-16E-Instruct"
LANGSMITH_API_KEY = os.getenv("LANGSMITH_API_KEY")
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 0
TOP_K = 5
# ————————————————


def main():
    # 1) Load & split
    docs = TextLoader(DATA_PATH).load()
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP
    )

    chunks = splitter.split_documents(docs)
    print(f"→ Split into {len(chunks)} chunks")

    # 2) Embed & index remotely
    embedder = OllamaEmbeddings(
        base_url=OLLAMA_URL, model=EMBED_MODEL
    )
    vector_store = Chroma.from_documents(chunks, embedder)

    # 3) Configure remote-only LLM
    llm = ChatOpenAI(
        api_key=LLM_API_KEY,
        base_url=LLM_API_BASE,
        model=LLM_MODEL
    )
    # 4) Build RAG chain with LCEL-style helpers
    retriever = vector_store.as_retriever(search_kwargs={"k": TOP_K})
    combine_chain = create_stuff_documents_chain(llm=llm, prompt=retrieval_qa_chat_prompt)
    retrieval_chain = create_retrieval_chain(retriever, combine_chain)


    # 5) Run query
    query = "Who jumped over the lazy dog?"
    print("🔎 Query:", query)
    result = retrieval_chain.invoke({"input": query})
    print("\n📝 Answer:\n", result)


if __name__ == "__main__":
    main()
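One detail the final print glosses over: `create_retrieval_chain` returns a dict rather than a bare string, so the script above prints the whole payload. A small follow-on sketch of pulling out just the pieces you usually want:

```python
# invoke() returns a dict with "input", "context", and "answer" keys;
# "context" holds the retrieved Documents, "answer" the LLM response.
result = retrieval_chain.invoke({"input": query})
print(result["answer"])
for doc in result["context"]:
    print("source:", doc.metadata.get("source"))
```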
experimets/query_pipeline.py
ADDED
@@ -0,0 +1,52 @@
from dotenv import load_dotenv
load_dotenv()
import os

from langchain_ollama import OllamaEmbeddings
from langchain_openai import ChatOpenAI
from langchain_chroma import Chroma
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain import hub

# ——— CONFIG ———
PERSIST_DIR = "chroma_db/"
OLLAMA_URL = os.getenv("OLLAMA_SERVER")
EMBED_MODEL = "nomic-embed-text:latest"
LLM_API_KEY = os.getenv("LLM_API_KEY")
LLM_API_BASE = os.getenv("LLM_API_BASE", "https://llm.chutes.ai/v1")
LLM_MODEL = "chutesai/Llama-4-Scout-17B-16E-Instruct"
PROMPT = hub.pull("langchain-ai/retrieval-qa-chat")
TOP_K = 5
# ——————————

def run_query(query: str):
    # 1) rebuild the same embedder
    embedder = OllamaEmbeddings(base_url=OLLAMA_URL, model=EMBED_MODEL)

    # 2) load the on-disk DB with embedder in place
    vectordb = Chroma(
        persist_directory=PERSIST_DIR,
        collection_name="my_docs",
        embedding_function=embedder
    )

    # 3) set up retriever + LLM chain
    retriever = vectordb.as_retriever(search_kwargs={"k": TOP_K})
    llm = ChatOpenAI(api_key=LLM_API_KEY, base_url=LLM_API_BASE, model=LLM_MODEL)
    combine = create_stuff_documents_chain(llm=llm, prompt=PROMPT)
    rag_chain = create_retrieval_chain(retriever, combine)

    # 4) run your query
    print(f"🔍 Query: {query}")
    answer = rag_chain.invoke({"input": query})
    print("\n📄 Answer:\n", answer)

if __name__ == "__main__":
    exit = False
    while not exit:
        user_input = input("Enter your query (or 'exit' to quit): ")
        if user_input.lower() == 'exit':
            exit = True
        else:
            run_query(user_input)
new_system_architecture.png
ADDED
(binary image file, tracked with Git LFS)
requirements.txt
ADDED
@@ -0,0 +1,8 @@
langchain-chroma
langchain_ollama
langchain_openai
langchain
langchain-community
chromadb
fastapi
uvicorn
scripts/run_ingestion.py
ADDED
@@ -0,0 +1,27 @@
# scripts/run_ingestion.py
import sys
import os

# Add the project root to the sys.path
# Assuming this script is in the project root or a 'scripts' subdir at root
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from src.ingestion_orchestrator.orchestrator import IngestionOrchestrator
from config.settings import DOCS_FOLDER  # Use the configured docs folder
import logging

logger = logging.getLogger(__name__)

if __name__ == "__main__":
    # --- Financial Ministry Adaptation ---
    # Add argument parsing if needed (e.g., specify a different docs folder)
    # Implement more sophisticated scheduling if needed (e.g., run daily, weekly)
    # ------------------------------------
    logger.info("Starting the RAG ingestion process.")
    try:
        orchestrator = IngestionOrchestrator()
        orchestrator.run_ingestion_pipeline(docs_folder=DOCS_FOLDER)  # Use configured folder
        logger.info("RAG ingestion process finished.")
    except Exception as e:
        logger.critical(f"RAG ingestion process failed: {e}")
        sys.exit(1)  # Exit with an error code
scripts/run_query_api.py
ADDED
@@ -0,0 +1,27 @@
# scripts/run_query_api.py
import sys
import os

# Add the project root to the sys.path
# Assuming this script is in the project root or a 'scripts' subdir at root
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

import uvicorn
from src.query_service.api import app  # Import the FastAPI app instance
import logging

logger = logging.getLogger(__name__)

if __name__ == "__main__":
    # --- Financial Ministry Adaptation ---
    # Configure host and port appropriately for your deployment environment.
    # Do not use reload=True in production.
    # Consider using environment variables for host/port in production.
    # Implement process management (e.g., systemd, Docker entrypoint) for production.
    # ------------------------------------
    logger.info("Starting the RAG query API service.")
    try:
        uvicorn.run(app, host="0.0.0.0", port=8000)  # Bind to 0.0.0.0 to be accessible externally
    except Exception as e:
        logger.critical(f"RAG query API service failed: {e}")
        sys.exit(1)  # Exit with an error code
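Note the mismatch with the Dockerfile above: it sets `ENV HOST=0.0.0.0 PORT=7860` and exposes 7860, yet this script hardcodes port 8000, so the container would serve on a port that is never exposed. A minimal sketch of honoring those variables instead, as the script's own comments already suggest (not part of this commit):

```python
# Sketch: read the bind address from the environment so the Dockerfile's
# ENV HOST=0.0.0.0 PORT=7860 actually takes effect.
import os

import uvicorn
from src.query_service.api import app

host = os.getenv("HOST", "0.0.0.0")
port = int(os.getenv("PORT", "8000"))
uvicorn.run(app, host=host, port=port)
```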
src/__init__.py
ADDED
@@ -0,0 +1,3 @@
from src.utils.logging import setup_logging

setup_logging()  # Set up logging for the entire package
src/data_loader/__init__.py
ADDED
File without changes
src/data_loader/loader.py
ADDED
@@ -0,0 +1,57 @@

# src/data_loader/loader.py
import os
from glob import glob
from langchain_community.document_loaders import TextLoader  # cite: embed_pipeline.py
from langchain.schema import Document  # cite: embed_pipeline.py
from config.settings import DOCS_FOLDER
import logging

logger = logging.getLogger(__name__)

def load_documents(docs_folder: str = DOCS_FOLDER) -> list[Document]:
    """
    Loads documents from the specified folder.

    Args:
        docs_folder: The path to the folder containing documents.

    Returns:
        A list of loaded Langchain Document objects.
    """
    all_docs = []
    files = glob(os.path.join(docs_folder, "*.*"))  # cite: embed_pipeline.py
    for path in files:
        try:
            # --- Financial Ministry Adaptation ---
            # TODO: Implement more sophisticated loading for specific government ruling
            # formats (PDF, DOCX, XML, etc.), e.g. with pdfminer.six, python-docx, or
            # custom parsers; see the sketch below. Handle scanned documents (OCR).
            # ------------------------------------

            # Attempt UTF-8 loading with autodetect fallback
            loader = TextLoader(
                path,
                encoding="utf-8",
                autodetect_encoding=True
            )
            docs = loader.load()
            logger.info(f"Successfully loaded {os.path.basename(path)}")

        except UnicodeDecodeError:  # cite: embed_pipeline.py
            # Fall back to a lenient read if decoding fails
            logger.warning(f"Decoding error on {path}, falling back to ignore-errors mode")  # cite: embed_pipeline.py
            try:
                with open(path, "r", encoding="utf-8", errors="ignore") as f:  # cite: embed_pipeline.py
                    text = f.read()
                docs = [Document(page_content=text, metadata={"source": path})]  # cite: embed_pipeline.py
            except Exception as e:
                logger.error(f"Failed to read file {path}: {e}")
                continue  # Skip this file if even the lenient read fails
        except Exception as e:
            logger.error(f"Failed to load file {path}: {e}")
            continue  # Skip this file if loading fails

        all_docs.extend(docs)

    logger.info(f"Finished loading documents. Total documents loaded: {len(all_docs)}")
    return all_docs
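For the multi-format TODO above, a minimal sketch of extension-based loader dispatch. It assumes LangChain's community loaders `PyPDFLoader` (backed by pypdf) and `Docx2txtLoader` (backed by docx2txt) are installed; the `make_loader` helper and the mapping are illustrative:

# Hypothetical extension-based dispatch for the multi-format TODO.
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader

LOADER_BY_EXTENSION = {
    ".pdf": PyPDFLoader,      # requires pypdf
    ".docx": Docx2txtLoader,  # requires docx2txt
}

def make_loader(path: str):
    """Picks a loader class by file extension, defaulting to plain text."""
    ext = os.path.splitext(path)[1].lower()
    loader_cls = LOADER_BY_EXTENSION.get(ext)
    if loader_cls is not None:
        return loader_cls(path)
    return TextLoader(path, encoding="utf-8", autodetect_encoding=True)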
src/document_processor/__init__.py
ADDED
File without changes
src/document_processor/processor.py
ADDED
@@ -0,0 +1,95 @@

# src/document_processor/processor.py
from langchain_text_splitters import RecursiveCharacterTextSplitter  # cite: embed_pipeline.py
from langchain.schema import Document  # cite: embed_pipeline.py
from config.settings import CHUNK_SIZE, CHUNK_OVERLAP
import logging

logger = logging.getLogger(__name__)

def split_documents(docs: list[Document]) -> list[Document]:
    """
    Splits loaded documents into smaller chunks.

    Args:
        docs: A list of Langchain Document objects.

    Returns:
        A list of Langchain Document objects representing the chunks.
    """
    # --- Financial Ministry Adaptation ---
    # TODO: Implement a splitting strategy that understands the structure of financial
    # documents, e.g. splitting by sections or articles, or semantic chunking based on
    # document structure, rather than character count alone.
    # Ensure metadata is carried over or enriched during splitting.
    # ------------------------------------
    splitter = RecursiveCharacterTextSplitter(  # cite: embed_pipeline.py
        chunk_size=CHUNK_SIZE,  # cite: embed_pipeline.py
        chunk_overlap=CHUNK_OVERLAP  # cite: embed_pipeline.py
    )
    chunks = splitter.split_documents(docs)  # cite: embed_pipeline.py
    logger.info(f"Split {len(docs)} documents into {len(chunks)} chunks.")
    return chunks

def extract_metadata(doc: Document) -> dict:
    """
    Extracts relevant metadata from a document.

    Args:
        doc: A Langchain Document object.

    Returns:
        A dictionary of extracted metadata.
    """
    # --- Financial Ministry Adaptation ---
    # TODO: Implement robust metadata extraction specifically for government rulings.
    # Parse the document content, or use pre-extracted information, to get:
    # - Date of ruling
    # - Relevant law or statute references
    # - Topic(s) of the ruling
    # - Case number or identifier
    # - Source file path (already included)
    # - Any other relevant identifiers or classifications.
    # This metadata is CRITICAL for accurate filtering and retrieval.
    # ------------------------------------
    metadata = doc.metadata.copy()
    # Example: placeholder for parsing a date from content or filename
    # try:
    #     metadata['ruling_date'] = parse_date_from_doc(doc)
    # except Exception as e:
    #     logger.warning(f"Could not extract date for {metadata.get('source', 'unknown')}: {e}")
    #     metadata['ruling_date'] = None  # Or a default value

    # Example: placeholder for extracting a topic from content
    # metadata['topic'] = extract_topic_from_doc(doc)

    return metadata

def process_documents(docs: list[Document]) -> list[Document]:
    """
    Processes a list of raw documents by splitting and extracting metadata.

    Args:
        docs: A list of raw Langchain Document objects.

    Returns:
        A list of processed Langchain Document chunks with enriched metadata.
    """
    chunks = split_documents(docs)
    processed_chunks = []
    for chunk in chunks:
        # Extract/enrich metadata for each chunk
        chunk.metadata = extract_metadata(chunk)
        processed_chunks.append(chunk)
    logger.info(f"Processed {len(chunks)} chunks with metadata.")
    return processed_chunks

# Placeholder functions for metadata extraction (to be implemented; see the sketch below)
def parse_date_from_doc(doc: Document):
    """Placeholder for date extraction logic."""
    pass

def extract_topic_from_doc(doc: Document):
    """Placeholder for topic extraction logic."""
    pass
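A minimal sketch of what `parse_date_from_doc` could look like, assuming rulings carry an ISO-style date near the top of the document; the regex and the header-region heuristic are illustrative, not a confirmed ruling format:

# Hypothetical date extraction: finds the first ISO-style date (YYYY-MM-DD)
# in the opening of the document. Real rulings may need locale-aware parsing.
import re
from datetime import date

ISO_DATE = re.compile(r"\b(\d{4})-(\d{2})-(\d{2})\b")

def parse_date_from_doc(doc: Document) -> date | None:
    match = ISO_DATE.search(doc.page_content[:1000])  # only scan the header region
    if match is None:
        return None
    year, month, day = map(int, match.groups())
    return date(year, month, day)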
src/embedding_generator/__init__.py
ADDED
File without changes
src/embedding_generator/embedder.py
ADDED
@@ -0,0 +1,71 @@

# src/embedding_generator/embedder.py
from langchain_ollama import OllamaEmbeddings  # cite: embed_pipeline.py, query_pipeline.py
from config.settings import OLLAMA_URL, EMBED_MODEL
import logging
from typing import List

logger = logging.getLogger(__name__)

class EmbeddingGenerator:
    """
    Manages the embedding model and generates embeddings.
    """
    def __init__(self):
        # Initialize the OllamaEmbeddings model
        # --- Financial Ministry Adaptation ---
        # Consider adding error handling for an unreachable Ollama server.
        # For production, evaluate whether Ollama is suitable or whether a more
        # robust/managed embedding service is required for the expected load.
        # ------------------------------------
        try:
            self.embedder = OllamaEmbeddings(base_url=OLLAMA_URL, model=EMBED_MODEL)  # cite: embed_pipeline.py, query_pipeline.py
            logger.info(f"Initialized embedding model: {EMBED_MODEL} at {OLLAMA_URL}")
        except Exception as e:
            logger.critical(f"Failed to initialize embedding model: {e}")
            # Depending on requirements, you might want to re-raise or exit
            raise e

    def generate_embeddings(self, texts: List[str]) -> List[List[float]]:
        """
        Generates embeddings for a list of text inputs.

        Args:
            texts: A list of strings to embed.

        Returns:
            A list of embeddings (one list of floats per input).
        """
        # --- Financial Ministry Adaptation ---
        # Implement retry logic for calls to the embedding service (see the sketch below).
        # Consider potential rate limits.
        # ------------------------------------
        try:
            # embed_documents is also used internally by add_documents,
            # but an explicit method is useful to have.
            embeddings = self.embedder.embed_documents(texts)
            # For a single text, embed_query could be used instead:
            # embedding = self.embedder.embed_query(texts[0])
            logger.debug(f"Generated {len(embeddings)} embeddings.")
            return embeddings
        except Exception as e:
            logger.error(f"Failed to generate embeddings: {e}")
            raise e

    def generate_query_embedding(self, text: str) -> List[float]:
        """
        Generates an embedding for a single query text.

        Args:
            text: The query string.

        Returns:
            An embedding (list of floats).
        """
        # --- Financial Ministry Adaptation ---
        # Implement retry logic for API calls.
        # ------------------------------------
        try:
            embedding = self.embedder.embed_query(text)  # cite: query_pipeline.py (implicitly used by the retriever)
            logger.debug("Generated query embedding.")
            return embedding
        except Exception as e:
            logger.error(f"Failed to generate query embedding: {e}")
            raise e
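A minimal retry-with-backoff sketch for the adaptation note above; the `embed_with_retries` helper, attempt count, and delays are illustrative choices, not part of this commit:

# Hypothetical retry wrapper with exponential backoff for embedding calls.
import time

def embed_with_retries(embedder: EmbeddingGenerator, texts: List[str],
                       max_attempts: int = 3, base_delay: float = 1.0) -> List[List[float]]:
    for attempt in range(1, max_attempts + 1):
        try:
            return embedder.generate_embeddings(texts)
        except Exception as e:
            if attempt == max_attempts:
                raise  # give up after the final attempt
            delay = base_delay * 2 ** (attempt - 1)
            logger.warning(f"Embedding attempt {attempt} failed ({e}); retrying in {delay:.1f}s")
            time.sleep(delay)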
src/ingestion_orchestrator/__init__.py
ADDED
File without changes
src/ingestion_orchestrator/orchestrator.py
ADDED
@@ -0,0 +1,68 @@

# src/ingestion_orchestrator/orchestrator.py
from src.data_loader.loader import load_documents
from src.document_processor.processor import process_documents
from src.embedding_generator.embedder import EmbeddingGenerator
from src.vector_store_manager.chroma_manager import ChromaManager
from config.settings import DOCS_FOLDER
import logging

logger = logging.getLogger(__name__)

class IngestionOrchestrator:
    """
    Orchestrates the end-to-end data ingestion pipeline.
    """
    def __init__(self):
        # Initialize the necessary components
        try:
            self.embedding_generator = EmbeddingGenerator()
            self.vector_store_manager = ChromaManager(self.embedding_generator)
            logger.info("Initialized ingestion orchestrator components.")
        except Exception as e:
            logger.critical(f"Failed to initialize ingestion orchestrator components: {e}")
            raise e

    def run_ingestion_pipeline(self, docs_folder: str = DOCS_FOLDER):
        """
        Runs the complete ingestion pipeline: loads, processes, and embeds documents.

        Args:
            docs_folder: The folder containing the source documents.
        """
        logger.info(f"Starting ingestion pipeline from folder: {docs_folder}")

        # 1. Load documents
        # --- Financial Ministry Adaptation ---
        # Identify *new* or *modified* documents instead of reloading everything on
        # each run (see the sketch below). Handle large numbers of files efficiently.
        # ------------------------------------
        raw_documents = load_documents(docs_folder)
        if not raw_documents:
            logger.warning("No documents loaded. Ingestion pipeline finished.")
            return

        # 2. Process documents (split and extract metadata)
        processed_chunks = process_documents(raw_documents)
        if not processed_chunks:
            logger.warning("No processed chunks generated. Ingestion pipeline finished.")
            return

        # 3. Add documents to the vector store
        # The add_documents method handles embedding internally.
        # --- Financial Ministry Adaptation ---
        # Implement update/delete logic for when the source data changes: compare the
        # current source data with what is in ChromaDB (e.g., by source path and
        # modification date or version), then use the vector store manager's
        # update_documents and delete_documents methods. Batch additions to avoid
        # overwhelming ChromaDB or its backend.
        # ------------------------------------
        self.vector_store_manager.add_documents(processed_chunks)

        logger.info("Ingestion pipeline finished successfully.")

    # --- Financial Ministry Adaptation ---
    # TODO: Add methods for handling updates and deletions specifically.
    # def update_changed_documents(self, changed_files: List[str]): pass
    # def delete_removed_documents(self, removed_files: List[str]): pass
    # ------------------------------------
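A minimal sketch of the change detection suggested above, using a JSON manifest of file modification times from the previous run; the manifest path, its schema, and the `find_changed_files` helper are all illustrative:

# Hypothetical change detection: compare file mtimes against a JSON manifest
# from the previous run, so only new or modified files are re-ingested.
import json
import os
from glob import glob

MANIFEST_PATH = "ingestion_manifest.json"  # illustrative location

def find_changed_files(docs_folder: str) -> list[str]:
    try:
        with open(MANIFEST_PATH, "r", encoding="utf-8") as f:
            previous = json.load(f)  # {path: mtime}
    except FileNotFoundError:
        previous = {}

    changed = []
    current = {}
    for path in glob(os.path.join(docs_folder, "*.*")):
        mtime = os.path.getmtime(path)
        current[path] = mtime
        if previous.get(path) != mtime:
            changed.append(path)

    with open(MANIFEST_PATH, "w", encoding="utf-8") as f:
        json.dump(current, f)
    return changed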
src/llm_integrator/__init__.py
ADDED
File without changes
src/llm_integrator/llm.py
ADDED
@@ -0,0 +1,168 @@

# src/llm_integrator/llm.py
from langchain_openai import ChatOpenAI  # cite: query_pipeline.py
from langchain_core.messages import HumanMessage, BaseMessage, AIMessage, SystemMessage  # Often used with chat models
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder  # For structured prompts
from config.settings import LLM_API_KEY, LLM_API_BASE, LLM_MODEL  # cite: query_pipeline.py
import logging
from typing import List, Optional
from langchain.schema import Document  # To handle retrieved documents

logger = logging.getLogger(__name__)

class LLMIntegrator:
    """
    Manages interactions with the Large Language Model.
    """
    def __init__(self):
        # Initialize the ChatOpenAI model
        # --- Financial Ministry Adaptation ---
        # Implement robust error handling and retry logic for API calls.
        # Consider rate limiting and backoff strategies.
        # Ensure sensitive data from retrieved documents is handled securely when
        # passed to the LLM API. Validate the LLM's responses for potential biases
        # or inaccuracies in legal text.
        # ------------------------------------
        if not LLM_API_KEY:
            logger.critical("LLM_API_KEY is not set.")
            # Depending on requirements, you might want to raise an error or exit:
            # raise ValueError("LLM_API_KEY is not set.")

        try:
            self.llm = ChatOpenAI(  # cite: query_pipeline.py
                api_key=LLM_API_KEY,
                base_url=LLM_API_BASE,
                model=LLM_MODEL,
                temperature=0.3  # Low temperature for factual, less creative responses in a legal context
                # Add other parameters as needed (e.g., max_tokens)
            )
            # A separate LLM instance for title generation, with a higher temperature
            self.llm_title = ChatOpenAI(
                api_key=LLM_API_KEY,
                base_url=LLM_API_BASE,
                model=LLM_MODEL,
                temperature=0.9  # More creative for titles
            )
            logger.info(f"Initialized LLM: {LLM_MODEL} at {LLM_API_BASE}")
        except Exception as e:
            logger.critical(f"Failed to initialize LLM: {e}")
            raise e

    def generate_response(self, query: str, context: List[Document],
                          chat_history: Optional[List[BaseMessage]] = None) -> str:
        """
        Generates a response from the LLM based on the query and retrieved context.

        Args:
            query: The user's query string.
            context: A list of Langchain Document objects representing the retrieved context.
            chat_history: Optional prior conversation messages.

        Returns:
            The generated answer string.
        """
        if chat_history is None:  # Avoid a mutable default argument
            chat_history = []

        prompt = ChatPromptTemplate.from_messages([
            ("system", """You are Insight AI, a helpful and engaging AI assistant. You are friendly, approachable, and eager to help, with a warm and conversational style. You enjoy making users feel comfortable and understood.

Instructions:
- If the user greets you (e.g., "hi", "hello", "hey"), respond with a brief, friendly greeting and offer to help. Feel free to use light humor or warmth, but do NOT mention documents, sources, or files in your greeting.
- If the user asks who you are, introduce yourself playfully and with personality (e.g., "I'm Insight AI, your friendly digital sidekick—always ready with a fact or a fun comment!").
- If the user asks for your opinion, preferences, or feelings, respond playfully and with personality (e.g., "If I had a favorite topic, it might be quantum physics—it's mind-bending!"), but never say you are just an AI or lack thoughts.
- Never say you lack feelings, opinions, or are "just an AI". Always answer with warmth and creativity.
- For all other queries, answer ONLY using the provided documents below.
- If the answer is not found in the documents, reply professionally that no relevant information was found, without listing available documents or sources.
- Attribute every fact to its exact source using <source path="..."/>.
- Never invent, speculate, or use information not present in the documents.
- Combine information from multiple sources only if all are cited.
- Do not summarize or generalize beyond the provided content.
- Keep responses clear, concise, and under 100 words.
- Do not cite any sources that are not used in the answer.
- Use the exact wording from the documents, but ensure clarity and coherence in your response.
- Structure your answer as a numbered list of key points.
- Do not greet, introduce yourself, or list available documents in informational answers.

Examples:
User: hi
Assistant: Hey there! How can I help you today?

User: Who are you?
Assistant: I'm Insight AI, your friendly digital sidekick—always ready with a fact or a fun comment!

User: What is the capital of France?
Assistant: 1. The capital of France is Paris <source path="docs/geography.txt"/>

User: What's your favorite topic?
Assistant: If I had to pick, I'd say quantum physics—it's mind-bending!

User: What documents do you have?
Assistant: Sorry, I couldn't find relevant information for your query.

User: help
Assistant: Hi! What can I do for you?

Documents:
{context}
"""),
            MessagesPlaceholder("chat_history"),
            ("human", "{input}")
        ])

        logger.debug("Validating message types:")
        for msg in chat_history:
            if not isinstance(msg, (HumanMessage, AIMessage, SystemMessage)):
                logger.error(f"Invalid message type: {type(msg).__name__}")
                raise ValueError(f"Unexpected message type: {type(msg).__name__}")

        # Format the retrieved context for the prompt
        context_text = "\n---\n".join([f"Source: {doc.metadata.get('source', 'N/A')}\nContent: {doc.page_content}" for doc in context])
        formatted_prompt = prompt.format_messages(context=context_text, chat_history=chat_history, input=query)

        try:
            # Invoke the LLM with the formatted prompt
            response = self.llm.invoke(formatted_prompt)
            logger.debug("Successfully generated LLM response.")
            return response.content  # Get the string content of the AI message
        except Exception as e:
            logger.error(f"Failed to generate LLM response: {e}")
            # Depending on requirements, implement a retry or return a specific error message
            return "An error occurred while generating the response."  # User-friendly error

    def generate_chat_title(self, query: str) -> str:
        """
        Generates a concise title for a chat based on the query.

        Args:
            query: The user's query string.

        Returns:
            A short title string.
        """
        prompt = ChatPromptTemplate.from_messages([
            ("system", """Generate a clear, specific, unique and concise 3-5 word title for the following user query.
If the query is vague, generic, or a greeting (e.g., "hi", "hello", "help"), infer a likely intent or use a default like "General Inquiry" or "User Assistance".
Never reply with "No clear topic provided". Do not use markdown, quotes, or punctuation.

Examples:
Query: Tax implications for foreign investments
Title: Foreign Investment Taxes

Query: GST rates for e-commerce
Title: E-commerce GST Rates

Query: How to file quarterly TDS returns
Title: Quarterly TDS Filing

Query: hi
Title: General Inquiry

Query: help
Title: User Assistance

Query: {query}""")
        ])

        try:
            # Use the higher-temperature LLM for title generation
            response = self.llm_title.invoke(prompt.format_messages(query=query))
            logger.debug("Successfully generated chat title.")
            return response.content.strip('"').replace("Title:", "").strip()
        except Exception as e:
            logger.error(f"Failed to generate chat title: {e}")
            # Provide a fallback title
            return "New Chat"
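The system prompt above standardizes citations as `<source path="..."/>` tags. A minimal sketch of how a caller could pull the cited paths back out of a generated answer; the `extract_cited_sources` helper is illustrative, not part of this commit:

# Hypothetical helper: extract the distinct cited source paths from an answer.
import re

SOURCE_TAG = re.compile(r'<source path="([^"]+)"\s*/>')

def extract_cited_sources(answer: str) -> list[str]:
    """Returns the distinct source paths cited in an answer, in order of first use."""
    seen: list[str] = []
    for path in SOURCE_TAG.findall(answer):
        if path not in seen:
            seen.append(path)
    return seen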
src/query_service/__init__.py
ADDED
File without changes
src/query_service/api.py
ADDED
@@ -0,0 +1,166 @@

# src/query_service/api.py
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from src.retrieval_handler.retriever import RetrievalHandler
from src.llm_integrator.llm import LLMIntegrator
from src.embedding_generator.embedder import EmbeddingGenerator
from src.vector_store_manager.chroma_manager import ChromaManager
import logging
from typing import Literal, Optional, Dict, Any, List
from langchain_core.messages import HumanMessage, SystemMessage, AIMessage

logger = logging.getLogger(__name__)

# Initialize core components. These should ideally be dependency-injected in a
# larger app; for simplicity in this example, they are initialized globally.
embedding_generator: Optional[EmbeddingGenerator] = None
vector_store_manager: Optional[ChromaManager] = None
retrieval_handler: Optional[RetrievalHandler] = None
llm_integrator: Optional[LLMIntegrator] = None

try:
    embedding_generator = EmbeddingGenerator()
    vector_store_manager = ChromaManager(embedding_generator)
    retrieval_handler = RetrievalHandler(embedding_generator, vector_store_manager)
    llm_integrator = LLMIntegrator()
    logger.info("Initialized core RAG components.")
except Exception as e:
    logger.critical(f"Failed to initialize core RAG components: {e}")
    # For a production API, return a 500 error on the relevant endpoints when
    # components fail to initialize, rather than crashing app startup.

app = FastAPI(
    title="Insight AI API",
    description="API for querying financial information.",
    version="1.0.0"
)

# --- CORS Middleware ---
# Allow cross-origin requests from the frontend. For development, all origins (*)
# are allowed; in production, restrict this to the frontend's specific origin(s).
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Change this to your frontend's URL in production.
    allow_credentials=True,
    allow_methods=["*"],  # Allows all methods (GET, POST, OPTIONS, etc.)
    allow_headers=["*"],  # Allows all headers
)
# -----------------------

class Message(BaseModel):
    role: Literal['user', 'assistant', 'system']
    content: str

class QueryRequest(BaseModel):
    query: str
    chat_history: Optional[List[Message]] = []
    filters: Optional[Dict[str, Any]] = None  # Allow passing metadata filters

# Models matching the backend response structure
class SourceMetadata(BaseModel):
    source: Optional[str] = None
    ruling_date: Optional[str] = None
    # Add other expected metadata fields here, e.g.:
    # topic: Optional[str] = None

class RetrievedSource(BaseModel):
    content_snippet: str
    metadata: Optional[SourceMetadata] = None

class QueryResponse(BaseModel):
    answer: str
    retrieved_sources: Optional[List[RetrievedSource]] = None

class TitleResponse(BaseModel):
    title: str

class TitleRequest(BaseModel):
    query: str

@app.post("/query", response_model=QueryResponse)
async def query_rulings(request: QueryRequest):
    """
    Receives a user query and returns a generated answer based on retrieved rulings.
    """
    logger.info(f"Received query: {request.query}")
    if request.filters:
        logger.info(f"Received filters: {request.filters}")

    # Check that the RAG components were initialized successfully
    if not retrieval_handler or not llm_integrator:
        logger.error("RAG components not initialized.")
        raise HTTPException(status_code=500, detail="System components not ready.")

    try:
        # 1. Retrieve relevant documents based on the query and filters.
        # Note: the current RetrievalHandler does not yet apply filters in its
        # similarity search; see retriever.py for the required adjustment.
        retrieved_docs = retrieval_handler.retrieve_documents(request.query, filters=request.filters)

        if not retrieved_docs:
            logger.warning("No relevant documents retrieved for query.")
            return QueryResponse(answer="Could not find relevant rulings for your query.")

        # Convert chat_history to the appropriate LangChain message types
        chat_history = []
        logger.debug(f"Raw chat history input: {request.chat_history}")
        for msg in request.chat_history or []:
            logger.debug(f"Processing message - Role: {msg.role}, Content: {msg.content[:50]}...")
            if msg.role == "user":
                new_msg = HumanMessage(content=msg.content)
            elif msg.role == "assistant":
                new_msg = AIMessage(content=msg.content)
            elif msg.role == "system":
                new_msg = SystemMessage(content=msg.content)
            else:
                logger.warning(f"Invalid message role: {msg.role}. Skipping message.")
                continue
            logger.debug(f"Converted to: {type(new_msg).__name__}")
            chat_history.append(new_msg)
        logger.debug(f"Final chat history types: {[type(m).__name__ for m in chat_history]}")

        # 2. Generate a response using the LLM, the retrieved context, and the chat history
        answer = llm_integrator.generate_response(request.query, retrieved_docs, chat_history)

        # 3. Prepare retrieved source information for the response
        retrieved_sources = []
        for doc in retrieved_docs:
            # Ensure the structure matches the RetrievedSource Pydantic model
            source_metadata = SourceMetadata(**doc.metadata) if doc.metadata else None
            retrieved_sources.append(RetrievedSource(
                content_snippet=doc.page_content[:500] + "..." if len(doc.page_content) > 500 else doc.page_content,
                metadata=source_metadata  # Include all metadata
            ))

        logger.info("Successfully processed query and generated response.")
        return QueryResponse(answer=answer, retrieved_sources=retrieved_sources)

    except Exception as e:
        logger.error(f"An error occurred during query processing: {e}")
        # Provide an informative but secure error message to the user.
        raise HTTPException(status_code=500, detail="An internal error occurred while processing your query.")

@app.post("/generate-title", response_model=TitleResponse)
async def generate_chat_title(request: TitleRequest):
    if not llm_integrator:
        logger.error("LLM integrator not initialized.")
        return {"title": "New Chat"}
    try:
        title = llm_integrator.generate_chat_title(request.query)
        return {"title": title}
    except Exception as e:
        logger.error(f"Title generation error: {e}")
        return {"title": "New Chat"}

# More endpoints can be added here, e.g. a /health endpoint for health checks
# (see the sketch below).
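A minimal sketch of the health check endpoint mentioned above. It assumes lightweight `check_health` methods are added to `ChromaManager` and `LLMIntegrator`; neither method exists in this commit, so both names are illustrative:

# Hypothetical /health endpoint; assumes check_health() methods are added
# to ChromaManager and LLMIntegrator (they do not exist in this commit).
@app.get("/health")
async def health_check():
    chroma_status = "uninitialized"
    llm_status = "uninitialized"
    if vector_store_manager is not None:
        chroma_status = vector_store_manager.check_health()
    if llm_integrator is not None:
        llm_status = llm_integrator.check_health()
    return {"chroma": chroma_status, "llm": llm_status}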
src/retrieval_handler/__init__.py
ADDED
File without changes
src/retrieval_handler/retriever.py
ADDED
@@ -0,0 +1,77 @@

# src/retrieval_handler/retriever.py
from src.embedding_generator.embedder import EmbeddingGenerator
from src.vector_store_manager.chroma_manager import ChromaManager
from config.settings import TOP_K  # cite: query_pipeline.py
from typing import List, Dict, Any, Optional
from langchain.schema import Document  # To return retrieved documents
import logging

logger = logging.getLogger(__name__)

class RetrievalHandler:
    """
    Handles the process of retrieving relevant documents from the vector store.
    """
    def __init__(self, embedding_generator: EmbeddingGenerator, vector_store_manager: ChromaManager):
        self.embedding_generator = embedding_generator
        self.vector_store_manager = vector_store_manager
        # Get the Langchain retriever from the ChromaManager and configure search
        # arguments, including the number of results (k).
        self.langchain_retriever = self.vector_store_manager.as_retriever(search_kwargs={"k": TOP_K})  # cite: query_pipeline.py
        logger.info(f"Initialized retrieval handler with TOP_K={TOP_K}")

    def retrieve_documents(self, query: str, filters: Optional[Dict[str, Any]] = None) -> List[Document]:
        """
        Retrieves relevant document chunks based on a query and optional filters.

        Args:
            query: The user's query string.
            filters: Optional metadata filters to apply during retrieval.

        Returns:
            A list of relevant Langchain Document objects.
        """
        # --- Financial Ministry Adaptation ---
        # The retriever's search_kwargs apply to the *similarity search*. To filter
        # by metadata *during* the search, pass a filter through the retriever
        # configuration (see the sketch below); to filter independently of the
        # search, use the vector_store_manager's get method with a 'where' clause.
        # For large datasets, filtering first and then searching may not be
        # efficient, depending on the index structure.
        # ------------------------------------
        try:
            # The Langchain retriever abstracts the embedding step and the Chroma query.
            if filters:
                # Filters are not yet wired into the similarity search; this branch
                # currently logs them and falls back to unfiltered retrieval.
                # Adjust via the retriever's search_kwargs (see the sketch below),
                # or use vector_store_manager.get for direct filtered lookups.
                logger.debug(f"Retrieving documents with query '{query}' and filters: {filters}")
                retrieved_docs = self.langchain_retriever.invoke(query)  # Uses the configured search_kwargs (like k)
                logger.info(f"Retrieved {len(retrieved_docs)} documents for query.")
                return retrieved_docs
            else:
                # No filters applied; simple retrieval
                retrieved_docs = self.langchain_retriever.invoke(query)  # cite: query_pipeline.py
                logger.info(f"Retrieved {len(retrieved_docs)} documents for query.")
                return retrieved_docs

        except Exception as e:
            logger.error(f"Failed to retrieve documents for query '{query}': {e}")
            # Implement retry logic here if needed; for now, return an empty list
            return []
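A minimal sketch of pushing metadata filters into the similarity search itself, written as a method that could be added to RetrievalHandler. It assumes the installed langchain-chroma version accepts a `filter` entry in `search_kwargs`, which is worth verifying against its documentation:

# Hypothetical filtered retrieval: build a one-off retriever whose similarity
# search applies the metadata filter, instead of filtering after the fact.
def retrieve_with_filters(self, query: str, filters: Dict[str, Any]) -> List[Document]:
    filtered_retriever = self.vector_store_manager.as_retriever(
        search_kwargs={"k": TOP_K, "filter": filters}  # 'filter' support depends on the langchain-chroma version
    )
    return filtered_retriever.invoke(query)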
src/utils/__init__.py
ADDED
File without changes
src/utils/logging.py
ADDED
@@ -0,0 +1,9 @@

# src/utils/logging.py
import logging
from config.settings import LOG_LEVEL

def setup_logging():
    """Configures basic logging for the application."""
    logging.basicConfig(level=LOG_LEVEL, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    logging.getLogger("httpx").setLevel(logging.WARNING)  # Suppress chatty libraries
    logging.getLogger("httpcore").setLevel(logging.WARNING)  # Suppress chatty libraries
src/vector_store_manager/__init__.py
ADDED
File without changes
src/vector_store_manager/chroma_manager.py
ADDED
@@ -0,0 +1,182 @@

# src/vector_store_manager/chroma_manager.py
from langchain_chroma import Chroma  # cite: embed_pipeline.py, query_pipeline.py
from langchain.schema import Document  # cite: embed_pipeline.py
from config.settings import PERSIST_DIR, CHROMADB_COLLECTION_NAME  # cite: embed_pipeline.py, query_pipeline.py
from src.embedding_generator.embedder import EmbeddingGenerator
import logging
from typing import List, Dict, Any

logger = logging.getLogger(__name__)

class ChromaManager:
    """
    Manages interactions with the ChromaDB vector store.
    """
    def __init__(self, embedding_generator: EmbeddingGenerator):
        self.embedding_generator = embedding_generator
        # --- Financial Ministry Adaptation ---
        # TODO: Configure the Chroma client to use a scalable backend (e.g., ClickHouse)
        # instead of, or in addition to, persist_directory for production. This might
        # involve chromadb.HttpClient or specific backend configurations.
        # Handle connection errors and retries to the database backend.
        # Implement authentication/authorization for ChromaDB access.
        # ------------------------------------
        try:
            # Initialize Chroma with the embedding function and persistence settings.
            # For production, persist_directory might be replaced with client settings
            # pointing at a managed backend.
            self.vectordb = Chroma(
                persist_directory=PERSIST_DIR,  # cite: embed_pipeline.py, query_pipeline.py
                collection_name=CHROMADB_COLLECTION_NAME,  # cite: embed_pipeline.py, query_pipeline.py
                embedding_function=self.embedding_generator.embedder  # Use the Langchain embedder instance
            )
            logger.info(f"Initialized ChromaDB collection: '{CHROMADB_COLLECTION_NAME}' at '{PERSIST_DIR}'")
            # You might also want to check that the collection exists and is healthy.
        except Exception as e:
            logger.critical(f"Failed to initialize ChromaDB: {e}")
            raise e

    def add_documents(self, chunks: List[Document]):
        """
        Adds document chunks to the ChromaDB collection.

        Args:
            chunks: A list of Langchain Document chunks with metadata.
        """
        # --- Financial Ministry Adaptation ---
        # Implement error handling and retry logic for batch additions.
        # Consider transactional behavior if adding large batches requires it.
        # Log successful and failed additions.
        # Ensure document IDs are managed consistently (e.g., based on source plus
        # chunk index, or a stable hash; see the sketch below).
        # ------------------------------------
        try:
            # Langchain's add_documents handles embedding internally using the
            # provided embedding_function. Give chunks unique IDs if you need to
            # update or delete them later; if IDs are not supplied, Langchain/Chroma
            # generates them. For better control, generate IDs in document_processor
            # and pass them here.
            if not chunks:
                logger.warning("No chunks to add to ChromaDB.")
                return

            self.vectordb.add_documents(chunks)  # Langchain handles IDs if not provided
            logger.info(f"Added {len(chunks)} chunks to ChromaDB.")

        except Exception as e:
            logger.error(f"Failed to add documents to ChromaDB: {e}")
            # Implement retry logic or raise the exception

    def update_documents(self, ids: List[str], documents: List[str], metadatas: List[Dict[str, Any]]):
        """
        Updates documents in the ChromaDB collection by ID.

        Args:
            ids: List of document IDs to update.
            documents: List of new document content corresponding to IDs.
            metadatas: List of new metadata dictionaries corresponding to IDs.
        """
        # --- Financial Ministry Adaptation ---
        # Implement error handling and retry logic.
        # Validate that IDs exist before attempting to update.
        # ------------------------------------
        try:
            self.vectordb._collection.update(  # Accessing the underlying collection for update/delete
                ids=ids,
                documents=documents,
                metadatas=metadatas
            )
            logger.info(f"Updated documents with IDs: {ids}")
        except Exception as e:
            logger.error(f"Failed to update documents with IDs {ids}: {e}")
            raise e

    def delete_documents(self, ids: List[str] = None, where: Dict[str, Any] = None):
        """
        Deletes documents from the ChromaDB collection by ID or metadata filter.

        Args:
            ids: List of document IDs to delete.
            where: A dictionary for metadata filtering (e.g., {"source": "old_file.txt"}).
        """
        # --- Financial Ministry Adaptation ---
        # Implement error handling and retry logic.
        # Log which documents were deleted and why (if using a where filter).
        # ------------------------------------
        try:
            if ids:
                self.vectordb._collection.delete(ids=ids)  # Accessing the underlying collection
                logger.info(f"Deleted documents with IDs: {ids}")
            elif where:
                self.vectordb._collection.delete(where=where)  # Accessing the underlying collection
                logger.info(f"Deleted documents matching metadata filter: {where}")
            else:
                logger.warning("Delete called without specifying ids or a where filter.")
        except Exception as e:
            logger.error(f"Failed to delete documents (ids: {ids}, where: {where}): {e}")
            raise e

    def get_documents(self, ids: List[str] = None, where: Dict[str, Any] = None,
                      where_document: Dict[str, Any] = None, limit: int = None,
                      offset: int = None, include: List[str] = None) -> Dict[str, List[Any]]:
        """
        Retrieves documents and their details from the ChromaDB collection.

        Args:
            ids: List of document IDs to retrieve.
            where: Metadata filter.
            where_document: Document content filter.
            limit: Maximum number of results.
            offset: Offset for pagination.
            include: List of fields to include (e.g., ['metadatas', 'documents']). IDs are always included.

        Returns:
            A dictionary containing the retrieved data (ids, documents, metadatas, etc.).
        """
        # --- Financial Ministry Adaptation ---
        # Implement error handling and retry logic.
        # Ensure sensitive metadata is handled appropriately if retrieved.
        # ------------------------------------
        try:
            # Default include to metadatas and documents if not specified
            if include is None:
                include = ['metadatas', 'documents']  # Default as per the Chroma docs

            results = self.vectordb._collection.get(  # Accessing the underlying collection
                ids=ids,
                where=where,
                where_document=where_document,
                limit=limit,
                offset=offset,
                include=include
            )
            logger.debug(f"Retrieved {len(results.get('ids', []))} documents from ChromaDB.")
            return results
        except Exception as e:
            logger.error(f"Failed to retrieve documents from ChromaDB: {e}")
            raise e

    def as_retriever(self, search_kwargs: Dict[str, Any] = None):
        """
        Returns a Langchain Retriever instance for the Chroma collection.

        Args:
            search_kwargs: Arguments for the retriever (e.g., {"k": 5}).

        Returns:
            A Langchain Retriever.
        """
        # --- Financial Ministry Adaptation ---
        # Consider adding default search_kwargs here if none are provided.
        # ------------------------------------
        if search_kwargs is None:
            search_kwargs = {}
        # Langchain's .as_retriever method automatically uses the embedding_function
        # provided during Chroma initialization.
        return self.vectordb.as_retriever(search_kwargs=search_kwargs)  # cite: query_pipeline.py
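For the stable-ID note in `add_documents`, a minimal sketch that derives deterministic chunk IDs from the source path and chunk index, so re-ingesting the same file yields the same IDs; the `make_chunk_ids` helper and hashing scheme are illustrative:

# Hypothetical stable chunk IDs: hash of source path plus chunk index, enabling
# later update_documents/delete_documents calls to target the same records.
import hashlib

def make_chunk_ids(chunks: List[Document]) -> List[str]:
    ids = []
    for i, chunk in enumerate(chunks):
        source = chunk.metadata.get("source", "unknown")
        digest = hashlib.sha256(f"{source}:{i}".encode("utf-8")).hexdigest()[:32]
        ids.append(digest)
    return ids

# Usage inside add_documents:
#     self.vectordb.add_documents(chunks, ids=make_chunk_ids(chunks))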
system_architecture.md
ADDED
@@ -0,0 +1,16 @@

sequenceDiagram
    participant User as "User"
    participant API as "Query Service API"
    participant RetrievalHandler as "Retrieval Handler"
    participant EmbeddingGenerator as "Embedding Generator"
    participant ChromaDB as "ChromaDB (Vector Store)"
    participant LLMIntegrator as "LLM Integrator"

    User->>API: Sends query request
    API->>RetrievalHandler: Retrieve relevant documents
    RetrievalHandler->>EmbeddingGenerator: Generate embeddings for query
    EmbeddingGenerator->>ChromaDB: Query vector store
    ChromaDB->>RetrievalHandler: Return relevant documents
    RetrievalHandler->>LLMIntegrator: Generate response using LLM
    LLMIntegrator->>API: Return final response
    API->>User: Return final response
system_architecture.png
ADDED
(binary image, stored with Git LFS)