Spaces:
Build error
Build error
XThomasBU
committed on
Commit
·
9a7da99
1
Parent(s):
7a98bd3
improvements
Browse files
- README.md +8 -2
- code/main.py +2 -9
- code/modules/__init__.py +0 -2
- code/modules/chat/__init__.py +0 -2
- code/modules/chat/llm_tutor.py +1 -1
- code/modules/config/__init__.py +0 -1
- code/modules/config/config.yml +1 -1
- code/modules/dataloader/__init__.py +0 -2
- code/modules/dataloader/data_loader.py +4 -1
- code/modules/retriever/__init__.py +0 -5
- code/modules/vectorstore/store_manager.py +16 -17
README.md
CHANGED
|
@@ -15,7 +15,13 @@ You can find an implementation of the Tutor at [DL4DS Tutor on Hugging Face](htt
|
|
| 15 |
- Add URLs in the `urls.txt` file.
|
| 16 |
- Add other PDF files in the `storage/data` directory.
|
| 17 |
|
| 18 |
-
3. **
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
```bash
|
| 20 |
cd code
|
| 21 |
python -m modules.vectorstore.store_manager
|
|
@@ -23,7 +29,7 @@ You can find an implementation of the Tutor at [DL4DS Tutor on Hugging Face](htt
|
|
| 23 |
- Note: You need to run the above command when you add new data to the `storage/data` directory, or if the `storage/data/urls.txt` file is updated.
|
| 24 |
- Alternatively, you can set `["vectorstore"]["embedd_files"]` to `True` in the `code/modules/config/config.yml` file, which will embed files from the storage directory every time you run the chainlit command below.
|
| 25 |
|
| 26 |
-
|
| 27 |
```bash
|
| 28 |
chainlit run main.py
|
| 29 |
```
|
|
|
|
| 15 |
- Add URLs in the `urls.txt` file.
|
| 16 |
- Add other PDF files in the `storage/data` directory.
|
| 17 |
|
| 18 |
+
3. **To test Data Loading (Optional)**
|
| 19 |
+
```bash
|
| 20 |
+
cd code
|
| 21 |
+
python -m modules.dataloader.data_loader
|
| 22 |
+
```
|
| 23 |
+
|
| 24 |
+
4. **Create the Vector Database**
|
| 25 |
```bash
|
| 26 |
cd code
|
| 27 |
python -m modules.vectorstore.store_manager
|
|
|
|
| 29 |
- Note: You need to run the above command when you add new data to the `storage/data` directory, or if the `storage/data/urls.txt` file is updated.
|
| 30 |
- Alternatively, you can set `["vectorstore"]["embedd_files"]` to `True` in the `code/modules/config/config.yml` file, which will embed files from the storage directory every time you run the chainlit command below.
|
| 31 |
|
| 32 |
+
5. **Run the Chainlit App**
|
| 33 |
```bash
|
| 34 |
chainlit run main.py
|
| 35 |
```
|
code/main.py
CHANGED
|
@@ -10,27 +10,20 @@ import yaml
|
|
| 10 |
import logging
|
| 11 |
from dotenv import load_dotenv
|
| 12 |
|
| 13 |
-
import os
|
| 14 |
-
import sys
|
| 15 |
-
|
| 16 |
-
# Add the 'code' directory to the Python path
|
| 17 |
-
current_dir = os.path.dirname(os.path.abspath(__file__))
|
| 18 |
-
sys.path.append(current_dir)
|
| 19 |
-
|
| 20 |
from modules.chat.llm_tutor import LLMTutor
|
| 21 |
from modules.config.constants import *
|
| 22 |
from modules.chat.helpers import get_sources
|
| 23 |
from modules.chat_processor.chat_processor import ChatProcessor
|
| 24 |
|
| 25 |
global logger
|
|
|
|
| 26 |
logger = logging.getLogger(__name__)
|
| 27 |
logger.setLevel(logging.INFO)
|
| 28 |
-
|
| 29 |
|
| 30 |
# Console Handler
|
| 31 |
console_handler = logging.StreamHandler()
|
| 32 |
console_handler.setLevel(logging.INFO)
|
| 33 |
-
formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
|
| 34 |
console_handler.setFormatter(formatter)
|
| 35 |
logger.addHandler(console_handler)
|
| 36 |
|
|
|
|
| 10 |
import logging
|
| 11 |
from dotenv import load_dotenv
|
| 12 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
from modules.chat.llm_tutor import LLMTutor
|
| 14 |
from modules.config.constants import *
|
| 15 |
from modules.chat.helpers import get_sources
|
| 16 |
from modules.chat_processor.chat_processor import ChatProcessor
|
| 17 |
|
| 18 |
global logger
|
| 19 |
+
# Initialize logger
|
| 20 |
logger = logging.getLogger(__name__)
|
| 21 |
logger.setLevel(logging.INFO)
|
| 22 |
+
formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
|
| 23 |
|
| 24 |
# Console Handler
|
| 25 |
console_handler = logging.StreamHandler()
|
| 26 |
console_handler.setLevel(logging.INFO)
|
|
|
|
| 27 |
console_handler.setFormatter(formatter)
|
| 28 |
logger.addHandler(console_handler)
|
| 29 |
|
code/modules/__init__.py
CHANGED
|
@@ -1,2 +0,0 @@
|
|
| 1 |
-
from . import vectorstore
|
| 2 |
-
from . import dataloader
|
|
|
|
|
|
|
|
|
code/modules/chat/__init__.py
CHANGED
|
@@ -1,2 +0,0 @@
|
|
| 1 |
-
from .llm_tutor import LLMTutor
|
| 2 |
-
from .chat_model_loader import ChatModelLoader
|
|
|
|
|
|
|
|
|
code/modules/chat/llm_tutor.py
CHANGED
|
@@ -10,7 +10,7 @@ from modules.chat.helpers import get_prompt
|
|
| 10 |
from modules.chat.chat_model_loader import ChatModelLoader
|
| 11 |
from modules.vectorstore.store_manager import VectorStoreManager
|
| 12 |
|
| 13 |
-
from modules.retriever import Retriever
|
| 14 |
|
| 15 |
from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union
|
| 16 |
from langchain_core.callbacks.manager import AsyncCallbackManagerForChainRun
|
|
|
|
| 10 |
from modules.chat.chat_model_loader import ChatModelLoader
|
| 11 |
from modules.vectorstore.store_manager import VectorStoreManager
|
| 12 |
|
| 13 |
+
from modules.retriever.retriever import Retriever
|
| 14 |
|
| 15 |
from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union
|
| 16 |
from langchain_core.callbacks.manager import AsyncCallbackManagerForChainRun
|
code/modules/config/__init__.py
CHANGED
|
@@ -1 +0,0 @@
|
|
| 1 |
-
from .constants import *
|
|
|
|
|
|
code/modules/config/config.yml
CHANGED
|
@@ -7,7 +7,7 @@ vectorstore:
|
|
| 7 |
data_path: '../storage/data' # str
|
| 8 |
url_file_path: '../storage/data/urls.txt' # str
|
| 9 |
expand_urls: True # bool
|
| 10 |
-
db_option : '
|
| 11 |
db_path : '../vectorstores' # str
|
| 12 |
model : 'sentence-transformers/all-MiniLM-L6-v2' # str [sentence-transformers/all-MiniLM-L6-v2, text-embedding-ada-002]
|
| 13 |
search_top_k : 3 # int
|
|
|
|
| 7 |
data_path: '../storage/data' # str
|
| 8 |
url_file_path: '../storage/data/urls.txt' # str
|
| 9 |
expand_urls: True # bool
|
| 10 |
+
db_option : 'FAISS' # str [FAISS, Chroma, RAGatouille, RAPTOR]
|
| 11 |
db_path : '../vectorstores' # str
|
| 12 |
model : 'sentence-transformers/all-MiniLM-L6-v2' # str [sentence-transformers/all-MiniLM-L6-v2, text-embedding-ada-002]
|
| 13 |
search_top_k : 3 # int
|
code/modules/dataloader/__init__.py
CHANGED
|
@@ -1,2 +0,0 @@
|
|
| 1 |
-
from .webpage_crawler import WebpageCrawler
|
| 2 |
-
from .data_loader import DataLoader
|
|
|
|
|
|
|
|
|
code/modules/dataloader/data_loader.py
CHANGED
|
@@ -345,7 +345,7 @@ if __name__ == "__main__":
|
|
| 345 |
logger = logging.getLogger(__name__)
|
| 346 |
logger.setLevel(logging.INFO)
|
| 347 |
|
| 348 |
-
with open("../code/config.yml", "r") as f:
|
| 349 |
config = yaml.safe_load(f)
|
| 350 |
|
| 351 |
data_loader = DataLoader(config, logger=logger)
|
|
@@ -355,3 +355,6 @@ if __name__ == "__main__":
|
|
| 355 |
["https://dl4ds.github.io/sp2024/"],
|
| 356 |
)
|
| 357 |
)
|
|
|
|
|
|
|
|
|
|
|
|
| 345 |
logger = logging.getLogger(__name__)
|
| 346 |
logger.setLevel(logging.INFO)
|
| 347 |
|
| 348 |
+
with open("../code/modules/config/config.yml", "r") as f:
|
| 349 |
config = yaml.safe_load(f)
|
| 350 |
|
| 351 |
data_loader = DataLoader(config, logger=logger)
|
|
|
|
| 355 |
["https://dl4ds.github.io/sp2024/"],
|
| 356 |
)
|
| 357 |
)
|
| 358 |
+
|
| 359 |
+
print(document_names)
|
| 360 |
+
print(len(document_chunks))
|
code/modules/retriever/__init__.py
CHANGED
|
@@ -1,5 +0,0 @@
|
|
| 1 |
-
from .faiss_retriever import FaissRetriever
|
| 2 |
-
from .chroma_retriever import ChromaRetriever
|
| 3 |
-
from .colbert_retriever import ColbertRetriever
|
| 4 |
-
from .raptor_retriever import RaptorRetriever
|
| 5 |
-
from .retriever import Retriever
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
code/modules/vectorstore/store_manager.py
CHANGED
|
@@ -16,37 +16,36 @@ class VectorStoreManager:
|
|
| 16 |
self.document_names = None
|
| 17 |
|
| 18 |
# Set up logging to both console and a file
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
|
| 24 |
# Console Handler
|
| 25 |
console_handler = logging.StreamHandler()
|
| 26 |
console_handler.setLevel(logging.INFO)
|
| 27 |
-
formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
|
| 28 |
console_handler.setFormatter(formatter)
|
| 29 |
-
|
| 30 |
|
| 31 |
# Ensure log directory exists
|
| 32 |
log_directory = self.config["log_dir"]
|
| 33 |
-
|
| 34 |
-
os.makedirs(log_directory)
|
| 35 |
|
| 36 |
# File Handler
|
| 37 |
-
log_file_path =
|
| 38 |
file_handler = logging.FileHandler(log_file_path, mode="w")
|
| 39 |
file_handler.setLevel(logging.INFO)
|
| 40 |
file_handler.setFormatter(formatter)
|
| 41 |
-
|
| 42 |
-
else:
|
| 43 |
-
self.logger = logger
|
| 44 |
-
|
| 45 |
-
self.webpage_crawler = WebpageCrawler()
|
| 46 |
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
self.logger.info("VectorDB instance instantiated")
|
| 50 |
|
| 51 |
def load_files(self):
|
| 52 |
|
|
|
|
| 16 |
self.document_names = None
|
| 17 |
|
| 18 |
# Set up logging to both console and a file
|
| 19 |
+
self.logger = logger or self._setup_logging()
|
| 20 |
+
self.webpage_crawler = WebpageCrawler()
|
| 21 |
+
self.vector_db = VectorStore(self.config)
|
| 22 |
+
|
| 23 |
+
self.logger.info("VectorDB instance instantiated")
|
| 24 |
+
|
| 25 |
+
def _setup_logging(self):
|
| 26 |
+
logger = logging.getLogger(__name__)
|
| 27 |
+
if not logger.hasHandlers():
|
| 28 |
+
logger.setLevel(logging.INFO)
|
| 29 |
+
formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
|
| 30 |
|
| 31 |
# Console Handler
|
| 32 |
console_handler = logging.StreamHandler()
|
| 33 |
console_handler.setLevel(logging.INFO)
|
|
|
|
| 34 |
console_handler.setFormatter(formatter)
|
| 35 |
+
logger.addHandler(console_handler)
|
| 36 |
|
| 37 |
# Ensure log directory exists
|
| 38 |
log_directory = self.config["log_dir"]
|
| 39 |
+
os.makedirs(log_directory, exist_ok=True)
|
|
|
|
| 40 |
|
| 41 |
# File Handler
|
| 42 |
+
log_file_path = os.path.join(log_directory, "vector_db.log")
|
| 43 |
file_handler = logging.FileHandler(log_file_path, mode="w")
|
| 44 |
file_handler.setLevel(logging.INFO)
|
| 45 |
file_handler.setFormatter(formatter)
|
| 46 |
+
logger.addHandler(file_handler)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
|
| 48 |
+
return logger
|
|
|
|
|
|
|
| 49 |
|
| 50 |
def load_files(self):
|
| 51 |
|