# Script: embed every document in DOCS_FOLDER into a persistent Chroma
# vector store using Ollama embeddings.
# (Removed pasted UI artifacts: "Spaces:" / "Running" status lines.)
import os
from glob import glob

from dotenv import load_dotenv
from langchain.schema import Document
from langchain_chroma import Chroma
from langchain_community.document_loaders import TextLoader
from langchain_ollama import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Populate os.environ from a local .env file BEFORE reading config below.
load_dotenv()

# --- CONFIG ---
DOCS_FOLDER = "docs/"                     # folder with .txt, .md, etc.
OLLAMA_URL = os.getenv("OLLAMA_SERVER")   # NOTE(review): None if unset in .env -- confirm server URL is provided
EMBED_MODEL = "nomic-embed-text:latest"   # Ollama embedding model tag
PERSIST_DIR = "chroma_db/"                # on-disk Chroma store
CHUNK_SIZE = 2000                         # max characters per chunk
CHUNK_OVERLAP = 10                        # characters shared between adjacent chunks
# --------------
def embed_all_docs():
    """Load every file in DOCS_FOLDER, split it into chunks, embed the
    chunks with the configured Ollama model, and persist them to the
    on-disk Chroma collection ``my_docs``.

    Files that cannot be decoded even with encoding autodetection are
    re-read leniently (undecodable bytes dropped) rather than skipped.

    Raises:
        Any exception from the embedding server or Chroma; only
        ``UnicodeDecodeError`` during loading is handled internally.
    """
    # The splitter is loop-invariant -- build it once, not per file.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
    )

    all_chunks = []
    # sorted() makes ingestion order deterministic across runs.
    for path in sorted(glob(os.path.join(DOCS_FOLDER, "*.*"))):
        try:
            # 1) Try UTF-8 first; let TextLoader autodetect on failure.
            loader = TextLoader(
                path,
                encoding="utf-8",
                autodetect_encoding=True,
            )
            docs = loader.load()
        except UnicodeDecodeError:
            # 2) Last resort: lenient read that drops undecodable bytes.
            print(f"⚠️ Decoding error on {path}, falling back to ignore-errors mode")
            with open(path, "r", encoding="utf-8", errors="ignore") as f:
                text = f.read()
            docs = [Document(page_content=text, metadata={"source": path})]

        # 3) Split this file's documents into overlapping chunks.
        chunks = splitter.split_documents(docs)
        print(f"✅ {len(chunks)} chunks from {os.path.basename(path)}")
        all_chunks.extend(chunks)

    if not all_chunks:
        # Nothing to embed -- skip the pointless (possibly failing) insert.
        print(f"⚠️ No documents found in '{DOCS_FOLDER}'; nothing to persist")
        return

    # 4) Embed all chunks and persist them in the on-disk Chroma store.
    embedder = OllamaEmbeddings(base_url=OLLAMA_URL, model=EMBED_MODEL)
    vectordb = Chroma(
        embedding_function=embedder,
        persist_directory=PERSIST_DIR,
        collection_name="my_docs",
    )
    vectordb.add_documents(all_chunks)
    print(f"✅ Persisted {len(all_chunks)} chunks to '{PERSIST_DIR}'")
if __name__ == "__main__":
    # Script entry point: ingest, chunk, embed, and persist all docs.
    embed_all_docs()