from dotenv import load_dotenv

# Load .env before reading any environment variables below.
load_dotenv()

import os
from glob import glob

from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaEmbeddings
from langchain_chroma import Chroma
from langchain.schema import Document

# ——— CONFIG ———
DOCS_FOLDER = "docs/"                    # folder with .txt, .md, etc.
OLLAMA_URL = os.getenv("OLLAMA_SERVER")  # NOTE(review): None if OLLAMA_SERVER unset — verify .env
EMBED_MODEL = "nomic-embed-text:latest"
PERSIST_DIR = "chroma_db/"               # on-disk Chroma store
CHUNK_SIZE = 2000
CHUNK_OVERLAP = 10
# ——————————


def embed_all_docs() -> None:
    """Load every file in DOCS_FOLDER, split it into chunks, embed the chunks
    via the Ollama server, and persist the vectors to an on-disk Chroma store.

    Side effects: reads files under DOCS_FOLDER, writes to PERSIST_DIR,
    prints progress to stdout. Returns nothing.
    """
    all_chunks: list[Document] = []
    # Non-recursive; only picks up files whose name contains a dot.
    files = glob(os.path.join(DOCS_FOLDER, "*.*"))

    # Hoisted out of the loop: the splitter is pure configuration and
    # identical for every file (the original rebuilt it per file).
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
    )

    for path in files:
        try:
            # 1) Try loading with UTF-8 + autodetect fallback
            loader = TextLoader(path, encoding="utf-8", autodetect_encoding=True)
            docs = loader.load()
        except UnicodeDecodeError:
            # 2) If that still fails, fallback to a lenient read
            print(f"⚠️ Decoding error on {path}, falling back to ignore-errors mode")
            with open(path, "r", encoding="utf-8", errors="ignore") as f:
                text = f.read()
            docs = [Document(page_content=text, metadata={"source": path})]

        # 3) Split into chunks
        chunks = splitter.split_documents(docs)
        print(f"→ {len(chunks)} chunks from {os.path.basename(path)}")
        all_chunks.extend(chunks)

    if not all_chunks:
        # Chroma rejects an empty add_documents batch — bail out early
        # instead of crashing when DOCS_FOLDER is empty or missing.
        print(f"⚠️ No chunks produced from '{DOCS_FOLDER}'; nothing to embed")
        return

    # 4) Embed & persist on-disk Chroma
    embedder = OllamaEmbeddings(base_url=OLLAMA_URL, model=EMBED_MODEL)
    vectordb = Chroma(
        embedding_function=embedder,
        persist_directory=PERSIST_DIR,
        collection_name="my_docs",
    )
    vectordb.add_documents(all_chunks)
    print(f"✅ Persisted {len(all_chunks)} chunks to '{PERSIST_DIR}'")


if __name__ == "__main__":
    embed_all_docs()