import os
from glob import glob

from dotenv import load_dotenv
from langchain_chroma import Chroma
from langchain_community.document_loaders import TextLoader
from langchain_core.documents import Document
from langchain_ollama import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Load environment variables (e.g. OLLAMA_SERVER) from a local .env file
load_dotenv()
# ─── CONFIG ───
DOCS_FOLDER = "docs/" # folder with .txt, .md, etc.
OLLAMA_URL = os.getenv("OLLAMA_SERVER")
EMBED_MODEL = "nomic-embed-text:latest"
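# NOTE: this model must already be available on the Ollama server,
# e.g. pulled beforehand with `ollama pull nomic-embed-text`.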
PERSIST_DIR = "chroma_db/" # on-disk Chroma store
CHUNK_SIZE = 2000
CHUNK_OVERLAP = 10
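# NOTE: the splitter below counts characters (its default length function
# is len), so this yields ~2000-character chunks with only 10 characters
# of overlap between consecutive chunks.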
# ──────────
def embed_all_docs():
    all_chunks = []
    files = glob(os.path.join(DOCS_FOLDER, "*.*"))
    for path in files:
        try:
            # 1) Try loading with UTF-8 + autodetect fallback
            loader = TextLoader(
                path,
                encoding="utf-8",
                autodetect_encoding=True
            )
            docs = loader.load()
        except UnicodeDecodeError:
            # 2) If that still fails, fall back to a lenient read
            print(f"⚠️ Decoding error on {path}, falling back to ignore-errors mode")
            with open(path, "r", encoding="utf-8", errors="ignore") as f:
                text = f.read()
            docs = [Document(page_content=text, metadata={"source": path})]
        # 3) Split this file's documents into chunks
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=CHUNK_SIZE,
            chunk_overlap=CHUNK_OVERLAP
        )
        chunks = splitter.split_documents(docs)
        print(f"✅ {len(chunks)} chunks from {os.path.basename(path)}")
        all_chunks.extend(chunks)

    # 4) Embed & persist to the on-disk Chroma store
    embedder = OllamaEmbeddings(base_url=OLLAMA_URL, model=EMBED_MODEL)
    vectordb = Chroma(
        embedding_function=embedder,
        persist_directory=PERSIST_DIR,
        collection_name="my_docs"
    )
    vectordb.add_documents(all_chunks)
    print(f"✅ Persisted {len(all_chunks)} chunks to '{PERSIST_DIR}'")
if __name__ == "__main__":
    embed_all_docs()
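
# A minimal follow-up sketch (not part of the original script): how the
# persisted store could be queried later. It assumes the same OLLAMA_URL,
# EMBED_MODEL, PERSIST_DIR, and collection_name as above; the query string
# and k are placeholders.
#
#   embedder = OllamaEmbeddings(base_url=OLLAMA_URL, model=EMBED_MODEL)
#   vectordb = Chroma(
#       embedding_function=embedder,
#       persist_directory=PERSIST_DIR,
#       collection_name="my_docs",
#   )
#   for doc in vectordb.similarity_search("example query", k=4):
#       print(doc.metadata["source"], doc.page_content[:120])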