Spaces:

Teapack1
/

RAG-Retrieve-Ingest-cz-eng

Runtime error

App Files Community

Teapack1 committed on Jul 1

Commit

cd7b78b

verified ·

1 Parent(s): 05055d0

Update ingest.py

Browse files

Files changed (1) hide show

ingest.py +19 -75

ingest.py CHANGED Viewed

@@ -1,44 +1,24 @@
-# ingest.py
-"""
-Create FAISS indices for Czech and English PDFs.
-  Default (matches backend/main.py):
-      • English embeddings : sentence-transformers/all-MiniLM-L6-v2  (384-d)
-      • Czech   embeddings : Seznam/retromae-small-cs               (768-d)
-  If you still need a legacy English store with OpenAI
-  `text-embedding-3-large` (3 072-d), instantiate with
-      use_openai_embeddings=True   and pass OPENAI_API_KEY.
-"""
 from pathlib import Path
 from typing import List
 from langchain_community.vectorstores import FAISS
-from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain.document_loaders import DirectoryLoader, PyPDFLoader
-# ← updated import (fixes deprecation warning) ----------------------[2][3]
 from langchain_huggingface.embeddings import HuggingFaceEmbeddings
-from langchain.embeddings import OpenAIEmbeddings
 class Ingest:
-    # --------------------------------------------------------------------- #
     def __init__(
         self,
         *,
-        # names must stay exactly like in backend/main.py
         english_embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2",
         czech_embedding_model:   str = "Seznam/retromae-small-cs",
-        # optional OpenAI path
         use_openai_embeddings:   bool = False,
-        openai_api_key:          str | None = None,
         openai_embedding_model:  str = "text-embedding-3-large",
-        # chunking
         chunk: int = 512,
         overlap: int = 256,
-        # folders
         english_store: str = "stores/english_512",
         czech_store:   str = "stores/czech_512",
         data_english:  str = "data/english",
@@ -46,37 +26,34 @@ class Ingest:
     ):
         self.english_embedding_model = english_embedding_model
         self.czech_embedding_model   = czech_embedding_model
         self.use_openai_embeddings   = use_openai_embeddings
-        self.openai_api_key          = openai_api_key
         self.openai_embedding_model  = openai_embedding_model
         self.chunk   = chunk
         self.overlap = overlap
         self.english_store = Path(english_store)
         self.czech_store   = Path(czech_store)
         self.data_english  = Path(data_english)
         self.data_czech    = Path(data_czech)
-    # --------------------------- helpers ---------------------------------- #
     @staticmethod
     def _load(folder: Path):
         return DirectoryLoader(
             str(folder),
             recursive=True,
             loader_cls=PyPDFLoader,
-            show_progress=True,
             use_multithreading=True,
         ).load()
     @staticmethod
     def _split(docs: List, chunk: int, overlap: int):
-        return RecursiveCharacterTextSplitter(
-            chunk_size=chunk, chunk_overlap=overlap
-        ).split_documents(docs)
-    # --------------------------- English ---------------------------------- #
     def ingest_english(self):
         if self.use_openai_embeddings:
             if not self.openai_api_key:
@@ -85,60 +62,27 @@ class Ingest:
                 openai_api_key=self.openai_api_key,
                 model=self.openai_embedding_model,
             )
-            mode = f"OpenAI ({self.openai_embedding_model}) 3 072-d"
         else:
             embed = HuggingFaceEmbeddings(
                 model_name=self.english_embedding_model,
                 model_kwargs={"device": "cpu"},
                 encode_kwargs={"normalize_embeddings": False},
             )
-            dim  = embed.client.get_sentence_embedding_dimension()
-            mode = f"HuggingFace ({self.english_embedding_model}) {dim}-d"
-        print(f"\n── Building English index with {mode}")
         texts = self._split(self._load(self.data_english), self.chunk, self.overlap)
         FAISS.from_documents(texts, embed).save_local(str(self.english_store))
-        print("✓ English store saved to", self.english_store, "\n")
-    # --------------------------- Czech ------------------------------------ #
     def ingest_czech(self):
         embed = HuggingFaceEmbeddings(
             model_name=self.czech_embedding_model,
             model_kwargs={"device": "cpu"},
             encode_kwargs={"normalize_embeddings": False},
         )
-        dim = embed.client.get_sentence_embedding_dimension()
-        print(f"\n── Building Czech index with HuggingFace "
-              f"({self.czech_embedding_model}) {dim}-d")
         texts = self._split(self._load(self.data_czech), self.chunk, self.overlap)
         FAISS.from_documents(texts, embed).save_local(str(self.czech_store))
-        print("✓ Czech store saved to", self.czech_store, "\n")
-# ───────────── CLI helper (optional) ───────────── #
-if __name__ == "__main__":
-    """
-    Examples
-    --------
-    python ingest.py                 # builds both stores (OSS embeddings)
-    OPENAI_API_KEY=sk-... \
-    python ingest.py --openai en     # rebuild English with OpenAI encoder
-    """
-    import argparse, os
-    p = argparse.ArgumentParser()
-    p.add_argument("--openai", action="store_true",
-                   help="Use OpenAI embeddings for English store.")
-    p.add_argument("lang", nargs="?", choices=["en", "cz"],
-                   help="Only ingest this language.")
-    args = p.parse_args()
-    ing = Ingest(
-        use_openai_embeddings=args.openai,
-        openai_api_key=os.getenv("OPENAI_API_KEY"),
-    )
-    if args.lang in (None, "en"):
-        ing.ingest_english()
-    if args.lang in (None, "cz"):
-        ing.ingest_czech()

+# ingest.py  – works with LangChain v0.2+
 from pathlib import Path
 from typing import List
 from langchain_community.vectorstores import FAISS
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader
 from langchain_huggingface.embeddings import HuggingFaceEmbeddings
+from langchain_openai import OpenAIEmbeddings      # optional
 class Ingest:
     def __init__(
         self,
         *,
         english_embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2",
         czech_embedding_model:   str = "Seznam/retromae-small-cs",
         use_openai_embeddings:   bool = False,
         openai_embedding_model:  str = "text-embedding-3-large",
+        openai_api_key: str | None = None,
         chunk: int = 512,
         overlap: int = 256,
         english_store: str = "stores/english_512",
         czech_store:   str = "stores/czech_512",
         data_english:  str = "data/english",
     ):
         self.english_embedding_model = english_embedding_model
         self.czech_embedding_model   = czech_embedding_model
         self.use_openai_embeddings   = use_openai_embeddings
         self.openai_embedding_model  = openai_embedding_model
+        self.openai_api_key          = openai_api_key
         self.chunk   = chunk
         self.overlap = overlap
         self.english_store = Path(english_store)
         self.czech_store   = Path(czech_store)
         self.data_english  = Path(data_english)
         self.data_czech    = Path(data_czech)
+    # ------------------------------------------------------------------ utils
     @staticmethod
     def _load(folder: Path):
         return DirectoryLoader(
             str(folder),
             recursive=True,
             loader_cls=PyPDFLoader,
             use_multithreading=True,
+            show_progress=True,
         ).load()
     @staticmethod
     def _split(docs: List, chunk: int, overlap: int):
+        splitter = RecursiveCharacterTextSplitter(chunk_size=chunk,
+                                                  chunk_overlap=overlap)
+        return splitter.split_documents(docs)
+    # ------------------------------------------------------------------ ENG
     def ingest_english(self):
         if self.use_openai_embeddings:
             if not self.openai_api_key:
                 openai_api_key=self.openai_api_key,
                 model=self.openai_embedding_model,
             )
+            mode = f"OpenAI {self.openai_embedding_model}"
         else:
             embed = HuggingFaceEmbeddings(
                 model_name=self.english_embedding_model,
                 model_kwargs={"device": "cpu"},
                 encode_kwargs={"normalize_embeddings": False},
             )
+            mode = f"HuggingFace {self.english_embedding_model}"
+        print(f"• English ingest with {mode}")
         texts = self._split(self._load(self.data_english), self.chunk, self.overlap)
         FAISS.from_documents(texts, embed).save_local(str(self.english_store))
+        print("✓ English store saved to", self.english_store)
+    # ------------------------------------------------------------------ CZ
     def ingest_czech(self):
         embed = HuggingFaceEmbeddings(
             model_name=self.czech_embedding_model,
             model_kwargs={"device": "cpu"},
             encode_kwargs={"normalize_embeddings": False},
         )
+        print(f"• Czech ingest with {self.czech_embedding_model}")
         texts = self._split(self._load(self.data_czech), self.chunk, self.overlap)
         FAISS.from_documents(texts, embed).save_local(str(self.czech_store))
+        print("✓ Czech store saved to", self.czech_store)