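"""
Document ingestion pipeline.

Loads raw .txt documents, splits them into chunks, transliterates each
chunk, writes the transliterated text to disk, embeds the chunks, and
stores the embeddings with their metadata in the vector store.
"""
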
import uuid
from pathlib import Path

from src.utils.config import DOC_INPUT_DIR, TRANS_OUTPUT_DIR
from src.utils.data_utils import chunk_text
from src.db.vector_store import VectorStore
from src.modelling.embed import DalaEmbedder
from src.modelling.transliterate import DalaTransliterator


def load_documents(input_dir: Path) -> list[tuple[str, str]]:
    """
    Loads all .txt documents from input_dir. Returns a list of
    (filename, content) tuples.
    """
    docs = []
    for file in input_dir.glob("*.txt"):
        with open(file, "r", encoding="utf-8") as f:
            text = f.read()
        docs.append((file.stem, text))
    return docs


def process_documents() -> None:
    """
    Main processing procedure: chunk, transliterate, embed, and index
    every document found in DOC_INPUT_DIR.
    """
    # Pipeline components
    transliterator = DalaTransliterator()
    embedder = DalaEmbedder()
    vector_store = VectorStore()
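
    # Ensure the transliteration output directory exists before any writes
    # (assumes TRANS_OUTPUT_DIR is a pathlib.Path and may not be created elsewhere).
    TRANS_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)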

    docs = load_documents(DOC_INPUT_DIR)

    all_chunks = []
    all_transliterated = []
    all_metadata = []

    for doc_id, text in docs:
        # Chunk the data
        chunks = chunk_text(text)
        all_chunks.extend(chunks)

        # Transliterate chunks
        translit_chunks = transliterator.batch_transliterate(chunks)
        all_transliterated.extend(translit_chunks)

        # Save transliterated version
        output_path = TRANS_OUTPUT_DIR / f"{doc_id}_transliterated.txt"
        with open(output_path, "w", encoding="utf-8") as f:
            f.write("\n\n".join(translit_chunks))

        # Create metadata entries
        for i, chunk in enumerate(translit_chunks):
            meta = {
                "id": f"{doc_id}_{i}_{uuid.uuid4().hex[:6]}",
                "text": chunk,
            }
            all_metadata.append(meta)
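
    # all_transliterated and all_metadata were built in the same order, so the
    # i-th embedding returned by embed_batch lines up with the i-th metadata entry.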
    # Embed all chunks
    embeddings = embedder.embed_batch(all_transliterated)

    # Add to vector DB
    vector_store.add(embeddings, all_metadata)

    print(f"[INFO] Successfully ingested {len(all_chunks)} chunks.")


if __name__ == "__main__":
    process_documents()