broadfield-dev committed
Commit 9376ac0 · verified · 1 parent: 04eac3c

Update build_rag.py

Files changed (1):
  1. build_rag.py +24 -13
build_rag.py CHANGED
@@ -1,7 +1,10 @@
+# build_rag.py (Updated with Normalization and Cosine Distance)
+
 import json
 import os
 import pandas as pd
 import torch
+import torch.nn.functional as F  # Import the functional module
 from transformers import AutoTokenizer, AutoModel
 import chromadb
 import sys
@@ -18,6 +21,7 @@ STATUS_FILE = "build_status.log"
 JSON_DIRECTORY = 'bible_json'
 CHUNK_SIZE = 3
 EMBEDDING_BATCH_SIZE = 16
+# (BOOK_ID_TO_NAME dictionary remains the same)
 BOOK_ID_TO_NAME = {
     1: "Genesis", 2: "Exodus", 3: "Leviticus", 4: "Numbers", 5: "Deuteronomy",
     6: "Joshua", 7: "Judges", 8: "Ruth", 9: "1 Samuel", 10: "2 Samuel",
@@ -36,13 +40,12 @@ BOOK_ID_TO_NAME = {
 }
 
 def update_status(message):
-    """Writes a new status to the log file."""
-    print(message) # Also print to Space logs
+    print(message)
     with open(STATUS_FILE, "w") as f:
         f.write(message)
 
 def process_bible_json_files(directory_path: str, chunk_size: int) -> pd.DataFrame:
-    # (This function's internal logic is unchanged)
+    # (This function is unchanged)
     all_verses = []
     if not os.path.exists(directory_path) or not os.listdir(directory_path):
         raise FileNotFoundError(f"Directory '{directory_path}' is empty or does not exist.")
@@ -72,7 +75,6 @@ def process_bible_json_files(directory_path: str, chunk_size: int) -> pd.DataFra
     return pd.DataFrame(all_chunks)
 
 def main():
-    """Main build process."""
     update_status("IN_PROGRESS: Step 1/5 - Processing JSON files...")
     bible_chunks_df = process_bible_json_files(JSON_DIRECTORY, chunk_size=CHUNK_SIZE)
 
@@ -81,29 +83,38 @@ def main():
         import shutil
         shutil.rmtree(CHROMA_PATH)
     client = chromadb.PersistentClient(path=CHROMA_PATH)
-    collection = client.create_collection(name=COLLECTION_NAME)
+
+    # *** FIX 1: SET THE DISTANCE FUNCTION FOR THE COLLECTION ***
+    collection = client.create_collection(
+        name=COLLECTION_NAME,
+        metadata={"hnsw:space": "cosine"}  # Use cosine distance
+    )
 
     update_status(f"IN_PROGRESS: Step 3/5 - Loading embedding model '{MODEL_NAME}'...")
     tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
     model = AutoModel.from_pretrained(MODEL_NAME, device_map="auto")
 
-    update_status("IN_PROGRESS: Step 4/5 - Generating embeddings and populating database...")
-    total_chunks = len(bible_chunks_df)
-    for i in tqdm(range(0, total_chunks, EMBEDDING_BATCH_SIZE), desc="Embedding Chunks"):
+    update_status("IN_PROGRESS: Step 4/5 - Generating and NORMALIZING embeddings...")
+    for i in tqdm(range(0, len(bible_chunks_df), EMBEDDING_BATCH_SIZE), desc="Embedding Chunks"):
         batch_df = bible_chunks_df.iloc[i:i+EMBEDDING_BATCH_SIZE]
         texts = batch_df['text'].tolist()
+
         inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512).to(model.device)
         with torch.no_grad():
             outputs = model(**inputs)
-        embeddings = outputs.last_hidden_state.mean(dim=1).cpu().tolist()
+
+        # *** FIX 2: NORMALIZE THE EMBEDDINGS ***
+        embeddings = F.normalize(outputs.last_hidden_state.mean(dim=1), p=2, dim=1)
+
         collection.add(
             ids=[str(j) for j in range(i, i + len(batch_df))],
-            embeddings=embeddings,
+            embeddings=embeddings.cpu().tolist(),  # Convert to list after normalization
             documents=texts,
             metadatas=batch_df[['reference', 'version']].to_dict('records')
         )
 
     update_status(f"IN_PROGRESS: Step 5/5 - Pushing database to Hugging Face Hub '{DATASET_REPO}'...")
+    # (This part is unchanged)
     create_repo(repo_id=DATASET_REPO, repo_type="dataset", exist_ok=True)
     api = HfApi()
     api.upload_folder(
@@ -118,10 +129,10 @@ if __name__ == "__main__":
     try:
         main()
     except Exception as e:
+        # (Error handling is unchanged)
         error_message = traceback.format_exc()
-        # Be specific about token errors
         if "401" in str(e) or "Unauthorized" in str(e):
-            update_status("FAILED: Hugging Face authentication error. Please ensure your HF_TOKEN secret is set correctly and has WRITE permissions.")
+            update_status("FAILED: Hugging Face authentication error. Ensure your HF_TOKEN secret has WRITE permissions.")
         else:
-            update_status(f"FAILED: An unexpected error occurred. Check Space logs for details. Error: {e}")
+            update_status(f"FAILED: An unexpected error occurred. Check Space logs. Error: {e}")
         print(error_message, file=sys.stderr)
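One caveat about the pooling in Fix 2, which this commit does not address: because the tokenizer pads each batch, outputs.last_hidden_state.mean(dim=1) averages over padding positions too, so a chunk's embedding can shift slightly depending on which texts share its batch. A common remedy is an attention-mask-weighted mean; the helper below is a hypothetical sketch, not code from this repository:

import torch
import torch.nn.functional as F

def masked_mean_pool(last_hidden_state: torch.Tensor,
                     attention_mask: torch.Tensor) -> torch.Tensor:
    # Hypothetical helper (not in this commit): zero out pad positions,
    # then divide by the count of real tokens so padding no longer
    # dilutes the average, and L2-normalize as in Fix 2.
    mask = attention_mask.unsqueeze(-1).type_as(last_hidden_state)  # (B, T, 1)
    summed = (last_hidden_state * mask).sum(dim=1)                  # (B, H)
    counts = mask.sum(dim=1).clamp(min=1e-9)                        # (B, 1)
    return F.normalize(summed / counts, p=2, dim=1)

# Possible drop-in replacement for the F.normalize(...) line in the loop:
#   embeddings = masked_mean_pool(outputs.last_hidden_state, inputs["attention_mask"])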
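On the consumer side, the cosine space only pays off if queries are embedded with the same mean-pool-then-normalize recipe before calling collection.query. A minimal query-side sketch under that assumption follows; it is not part of this commit, and the constant values shown are placeholders that must match the real MODEL_NAME, CHROMA_PATH, and COLLECTION_NAME defined at the top of build_rag.py, outside these hunks:

# query_rag.py -- hypothetical companion sketch, not part of this commit.
# The three constants below are placeholders; use the values from build_rag.py.
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
import chromadb

MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"  # placeholder
CHROMA_PATH = "chroma_db"                              # placeholder
COLLECTION_NAME = "bible"                              # placeholder

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

def embed_query(text: str) -> list:
    # Same recipe as the build script: mean-pool the last hidden state,
    # then L2-normalize so cosine distances are meaningful.
    inputs = tokenizer([text], return_tensors="pt", padding=True,
                       truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    pooled = outputs.last_hidden_state.mean(dim=1)
    return F.normalize(pooled, p=2, dim=1)[0].tolist()

client = chromadb.PersistentClient(path=CHROMA_PATH)
collection = client.get_collection(name=COLLECTION_NAME)
results = collection.query(
    query_embeddings=[embed_query("love your neighbor")],
    n_results=5,
)
for meta, dist in zip(results["metadatas"][0], results["distances"][0]):
    print(meta["reference"], meta["version"], round(dist, 4))

Since ChromaDB persists the hnsw:space setting with the collection, get_collection reuses it automatically; the returned distances are cosine distances in [0, 2], with 0 meaning the query and chunk point in the same direction.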