broadfield-dev committed on
Commit
04eac3c
·
verified ·
1 Parent(s): 1dde6a2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +47 -43
app.py CHANGED
@@ -1,6 +1,6 @@
1
  import sys
2
  import subprocess
3
- from flask import Flask, render_template, request, flash, redirect, url_for
4
  import torch
5
  from transformers import AutoTokenizer, AutoModel
6
  import os
@@ -19,6 +19,7 @@ CHROMA_PATH = "chroma_db"
19
  COLLECTION_NAME = "bible_verses"
20
  MODEL_NAME = "google/embeddinggemma-300m"
21
  DATASET_REPO = "broadfield-dev/bible-chromadb-gemma"
 
22
 
23
  # --- Global variables for resources ---
24
  chroma_collection = None
@@ -26,83 +27,88 @@ tokenizer = None
26
  embedding_model = None
27
 
28
  def load_resources():
29
- """
30
- Downloads the DB from the Hub if not present, then loads it and the model.
31
- """
32
  global chroma_collection, tokenizer, embedding_model
33
  if chroma_collection and embedding_model:
34
  return True
35
 
36
  print("Attempting to load resources...")
37
  try:
38
- # 1. Download the ChromaDB files from the Hugging Face Hub
39
- # This will only download if the folder doesn't already exist.
40
- print(f"Ensuring database is available locally from '{DATASET_REPO}'...")
41
- snapshot_download(
42
- repo_id=DATASET_REPO,
43
- repo_type="dataset",
44
- local_dir=CHROMA_PATH,
45
- local_dir_use_symlinks=False # Recommended for Spaces
46
- )
47
- print("Database files are present locally.")
48
-
49
- # 2. Initialize ChromaDB client from the downloaded files
50
  client = chromadb.PersistentClient(path=CHROMA_PATH)
51
  collection = client.get_collection(name=COLLECTION_NAME)
52
-
53
  if collection.count() == 0:
54
- print(f"Warning: Database collection is empty.")
55
  return False
56
 
57
  chroma_collection = collection
58
  print(f"Successfully connected to DB with {collection.count()} items.")
59
-
60
- # 3. Load the embedding model
61
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
62
  embedding_model = AutoModel.from_pretrained(MODEL_NAME)
63
  print(f"Embedding model '{MODEL_NAME}' loaded successfully.")
64
 
65
  return True
66
  except Exception as e:
67
- print(f"Could not load resources. The database may not be built yet.")
68
  print(f"Error: {e}")
69
  return False
70
 
71
- # Try to load resources on startup.
72
  resources_loaded = load_resources()
73
 
74
- # --- 3. Define App Routes (Unchanged from previous ChromaDB version) ---
 
75
  @app.route('/')
76
  def home():
77
- if not resources_loaded:
78
- flash(f"Welcome! Database not ready. Use the admin panel to build it.", "warning")
79
  return render_template('index.html')
80
 
81
  @app.route('/build-rag', methods=['POST'])
82
  def build_rag_route():
83
- print("Vector database build process requested.")
84
  try:
85
- process = subprocess.Popen(
86
- [sys.executable, "build_rag.py"],
87
- stdout=subprocess.PIPE,
88
- stderr=subprocess.STDOUT,
89
- text=True
90
- )
91
- print(f"Started build process with PID: {process.pid}")
92
- flash("Database build & push initiated! This can take several minutes. Check logs for progress. The app will be ready when it completes.", "info")
93
  except Exception as e:
94
- print(f"Failed to start build process: {e}")
95
- flash(f"An error occurred: {e}", "error")
96
- return redirect(url_for('home'))
97
-
 
 
 
 
 
 
 
 
 
 
 
 
98
  @app.route('/search', methods=['POST'])
99
  def search():
100
  global resources_loaded
101
  if not resources_loaded:
102
- print("Reloading resources for search...")
103
  resources_loaded = load_resources()
104
  if not resources_loaded:
105
- flash("Database not ready. Please wait for the build process to finish.", "error")
106
  return redirect(url_for('home'))
107
 
108
  user_query = request.form['query']
@@ -120,9 +126,7 @@ def search():
120
  )
121
 
122
  results_list = []
123
- documents = search_results['documents'][0]
124
- metadatas = search_results['metadatas'][0]
125
- distances = search_results['distances'][0]
126
 
127
  for i in range(len(documents)):
128
  results_list.append({
 
1
  import sys
2
  import subprocess
3
+ from flask import Flask, render_template, request, flash, redirect, url_for, jsonify
4
  import torch
5
  from transformers import AutoTokenizer, AutoModel
6
  import os
 
19
  COLLECTION_NAME = "bible_verses"
20
  MODEL_NAME = "google/embeddinggemma-300m"
21
  DATASET_REPO = "broadfield-dev/bible-chromadb-gemma"
22
+ STATUS_FILE = "build_status.log" # File to track build status
23
 
24
  # --- Global variables for resources ---
25
  chroma_collection = None
 
27
  embedding_model = None
28
 
29
  def load_resources():
30
+ """Downloads the DB from the Hub if not present, then loads it and the model."""
 
 
31
  global chroma_collection, tokenizer, embedding_model
32
  if chroma_collection and embedding_model:
33
  return True
34
 
35
  print("Attempting to load resources...")
36
  try:
37
+ if not os.path.exists(CHROMA_PATH) or not os.listdir(CHROMA_PATH):
38
+ print(f"Local DB not found. Downloading from '{DATASET_REPO}'...")
39
+ snapshot_download(
40
+ repo_id=DATASET_REPO,
41
+ repo_type="dataset",
42
+ local_dir=CHROMA_PATH,
43
+ local_dir_use_symlinks=False
44
+ )
45
+ print("Database files downloaded.")
46
+ else:
47
+ print("Local database files found.")
48
+
49
  client = chromadb.PersistentClient(path=CHROMA_PATH)
50
  collection = client.get_collection(name=COLLECTION_NAME)
 
51
  if collection.count() == 0:
52
+ print("Warning: Database collection is empty.")
53
  return False
54
 
55
  chroma_collection = collection
56
  print(f"Successfully connected to DB with {collection.count()} items.")
57
+
 
58
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
59
  embedding_model = AutoModel.from_pretrained(MODEL_NAME)
60
  print(f"Embedding model '{MODEL_NAME}' loaded successfully.")
61
 
62
  return True
63
  except Exception as e:
64
+ print(f"Could not load resources. The database may not be built yet or the repo is empty.")
65
  print(f"Error: {e}")
66
  return False
67
 
 
68
  resources_loaded = load_resources()
69
 
70
+ # --- 3. Define App Routes ---
71
+
72
  @app.route('/')
73
  def home():
 
 
74
  return render_template('index.html')
75
 
76
  @app.route('/build-rag', methods=['POST'])
77
  def build_rag_route():
78
+ """Triggers the build script and immediately responds."""
79
  try:
80
+ # Clear old status and set to "In Progress"
81
+ with open(STATUS_FILE, "w") as f:
82
+ f.write("IN_PROGRESS: Starting build process...")
83
+
84
+ # Start the build process in the background
85
+ subprocess.Popen([sys.executable, "build_rag.py"])
86
+
87
+ return jsonify({"status": "started"})
88
  except Exception as e:
89
+ with open(STATUS_FILE, "w") as f:
90
+ f.write(f"FAILED: Could not start process - {e}")
91
+ return jsonify({"status": "error", "message": str(e)}), 500
92
+
93
+ @app.route('/status')
94
+ def status():
95
+ """Endpoint for the frontend to poll for build status."""
96
+ if not os.path.exists(STATUS_FILE):
97
+ return jsonify({"status": "NOT_STARTED"})
98
+
99
+ with open(STATUS_FILE, "r") as f:
100
+ status_line = f.read().strip()
101
+
102
+ status_code, _, message = status_line.partition(': ')
103
+ return jsonify({"status": status_code, "message": message})
104
+
105
  @app.route('/search', methods=['POST'])
106
  def search():
107
  global resources_loaded
108
  if not resources_loaded:
 
109
  resources_loaded = load_resources()
110
  if not resources_loaded:
111
+ flash("Database not ready. Please wait for the build process to finish and then refresh the page.", "error")
112
  return redirect(url_for('home'))
113
 
114
  user_query = request.form['query']
 
126
  )
127
 
128
  results_list = []
129
+ documents, metadatas, distances = search_results['documents'][0], search_results['metadatas'][0], search_results['distances'][0]
 
 
130
 
131
  for i in range(len(documents)):
132
  results_list.append({