Upload 5 files
Browse files

- add_model_explanations.py +5 -1
- app.py +15 -9
- build_index.py +8 -3
- huggingface_model_descriptions.py +9 -5
add_model_explanations.py
CHANGED

```diff
@@ -9,7 +9,11 @@ from openai import OpenAI, APIError  # Add back OpenAI imports
 # Configure logging
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 
-
+# Define the base persistent storage path (must match other scripts)
+PERSISTENT_STORAGE_PATH = "/data"  # <-- ADJUST IF YOUR PATH IS DIFFERENT
+
+# Point to the JSON data within persistent storage
+MODEL_DATA_DIR = os.path.join(PERSISTENT_STORAGE_PATH, "model_data_json")
 EXPLANATION_KEY = "model_explanation_gemini"
 DESCRIPTION_KEY = "description"
 MAX_RETRIES = 3  # Retries for API calls
```
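All four scripts in this commit hard-code the same storage root, each flagged with its own "ADJUST IF YOUR PATH IS DIFFERENT" marker. A minimal sketch of one way to keep them in sync, assuming a shared module and a `PERSISTENT_STORAGE_ROOT` environment variable, neither of which is part of this commit:

```python
import os

# Hypothetical shared config (e.g. storage_config.py). The env var name
# PERSISTENT_STORAGE_ROOT is an assumption, not something this commit defines.
PERSISTENT_STORAGE_PATH = os.environ.get("PERSISTENT_STORAGE_ROOT", "/data")
MODEL_DATA_DIR = os.path.join(PERSISTENT_STORAGE_PATH, "model_data_json")

# Fail fast if the persistent volume is not mounted where expected.
if not os.path.isdir(PERSISTENT_STORAGE_PATH):
    raise RuntimeError(f"Persistent storage not found at {PERSISTENT_STORAGE_PATH}")
os.makedirs(MODEL_DATA_DIR, exist_ok=True)
```

Each script could then import `MODEL_DATA_DIR` from the shared module instead of repeating the constant.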
app.py
CHANGED

```diff
@@ -22,6 +22,9 @@ except ImportError:
 
 app = Flask(__name__)  # Create app object FIRST
 
+# Define the base persistent storage path (must match other scripts)
+PERSISTENT_STORAGE_PATH = "/data"  # <-- ADJUST IF YOUR PATH IS DIFFERENT
+
 # Configure Flask app logging (optional but recommended)
 # app.logger.setLevel(logging.INFO)
 
@@ -29,11 +32,12 @@ app = Flask(__name__)  # Create app object FIRST
 CORS(app, origins=["http://127.0.0.1:3000", "http://localhost:3000", "https://rag-huggingface.vercel.app"], supports_credentials=True)
 
 # --- Configuration ---
-
-
+# Point to index/map files in persistent storage
+INDEX_FILE = os.path.join(PERSISTENT_STORAGE_PATH, "index.faiss")
+MAP_FILE = os.path.join(PERSISTENT_STORAGE_PATH, "index_to_metadata.pkl")
 EMBEDDING_MODEL = 'all-mpnet-base-v2'
-#
-MODEL_DATA_DIR = os.path.join(
+# Point to model data JSON in persistent storage
+MODEL_DATA_DIR = os.path.join(PERSISTENT_STORAGE_PATH, "model_data_json")
 # ---
 
 # --- Global variables for resources ---
@@ -72,7 +76,8 @@ def load_resources():
         print("Sentence transformer model loaded successfully.")
 
         # Load FAISS Index
-        index_path = os.path.join(os.path.dirname(__file__), INDEX_FILE)
+        # index_path = os.path.join(os.path.dirname(__file__), INDEX_FILE)  # Old path
+        index_path = INDEX_FILE  # Use configured path
         print(f"Loading FAISS index from: {index_path}")
         if not os.path.exists(index_path):
             raise FileNotFoundError(f"FAISS index file not found at {index_path}")
@@ -81,7 +86,8 @@
         print("FAISS index loaded successfully.")
 
         # Load Index-to-Metadata Map
-        map_path = os.path.join(os.path.dirname(__file__), MAP_FILE)
+        # map_path = os.path.join(os.path.dirname(__file__), MAP_FILE)  # Old path
+        map_path = MAP_FILE  # Use configured path
         print(f"Loading index-to-Metadata map from: {map_path}")
         if not os.path.exists(map_path):
             raise FileNotFoundError(f"Metadata map file not found at {map_path}")
@@ -95,8 +101,8 @@
 
     except FileNotFoundError as fnf_error:
         print(f"Error: {fnf_error}")
-        print(f"Please ensure {INDEX_FILE} and {MAP_FILE} exist in the
-        print("You might need to run
+        print(f"Please ensure {os.path.basename(INDEX_FILE)} and {os.path.basename(MAP_FILE)} exist in the persistent storage directory ({PERSISTENT_STORAGE_PATH}).")
+        print("You might need to run the update process first or manually place initial files there.")
         RESOURCES_LOADED = False  # Keep as False
     except ImportError as import_error:
         print(f"Import Error loading resources: {import_error}")
@@ -235,7 +241,7 @@ def search():
         # --- Add description from model_data_json ---
         model_id = metadata.get('model_id')
         description = None
-        # Use the globally defined
+        # Use the globally defined MODEL_DATA_DIR pointing to persistent storage
         if model_id and MODEL_DATA_DIR:
             filename = model_id.replace('/', '_') + '.json'
             filepath = os.path.join(MODEL_DATA_DIR, filename)
```
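The net effect of the app.py hunks is that `load_resources()` reads both artifacts from absolute paths on the persistent volume instead of paths relative to `__file__` (which, on a rebuilt Space, points at the ephemeral app directory). A condensed sketch of that load path; the full function body is not shown in the diff, so the structure here is an approximation:

```python
import os
import pickle

import faiss  # faiss-cpu; read_index is the standard deserialization call

PERSISTENT_STORAGE_PATH = "/data"
INDEX_FILE = os.path.join(PERSISTENT_STORAGE_PATH, "index.faiss")
MAP_FILE = os.path.join(PERSISTENT_STORAGE_PATH, "index_to_metadata.pkl")

def load_resources_sketch():
    """Condensed form of the load path the diff configures."""
    if not os.path.exists(INDEX_FILE):
        raise FileNotFoundError(f"FAISS index file not found at {INDEX_FILE}")
    index = faiss.read_index(INDEX_FILE)

    if not os.path.exists(MAP_FILE):
        raise FileNotFoundError(f"Metadata map file not found at {MAP_FILE}")
    with open(MAP_FILE, "rb") as f:
        index_to_metadata = pickle.load(f)  # FAISS row id -> model metadata dict
    return index, index_to_metadata
```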
build_index.py
CHANGED

```diff
@@ -7,10 +7,15 @@ import pickle
 import json  # Import json module
 from tqdm import tqdm
 
+# Define the base persistent storage path (must match other scripts)
+PERSISTENT_STORAGE_PATH = "/data"  # <-- ADJUST IF YOUR PATH IS DIFFERENT
+
 # --- Configuration ---
-
-
-
+# Point to the JSON data within persistent storage
+MODEL_DATA_DIR = os.path.join(PERSISTENT_STORAGE_PATH, "model_data_json")
+# Save index and map to persistent storage
+INDEX_FILE = os.path.join(PERSISTENT_STORAGE_PATH, "index.faiss")
+MAP_FILE = os.path.join(PERSISTENT_STORAGE_PATH, "index_to_metadata.pkl")
 EMBEDDING_MODEL = 'all-mpnet-base-v2'  # Efficient and good quality model
 ENCODE_BATCH_SIZE = 32  # Process descriptions in smaller batches
 # Tags to exclude from indexing text
```
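build_index.py is the producer side of this contract: it must write `index.faiss` and `index_to_metadata.pkl` to exactly the paths app.py reads. A minimal sketch of that write step, assuming a flat inner-product index; the diff does not show which index type the script actually builds:

```python
import os
import pickle

import faiss
import numpy as np

PERSISTENT_STORAGE_PATH = "/data"
INDEX_FILE = os.path.join(PERSISTENT_STORAGE_PATH, "index.faiss")
MAP_FILE = os.path.join(PERSISTENT_STORAGE_PATH, "index_to_metadata.pkl")

def save_artifacts(embeddings: np.ndarray, metadata: list) -> None:
    """Persist the index and the row -> metadata map where app.py expects them."""
    index = faiss.IndexFlatIP(embeddings.shape[1])  # index type is an assumption
    index.add(embeddings.astype(np.float32))        # row i corresponds to metadata[i]
    faiss.write_index(index, INDEX_FILE)
    with open(MAP_FILE, "wb") as f:
        pickle.dump(dict(enumerate(metadata)), f)
```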
huggingface_model_descriptions.py
CHANGED

```diff
@@ -10,8 +10,11 @@ from requests.exceptions import RequestException
 from concurrent.futures import ThreadPoolExecutor, as_completed
 import pickle  # Add pickle for caching
 
-#
-
+# Define the base persistent storage path
+PERSISTENT_STORAGE_PATH = "/data"  # <-- ADJUST IF YOUR PATH IS DIFFERENT
+
+# Create a directory to store JSON data within persistent storage
+OUTPUT_DIR = os.path.join(PERSISTENT_STORAGE_PATH, "model_data_json")
 os.makedirs(OUTPUT_DIR, exist_ok=True)
 
 # Number of worker threads for parallel processing - REDUCED
@@ -41,7 +44,8 @@ def clean_readme_content(text):
     return text
 # ---
 
-
+# Use persistent storage for the cache file
+MODELS_CACHE_FILE = os.path.join(PERSISTENT_STORAGE_PATH, "models_list_cache.pkl")  # File to cache the raw model list
 
 def get_all_models_with_downloads(min_downloads=10000):
     """Fetch all models from Hugging Face with at least min_downloads, using a local cache for the list."""
@@ -154,9 +158,9 @@ def get_model_readme(model_id):
     return None
 
 def get_filename_for_model(model_id):
-    """Generate JSON filename for a model"""
+    """Generate JSON filename for a model (uses global OUTPUT_DIR)"""
     safe_id = model_id.replace("/", "_")
-    return os.path.join(OUTPUT_DIR, f"{safe_id}.json")  #
+    return os.path.join(OUTPUT_DIR, f"{safe_id}.json")  # OUTPUT_DIR is already correct path
 
 def save_model_data(model_id, data):
     """Save model data (description, tags, downloads) to a JSON file."""
```
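Relocating `MODELS_CACHE_FILE` onto the persistent volume means repeated runs can skip the expensive Hub listing. A sketch of the cache-or-fetch pattern the hunk implies; `fetch_fn` stands in for the actual Hub API call, which this diff does not show:

```python
import os
import pickle

PERSISTENT_STORAGE_PATH = "/data"
MODELS_CACHE_FILE = os.path.join(PERSISTENT_STORAGE_PATH, "models_list_cache.pkl")

def load_or_fetch_models(fetch_fn):
    """Return the cached raw model list if present; otherwise fetch and cache it."""
    if os.path.exists(MODELS_CACHE_FILE):
        with open(MODELS_CACHE_FILE, "rb") as f:
            return pickle.load(f)
    models = fetch_fn()  # hypothetical stand-in for the Hub listing call
    with open(MODELS_CACHE_FILE, "wb") as f:
        pickle.dump(models, f)
    return models
```

Note that `get_filename_for_model()` here and the search handler in app.py both derive filenames via `model_id.replace("/", "_")`, so the writer and reader agree on the per-model JSON names under `model_data_json/`.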