Commit aa622c0
Parent(s): 1eaf3d8

added download link + dataset from hf

Files changed:
- app.py (+30 -13)
- requirements.txt (+2 -1)
- scripts/config.py (+65 -0)
- scripts/rag_engine.py (+74 -11)
app.py
CHANGED
@@ -26,11 +26,28 @@ def initialize_system():
     query_engine = None
 
     # IMPORTANT: Setup LLM settings at the very beginning
-    from scripts.config import setup_llm_settings
+    from scripts.config import setup_llm_settings, download_pretrained_files
     setup_llm_settings()
 
-    #
-
+    # Check if local RAG system exists
+    local_rag_exists = os.path.exists(os.path.join(RAG_FILES_DIR, 'faiss_index.index'))
+    local_csv_exists = os.path.exists(PROCESSED_DATA_FILE)
+
+    # If no local system exists, try to download from HuggingFace
+    if not local_rag_exists and not local_csv_exists:
+        print("No local RAG system found. Attempting to download from HuggingFace...")
+        download_success = download_pretrained_files()
+
+        if download_success:
+            print("✅ Downloaded pre-trained files from HuggingFace Hub")
+            # Update existence flags after download
+            local_rag_exists = os.path.exists(os.path.join(RAG_FILES_DIR, 'faiss_index.index'))
+            local_csv_exists = os.path.exists(PROCESSED_DATA_FILE)
+        else:
+            print("⚠️ Failed to download pre-trained files. System will start empty.")
+
+    # Try to load existing RAG system
+    if local_rag_exists:
         try:
             print("Found existing RAG system files, loading...")
             query_engine = load_rag_system()
@@ -49,23 +66,23 @@
                 print(f"Could not count documents: {e}")
                 chunk_count = "неизвестно"
 
-            return f"AIEXP система инициализирована с {chunk_count} фрагментами нормативных документов (загружена из индекса)"
+            return f"✅ AIEXP система инициализирована с {chunk_count} фрагментами нормативных документов (загружена из индекса)"
         except Exception as e:
             print(f"Не удалось загрузить сохраненную систему: {str(e)}")
 
-    # If no
-    if
+    # If no RAG system but CSV exists, build from CSV
+    if local_csv_exists and query_engine is None:
         try:
-            print("
+            print("Building RAG system from CSV file...")
             processed_chunks_df = load_processed_chunks(PROCESSED_DATA_FILE)
 
-            #
+            # Check for required columns
             required_columns = {'document_id', 'file_link', 'chunk_text', 'chunk_id'}
             missing_columns = required_columns - set(processed_chunks_df.columns)
             if missing_columns:
-                return f"Ошибка при инициализации из CSV: отсутствуют необходимые столбцы: {missing_columns}"
+                return f"❌ Ошибка при инициализации из CSV: отсутствуют необходимые столбцы: {missing_columns}"
 
-            #
+            # Fill missing optional columns
             if 'txt_file_id' not in processed_chunks_df.columns:
                 processed_chunks_df['txt_file_id'] = processed_chunks_df['document_id']
             if 'section' not in processed_chunks_df.columns:
@@ -79,11 +96,11 @@
             if processed_chunks:
                 print(f"Building RAG system with {len(processed_chunks)} chunks...")
                 query_engine = build_rag_system(processed_chunks)
-                return f"AIEXP система инициализирована с {len(processed_chunks)} фрагментами нормативных документов (построена из CSV)"
+                return f"✅ AIEXP система инициализирована с {len(processed_chunks)} фрагментами нормативных документов (построена из CSV)"
         except Exception as e:
-            return f"Ошибка при инициализации из CSV: {str(e)}"
+            return f"❌ Ошибка при инициализации из CSV: {str(e)}"
 
-    return "AIEXP система готова к работе. Загрузите нормативные документы для создания базы знаний."
+    return "🔄 AIEXP система готова к работе. Загрузите нормативные документы для создания базы знаний."
 
 def get_uploaded_files_info():
     if not os.path.exists(UPLOAD_FOLDER):
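For orientation, the decision order that the edited initialize_system() now follows can be condensed as below. This is a sketch, not the app's code: it assumes RAG_FILES_DIR, PROCESSED_DATA_FILE and download_pretrained_files are exported by scripts.config and load_rag_system by scripts.rag_engine, as the diff implies.

import os

from scripts.config import RAG_FILES_DIR, PROCESSED_DATA_FILE, download_pretrained_files
from scripts.rag_engine import load_rag_system


def bootstrap_query_engine():
    """Condensed view of the startup decision in initialize_system()."""
    index_path = os.path.join(RAG_FILES_DIR, "faiss_index.index")

    # 1. Nothing on disk yet -> try to pull the pre-built artifacts from the Hub.
    if not os.path.exists(index_path) and not os.path.exists(PROCESSED_DATA_FILE):
        download_pretrained_files()

    # 2. A persisted FAISS index is present -> reuse it (fast path).
    if os.path.exists(index_path):
        return load_rag_system()

    # 3. Otherwise initialize_system() falls back to rebuilding from
    #    processed_chunks.csv, or starts empty and waits for uploads.
    return None

Note that the Hub download is only attempted when neither the FAISS index nor the processed CSV exists locally, so startup never overwrites existing local state.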
requirements.txt
CHANGED
@@ -8,4 +8,5 @@ PyPDF2
 python-docx
 openpyxl
 sentence-transformers
-google-generativeai
+google-generativeai
+huggingface_hub
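huggingface_hub is the only genuinely new dependency; it provides hf_hub_download, which scripts/config.py uses below. A quick standalone check that the dataset repo is reachable might look like this (repo id and target directory are taken from the config changes; the token is only needed if the repo is private):

import os
from huggingface_hub import hf_hub_download

# Fetch a single artifact from the dataset repo referenced in scripts/config.py.
path = hf_hub_download(
    repo_id="MrSimple01/AIEXP_RAG_FILES",   # HF_REPO_ID added in config.py
    filename="processed_chunks.csv",
    repo_type="dataset",
    local_dir="processed_data",             # where config.py also places the CSV
    token=os.getenv("HF_TOKEN"),            # None is fine for a public repo
)
print("downloaded to", path)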
scripts/config.py
CHANGED
@@ -5,6 +5,8 @@ from llama_index.llms.google_genai import GoogleGenAI
 from llama_index.core import Settings
 from llama_index.core.llms import ChatMessage, MessageRole
 import os
+from huggingface_hub import hf_hub_download
+
 
 # Configuration
 EMBEDDING_MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
@@ -17,6 +19,8 @@ UPLOAD_FOLDER = "UPLOADED_DOCUMENTS"
 INDEX_STATE_FILE = "processed_data/index_store.json"
 
 GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY', "AIzaSyDemsCp7JIdRNDRyP6DkYdMox1DLZwPcPE")
+HF_REPO_ID = "MrSimple01/AIEXP_RAG_FILES"
+HF_TOKEN = os.getenv('HF_TOKEN')
 LLM_MODEL = "gemini-2.0-flash"
 
 CHUNK_SIZE = 1024
@@ -25,6 +29,67 @@ MAX_CHUNK_SIZE = 2048
 MIN_CHUNK_SIZE = 750
 SIMILARITY_THRESHOLD = 0.7
 
+
+def download_pretrained_files():
+    """Download pre-trained RAG files from HuggingFace Hub"""
+    try:
+        print("Downloading pre-trained RAG files from HuggingFace Hub...")
+
+        # Files to download
+        files_to_download = [
+            "faiss_index.index",
+            "processed_chunks.csv",
+            "chunk_metadata.pkl",
+            "config.pkl",
+            "documents.pkl",
+            "default__vector_store.json",
+            "docstore.json",
+            "index_store.json"
+        ]
+
+        # Ensure RAG_FILES_DIR exists
+        os.makedirs(RAG_FILES_DIR, exist_ok=True)
+        os.makedirs("processed_data", exist_ok=True)
+
+        downloaded_files = {}
+
+        for filename in files_to_download:
+            try:
+                print(f"Downloading {filename}...")
+
+                # Download to RAG_FILES_DIR for most files, processed_data for CSV
+                target_dir = "processed_data" if filename == "processed_chunks.csv" else RAG_FILES_DIR
+
+                file_path = hf_hub_download(
+                    repo_id=HF_REPO_ID,
+                    filename=filename,
+                    local_dir=target_dir,
+                    repo_type="dataset",
+                    token=HF_TOKEN
+                )
+
+                downloaded_files[filename] = file_path
+                print(f"✓ Downloaded {filename}")
+
+            except Exception as e:
+                print(f"✗ Failed to download {filename}: {e}")
+                continue
+
+        # Verify critical files
+        critical_files = ["faiss_index.index", "processed_chunks.csv"]
+        missing_critical = [f for f in critical_files if f not in downloaded_files]
+
+        if missing_critical:
+            print(f"❌ Missing critical files: {missing_critical}")
+            return False
+
+        print(f"✅ Successfully downloaded {len(downloaded_files)}/{len(files_to_download)} files")
+        return True
+
+    except Exception as e:
+        print(f"❌ Failed to download pre-trained files: {e}")
+        return False
+
 def setup_llm_settings():
     """Setup embedding and LLM models"""
     # Configure Google API
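download_pretrained_files fetches the artifacts one at a time, so a single missing file only logs a warning, and then verifies that the two critical files arrived. An alternative, not used by this commit, is a single snapshot_download call; a minimal sketch, assuming RAG_FILES_DIR resolves to a local "RAG_FILES" directory:

from huggingface_hub import snapshot_download

# Pull all matching artifacts from the dataset repo in one call.
local_dir = snapshot_download(
    repo_id="MrSimple01/AIEXP_RAG_FILES",
    repo_type="dataset",
    local_dir="RAG_FILES",                  # assumed value of RAG_FILES_DIR
    allow_patterns=["*.index", "*.pkl", "*.json", "*.csv"],
)
print("snapshot stored in", local_dir)

The per-file loop in the commit has one advantage over a snapshot: it routes processed_chunks.csv into processed_data/ while everything else lands in RAG_FILES_DIR, which a one-shot snapshot cannot do without moving files afterwards.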
scripts/rag_engine.py
CHANGED
@@ -171,32 +171,95 @@ def save_rag_system(index, faiss_index, documents):
         pickle.dump(config, f)
 
 def load_rag_system():
-
+    """Load RAG system with better error handling and file verification"""
+    required_files = [
+        'faiss_index.index',
+        'default__vector_store.json',
+        'docstore.json',
+        'index_store.json'
+    ]
+
+    # Check if all required files exist
+    missing_files = []
+    for file in required_files:
+        if not os.path.exists(os.path.join(RAG_FILES_DIR, file)):
+            missing_files.append(file)
+
+    if missing_files:
+        print(f"Missing RAG system files: {missing_files}")
         return None
 
     try:
         setup_llm_settings()
 
+        # Load FAISS index
         faiss_index = faiss.read_index(os.path.join(RAG_FILES_DIR, 'faiss_index.index'))
         vector_store = FaissVectorStore(faiss_index=faiss_index)
-        storage_context = StorageContext.from_defaults(vector_store=vector_store)
-
-        embed_model = HuggingFaceEmbedding(model_name=EMBEDDING_MODEL)
-
-
-
-
-        for doc in documents:
-            index.insert(doc)
 
+        # Load storage context from persisted files
+        storage_context = StorageContext.from_defaults(
+            vector_store=vector_store,
+            persist_dir=RAG_FILES_DIR
+        )
+
+        # Create index from storage context
+        index = VectorStoreIndex.from_documents(
+            [],
+            storage_context=storage_context,
+            embed_model=Settings.embed_model
+        )
 
+        # Verify the index loaded correctly
+        print(f"✅ RAG system loaded with {faiss_index.ntotal} vectors")
 
         query_engine = create_query_engine(index)
         return query_engine
 
     except Exception as e:
-        print(f"Error loading RAG system: {str(e)}")
+        print(f"❌ Error loading RAG system: {str(e)}")
         return None
+
+def save_rag_system(index, faiss_index, documents):
+    """Enhanced save function with verification"""
+    try:
+        os.makedirs(RAG_FILES_DIR, exist_ok=True)
+
+        # Save FAISS index
+        faiss.write_index(faiss_index, os.path.join(RAG_FILES_DIR, 'faiss_index.index'))
+
+        # Persist storage context (saves docstore.json, index_store.json, default__vector_store.json)
+        index.storage_context.persist(persist_dir=RAG_FILES_DIR)
+
+        # Save documents pickle (for compatibility)
+        with open(os.path.join(RAG_FILES_DIR, 'documents.pkl'), 'wb') as f:
+            pickle.dump(documents, f)
+
+        # Save metadata pickle (for compatibility)
+        metadata_dict = {}
+        for doc in documents:
+            metadata_dict[doc.id_] = doc.metadata
+
+        with open(os.path.join(RAG_FILES_DIR, 'chunk_metadata.pkl'), 'wb') as f:
+            pickle.dump(metadata_dict, f)
+
+        # Save config
+        config = {
+            'embed_model_name': EMBEDDING_MODEL,
+            'vector_dim': 384,
+            'total_documents': len(documents),
+            'index_type': 'faiss_flat_ip'
+        }
+
+        with open(os.path.join(RAG_FILES_DIR, 'config.pkl'), 'wb') as f:
+            pickle.dump(config, f)
+
+        print(f"✅ RAG system saved successfully with {len(documents)} documents")
+
+    except Exception as e:
+        print(f"❌ Error saving RAG system: {str(e)}")
+        raise
+
+
 def build_rag_system(processed_chunks):
     setup_llm_settings()
 
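A small smoke test for the new load path, assuming the artifacts are already on disk (downloaded from the Hub or produced by save_rag_system); the query string is only an example:

from scripts.rag_engine import load_rag_system

# load_rag_system() returns None when required files are missing or loading fails.
query_engine = load_rag_system()
if query_engine is None:
    print("RAG files missing or unreadable; rebuild from processed_chunks.csv")
else:
    response = query_engine.query("Какие требования установлены к оформлению документа?")
    print(response)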