MrSimple07 committed
Commit aa622c0 · 1 Parent(s): 1eaf3d8

added download link + dataset from hf

Files changed (4)
  1. app.py +30 -13
  2. requirements.txt +2 -1
  3. scripts/config.py +65 -0
  4. scripts/rag_engine.py +74 -11
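
The pre-trained index and chunk files that this commit starts pulling in live in the MrSimple01/AIEXP_RAG_FILES dataset repo on the Hugging Face Hub. A minimal stand-alone sketch of fetching the whole dataset up front (assuming only that huggingface_hub is installed; the local directory name is illustrative, and a token is only needed if the repo is private):

from huggingface_hub import snapshot_download

# Pull every file from the dataset repo into a local folder in one call.
local_path = snapshot_download(
    repo_id="MrSimple01/AIEXP_RAG_FILES",
    repo_type="dataset",
    local_dir="rag_files",  # illustrative path, not the app's RAG_FILES_DIR
)
print(f"Dataset files downloaded to {local_path}")

The app itself downloads file by file via hf_hub_download (see scripts/config.py below), which lets it continue when an optional file is missing.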
app.py CHANGED
@@ -26,11 +26,28 @@ def initialize_system():
     query_engine = None
 
     # IMPORTANT: Setup LLM settings at the very beginning
-    from scripts.config import setup_llm_settings
+    from scripts.config import setup_llm_settings, download_pretrained_files
     setup_llm_settings()
 
-    # Rest of your existing code...
-    if os.path.exists(os.path.join(RAG_FILES_DIR, 'faiss_index.index')):
+    # Check if local RAG system exists
+    local_rag_exists = os.path.exists(os.path.join(RAG_FILES_DIR, 'faiss_index.index'))
+    local_csv_exists = os.path.exists(PROCESSED_DATA_FILE)
+
+    # If no local system exists, try to download from HuggingFace
+    if not local_rag_exists and not local_csv_exists:
+        print("No local RAG system found. Attempting to download from HuggingFace...")
+        download_success = download_pretrained_files()
+
+        if download_success:
+            print("✅ Downloaded pre-trained files from HuggingFace Hub")
+            # Update existence flags after download
+            local_rag_exists = os.path.exists(os.path.join(RAG_FILES_DIR, 'faiss_index.index'))
+            local_csv_exists = os.path.exists(PROCESSED_DATA_FILE)
+        else:
+            print("⚠️ Failed to download pre-trained files. System will start empty.")
+
+    # Try to load existing RAG system
+    if local_rag_exists:
         try:
             print("Found existing RAG system files, loading...")
             query_engine = load_rag_system()
@@ -49,23 +66,23 @@ def initialize_system():
                 print(f"Could not count documents: {e}")
                 chunk_count = "неизвестно"
 
-            return f"AIEXP система инициализирована с {chunk_count} фрагментами нормативных документов (загружена из сохраненного индекса)"
+            return f"AIEXP система инициализирована с {chunk_count} фрагментами нормативных документов (загружена из индекса)"
         except Exception as e:
             print(f"Не удалось загрузить сохраненную систему: {str(e)}")
 
-    # If no existing RAG system, try to load from CSV
-    if os.path.exists(PROCESSED_DATA_FILE):
+    # If no RAG system but CSV exists, build from CSV
+    if local_csv_exists and query_engine is None:
         try:
-            print("Loading from CSV file...")
+            print("Building RAG system from CSV file...")
             processed_chunks_df = load_processed_chunks(PROCESSED_DATA_FILE)
 
-            # Fix: Check for required columns with correct names from your CSV
+            # Check for required columns
             required_columns = {'document_id', 'file_link', 'chunk_text', 'chunk_id'}
             missing_columns = required_columns - set(processed_chunks_df.columns)
             if missing_columns:
-                return f"Ошибка при инициализации из CSV: отсутствуют необходимые столбцы: {missing_columns}"
+                return f"Ошибка при инициализации из CSV: отсутствуют необходимые столбцы: {missing_columns}"
 
-            # Fix: Fill missing optional columns
+            # Fill missing optional columns
             if 'txt_file_id' not in processed_chunks_df.columns:
                 processed_chunks_df['txt_file_id'] = processed_chunks_df['document_id']
             if 'section' not in processed_chunks_df.columns:
@@ -79,11 +96,11 @@ def initialize_system():
             if processed_chunks:
                 print(f"Building RAG system with {len(processed_chunks)} chunks...")
                 query_engine = build_rag_system(processed_chunks)
-                return f"AIEXP система инициализирована с {len(processed_chunks)} фрагментами нормативных документов (построена из CSV)"
+                return f"AIEXP система инициализирована с {len(processed_chunks)} фрагментами нормативных документов (построена из CSV)"
         except Exception as e:
-            return f"Ошибка при инициализации из CSV: {str(e)}"
+            return f"Ошибка при инициализации из CSV: {str(e)}"
 
-    return "AIEXP система готова к работе. Загрузите нормативные документы для создания базы знаний."
+    return "🔄 AIEXP система готова к работе. Загрузите нормативные документы для создания базы знаний."
 
 def get_uploaded_files_info():
     if not os.path.exists(UPLOAD_FOLDER):
requirements.txt CHANGED
@@ -8,4 +8,5 @@ PyPDF2
 python-docx
 openpyxl
 sentence-transformers
-google-generativeai
+google-generativeai
+huggingface_hub
scripts/config.py CHANGED
@@ -5,6 +5,8 @@ from llama_index.llms.google_genai import GoogleGenAI
 from llama_index.core import Settings
 from llama_index.core.llms import ChatMessage, MessageRole
 import os
+from huggingface_hub import hf_hub_download
+
 
 # Configuration
 EMBEDDING_MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
@@ -17,6 +19,8 @@ UPLOAD_FOLDER = "UPLOADED_DOCUMENTS"
 INDEX_STATE_FILE = "processed_data/index_store.json"
 
 GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY', "AIzaSyDemsCp7JIdRNDRyP6DkYdMox1DLZwPcPE")
+HF_REPO_ID = "MrSimple01/AIEXP_RAG_FILES"
+HF_TOKEN = os.getenv('HF_TOKEN')
 LLM_MODEL = "gemini-2.0-flash"
 
 CHUNK_SIZE = 1024
@@ -25,6 +29,67 @@ MAX_CHUNK_SIZE = 2048
 MIN_CHUNK_SIZE = 750
 SIMILARITY_THRESHOLD = 0.7
 
+
+def download_pretrained_files():
+    """Download pre-trained RAG files from HuggingFace Hub"""
+    try:
+        print("Downloading pre-trained RAG files from HuggingFace Hub...")
+
+        # Files to download
+        files_to_download = [
+            "faiss_index.index",
+            "processed_chunks.csv",
+            "chunk_metadata.pkl",
+            "config.pkl",
+            "documents.pkl",
+            "default__vector_store.json",
+            "docstore.json",
+            "index_store.json"
+        ]
+
+        # Ensure RAG_FILES_DIR exists
+        os.makedirs(RAG_FILES_DIR, exist_ok=True)
+        os.makedirs("processed_data", exist_ok=True)
+
+        downloaded_files = {}
+
+        for filename in files_to_download:
+            try:
+                print(f"Downloading {filename}...")
+
+                # Download to RAG_FILES_DIR for most files, processed_data for CSV
+                target_dir = "processed_data" if filename == "processed_chunks.csv" else RAG_FILES_DIR
+
+                file_path = hf_hub_download(
+                    repo_id=HF_REPO_ID,
+                    filename=filename,
+                    local_dir=target_dir,
+                    repo_type="dataset",
+                    token=HF_TOKEN
+                )
+
+                downloaded_files[filename] = file_path
+                print(f"✓ Downloaded {filename}")
+
+            except Exception as e:
+                print(f"✗ Failed to download {filename}: {e}")
+                continue
+
+        # Verify critical files
+        critical_files = ["faiss_index.index", "processed_chunks.csv"]
+        missing_critical = [f for f in critical_files if f not in downloaded_files]
+
+        if missing_critical:
+            print(f"❌ Missing critical files: {missing_critical}")
+            return False
+
+        print(f"✅ Successfully downloaded {len(downloaded_files)}/{len(files_to_download)} files")
+        return True
+
+    except Exception as e:
+        print(f"❌ Failed to download pre-trained files: {e}")
+        return False
+
 def setup_llm_settings():
     """Setup embedding and LLM models"""
     # Configure Google API
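
A quick way to exercise the new helper on its own — a sketch, assuming scripts/config.py is importable from the project root and the Hub repo is reachable (HF_TOKEN can stay unset for a public repo):

from scripts.config import download_pretrained_files

# Returns True only if the critical files (faiss_index.index, processed_chunks.csv) arrived.
if download_pretrained_files():
    print("Pre-trained RAG files are in place; the app can load them on startup.")
else:
    print("Download failed; the app will start with an empty knowledge base.")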
scripts/rag_engine.py CHANGED
@@ -171,32 +171,95 @@ def save_rag_system(index, faiss_index, documents):
         pickle.dump(config, f)
 
 def load_rag_system():
-    if not os.path.exists(os.path.join(RAG_FILES_DIR, 'faiss_index.index')):
+    """Load RAG system with better error handling and file verification"""
+    required_files = [
+        'faiss_index.index',
+        'default__vector_store.json',
+        'docstore.json',
+        'index_store.json'
+    ]
+
+    # Check if all required files exist
+    missing_files = []
+    for file in required_files:
+        if not os.path.exists(os.path.join(RAG_FILES_DIR, file)):
+            missing_files.append(file)
+
+    if missing_files:
+        print(f"Missing RAG system files: {missing_files}")
         return None
 
     try:
         setup_llm_settings()
 
+        # Load FAISS index
         faiss_index = faiss.read_index(os.path.join(RAG_FILES_DIR, 'faiss_index.index'))
         vector_store = FaissVectorStore(faiss_index=faiss_index)
-        storage_context = StorageContext.from_defaults(vector_store=vector_store)
-
-        embed_model = HuggingFaceEmbedding(model_name=EMBEDDING_MODEL)
 
-        index = VectorStoreIndex.from_documents([], storage_context=storage_context, embed_model=embed_model)
+        # Load storage context from persisted files
+        storage_context = StorageContext.from_defaults(
+            vector_store=vector_store,
+            persist_dir=RAG_FILES_DIR
+        )
+
+        # Create index from storage context
+        index = VectorStoreIndex.from_documents(
+            [],
+            storage_context=storage_context,
+            embed_model=Settings.embed_model
+        )
 
-        with open(os.path.join(RAG_FILES_DIR, 'documents.pkl'), 'rb') as f:
-            documents = pickle.load(f)
-
-        for doc in documents:
-            index.insert(doc)
+        # Verify the index loaded correctly
+        print(f"✅ RAG system loaded with {faiss_index.ntotal} vectors")
 
         query_engine = create_query_engine(index)
         return query_engine
 
     except Exception as e:
-        print(f"Error loading RAG system: {str(e)}")
+        print(f"Error loading RAG system: {str(e)}")
         return None
+
+def save_rag_system(index, faiss_index, documents):
+    """Enhanced save function with verification"""
+    try:
+        os.makedirs(RAG_FILES_DIR, exist_ok=True)
+
+        # Save FAISS index
+        faiss.write_index(faiss_index, os.path.join(RAG_FILES_DIR, 'faiss_index.index'))
+
+        # Persist storage context (saves docstore.json, index_store.json, default__vector_store.json)
+        index.storage_context.persist(persist_dir=RAG_FILES_DIR)
+
+        # Save documents pickle (for compatibility)
+        with open(os.path.join(RAG_FILES_DIR, 'documents.pkl'), 'wb') as f:
+            pickle.dump(documents, f)
+
+        # Save metadata pickle (for compatibility)
+        metadata_dict = {}
+        for doc in documents:
+            metadata_dict[doc.id_] = doc.metadata
+
+        with open(os.path.join(RAG_FILES_DIR, 'chunk_metadata.pkl'), 'wb') as f:
+            pickle.dump(metadata_dict, f)
+
+        # Save config
+        config = {
+            'embed_model_name': EMBEDDING_MODEL,
+            'vector_dim': 384,
+            'total_documents': len(documents),
+            'index_type': 'faiss_flat_ip'
+        }
+
+        with open(os.path.join(RAG_FILES_DIR, 'config.pkl'), 'wb') as f:
+            pickle.dump(config, f)
+
+        print(f"✅ RAG system saved successfully with {len(documents)} documents")
+
+    except Exception as e:
+        print(f"❌ Error saving RAG system: {str(e)}")
+        raise
+
+
 def build_rag_system(processed_chunks):
     setup_llm_settings()
 
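
The reworked save/load pair leans on llama-index's storage persistence: index.storage_context.persist() writes docstore.json, index_store.json and the vector-store file that load_rag_system now checks for, and StorageContext.from_defaults(..., persist_dir=...) reads them back. A minimal round-trip sketch of that mechanism in isolation (assuming llama-index core plus its FAISS vector-store and HuggingFace embedding integrations are installed; the directory name and sample text are illustrative, and load_index_from_storage is the stock llama-index loader, shown here only to illustrate the persistence format, not the call made in this commit):

import faiss
from llama_index.core import (
    Document, Settings, StorageContext, VectorStoreIndex, load_index_from_storage
)
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.faiss import FaissVectorStore

# Same multilingual embedder as the app; its vectors are 384-dimensional.
Settings.embed_model = HuggingFaceEmbedding(
    model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
)

PERSIST_DIR = "rag_files_demo"  # illustrative, not the app's RAG_FILES_DIR

# Build a tiny FAISS-backed index and persist it to disk.
faiss_index = faiss.IndexFlatIP(384)
vector_store = FaissVectorStore(faiss_index=faiss_index)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_documents(
    [Document(text="sample regulatory document chunk")],
    storage_context=storage_context,
)
index.storage_context.persist(persist_dir=PERSIST_DIR)

# Reload the index from the persisted files.
vector_store = FaissVectorStore.from_persist_dir(PERSIST_DIR)
storage_context = StorageContext.from_defaults(
    vector_store=vector_store, persist_dir=PERSIST_DIR
)
reloaded_index = load_index_from_storage(storage_context)
print(f"Reloaded index with {faiss_index.ntotal} vector(s)")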