Commit aa622c0
Parent(s): 1eaf3d8

added download link + dataset from hf

Files changed:
- app.py (+30 -13)
- requirements.txt (+2 -1)
- scripts/config.py (+65 -0)
- scripts/rag_engine.py (+74 -11)
app.py
CHANGED
@@ -26,11 +26,28 @@ def initialize_system():
     query_engine = None
 
     # IMPORTANT: Setup LLM settings at the very beginning
-    from scripts.config import setup_llm_settings
+    from scripts.config import setup_llm_settings, download_pretrained_files
     setup_llm_settings()
 
-    #
-
+    # Check if local RAG system exists
+    local_rag_exists = os.path.exists(os.path.join(RAG_FILES_DIR, 'faiss_index.index'))
+    local_csv_exists = os.path.exists(PROCESSED_DATA_FILE)
+
+    # If no local system exists, try to download from HuggingFace
+    if not local_rag_exists and not local_csv_exists:
+        print("No local RAG system found. Attempting to download from HuggingFace...")
+        download_success = download_pretrained_files()
+
+        if download_success:
+            print("✅ Downloaded pre-trained files from HuggingFace Hub")
+            # Update existence flags after download
+            local_rag_exists = os.path.exists(os.path.join(RAG_FILES_DIR, 'faiss_index.index'))
+            local_csv_exists = os.path.exists(PROCESSED_DATA_FILE)
+        else:
+            print("⚠️ Failed to download pre-trained files. System will start empty.")
+
+    # Try to load existing RAG system
+    if local_rag_exists:
         try:
             print("Found existing RAG system files, loading...")
             query_engine = load_rag_system()
@@ -49,23 +66,23 @@
                 print(f"Could not count documents: {e}")
                 chunk_count = "неизвестно"
 
-            return f"AIEXP система инициализирована с {chunk_count} фрагментами нормативных документов (загружена из индекса)"
+            return f"✅ AIEXP система инициализирована с {chunk_count} фрагментами нормативных документов (загружена из индекса)"
         except Exception as e:
             print(f"Не удалось загрузить сохраненную систему: {str(e)}")
 
-    # If no
-    if
+    # If no RAG system but CSV exists, build from CSV
+    if local_csv_exists and query_engine is None:
         try:
-            print("
+            print("Building RAG system from CSV file...")
             processed_chunks_df = load_processed_chunks(PROCESSED_DATA_FILE)
 
-            #
+            # Check for required columns
             required_columns = {'document_id', 'file_link', 'chunk_text', 'chunk_id'}
             missing_columns = required_columns - set(processed_chunks_df.columns)
             if missing_columns:
-                return f"Ошибка при инициализации из CSV: отсутствуют необходимые столбцы: {missing_columns}"
+                return f"❌ Ошибка при инициализации из CSV: отсутствуют необходимые столбцы: {missing_columns}"
 
-            #
+            # Fill missing optional columns
             if 'txt_file_id' not in processed_chunks_df.columns:
                 processed_chunks_df['txt_file_id'] = processed_chunks_df['document_id']
             if 'section' not in processed_chunks_df.columns:
@@ -79,11 +96,11 @@
             if processed_chunks:
                 print(f"Building RAG system with {len(processed_chunks)} chunks...")
                 query_engine = build_rag_system(processed_chunks)
-                return f"AIEXP система инициализирована с {len(processed_chunks)} фрагментами нормативных документов (построена из CSV)"
+                return f"✅ AIEXP система инициализирована с {len(processed_chunks)} фрагментами нормативных документов (построена из CSV)"
         except Exception as e:
-            return f"Ошибка при инициализации из CSV: {str(e)}"
+            return f"❌ Ошибка при инициализации из CSV: {str(e)}"
 
-    return "AIEXP система готова к работе. Загрузите нормативные документы для создания базы знаний."
+    return "🔄 AIEXP система готова к работе. Загрузите нормативные документы для создания базы знаний."
 
 def get_uploaded_files_info():
     if not os.path.exists(UPLOAD_FOLDER):
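For orientation, the decision order that the edited initialize_system() now follows can be condensed as below. This is a sketch, not the app's code: it assumes RAG_FILES_DIR, PROCESSED_DATA_FILE and download_pretrained_files are exported by scripts.config and load_rag_system by scripts.rag_engine, as the diff implies.

import os

from scripts.config import RAG_FILES_DIR, PROCESSED_DATA_FILE, download_pretrained_files
from scripts.rag_engine import load_rag_system


def bootstrap_query_engine():
    """Condensed view of the startup decision in initialize_system()."""
    index_path = os.path.join(RAG_FILES_DIR, "faiss_index.index")

    # 1. Nothing on disk yet -> try to pull the pre-built artifacts from the Hub.
    if not os.path.exists(index_path) and not os.path.exists(PROCESSED_DATA_FILE):
        download_pretrained_files()

    # 2. A persisted FAISS index is present -> reuse it (fast path).
    if os.path.exists(index_path):
        return load_rag_system()

    # 3. Otherwise initialize_system() falls back to rebuilding from
    #    processed_chunks.csv, or starts empty and waits for uploads.
    return None

Note that the Hub download is only attempted when neither the FAISS index nor the processed CSV exists locally, so startup never overwrites existing local state.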
requirements.txt
CHANGED
@@ -8,4 +8,5 @@ PyPDF2
 python-docx
 openpyxl
 sentence-transformers
-google-generativeai
+google-generativeai
+huggingface_hub
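huggingface_hub is the only genuinely new dependency; it provides hf_hub_download, which scripts/config.py uses below. A quick standalone check that the dataset repo is reachable might look like this (repo id and target directory are taken from the config changes; the token is only needed if the repo is private):

import os
from huggingface_hub import hf_hub_download

# Fetch a single artifact from the dataset repo referenced in scripts/config.py.
path = hf_hub_download(
    repo_id="MrSimple01/AIEXP_RAG_FILES",   # HF_REPO_ID added in config.py
    filename="processed_chunks.csv",
    repo_type="dataset",
    local_dir="processed_data",             # where config.py also places the CSV
    token=os.getenv("HF_TOKEN"),            # None is fine for a public repo
)
print("downloaded to", path)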
scripts/config.py
CHANGED
@@ -5,6 +5,8 @@ from llama_index.llms.google_genai import GoogleGenAI
 from llama_index.core import Settings
 from llama_index.core.llms import ChatMessage, MessageRole
 import os
+from huggingface_hub import hf_hub_download
+
 
 # Configuration
 EMBEDDING_MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
@@ -17,6 +19,8 @@ UPLOAD_FOLDER = "UPLOADED_DOCUMENTS"
 INDEX_STATE_FILE = "processed_data/index_store.json"
 
 GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY', "AIzaSyDemsCp7JIdRNDRyP6DkYdMox1DLZwPcPE")
+HF_REPO_ID = "MrSimple01/AIEXP_RAG_FILES"
+HF_TOKEN = os.getenv('HF_TOKEN')
 LLM_MODEL = "gemini-2.0-flash"
 
 CHUNK_SIZE = 1024
@@ -25,6 +29,67 @@ MAX_CHUNK_SIZE = 2048
 MIN_CHUNK_SIZE = 750
 SIMILARITY_THRESHOLD = 0.7
 
+
+def download_pretrained_files():
+    """Download pre-trained RAG files from HuggingFace Hub"""
+    try:
+        print("Downloading pre-trained RAG files from HuggingFace Hub...")
+
+        # Files to download
+        files_to_download = [
+            "faiss_index.index",
+            "processed_chunks.csv",
+            "chunk_metadata.pkl",
+            "config.pkl",
+            "documents.pkl",
+            "default__vector_store.json",
+            "docstore.json",
+            "index_store.json"
+        ]
+
+        # Ensure RAG_FILES_DIR exists
+        os.makedirs(RAG_FILES_DIR, exist_ok=True)
+        os.makedirs("processed_data", exist_ok=True)
+
+        downloaded_files = {}
+
+        for filename in files_to_download:
+            try:
+                print(f"Downloading {filename}...")
+
+                # Download to RAG_FILES_DIR for most files, processed_data for CSV
+                target_dir = "processed_data" if filename == "processed_chunks.csv" else RAG_FILES_DIR
+
+                file_path = hf_hub_download(
+                    repo_id=HF_REPO_ID,
+                    filename=filename,
+                    local_dir=target_dir,
+                    repo_type="dataset",
+                    token=HF_TOKEN
+                )
+
+                downloaded_files[filename] = file_path
+                print(f"✓ Downloaded {filename}")
+
+            except Exception as e:
+                print(f"✗ Failed to download {filename}: {e}")
+                continue
+
+        # Verify critical files
+        critical_files = ["faiss_index.index", "processed_chunks.csv"]
+        missing_critical = [f for f in critical_files if f not in downloaded_files]
+
+        if missing_critical:
+            print(f"❌ Missing critical files: {missing_critical}")
+            return False
+
+        print(f"✅ Successfully downloaded {len(downloaded_files)}/{len(files_to_download)} files")
+        return True
+
+    except Exception as e:
+        print(f"❌ Failed to download pre-trained files: {e}")
+        return False
+
 def setup_llm_settings():
     """Setup embedding and LLM models"""
     # Configure Google API
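download_pretrained_files fetches the artifacts one at a time, so a single missing file only logs a warning, and then verifies that the two critical files arrived. An alternative, not used by this commit, is a single snapshot_download call; a minimal sketch, assuming RAG_FILES_DIR resolves to a local "RAG_FILES" directory:

from huggingface_hub import snapshot_download

# Pull all matching artifacts from the dataset repo in one call.
local_dir = snapshot_download(
    repo_id="MrSimple01/AIEXP_RAG_FILES",
    repo_type="dataset",
    local_dir="RAG_FILES",                  # assumed value of RAG_FILES_DIR
    allow_patterns=["*.index", "*.pkl", "*.json", "*.csv"],
)
print("snapshot stored in", local_dir)

The per-file loop in the commit has one advantage over a snapshot: it routes processed_chunks.csv into processed_data/ while everything else lands in RAG_FILES_DIR, which a one-shot snapshot cannot do without moving files afterwards.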
scripts/rag_engine.py
CHANGED
@@ -171,32 +171,95 @@ def save_rag_system(index, faiss_index, documents):
         pickle.dump(config, f)
 
 def load_rag_system():
-
+    """Load RAG system with better error handling and file verification"""
+    required_files = [
+        'faiss_index.index',
+        'default__vector_store.json',
+        'docstore.json',
+        'index_store.json'
+    ]
+
+    # Check if all required files exist
+    missing_files = []
+    for file in required_files:
+        if not os.path.exists(os.path.join(RAG_FILES_DIR, file)):
+            missing_files.append(file)
+
+    if missing_files:
+        print(f"Missing RAG system files: {missing_files}")
         return None
 
     try:
         setup_llm_settings()
 
+        # Load FAISS index
         faiss_index = faiss.read_index(os.path.join(RAG_FILES_DIR, 'faiss_index.index'))
         vector_store = FaissVectorStore(faiss_index=faiss_index)
-        storage_context = StorageContext.from_defaults(vector_store=vector_store)
-
-        embed_model = HuggingFaceEmbedding(model_name=EMBEDDING_MODEL)
-
-
-
-
-        for doc in documents:
-            index.insert(doc)
 
+        # Load storage context from persisted files
+        storage_context = StorageContext.from_defaults(
+            vector_store=vector_store,
+            persist_dir=RAG_FILES_DIR
+        )
+
+        # Create index from storage context
+        index = VectorStoreIndex.from_documents(
+            [],
+            storage_context=storage_context,
+            embed_model=Settings.embed_model
+        )
 
+        # Verify the index loaded correctly
+        print(f"✅ RAG system loaded with {faiss_index.ntotal} vectors")
 
         query_engine = create_query_engine(index)
         return query_engine
 
     except Exception as e:
-        print(f"Error loading RAG system: {str(e)}")
+        print(f"❌ Error loading RAG system: {str(e)}")
         return None
+
+def save_rag_system(index, faiss_index, documents):
+    """Enhanced save function with verification"""
+    try:
+        os.makedirs(RAG_FILES_DIR, exist_ok=True)
+
+        # Save FAISS index
+        faiss.write_index(faiss_index, os.path.join(RAG_FILES_DIR, 'faiss_index.index'))
+
+        # Persist storage context (saves docstore.json, index_store.json, default__vector_store.json)
+        index.storage_context.persist(persist_dir=RAG_FILES_DIR)
+
+        # Save documents pickle (for compatibility)
+        with open(os.path.join(RAG_FILES_DIR, 'documents.pkl'), 'wb') as f:
+            pickle.dump(documents, f)
+
+        # Save metadata pickle (for compatibility)
+        metadata_dict = {}
+        for doc in documents:
+            metadata_dict[doc.id_] = doc.metadata
+
+        with open(os.path.join(RAG_FILES_DIR, 'chunk_metadata.pkl'), 'wb') as f:
+            pickle.dump(metadata_dict, f)
+
+        # Save config
+        config = {
+            'embed_model_name': EMBEDDING_MODEL,
+            'vector_dim': 384,
+            'total_documents': len(documents),
+            'index_type': 'faiss_flat_ip'
+        }
+
+        with open(os.path.join(RAG_FILES_DIR, 'config.pkl'), 'wb') as f:
+            pickle.dump(config, f)
+
+        print(f"✅ RAG system saved successfully with {len(documents)} documents")
+
+    except Exception as e:
+        print(f"❌ Error saving RAG system: {str(e)}")
+        raise
+
+
 def build_rag_system(processed_chunks):
     setup_llm_settings()
 
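A small smoke test for the new load path, assuming the artifacts are already on disk (downloaded from the Hub or produced by save_rag_system); the query string is only an example:

from scripts.rag_engine import load_rag_system

# load_rag_system() returns None when required files are missing or loading fails.
query_engine = load_rag_system()
if query_engine is None:
    print("RAG files missing or unreadable; rebuild from processed_chunks.csv")
else:
    response = query_engine.query("Какие требования установлены к оформлению документа?")
    print(response)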