Spaces:
Sleeping
Sleeping
Commit
·
a50849a
1
Parent(s):
648d16e
checking rag if it works
Browse files- app.py +46 -20
- scripts/__pycache__/config.cpython-311.pyc +0 -0
- scripts/__pycache__/rag_engine.cpython-311.pyc +0 -0
- scripts/config.py +24 -9
- scripts/rag_engine.py +5 -3
app.py
CHANGED
@@ -25,30 +25,56 @@ def initialize_system():
|
|
25 |
global query_engine
|
26 |
query_engine = None
|
27 |
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
|
|
|
|
|
|
|
|
|
|
47 |
|
|
|
48 |
if os.path.exists(PROCESSED_DATA_FILE):
|
49 |
try:
|
50 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
if processed_chunks:
|
|
|
52 |
query_engine = build_rag_system(processed_chunks)
|
53 |
return f"AIEXP система инициализирована с {len(processed_chunks)} фрагментами нормативных документов (построена из CSV)"
|
54 |
except Exception as e:
|
|
|
25 |
global query_engine
|
26 |
query_engine = None
|
27 |
|
28 |
+
# Fix: Check if existing RAG system files exist first
|
29 |
+
if os.path.exists(os.path.join(RAG_FILES_DIR, 'faiss_index.index')):
|
30 |
+
try:
|
31 |
+
print("Found existing RAG system files, loading...")
|
32 |
+
query_engine = load_rag_system()
|
33 |
+
if query_engine is not None:
|
34 |
+
# Count chunks from existing system
|
35 |
+
chunk_count = 0
|
36 |
+
if os.path.exists(PROCESSED_DATA_FILE):
|
37 |
+
processed_chunks = load_processed_chunks(PROCESSED_DATA_FILE)
|
38 |
+
chunk_count = len(processed_chunks)
|
39 |
+
else:
|
40 |
+
try:
|
41 |
+
import pickle
|
42 |
+
with open(os.path.join(RAG_FILES_DIR, 'documents.pkl'), 'rb') as f:
|
43 |
+
documents = pickle.load(f)
|
44 |
+
chunk_count = len(documents)
|
45 |
+
except Exception as e:
|
46 |
+
print(f"Could not count documents: {e}")
|
47 |
+
chunk_count = "неизвестно"
|
48 |
+
|
49 |
+
return f"AIEXP система инициализирована с {chunk_count} фрагментами нормативных документов (загружена из сохраненного индекса)"
|
50 |
+
except Exception as e:
|
51 |
+
print(f"Не удалось загрузить сохраненную систему: {str(e)}")
|
52 |
|
53 |
+
# If no existing RAG system, try to load from CSV
|
54 |
if os.path.exists(PROCESSED_DATA_FILE):
|
55 |
try:
|
56 |
+
print("Loading from CSV file...")
|
57 |
+
processed_chunks_df = load_processed_chunks(PROCESSED_DATA_FILE)
|
58 |
+
|
59 |
+
# Fix: Check for required columns with correct names from your CSV
|
60 |
+
required_columns = {'document_id', 'file_link', 'chunk_text', 'chunk_id'}
|
61 |
+
missing_columns = required_columns - set(processed_chunks_df.columns)
|
62 |
+
if missing_columns:
|
63 |
+
return f"Ошибка при инициализации из CSV: отсутствуют необходимые столбцы: {missing_columns}"
|
64 |
+
|
65 |
+
# Fix: Fill missing optional columns
|
66 |
+
if 'txt_file_id' not in processed_chunks_df.columns:
|
67 |
+
processed_chunks_df['txt_file_id'] = processed_chunks_df['document_id']
|
68 |
+
if 'section' not in processed_chunks_df.columns:
|
69 |
+
processed_chunks_df['section'] = ''
|
70 |
+
if 'subsection' not in processed_chunks_df.columns:
|
71 |
+
processed_chunks_df['subsection'] = ''
|
72 |
+
if 'chunk_length' not in processed_chunks_df.columns:
|
73 |
+
processed_chunks_df['chunk_length'] = processed_chunks_df['chunk_text'].str.len()
|
74 |
+
|
75 |
+
processed_chunks = processed_chunks_df.to_dict('records')
|
76 |
if processed_chunks:
|
77 |
+
print(f"Building RAG system with {len(processed_chunks)} chunks...")
|
78 |
query_engine = build_rag_system(processed_chunks)
|
79 |
return f"AIEXP система инициализирована с {len(processed_chunks)} фрагментами нормативных документов (построена из CSV)"
|
80 |
except Exception as e:
|
scripts/__pycache__/config.cpython-311.pyc
CHANGED
Binary files a/scripts/__pycache__/config.cpython-311.pyc and b/scripts/__pycache__/config.cpython-311.pyc differ
|
|
scripts/__pycache__/rag_engine.cpython-311.pyc
CHANGED
Binary files a/scripts/__pycache__/rag_engine.cpython-311.pyc and b/scripts/__pycache__/rag_engine.cpython-311.pyc differ
|
|
scripts/config.py
CHANGED
@@ -17,11 +17,10 @@ PROCESSED_DATA_FILE = "processed_data/processed_chunks.csv"
|
|
17 |
UPLOAD_FOLDER = "UPLOADED_DOCUMENTS"
|
18 |
PROCESSED_DATA_FILE = "processed_data/processed_chunks.csv"
|
19 |
INDEX_STATE_FILE = "processed_data/index_store.json"
|
20 |
-
RAG_FILES_DIR = "
|
21 |
-
|
22 |
-
GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')
|
23 |
-
LLM_MODEL = "gemini-2.5-flash"
|
24 |
|
|
|
|
|
25 |
|
26 |
|
27 |
CHUNK_SIZE = 1024
|
@@ -34,6 +33,27 @@ SIMILARITY_THRESHOLD = 0.7
|
|
34 |
RETRIEVER_TOP_K = 15
|
35 |
RETRIEVER_SIMILARITY_CUTOFF = 0.7
|
36 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
CUSTOM_PROMPT = """
|
38 |
You are a highly specialized Document Analysis Assistant (AIEXP). Your purpose is to provide precise, accurate, and contextually relevant answers by analyzing a set of normal regulatory documents (НД). Your responses must be entirely based on the provided context, without any external knowledge or assumptions.
|
39 |
|
@@ -69,11 +89,6 @@ Question:
|
|
69 |
Answer:
|
70 |
"""
|
71 |
|
72 |
-
def setup_llm_settings():
|
73 |
-
Settings.embed_model = HuggingFaceEmbedding(model_name=EMBEDDING_MODEL)
|
74 |
-
Settings.llm = GoogleGenAI(model=LLM_MODEL, api_key=GOOGLE_API_KEY)
|
75 |
-
Settings.llm.system_prompt = CUSTOM_PROMPT
|
76 |
-
|
77 |
|
78 |
LLM_MODEL_PREPROCESS = "gemini-1.5-flash"
|
79 |
|
|
|
17 |
UPLOAD_FOLDER = "UPLOADED_DOCUMENTS"
|
18 |
PROCESSED_DATA_FILE = "processed_data/processed_chunks.csv"
|
19 |
INDEX_STATE_FILE = "processed_data/index_store.json"
|
20 |
+
RAG_FILES_DIR = "processed_data"
|
|
|
|
|
|
|
21 |
|
22 |
+
GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY', "AIzaSyDemsCp7JIdRNDRyP6DkYdMox1DLZwPcPE")
|
23 |
+
LLM_MODEL = "gemini-2.0-flash"
|
24 |
|
25 |
|
26 |
CHUNK_SIZE = 1024
|
|
|
33 |
RETRIEVER_TOP_K = 15
|
34 |
RETRIEVER_SIMILARITY_CUTOFF = 0.7
|
35 |
|
36 |
+
|
37 |
+
def setup_llm_settings():
|
38 |
+
# Set embedding model first
|
39 |
+
Settings.embed_model = HuggingFaceEmbedding(model_name=EMBEDDING_MODEL)
|
40 |
+
|
41 |
+
# Only set LLM if API key is available
|
42 |
+
if GOOGLE_API_KEY:
|
43 |
+
try:
|
44 |
+
llm = GoogleGenAI(model=LLM_MODEL, api_key=GOOGLE_API_KEY)
|
45 |
+
Settings.llm = llm
|
46 |
+
# Set system prompt after LLM is properly initialized
|
47 |
+
if hasattr(llm, 'system_prompt'):
|
48 |
+
llm.system_prompt = CUSTOM_PROMPT
|
49 |
+
except Exception as e:
|
50 |
+
print(f"Warning: Could not initialize Google GenAI LLM: {e}")
|
51 |
+
Settings.llm = None
|
52 |
+
else:
|
53 |
+
print("Warning: GOOGLE_API_KEY not found. Setting LLM to None.")
|
54 |
+
Settings.llm = None
|
55 |
+
|
56 |
+
|
57 |
CUSTOM_PROMPT = """
|
58 |
You are a highly specialized Document Analysis Assistant (AIEXP). Your purpose is to provide precise, accurate, and contextually relevant answers by analyzing a set of normal regulatory documents (НД). Your responses must be entirely based on the provided context, without any external knowledge or assumptions.
|
59 |
|
|
|
89 |
Answer:
|
90 |
"""
|
91 |
|
|
|
|
|
|
|
|
|
|
|
92 |
|
93 |
LLM_MODEL_PREPROCESS = "gemini-1.5-flash"
|
94 |
|
scripts/rag_engine.py
CHANGED
@@ -26,7 +26,8 @@ def create_vector_index_with_faiss(documents):
|
|
26 |
|
27 |
index = VectorStoreIndex.from_documents(
|
28 |
documents,
|
29 |
-
storage_context=storage_context
|
|
|
30 |
)
|
31 |
|
32 |
return index, faiss_index
|
@@ -177,8 +178,10 @@ def load_rag_system():
|
|
177 |
faiss_index = faiss.read_index(os.path.join(RAG_FILES_DIR, 'faiss_index.index'))
|
178 |
vector_store = FaissVectorStore(faiss_index=faiss_index)
|
179 |
storage_context = StorageContext.from_defaults(vector_store=vector_store)
|
|
|
|
|
180 |
|
181 |
-
index = VectorStoreIndex.from_documents([], storage_context=storage_context)
|
182 |
|
183 |
with open(os.path.join(RAG_FILES_DIR, 'documents.pkl'), 'rb') as f:
|
184 |
documents = pickle.load(f)
|
@@ -192,7 +195,6 @@ def load_rag_system():
|
|
192 |
except Exception as e:
|
193 |
print(f"Error loading RAG system: {str(e)}")
|
194 |
return None
|
195 |
-
|
196 |
def build_rag_system(processed_chunks):
|
197 |
setup_llm_settings()
|
198 |
|
|
|
26 |
|
27 |
index = VectorStoreIndex.from_documents(
|
28 |
documents,
|
29 |
+
storage_context=storage_context,
|
30 |
+
embed_model = EMBEDDING_MODEL
|
31 |
)
|
32 |
|
33 |
return index, faiss_index
|
|
|
178 |
faiss_index = faiss.read_index(os.path.join(RAG_FILES_DIR, 'faiss_index.index'))
|
179 |
vector_store = FaissVectorStore(faiss_index=faiss_index)
|
180 |
storage_context = StorageContext.from_defaults(vector_store=vector_store)
|
181 |
+
|
182 |
+
embed_model = HuggingFaceEmbedding(model_name=EMBEDDING_MODEL)
|
183 |
|
184 |
+
index = VectorStoreIndex.from_documents([], storage_context=storage_context, embed_model=embed_model)
|
185 |
|
186 |
with open(os.path.join(RAG_FILES_DIR, 'documents.pkl'), 'rb') as f:
|
187 |
documents = pickle.load(f)
|
|
|
195 |
except Exception as e:
|
196 |
print(f"Error loading RAG system: {str(e)}")
|
197 |
return None
|
|
|
198 |
def build_rag_system(processed_chunks):
|
199 |
setup_llm_settings()
|
200 |
|