MrSimple07 committed on
Commit
a50849a
·
1 Parent(s): 648d16e

checking rag if it works

Browse files
app.py CHANGED
@@ -25,30 +25,56 @@ def initialize_system():
25
  global query_engine
26
  query_engine = None
27
 
28
- try:
29
- query_engine = load_rag_system()
30
- if query_engine is not None:
31
- chunk_count = 0
32
- if os.path.exists(PROCESSED_DATA_FILE):
33
- processed_chunks = load_processed_chunks(PROCESSED_DATA_FILE)
34
- chunk_count = len(processed_chunks)
35
- else:
36
- try:
37
- import pickle
38
- with open(os.path.join("processed_data", 'documents.pkl'), 'rb') as f:
39
- documents = pickle.load(f)
40
- chunk_count = len(documents)
41
- except:
42
- chunk_count = "неизвестно"
43
-
44
- return f"AIEXP система инициализирована с {chunk_count} фрагментами нормативных документов (загружена из сохраненного индекса)"
45
- except Exception as e:
46
- print(f"Не удалось загрузить сохраненную систему: {str(e)}")
 
 
 
 
 
47
 
 
48
  if os.path.exists(PROCESSED_DATA_FILE):
49
  try:
50
- processed_chunks = load_processed_chunks(PROCESSED_DATA_FILE).to_dict('records')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  if processed_chunks:
 
52
  query_engine = build_rag_system(processed_chunks)
53
  return f"AIEXP система инициализирована с {len(processed_chunks)} фрагментами нормативных документов (построена из CSV)"
54
  except Exception as e:
 
25
  global query_engine
26
  query_engine = None
27
 
28
+ # Fix: Check if existing RAG system files exist first
29
+ if os.path.exists(os.path.join(RAG_FILES_DIR, 'faiss_index.index')):
30
+ try:
31
+ print("Found existing RAG system files, loading...")
32
+ query_engine = load_rag_system()
33
+ if query_engine is not None:
34
+ # Count chunks from existing system
35
+ chunk_count = 0
36
+ if os.path.exists(PROCESSED_DATA_FILE):
37
+ processed_chunks = load_processed_chunks(PROCESSED_DATA_FILE)
38
+ chunk_count = len(processed_chunks)
39
+ else:
40
+ try:
41
+ import pickle
42
+ with open(os.path.join(RAG_FILES_DIR, 'documents.pkl'), 'rb') as f:
43
+ documents = pickle.load(f)
44
+ chunk_count = len(documents)
45
+ except Exception as e:
46
+ print(f"Could not count documents: {e}")
47
+ chunk_count = "неизвестно"
48
+
49
+ return f"AIEXP система инициализирована с {chunk_count} фрагментами нормативных документов (загружена из сохраненного индекса)"
50
+ except Exception as e:
51
+ print(f"Не удалось загрузить сохраненную систему: {str(e)}")
52
 
53
+ # If no existing RAG system, try to load from CSV
54
  if os.path.exists(PROCESSED_DATA_FILE):
55
  try:
56
+ print("Loading from CSV file...")
57
+ processed_chunks_df = load_processed_chunks(PROCESSED_DATA_FILE)
58
+
59
+ # Fix: Check for required columns with correct names from your CSV
60
+ required_columns = {'document_id', 'file_link', 'chunk_text', 'chunk_id'}
61
+ missing_columns = required_columns - set(processed_chunks_df.columns)
62
+ if missing_columns:
63
+ return f"Ошибка при инициализации из CSV: отсутствуют необходимые столбцы: {missing_columns}"
64
+
65
+ # Fix: Fill missing optional columns
66
+ if 'txt_file_id' not in processed_chunks_df.columns:
67
+ processed_chunks_df['txt_file_id'] = processed_chunks_df['document_id']
68
+ if 'section' not in processed_chunks_df.columns:
69
+ processed_chunks_df['section'] = ''
70
+ if 'subsection' not in processed_chunks_df.columns:
71
+ processed_chunks_df['subsection'] = ''
72
+ if 'chunk_length' not in processed_chunks_df.columns:
73
+ processed_chunks_df['chunk_length'] = processed_chunks_df['chunk_text'].str.len()
74
+
75
+ processed_chunks = processed_chunks_df.to_dict('records')
76
  if processed_chunks:
77
+ print(f"Building RAG system with {len(processed_chunks)} chunks...")
78
  query_engine = build_rag_system(processed_chunks)
79
  return f"AIEXP система инициализирована с {len(processed_chunks)} фрагментами нормативных документов (построена из CSV)"
80
  except Exception as e:
scripts/__pycache__/config.cpython-311.pyc CHANGED
Binary files a/scripts/__pycache__/config.cpython-311.pyc and b/scripts/__pycache__/config.cpython-311.pyc differ
 
scripts/__pycache__/rag_engine.cpython-311.pyc CHANGED
Binary files a/scripts/__pycache__/rag_engine.cpython-311.pyc and b/scripts/__pycache__/rag_engine.cpython-311.pyc differ
 
scripts/config.py CHANGED
@@ -17,11 +17,10 @@ PROCESSED_DATA_FILE = "processed_data/processed_chunks.csv"
17
  UPLOAD_FOLDER = "UPLOADED_DOCUMENTS"
18
  PROCESSED_DATA_FILE = "processed_data/processed_chunks.csv"
19
  INDEX_STATE_FILE = "processed_data/index_store.json"
20
- RAG_FILES_DIR = "rag_files"
21
-
22
- GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')
23
- LLM_MODEL = "gemini-2.5-flash"
24
 
 
 
25
 
26
 
27
  CHUNK_SIZE = 1024
@@ -34,6 +33,27 @@ SIMILARITY_THRESHOLD = 0.7
34
  RETRIEVER_TOP_K = 15
35
  RETRIEVER_SIMILARITY_CUTOFF = 0.7
36
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  CUSTOM_PROMPT = """
38
  You are a highly specialized Document Analysis Assistant (AIEXP). Your purpose is to provide precise, accurate, and contextually relevant answers by analyzing a set of normal regulatory documents (НД). Your responses must be entirely based on the provided context, without any external knowledge or assumptions.
39
 
@@ -69,11 +89,6 @@ Question:
69
  Answer:
70
  """
71
 
72
- def setup_llm_settings():
73
- Settings.embed_model = HuggingFaceEmbedding(model_name=EMBEDDING_MODEL)
74
- Settings.llm = GoogleGenAI(model=LLM_MODEL, api_key=GOOGLE_API_KEY)
75
- Settings.llm.system_prompt = CUSTOM_PROMPT
76
-
77
 
78
  LLM_MODEL_PREPROCESS = "gemini-1.5-flash"
79
 
 
17
  UPLOAD_FOLDER = "UPLOADED_DOCUMENTS"
18
  PROCESSED_DATA_FILE = "processed_data/processed_chunks.csv"
19
  INDEX_STATE_FILE = "processed_data/index_store.json"
20
+ RAG_FILES_DIR = "processed_data"
 
 
 
21
 
22
# SECURITY: never ship a real API key as the fallback value. The key is read
# only from the environment; when it is unset this is None. Any key that was
# previously committed here should be treated as leaked and revoked.
GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')
23
+ LLM_MODEL = "gemini-2.0-flash"
24
 
25
 
26
  CHUNK_SIZE = 1024
 
33
  RETRIEVER_TOP_K = 15
34
  RETRIEVER_SIMILARITY_CUTOFF = 0.7
35
 
36
+
37
def setup_llm_settings():
    """Populate the shared ``Settings`` object with an embedding model and LLM.

    The embedding model is always assigned from ``EMBEDDING_MODEL``. The LLM
    is only constructed when ``GOOGLE_API_KEY`` is truthy; if the key is
    missing or construction raises, a warning is printed and ``Settings.llm``
    is set to ``None``.
    """
    # The embedding model does not depend on the API key, so assign it first.
    Settings.embed_model = HuggingFaceEmbedding(model_name=EMBEDDING_MODEL)

    # Guard clause: without a key there is nothing to build.
    if not GOOGLE_API_KEY:
        print("Warning: GOOGLE_API_KEY not found. Setting LLM to None.")
        Settings.llm = None
        return

    try:
        genai_llm = GoogleGenAI(model=LLM_MODEL, api_key=GOOGLE_API_KEY)
        Settings.llm = genai_llm
        # Attach the custom prompt only if the LLM object exposes the attribute.
        if hasattr(genai_llm, 'system_prompt'):
            genai_llm.system_prompt = CUSTOM_PROMPT
    except Exception as exc:
        # Best-effort setup: report the failure and leave the LLM unset.
        print(f"Warning: Could not initialize Google GenAI LLM: {exc}")
        Settings.llm = None
55
+
56
+
57
  CUSTOM_PROMPT = """
58
  You are a highly specialized Document Analysis Assistant (AIEXP). Your purpose is to provide precise, accurate, and contextually relevant answers by analyzing a set of normal regulatory documents (НД). Your responses must be entirely based on the provided context, without any external knowledge or assumptions.
59
 
 
89
  Answer:
90
  """
91
 
 
 
 
 
 
92
 
93
  LLM_MODEL_PREPROCESS = "gemini-1.5-flash"
94
 
scripts/rag_engine.py CHANGED
@@ -26,7 +26,8 @@ def create_vector_index_with_faiss(documents):
26
 
27
  index = VectorStoreIndex.from_documents(
28
  documents,
29
- storage_context=storage_context
 
30
  )
31
 
32
  return index, faiss_index
@@ -177,8 +178,10 @@ def load_rag_system():
177
  faiss_index = faiss.read_index(os.path.join(RAG_FILES_DIR, 'faiss_index.index'))
178
  vector_store = FaissVectorStore(faiss_index=faiss_index)
179
  storage_context = StorageContext.from_defaults(vector_store=vector_store)
 
 
180
 
181
- index = VectorStoreIndex.from_documents([], storage_context=storage_context)
182
 
183
  with open(os.path.join(RAG_FILES_DIR, 'documents.pkl'), 'rb') as f:
184
  documents = pickle.load(f)
@@ -192,7 +195,6 @@ def load_rag_system():
192
  except Exception as e:
193
  print(f"Error loading RAG system: {str(e)}")
194
  return None
195
-
196
  def build_rag_system(processed_chunks):
197
  setup_llm_settings()
198
 
 
26
 
27
  index = VectorStoreIndex.from_documents(
28
  documents,
29
+ storage_context=storage_context,
30
+ embed_model = EMBEDDING_MODEL
31
  )
32
 
33
  return index, faiss_index
 
178
  faiss_index = faiss.read_index(os.path.join(RAG_FILES_DIR, 'faiss_index.index'))
179
  vector_store = FaissVectorStore(faiss_index=faiss_index)
180
  storage_context = StorageContext.from_defaults(vector_store=vector_store)
181
+
182
+ embed_model = HuggingFaceEmbedding(model_name=EMBEDDING_MODEL)
183
 
184
+ index = VectorStoreIndex.from_documents([], storage_context=storage_context, embed_model=embed_model)
185
 
186
  with open(os.path.join(RAG_FILES_DIR, 'documents.pkl'), 'rb') as f:
187
  documents = pickle.load(f)
 
195
  except Exception as e:
196
  print(f"Error loading RAG system: {str(e)}")
197
  return None
 
198
  def build_rag_system(processed_chunks):
199
  setup_llm_settings()
200