MrSimple07 committed
Commit 1b98e0e · Parent: 34ac8e9

fixed config + added all the necessary files
app.py CHANGED
@@ -4,14 +4,11 @@ import shutil
 import pandas as pd
 from datetime import datetime
 from scripts.document_processor import process_multiple_documents, save_processed_chunks, load_processed_chunks
-from scripts.rag_engine import build_rag_system, query_documents, format_response_with_sources, add_new_document_to_system
+from scripts.rag_engine import build_rag_system, query_documents, format_response_with_sources, add_new_document_to_system, load_rag_system
 import json
 import tempfile
+from scripts.config import *
 
-UPLOAD_FOLDER = "UPLOADED_DOCUMENTS"
-PROCESSED_DATA_FILE = "processed_data/processed_chunks.csv"
-INDEX_STATE_FILE = "processed_data/index_store.json"
-RAG_FILES_DIR = "rag_files"
 
 if not os.path.exists(UPLOAD_FOLDER):
     os.makedirs(UPLOAD_FOLDER)
@@ -22,18 +19,41 @@ if not os.path.exists("processed_data"):
 if not os.path.exists(RAG_FILES_DIR):
     os.makedirs(RAG_FILES_DIR)
 
+
 def initialize_system():
     global query_engine
     query_engine = None
 
+    # Prefer the index persisted by a previous run.
+    try:
+        query_engine = load_rag_system()
+        if query_engine is not None:
+            chunk_count = 0
+            if os.path.exists(PROCESSED_DATA_FILE):
+                processed_chunks = load_processed_chunks(PROCESSED_DATA_FILE)
+                chunk_count = len(processed_chunks)
+            else:
+                try:
+                    import pickle
+                    with open(os.path.join("processed_data", 'documents.pkl'), 'rb') as f:
+                        documents = pickle.load(f)
+                    chunk_count = len(documents)
+                except Exception:
+                    chunk_count = "unknown"
+
+            return f"AIEXP system initialized with {chunk_count} regulatory document chunks (loaded from the saved index)"
+    except Exception as e:
+        print(f"Failed to load the saved system: {str(e)}")
+
+    # Fallback: rebuild from processed_chunks.csv if loading the saved index failed
     if os.path.exists(PROCESSED_DATA_FILE):
         try:
-            processed_chunks = load_processed_chunks(PROCESSED_DATA_FILE).to_dict('records')
+            processed_chunks = load_processed_chunks(PROCESSED_DATA_FILE).to_dict('records')  # orient='records' yields a list of row dicts
             if processed_chunks:
                 query_engine = build_rag_system(processed_chunks)
-                return f"AIEXP system initialized with {len(processed_chunks)} regulatory document chunks"
+                return f"AIEXP system initialized with {len(processed_chunks)} regulatory document chunks (built from CSV)"
         except Exception as e:
-            return f"Initialization error: {str(e)}"
+            return f"Error during initialization from CSV: {str(e)}"
 
     return "AIEXP system is ready. Upload regulatory documents to build the knowledge base."
@@ -229,7 +249,7 @@ def create_demo_interface():
         placeholder="Enter a question about the regulatory documents...",
         lines=3
     )
-    ask_btn = gr.Button("🔍 Find answer in regulatory docs", variant="primary", size="lg")
+    ask_btn = gr.Button("🔍 Find answer", variant="primary", size="lg")
 
     gr.Examples(
         examples=[
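
The rewritten initialize_system now has three outcomes: reuse a persisted index, rebuild from the chunk CSV, or start empty. A quick smoke check of that control flow (the status strings follow the diff above; which branch fires depends on what is present under processed_data/):

status = initialize_system()
print(status)
# "... (loaded from the saved index)"   -> load_rag_system() found a usable persisted index
# "... (built from CSV)"                -> only processed_chunks.csv was present
# "AIEXP system is ready. Upload ..."   -> fresh deployment, nothing processed yet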
scripts/__pycache__/config.cpython-311.pyc ADDED
Binary file (3.95 kB)
 
scripts/__pycache__/document_processor.cpython-311.pyc CHANGED
Binary files a/scripts/__pycache__/document_processor.cpython-311.pyc and b/scripts/__pycache__/document_processor.cpython-311.pyc differ
 
scripts/__pycache__/rag_engine.cpython-311.pyc ADDED
Binary file (11.2 kB)
 
scripts/config.py CHANGED
@@ -4,16 +4,22 @@ from llama_index.embeddings.huggingface import HuggingFaceEmbedding
 from llama_index.llms.google_genai import GoogleGenAI
 from llama_index.core import Settings
 
-GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')
+# Paths shared by app.py, document_processor.py and rag_engine.py
+UPLOAD_FOLDER = "UPLOADED_DOCUMENTS"
+PROCESSED_DATA_FILE = "processed_data/processed_chunks.csv"
+INDEX_STATE_FILE = "processed_data/index_store.json"
+RAG_FILES_DIR = "rag_files"
+
+GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')
 EMBEDDING_MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
 LLM_MODEL = "gemini-2.5-flash"
 
-CHUNK_SIZE = 1000
-CHUNK_OVERLAP = 150
-MAX_CHUNK_SIZE = 2500
-MIN_CHUNK_SIZE = 1000
+CHUNK_SIZE = 1024
+CHUNK_OVERLAP = 256
+MAX_CHUNK_SIZE = 2048
+MIN_CHUNK_SIZE = 750
 SIMILARITY_THRESHOLD = 0.7
 
 RETRIEVER_TOP_K = 15
 RETRIEVER_SIMILARITY_CUTOFF = 0.7
 
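With app.py, document_processor.py, and rag_engine.py all consuming these constants via `from scripts.config import *`, an explicit __all__ in scripts/config.py (a suggested hardening, not part of this commit) would keep the star-imports from also re-exporting the module's own imports such as os and Settings:

# Suggested addition to scripts/config.py (not in the commit): pin exactly
# what `from scripts.config import *` hands to the importing modules.
__all__ = [
    "UPLOAD_FOLDER", "PROCESSED_DATA_FILE", "INDEX_STATE_FILE", "RAG_FILES_DIR",
    "GOOGLE_API_KEY", "EMBEDDING_MODEL", "LLM_MODEL",
    "CHUNK_SIZE", "CHUNK_OVERLAP", "MAX_CHUNK_SIZE", "MIN_CHUNK_SIZE",
    "SIMILARITY_THRESHOLD", "RETRIEVER_TOP_K", "RETRIEVER_SIMILARITY_CUTOFF",
]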
scripts/document_processor.py CHANGED
@@ -10,13 +10,8 @@ from llama_index.core.text_splitter import SentenceSplitter
 from llama_index.embeddings.huggingface import HuggingFaceEmbedding
 from sklearn.metrics.pairwise import cosine_similarity
 from llama_index.core.schema import Document
+from scripts.config import *
 
-EMBEDDING_MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
-CHUNK_SIZE = 1024
-CHUNK_OVERLAP = 256
-MAX_CHUNK_SIZE = 2048
-MIN_CHUNK_SIZE = 200
-SIMILARITY_THRESHOLD = 0.85
 
 def extract_text_from_pdf(file_path):
     text = ""
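
Note that switching to the shared config silently changes this module's effective tuning: the local MIN_CHUNK_SIZE = 200 and SIMILARITY_THRESHOLD = 0.85 are replaced by the config's 750 and 0.7. Below is a minimal sketch of how the chunking constants plausibly feed the SentenceSplitter imported at the top of this module; the actual splitter call sits outside the hunk shown:

from llama_index.core.text_splitter import SentenceSplitter
from scripts.config import CHUNK_SIZE, CHUNK_OVERLAP

# Split extracted document text into overlapping chunks sized by the shared config.
splitter = SentenceSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
chunks = splitter.split_text("...text returned by extract_text_from_pdf...")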
scripts/rag_engine.py CHANGED
@@ -9,12 +9,8 @@ import pandas as pd
 import faiss
 import pickle
 import os
+from scripts.config import *
 
-EMBEDDING_MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
-RETRIEVER_TOP_K = 10
-RETRIEVER_SIMILARITY_CUTOFF = 0.7
-RAG_FILES_DIR = "processed_data"
-PROCESSED_DATA_FILE = "processed_data/processed_chunks.csv"
 
 def setup_llm_settings():
     embed_model = HuggingFaceEmbedding(model_name=EMBEDDING_MODEL)
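
load_rag_system, which app.py now calls first, is added by this commit but its body falls outside the hunk shown. A minimal sketch of what a FAISS-backed loader typically looks like in llama-index, written in the context of this module (setup_llm_settings, RAG_FILES_DIR, and RETRIEVER_TOP_K come from the code above); the FaissVectorStore persistence layout is an assumption, not taken from the diff:

import os

from llama_index.core import StorageContext, load_index_from_storage
from llama_index.vector_stores.faiss import FaissVectorStore

def load_rag_system():
    # No saved index yet -> let app.py fall back to the CSV rebuild path.
    if not os.path.isdir(RAG_FILES_DIR):
        return None
    setup_llm_settings()  # embedding/LLM settings must match those used at build time
    vector_store = FaissVectorStore.from_persist_dir(RAG_FILES_DIR)
    storage_context = StorageContext.from_defaults(
        vector_store=vector_store, persist_dir=RAG_FILES_DIR
    )
    index = load_index_from_storage(storage_context)
    return index.as_query_engine(similarity_top_k=RETRIEVER_TOP_K)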