Spaces:
Sleeping
Sleeping
Commit
·
648d16e
1
Parent(s):
92105e7
fixed file_path problem + added app_1.py + added possible relevancy check first
Browse files
- app.py +1 -2
- app_1.py +473 -0
- scripts/__pycache__/config.cpython-311.pyc +0 -0
- scripts/__pycache__/document_processor.cpython-311.pyc +0 -0
- scripts/__pycache__/rag_engine.cpython-311.pyc +0 -0
- scripts/config.py +1 -2
- scripts/document_processor.py +1 -3
- scripts/rag_engine.py +87 -0
app.py
CHANGED
@@ -45,10 +45,9 @@ def initialize_system():
|
|
45 |
except Exception as e:
|
46 |
print(f"Не удалось загрузить сохраненную систему: {str(e)}")
|
47 |
|
48 |
-
# Fallback: try to build from processed_chunks.csv if RAG system loading failed
|
49 |
if os.path.exists(PROCESSED_DATA_FILE):
|
50 |
try:
|
51 |
-
processed_chunks = load_processed_chunks(PROCESSED_DATA_FILE).to_dict('
|
52 |
if processed_chunks:
|
53 |
query_engine = build_rag_system(processed_chunks)
|
54 |
return f"AIEXP система инициализирована с {len(processed_chunks)} фрагментами нормативных документов (построена из CSV)"
|
|
|
45 |
except Exception as e:
|
46 |
print(f"Не удалось загрузить сохраненную систему: {str(e)}")
|
47 |
|
|
|
48 |
if os.path.exists(PROCESSED_DATA_FILE):
|
49 |
try:
|
50 |
+
processed_chunks = load_processed_chunks(PROCESSED_DATA_FILE).to_dict('records')
|
51 |
if processed_chunks:
|
52 |
query_engine = build_rag_system(processed_chunks)
|
53 |
return f"AIEXP система инициализирована с {len(processed_chunks)} фрагментами нормативных документов (построена из CSV)"
|
app_1.py
ADDED
@@ -0,0 +1,473 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import os
|
3 |
+
import shutil
|
4 |
+
import pandas as pd
|
5 |
+
from datetime import datetime
|
6 |
+
from scripts.document_processor import *
|
7 |
+
from scripts.rag_engine import *
|
8 |
+
import json
|
9 |
+
import tempfile
|
10 |
+
from scripts.config import *
|
11 |
+
|
12 |
+
|
13 |
+
if not os.path.exists(UPLOAD_FOLDER):
|
14 |
+
os.makedirs(UPLOAD_FOLDER)
|
15 |
+
|
16 |
+
if not os.path.exists("processed_data"):
|
17 |
+
os.makedirs("processed_data")
|
18 |
+
|
19 |
+
if not os.path.exists(RAG_FILES_DIR):
|
20 |
+
os.makedirs(RAG_FILES_DIR)
|
21 |
+
|
22 |
+
|
23 |
+
|
24 |
+
def initialize_system():
|
25 |
+
global query_engine
|
26 |
+
query_engine = None
|
27 |
+
|
28 |
+
try:
|
29 |
+
query_engine = load_rag_system()
|
30 |
+
if query_engine is not None:
|
31 |
+
chunk_count = 0
|
32 |
+
if os.path.exists(PROCESSED_DATA_FILE):
|
33 |
+
processed_chunks = load_processed_chunks(PROCESSED_DATA_FILE)
|
34 |
+
chunk_count = len(processed_chunks)
|
35 |
+
else:
|
36 |
+
try:
|
37 |
+
import pickle
|
38 |
+
with open(os.path.join("processed_data", 'documents.pkl'), 'rb') as f:
|
39 |
+
documents = pickle.load(f)
|
40 |
+
chunk_count = len(documents)
|
41 |
+
except:
|
42 |
+
chunk_count = "неизвестно"
|
43 |
+
|
44 |
+
return f"AIEXP система инициализирована с {chunk_count} фрагментами нормативных документов (загружена из сохраненного индекса)"
|
45 |
+
except Exception as e:
|
46 |
+
print(f"Не удалось загрузить сохраненную систему: {str(e)}")
|
47 |
+
|
48 |
+
if os.path.exists(PROCESSED_DATA_FILE):
|
49 |
+
try:
|
50 |
+
processed_chunks_df = load_processed_chunks(PROCESSED_DATA_FILE)
|
51 |
+
# Проверяем наличие нужных столбцов
|
52 |
+
required_columns = {'file_link', 'chunk_text', 'chunk_id', 'document_id'}
|
53 |
+
if not required_columns.issubset(set(processed_chunks_df.columns)):
|
54 |
+
return f"Ошибка при инициализации из CSV: отсутствуют необходимые столбцы: {required_columns - set(processed_chunks_df.columns)}"
|
55 |
+
processed_chunks = processed_chunks_df.to_dict('records')
|
56 |
+
if processed_chunks:
|
57 |
+
query_engine = build_rag_system(processed_chunks)
|
58 |
+
return f"AIEXP система инициализирована с {len(processed_chunks)} фрагментами нормативных документов (построена из CSV)"
|
59 |
+
except Exception as e:
|
60 |
+
return f"Ошибка при инициализации из CSV: {str(e)}"
|
61 |
+
|
62 |
+
|
63 |
+
return "AIEXP система готова к работе. Загрузите нормативные документы для создания базы знаний."
|
64 |
+
|
65 |
+
def get_uploaded_files_info():
|
66 |
+
if not os.path.exists(UPLOAD_FOLDER):
|
67 |
+
return "Нет загруженных файлов в базе знаний"
|
68 |
+
|
69 |
+
files = os.listdir(UPLOAD_FOLDER)
|
70 |
+
if not files:
|
71 |
+
return "Нет загруженных файлов в базе знаний"
|
72 |
+
|
73 |
+
file_info = []
|
74 |
+
file_count = len(files)
|
75 |
+
for file in files:
|
76 |
+
file_path = os.path.join(UPLOAD_FOLDER, file)
|
77 |
+
size = os.path.getsize(file_path)
|
78 |
+
modified = datetime.fromtimestamp(os.path.getmtime(file_path)).strftime("%Y-%m-%d %H:%M")
|
79 |
+
file_info.append(f"📄 {file} ({size} байт, добавлен: {modified})")
|
80 |
+
|
81 |
+
return f"Всего нормативных документов в базе: {file_count}\n\n" + "\n".join(file_info)
|
82 |
+
|
83 |
+
def upload_files(files):
|
84 |
+
global query_engine
|
85 |
+
|
86 |
+
if not files:
|
87 |
+
return "Файлы не выбраны", get_uploaded_files_info()
|
88 |
+
|
89 |
+
uploaded_count = 0
|
90 |
+
errors = []
|
91 |
+
|
92 |
+
for file in files:
|
93 |
+
try:
|
94 |
+
filename = os.path.basename(file.name)
|
95 |
+
destination = os.path.join(UPLOAD_FOLDER, filename)
|
96 |
+
shutil.copy2(file.name, destination)
|
97 |
+
uploaded_count += 1
|
98 |
+
|
99 |
+
if query_engine is not None:
|
100 |
+
try:
|
101 |
+
query_engine = add_new_document_to_system(destination, query_engine)
|
102 |
+
except Exception as e:
|
103 |
+
errors.append(f"Ошибка добавления {filename} в систему: {str(e)}")
|
104 |
+
|
105 |
+
except Exception as e:
|
106 |
+
errors.append(f"Ошибка загрузки {file.name}: {str(e)}")
|
107 |
+
|
108 |
+
result_message = f"Загружено нормативных документов: {uploaded_count}"
|
109 |
+
if errors:
|
110 |
+
result_message += f"\nОшибки:\n" + "\n".join(errors)
|
111 |
+
else:
|
112 |
+
result_message += f"\nДокументы автоматически добавлены в базу знаний"
|
113 |
+
|
114 |
+
return result_message, get_uploaded_files_info()
|
115 |
+
|
116 |
+
def process_all_documents():
|
117 |
+
global query_engine
|
118 |
+
|
119 |
+
if not os.path.exists(UPLOAD_FOLDER):
|
120 |
+
return "Папка с нормативными документами не найдена"
|
121 |
+
|
122 |
+
files = os.listdir(UPLOAD_FOLDER)
|
123 |
+
if not files:
|
124 |
+
return "Нет нормативных документов для обработки"
|
125 |
+
|
126 |
+
file_paths = [os.path.join(UPLOAD_FOLDER, f) for f in files]
|
127 |
+
|
128 |
+
try:
|
129 |
+
processed_chunks = process_multiple_documents(file_paths)
|
130 |
+
|
131 |
+
if not processed_chunks:
|
132 |
+
return "Не удалось создать фрагменты нормативных документов"
|
133 |
+
|
134 |
+
save_processed_chunks(processed_chunks, PROCESSED_DATA_FILE)
|
135 |
+
|
136 |
+
query_engine = build_rag_system(processed_chunks)
|
137 |
+
|
138 |
+
with open(INDEX_STATE_FILE, 'w', encoding='utf-8') as f:
|
139 |
+
json.dump({
|
140 |
+
'processed_files': files,
|
141 |
+
'chunks_count': len(processed_chunks),
|
142 |
+
'last_update': datetime.now().isoformat()
|
143 |
+
}, f, ensure_ascii=False, indent=2)
|
144 |
+
|
145 |
+
return f"Обработка базы знаний завершена успешно!\nОбработано нормативных документов: {len(files)}\nСоздано фрагментов: {len(processed_chunks)}\nAIEXP система готова для работы с нормативной документацией."
|
146 |
+
|
147 |
+
except Exception as e:
|
148 |
+
return f"Ошибка при обработке нормативных документов: {str(e)}"
|
149 |
+
|
150 |
+
def get_system_status():
|
151 |
+
status_info = []
|
152 |
+
|
153 |
+
files_count = len(os.listdir(UPLOAD_FOLDER)) if os.path.exists(UPLOAD_FOLDER) else 0
|
154 |
+
|
155 |
+
if os.path.exists(INDEX_STATE_FILE):
|
156 |
+
with open(INDEX_STATE_FILE, 'r', encoding='utf-8') as f:
|
157 |
+
state = json.load(f)
|
158 |
+
status_info.append(f"🟢 AIEXP система активна")
|
159 |
+
status_info.append(f"📊 Нормативных документов в базе: {files_count}")
|
160 |
+
status_info.append(f"📝 Фрагментов в индексе: {state.get('chunks_count', 0)}")
|
161 |
+
status_info.append(f"🕒 Последнее обновление: {state.get('last_update', 'Неизвестно')}")
|
162 |
+
|
163 |
+
if state.get('processed_files'):
|
164 |
+
status_info.append(f"📋 Обработанные документы:")
|
165 |
+
for file in state['processed_files'][:10]:
|
166 |
+
status_info.append(f" • {file}")
|
167 |
+
if len(state['processed_files']) > 10:
|
168 |
+
status_info.append(f" ... и еще {len(state['processed_files']) - 10} документов")
|
169 |
+
else:
|
170 |
+
status_info.append("🔴 AIEXP система не инициализирована")
|
171 |
+
status_info.append(f"📊 Нормативных документов загружено: {files_count}")
|
172 |
+
status_info.append("Обработайте документы для создания базы знаний")
|
173 |
+
|
174 |
+
return "\n".join(status_info)
|
175 |
+
|
176 |
+
def answer_question(question):
|
177 |
+
global query_engine
|
178 |
+
|
179 |
+
if not question.strip():
|
180 |
+
return "Пожалуйста, введите вопрос по нормативной документации", ""
|
181 |
+
|
182 |
+
if query_engine is None:
|
183 |
+
return "База знаний не готова. Сначала загрузите и обработайте нормативные документы.", ""
|
184 |
+
|
185 |
+
try:
|
186 |
+
response = query_documents(query_engine, question)
|
187 |
+
formatted_response = format_response_with_sources(response)
|
188 |
+
|
189 |
+
answer = formatted_response['answer']
|
190 |
+
|
191 |
+
sources_info = []
|
192 |
+
sources_info.append("📚 Источники из нормативной документации:")
|
193 |
+
for i, source in enumerate(formatted_response['sources'][:5], 1):
|
194 |
+
sources_info.append(f"\n{i}. Документ: {source['document_id']}")
|
195 |
+
if source['section']:
|
196 |
+
sources_info.append(f" Раздел: {source['section']}")
|
197 |
+
if source['subsection']:
|
198 |
+
sources_info.append(f" Подраздел: {source['subsection']}")
|
199 |
+
sources_info.append(f" Фрагмент: ...{source['text_preview'][:150]}...")
|
200 |
+
|
201 |
+
return answer, "\n".join(sources_info)
|
202 |
+
|
203 |
+
except Exception as e:
|
204 |
+
return f"Ошибка при обработке вопроса: {str(e)}", ""
|
205 |
+
|
206 |
+
def clear_all_data():
|
207 |
+
global query_engine
|
208 |
+
|
209 |
+
try:
|
210 |
+
if os.path.exists(UPLOAD_FOLDER):
|
211 |
+
shutil.rmtree(UPLOAD_FOLDER)
|
212 |
+
os.makedirs(UPLOAD_FOLDER)
|
213 |
+
|
214 |
+
if os.path.exists("processed_data"):
|
215 |
+
shutil.rmtree("processed_data")
|
216 |
+
os.makedirs("processed_data")
|
217 |
+
|
218 |
+
if os.path.exists(RAG_FILES_DIR):
|
219 |
+
shutil.rmtree(RAG_FILES_DIR)
|
220 |
+
os.makedirs(RAG_FILES_DIR)
|
221 |
+
|
222 |
+
query_engine = None
|
223 |
+
|
224 |
+
return "Вся база знаний успешно очищена", get_uploaded_files_info(), get_system_status()
|
225 |
+
|
226 |
+
except Exception as e:
|
227 |
+
return f"Ошибка при очистке базы знаний: {str(e)}", get_uploaded_files_info(), get_system_status()
|
228 |
+
|
229 |
+
|
230 |
+
|
231 |
+
chat_history = []
|
232 |
+
|
233 |
+
def add_to_chat_history(user_query, assistant_response):
|
234 |
+
"""Add exchange to chat history"""
|
235 |
+
global chat_history
|
236 |
+
chat_history.append({
|
237 |
+
'user': user_query,
|
238 |
+
'assistant': assistant_response,
|
239 |
+
'timestamp': datetime.now().isoformat()
|
240 |
+
})
|
241 |
+
|
242 |
+
# Keep only last 10 exchanges to prevent memory issues
|
243 |
+
if len(chat_history) > 10:
|
244 |
+
chat_history = chat_history[-10:]
|
245 |
+
|
246 |
+
def get_chat_context():
|
247 |
+
"""Get formatted chat history"""
|
248 |
+
if not chat_history:
|
249 |
+
return "Новый разговор"
|
250 |
+
|
251 |
+
context = "История разговора:\n"
|
252 |
+
for i, exchange in enumerate(chat_history[-3:], 1): # Show last 3
|
253 |
+
context += f"{i}. Пользователь: {exchange['user'][:100]}...\n"
|
254 |
+
context += f" Ответ: {exchange['assistant'][:100]}...\n"
|
255 |
+
return context
|
256 |
+
|
257 |
+
def enhanced_answer_question(question):
|
258 |
+
"""Enhanced version with preprocessing and chat history"""
|
259 |
+
global query_engine, chat_history
|
260 |
+
|
261 |
+
if not question.strip():
|
262 |
+
return "Пожалуйста, введите вопрос по нормативной документации", "", get_chat_context()
|
263 |
+
|
264 |
+
if query_engine is None:
|
265 |
+
return "База знаний не готова. Сначала загрузите и обработайте нормативные документы.", "", get_chat_context()
|
266 |
+
|
267 |
+
try:
|
268 |
+
# Step 1: Preprocess query with chat history
|
269 |
+
improved_query = preprocess_query_with_context(question, chat_history)
|
270 |
+
|
271 |
+
# Step 2: Query with enhanced scoring
|
272 |
+
enhanced_response = query_documents_with_scores(query_engine, improved_query)
|
273 |
+
|
274 |
+
# Step 3: Format response with sources
|
275 |
+
formatted_response = format_enhanced_response_with_sources(enhanced_response)
|
276 |
+
|
277 |
+
# Step 4: Add conversational context
|
278 |
+
final_answer = create_chat_context_prompt(formatted_response['answer'], chat_history)
|
279 |
+
|
280 |
+
# Step 5: Add to chat history
|
281 |
+
add_to_chat_history(question, final_answer)
|
282 |
+
|
283 |
+
# Add query info to sources if preprocessing was used
|
284 |
+
sources_text = formatted_response['sources']
|
285 |
+
if improved_query != question:
|
286 |
+
sources_text = f"🔄 Улучшенный запрос: '{improved_query}'\n\n" + sources_text
|
287 |
+
|
288 |
+
return final_answer, sources_text, get_chat_context()
|
289 |
+
|
290 |
+
except Exception as e:
|
291 |
+
error_msg = f"Ошибка при обработке вопроса: {str(e)}"
|
292 |
+
add_to_chat_history(question, error_msg)
|
293 |
+
return error_msg, "", get_chat_context()
|
294 |
+
|
295 |
+
def clear_chat_history():
|
296 |
+
"""Clear chat history"""
|
297 |
+
global chat_history
|
298 |
+
chat_history = []
|
299 |
+
return "История чата очищена", get_chat_context()
|
300 |
+
|
301 |
+
def create_demo_interface():
|
302 |
+
with gr.Blocks(title="AIEXP - AI Expert для нормативной документации", theme=gr.themes.Soft()) as demo:
|
303 |
+
gr.Markdown("""
|
304 |
+
# 🤖 AIEXP - Artificial Intelligence Expert
|
305 |
+
|
306 |
+
## Инструмент для работы с нормативной документацией
|
307 |
+
|
308 |
+
**Возможности системы:**
|
309 |
+
- 🔍 Поиск информации по запросу с указанием источников среди нормативной документации
|
310 |
+
- 📋 Цитирование пунктов нормативной документации из базы знаний
|
311 |
+
- 📝 Краткий пересказ содержания разделов или целых нормативных документов
|
312 |
+
- 🔎 Семантический анализ соответствия информации требованиям НД
|
313 |
+
- 📋 Формирование пошаговых планов действий на основании требований НД
|
314 |
+
|
315 |
+
**Поддерживаемые форматы:** PDF, DOCX, TXT, CSV, XLSX, JSON
|
316 |
+
""")
|
317 |
+
|
318 |
+
with gr.Tab("🏠 Поиск по нормативным документам"):
|
319 |
+
gr.Markdown("### Задайте вопрос по нормативной документации")
|
320 |
+
|
321 |
+
with gr.Row():
|
322 |
+
with gr.Column(scale=3):
|
323 |
+
question_input = gr.Textbox(
|
324 |
+
label="Ваш вопрос к базе знаний",
|
325 |
+
placeholder="Введите вопрос по нормативным документам...",
|
326 |
+
lines=3
|
327 |
+
)
|
328 |
+
ask_btn = gr.Button("🔍 Найти ответ", variant="primary", size="lg")
|
329 |
+
clear_chat_btn = gr.Button("🗑️ Очистить историю чата", variant="secondary")
|
330 |
+
|
331 |
+
|
332 |
+
chat_context = gr.Textbox(
|
333 |
+
label="Контекст разговора",
|
334 |
+
lines=4,
|
335 |
+
interactive=False,
|
336 |
+
value=get_chat_context()
|
337 |
+
)
|
338 |
+
|
339 |
+
gr.Examples(
|
340 |
+
examples=[
|
341 |
+
"Какой стандарт устанавливает порядок признания протоколов испытаний продукции в области использования атомной энергии?",
|
342 |
+
"Кто несет ответственность за организацию и проведение признания протоколов испытаний продукции?",
|
343 |
+
"В каких случаях могут быть признаны протоколы испытаний, проведенные лабораториями, не включенными в перечисления?",
|
344 |
+
"Какие критерии используются органом по сертификации для анализа документации на втором этапе признания протоколов испытаний?"
|
345 |
+
],
|
346 |
+
inputs=question_input,
|
347 |
+
label="Примеры вопросов по нормативным документам"
|
348 |
+
)
|
349 |
+
|
350 |
+
with gr.Column(scale=4):
|
351 |
+
answer_output = gr.Textbox(
|
352 |
+
label="Ответ на основе нормативных документов",
|
353 |
+
lines=8,
|
354 |
+
interactive=False
|
355 |
+
)
|
356 |
+
|
357 |
+
sources_output = gr.Textbox(
|
358 |
+
label="Источники из нормативной документации",
|
359 |
+
lines=10,
|
360 |
+
interactive=False
|
361 |
+
)
|
362 |
+
|
363 |
+
# Event handlers
|
364 |
+
ask_btn.click(
|
365 |
+
fn=enhanced_answer_question,
|
366 |
+
inputs=[question_input],
|
367 |
+
outputs=[answer_output, sources_output, chat_context]
|
368 |
+
)
|
369 |
+
|
370 |
+
clear_chat_btn.click(
|
371 |
+
fn=clear_chat_history,
|
372 |
+
outputs=[chat_context]
|
373 |
+
)
|
374 |
+
|
375 |
+
with gr.Tab("📤 Управление базой знаний (Администратор)"):
|
376 |
+
gr.Markdown("### Загрузка и обработка нормативных документов")
|
377 |
+
|
378 |
+
with gr.Row():
|
379 |
+
with gr.Column(scale=2):
|
380 |
+
file_upload = gr.File(
|
381 |
+
label="Выберите нормативные документы для загрузки",
|
382 |
+
file_count="multiple",
|
383 |
+
file_types=[".pdf", ".docx", ".txt", ".csv", ".xlsx", ".json"]
|
384 |
+
)
|
385 |
+
|
386 |
+
with gr.Row():
|
387 |
+
upload_btn = gr.Button("📤 Загрузить в базу знаний", variant="primary")
|
388 |
+
process_btn = gr.Button("⚙️ Переобработать всю базу", variant="secondary")
|
389 |
+
clear_btn = gr.Button("🗑️ Очистить базу знаний", variant="stop")
|
390 |
+
|
391 |
+
with gr.Column(scale=2):
|
392 |
+
upload_status = gr.Textbox(
|
393 |
+
label="Статус загрузки",
|
394 |
+
lines=3,
|
395 |
+
interactive=False
|
396 |
+
)
|
397 |
+
|
398 |
+
processing_status = gr.Textbox(
|
399 |
+
label="Статус обработки базы знаний",
|
400 |
+
lines=5,
|
401 |
+
interactive=False
|
402 |
+
)
|
403 |
+
|
404 |
+
gr.Markdown("### База нормативных документов")
|
405 |
+
files_info = gr.Textbox(
|
406 |
+
label="Документы в базе знаний",
|
407 |
+
lines=8,
|
408 |
+
interactive=False,
|
409 |
+
value=get_uploaded_files_info()
|
410 |
+
)
|
411 |
+
|
412 |
+
with gr.Tab("📊 Статус AIEXP системы"):
|
413 |
+
gr.Markdown("### Информация о состоянии базы знаний")
|
414 |
+
|
415 |
+
system_status = gr.Textbox(
|
416 |
+
label="Статус AIEXP системы",
|
417 |
+
lines=10,
|
418 |
+
interactive=False,
|
419 |
+
value=get_system_status()
|
420 |
+
)
|
421 |
+
|
422 |
+
refresh_status_btn = gr.Button("🔄 Обновить статус системы")
|
423 |
+
|
424 |
+
upload_btn.click(
|
425 |
+
fn=upload_files,
|
426 |
+
inputs=[file_upload],
|
427 |
+
outputs=[upload_status, files_info]
|
428 |
+
)
|
429 |
+
|
430 |
+
process_btn.click(
|
431 |
+
fn=process_all_documents,
|
432 |
+
outputs=[processing_status]
|
433 |
+
)
|
434 |
+
|
435 |
+
ask_btn.click(
|
436 |
+
fn=answer_question,
|
437 |
+
inputs=[question_input],
|
438 |
+
outputs=[answer_output, sources_output]
|
439 |
+
)
|
440 |
+
|
441 |
+
question_input.submit(
|
442 |
+
fn=answer_question,
|
443 |
+
inputs=[question_input],
|
444 |
+
outputs=[answer_output, sources_output]
|
445 |
+
)
|
446 |
+
|
447 |
+
clear_btn.click(
|
448 |
+
fn=clear_all_data,
|
449 |
+
outputs=[processing_status, files_info, system_status]
|
450 |
+
)
|
451 |
+
|
452 |
+
refresh_status_btn.click(
|
453 |
+
fn=get_system_status,
|
454 |
+
outputs=[system_status]
|
455 |
+
)
|
456 |
+
|
457 |
+
return demo
|
458 |
+
|
459 |
+
|
460 |
+
|
461 |
+
|
462 |
+
if __name__ == "__main__":
|
463 |
+
print("Инициализация AIEXP системы...")
|
464 |
+
init_message = initialize_system()
|
465 |
+
print(init_message)
|
466 |
+
|
467 |
+
demo = create_demo_interface()
|
468 |
+
demo.launch(
|
469 |
+
share=True,
|
470 |
+
server_name="0.0.0.0",
|
471 |
+
server_port=7860,
|
472 |
+
show_error=True
|
473 |
+
)
|
scripts/__pycache__/config.cpython-311.pyc
CHANGED
Binary files a/scripts/__pycache__/config.cpython-311.pyc and b/scripts/__pycache__/config.cpython-311.pyc differ
|
|
scripts/__pycache__/document_processor.cpython-311.pyc
CHANGED
Binary files a/scripts/__pycache__/document_processor.cpython-311.pyc and b/scripts/__pycache__/document_processor.cpython-311.pyc differ
|
|
scripts/__pycache__/rag_engine.cpython-311.pyc
CHANGED
Binary files a/scripts/__pycache__/rag_engine.cpython-311.pyc and b/scripts/__pycache__/rag_engine.cpython-311.pyc differ
|
|
scripts/config.py
CHANGED
@@ -98,7 +98,7 @@ Chat History:
|
|
98 |
Current Question: {user_query}
|
99 |
|
100 |
Tasks:
|
101 |
-
1. If the question refers to previous context
|
102 |
2. Add relevant keywords that would help find documents.
|
103 |
3. Maintain the legal/regulatory focus.
|
104 |
4. Keep it concise but specific.
|
@@ -107,7 +107,6 @@ Return ONLY the improved question:
|
|
107 |
"""
|
108 |
|
109 |
try:
|
110 |
-
# Create the message and get a response from the LLM
|
111 |
messages = [ChatMessage(role=MessageRole.USER, content=preprocessing_prompt)]
|
112 |
response = llm.chat(messages)
|
113 |
improved_query = response.message.content.strip()
|
|
|
98 |
Current Question: {user_query}
|
99 |
|
100 |
Tasks:
|
101 |
+
1. If the question refers to previous context, make it self-contained.
|
102 |
2. Add relevant keywords that would help find documents.
|
103 |
3. Maintain the legal/regulatory focus.
|
104 |
4. Keep it concise but specific.
|
|
|
107 |
"""
|
108 |
|
109 |
try:
|
|
|
110 |
messages = [ChatMessage(role=MessageRole.USER, content=preprocessing_prompt)]
|
111 |
response = llm.chat(messages)
|
112 |
improved_query = response.message.content.strip()
|
scripts/document_processor.py
CHANGED
@@ -207,7 +207,6 @@ def process_single_document(file_path):
|
|
207 |
|
208 |
results.append({
|
209 |
'document_id': filename,
|
210 |
-
'file_path': file_path,
|
211 |
'section': current_section,
|
212 |
'subsection': current_subsection,
|
213 |
'chunk_text': chunk_text,
|
@@ -239,12 +238,11 @@ def create_llama_documents(processed_chunks):
|
|
239 |
metadata = {
|
240 |
'chunk_id': chunk_data['chunk_id'],
|
241 |
'document_id': chunk_data['document_id'],
|
242 |
-
'file_path': chunk_data['file_path'],
|
243 |
'section': chunk_data['section'] if chunk_data['section'] else '',
|
244 |
'subsection': chunk_data['subsection'] if chunk_data['subsection'] else '',
|
245 |
'chunk_length': chunk_data['chunk_length'],
|
246 |
'txt_file_id': chunk_data.get('txt_file_id', chunk_data['document_id']),
|
247 |
-
'file_link': chunk_data.get('file_link', chunk_data['
|
248 |
}
|
249 |
|
250 |
doc = Document(
|
|
|
207 |
|
208 |
results.append({
|
209 |
'document_id': filename,
|
|
|
210 |
'section': current_section,
|
211 |
'subsection': current_subsection,
|
212 |
'chunk_text': chunk_text,
|
|
|
238 |
metadata = {
|
239 |
'chunk_id': chunk_data['chunk_id'],
|
240 |
'document_id': chunk_data['document_id'],
|
|
|
241 |
'section': chunk_data['section'] if chunk_data['section'] else '',
|
242 |
'subsection': chunk_data['subsection'] if chunk_data['subsection'] else '',
|
243 |
'chunk_length': chunk_data['chunk_length'],
|
244 |
'txt_file_id': chunk_data.get('txt_file_id', chunk_data['document_id']),
|
245 |
+
'file_link': chunk_data.get('file_link', chunk_data['file_link'] if 'file_link' in chunk_data else '')
|
246 |
}
|
247 |
|
248 |
doc = Document(
|
scripts/rag_engine.py
CHANGED
@@ -38,6 +38,93 @@ def create_retriever(index):
|
|
38 |
similarity_cutoff=RETRIEVER_SIMILARITY_CUTOFF
|
39 |
)
|
40 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
41 |
def create_response_synthesizer():
|
42 |
return get_response_synthesizer(
|
43 |
response_mode=ResponseMode.TREE_SUMMARIZE,
|
|
|
38 |
similarity_cutoff=RETRIEVER_SIMILARITY_CUTOFF
|
39 |
)
|
40 |
|
41 |
+
|
42 |
+
def create_enhanced_retriever(index, query_str=None):
|
43 |
+
"""Create retriever with score transparency"""
|
44 |
+
retriever = VectorIndexRetriever(
|
45 |
+
index=index,
|
46 |
+
similarity_top_k=RETRIEVER_TOP_K,
|
47 |
+
similarity_cutoff=RETRIEVER_SIMILARITY_CUTOFF
|
48 |
+
)
|
49 |
+
return retriever
|
50 |
+
|
51 |
+
|
52 |
+
def query_documents_with_scores(query_engine, question):
|
53 |
+
response = query_engine.query(question)
|
54 |
+
|
55 |
+
# Extract scores from source nodes
|
56 |
+
scored_sources = []
|
57 |
+
max_score = 0.0
|
58 |
+
|
59 |
+
for node in response.source_nodes:
|
60 |
+
score = getattr(node, 'score', 0.0)
|
61 |
+
max_score = max(max_score, score)
|
62 |
+
scored_sources.append({
|
63 |
+
'node': node,
|
64 |
+
'score': score,
|
65 |
+
'text_preview': node.text[:200] + "..." if len(node.text) > 200 else node.text
|
66 |
+
})
|
67 |
+
|
68 |
+
scored_sources.sort(key=lambda x: x['score'], reverse=True)
|
69 |
+
|
70 |
+
QUERY_RELEVANCE_THRESHOLD = 0.6
|
71 |
+
is_query_relevant = max_score >= QUERY_RELEVANCE_THRESHOLD
|
72 |
+
|
73 |
+
# Enhanced response object
|
74 |
+
enhanced_response = {
|
75 |
+
'original_response': response,
|
76 |
+
'answer': response.response,
|
77 |
+
'max_similarity_score': max_score,
|
78 |
+
'is_query_relevant': is_query_relevant,
|
79 |
+
'scored_sources': scored_sources,
|
80 |
+
'total_sources': len(scored_sources)
|
81 |
+
}
|
82 |
+
|
83 |
+
# If query is not relevant, modify the answer
|
84 |
+
if not is_query_relevant:
|
85 |
+
enhanced_response['answer'] = (
|
86 |
+
"На основе доступных нормативных документов я не могу дать точный ответ на ваш вопрос. "
|
87 |
+
f"Максимальная релевантность найденных документов: {max_score:.2f}. "
|
88 |
+
"Попробуйте переформулировать вопрос или быть более конкретным."
|
89 |
+
)
|
90 |
+
enhanced_response['scored_sources'] = [] # Don't show irrelevant sources
|
91 |
+
|
92 |
+
return enhanced_response
|
93 |
+
|
94 |
+
|
95 |
+
def format_enhanced_response_with_sources(enhanced_response):
|
96 |
+
"""Format response with detailed scoring info"""
|
97 |
+
sources_info = []
|
98 |
+
|
99 |
+
if enhanced_response['is_query_relevant']:
|
100 |
+
sources_info.append("📚 Источники из нормативной документации:")
|
101 |
+
sources_info.append(f"🎯 Максимальная релевантность: {enhanced_response['max_similarity_score']:.3f}")
|
102 |
+
|
103 |
+
for i, source_data in enumerate(enhanced_response['scored_sources'][:5], 1):
|
104 |
+
node = source_data['node']
|
105 |
+
score = source_data['score']
|
106 |
+
|
107 |
+
sources_info.append(f"\n{i}. Релевантность: {score:.3f}")
|
108 |
+
sources_info.append(f" Документ: {node.metadata.get('document_id', 'Неизвестен')}")
|
109 |
+
|
110 |
+
if node.metadata.get('section'):
|
111 |
+
sources_info.append(f" Раздел: {node.metadata.get('section')}")
|
112 |
+
if node.metadata.get('subsection'):
|
113 |
+
sources_info.append(f" Подраздел: {node.metadata.get('subsection')}")
|
114 |
+
|
115 |
+
sources_info.append(f" Фрагмент: ...{source_data['text_preview']}")
|
116 |
+
else:
|
117 |
+
sources_info.append("⚠️ Запрос имеет низкую релевантность к базе нормативных документов")
|
118 |
+
sources_info.append(f"🎯 Максимальная найденная релевантность: {enhanced_response['max_similarity_score']:.3f}")
|
119 |
+
sources_info.append("💡 Рекомендация: Переформулируйте вопрос более конкретно")
|
120 |
+
|
121 |
+
return {
|
122 |
+
'answer': enhanced_response['answer'],
|
123 |
+
'sources': "\n".join(sources_info),
|
124 |
+
'is_relevant': enhanced_response['is_query_relevant'],
|
125 |
+
'max_score': enhanced_response['max_similarity_score']
|
126 |
+
}
|
127 |
+
|
128 |
def create_response_synthesizer():
|
129 |
return get_response_synthesizer(
|
130 |
response_mode=ResponseMode.TREE_SUMMARIZE,
|