Commit · 1b98e0e
Parent(s): 34ac8e9

fixed config + added all the necessary files
app.py CHANGED

@@ -4,14 +4,11 @@ import shutil
 import pandas as pd
 from datetime import datetime
 from scripts.document_processor import process_multiple_documents, save_processed_chunks, load_processed_chunks
-from scripts.rag_engine import build_rag_system, query_documents, format_response_with_sources, add_new_document_to_system
+from scripts.rag_engine import build_rag_system, query_documents, format_response_with_sources, add_new_document_to_system, load_rag_system
 import json
 import tempfile
+from scripts.config import *
 
-UPLOAD_FOLDER = "UPLOADED_DOCUMENTS"
-PROCESSED_DATA_FILE = "processed_data/processed_chunks.csv"
-INDEX_STATE_FILE = "processed_data/index_store.json"
-RAG_FILES_DIR = "rag_files"
 
 if not os.path.exists(UPLOAD_FOLDER):
     os.makedirs(UPLOAD_FOLDER)

@@ -22,18 +19,41 @@ if not os.path.exists("processed_data"):
 if not os.path.exists(RAG_FILES_DIR):
     os.makedirs(RAG_FILES_DIR)
 
+
+
 def initialize_system():
     global query_engine
     query_engine = None
 
+    try:
+        query_engine = load_rag_system()
+        if query_engine is not None:
+            chunk_count = 0
+            if os.path.exists(PROCESSED_DATA_FILE):
+                processed_chunks = load_processed_chunks(PROCESSED_DATA_FILE)
+                chunk_count = len(processed_chunks)
+            else:
+                try:
+                    import pickle
+                    with open(os.path.join("processed_data", 'documents.pkl'), 'rb') as f:
+                        documents = pickle.load(f)
+                    chunk_count = len(documents)
+                except:
+                    chunk_count = "неизвестно"
+
+            return f"AIEXP система инициализирована с {chunk_count} фрагментами нормативных документов (загружена из сохраненного индекса)"
+    except Exception as e:
+        print(f"Не удалось загрузить сохраненную систему: {str(e)}")
+
+    # Fallback: try to build from processed_chunks.csv if RAG system loading failed
     if os.path.exists(PROCESSED_DATA_FILE):
         try:
-            processed_chunks = load_processed_chunks(PROCESSED_DATA_FILE).to_dict('
+            processed_chunks = load_processed_chunks(PROCESSED_DATA_FILE).to_dict('processed_chunks.csv')
             if processed_chunks:
                 query_engine = build_rag_system(processed_chunks)
-                return f"AIEXP система инициализирована с {len(processed_chunks)} фрагментами нормативных документов"
+                return f"AIEXP система инициализирована с {len(processed_chunks)} фрагментами нормативных документов (построена из CSV)"
         except Exception as e:
-            return f"Ошибка при
+            return f"Ошибка при инициализации из CSV: {str(e)}"
 
     return "AIEXP система готова к работе. Загрузите нормативные документы для создания базы знаний."
 

@@ -229,7 +249,7 @@ def create_demo_interface():
             placeholder="Введите вопрос по нормативным документам...",
             lines=3
         )
-        ask_btn = gr.Button("🔍 Найти ответ
+        ask_btn = gr.Button("🔍 Найти ответ", variant="primary", size="lg")
 
         gr.Examples(
             examples=[
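A note on the CSV fallback in initialize_system above: pandas' DataFrame.to_dict() takes an orient string such as 'records' or 'dict', not a filename, so the argument 'processed_chunks.csv' would raise ValueError if that branch ever runs. A minimal sketch of the call the fallback presumably intends, assuming build_rag_system expects a list of row dicts (the 'records' orient is an assumption, not the committed code):

import pandas as pd

# Assumption: 'records' is the intended orient; the commit passes the filename
# 'processed_chunks.csv', which pandas rejects as an orient value.
df = pd.read_csv("processed_data/processed_chunks.csv")
processed_chunks = df.to_dict('records')  # [{'column': value, ...}, one dict per row]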
scripts/__pycache__/config.cpython-311.pyc ADDED

Binary file (3.95 kB)

scripts/__pycache__/document_processor.cpython-311.pyc CHANGED

Binary files a/scripts/__pycache__/document_processor.cpython-311.pyc and b/scripts/__pycache__/document_processor.cpython-311.pyc differ

scripts/__pycache__/rag_engine.cpython-311.pyc ADDED

Binary file (11.2 kB)
scripts/config.py CHANGED

@@ -4,16 +4,31 @@ from llama_index.embeddings.huggingface import HuggingFaceEmbedding
 from llama_index.llms.google_genai import GoogleGenAI
 from llama_index.core import Settings
 
-
+
+
 EMBEDDING_MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
+RETRIEVER_TOP_K = 10
+RETRIEVER_SIMILARITY_CUTOFF = 0.7
+RAG_FILES_DIR = "processed_data"
+PROCESSED_DATA_FILE = "processed_data/processed_chunks.csv"
+
+UPLOAD_FOLDER = "UPLOADED_DOCUMENTS"
+PROCESSED_DATA_FILE = "processed_data/processed_chunks.csv"
+INDEX_STATE_FILE = "processed_data/index_store.json"
+RAG_FILES_DIR = "rag_files"
+
+GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')
 LLM_MODEL = "gemini-2.5-flash"
 
-
-
-
-
+
+
+CHUNK_SIZE = 1024
+CHUNK_OVERLAP = 256
+MAX_CHUNK_SIZE = 2048
+MIN_CHUNK_SIZE = 750
 SIMILARITY_THRESHOLD = 0.7
 
+
 RETRIEVER_TOP_K = 15
 RETRIEVER_SIMILARITY_CUTOFF = 0.7
 
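Because app.py and both scripts modules now star-import this file, note that PROCESSED_DATA_FILE, RAG_FILES_DIR, and the retriever settings are each assigned twice above. Python executes module-level assignments top to bottom, so importers see the later binding; a quick check against the values in the committed file:

from scripts.config import RAG_FILES_DIR, PROCESSED_DATA_FILE, RETRIEVER_TOP_K

print(RAG_FILES_DIR)        # "rag_files" — the second assignment shadows "processed_data"
print(PROCESSED_DATA_FILE)  # "processed_data/processed_chunks.csv" (both assignments agree)
print(RETRIEVER_TOP_K)      # 15 — the later of the two assignments wins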
scripts/document_processor.py CHANGED

@@ -10,13 +10,8 @@ from llama_index.core.text_splitter import SentenceSplitter
 from llama_index.embeddings.huggingface import HuggingFaceEmbedding
 from sklearn.metrics.pairwise import cosine_similarity
 from llama_index.core.schema import Document
+from scripts.config import *
 
-EMBEDDING_MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
-CHUNK_SIZE = 1024
-CHUNK_OVERLAP = 256
-MAX_CHUNK_SIZE = 2048
-MIN_CHUNK_SIZE = 200
-SIMILARITY_THRESHOLD = 0.85
 
 def extract_text_from_pdf(file_path):
     text = ""
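With the module-local constants removed in favor of the star-import, the effective values this module sees change: MIN_CHUNK_SIZE moves from 200 to 750 and SIMILARITY_THRESHOLD from 0.85 to 0.7, both now supplied by scripts/config.py. A minimal sketch (not the repository's actual splitting code) of how the centralized values would drive the SentenceSplitter imported at the top of this file:

from llama_index.core.text_splitter import SentenceSplitter
from scripts.config import CHUNK_SIZE, CHUNK_OVERLAP

# CHUNK_SIZE/CHUNK_OVERLAP resolve to 1024/256 from config.py;
# split_text returns a list of chunk strings.
splitter = SentenceSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
chunks = splitter.split_text("…some extracted document text…")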
scripts/rag_engine.py CHANGED

@@ -9,12 +9,8 @@ import pandas as pd
 import faiss
 import pickle
 import os
+from scripts.config import *
 
-EMBEDDING_MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
-RETRIEVER_TOP_K = 10
-RETRIEVER_SIMILARITY_CUTOFF = 0.7
-RAG_FILES_DIR = "processed_data"
-PROCESSED_DATA_FILE = "processed_data/processed_chunks.csv"
 
 def setup_llm_settings():
     embed_model = HuggingFaceEmbedding(model_name=EMBEDDING_MODEL)
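app.py now imports load_rag_system from this module, but its body lies outside the hunks shown here. A minimal sketch of what such a loader could look like, assuming the index was persisted with LlamaIndex's FAISS integration under RAG_FILES_DIR; everything beyond the config constants is an assumption, not the committed implementation:

import os
from llama_index.core import StorageContext, load_index_from_storage
from llama_index.vector_stores.faiss import FaissVectorStore
from scripts.config import RAG_FILES_DIR, RETRIEVER_TOP_K

def load_rag_system():
    # Hypothetical: return None when nothing has been persisted yet,
    # matching the "if query_engine is not None" check in app.py.
    if not os.path.exists(RAG_FILES_DIR):
        return None
    vector_store = FaissVectorStore.from_persist_dir(RAG_FILES_DIR)
    storage_context = StorageContext.from_defaults(
        vector_store=vector_store, persist_dir=RAG_FILES_DIR
    )
    index = load_index_from_storage(storage_context)
    return index.as_query_engine(similarity_top_k=RETRIEVER_TOP_K)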