Commit 36eab75
Parent(s): c9db278
Update scripts/rag_engine.py

scripts/rag_engine.py  CHANGED  (+199 -199)
@@ -1,200 +1,200 @@

The diff re-commits all 200 lines, but the only substantive change is the import path on line 7:

-from document_processor import create_llama_documents, process_single_document, save_processed_chunks, load_processed_chunks
+from script.document_processor import create_llama_documents, process_single_document, save_processed_chunks, load_processed_chunks

scripts/rag_engine.py after this commit:
from llama_index.core import VectorStoreIndex, Document, StorageContext, Settings
from llama_index.vector_stores.faiss import FaissVectorStore
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.response_synthesizers import ResponseMode, get_response_synthesizer
from script.document_processor import create_llama_documents, process_single_document, save_processed_chunks, load_processed_chunks
import pandas as pd
import faiss
import pickle
import os

EMBEDDING_MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
RETRIEVER_TOP_K = 10
RETRIEVER_SIMILARITY_CUTOFF = 0.7
RAG_FILES_DIR = "processed_data"
PROCESSED_DATA_FILE = "processed_data/processed_chunks.csv"
def setup_llm_settings():
    embed_model = HuggingFaceEmbedding(model_name=EMBEDDING_MODEL)
    Settings.embed_model = embed_model

def create_vector_index_with_faiss(documents):
    setup_llm_settings()

    # 384 is the embedding dimension of the MiniLM model configured above.
    d = 384
    faiss_index = faiss.IndexFlatIP(d)  # exact inner-product index
    vector_store = FaissVectorStore(faiss_index=faiss_index)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)

    index = VectorStoreIndex.from_documents(
        documents,
        storage_context=storage_context
    )

    return index, faiss_index
def create_retriever(index):
    return VectorIndexRetriever(
        index=index,
        similarity_top_k=RETRIEVER_TOP_K,
        similarity_cutoff=RETRIEVER_SIMILARITY_CUTOFF
    )

def create_response_synthesizer():
    return get_response_synthesizer(
        response_mode=ResponseMode.TREE_SUMMARIZE,
        streaming=False
    )

def create_query_engine(index):
    retriever = create_retriever(index)
    response_synthesizer = create_response_synthesizer()

    return RetrieverQueryEngine(
        retriever=retriever,
        response_synthesizer=response_synthesizer
    )
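A caveat on the retriever above: recent LlamaIndex releases do not appear to document a similarity_cutoff parameter on VectorIndexRetriever, so the value may be silently ignored; score filtering is normally applied through a node postprocessor. A minimal sketch of that variant, assuming SimilarityPostprocessor is available at this import path in the installed version:

from llama_index.core.postprocessor import SimilarityPostprocessor

def create_query_engine_with_cutoff(index):
    # Hypothetical variant: enforce the score threshold after retrieval
    # instead of passing similarity_cutoff to the retriever.
    return RetrieverQueryEngine(
        retriever=create_retriever(index),
        response_synthesizer=create_response_synthesizer(),
        node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=RETRIEVER_SIMILARITY_CUTOFF)],
    )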
def save_rag_system(index, faiss_index, documents):
    os.makedirs(RAG_FILES_DIR, exist_ok=True)

    # Persist the raw FAISS index alongside the LlamaIndex storage context.
    faiss.write_index(faiss_index, os.path.join(RAG_FILES_DIR, 'faiss_index.index'))

    index.storage_context.persist(persist_dir=RAG_FILES_DIR)

    with open(os.path.join(RAG_FILES_DIR, 'documents.pkl'), 'wb') as f:
        pickle.dump(documents, f)

    metadata_dict = {}
    for doc in documents:
        metadata_dict[doc.id_] = doc.metadata

    with open(os.path.join(RAG_FILES_DIR, 'chunk_metadata.pkl'), 'wb') as f:
        pickle.dump(metadata_dict, f)

    config = {
        'embed_model_name': EMBEDDING_MODEL,
        'vector_dim': 384,
        'total_documents': len(documents),
        'index_type': 'faiss_flat_ip'
    }

    with open(os.path.join(RAG_FILES_DIR, 'config.pkl'), 'wb') as f:
        pickle.dump(config, f)
def load_rag_system():
    if not os.path.exists(os.path.join(RAG_FILES_DIR, 'faiss_index.index')):
        return None

    try:
        setup_llm_settings()

        faiss_index = faiss.read_index(os.path.join(RAG_FILES_DIR, 'faiss_index.index'))
        vector_store = FaissVectorStore(faiss_index=faiss_index)
        storage_context = StorageContext.from_defaults(vector_store=vector_store)

        index = VectorStoreIndex.from_documents([], storage_context=storage_context)

        with open(os.path.join(RAG_FILES_DIR, 'documents.pkl'), 'rb') as f:
            documents = pickle.load(f)

        # Re-inserting every pickled document re-embeds the whole corpus on load.
        for doc in documents:
            index.insert(doc)

        query_engine = create_query_engine(index)
        return query_engine

    except Exception as e:
        print(f"Error loading RAG system: {str(e)}")
        return None
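Because load_rag_system re-inserts (and therefore re-embeds) every document, startup cost grows with corpus size. LlamaIndex's documented FAISS persistence path can rehydrate the index without re-embedding; a minimal sketch, assuming FaissVectorStore.from_persist_dir and load_index_from_storage are available in the installed version:

from llama_index.core import load_index_from_storage

def load_rag_system_from_storage():
    # Hypothetical faster loader: rebuild the index from the persisted
    # storage context instead of re-embedding every document.
    setup_llm_settings()
    vector_store = FaissVectorStore.from_persist_dir(RAG_FILES_DIR)
    storage_context = StorageContext.from_defaults(
        vector_store=vector_store, persist_dir=RAG_FILES_DIR
    )
    return create_query_engine(load_index_from_storage(storage_context))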
def build_rag_system(processed_chunks):
    setup_llm_settings()

    documents = create_llama_documents(processed_chunks)
    print(f"Created {len(documents)} documents for RAG system")

    index, faiss_index = create_vector_index_with_faiss(documents)
    query_engine = create_query_engine(index)

    save_rag_system(index, faiss_index, documents)

    return query_engine
def add_new_document_to_system(file_path, existing_query_engine):
    try:
        new_chunks = process_single_document(file_path)

        if not new_chunks:
            return existing_query_engine

        if os.path.exists(PROCESSED_DATA_FILE):
            existing_df = load_processed_chunks(PROCESSED_DATA_FILE)
            existing_chunks = existing_df.to_dict('records')
        else:
            existing_chunks = []

        all_chunks = existing_chunks + new_chunks
        save_processed_chunks(all_chunks, PROCESSED_DATA_FILE)

        # Rebuilds (and re-embeds) the entire system from the merged chunk set.
        query_engine = build_rag_system(all_chunks)

        print(f"Added {len(new_chunks)} new chunks from {os.path.basename(file_path)}")
        return query_engine

    except Exception as e:
        print(f"Error adding new document: {str(e)}")
        return existing_query_engine
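Rebuilding from scratch keeps the on-disk CSV authoritative but costs a full re-embedding per added file. An incremental sketch using only functions already defined in this module (persistence is left out; a production variant would still call save_rag_system):

def add_document_incrementally(index, file_path):
    # Hypothetical variant: insert the new chunks into the live index
    # rather than rebuilding; embeds only the new document's chunks.
    new_chunks = process_single_document(file_path)
    for doc in create_llama_documents(new_chunks):
        index.insert(doc)
    return create_query_engine(index)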
def query_documents(query_engine, question):
    response = query_engine.query(question)
    return response
def get_response_sources(response):
    sources = []
    for i, node in enumerate(response.source_nodes):
        source_info = {
            'chunk_number': i + 1,
            # 'Не указан' is the Russian fallback label, "Not specified".
            'section': node.metadata.get('section', 'Не указан'),
            'subsection': node.metadata.get('subsection', 'Не указан'),
            'chunk_id': node.metadata.get('chunk_id', 'Не указан'),
            'document_id': node.metadata.get('document_id', 'Не указан'),
            'txt_file_id': node.metadata.get('txt_file_id', 'Не указан'),
            'file_link': node.metadata.get('file_link', 'Не указан'),
            'text_preview': node.text[:200] + "..." if len(node.text) > 200 else node.text,
            'score': getattr(node, 'score', 0.0)
        }
        sources.append(source_info)
    return sources

def format_response_with_sources(response):
    formatted_response = {
        'answer': response.response,
        'sources': get_response_sources(response)
    }
    return formatted_response
def test_rag_system(query_engine, test_questions):
    results = []

    for question in test_questions:
        print(f"Question: {question}")
        response = query_documents(query_engine, question)
        formatted_response = format_response_with_sources(response)

        print(f"Answer: {formatted_response['answer']}")
        print("Sources:")
        for source in formatted_response['sources']:
            print(f"  - Chunk {source['chunk_number']}: {source['document_id']}")
            print(f"    Section: {source['section']}, Subsection: {source['subsection']}")
            print(f"    Preview: {source['text_preview']}")
        print("=" * 80)

        results.append({
            'question': question,
            'response': formatted_response
        })

    return results
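End-to-end usage follows directly from the functions above; a minimal sketch, assuming processed chunks already exist on disk (the question string is a placeholder, and no names beyond this module are introduced):

if __name__ == "__main__":
    # Reuse a persisted system when present; otherwise build one from the
    # processed-chunk CSV produced by the document_processor pipeline.
    engine = load_rag_system()
    if engine is None:
        chunks = load_processed_chunks(PROCESSED_DATA_FILE).to_dict('records')
        engine = build_rag_system(chunks)

    response = query_documents(engine, "Какие разделы описывает документ?")
    result = format_response_with_sources(response)
    print(result['answer'])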