File size: 7,403 Bytes
f3d0f1e
 
 
 
 
 
 
 
85df319
 
 
c98215f
85df319
 
f3d0f1e
 
c98215f
 
 
f3d0f1e
a3f5633
b5a209d
 
 
f3d0f1e
 
b5a209d
 
 
 
 
 
 
 
 
 
 
 
 
c98215f
b5a209d
 
 
f3d0f1e
b5a209d
 
f3d0f1e
 
 
 
 
 
b5a209d
 
f3d0f1e
b5a209d
f3d0f1e
 
85df319
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b5a209d
 
 
 
85df319
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f3d0f1e
 
85df319
 
b5a209d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f3d0f1e
b5a209d
f3d0f1e
b5a209d
f3d0f1e
b5a209d
f3d0f1e
b5a209d
f3d0f1e
 
b5a209d
85df319
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
from langchain_community.document_loaders import DataFrameLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.vectorstores import FAISS
from langchain_community.llms import HuggingFaceHub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from faiss import IndexFlatL2
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain.embeddings import SentenceTransformerEmbeddings
#import functools

import pandas as pd
import os

# For a local run, load environment variables from a .env file
# from dotenv import load_dotenv
# load_dotenv()

# Module-level resources, created once when the module is imported.
# Embedding model handed to FAISS when loading the stored index below.
embeddings = HuggingFaceEmbeddings(
    model_name="paraphrase-multilingual-MiniLM-L12-v2",
)
# Pre-loaded vector store; get_vectorstore returns it for the "All" selection.
db_all = FAISS.load_local(
    folder_path="./src/FAISS",
    index_name="speeches_1949_09_12",
    embeddings=embeddings,
    allow_dangerous_deserialization=True,
)

def load_documents(df):
    """
    Turn the rows of a DataFrame into chunked documents for vector storage.

    Parameters:
    ----------
    df : pandas.DataFrame
        Source table; the text of each document is read from its
        'speech_content' column.

    Returns:
    -------
    list
        The document chunks produced by the text splitter.
    """
    # Splitter settings: chunks of at most 1024 units with an overlap of 32,
    # where length is measured with len(); separators are not treated as regex.
    chunker = RecursiveCharacterTextSplitter(
        chunk_size=1024,
        chunk_overlap=32,
        length_function=len,
        is_separator_regex=False,
    )

    # Load one document per DataFrame row, then cut the documents into chunks.
    raw_docs = DataFrameLoader(data_frame=df, page_content_column='speech_content').load()
    return chunker.split_documents(documents=raw_docs)


#@functools.lru_cache()
def get_vectorstore(inputs, embeddings):
    """
    Build the FAISS vector store for the selected legislature periods.

    Parameters:
    ----------
    inputs : list of str
        The selected stores. Each entry is either the keyword "All" or a label
        whose text before the first "." names the index, e.g.
        "20. Legislaturperiode" selects the index "20_legislature".
    embeddings : Embeddings
        Embedding model used to load the stored indices and as the embedding
        function of the returned store.

    Returns:
    -------
    FAISS
        The module-level `db_all` store when "All" is among `inputs`; otherwise
        a new store holding the merged contents of every selected index. An
        empty `inputs` yields an empty store.

    Notes:
    -----
    - Index files are read from "./src/FAISS".
    - The dimensionality of the new, empty store is derived from the embedding
      model by embedding a dummy query, so it always matches the model in use.
    - `FAISS.load_local` is called with `allow_dangerous_deserialization=True`;
      only load index files from a trusted source.
    """
    # Default folder path
    folder_path = "./src/FAISS"

    # "All" anywhere in the selection wins: return the store that was already
    # loaded at import time instead of reading it from disk again. Using `in`
    # (rather than inspecting only the first entry) also copes with an empty list.
    if "All" in inputs:
        return db_all

    # Create an empty store whose index width matches the embedding model.
    dimensions: int = len(embeddings.embed_query("dummy"))
    db = FAISS(
        embedding_function=embeddings,
        index=IndexFlatL2(dimensions),
        docstore=InMemoryDocstore(),
        index_to_docstore_id={},
        normalize_L2=False,
    )

    # Selections look like "20. Legislaturperiode", "19. Legislaturperiode", ...
    for selection in inputs:
        # The part before the first "." is the period number used in the index name.
        period = selection.split(".")[0]
        index_name = f'{period}_legislature'
        local_db = FAISS.load_local(
            folder_path=folder_path,
            index_name=index_name,
            embeddings=embeddings,
            allow_dangerous_deserialization=True,
        )
        db.merge_from(local_db)
    return db



def RAG(llm, prompt, db, question):
    """
    Answer a question with Retrieval-Augmented Generation (RAG).

    Documents retrieved from the vector store are passed, together with the
    question, through the prompt to the language model.

    Parameters:
    ----------
    llm : LanguageModel
        The language model that generates the response.
    prompt :
        The prompt handed to `create_stuff_documents_chain`; it structures how
        the retrieved context and the question are presented to the model.
        NOTE(review): presumably a prompt template object rather than a plain
        string - confirm against the callers.
    db : VectorStore
        Vector store that is converted into a retriever for the question.
    question : str
        The question to be answered.

    Returns:
    -------
    The value returned by invoking the retrieval chain with the question.
    NOTE(review): LangChain documents this as a dict with the generated text
    under "answer", not a bare string - confirm against the callers.
    """
    # Chain that feeds the retrieved documents and the prompt to the model
    answer_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)
    # Put the store's retriever in front of the answer chain
    pipeline = create_retrieval_chain(db.as_retriever(), answer_chain)
    # Run retrieval and generation for the given question
    return pipeline.invoke({"input": question})


#########
# Dynamically loading vector_db
##########

def get_similar_vectorstore(start_date, end_date, party, base_path='./src/FAISS'):
    """
    Scan a folder of FAISS index files for stores matching a party and start date.

    NOTE(review): this function is unfinished. It identifies candidate files
    but does not yet rank, load or return a store, so it always returns None.

    Parameters:
    ----------
    start_date :
        Requested start date. It is compared with `<=` against the start date
        taken from each file name, so it must be comparable to a string.
        NOTE(review): presumably an ISO-formatted date string - confirm.
    end_date :
        Requested end date; not used yet.
    party : str
        Party identifier that must equal the party element of the file name.
    base_path : str
        Folder that contains the ".faiss" index files.

    Returns:
    -------
    None
    """
    # Only ".faiss" files are index files; entries without that extension
    # (including names that contain no "." at all) are ignored.
    vector_stores = [store for store in os.listdir(base_path)
                     if os.path.splitext(store)[1] == ".faiss"]

    # Scaffold for the candidate table. TODO: fill it and pick the best match.
    df = pd.DataFrame(columns=["file_name", "start_date", "end_date", "date_diff"])

    # Extract metadata of each file from its name
    for store in vector_stores:
        file_name = os.path.splitext(store)[0]
        file_elements = file_name.split("_")
        # Names with fewer than four "_"-separated elements (for example the
        # "<n>_legislature" indices) carry no start/end/party metadata.
        if len(file_elements) < 4:
            continue
        file_start_date, file_end_date, file_party = file_elements[1:4]

        if file_party == party and file_start_date <= start_date:
            # TODO: record this candidate in `df` (using `file_end_date` and
            # `end_date` for "date_diff") once the selection logic is defined.
            pass

    return None