TomData committed on
Commit
38166c5
·
1 Parent(s): cdd7ddb

upload refactored code to exclude small chunks without data files

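In short: the document-splitting helper formerly provided by src/vectordatabase.py (load_documents) now lives in src/FAISS.ipynb as split_documents, which additionally drops chunks below a minimum length. A minimal sketch of that step, reusing the names from the diff below:

    import pandas as pd
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    from langchain_community.document_loaders import DataFrameLoader

    def split_documents(df: pd.DataFrame, min_chunk_size: int = 100):
        # Wrap each row's 'speech_content' as a LangChain document
        data = DataFrameLoader(data_frame=df, page_content_column="speech_content").load()
        # Split into ~1 KB chunks with a small overlap
        splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=32,
                                                  length_function=len, is_separator_regex=False)
        documents = splitter.split_documents(documents=data)
        # New in this commit: discard chunks shorter than min_chunk_size characters
        return [doc for doc in documents if len(doc.page_content) >= min_chunk_size]
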
Home.py CHANGED
@@ -3,6 +3,17 @@ from src.chatbot import chatbot, keyword_search
3
  #from gradio_calendar import Calendar
4
  #from datetime import datetime
5
6
  # Define important variables
7
  legislature_periods = [
8
  "All",
 
3
  #from gradio_calendar import Calendar
4
  #from datetime import datetime
5
 
6
+
7
+ # Log into HF
8
+ # Only required when running locally
9
+ # import os
10
+ # from dotenv import load_dotenv
11
+ # from huggingface_hub import login
12
+ # load_dotenv(dotenv_path=".env")
13
+ # login(token=os.getenv("HUGGINGFACEHUB_API_TOKEN")) # Your token here
14
+
15
+
16
+
17
  # Define important variables
18
  legislature_periods = [
19
  "All",
src/FAISS.ipynb CHANGED
@@ -2,7 +2,29 @@
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
- "execution_count": 2,
6
  "metadata": {},
7
  "outputs": [
8
  {
@@ -145,37 +167,66 @@
145
  "[930960 rows x 4 columns]"
146
  ]
147
  },
148
- "execution_count": 2,
149
  "metadata": {},
150
  "output_type": "execute_result"
151
  }
152
  ],
153
  "source": [
154
- "# Create vectorstore\n",
155
- "import pandas as pd\n",
156
- "from vectordatabase import load_documents\n",
157
- "from langchain_community.embeddings import HuggingFaceEmbeddings\n",
158
- "from langchain_community.vectorstores import FAISS\n",
159
- "from datetime import datetime\n",
160
  "\n",
161
  "\n",
162
- "df = pd.read_pickle(\"C:\\\\Users\\Tom\\SynologyDrive\\Tom\\Programming\\\\NLP\\Spaces\\PoliticsToYou\\src\\Speeches\\speeches_1949_09_12\")\n",
163
- "df['date'] = pd.to_datetime(df['date'])\n",
164
- "# Split speeches into documents\n",
165
- "df"
166
  ]
167
  },
168
  {
169
  "cell_type": "code",
170
- "execution_count": 3,
171
  "metadata": {},
172
  "outputs": [
173
  {
174
  "name": "stderr",
175
  "output_type": "stream",
176
  "text": [
177
- "c:\\Python\\Lib\\site-packages\\huggingface_hub\\file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
178
- " warnings.warn(\n",
179
  "c:\\Python\\Lib\\site-packages\\huggingface_hub\\file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
180
  " warnings.warn(\n"
181
  ]
@@ -208,44 +259,50 @@
208
  }
209
  ],
210
  "source": [
211
- "\n",
212
  "dates = [\"1953-10-06\", \"1957-10-16\", \"1961-10-17\", \"1965-10-19\", \"1969-10-20\", \"1972-12-13\", \"1976-12-14\", \"1980-11-04\", \"1983-03-29\", \"1987-02-18\",\"1990-12-20\", \"1994-11-10\", \"1998-10-26\", \"2002-10-17\", \"2005-10-18\", \"2009-10-27\", \"2013-10-22\",\"2017-10-24\",\"2021-10-26\", None]\n",
 
213
  "embeddings = HuggingFaceEmbeddings(model_name=\"paraphrase-multilingual-MiniLM-L12-v2\")\n",
214
  "\n",
215
- "# Iterate over all date to split by legislature getting vector stores for each period\n",
216
  "\n",
 
 
217
  "period = 1\n",
218
  "previous_date = None\n",
 
 
219
  "for date in dates:\n",
220
  " if previous_date is None:\n",
221
- " legislature = df.loc[df['date'] < datetime.strptime(date, \"%Y-%m-%d\")]\n",
222
  " elif date is None:\n",
223
- " legislature = df.loc[df['date'] >= datetime.strptime(previous_date, \"%Y-%m-%d\")]\n",
224
  " else:\n",
225
- " legislature = df.loc[(df['date'] >= datetime.strptime(previous_date, \"%Y-%m-%d\")) & (df['date'] < datetime.strptime(date, \"%Y-%m-%d\"))]\n",
226
  "\n",
227
  " \n",
228
- " # Split text into documents\n",
229
- " documents = load_documents(legislature)\n",
 
 
230
  " index_name = f'{period}_legislature'\n",
231
  " db = FAISS.from_documents(documents, embeddings)\n",
232
  " db.save_local(folder_path=\"FAISS\", index_name=index_name)\n",
233
  " print(f\"Sucessfully created vector store for {period}. legislature\")\n",
234
- " # Change for next iteration\n",
 
235
  " period += 1\n",
236
  " previous_date = date\n",
237
  "\n",
238
- "\n"
239
- ]
240
- },
241
- {
242
- "cell_type": "code",
243
- "execution_count": null,
244
- "metadata": {},
245
- "outputs": [],
246
- "source": [
247
  "\n",
248
- "\n"
249
  ]
250
  }
251
  ],
 
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import pandas as pd\n",
10
+ "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
11
+ "from langchain_community.document_loaders import DataFrameLoader\n",
12
+ "from langchain_community.embeddings import HuggingFaceEmbeddings\n",
13
+ "from langchain_community.vectorstores import FAISS\n",
14
+ "from datetime import datetime\n",
15
+ "\n"
16
+ ]
17
+ },
18
+ {
19
+ "cell_type": "markdown",
20
+ "metadata": {},
21
+ "source": [
22
+ "### Load the whole speeches data"
23
+ ]
24
+ },
25
+ {
26
+ "cell_type": "code",
27
+ "execution_count": null,
28
  "metadata": {},
29
  "outputs": [
30
  {
 
167
  "[930960 rows x 4 columns]"
168
  ]
169
  },
170
+ "execution_count": 3,
171
  "metadata": {},
172
  "output_type": "execute_result"
173
  }
174
  ],
175
  "source": [
176
+ "df = pd.read_pickle(r\"C:\\Users\\Tom\\OneDrive\\Dokumente\\Lokal\\PoliticsToYou\\src\\Speeches\\speeches_1949_09_12\")\n",
177
+ "df['date'] = pd.to_datetime(df['date'])\n"
178
+ ]
179
+ },
180
+ {
181
+ "cell_type": "code",
182
+ "execution_count": 27,
183
+ "metadata": {},
184
+ "outputs": [],
185
+ "source": [
186
+ "def split_documents(df, min_chunk_size=100):\n",
187
+ " \"\"\"\n",
188
+ " Load documents from a DataFrame, split them into smaller chunks for vector storage and remove chunks of small size.\n",
189
+ "\n",
190
+ " Parameters\n",
191
+ " ----------\n",
192
+ " df : pandas.DataFrame\n",
193
+ " A DataFrame containing the documents to be processed, with a column named 'speech_content'.\n",
194
+ " min_chunk_size : int, optional\n",
195
+ " Minimum number of characters a chunk must have to be included in the result. Default is 100.\n",
196
  "\n",
197
+ " Returns\n",
198
+ " -------\n",
199
+ " list\n",
200
+ " A list of split document chunks ready for further processing or vectorization.\n",
201
+ " \"\"\"\n",
202
+ " # Initialize a DataFrameLoader with the given DataFrame and specify the column containing the content to load\n",
203
+ " loader = DataFrameLoader(data_frame=df, page_content_column='speech_content')\n",
204
+ " # Load the data from the DataFrame into a suitable format for processing\n",
205
+ " data = loader.load()\n",
206
+ " # Initialize a RecursiveCharacterTextSplitter to split the text into chunks\n",
207
+ " splitter = RecursiveCharacterTextSplitter(\n",
208
+ " chunk_size=1024,\n",
209
+ " chunk_overlap=32,\n",
210
+ " length_function=len,\n",
211
+ " is_separator_regex=False,\n",
212
+ " )\n",
213
+ " # Split the loaded data into smaller chunks using the splitter\n",
214
+ " documents = splitter.split_documents(documents=data)\n",
215
+ " # Discard small chunks below the threshold\n",
216
+ " cleaned_documents = [doc for doc in documents if len(doc.page_content) >= min_chunk_size]\n",
217
  "\n",
218
+ " return cleaned_documents"
 
 
 
219
  ]
220
  },
221
  {
222
  "cell_type": "code",
223
+ "execution_count": null,
224
  "metadata": {},
225
  "outputs": [
226
  {
227
  "name": "stderr",
228
  "output_type": "stream",
229
  "text": [
 
 
230
  "c:\\Python\\Lib\\site-packages\\huggingface_hub\\file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
231
  " warnings.warn(\n"
232
  ]
 
259
  }
260
  ],
261
  "source": [
262
+ "# Define starting dates of legislature periods\n",
263
  "dates = [\"1953-10-06\", \"1957-10-16\", \"1961-10-17\", \"1965-10-19\", \"1969-10-20\", \"1972-12-13\", \"1976-12-14\", \"1980-11-04\", \"1983-03-29\", \"1987-02-18\",\"1990-12-20\", \"1994-11-10\", \"1998-10-26\", \"2002-10-17\", \"2005-10-18\", \"2009-10-27\", \"2013-10-22\",\"2017-10-24\",\"2021-10-26\", None]\n",
264
+ "# Load sentence transformer \n",
265
  "embeddings = HuggingFaceEmbeddings(model_name=\"paraphrase-multilingual-MiniLM-L12-v2\")\n",
266
  "\n",
267
+ "# Create vector store for all speaches\n",
268
+ "# Split text into documents for vectorstore\n",
269
+ "documents = split_documents(df)\n",
270
+ "# Create and save faiss vectorstorage\n",
271
+ "index_name = 'speeches_1949_09_12'\n",
272
+ "db = FAISS.from_documents(documents, embeddings)\n",
273
+ "db.save_local(folder_path=\"FAISS\", index_name=index_name)\n",
274
+ "print(\"Sucessfully created vector store for all legislature\")\n",
275
  "\n",
276
+ "# Create vector store for each legislature\n",
277
+ "# loop parameters\n",
278
  "period = 1\n",
279
  "previous_date = None\n",
280
+ "\n",
281
+ "# Iterate over all date to split by legislature getting vector stores for each period\n",
282
  "for date in dates:\n",
283
  " if previous_date is None:\n",
284
+ " legislature_df = df.loc[df['date'] < datetime.strptime(date, \"%Y-%m-%d\")]\n",
285
  " elif date is None:\n",
286
+ " legislature_df = df.loc[df['date'] >= datetime.strptime(previous_date, \"%Y-%m-%d\")]\n",
287
  " else:\n",
288
+ " legislature_df = df.loc[(df['date'] >= datetime.strptime(previous_date, \"%Y-%m-%d\")) & (df['date'] < datetime.strptime(date, \"%Y-%m-%d\"))]\n",
289
  "\n",
290
  " \n",
291
+ " # Split text into documents for vectorstore\n",
292
+ " documents = split_documents(legislature_df)\n",
293
+ "\n",
294
+ " # Create and save faiss vectorstorage\n",
295
  " index_name = f'{period}_legislature'\n",
296
  " db = FAISS.from_documents(documents, embeddings)\n",
297
  " db.save_local(folder_path=\"FAISS\", index_name=index_name)\n",
298
  " print(f\"Sucessfully created vector store for {period}. legislature\")\n",
299
+ "\n",
300
+ " # Change loop parameters for next iteration\n",
301
  " period += 1\n",
302
  " previous_date = date\n",
303
  "\n",
304
  "\n",
305
+ " \n"
306
  ]
307
  }
308
  ],
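The indexes this notebook writes (speeches_1949_09_12 for all speeches plus one per legislature period) are read back by src/chatbot.py. A minimal sketch of loading one of them, assuming the FAISS folder created above and the same embedding model; the query string is only illustrative:

    from langchain_community.embeddings import HuggingFaceEmbeddings
    from langchain_community.vectorstores import FAISS

    embeddings = HuggingFaceEmbeddings(model_name="paraphrase-multilingual-MiniLM-L12-v2")
    # The saved index is pickled, so deserialization must be allowed explicitly
    db = FAISS.load_local(folder_path="FAISS", index_name="speeches_1949_09_12",
                          embeddings=embeddings, allow_dangerous_deserialization=True)
    docs = db.similarity_search("Klimapolitik", k=5)  # five most similar speech chunks
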
src/Speeches/{querry.ipynb → query.ipynb} RENAMED
@@ -19,14 +19,14 @@
19
  },
20
  {
21
  "cell_type": "code",
22
- "execution_count": 2,
23
  "metadata": {},
24
  "outputs": [
25
  {
26
  "name": "stderr",
27
  "output_type": "stream",
28
  "text": [
29
- "C:\\Users\\Tom\\AppData\\Local\\Temp\\ipykernel_22016\\2374447718.py:12: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.\n",
30
  " df = pd.read_sql_query(\"\"\"SELECT s.id,s.speech_content,s.date,f.abbreviation AS party\n"
31
  ]
32
  }
@@ -38,7 +38,7 @@
38
  " \"database\" : \"next\",\n",
39
  " \"user\" : \"postgres\",\n",
40
  " \"password\" : \"postgres\",\n",
41
- " \"port\" : \"5432\"\n",
42
  "}\n",
43
  "con = psycopg2.connect(**con_details)\n",
44
  "\n",
@@ -60,14 +60,14 @@
60
  },
61
  {
62
  "cell_type": "code",
63
- "execution_count": 3,
64
  "metadata": {},
65
  "outputs": [
66
  {
67
  "name": "stdout",
68
  "output_type": "stream",
69
  "text": [
70
- "{'Z', 'FDP', 'GB/BHE', 'DIE LINKE.', 'DRP', 'WAV', 'Fraktionslos', 'NR', 'BP', 'not found', 'SPD', 'Gast', 'FU', 'SSW', 'KPD', 'DA', 'FVP', 'AfD', 'Grüne', 'DP', 'CDU/CSU', 'PDS'}\n"
71
  ]
72
  }
73
  ],
@@ -78,161 +78,7 @@
78
  },
79
  {
80
  "cell_type": "code",
81
- "execution_count": 7,
82
- "metadata": {},
83
- "outputs": [
84
- {
85
- "data": {
86
- "text/html": [
87
- "<div>\n",
88
- "<style scoped>\n",
89
- " .dataframe tbody tr th:only-of-type {\n",
90
- " vertical-align: middle;\n",
91
- " }\n",
92
- "\n",
93
- " .dataframe tbody tr th {\n",
94
- " vertical-align: top;\n",
95
- " }\n",
96
- "\n",
97
- " .dataframe thead th {\n",
98
- " text-align: right;\n",
99
- " }\n",
100
- "</style>\n",
101
- "<table border=\"1\" class=\"dataframe\">\n",
102
- " <thead>\n",
103
- " <tr style=\"text-align: right;\">\n",
104
- " <th></th>\n",
105
- " <th>id</th>\n",
106
- " <th>speech_content</th>\n",
107
- " <th>date</th>\n",
108
- " <th>party</th>\n",
109
- " </tr>\n",
110
- " </thead>\n",
111
- " <tbody>\n",
112
- " <tr>\n",
113
- " <th>126</th>\n",
114
- " <td>121</td>\n",
115
- " <td>Meine Damen und Herren, die Zentrumsfraktion, ...</td>\n",
116
- " <td>1949-09-22</td>\n",
117
- " <td>Z</td>\n",
118
- " </tr>\n",
119
- " <tr>\n",
120
- " <th>192</th>\n",
121
- " <td>181</td>\n",
122
- " <td>Meine Damen und Herren! Der Herr Bundeskanzler...</td>\n",
123
- " <td>1949-09-22</td>\n",
124
- " <td>Z</td>\n",
125
- " </tr>\n",
126
- " <tr>\n",
127
- " <th>208</th>\n",
128
- " <td>196</td>\n",
129
- " <td>Die Zentrumsfraktion des Deutschen Bundestags ...</td>\n",
130
- " <td>1949-09-27</td>\n",
131
- " <td>Z</td>\n",
132
- " </tr>\n",
133
- " <tr>\n",
134
- " <th>210</th>\n",
135
- " <td>198</td>\n",
136
- " <td>Den Antrag habe ich hier.\\n({0})\\n- Ich begrün...</td>\n",
137
- " <td>1949-09-27</td>\n",
138
- " <td>Z</td>\n",
139
- " </tr>\n",
140
- " <tr>\n",
141
- " <th>211</th>\n",
142
- " <td>199</td>\n",
143
- " <td>Ich werde Ihnen, Herr Präsident, also den Antr...</td>\n",
144
- " <td>1949-09-27</td>\n",
145
- " <td>Z</td>\n",
146
- " </tr>\n",
147
- " <tr>\n",
148
- " <th>...</th>\n",
149
- " <td>...</td>\n",
150
- " <td>...</td>\n",
151
- " <td>...</td>\n",
152
- " <td>...</td>\n",
153
- " </tr>\n",
154
- " <tr>\n",
155
- " <th>16480</th>\n",
156
- " <td>16412</td>\n",
157
- " <td>Meine Damen und Herren! Das, was Herr Kollege ...</td>\n",
158
- " <td>1951-12-06</td>\n",
159
- " <td>Z</td>\n",
160
- " </tr>\n",
161
- " <tr>\n",
162
- " <th>16558</th>\n",
163
- " <td>16496</td>\n",
164
- " <td>Herr Präsident! Meine sehr verehrten Damen und...</td>\n",
165
- " <td>1951-12-12</td>\n",
166
- " <td>Z</td>\n",
167
- " </tr>\n",
168
- " <tr>\n",
169
- " <th>16592</th>\n",
170
- " <td>16526</td>\n",
171
- " <td>Herr Präsident! Meine Damen und Herren! Der He...</td>\n",
172
- " <td>1951-12-12</td>\n",
173
- " <td>Z</td>\n",
174
- " </tr>\n",
175
- " <tr>\n",
176
- " <th>16622</th>\n",
177
- " <td>16580</td>\n",
178
- " <td>Herr Präsident! Meine Herren und Damen! Entgeg...</td>\n",
179
- " <td>1951-12-12</td>\n",
180
- " <td>Z</td>\n",
181
- " </tr>\n",
182
- " <tr>\n",
183
- " <th>16699</th>\n",
184
- " <td>16634</td>\n",
185
- " <td>Herr Präsident! Meine Damen und Herren! Die Ze...</td>\n",
186
- " <td>1951-12-13</td>\n",
187
- " <td>Z</td>\n",
188
- " </tr>\n",
189
- " </tbody>\n",
190
- "</table>\n",
191
- "<p>420 rows × 4 columns</p>\n",
192
- "</div>"
193
- ],
194
- "text/plain": [
195
- " id speech_content date \\\n",
196
- "126 121 Meine Damen und Herren, die Zentrumsfraktion, ... 1949-09-22 \n",
197
- "192 181 Meine Damen und Herren! Der Herr Bundeskanzler... 1949-09-22 \n",
198
- "208 196 Die Zentrumsfraktion des Deutschen Bundestags ... 1949-09-27 \n",
199
- "210 198 Den Antrag habe ich hier.\\n({0})\\n- Ich begrün... 1949-09-27 \n",
200
- "211 199 Ich werde Ihnen, Herr Präsident, also den Antr... 1949-09-27 \n",
201
- "... ... ... ... \n",
202
- "16480 16412 Meine Damen und Herren! Das, was Herr Kollege ... 1951-12-06 \n",
203
- "16558 16496 Herr Präsident! Meine sehr verehrten Damen und... 1951-12-12 \n",
204
- "16592 16526 Herr Präsident! Meine Damen und Herren! Der He... 1951-12-12 \n",
205
- "16622 16580 Herr Präsident! Meine Herren und Damen! Entgeg... 1951-12-12 \n",
206
- "16699 16634 Herr Präsident! Meine Damen und Herren! Die Ze... 1951-12-13 \n",
207
- "\n",
208
- " party \n",
209
- "126 Z \n",
210
- "192 Z \n",
211
- "208 Z \n",
212
- "210 Z \n",
213
- "211 Z \n",
214
- "... ... \n",
215
- "16480 Z \n",
216
- "16558 Z \n",
217
- "16592 Z \n",
218
- "16622 Z \n",
219
- "16699 Z \n",
220
- "\n",
221
- "[420 rows x 4 columns]"
222
- ]
223
- },
224
- "execution_count": 7,
225
- "metadata": {},
226
- "output_type": "execute_result"
227
- }
228
- ],
229
- "source": [
230
- "df[df['party'] == 'Z']\n"
231
- ]
232
- },
233
- {
234
- "cell_type": "code",
235
- "execution_count": 4,
236
  "metadata": {},
237
  "outputs": [
238
  {
@@ -375,22 +221,24 @@
375
  "[930960 rows x 4 columns]"
376
  ]
377
  },
378
- "execution_count": 4,
379
  "metadata": {},
380
  "output_type": "execute_result"
381
  }
382
  ],
383
  "source": [
384
  "df[\"speech_content\"].replace(\"\\({\\d+}\\)\", \"\", inplace=True, regex=True) #removing keys from interruptions\n",
 
385
  "df"
386
  ]
387
  },
388
  {
389
  "cell_type": "code",
390
- "execution_count": 5,
391
  "metadata": {},
392
  "outputs": [],
393
  "source": [
 
394
  "df.to_pickle(\"speeches_1949_09_12\")"
395
  ]
396
  }
 
19
  },
20
  {
21
  "cell_type": "code",
22
+ "execution_count": 13,
23
  "metadata": {},
24
  "outputs": [
25
  {
26
  "name": "stderr",
27
  "output_type": "stream",
28
  "text": [
29
+ "C:\\Users\\Tom\\AppData\\Local\\Temp\\ipykernel_12368\\2515868855.py:12: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.\n",
30
  " df = pd.read_sql_query(\"\"\"SELECT s.id,s.speech_content,s.date,f.abbreviation AS party\n"
31
  ]
32
  }
 
38
  " \"database\" : \"next\",\n",
39
  " \"user\" : \"postgres\",\n",
40
  " \"password\" : \"postgres\",\n",
41
+ " \"port\" : \"5433\"\n",
42
  "}\n",
43
  "con = psycopg2.connect(**con_details)\n",
44
  "\n",
 
60
  },
61
  {
62
  "cell_type": "code",
63
+ "execution_count": 14,
64
  "metadata": {},
65
  "outputs": [
66
  {
67
  "name": "stdout",
68
  "output_type": "stream",
69
  "text": [
70
+ "{'FVP', 'DA', 'FDP', 'BP', 'DP', 'DRP', 'PDS', 'SSW', 'Grüne', 'Fraktionslos', 'WAV', 'Gast', 'FU', 'KPD', 'DIE LINKE.', 'CDU/CSU', 'not found', 'GB/BHE', 'AfD', 'SPD', 'NR', 'Z'}\n"
71
  ]
72
  }
73
  ],
 
78
  },
79
  {
80
  "cell_type": "code",
81
+ "execution_count": null,
82
  "metadata": {},
83
  "outputs": [
84
  {
 
221
  "[930960 rows x 4 columns]"
222
  ]
223
  },
224
+ "execution_count": 16,
225
  "metadata": {},
226
  "output_type": "execute_result"
227
  }
228
  ],
229
  "source": [
230
  "df[\"speech_content\"].replace(\"\\({\\d+}\\)\", \"\", inplace=True, regex=True) #removing keys from interruptions\n",
231
+ "df['date'] = pd.to_datetime(df['date'])\n",
232
  "df"
233
  ]
234
  },
235
  {
236
  "cell_type": "code",
237
+ "execution_count": null,
238
  "metadata": {},
239
  "outputs": [],
240
  "source": [
241
+ "# Dave to pickle\n",
242
  "df.to_pickle(\"speeches_1949_09_12\")"
243
  ]
244
  }
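The UserWarning in the query cell comes from passing a raw psycopg2 connection to pandas. A sketch of the SQLAlchemy connectable it asks for, assuming the connection details shown in the notebook (database next, user/password postgres, port 5433) and a localhost host; the SELECT is truncated in this diff and is left as-is:

    import pandas as pd
    from sqlalchemy import create_engine

    # Host is an assumption; the remaining parameters mirror the notebook's con_details
    engine = create_engine("postgresql+psycopg2://postgres:postgres@localhost:5433/next")
    sql_query = """SELECT s.id, s.speech_content, s.date, f.abbreviation AS party
                   ..."""  # complete the joins exactly as in the notebook
    df = pd.read_sql_query(sql_query, engine)
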
src/chatbot.py CHANGED
@@ -1,13 +1,21 @@
1
  from langchain_core.prompts import ChatPromptTemplate
2
  from langchain_community.llms.huggingface_hub import HuggingFaceHub
3
  from langchain_community.embeddings import HuggingFaceEmbeddings
 
4
 
5
- from src.vectordatabase import RAG, get_vectorstore
6
  import pandas as pd
7
 
8
  # Load environmental variables from .env-file
9
- # from dotenv import load_dotenv, find_dotenv
10
- # load_dotenv(find_dotenv())
11
 
12
  # Define important variables
13
  embeddings = HuggingFaceEmbeddings(model_name="paraphrase-multilingual-MiniLM-L12-v2") # Remove embedding input parameter from functions?
@@ -56,6 +64,98 @@ prompt_en = ChatPromptTemplate.from_template("""Answer the following question in
56
  # Returns the answer in English
57
  )
58
 
59
 
60
 
61
  def chatbot(message, history, db_inputs, prompt_language, llm=llm):
@@ -109,7 +209,7 @@ def chatbot(message, history, db_inputs, prompt_language, llm=llm):
109
  return response
110
 
111
 
112
- def keyword_search(query, n=10, embeddings=embeddings, method='ss', party_filter='All'):
113
  """
114
  Retrieve speech contents based on keywords using a specified method.
115
 
@@ -156,7 +256,7 @@ def keyword_search(query, n=10, embeddings=embeddings, method='ss', party_filter
156
  query_embedding = embeddings.embed_query(query)
157
 
158
  # Maximal Marginal Relevance
159
- if method == 'mmr':
160
  df_res = pd.DataFrame(columns=['Speech Content', 'Date', 'Party', 'Relevance'])
161
  results = db.max_marginal_relevance_search_with_score_by_vector(query_embedding, k=n)
162
  for doc in results:
@@ -173,8 +273,8 @@ def keyword_search(query, n=10, embeddings=embeddings, method='ss', party_filter
173
  df_res.sort_values('Relevance', inplace=True, ascending=True)
174
 
175
  # Similarity Search
176
- else:
177
- df_res = pd.DataFrame(columns=['Speech Content', 'Date', 'Party'])
178
  results = db.similarity_search_by_vector(query_embedding, k=n)
179
  for doc in results:
180
  party = doc.metadata["party"]
@@ -182,7 +282,15 @@ def keyword_search(query, n=10, embeddings=embeddings, method='ss', party_filter
182
  continue
183
  speech_content = doc.page_content
184
  speech_date = doc.metadata["date"]
185
- df_res = pd.concat([df_res, pd.DataFrame({'Speech Content': [speech_content],
186
- 'Date': [speech_date],
187
- 'Party': [party]})], ignore_index=True)
188
  return df_res
 
1
  from langchain_core.prompts import ChatPromptTemplate
2
  from langchain_community.llms.huggingface_hub import HuggingFaceHub
3
  from langchain_community.embeddings import HuggingFaceEmbeddings
4
+ from langchain_community.vectorstores import FAISS
5
 
6
+
7
+ from langchain.chains.combine_documents import create_stuff_documents_chain
8
+ from langchain.chains import create_retrieval_chain
9
+
10
+ from langchain_community.docstore.in_memory import InMemoryDocstore
11
+ from faiss import IndexFlatL2
12
+
13
+ #import functools
14
  import pandas as pd
15
 
16
  # Load environmental variables from .env-file
17
+ from dotenv import load_dotenv, find_dotenv
18
+ load_dotenv(find_dotenv())
19
 
20
  # Define important variables
21
  embeddings = HuggingFaceEmbeddings(model_name="paraphrase-multilingual-MiniLM-L12-v2") # Remove embedding input parameter from functions?
 
64
  # Returns the answer in English
65
  )
66
 
67
+ db_all = FAISS.load_local(folder_path="./src/FAISS", index_name="speeches_1949_09_12",
68
+ embeddings=embeddings, allow_dangerous_deserialization=True)
69
+
70
+ def get_vectorstore(inputs, embeddings):
71
+ """
72
+ Combine multiple FAISS vector stores into a single vector store based on the specified inputs.
73
+
74
+ Parameters
75
+ ----------
76
+ inputs : list of str
77
+ A list of strings specifying which vector stores to combine. Each string represents a specific
78
+ index or a special keyword "All". If "All" is the first entry in the list,
79
+ it directly returns the pre-defined vectorstore for all speeches
80
+
81
+ embeddings : Embeddings
82
+ An instance of embeddings that will be used to load the vector stores. The specific type and
83
+ structure of `embeddings` depend on the implementation of the `get_vectorstore` function.
84
+
85
+ Returns
86
+ -------
87
+ FAISS
88
+ A FAISS vector store that combines the specified indices into a single vector store.
89
+
90
+ """
91
+
92
+ # Default folder path
93
+ folder_path = "./src/FAISS"
94
+
95
+
96
+ if inputs[0] == "All" or inputs[0] is None:
97
+ return db_all
98
+
99
+ # Initialize empty db
100
+ embedding_function = embeddings
101
+ dimensions = len(embedding_function.embed_query("dummy"))
102
+
103
+ db = FAISS(
104
+ embedding_function=embedding_function,
105
+ index=IndexFlatL2(dimensions),
106
+ docstore=InMemoryDocstore(),
107
+ index_to_docstore_id={},
108
+ normalize_L2=False
109
+ )
110
+
111
+ # Retrieve inputs: 20. Legislaturperiode, 19. Legislaturperiode, ...
112
+ for input in inputs:
113
+ # Ignore if user also selected All among other legislatures
114
+ if input == "All":
115
+ continue
116
+ # Retrieve selected index and merge vector stores
117
+ index = input.split(".")[0]
118
+ index_name = f'{index}_legislature'
119
+ local_db = FAISS.load_local(folder_path=folder_path, index_name=index_name,
120
+ embeddings=embeddings, allow_dangerous_deserialization=True)
121
+ db.merge_from(local_db)
122
+ print('Successfully merged inputs')
123
+ return db
124
+
125
+ def RAG(llm, prompt, db, question):
126
+ """
127
+ Apply Retrieval-Augmented Generation (RAG) by providing the context and the question to the
128
+ language model using a predefined template.
129
+
130
+ Parameters:
131
+ ----------
132
+ llm : LanguageModel
133
+ An instance of the language model to be used for generating responses.
134
+
135
+ prompt : str
136
+ A predefined template or prompt that structures how the context and question are presented to the language model.
137
+
138
+ db : VectorStore
139
+ A vector store instance that supports retrieval of relevant documents based on the input question.
140
+
141
+ question : str
142
+ The question or query to be answered by the language model.
143
+
144
+ Returns:
145
+ -------
146
+ str
147
+ The response generated by the language model, based on the retrieved context and provided question.
148
+ """
149
+ # Create a document chain using the provided language model and prompt template
150
+ document_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)
151
+ # Convert the vector store into a retriever
152
+ retriever = db.as_retriever()
153
+ # Create a retrieval chain that integrates the retriever with the document chain
154
+ retrieval_chain = create_retrieval_chain(retriever, document_chain)
155
+ # Invoke the retrieval chain with the input question to get the final response
156
+ response = retrieval_chain.invoke({"input": question})
157
+
158
+ return response
159
 
160
 
161
  def chatbot(message, history, db_inputs, prompt_language, llm=llm):
 
209
  return response
210
 
211
 
212
+ def keyword_search(query, n=10, embeddings=embeddings, method="ss", party_filter="All"):
213
  """
214
  Retrieve speech contents based on keywords using a specified method.
215
 
 
256
  query_embedding = embeddings.embed_query(query)
257
 
258
  # Maximal Marginal Relevance
259
+ if method == "mmr":
260
  df_res = pd.DataFrame(columns=['Speech Content', 'Date', 'Party', 'Relevance'])
261
  results = db.max_marginal_relevance_search_with_score_by_vector(query_embedding, k=n)
262
  for doc in results:
 
273
  df_res.sort_values('Relevance', inplace=True, ascending=True)
274
 
275
  # Similarity Search
276
+ elif method == "ss":
277
+ kws_data = []
278
  results = db.similarity_search_by_vector(query_embedding, k=n)
279
  for doc in results:
280
  party = doc.metadata["party"]
 
282
  continue
283
  speech_content = doc.page_content
284
  speech_date = doc.metadata["date"]
285
+ speech_date = speech_date.strftime("%Y-%m-%d")
286
+ print(speech_date)
287
+ # Error here?
288
+ kws_entry = {'Speech Content': speech_content,
289
+ 'Date': speech_date,
290
+ 'Party': party}
291
+
292
+ kws_data.append(kws_entry)
293
+
294
+ df_res = pd.DataFrame(kws_data)
295
+
296
  return df_res
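With src/vectordatabase.py deleted (below), get_vectorstore and RAG now live in src/chatbot.py. A minimal usage sketch for merging two legislature indexes, assuming it is run from the repository root so ./src/FAISS resolves and that a Hugging Face token is available for the module-level model setup; the query string is illustrative:

    from langchain_community.embeddings import HuggingFaceEmbeddings
    from src.chatbot import get_vectorstore

    embeddings = HuggingFaceEmbeddings(model_name="paraphrase-multilingual-MiniLM-L12-v2")
    # Merge the FAISS indexes of the 19th and 20th legislature into one in-memory store
    db = get_vectorstore(["19. Legislaturperiode", "20. Legislaturperiode"], embeddings)
    docs = db.similarity_search("Mindestlohn", k=5)
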
src/vectordatabase.py DELETED
@@ -1,152 +0,0 @@
1
- from langchain_community.document_loaders import DataFrameLoader
2
- from langchain_community.embeddings import HuggingFaceEmbeddings
3
- from langchain_community.vectorstores import FAISS
4
-
5
- from langchain.text_splitter import RecursiveCharacterTextSplitter
6
- from langchain.chains.combine_documents import create_stuff_documents_chain
7
- from langchain.chains import create_retrieval_chain
8
-
9
- from langchain_community.docstore.in_memory import InMemoryDocstore
10
- from faiss import IndexFlatL2
11
-
12
- #import functools
13
-
14
- import pandas as pd
15
- import os
16
-
17
- # For local run load environmental variables from .env-file
18
- # from dotenv import load_dotenv
19
- # load_dotenv()
20
-
21
- # Define important variables
22
- embeddings = HuggingFaceEmbeddings(model_name="paraphrase-multilingual-MiniLM-L12-v2")
23
- db_all = FAISS.load_local(folder_path="./src/FAISS", index_name="speeches_1949_09_12",
24
- embeddings=embeddings, allow_dangerous_deserialization=True)
25
-
26
- def load_documents(df):
27
- """
28
- Load documents from a DataFrame and split them into smaller chunks for vector storage.
29
-
30
- Parameters:
31
- ----------
32
- df : pandas.DataFrame
33
- A DataFrame containing the documents to be processed, with a column named 'speech_content' that holds the text content.
34
-
35
- Returns:
36
- -------
37
- list
38
- A list of split document chunks ready for further processing or vectorization.
39
- """
40
-
41
- # Initialize a DataFrameLoader with the given DataFrame and specify the column containing the content to load
42
- loader = DataFrameLoader(data_frame=df, page_content_column='speech_content')
43
- # Load the data from the DataFrame into a suitable format for processing
44
- data = loader.load()
45
-
46
- # Initialize a RecursiveCharacterTextSplitter to split the text into chunks
47
- splitter = RecursiveCharacterTextSplitter(
48
- chunk_size=1024,
49
- chunk_overlap=32,
50
- length_function=len,
51
- is_separator_regex=False,
52
- )
53
-
54
- # Split the loaded data into smaller chunks using the splitter
55
- documents = splitter.split_documents(documents=data)
56
-
57
- return documents
58
-
59
-
60
- #@functools.lru_cache()
61
- def get_vectorstore(inputs, embeddings):
62
- """
63
- Combine multiple FAISS vector stores into a single vector store based on the specified inputs.
64
-
65
- Parameters
66
- ----------
67
- inputs : list of str
68
- A list of strings specifying which vector stores to combine. Each string represents a specific
69
- index or a special keyword "All". If "All" is the first entry in the list,
70
- it directly return the pre-defined vectorstore for all speeches
71
-
72
- embeddings : Embeddings
73
- An instance of embeddings that will be used to load the vector stores. The specific type and
74
- structure of `embeddings` depend on the implementation of the `get_vectorstore` function.
75
-
76
- Returns
77
- -------
78
- FAISS
79
- A FAISS vector store that combines the specified indices into a single vector store.
80
-
81
- """
82
-
83
- # Default folder path
84
- folder_path = "./src/FAISS"
85
-
86
- if inputs[0] == "All" or inputs[0] is None:
87
- return db_all
88
-
89
- # Initialize empty db
90
- embedding_function = embeddings
91
- dimensions = len(embedding_function.embed_query("dummy"))
92
-
93
- db = FAISS(
94
- embedding_function=embedding_function,
95
- index=IndexFlatL2(dimensions),
96
- docstore=InMemoryDocstore(),
97
- index_to_docstore_id={},
98
- normalize_L2=False
99
- )
100
-
101
- # Retrieve inputs: 20. Legislaturperiode, 19. Legislaturperiode, ...
102
- for input in inputs:
103
- # Ignore if user also selected All among other legislatures
104
- if input == "All":
105
- continue
106
- # Retrieve selected index and merge vector stores
107
- index = input.split(".")[0]
108
- index_name = f'{index}_legislature'
109
- local_db = FAISS.load_local(folder_path=folder_path, index_name=index_name,
110
- embeddings=embeddings, allow_dangerous_deserialization=True)
111
- db.merge_from(local_db)
112
- print('Successfully merged inputs')
113
- return db
114
-
115
-
116
-
117
-
118
- def RAG(llm, prompt, db, question):
119
- """
120
- Apply Retrieval-Augmented Generation (RAG) by providing the context and the question to the
121
- language model using a predefined template.
122
-
123
- Parameters:
124
- ----------
125
- llm : LanguageModel
126
- An instance of the language model to be used for generating responses.
127
-
128
- prompt : str
129
- A predefined template or prompt that structures how the context and question are presented to the language model.
130
-
131
- db : VectorStore
132
- A vector store instance that supports retrieval of relevant documents based on the input question.
133
-
134
- question : str
135
- The question or query to be answered by the language model.
136
-
137
- Returns:
138
- -------
139
- str
140
- The response generated by the language model, based on the retrieved context and provided question.
141
- """
142
- # Create a document chain using the provided language model and prompt template
143
- document_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)
144
- # Convert the vector store into a retriever
145
- retriever = db.as_retriever()
146
- # Create a retrieval chain that integrates the retriever with the document chain
147
- retrieval_chain = create_retrieval_chain(retriever, document_chain)
148
- # Invoke the retrieval chain with the input question to get the final response
149
- response = retrieval_chain.invoke({"input": question})
150
-
151
- return response
152
-