TomData committed on
Commit
38166c5
·
1 Parent(s): cdd7ddb

upload refactored code to exclude small chunks without data files

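In short: the document-splitting helper formerly provided by src/vectordatabase.py (load_documents) now lives in src/FAISS.ipynb as split_documents, which additionally drops chunks below a minimum length. A minimal sketch of that step, reusing the names from the diff below:

    import pandas as pd
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    from langchain_community.document_loaders import DataFrameLoader

    def split_documents(df: pd.DataFrame, min_chunk_size: int = 100):
        # Wrap each row's 'speech_content' as a LangChain document
        data = DataFrameLoader(data_frame=df, page_content_column="speech_content").load()
        # Split into ~1 KB chunks with a small overlap
        splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=32,
                                                  length_function=len, is_separator_regex=False)
        documents = splitter.split_documents(documents=data)
        # New in this commit: discard chunks shorter than min_chunk_size characters
        return [doc for doc in documents if len(doc.page_content) >= min_chunk_size]
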
Home.py CHANGED
@@ -3,6 +3,17 @@ from src.chatbot import chatbot, keyword_search
3
  #from gradio_calendar import Calendar
4
  #from datetime import datetime
5
6
  # Define important variables
7
  legislature_periods = [
8
  "All",
 
3
  #from gradio_calendar import Calendar
4
  #from datetime import datetime
5
 
6
+
7
+ # Log into HF
8
+ # Only required when running locally
9
+ # import os
10
+ # from dotenv import load_dotenv
11
+ # from huggingface_hub import login
12
+ # load_dotenv(dotenv_path=".env")
13
+ # login(token=os.getenv("HUGGINGFACEHUB_API_TOKEN")) # Your token here
14
+
15
+
16
+
17
  # Define important variables
18
  legislature_periods = [
19
  "All",
src/FAISS.ipynb CHANGED
@@ -2,7 +2,29 @@
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
- "execution_count": 2,
6
  "metadata": {},
7
  "outputs": [
8
  {
@@ -145,37 +167,66 @@
145
  "[930960 rows x 4 columns]"
146
  ]
147
  },
148
- "execution_count": 2,
149
  "metadata": {},
150
  "output_type": "execute_result"
151
  }
152
  ],
153
  "source": [
154
- "# Create vectorstore\n",
155
- "import pandas as pd\n",
156
- "from vectordatabase import load_documents\n",
157
- "from langchain_community.embeddings import HuggingFaceEmbeddings\n",
158
- "from langchain_community.vectorstores import FAISS\n",
159
- "from datetime import datetime\n",
160
  "\n",
161
  "\n",
162
- "df = pd.read_pickle(\"C:\\\\Users\\Tom\\SynologyDrive\\Tom\\Programming\\\\NLP\\Spaces\\PoliticsToYou\\src\\Speeches\\speeches_1949_09_12\")\n",
163
- "df['date'] = pd.to_datetime(df['date'])\n",
164
- "# Split speeches into documents\n",
165
- "df"
166
  ]
167
  },
168
  {
169
  "cell_type": "code",
170
- "execution_count": 3,
171
  "metadata": {},
172
  "outputs": [
173
  {
174
  "name": "stderr",
175
  "output_type": "stream",
176
  "text": [
177
- "c:\\Python\\Lib\\site-packages\\huggingface_hub\\file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
178
- " warnings.warn(\n",
179
  "c:\\Python\\Lib\\site-packages\\huggingface_hub\\file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
180
  " warnings.warn(\n"
181
  ]
@@ -208,44 +259,50 @@
208
  }
209
  ],
210
  "source": [
211
- "\n",
212
  "dates = [\"1953-10-06\", \"1957-10-16\", \"1961-10-17\", \"1965-10-19\", \"1969-10-20\", \"1972-12-13\", \"1976-12-14\", \"1980-11-04\", \"1983-03-29\", \"1987-02-18\",\"1990-12-20\", \"1994-11-10\", \"1998-10-26\", \"2002-10-17\", \"2005-10-18\", \"2009-10-27\", \"2013-10-22\",\"2017-10-24\",\"2021-10-26\", None]\n",
 
213
  "embeddings = HuggingFaceEmbeddings(model_name=\"paraphrase-multilingual-MiniLM-L12-v2\")\n",
214
  "\n",
215
- "# Iterate over all date to split by legislature getting vector stores for each period\n",
216
  "\n",
 
 
217
  "period = 1\n",
218
  "previous_date = None\n",
 
 
219
  "for date in dates:\n",
220
  " if previous_date is None:\n",
221
- " legislature = df.loc[df['date'] < datetime.strptime(date, \"%Y-%m-%d\")]\n",
222
  " elif date is None:\n",
223
- " legislature = df.loc[df['date'] >= datetime.strptime(previous_date, \"%Y-%m-%d\")]\n",
224
  " else:\n",
225
- " legislature = df.loc[(df['date'] >= datetime.strptime(previous_date, \"%Y-%m-%d\")) & (df['date'] < datetime.strptime(date, \"%Y-%m-%d\"))]\n",
226
  "\n",
227
  " \n",
228
- " # Split text into documents\n",
229
- " documents = load_documents(legislature)\n",
 
 
230
  " index_name = f'{period}_legislature'\n",
231
  " db = FAISS.from_documents(documents, embeddings)\n",
232
  " db.save_local(folder_path=\"FAISS\", index_name=index_name)\n",
233
  " print(f\"Sucessfully created vector store for {period}. legislature\")\n",
234
- " # Change for next iteration\n",
 
235
  " period += 1\n",
236
  " previous_date = date\n",
237
  "\n",
238
- "\n"
239
- ]
240
- },
241
- {
242
- "cell_type": "code",
243
- "execution_count": null,
244
- "metadata": {},
245
- "outputs": [],
246
- "source": [
247
  "\n",
248
- "\n"
249
  ]
250
  }
251
  ],
 
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import pandas as pd\n",
10
+ "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
11
+ "from langchain_community.document_loaders import DataFrameLoader\n",
12
+ "from langchain_community.embeddings import HuggingFaceEmbeddings\n",
13
+ "from langchain_community.vectorstores import FAISS\n",
14
+ "from datetime import datetime\n",
15
+ "\n"
16
+ ]
17
+ },
18
+ {
19
+ "cell_type": "markdown",
20
+ "metadata": {},
21
+ "source": [
22
+ "### Load the whole speeches data"
23
+ ]
24
+ },
25
+ {
26
+ "cell_type": "code",
27
+ "execution_count": null,
28
  "metadata": {},
29
  "outputs": [
30
  {
 
167
  "[930960 rows x 4 columns]"
168
  ]
169
  },
170
+ "execution_count": 3,
171
  "metadata": {},
172
  "output_type": "execute_result"
173
  }
174
  ],
175
  "source": [
176
+ "df = pd.read_pickle(r\"C:\\Users\\Tom\\OneDrive\\Dokumente\\Lokal\\PoliticsToYou\\src\\Speeches\\speeches_1949_09_12\")\n",
177
+ "df['date'] = pd.to_datetime(df['date'])\n"
178
+ ]
179
+ },
180
+ {
181
+ "cell_type": "code",
182
+ "execution_count": 27,
183
+ "metadata": {},
184
+ "outputs": [],
185
+ "source": [
186
+ "def split_documents(df, min_chunk_size=100):\n",
187
+ " \"\"\"\n",
188
+ " Load documents from a DataFrame, split them into smaller chunks for vector storage and remove chunks of small size.\n",
189
+ "\n",
190
+ " Parameters\n",
191
+ " ----------\n",
192
+ " df : pandas.DataFrame\n",
193
+ " A DataFrame containing the documents to be processed, with a column named 'speech_content'.\n",
194
+ " min_chunk_size : int, optional\n",
195
+ " Minimum number of characters a chunk must have to be included in the result. Default is 100.\n",
196
  "\n",
197
+ " Returns\n",
198
+ " -------\n",
199
+ " list\n",
200
+ " A list of split document chunks ready for further processing or vectorization.\n",
201
+ " \"\"\"\n",
202
+ " # Initialize a DataFrameLoader with the given DataFrame and specify the column containing the content to load\n",
203
+ " loader = DataFrameLoader(data_frame=df, page_content_column='speech_content')\n",
204
+ " # Load the data from the DataFrame into a suitable format for processing\n",
205
+ " data = loader.load()\n",
206
+ " # Initialize a RecursiveCharacterTextSplitter to split the text into chunks\n",
207
+ " splitter = RecursiveCharacterTextSplitter(\n",
208
+ " chunk_size=1024,\n",
209
+ " chunk_overlap=32,\n",
210
+ " length_function=len,\n",
211
+ " is_separator_regex=False,\n",
212
+ " )\n",
213
+ " # Split the loaded data into smaller chunks using the splitter\n",
214
+ " documents = splitter.split_documents(documents=data)\n",
215
+ " # Discard small chunks below the threshold\n",
216
+ " cleaned_documents = [doc for doc in documents if len(doc.page_content) >= min_chunk_size]\n",
217
  "\n",
218
+ " return cleaned_documents"
 
 
 
219
  ]
220
  },
221
  {
222
  "cell_type": "code",
223
+ "execution_count": null,
224
  "metadata": {},
225
  "outputs": [
226
  {
227
  "name": "stderr",
228
  "output_type": "stream",
229
  "text": [
 
 
230
  "c:\\Python\\Lib\\site-packages\\huggingface_hub\\file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
231
  " warnings.warn(\n"
232
  ]
 
259
  }
260
  ],
261
  "source": [
262
+ "# Define starting dates of legislature periods\n",
263
  "dates = [\"1953-10-06\", \"1957-10-16\", \"1961-10-17\", \"1965-10-19\", \"1969-10-20\", \"1972-12-13\", \"1976-12-14\", \"1980-11-04\", \"1983-03-29\", \"1987-02-18\",\"1990-12-20\", \"1994-11-10\", \"1998-10-26\", \"2002-10-17\", \"2005-10-18\", \"2009-10-27\", \"2013-10-22\",\"2017-10-24\",\"2021-10-26\", None]\n",
264
+ "# Load sentence transformer \n",
265
  "embeddings = HuggingFaceEmbeddings(model_name=\"paraphrase-multilingual-MiniLM-L12-v2\")\n",
266
  "\n",
267
+ "# Create vector store for all speaches\n",
268
+ "# Split text into documents for vectorstore\n",
269
+ "documents = split_documents(df)\n",
270
+ "# Create and save faiss vectorstorage\n",
271
+ "index_name = 'speeches_1949_09_12'\n",
272
+ "db = FAISS.from_documents(documents, embeddings)\n",
273
+ "db.save_local(folder_path=\"FAISS\", index_name=index_name)\n",
274
+ "print(\"Sucessfully created vector store for all legislature\")\n",
275
  "\n",
276
+ "# Create vector store for each legislature\n",
277
+ "# loop parameters\n",
278
  "period = 1\n",
279
  "previous_date = None\n",
280
+ "\n",
281
+ "# Iterate over all date to split by legislature getting vector stores for each period\n",
282
  "for date in dates:\n",
283
  " if previous_date is None:\n",
284
+ " legislature_df = df.loc[df['date'] < datetime.strptime(date, \"%Y-%m-%d\")]\n",
285
  " elif date is None:\n",
286
+ " legislature_df = df.loc[df['date'] >= datetime.strptime(previous_date, \"%Y-%m-%d\")]\n",
287
  " else:\n",
288
+ " legislature_df = df.loc[(df['date'] >= datetime.strptime(previous_date, \"%Y-%m-%d\")) & (df['date'] < datetime.strptime(date, \"%Y-%m-%d\"))]\n",
289
  "\n",
290
  " \n",
291
+ " # Split text into documents for vectorstore\n",
292
+ " documents = split_documents(legislature_df)\n",
293
+ "\n",
294
+ " # Create and save faiss vectorstorage\n",
295
  " index_name = f'{period}_legislature'\n",
296
  " db = FAISS.from_documents(documents, embeddings)\n",
297
  " db.save_local(folder_path=\"FAISS\", index_name=index_name)\n",
298
  " print(f\"Sucessfully created vector store for {period}. legislature\")\n",
299
+ "\n",
300
+ " # Change loop parameters for next iteration\n",
301
  " period += 1\n",
302
  " previous_date = date\n",
303
  "\n",
304
  "\n",
305
+ " \n"
306
  ]
307
  }
308
  ],
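The indexes this notebook writes (speeches_1949_09_12 for all speeches plus one per legislature period) are read back by src/chatbot.py. A minimal sketch of loading one of them, assuming the FAISS folder created above and the same embedding model; the query string is only illustrative:

    from langchain_community.embeddings import HuggingFaceEmbeddings
    from langchain_community.vectorstores import FAISS

    embeddings = HuggingFaceEmbeddings(model_name="paraphrase-multilingual-MiniLM-L12-v2")
    # The saved index is pickled, so deserialization must be allowed explicitly
    db = FAISS.load_local(folder_path="FAISS", index_name="speeches_1949_09_12",
                          embeddings=embeddings, allow_dangerous_deserialization=True)
    docs = db.similarity_search("Klimapolitik", k=5)  # five most similar speech chunks
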
src/Speeches/{querry.ipynb → query.ipynb} RENAMED
@@ -19,14 +19,14 @@
19
  },
20
  {
21
  "cell_type": "code",
22
- "execution_count": 2,
23
  "metadata": {},
24
  "outputs": [
25
  {
26
  "name": "stderr",
27
  "output_type": "stream",
28
  "text": [
29
- "C:\\Users\\Tom\\AppData\\Local\\Temp\\ipykernel_22016\\2374447718.py:12: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.\n",
30
  " df = pd.read_sql_query(\"\"\"SELECT s.id,s.speech_content,s.date,f.abbreviation AS party\n"
31
  ]
32
  }
@@ -38,7 +38,7 @@
38
  " \"database\" : \"next\",\n",
39
  " \"user\" : \"postgres\",\n",
40
  " \"password\" : \"postgres\",\n",
41
- " \"port\" : \"5432\"\n",
42
  "}\n",
43
  "con = psycopg2.connect(**con_details)\n",
44
  "\n",
@@ -60,14 +60,14 @@
60
  },
61
  {
62
  "cell_type": "code",
63
- "execution_count": 3,
64
  "metadata": {},
65
  "outputs": [
66
  {
67
  "name": "stdout",
68
  "output_type": "stream",
69
  "text": [
70
- "{'Z', 'FDP', 'GB/BHE', 'DIE LINKE.', 'DRP', 'WAV', 'Fraktionslos', 'NR', 'BP', 'not found', 'SPD', 'Gast', 'FU', 'SSW', 'KPD', 'DA', 'FVP', 'AfD', 'Grüne', 'DP', 'CDU/CSU', 'PDS'}\n"
71
  ]
72
  }
73
  ],
@@ -78,161 +78,7 @@
78
  },
79
  {
80
  "cell_type": "code",
81
- "execution_count": 7,
82
- "metadata": {},
83
- "outputs": [
84
- {
85
- "data": {
86
- "text/html": [
87
- "<div>\n",
88
- "<style scoped>\n",
89
- " .dataframe tbody tr th:only-of-type {\n",
90
- " vertical-align: middle;\n",
91
- " }\n",
92
- "\n",
93
- " .dataframe tbody tr th {\n",
94
- " vertical-align: top;\n",
95
- " }\n",
96
- "\n",
97
- " .dataframe thead th {\n",
98
- " text-align: right;\n",
99
- " }\n",
100
- "</style>\n",
101
- "<table border=\"1\" class=\"dataframe\">\n",
102
- " <thead>\n",
103
- " <tr style=\"text-align: right;\">\n",
104
- " <th></th>\n",
105
- " <th>id</th>\n",
106
- " <th>speech_content</th>\n",
107
- " <th>date</th>\n",
108
- " <th>party</th>\n",
109
- " </tr>\n",
110
- " </thead>\n",
111
- " <tbody>\n",
112
- " <tr>\n",
113
- " <th>126</th>\n",
114
- " <td>121</td>\n",
115
- " <td>Meine Damen und Herren, die Zentrumsfraktion, ...</td>\n",
116
- " <td>1949-09-22</td>\n",
117
- " <td>Z</td>\n",
118
- " </tr>\n",
119
- " <tr>\n",
120
- " <th>192</th>\n",
121
- " <td>181</td>\n",
122
- " <td>Meine Damen und Herren! Der Herr Bundeskanzler...</td>\n",
123
- " <td>1949-09-22</td>\n",
124
- " <td>Z</td>\n",
125
- " </tr>\n",
126
- " <tr>\n",
127
- " <th>208</th>\n",
128
- " <td>196</td>\n",
129
- " <td>Die Zentrumsfraktion des Deutschen Bundestags ...</td>\n",
130
- " <td>1949-09-27</td>\n",
131
- " <td>Z</td>\n",
132
- " </tr>\n",
133
- " <tr>\n",
134
- " <th>210</th>\n",
135
- " <td>198</td>\n",
136
- " <td>Den Antrag habe ich hier.\\n({0})\\n- Ich begrün...</td>\n",
137
- " <td>1949-09-27</td>\n",
138
- " <td>Z</td>\n",
139
- " </tr>\n",
140
- " <tr>\n",
141
- " <th>211</th>\n",
142
- " <td>199</td>\n",
143
- " <td>Ich werde Ihnen, Herr Präsident, also den Antr...</td>\n",
144
- " <td>1949-09-27</td>\n",
145
- " <td>Z</td>\n",
146
- " </tr>\n",
147
- " <tr>\n",
148
- " <th>...</th>\n",
149
- " <td>...</td>\n",
150
- " <td>...</td>\n",
151
- " <td>...</td>\n",
152
- " <td>...</td>\n",
153
- " </tr>\n",
154
- " <tr>\n",
155
- " <th>16480</th>\n",
156
- " <td>16412</td>\n",
157
- " <td>Meine Damen und Herren! Das, was Herr Kollege ...</td>\n",
158
- " <td>1951-12-06</td>\n",
159
- " <td>Z</td>\n",
160
- " </tr>\n",
161
- " <tr>\n",
162
- " <th>16558</th>\n",
163
- " <td>16496</td>\n",
164
- " <td>Herr Präsident! Meine sehr verehrten Damen und...</td>\n",
165
- " <td>1951-12-12</td>\n",
166
- " <td>Z</td>\n",
167
- " </tr>\n",
168
- " <tr>\n",
169
- " <th>16592</th>\n",
170
- " <td>16526</td>\n",
171
- " <td>Herr Präsident! Meine Damen und Herren! Der He...</td>\n",
172
- " <td>1951-12-12</td>\n",
173
- " <td>Z</td>\n",
174
- " </tr>\n",
175
- " <tr>\n",
176
- " <th>16622</th>\n",
177
- " <td>16580</td>\n",
178
- " <td>Herr Präsident! Meine Herren und Damen! Entgeg...</td>\n",
179
- " <td>1951-12-12</td>\n",
180
- " <td>Z</td>\n",
181
- " </tr>\n",
182
- " <tr>\n",
183
- " <th>16699</th>\n",
184
- " <td>16634</td>\n",
185
- " <td>Herr Präsident! Meine Damen und Herren! Die Ze...</td>\n",
186
- " <td>1951-12-13</td>\n",
187
- " <td>Z</td>\n",
188
- " </tr>\n",
189
- " </tbody>\n",
190
- "</table>\n",
191
- "<p>420 rows × 4 columns</p>\n",
192
- "</div>"
193
- ],
194
- "text/plain": [
195
- " id speech_content date \\\n",
196
- "126 121 Meine Damen und Herren, die Zentrumsfraktion, ... 1949-09-22 \n",
197
- "192 181 Meine Damen und Herren! Der Herr Bundeskanzler... 1949-09-22 \n",
198
- "208 196 Die Zentrumsfraktion des Deutschen Bundestags ... 1949-09-27 \n",
199
- "210 198 Den Antrag habe ich hier.\\n({0})\\n- Ich begrün... 1949-09-27 \n",
200
- "211 199 Ich werde Ihnen, Herr Präsident, also den Antr... 1949-09-27 \n",
201
- "... ... ... ... \n",
202
- "16480 16412 Meine Damen und Herren! Das, was Herr Kollege ... 1951-12-06 \n",
203
- "16558 16496 Herr Präsident! Meine sehr verehrten Damen und... 1951-12-12 \n",
204
- "16592 16526 Herr Präsident! Meine Damen und Herren! Der He... 1951-12-12 \n",
205
- "16622 16580 Herr Präsident! Meine Herren und Damen! Entgeg... 1951-12-12 \n",
206
- "16699 16634 Herr Präsident! Meine Damen und Herren! Die Ze... 1951-12-13 \n",
207
- "\n",
208
- " party \n",
209
- "126 Z \n",
210
- "192 Z \n",
211
- "208 Z \n",
212
- "210 Z \n",
213
- "211 Z \n",
214
- "... ... \n",
215
- "16480 Z \n",
216
- "16558 Z \n",
217
- "16592 Z \n",
218
- "16622 Z \n",
219
- "16699 Z \n",
220
- "\n",
221
- "[420 rows x 4 columns]"
222
- ]
223
- },
224
- "execution_count": 7,
225
- "metadata": {},
226
- "output_type": "execute_result"
227
- }
228
- ],
229
- "source": [
230
- "df[df['party'] == 'Z']\n"
231
- ]
232
- },
233
- {
234
- "cell_type": "code",
235
- "execution_count": 4,
236
  "metadata": {},
237
  "outputs": [
238
  {
@@ -375,22 +221,24 @@
375
  "[930960 rows x 4 columns]"
376
  ]
377
  },
378
- "execution_count": 4,
379
  "metadata": {},
380
  "output_type": "execute_result"
381
  }
382
  ],
383
  "source": [
384
  "df[\"speech_content\"].replace(\"\\({\\d+}\\)\", \"\", inplace=True, regex=True) #removing keys from interruptions\n",
 
385
  "df"
386
  ]
387
  },
388
  {
389
  "cell_type": "code",
390
- "execution_count": 5,
391
  "metadata": {},
392
  "outputs": [],
393
  "source": [
 
394
  "df.to_pickle(\"speeches_1949_09_12\")"
395
  ]
396
  }
 
19
  },
20
  {
21
  "cell_type": "code",
22
+ "execution_count": 13,
23
  "metadata": {},
24
  "outputs": [
25
  {
26
  "name": "stderr",
27
  "output_type": "stream",
28
  "text": [
29
+ "C:\\Users\\Tom\\AppData\\Local\\Temp\\ipykernel_12368\\2515868855.py:12: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.\n",
30
  " df = pd.read_sql_query(\"\"\"SELECT s.id,s.speech_content,s.date,f.abbreviation AS party\n"
31
  ]
32
  }
 
38
  " \"database\" : \"next\",\n",
39
  " \"user\" : \"postgres\",\n",
40
  " \"password\" : \"postgres\",\n",
41
+ " \"port\" : \"5433\"\n",
42
  "}\n",
43
  "con = psycopg2.connect(**con_details)\n",
44
  "\n",
 
60
  },
61
  {
62
  "cell_type": "code",
63
+ "execution_count": 14,
64
  "metadata": {},
65
  "outputs": [
66
  {
67
  "name": "stdout",
68
  "output_type": "stream",
69
  "text": [
70
+ "{'FVP', 'DA', 'FDP', 'BP', 'DP', 'DRP', 'PDS', 'SSW', 'Grüne', 'Fraktionslos', 'WAV', 'Gast', 'FU', 'KPD', 'DIE LINKE.', 'CDU/CSU', 'not found', 'GB/BHE', 'AfD', 'SPD', 'NR', 'Z'}\n"
71
  ]
72
  }
73
  ],
 
78
  },
79
  {
80
  "cell_type": "code",
81
+ "execution_count": null,
82
  "metadata": {},
83
  "outputs": [
84
  {
 
221
  "[930960 rows x 4 columns]"
222
  ]
223
  },
224
+ "execution_count": 16,
225
  "metadata": {},
226
  "output_type": "execute_result"
227
  }
228
  ],
229
  "source": [
230
  "df[\"speech_content\"].replace(\"\\({\\d+}\\)\", \"\", inplace=True, regex=True) #removing keys from interruptions\n",
231
+ "df['date'] = pd.to_datetime(df['date'])\n",
232
  "df"
233
  ]
234
  },
235
  {
236
  "cell_type": "code",
237
+ "execution_count": null,
238
  "metadata": {},
239
  "outputs": [],
240
  "source": [
241
+ "# Dave to pickle\n",
242
  "df.to_pickle(\"speeches_1949_09_12\")"
243
  ]
244
  }
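The UserWarning in the query cell comes from passing a raw psycopg2 connection to pandas. A sketch of the SQLAlchemy connectable it asks for, assuming the connection details shown in the notebook (database next, user/password postgres, port 5433) and a localhost host; the SELECT is truncated in this diff and is left as-is:

    import pandas as pd
    from sqlalchemy import create_engine

    # Host is an assumption; the remaining parameters mirror the notebook's con_details
    engine = create_engine("postgresql+psycopg2://postgres:postgres@localhost:5433/next")
    sql_query = """SELECT s.id, s.speech_content, s.date, f.abbreviation AS party
                   ..."""  # complete the joins exactly as in the notebook
    df = pd.read_sql_query(sql_query, engine)
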
src/chatbot.py CHANGED
@@ -1,13 +1,21 @@
1
  from langchain_core.prompts import ChatPromptTemplate
2
  from langchain_community.llms.huggingface_hub import HuggingFaceHub
3
  from langchain_community.embeddings import HuggingFaceEmbeddings
 
4
 
5
- from src.vectordatabase import RAG, get_vectorstore
6
  import pandas as pd
7
 
8
  # Load environmental variables from .env-file
9
- # from dotenv import load_dotenv, find_dotenv
10
- # load_dotenv(find_dotenv())
11
 
12
  # Define important variables
13
  embeddings = HuggingFaceEmbeddings(model_name="paraphrase-multilingual-MiniLM-L12-v2") # Remove embedding input parameter from functions?
@@ -56,6 +64,98 @@ prompt_en = ChatPromptTemplate.from_template("""Answer the following question in
56
  # Returns the answer in English
57
  )
58
 
59
 
60
 
61
  def chatbot(message, history, db_inputs, prompt_language, llm=llm):
@@ -109,7 +209,7 @@ def chatbot(message, history, db_inputs, prompt_language, llm=llm):
109
  return response
110
 
111
 
112
- def keyword_search(query, n=10, embeddings=embeddings, method='ss', party_filter='All'):
113
  """
114
  Retrieve speech contents based on keywords using a specified method.
115
 
@@ -156,7 +256,7 @@ def keyword_search(query, n=10, embeddings=embeddings, method='ss', party_filter
156
  query_embedding = embeddings.embed_query(query)
157
 
158
  # Maximal Marginal Relevance
159
- if method == 'mmr':
160
  df_res = pd.DataFrame(columns=['Speech Content', 'Date', 'Party', 'Relevance'])
161
  results = db.max_marginal_relevance_search_with_score_by_vector(query_embedding, k=n)
162
  for doc in results:
@@ -173,8 +273,8 @@ def keyword_search(query, n=10, embeddings=embeddings, method='ss', party_filter
173
  df_res.sort_values('Relevance', inplace=True, ascending=True)
174
 
175
  # Similarity Search
176
- else:
177
- df_res = pd.DataFrame(columns=['Speech Content', 'Date', 'Party'])
178
  results = db.similarity_search_by_vector(query_embedding, k=n)
179
  for doc in results:
180
  party = doc.metadata["party"]
@@ -182,7 +282,15 @@ def keyword_search(query, n=10, embeddings=embeddings, method='ss', party_filter
182
  continue
183
  speech_content = doc.page_content
184
  speech_date = doc.metadata["date"]
185
- df_res = pd.concat([df_res, pd.DataFrame({'Speech Content': [speech_content],
186
- 'Date': [speech_date],
187
- 'Party': [party]})], ignore_index=True)
188
  return df_res
 
1
  from langchain_core.prompts import ChatPromptTemplate
2
  from langchain_community.llms.huggingface_hub import HuggingFaceHub
3
  from langchain_community.embeddings import HuggingFaceEmbeddings
4
+ from langchain_community.vectorstores import FAISS
5
 
6
+
7
+ from langchain.chains.combine_documents import create_stuff_documents_chain
8
+ from langchain.chains import create_retrieval_chain
9
+
10
+ from langchain_community.docstore.in_memory import InMemoryDocstore
11
+ from faiss import IndexFlatL2
12
+
13
+ #import functools
14
  import pandas as pd
15
 
16
  # Load environmental variables from .env-file
17
+ from dotenv import load_dotenv, find_dotenv
18
+ load_dotenv(find_dotenv())
19
 
20
  # Define important variables
21
  embeddings = HuggingFaceEmbeddings(model_name="paraphrase-multilingual-MiniLM-L12-v2") # Remove embedding input parameter from functions?
 
64
  # Returns the answer in English
65
  )
66
 
67
+ db_all = FAISS.load_local(folder_path="./src/FAISS", index_name="speeches_1949_09_12",
68
+ embeddings=embeddings, allow_dangerous_deserialization=True)
69
+
70
+ def get_vectorstore(inputs, embeddings):
71
+ """
72
+ Combine multiple FAISS vector stores into a single vector store based on the specified inputs.
73
+
74
+ Parameters
75
+ ----------
76
+ inputs : list of str
77
+ A list of strings specifying which vector stores to combine. Each string represents a specific
78
+ index or a special keyword "All". If "All" is the first entry in the list,
79
+ it directly returns the pre-defined vectorstore for all speeches
80
+
81
+ embeddings : Embeddings
82
+ An instance of embeddings that will be used to load the vector stores. The specific type and
83
+ structure of `embeddings` depend on the implementation of the `get_vectorstore` function.
84
+
85
+ Returns
86
+ -------
87
+ FAISS
88
+ A FAISS vector store that combines the specified indices into a single vector store.
89
+
90
+ """
91
+
92
+ # Default folder path
93
+ folder_path = "./src/FAISS"
94
+
95
+
96
+ if inputs[0] == "All" or inputs[0] is None:
97
+ return db_all
98
+
99
+ # Initialize empty db
100
+ embedding_function = embeddings
101
+ dimensions = len(embedding_function.embed_query("dummy"))
102
+
103
+ db = FAISS(
104
+ embedding_function=embedding_function,
105
+ index=IndexFlatL2(dimensions),
106
+ docstore=InMemoryDocstore(),
107
+ index_to_docstore_id={},
108
+ normalize_L2=False
109
+ )
110
+
111
+ # Retrieve inputs: 20. Legislaturperiode, 19. Legislaturperiode, ...
112
+ for input in inputs:
113
+ # Ignore if user also selected All among other legislatures
114
+ if input == "All":
115
+ continue
116
+ # Retrieve selected index and merge vector stores
117
+ index = input.split(".")[0]
118
+ index_name = f'{index}_legislature'
119
+ local_db = FAISS.load_local(folder_path=folder_path, index_name=index_name,
120
+ embeddings=embeddings, allow_dangerous_deserialization=True)
121
+ db.merge_from(local_db)
122
+ print('Successfully merged inputs')
123
+ return db
124
+
125
+ def RAG(llm, prompt, db, question):
126
+ """
127
+ Apply Retrieval-Augmented Generation (RAG) by providing the context and the question to the
128
+ language model using a predefined template.
129
+
130
+ Parameters:
131
+ ----------
132
+ llm : LanguageModel
133
+ An instance of the language model to be used for generating responses.
134
+
135
+ prompt : str
136
+ A predefined template or prompt that structures how the context and question are presented to the language model.
137
+
138
+ db : VectorStore
139
+ A vector store instance that supports retrieval of relevant documents based on the input question.
140
+
141
+ question : str
142
+ The question or query to be answered by the language model.
143
+
144
+ Returns:
145
+ -------
146
+ str
147
+ The response generated by the language model, based on the retrieved context and provided question.
148
+ """
149
+ # Create a document chain using the provided language model and prompt template
150
+ document_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)
151
+ # Convert the vector store into a retriever
152
+ retriever = db.as_retriever()
153
+ # Create a retrieval chain that integrates the retriever with the document chain
154
+ retrieval_chain = create_retrieval_chain(retriever, document_chain)
155
+ # Invoke the retrieval chain with the input question to get the final response
156
+ response = retrieval_chain.invoke({"input": question})
157
+
158
+ return response
159
 
160
 
161
  def chatbot(message, history, db_inputs, prompt_language, llm=llm):
 
209
  return response
210
 
211
 
212
+ def keyword_search(query, n=10, embeddings=embeddings, method="ss", party_filter="All"):
213
  """
214
  Retrieve speech contents based on keywords using a specified method.
215
 
 
256
  query_embedding = embeddings.embed_query(query)
257
 
258
  # Maximal Marginal Relevance
259
+ if method == "mmr":
260
  df_res = pd.DataFrame(columns=['Speech Content', 'Date', 'Party', 'Relevance'])
261
  results = db.max_marginal_relevance_search_with_score_by_vector(query_embedding, k=n)
262
  for doc in results:
 
273
  df_res.sort_values('Relevance', inplace=True, ascending=True)
274
 
275
  # Similarity Search
276
+ elif method == "ss":
277
+ kws_data = []
278
  results = db.similarity_search_by_vector(query_embedding, k=n)
279
  for doc in results:
280
  party = doc.metadata["party"]
 
282
  continue
283
  speech_content = doc.page_content
284
  speech_date = doc.metadata["date"]
285
+ speech_date = speech_date.strftime("%Y-%m-%d")
286
+ print(speech_date)
287
+ # Error here?
288
+ kws_entry = {'Speech Content': speech_content,
289
+ 'Date': speech_date,
290
+ 'Party': party}
291
+
292
+ kws_data.append(kws_entry)
293
+
294
+ df_res = pd.DataFrame(kws_data)
295
+
296
  return df_res
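With src/vectordatabase.py deleted (below), get_vectorstore and RAG now live in src/chatbot.py. A minimal usage sketch for merging two legislature indexes, assuming it is run from the repository root so ./src/FAISS resolves and that a Hugging Face token is available for the module-level model setup; the query string is illustrative:

    from langchain_community.embeddings import HuggingFaceEmbeddings
    from src.chatbot import get_vectorstore

    embeddings = HuggingFaceEmbeddings(model_name="paraphrase-multilingual-MiniLM-L12-v2")
    # Merge the FAISS indexes of the 19th and 20th legislature into one in-memory store
    db = get_vectorstore(["19. Legislaturperiode", "20. Legislaturperiode"], embeddings)
    docs = db.similarity_search("Mindestlohn", k=5)
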
src/vectordatabase.py DELETED
@@ -1,152 +0,0 @@
1
- from langchain_community.document_loaders import DataFrameLoader
2
- from langchain_community.embeddings import HuggingFaceEmbeddings
3
- from langchain_community.vectorstores import FAISS
4
-
5
- from langchain.text_splitter import RecursiveCharacterTextSplitter
6
- from langchain.chains.combine_documents import create_stuff_documents_chain
7
- from langchain.chains import create_retrieval_chain
8
-
9
- from langchain_community.docstore.in_memory import InMemoryDocstore
10
- from faiss import IndexFlatL2
11
-
12
- #import functools
13
-
14
- import pandas as pd
15
- import os
16
-
17
- # For local run load environmental variables from .env-file
18
- # from dotenv import load_dotenv
19
- # load_dotenv()
20
-
21
- # Define important variables
22
- embeddings = HuggingFaceEmbeddings(model_name="paraphrase-multilingual-MiniLM-L12-v2")
23
- db_all = FAISS.load_local(folder_path="./src/FAISS", index_name="speeches_1949_09_12",
24
- embeddings=embeddings, allow_dangerous_deserialization=True)
25
-
26
- def load_documents(df):
27
- """
28
- Load documents from a DataFrame and split them into smaller chunks for vector storage.
29
-
30
- Parameters:
31
- ----------
32
- df : pandas.DataFrame
33
- A DataFrame containing the documents to be processed, with a column named 'speech_content' that holds the text content.
34
-
35
- Returns:
36
- -------
37
- list
38
- A list of split document chunks ready for further processing or vectorization.
39
- """
40
-
41
- # Initialize a DataFrameLoader with the given DataFrame and specify the column containing the content to load
42
- loader = DataFrameLoader(data_frame=df, page_content_column='speech_content')
43
- # Load the data from the DataFrame into a suitable format for processing
44
- data = loader.load()
45
-
46
- # Initialize a RecursiveCharacterTextSplitter to split the text into chunks
47
- splitter = RecursiveCharacterTextSplitter(
48
- chunk_size=1024,
49
- chunk_overlap=32,
50
- length_function=len,
51
- is_separator_regex=False,
52
- )
53
-
54
- # Split the loaded data into smaller chunks using the splitter
55
- documents = splitter.split_documents(documents=data)
56
-
57
- return documents
58
-
59
-
60
- #@functools.lru_cache()
61
- def get_vectorstore(inputs, embeddings):
62
- """
63
- Combine multiple FAISS vector stores into a single vector store based on the specified inputs.
64
-
65
- Parameters
66
- ----------
67
- inputs : list of str
68
- A list of strings specifying which vector stores to combine. Each string represents a specific
69
- index or a special keyword "All". If "All" is the first entry in the list,
70
- it directly return the pre-defined vectorstore for all speeches
71
-
72
- embeddings : Embeddings
73
- An instance of embeddings that will be used to load the vector stores. The specific type and
74
- structure of `embeddings` depend on the implementation of the `get_vectorstore` function.
75
-
76
- Returns
77
- -------
78
- FAISS
79
- A FAISS vector store that combines the specified indices into a single vector store.
80
-
81
- """
82
-
83
- # Default folder path
84
- folder_path = "./src/FAISS"
85
-
86
- if inputs[0] == "All" or inputs[0] is None:
87
- return db_all
88
-
89
- # Initialize empty db
90
- embedding_function = embeddings
91
- dimensions = len(embedding_function.embed_query("dummy"))
92
-
93
- db = FAISS(
94
- embedding_function=embedding_function,
95
- index=IndexFlatL2(dimensions),
96
- docstore=InMemoryDocstore(),
97
- index_to_docstore_id={},
98
- normalize_L2=False
99
- )
100
-
101
- # Retrieve inputs: 20. Legislaturperiode, 19. Legislaturperiode, ...
102
- for input in inputs:
103
- # Ignore if user also selected All among other legislatures
104
- if input == "All":
105
- continue
106
- # Retrieve selected index and merge vector stores
107
- index = input.split(".")[0]
108
- index_name = f'{index}_legislature'
109
- local_db = FAISS.load_local(folder_path=folder_path, index_name=index_name,
110
- embeddings=embeddings, allow_dangerous_deserialization=True)
111
- db.merge_from(local_db)
112
- print('Successfully merged inputs')
113
- return db
114
-
115
-
116
-
117
-
118
- def RAG(llm, prompt, db, question):
119
- """
120
- Apply Retrieval-Augmented Generation (RAG) by providing the context and the question to the
121
- language model using a predefined template.
122
-
123
- Parameters:
124
- ----------
125
- llm : LanguageModel
126
- An instance of the language model to be used for generating responses.
127
-
128
- prompt : str
129
- A predefined template or prompt that structures how the context and question are presented to the language model.
130
-
131
- db : VectorStore
132
- A vector store instance that supports retrieval of relevant documents based on the input question.
133
-
134
- question : str
135
- The question or query to be answered by the language model.
136
-
137
- Returns:
138
- -------
139
- str
140
- The response generated by the language model, based on the retrieved context and provided question.
141
- """
142
- # Create a document chain using the provided language model and prompt template
143
- document_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)
144
- # Convert the vector store into a retriever
145
- retriever = db.as_retriever()
146
- # Create a retrieval chain that integrates the retriever with the document chain
147
- retrieval_chain = create_retrieval_chain(retriever, document_chain)
148
- # Invoke the retrieval chain with the input question to get the final response
149
- response = retrieval_chain.invoke({"input": question})
150
-
151
- return response
152
-