Sean Pedrick-Case committed on
Commit d2296bf · unverified · 2 Parent(s): ae0ac42 d3a9db3

Merge pull request #8 from seanpedrick-case/dev


Changed the embedding model to mixedbread-ai/mxbai-embed-xsmall-v1 and optimised the related functions. Updated Gemini model references.
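For reference, a minimal sketch of how the new embedding model can be loaded and used to build a FAISS index with the LangChain wrappers that appear in this diff (the repo's own helpers in tools/ingest.py may wrap these calls differently):

# Minimal sketch, assuming langchain-huggingface, langchain-community and faiss-cpu are installed.
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.docstore.document import Document

# Load the mixedbread xsmall sentence-embedding model referenced in this commit.
embeddings = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-xsmall-v1")

# Build a tiny in-memory index and run a similarity search against it.
docs = [Document(page_content="Example passage from the borough plan.")]
vectorstore = FAISS.from_documents(documents=docs, embedding=embeddings)
print(vectorstore.similarity_search_with_score("borough plan", k=1))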

.dockerignore CHANGED
@@ -11,4 +11,5 @@ dist/*
  test/*
  config/*
  output/*
- input/*
+ input/*
+ borough_plan/*
.gitignore CHANGED
@@ -11,4 +11,5 @@ dist/*
  test/*
  config/*
  output/*
- input/*
+ input/*
+ borough_plan/*
app.py CHANGED
@@ -1,6 +1,6 @@
  import os
  from typing import Type
- from langchain_huggingface.embeddings import HuggingFaceEmbeddings
+ #from langchain_huggingface.embeddings import HuggingFaceEmbeddings
  from langchain_community.vectorstores import FAISS
  import gradio as gr
  import pandas as pd
@@ -8,17 +8,15 @@ from torch import float16, float32
  from llama_cpp import Llama
  from huggingface_hub import hf_hub_download
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM
- import zipfile
 
- from chatfuncs.ingest import embed_faiss_save_to_zip
-
- from chatfuncs.helper_functions import get_connection_params, reveal_feedback_buttons, wipe_logs
- from chatfuncs.aws_functions import upload_file_to_s3
- from chatfuncs.auth import authenticate_user
- from chatfuncs.config import FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, HOST_NAME, COGNITO_AUTH, INPUT_FOLDER, OUTPUT_FOLDER, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, DEFAULT_EMBEDDINGS_LOCATION, EMBEDDINGS_MODEL_NAME, DEFAULT_DATA_SOURCE, HF_TOKEN, LARGE_MODEL_REPO_ID, LARGE_MODEL_GGUF_FILE, LARGE_MODEL_NAME, SMALL_MODEL_NAME, SMALL_MODEL_REPO_ID, DEFAULT_DATA_SOURCE_NAME, DEFAULT_EXAMPLES, DEFAULT_MODEL_CHOICES, RUN_GEMINI_MODELS, LOAD_LARGE_MODEL
- from chatfuncs.model_load import torch_device, gpu_config, cpu_config, context_length
- import chatfuncs.chatfuncs as chatf
- import chatfuncs.ingest as ing
+ from tools.ingest import embed_faiss_save_to_zip, load_embeddings_model, get_faiss_store
+ from tools.helper_functions import get_connection_params, reveal_feedback_buttons, wipe_logs
+ from tools.aws_functions import upload_file_to_s3
+ from tools.auth import authenticate_user
+ from tools.config import FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, HOST_NAME, COGNITO_AUTH, INPUT_FOLDER, OUTPUT_FOLDER, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, DEFAULT_EMBEDDINGS_LOCATION, EMBEDDINGS_MODEL_NAME, DEFAULT_DATA_SOURCE, HF_TOKEN, LARGE_MODEL_REPO_ID, LARGE_MODEL_GGUF_FILE, LARGE_MODEL_NAME, SMALL_MODEL_NAME, SMALL_MODEL_REPO_ID, DEFAULT_DATA_SOURCE_NAME, DEFAULT_EXAMPLES, DEFAULT_MODEL_CHOICES, RUN_GEMINI_MODELS, LOAD_LARGE_MODEL
+ from tools.model_load import torch_device, gpu_config, cpu_config, context_length
+ import tools.chatfuncs as chatf
+ import tools.ingest as ing
 
  PandasDataFrame = Type[pd.DataFrame]
 
@@ -34,41 +32,14 @@ if isinstance(DEFAULT_EXAMPLES, str): default_examples_set = eval(DEFAULT_EXAMPL
  if isinstance(DEFAULT_MODEL_CHOICES, str): default_model_choices = eval(DEFAULT_MODEL_CHOICES)
 
  # Disable cuda devices if necessary
- #os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
-
+ #os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
 
  ###
  # Load preset embeddings, vectorstore, and model
  ###
-
- def load_embeddings_model(embeddings_model = EMBEDDINGS_MODEL_NAME):
-
-     embeddings_func = HuggingFaceEmbeddings(model_name=embeddings_model)
-
-     #global embeddings
-
-     #embeddings = embeddings_func
-
-     return embeddings_func
-
- def get_faiss_store(faiss_vstore_folder:str, embeddings_model:object):
-
-     with zipfile.ZipFile(faiss_vstore_folder + '/' + faiss_vstore_folder + '.zip', 'r') as zip_ref:
-         zip_ref.extractall(faiss_vstore_folder)
-
-     faiss_vstore = FAISS.load_local(folder_path=faiss_vstore_folder, embeddings=embeddings_model, allow_dangerous_deserialization=True)
-     os.remove(faiss_vstore_folder + "/index.faiss")
-     os.remove(faiss_vstore_folder + "/index.pkl")
-
-     #global vectorstore
-
-     #vectorstore = faiss_vstore
-
-     return faiss_vstore #vectorstore
-
  # Load in default embeddings and embeddings model name
  embeddings_model = load_embeddings_model(EMBEDDINGS_MODEL_NAME)
- vectorstore = get_faiss_store(faiss_vstore_folder=DEFAULT_EMBEDDINGS_LOCATION,embeddings_model=embeddings_model)#globals()["embeddings"])
+ vectorstore = get_faiss_store(zip_file_path=DEFAULT_EMBEDDINGS_LOCATION,embeddings_model=embeddings_model)#globals()["embeddings"])
 
  chatf.embeddings = embeddings_model
  chatf.vectorstore = vectorstore
@@ -87,7 +58,6 @@ def docs_to_faiss_save(docs_out:PandasDataFrame, embeddings_model=embeddings_mod
 
      return out_message, vectorstore_func
 
-
  def create_hf_model(model_name:str, hf_token=HF_TOKEN):
      if torch_device == "cuda":
          if "flan" in model_name:
@@ -167,12 +137,11 @@ def load_model(model_type:str, gpu_layers:int, gpu_config:dict=gpu_config, cpu_c
 
      return model_type, load_confirmation, model_type#model, tokenizer, model_type
 
-
  ###
  # RUN UI
  ###
 
- app = gr.Blocks(theme = gr.themes.Base(), fill_width=True)#css=".gradio-container {background-color: black}")
+ app = gr.Blocks(theme = gr.themes.Default(primary_hue="blue"), fill_width=True)#css=".gradio-container {background-color: black}")
 
  with app:
      model_type = SMALL_MODEL_NAME
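The load_embeddings_model and get_faiss_store helpers removed from app.py above are now imported from tools/ingest.py, and get_faiss_store takes a zip_file_path rather than a folder name. A rough sketch of what the relocated helpers are expected to do, reconstructed from the removed app.py code (the actual implementations in tools/ingest.py may differ):

# Sketch only: approximates the helpers now imported from tools.ingest,
# reconstructed from the code removed from app.py in this commit.
import os
import zipfile
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

def load_embeddings_model(embeddings_model: str) -> HuggingFaceEmbeddings:
    # Wrap the chosen sentence-transformer model for use with LangChain.
    return HuggingFaceEmbeddings(model_name=embeddings_model)

def get_faiss_store(zip_file_path: str, embeddings_model: object) -> FAISS:
    # Unzip the saved index next to the archive, load it, then tidy up the extracted files.
    extract_dir = os.path.dirname(zip_file_path) or "."
    with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
        zip_ref.extractall(extract_dir)
    faiss_vstore = FAISS.load_local(folder_path=extract_dir, embeddings=embeddings_model,
                                    allow_dangerous_deserialization=True)
    os.remove(os.path.join(extract_dir, "index.faiss"))
    os.remove(os.path.join(extract_dir, "index.pkl"))
    return faiss_vstore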
app_save_docstore.py DELETED
@@ -1,307 +0,0 @@
1
- # Load in packages
2
-
3
- import os
4
-
5
- from typing import Type
6
- from langchain_community.embeddings import HuggingFaceEmbeddings#, HuggingFaceInstructEmbeddings
7
- from langchain_community.vectorstores import FAISS
8
- import gradio as gr
9
- import pandas as pd
10
-
11
- from transformers import AutoTokenizer
12
- import torch
13
-
14
- from llama_cpp import Llama
15
- from huggingface_hub import hf_hub_download
16
-
17
- PandasDataFrame = Type[pd.DataFrame]
18
-
19
- # Disable cuda devices if necessary
20
- #os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
21
-
22
- #from chatfuncs.chatfuncs import *
23
- import chatfuncs.ingest as ing
24
-
25
- ## Load preset embeddings, vectorstore, and model
26
-
27
- embeddings_name = "BAAI/bge-base-en-v1.5"
28
-
29
- def load_embeddings(embeddings_name = embeddings_name):
30
-
31
- embeddings_func = HuggingFaceEmbeddings(model_name=embeddings_name)
32
-
33
- global embeddings
34
-
35
- embeddings = embeddings_func
36
-
37
- return embeddings
38
-
39
- def get_faiss_store(faiss_vstore_folder,embeddings):
40
- import zipfile
41
- with zipfile.ZipFile(faiss_vstore_folder + '/' + faiss_vstore_folder + '.zip', 'r') as zip_ref:
42
- zip_ref.extractall(faiss_vstore_folder)
43
-
44
- faiss_vstore = FAISS.load_local(folder_path=faiss_vstore_folder, embeddings=embeddings, allow_dangerous_deserialization=True)
45
- os.remove(faiss_vstore_folder + "/index.faiss")
46
- os.remove(faiss_vstore_folder + "/index.pkl")
47
-
48
- global vectorstore
49
-
50
- vectorstore = faiss_vstore
51
-
52
- return vectorstore
53
-
54
- import chatfuncs.chatfuncs as chatf
55
-
56
- chatf.embeddings = load_embeddings(embeddings_name)
57
- chatf.vectorstore = get_faiss_store(faiss_vstore_folder="faiss_embedding",embeddings=globals()["embeddings"])
58
-
59
-
60
- def load_model(model_type, gpu_layers, gpu_config=None, cpu_config=None, torch_device=None):
61
- print("Loading model")
62
-
63
- # Default values inside the function
64
- if gpu_config is None:
65
- gpu_config = chatf.gpu_config
66
- if cpu_config is None:
67
- cpu_config = chatf.cpu_config
68
- if torch_device is None:
69
- torch_device = chatf.torch_device
70
-
71
- if model_type == "Phi 3 Mini (larger, slow)":
72
- if torch_device == "cuda":
73
- gpu_config.update_gpu(gpu_layers)
74
- print("Loading with", gpu_config.n_gpu_layers, "model layers sent to GPU.")
75
- else:
76
- gpu_config.update_gpu(gpu_layers)
77
- cpu_config.update_gpu(gpu_layers)
78
-
79
- print("Loading with", cpu_config.n_gpu_layers, "model layers sent to GPU.")
80
-
81
- print(vars(gpu_config))
82
- print(vars(cpu_config))
83
-
84
- try:
85
- model = Llama(
86
- model_path=hf_hub_download(
87
- repo_id=os.environ.get("REPO_ID", "QuantFactory/Phi-3-mini-128k-instruct-GGUF"),# "QuantFactory/Phi-3-mini-128k-instruct-GGUF"), # "QuantFactory/Meta-Llama-3-8B-Instruct-GGUF-v2"), #"microsoft/Phi-3-mini-4k-instruct-gguf"),#"TheBloke/Mistral-7B-OpenOrca-GGUF"),
88
- filename=os.environ.get("MODEL_FILE", "Phi-3-mini-128k-instruct.Q4_K_M.gguf") #"Phi-3-mini-128k-instruct.Q4_K_M.gguf") #"Meta-Llama-3-8B-Instruct-v2.Q6_K.gguf") #"Phi-3-mini-4k-instruct-q4.gguf")#"mistral-7b-openorca.Q4_K_M.gguf"),
89
- ),
90
- **vars(gpu_config) # change n_gpu_layers if you have more or less VRAM
91
- )
92
-
93
- except Exception as e:
94
- print("GPU load failed")
95
- print(e)
96
- model = Llama(
97
- model_path=hf_hub_download(
98
- repo_id=os.environ.get("REPO_ID", "QuantFactory/Phi-3-mini-128k-instruct-GGUF"), #"QuantFactory/Phi-3-mini-128k-instruct-GGUF"), #, "microsoft/Phi-3-mini-4k-instruct-gguf"),#"QuantFactory/Meta-Llama-3-8B-Instruct-GGUF-v2"), #"microsoft/Phi-3-mini-4k-instruct-gguf"),#"TheBloke/Mistral-7B-OpenOrca-GGUF"),
99
- filename=os.environ.get("MODEL_FILE", "Phi-3-mini-128k-instruct.Q4_K_M.gguf"), # "Phi-3-mini-128k-instruct.Q4_K_M.gguf") # , #"Meta-Llama-3-8B-Instruct-v2.Q6_K.gguf") #"Phi-3-mini-4k-instruct-q4.gguf"),#"mistral-7b-openorca.Q4_K_M.gguf"),
100
- ),
101
- **vars(cpu_config)
102
- )
103
-
104
- tokenizer = []
105
-
106
- if model_type == "Flan Alpaca (small, fast)":
107
- # Huggingface chat model
108
- hf_checkpoint = 'declare-lab/flan-alpaca-large'#'declare-lab/flan-alpaca-base' # # #
109
-
110
- def create_hf_model(model_name):
111
-
112
- from transformers import AutoModelForSeq2SeqLM, AutoModelForCausalLM
113
-
114
- if torch_device == "cuda":
115
- if "flan" in model_name:
116
- model = AutoModelForSeq2SeqLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.float16)
117
- else:
118
- model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.float16)
119
- else:
120
- if "flan" in model_name:
121
- model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.float16)
122
- else:
123
- model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16)
124
-
125
- tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length = chatf.context_length)
126
-
127
- return model, tokenizer, model_type
128
-
129
- model, tokenizer, model_type = create_hf_model(model_name = hf_checkpoint)
130
-
131
- chatf.model = model
132
- chatf.tokenizer = tokenizer
133
- chatf.model_type = model_type
134
-
135
- load_confirmation = "Finished loading model: " + model_type
136
-
137
- print(load_confirmation)
138
- return model_type, load_confirmation, model_type
139
-
140
- # Both models are loaded on app initialisation so that users don't have to wait for the models to be downloaded
141
- model_type = "Phi 3 Mini (larger, slow)"
142
- load_model(model_type, chatf.gpu_layers, chatf.gpu_config, chatf.cpu_config, chatf.torch_device)
143
-
144
- model_type = "Flan Alpaca (small, fast)"
145
- load_model(model_type, 0, chatf.gpu_config, chatf.cpu_config, chatf.torch_device)
146
-
147
- def docs_to_faiss_save(docs_out:PandasDataFrame, embeddings=embeddings):
148
-
149
- print(f"> Total split documents: {len(docs_out)}")
150
-
151
- print(docs_out)
152
-
153
- vectorstore_func = FAISS.from_documents(documents=docs_out, embedding=embeddings)
154
-
155
-
156
- chatf.vectorstore = vectorstore_func
157
-
158
- out_message = "Document processing complete"
159
-
160
- return out_message, vectorstore_func, out_file
161
-
162
- # Gradio chat
163
-
164
- block = gr.Blocks(theme = gr.themes.Base())#css=".gradio-container {background-color: black}")
165
-
166
- with block:
167
- ingest_text = gr.State()
168
- ingest_metadata = gr.State()
169
- ingest_docs = gr.State()
170
-
171
- model_type_state = gr.State(model_type)
172
- embeddings_state = gr.State(chatf.embeddings)#globals()["embeddings"])
173
- vectorstore_state = gr.State(chatf.vectorstore)#globals()["vectorstore"])
174
-
175
- model_state = gr.State() # chatf.model (gives error)
176
- tokenizer_state = gr.State() # chatf.tokenizer (gives error)
177
-
178
- chat_history_state = gr.State()
179
- instruction_prompt_out = gr.State()
180
-
181
- gr.Markdown("<h1><center>Lightweight PDF / web page QA bot</center></h1>")
182
-
183
- gr.Markdown("Chat with PDF, web page or (new) csv/Excel documents. The default is a small model (Flan Alpaca), that can only answer specific questions that are answered in the text. It cannot give overall impressions of, or summarise the document. The alternative (Phi 3 Mini (larger, slow)), can reason a little better, but is much slower (See Advanced tab).\n\nBy default the Lambeth Borough Plan '[Lambeth 2030 : Our Future, Our Lambeth](https://www.lambeth.gov.uk/better-fairer-lambeth/projects/lambeth-2030-our-future-our-lambeth)' is loaded. If you want to talk about another document or web page, please select from the second tab. If switching topic, please click the 'Clear chat' button.\n\nCaution: This is a public app. Please ensure that the document you upload is not sensitive is any way as other users may see it! Also, please note that LLM chatbots may give incomplete or incorrect information, so please use with care.")
184
-
185
- with gr.Row():
186
- current_source = gr.Textbox(label="Current data source(s)", value="Lambeth_2030-Our_Future_Our_Lambeth.pdf", scale = 10)
187
- current_model = gr.Textbox(label="Current model", value=model_type, scale = 3)
188
-
189
- with gr.Tab("Chatbot"):
190
-
191
- with gr.Row():
192
- #chat_height = 500
193
- chatbot = gr.Chatbot(avatar_images=('user.jfif', 'bot.jpg'),bubble_full_width = False, scale = 1) # , height=chat_height
194
- with gr.Accordion("Open this tab to see the source paragraphs used to generate the answer", open = False):
195
- sources = gr.HTML(value = "Source paragraphs with the most relevant text will appear here") # , height=chat_height
196
-
197
- with gr.Row():
198
- message = gr.Textbox(
199
- label="Enter your question here",
200
- lines=1,
201
- )
202
- with gr.Row():
203
- submit = gr.Button(value="Send message", variant="secondary", scale = 1)
204
- clear = gr.Button(value="Clear chat", variant="secondary", scale=0)
205
- stop = gr.Button(value="Stop generating", variant="secondary", scale=0)
206
-
207
- examples_set = gr.Radio(label="Examples for the Lambeth Borough Plan",
208
- #value = "What were the five pillars of the previous borough plan?",
209
- choices=["What were the five pillars of the previous borough plan?",
210
- "What is the vision statement for Lambeth?",
211
- "What are the commitments for Lambeth?",
212
- "What are the 2030 outcomes for Lambeth?"])
213
-
214
-
215
- current_topic = gr.Textbox(label="Feature currently disabled - Keywords related to current conversation topic.", placeholder="Keywords related to the conversation topic will appear here")
216
-
217
-
218
- with gr.Tab("Load in a different file to chat with"):
219
- with gr.Accordion("PDF file", open = False):
220
- in_pdf = gr.File(label="Upload pdf", file_count="multiple", file_types=['.pdf'])
221
- load_pdf = gr.Button(value="Load in file", variant="secondary", scale=0)
222
-
223
- with gr.Accordion("Web page", open = False):
224
- with gr.Row():
225
- in_web = gr.Textbox(label="Enter web page url")
226
- in_div = gr.Textbox(label="(Advanced) Web page div for text extraction", value="p", placeholder="p")
227
- load_web = gr.Button(value="Load in webpage", variant="secondary", scale=0)
228
-
229
- with gr.Accordion("CSV/Excel file", open = False):
230
- in_csv = gr.File(label="Upload CSV/Excel file", file_count="multiple", file_types=['.csv', '.xlsx'])
231
- in_text_column = gr.Textbox(label="Enter column name where text is stored")
232
- load_csv = gr.Button(value="Load in CSV/Excel file", variant="secondary", scale=0)
233
-
234
- with gr.Row():
235
- ingest_embed_out = gr.Textbox(label="File/web page preparation progress")
236
- out_file_box = gr.File(count='single', filetype=['.zip'])
237
-
238
- with gr.Tab("Advanced features"):
239
- out_passages = gr.Slider(minimum=1, value = 2, maximum=10, step=1, label="Choose number of passages to retrieve from the document. Numbers greater than 2 may lead to increased hallucinations or input text being truncated.")
240
- temp_slide = gr.Slider(minimum=0.1, value = 0.5, maximum=1, step=0.1, label="Choose temperature setting for response generation.")
241
- with gr.Row():
242
- model_choice = gr.Radio(label="Choose a chat model", value="Flan Alpaca (small, fast)", choices = ["Flan Alpaca (small, fast)", "Phi 3 Mini (larger, slow)"])
243
- change_model_button = gr.Button(value="Load model", scale=0)
244
- with gr.Accordion("Choose number of model layers to send to GPU (WARNING: please don't modify unless you are sure you have a GPU).", open = False):
245
- gpu_layer_choice = gr.Slider(label="Choose number of model layers to send to GPU.", value=0, minimum=0, maximum=100, step = 1, visible=True)
246
-
247
- load_text = gr.Text(label="Load status")
248
-
249
-
250
- gr.HTML(
251
- "<center>This app is based on the models Flan Alpaca and Phi 3 Mini. It powered by Gradio, Transformers, and Llama.cpp.</a></center>"
252
- )
253
-
254
- examples_set.change(fn=chatf.update_message, inputs=[examples_set], outputs=[message])
255
-
256
- change_model_button.click(fn=chatf.turn_off_interactivity, inputs=[message, chatbot], outputs=[message, chatbot], queue=False).\
257
- then(fn=load_model, inputs=[model_choice, gpu_layer_choice], outputs = [model_type_state, load_text, current_model]).\
258
- then(lambda: chatf.restore_interactivity(), None, [message], queue=False).\
259
- then(chatf.clear_chat, inputs=[chat_history_state, sources, message, current_topic], outputs=[chat_history_state, sources, message, current_topic]).\
260
- then(lambda: None, None, chatbot, queue=False)
261
-
262
- # Load in a pdf
263
- load_pdf_click = load_pdf.click(ing.parse_file, inputs=[in_pdf], outputs=[ingest_text, current_source]).\
264
- then(ing.text_to_docs, inputs=[ingest_text], outputs=[ingest_docs]).\
265
- then(docs_to_faiss_save, inputs=[ingest_docs], outputs=[ingest_embed_out, vectorstore_state, file_out_box]).\
266
- then(chatf.hide_block, outputs = [examples_set])
267
-
268
- # Load in a webpage
269
- load_web_click = load_web.click(ing.parse_html, inputs=[in_web, in_div], outputs=[ingest_text, ingest_metadata, current_source]).\
270
- then(ing.html_text_to_docs, inputs=[ingest_text, ingest_metadata], outputs=[ingest_docs]).\
271
- then(docs_to_faiss_save, inputs=[ingest_docs], outputs=[ingest_embed_out, vectorstore_state, file_out_box]).\
272
- then(chatf.hide_block, outputs = [examples_set])
273
-
274
- # Load in a csv/excel file
275
- load_csv_click = load_csv.click(ing.parse_csv_or_excel, inputs=[in_csv, in_text_column], outputs=[ingest_text, current_source]).\
276
- then(ing.csv_excel_text_to_docs, inputs=[ingest_text, in_text_column], outputs=[ingest_docs]).\
277
- then(docs_to_faiss_save, inputs=[ingest_docs], outputs=[ingest_embed_out, vectorstore_state, file_out_box]).\
278
- then(chatf.hide_block, outputs = [examples_set])
279
-
280
- # Load in a webpage
281
-
282
- # Click/enter to send message action
283
- response_click = submit.click(chatf.create_full_prompt, inputs=[message, chat_history_state, current_topic, vectorstore_state, embeddings_state, model_type_state, out_passages], outputs=[chat_history_state, sources, instruction_prompt_out], queue=False, api_name="retrieval").\
284
- then(chatf.turn_off_interactivity, inputs=[message, chatbot], outputs=[message, chatbot], queue=False).\
285
- then(chatf.produce_streaming_answer_chatbot, inputs=[chatbot, instruction_prompt_out, model_type_state, temp_slide], outputs=chatbot)
286
- response_click.then(chatf.highlight_found_text, [chatbot, sources], [sources]).\
287
- then(chatf.add_inputs_answer_to_history,[message, chatbot, current_topic], [chat_history_state, current_topic]).\
288
- then(lambda: chatf.restore_interactivity(), None, [message], queue=False)
289
-
290
- response_enter = message.submit(chatf.create_full_prompt, inputs=[message, chat_history_state, current_topic, vectorstore_state, embeddings_state, model_type_state, out_passages], outputs=[chat_history_state, sources, instruction_prompt_out], queue=False).\
291
- then(chatf.turn_off_interactivity, inputs=[message, chatbot], outputs=[message, chatbot], queue=False).\
292
- then(chatf.produce_streaming_answer_chatbot, [chatbot, instruction_prompt_out, model_type_state, temp_slide], chatbot)
293
- response_enter.then(chatf.highlight_found_text, [chatbot, sources], [sources]).\
294
- then(chatf.add_inputs_answer_to_history,[message, chatbot, current_topic], [chat_history_state, current_topic]).\
295
- then(lambda: chatf.restore_interactivity(), None, [message], queue=False)
296
-
297
- # Stop box
298
- stop.click(fn=None, inputs=None, outputs=None, cancels=[response_click, response_enter])
299
-
300
- # Clear box
301
- clear.click(chatf.clear_chat, inputs=[chat_history_state, sources, message, current_topic], outputs=[chat_history_state, sources, message, current_topic])
302
- clear.click(lambda: None, None, chatbot, queue=False)
303
-
304
- # Thumbs up or thumbs down voting function
305
- chatbot.like(chatf.vote, [chat_history_state, instruction_prompt_out, model_type_state], None)
306
-
307
- block.queue().launch(debug=True)

chatfuncs/ingest_borough_plan.py DELETED
@@ -1,14 +0,0 @@
- import ingest as ing
-
- borough_plan_text, file_names = ing.parse_file([open("Lambeth_2030-Our_Future_Our_Lambeth.pdf")])
- print("Borough plan text created")
-
- print(borough_plan_text)
-
- borough_plan_docs = ing.text_to_docs(borough_plan_text)
- print("Borough plan docs created")
-
- embedding_model = "BAAI/bge-base-en-v1.5" # "mixedbread-ai/mxbai-embed-xsmall-v1" #
-
- embeddings = ing.load_embeddings(model_name = embedding_model)
- ing.embed_faiss_save_to_zip(borough_plan_docs, save_to="faiss_embedding", model_name = embedding_model)

faiss_embedding/faiss_embedding.zip CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:35bdbf12bc8094c1a4460f790dcd3cd78b4af883fb08f0d10afec7bcfa9eff5a
- size 421142
+ oid sha256:ca011e3baf4b92201d378f35ab5f2fe0b6d16ac2eaac4f0705b8c4e84e24a6ae
+ size 243109
ingest_borough_plan.py ADDED
@@ -0,0 +1,14 @@
+ from tools.ingest import parse_file, text_to_docs, load_embeddings_model, embed_faiss_save_to_zip
+
+ borough_plan_text, file_names = parse_file([open("Lambeth_2030-Our_Future_Our_Lambeth.pdf")])
+ print("Borough plan text created")
+
+ #print(borough_plan_text)
+
+ borough_plan_docs = text_to_docs(borough_plan_text)
+ print("Borough plan docs created")
+
+ embedding_model = "mixedbread-ai/mxbai-embed-xsmall-v1"
+
+ embeddings = load_embeddings_model(embeddings_model = embedding_model)
+ embed_faiss_save_to_zip(borough_plan_docs, save_folder="borough_plan", embeddings_model_object= embeddings, save_to="faiss_embedding", model_name = embedding_model)
requirements.txt CHANGED
@@ -7,7 +7,8 @@ pandas==2.2.3
  transformers==4.51.3
  # For Windows https://github.com/abetlen/llama-cpp-python/releases/download/v0.3.2/llama_cpp_python-0.3.2-cp311-#cp311-win_amd64.whl -C cmake.args="-DGGML_BLAS=ON;-DGGML_BLAS_VENDOR=OpenBLAS"
  llama-cpp-python==0.3.2 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu # For linux if dependencies for below build command are not available in the environment
- #llama-cpp-python==0.3.8 -C cmake.args="-DGGML_BLAS=ON;-DGGML_BLAS_VENDOR=OpenBLAS"
+ #llama-cpp-python==0.3.9 -C cmake.args="-DGGML_BLAS=ON;-DGGML_BLAS_VENDOR=OpenBLAS" # CPU
+ #llama-cpp-python==0.3.9 -C cmake.args="-DGGML_CUDA=on" # With CUDA
  torch==2.5.1 --extra-index-url https://download.pytorch.org/whl/cpu
  sentence_transformers==4.1.0
  faiss-cpu==1.10.0
test/test_module.py CHANGED
@@ -16,8 +16,8 @@
  # +
  import pytest
  import gradio as gr
- from ..chatfuncs.ingest import *
- from ..chatfuncs.chatfuncs import *
+ from ..tools.ingest import *
+ from ..tools.chatfuncs import *
 
  def test_read_docx():
      content = read_docx('sample.docx')
{chatfuncs → tools}/__init__.py RENAMED
File without changes
{chatfuncs → tools}/auth.py RENAMED
@@ -4,7 +4,7 @@ import boto3
  import hmac
  import hashlib
  import base64
- from chatfuncs.config import AWS_CLIENT_ID, AWS_CLIENT_SECRET, AWS_USER_POOL_ID, AWS_REGION
+ from tools.config import AWS_CLIENT_ID, AWS_CLIENT_SECRET, AWS_USER_POOL_ID, AWS_REGION
 
  def calculate_secret_hash(client_id:str, client_secret:str, username:str):
      message = username + client_id
{chatfuncs → tools}/aws_functions.py RENAMED
@@ -2,7 +2,7 @@ from typing import Type, List
  import pandas as pd
  import boto3
  import os
- from chatfuncs.config import AWS_REGION, RUN_AWS_FUNCTIONS, QA_CHATBOT_BUCKET
+ from tools.config import AWS_REGION, RUN_AWS_FUNCTIONS, QA_CHATBOT_BUCKET
 
  PandasDataFrame = Type[pd.DataFrame]
 
{chatfuncs → tools}/chatfuncs.py RENAMED
@@ -14,6 +14,7 @@ from nltk.corpus import stopwords
14
  from nltk.tokenize import RegexpTokenizer
15
  from nltk.stem import WordNetLemmatizer
16
  from keybert import KeyBERT
 
17
 
18
  # For Name Entity Recognition model
19
  #from span_marker import SpanMarkerModel # Not currently used
@@ -32,9 +33,9 @@ from langchain_community.retrievers import SVMRetriever
32
  from langchain.text_splitter import RecursiveCharacterTextSplitter
33
  from langchain.docstore.document import Document
34
 
35
- from chatfuncs.prompts import instruction_prompt_template_alpaca, instruction_prompt_mistral_orca, instruction_prompt_phi3, instruction_prompt_llama3, instruction_prompt_qwen, instruction_prompt_template_orca, instruction_prompt_gemma, instruction_prompt_template_gemini_aws
36
- from chatfuncs.model_load import temperature, max_new_tokens, sample, repetition_penalty, top_p, top_k, torch_device, CtransGenGenerationConfig, max_tokens
37
- from chatfuncs.config import GEMINI_API_KEY, AWS_DEFAULT_REGION, LARGE_MODEL_NAME, SMALL_MODEL_NAME, RUN_AWS_FUNCTIONS, FEEDBACK_LOGS_FOLDER
38
 
39
  model_object = [] # Define empty list for model functions to run
40
  tokenizer = [] # Define empty list for model functions to run
@@ -75,51 +76,6 @@ ner_model = []#SpanMarkerModel.from_pretrained("tomaarsen/span-marker-mbert-base
75
  # Used to pull out keywords from chat history to add to user queries behind the scenes
76
  kw_model = pipeline("feature-extraction", model="sentence-transformers/all-MiniLM-L6-v2")
77
 
78
- # Vectorstore funcs
79
-
80
- # def docs_to_faiss_save(docs_out:PandasDataFrame, embeddings=embeddings):
81
-
82
- # print(f"> Total split documents: {len(docs_out)}")
83
-
84
- # vectorstore_func = FAISS.from_documents(documents=docs_out, embedding=embeddings)
85
-
86
- # '''
87
- # #with open("vectorstore.pkl", "wb") as f:
88
- # #pickle.dump(vectorstore, f)
89
- # '''
90
-
91
- # #if Path(save_to).exists():
92
- # # vectorstore_func.save_local(folder_path=save_to)
93
- # #else:
94
- # # os.mkdir(save_to)
95
- # # vectorstore_func.save_local(folder_path=save_to)
96
-
97
- # global vectorstore
98
-
99
- # vectorstore = vectorstore_func
100
-
101
- # out_message = "Document processing complete"
102
-
103
- # #print(out_message)
104
- # #print(f"> Saved to: {save_to}")
105
-
106
- # return out_message
107
-
108
- # def docs_to_faiss_save(docs_out:PandasDataFrame, embeddings_model=embeddings_model):
109
-
110
- # print(f"> Total split documents: {len(docs_out)}")
111
-
112
- # print(docs_out)
113
-
114
- # vectorstore_func = FAISS.from_documents(documents=docs_out, embedding=embeddings_model)
115
-
116
- # vectorstore = vectorstore_func
117
-
118
- # out_message = "Document processing complete"
119
-
120
- # return out_message, vectorstore_func
121
-
122
- # Prompt functions
123
 
124
  def base_prompt_templates(model_type:str = SMALL_MODEL_NAME):
125
 
@@ -141,7 +97,6 @@ def base_prompt_templates(model_type:str = SMALL_MODEL_NAME):
141
  INSTRUCTION_PROMPT=PromptTemplate(template=instruction_prompt_phi3, input_variables=['question', 'summaries'])
142
  else:
143
  INSTRUCTION_PROMPT=PromptTemplate(template=instruction_prompt_template_gemini_aws, input_variables=['question', 'summaries'])
144
-
145
 
146
  return INSTRUCTION_PROMPT, CONTENT_PROMPT
147
 
@@ -149,14 +104,44 @@ def write_out_metadata_as_string(metadata_in:str):
149
  metadata_string = [f"{' '.join(f'{k}: {v}' for k, v in d.items() if k != 'page_section')}" for d in metadata_in] # ['metadata']
150
  return metadata_string
151
 
152
- def generate_expanded_prompt(inputs: Dict[str, str], instruction_prompt:str, content_prompt:str, extracted_memory:list, vectorstore:object, embeddings:object, relevant_flag:bool = True, out_passages:int = 2, total_output_passage_chunks_size:int=5): # ,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
 
154
  question = inputs["question"]
155
  chat_history = inputs["chat_history"]
156
 
157
  if relevant_flag == True:
158
  new_question_kworded = adapt_q_from_chat_history(question, chat_history, extracted_memory) # new_question_keywords,
159
- docs_keep_as_doc, doc_df, docs_keep_out = hybrid_retrieval(new_question_kworded, vectorstore, embeddings, k_val = 25, out_passages = out_passages, vec_score_cut_off = 0.85, vec_weight = 1, bm25_weight = 1, svm_weight = 1)
160
  else:
161
  new_question_kworded = question
162
  doc_df = pd.DataFrame()
@@ -164,7 +149,7 @@ def generate_expanded_prompt(inputs: Dict[str, str], instruction_prompt:str, con
164
  docs_keep_out = []
165
 
166
  if (not docs_keep_as_doc) | (doc_df.empty):
167
- sorry_prompt = """Say 'Sorry, there is no relevant information to answer this question.'"""
168
  return sorry_prompt, "No relevant sources found.", new_question_kworded
169
 
170
  # Expand the found passages to the neighbouring context
@@ -198,7 +183,7 @@ def create_full_prompt(user_input:str,
198
  history:list[dict],
199
  extracted_memory:str,
200
  vectorstore:object,
201
- embeddings:object,
202
  model_type:str,
203
  out_passages:list[str],
204
  api_key:str="",
@@ -213,7 +198,7 @@ def create_full_prompt(user_input:str,
213
  print("\n==== date/time: " + str(datetime.datetime.now()) + " ====")
214
 
215
  history = history or []
216
-
217
  # Create instruction prompt
218
  instruction_prompt, content_prompt = base_prompt_templates(model_type=model_type)
219
 
@@ -225,7 +210,7 @@ def create_full_prompt(user_input:str,
225
 
226
  instruction_prompt_out, docs_content_string, new_question_kworded =\
227
  generate_expanded_prompt({"question": user_input, "chat_history": history}, #vectorstore,
228
- instruction_prompt, content_prompt, extracted_memory, vectorstore, embeddings, relevant_flag, out_passages)
229
 
230
  history.append({"metadata":None, "options":None, "role": 'user', "content": user_input})
231
 
@@ -259,8 +244,6 @@ def call_aws_claude(prompt: str, system_prompt: str, temperature: float, max_tok
259
  ],
260
  }
261
 
262
- print("prompt_config:", prompt_config)
263
-
264
  body = json.dumps(prompt_config)
265
 
266
  modelId = model_choice
@@ -367,8 +350,6 @@ def send_request(prompt: str, conversation_history: List[dict], model: object, c
367
  elif "claude" in model_choice:
368
  try:
369
  print("Calling AWS Claude model")
370
- print("prompt:", prompt)
371
- print("system_prompt:", system_prompt)
372
  response = call_aws_claude(prompt, system_prompt, temperature, max_tokens, model_choice)
373
  except Exception as e:
374
  # If fails, try again after x seconds in case there is a throttle limit
@@ -420,9 +401,8 @@ def process_requests(prompts: List[str], system_prompt_with_table: str, conversa
420
 
421
  response, conversation_history = send_request(prompts[0], conversation_history, model=model, config=config, model_choice=model_choice, system_prompt=system_prompt_with_table, temperature=temperature)
422
 
423
- print(response.text)
424
- #"Okay, I'm ready. What source are we discussing, and what's your question about it? Please provide as much context as possible so I can give you the best answer."]
425
- print(response.usage_metadata)
426
  responses.append(response)
427
 
428
  # Create conversation txt object
@@ -464,8 +444,6 @@ def produce_streaming_answer_chatbot(
464
 
465
  history = chat_history
466
 
467
- print("history at start of streaming function:", history)
468
-
469
  if relevant_query_bool == False:
470
  history.append({"metadata":None, "options":None, "role": "assistant", "content": 'No relevant query found. Please retry your question'})
471
 
@@ -557,8 +535,6 @@ def produce_streaming_answer_chatbot(
557
  elif "claude" in model_type:
558
  system_prompt = "You are answering questions from the user based on source material. Make sure to fully answer the questions with all required detail."
559
 
560
- print("full_prompt:", full_prompt)
561
-
562
  if isinstance(full_prompt, str):
563
  full_prompt = [full_prompt]
564
 
@@ -622,7 +598,7 @@ def produce_streaming_answer_chatbot(
622
  history[-1]['content'] += char
623
  yield history
624
 
625
- print("history at end of function:", history)
626
 
627
  # Chat helper functions
628
 
@@ -691,164 +667,188 @@ def create_doc_df(docs_keep_out):
691
 
692
  return doc_df
693
 
694
- def hybrid_retrieval(new_question_kworded, vectorstore, embeddings, k_val, out_passages,
695
- vec_score_cut_off, vec_weight, bm25_weight, svm_weight): # ,vectorstore, embeddings
 
 
 
 
 
 
 
 
 
 
 
696
 
697
- #vectorstore=globals()["vectorstore"]
698
- #embeddings=globals()["embeddings"]
699
- doc_df = pd.DataFrame()
 
 
 
 
 
 
 
700
 
 
 
 
 
 
 
701
 
702
- docs = vectorstore.similarity_search_with_score(new_question_kworded, k=k_val)
703
 
704
- # Keep only documents with a certain score
705
- docs_len = [len(x[0].page_content) for x in docs]
706
- docs_scores = [x[1] for x in docs]
707
 
708
- # Only keep sources that are sufficiently relevant (i.e. similarity search score below threshold below)
709
- score_more_limit = pd.Series(docs_scores) < vec_score_cut_off
710
- docs_keep = list(compress(docs, score_more_limit))
711
 
712
- if not docs_keep:
713
- return [], pd.DataFrame(), []
 
714
 
715
- # Only keep sources that are at least 100 characters long
716
- length_more_limit = pd.Series(docs_len) >= 100
717
- docs_keep = list(compress(docs_keep, length_more_limit))
718
 
719
- if not docs_keep:
720
- return [], pd.DataFrame(), []
 
721
 
722
- docs_keep_as_doc = [x[0] for x in docs_keep]
723
- docs_keep_length = len(docs_keep_as_doc)
724
 
 
 
 
 
725
 
726
-
727
- if docs_keep_length == 1:
 
 
 
 
 
 
728
 
729
- content=[]
730
- meta_url=[]
731
- score=[]
732
-
733
- for item in docs_keep:
734
- content.append(item[0].page_content)
735
- meta_url.append(item[0].metadata['source'])
736
- score.append(item[1])
737
 
738
- # Create df from 'winning' passages
 
739
 
740
- doc_df = pd.DataFrame(list(zip(content, meta_url, score)),
741
- columns =['page_content', 'meta_url', 'score'])
742
 
743
- docs_content = doc_df['page_content'].astype(str)
744
- docs_url = doc_df['meta_url']
 
 
 
 
 
 
 
745
 
746
- return docs_keep_as_doc, doc_df, docs_content, docs_url
747
-
748
- # Check for if more docs are removed than the desired output
749
- if out_passages > docs_keep_length:
750
- out_passages = docs_keep_length
751
- k_val = docs_keep_length
752
-
753
- vec_rank = [*range(1, docs_keep_length+1)]
754
- vec_score = [(docs_keep_length/x)*vec_weight for x in vec_rank]
755
-
756
- print("Number of documents remaining: ", docs_keep_length)
757
-
758
- # 2nd level check using BM25s package to do keyword search on retrieved passages.
759
-
760
- content_keep=[]
761
- for item in docs_keep:
762
- content_keep.append(item[0].page_content)
763
-
764
- # Prepare Corpus (Tokenized & Optional Stemming)
765
- corpus = [doc.lower() for doc in content_keep]
766
- #stemmer = SnowballStemmer("english", ignore_stopwords=True) # NLTK stemming not compatible
767
- stemmer = Stemmer.Stemmer("english")
768
- corpus_tokens = bm25s.tokenize(corpus, stopwords="en", stemmer=stemmer)
769
-
770
- # Create and Index with BM25s
771
- retriever = bm25s.BM25()
772
- retriever.index(corpus_tokens)
773
-
774
- # Query Processing (Stemming applied consistently if used above)
775
- query_tokens = bm25s.tokenize(new_question_kworded.lower(), stemmer=stemmer)
776
- results, scores = retriever.retrieve(query_tokens, corpus=corpus, k=len(corpus)) # Retrieve all docs
777
-
778
- for i in range(results.shape[1]):
779
- doc, score = results[0, i], scores[0, i]
780
- print(f"Rank {i+1} (score: {score:.2f}): {doc}")
781
-
782
- #print("BM25 results:", results)
783
- #print("BM25 scores:", scores)
784
-
785
- # Rank Calculation (Custom Logic for Your BM25 Score)
786
- bm25_rank = list(range(1, len(results[0]) + 1))
787
- #bm25_rank = results[0]#.tolist()[0] # Since you have a single query
788
- bm25_score = [(docs_keep_length / (rank + 1)) * bm25_weight for rank in bm25_rank]
789
- # +1 to avoid division by 0 for rank 0
790
-
791
- # Result Ordering (Using the calculated ranks)
792
- pairs = list(zip(bm25_rank, docs_keep_as_doc))
793
- pairs.sort()
794
- bm25_result = [value for rank, value in pairs]
795
-
796
 
797
- # 3rd level check on retrieved docs with SVM retriever
798
- # Check the type of the embeddings object
799
- embeddings_type = type(embeddings)
800
 
801
 
802
- #hf_embeddings = HuggingFaceEmbeddings(**embeddings)
803
- hf_embeddings = embeddings
804
-
805
- svm_retriever = SVMRetriever.from_texts(content_keep, hf_embeddings, k = k_val)
806
- svm_result = svm_retriever.invoke(new_question_kworded)
807
 
808
-
809
- svm_rank=[]
810
- svm_score = []
811
 
812
- for vec_item in docs_keep:
813
- x = 0
814
- for svm_item in svm_result:
815
- x = x + 1
816
- if svm_item.page_content == vec_item[0].page_content:
817
- svm_rank.append(x)
818
- svm_score.append((docs_keep_length/x)*svm_weight)
819
 
820
-
821
- ## Calculate final score based on three ranking methods
822
- final_score = [a + b + c for a, b, c in zip(vec_score, bm25_score, svm_score)]
823
- final_rank = [sorted(final_score, reverse=True).index(x)+1 for x in final_score]
824
- # Force final_rank to increment by 1 each time
825
- final_rank = list(pd.Series(final_rank).rank(method='first'))
826
 
827
- #print("final rank: " + str(final_rank))
828
- #print("out_passages: " + str(out_passages))
 
 
 
829
 
830
- best_rank_index_pos = []
 
831
 
832
- for x in range(1,out_passages+1):
833
- try:
834
- best_rank_index_pos.append(final_rank.index(x))
835
- except IndexError: # catch the error
836
- pass
837
 
838
- # Adjust best_rank_index_pos to
 
 
 
 
839
 
840
- best_rank_pos_series = pd.Series(best_rank_index_pos)
841
 
 
842
 
843
- docs_keep_out = [docs_keep[i] for i in best_rank_index_pos]
844
-
845
- # Keep only 'best' options
846
- docs_keep_as_doc = [x[0] for x in docs_keep_out]
847
-
848
- # Make df of best options
849
- doc_df = create_doc_df(docs_keep_out)
850
 
851
- return docs_keep_as_doc, doc_df, docs_keep_out
 
 
 
 
 
 
 
 
852
 
853
  def get_expanded_passages(vectorstore, docs, width):
854
 
 
14
  from nltk.tokenize import RegexpTokenizer
15
  from nltk.stem import WordNetLemmatizer
16
  from keybert import KeyBERT
17
+ from langchain_huggingface.embeddings import HuggingFaceEmbeddings
18
 
19
  # For Name Entity Recognition model
20
  #from span_marker import SpanMarkerModel # Not currently used
 
33
  from langchain.text_splitter import RecursiveCharacterTextSplitter
34
  from langchain.docstore.document import Document
35
 
36
+ from tools.prompts import instruction_prompt_template_alpaca, instruction_prompt_mistral_orca, instruction_prompt_phi3, instruction_prompt_llama3, instruction_prompt_qwen, instruction_prompt_template_orca, instruction_prompt_gemma, instruction_prompt_template_gemini_aws
37
+ from tools.model_load import temperature, max_new_tokens, sample, repetition_penalty, top_p, top_k, torch_device, CtransGenGenerationConfig, max_tokens
38
+ from tools.config import GEMINI_API_KEY, AWS_DEFAULT_REGION, LARGE_MODEL_NAME, SMALL_MODEL_NAME, RUN_AWS_FUNCTIONS, FEEDBACK_LOGS_FOLDER
39
 
40
  model_object = [] # Define empty list for model functions to run
41
  tokenizer = [] # Define empty list for model functions to run
 
76
  # Used to pull out keywords from chat history to add to user queries behind the scenes
77
  kw_model = pipeline("feature-extraction", model="sentence-transformers/all-MiniLM-L6-v2")
78

79
 
80
  def base_prompt_templates(model_type:str = SMALL_MODEL_NAME):
81
 
 
97
  INSTRUCTION_PROMPT=PromptTemplate(template=instruction_prompt_phi3, input_variables=['question', 'summaries'])
98
  else:
99
  INSTRUCTION_PROMPT=PromptTemplate(template=instruction_prompt_template_gemini_aws, input_variables=['question', 'summaries'])
 
100
 
101
  return INSTRUCTION_PROMPT, CONTENT_PROMPT
102
 
 
104
  metadata_string = [f"{' '.join(f'{k}: {v}' for k, v in d.items() if k != 'page_section')}" for d in metadata_in] # ['metadata']
105
  return metadata_string
106
 
107
+ def generate_expanded_prompt(
108
+ inputs: Dict[str, str],
109
+ instruction_prompt: str,
110
+ content_prompt: str,
111
+ extracted_memory: list,
112
+ vectorstore: object,
113
+ embeddings_model: object,
114
+ relevant_flag: bool = True,
115
+ out_passages: int = 2,
116
+ total_output_passage_chunks_size: int = 5
117
+ ):
118
+ """
119
+ Generate an expanded prompt for a language model by retrieving and formatting relevant document passages.
120
+
121
+ Args:
122
+ inputs (Dict[str, str]): Dictionary containing the user's question and chat history.
123
+ instruction_prompt (str): The instruction prompt template to use for the model.
124
+ content_prompt (str): The content prompt template for formatting passages.
125
+ extracted_memory (list): List of previous conversation memory or context.
126
+ vectorstore (object): The vector store object used for document retrieval.
127
+ embeddings_model (object): The embeddings model used for vector search.
128
+ relevant_flag (bool, optional): Whether to perform relevant document retrieval. Defaults to True.
129
+ out_passages (int, optional): Number of passages to retrieve. Defaults to 2.
130
+ total_output_passage_chunks_size (int, optional): Number of neighboring chunks to expand for context. Defaults to 5.
131
+
132
+ Returns:
133
+ tuple: (instruction_prompt_out, sources_docs_content_string, new_question_kworded)
134
+ instruction_prompt_out (str): The fully formatted instruction prompt for the model.
135
+ sources_docs_content_string (str): The formatted string of source passages and metadata for user display.
136
+ new_question_kworded (str): The (possibly keyword-adapted) user question.
137
+ """
138
 
139
  question = inputs["question"]
140
  chat_history = inputs["chat_history"]
141
 
142
  if relevant_flag == True:
143
  new_question_kworded = adapt_q_from_chat_history(question, chat_history, extracted_memory) # new_question_keywords,
144
+ docs_keep_as_doc, doc_df, docs_keep_out = hybrid_retrieval(new_question_kworded, vectorstore, embeddings_model, k_val = 25, out_passages = out_passages, vec_score_cut_off = 1, vec_weight = 1, bm25_weight = 1, svm_weight = 1)
145
  else:
146
  new_question_kworded = question
147
  doc_df = pd.DataFrame()
 
149
  docs_keep_out = []
150
 
151
  if (not docs_keep_as_doc) | (doc_df.empty):
152
+ sorry_prompt = """Respond 'Sorry, there is no relevant information to answer this question.'"""
153
  return sorry_prompt, "No relevant sources found.", new_question_kworded
154
 
155
  # Expand the found passages to the neighbouring context
 
183
  history:list[dict],
184
  extracted_memory:str,
185
  vectorstore:object,
186
+ embeddings_model:object,
187
  model_type:str,
188
  out_passages:list[str],
189
  api_key:str="",
 
198
  print("\n==== date/time: " + str(datetime.datetime.now()) + " ====")
199
 
200
  history = history or []
201
+
202
  # Create instruction prompt
203
  instruction_prompt, content_prompt = base_prompt_templates(model_type=model_type)
204
 
 
210
 
211
  instruction_prompt_out, docs_content_string, new_question_kworded =\
212
  generate_expanded_prompt({"question": user_input, "chat_history": history}, #vectorstore,
213
+ instruction_prompt, content_prompt, extracted_memory, vectorstore, embeddings_model, relevant_flag, out_passages)
214
 
215
  history.append({"metadata":None, "options":None, "role": 'user', "content": user_input})
216
 
 
244
  ],
245
  }
246
 
 
 
247
  body = json.dumps(prompt_config)
248
 
249
  modelId = model_choice
 
350
  elif "claude" in model_choice:
351
  try:
352
  print("Calling AWS Claude model")
 
 
353
  response = call_aws_claude(prompt, system_prompt, temperature, max_tokens, model_choice)
354
  except Exception as e:
355
  # If fails, try again after x seconds in case there is a throttle limit
 
401
 
402
  response, conversation_history = send_request(prompts[0], conversation_history, model=model, config=config, model_choice=model_choice, system_prompt=system_prompt_with_table, temperature=temperature)
403
 
404
+ #print(response.text)
405
+ #print(response.usage_metadata)
 
406
  responses.append(response)
407
 
408
  # Create conversation txt object
 
444
 
445
  history = chat_history
446
 
 
 
447
  if relevant_query_bool == False:
448
  history.append({"metadata":None, "options":None, "role": "assistant", "content": 'No relevant query found. Please retry your question'})
449
 
 
535
  elif "claude" in model_type:
536
  system_prompt = "You are answering questions from the user based on source material. Make sure to fully answer the questions with all required detail."
537
 
 
 
538
  if isinstance(full_prompt, str):
539
  full_prompt = [full_prompt]
540
 
 
598
  history[-1]['content'] += char
599
  yield history
600
 
601
+ #print("history at end of function:", history)
602
 
603
  # Chat helper functions
604
 
 
667
 
668
  return doc_df
669
 
670
+ def hybrid_retrieval(
671
+ new_question_kworded: str,
672
+ vectorstore:FAISS,
673
+ embeddings_model:HuggingFaceEmbeddings,
674
+ k_val: int,
675
+ out_passages: int,
676
+ vec_score_cut_off: float,
677
+ vec_weight: float,
678
+ bm25_weight: float,
679
+ svm_weight: float
680
+ ) -> tuple:
681
+ """
682
+ Perform hybrid retrieval of relevant documents based on a query using vector similarity, BM25, and SVM weights.
683
 
684
+ Args:
685
+ new_question_kworded (str): The keyword-adapted user query.
686
+ vectorstore: The vectorstore object for similarity search.
687
+ embeddings_model: The embeddings model used for vector search.
688
+ k_val (int): Number of top documents to retrieve.
689
+ out_passages (int): Number of passages to output.
690
+ vec_score_cut_off (float): Similarity score threshold for filtering.
691
+ vec_weight (float): Weight for vector similarity.
692
+ bm25_weight (float): Weight for BM25 retrieval.
693
+ svm_weight (float): Weight for SVM retrieval.
694
 
695
+ Returns:
696
+ tuple: (docs_keep_as_doc, doc_df, docs_keep_out)
697
+ docs_keep_as_doc: List of kept document objects.
698
+ doc_df: DataFrame of kept documents and metadata.
699
+ docs_keep_out: List of kept (document, score) tuples.
700
+ """
701
 
702
+ doc_df = pd.DataFrame()
703
 
704
+ docs = vectorstore.similarity_search_with_score(new_question_kworded, k=k_val)
 
 
705
 
706
+ # Keep only documents with a certain score
707
+ docs_len = [len(x[0].page_content) for x in docs]
708
+ docs_scores = [x[1] for x in docs]
709
 
710
+ # Only keep sources that are sufficiently relevant (i.e. similarity search score above threshold below)
711
+ score_more_limit = pd.Series(docs_scores) > vec_score_cut_off
712
+ docs_keep = list(compress(docs, score_more_limit))
713
 
714
+ if not docs_keep:
715
+ return [], pd.DataFrame(), []
 
716
 
717
+ # Only keep sources that are at least 100 characters long
718
+ length_more_limit = pd.Series(docs_len) >= 100
719
+ docs_keep = list(compress(docs_keep, length_more_limit))
720
 
721
+ if not docs_keep:
722
+ return [], pd.DataFrame(), []
723
 
724
+ docs_keep_as_doc = [x[0] for x in docs_keep]
725
+ docs_keep_length = len(docs_keep_as_doc)
726
+
727
+ if docs_keep_length == 1:
728
 
729
+ content=[]
730
+ meta_url=[]
731
+ score=[]
732
+
733
+ for item in docs_keep:
734
+ content.append(item[0].page_content)
735
+ meta_url.append(item[0].metadata['source'])
736
+ score.append(item[1])
737
 
738
+ # Create df from 'winning' passages
 
 
 
 
 
 
 
739
 
740
+ doc_df = pd.DataFrame(list(zip(content, meta_url, score)),
741
+ columns =['page_content', 'meta_url', 'score'])
742
 
743
+ docs_content = doc_df['page_content'].astype(str)
744
+ docs_url = doc_df['meta_url']
745
 
746
+ return docs_keep_as_doc, doc_df, docs_content, docs_url
747
+
748
+ # Check for if more docs are removed than the desired output
749
+ if out_passages > docs_keep_length:
750
+ out_passages = docs_keep_length
751
+ k_val = docs_keep_length
752
+
753
+ vec_rank = [*range(1, docs_keep_length+1)]
754
+ vec_score = [(docs_keep_length/x)*vec_weight for x in vec_rank]
755
 
756
+ print("Number of documents remaining: ", docs_keep_length)
757
+
758
+ # 2nd level check using BM25s package to do keyword search on retrieved passages.
759
+
760
+ content_keep=[]
761
+ for item in docs_keep:
762
+ content_keep.append(item[0].page_content)
763
+
764
+ # Prepare Corpus (Tokenized & Optional Stemming)
765
+ corpus = [doc.lower() for doc in content_keep]
766
+ #stemmer = SnowballStemmer("english", ignore_stopwords=True) # NLTK stemming not compatible
767
+ stemmer = Stemmer.Stemmer("english")
768
+ corpus_tokens = bm25s.tokenize(corpus, stopwords="en", stemmer=stemmer)
769
+
770
+ # Create and Index with BM25s
771
+ retriever = bm25s.BM25()
772
+ retriever.index(corpus_tokens)
773
+
774
+ # Query Processing (Stemming applied consistently if used above)
775
+ query_tokens = bm25s.tokenize(new_question_kworded.lower(), stemmer=stemmer)
776
+ results, scores = retriever.retrieve(query_tokens, corpus=corpus, k=len(corpus)) # Retrieve all docs
777
+
778
+ for i in range(results.shape[1]):
779
+ doc, score = results[0, i], scores[0, i]
780
+ print(f"Rank {i+1} (score: {score:.2f}): {doc}")
781
+
782
+ #print("BM25 results:", results)
783
+ #print("BM25 scores:", scores)
784
+
785
+ # Rank Calculation (Custom Logic for Your BM25 Score)
786
+ bm25_rank = list(range(1, len(results[0]) + 1))
787
+ #bm25_rank = results[0]#.tolist()[0] # Since you have a single query
788
+ bm25_score = [(docs_keep_length / (rank + 1)) * bm25_weight for rank in bm25_rank]
789
+ # +1 to avoid division by 0 for rank 0
790
+
791
+ # Result Ordering (Using the calculated ranks)
792
+ pairs = list(zip(bm25_rank, docs_keep_as_doc))
793
+ pairs.sort()
794
+ bm25_result = [value for rank, value in pairs]
795
+
 
 
 
 
 
 
 
 
 
 
796
 
797
+ # 3rd level check on retrieved docs with SVM retriever
798
+ # Check the type of the embeddings_model object
799
+ embeddings_type = type(embeddings_model)
800
 
801
 
802
+ #hf_embeddings = HuggingFaceEmbeddings(**embeddings)
803
+ hf_embeddings = embeddings_model
804
+
805
+ svm_retriever = SVMRetriever.from_texts(content_keep, hf_embeddings, k = k_val)
806
+ svm_result = svm_retriever.invoke(new_question_kworded)
807
 
808
+
809
+ svm_rank=[]
810
+ svm_score = []
811
 
812
+ for vec_item in docs_keep:
813
+ x = 0
814
+ for svm_item in svm_result:
815
+ x = x + 1
816
+ if svm_item.page_content == vec_item[0].page_content:
817
+ svm_rank.append(x)
818
+ svm_score.append((docs_keep_length/x)*svm_weight)
819
 
 
 
 
 
 
 
820
 
821
+ ## Calculate final score based on three ranking methods
822
+ final_score = [a + b + c for a, b, c in zip(vec_score, bm25_score, svm_score)]
823
+ final_rank = [sorted(final_score, reverse=True).index(x)+1 for x in final_score]
824
+ # Force final_rank to increment by 1 each time
825
+ final_rank = list(pd.Series(final_rank).rank(method='first'))
826
 
827
+ #print("final rank: " + str(final_rank))
828
+ #print("out_passages: " + str(out_passages))
829
 
830
+ best_rank_index_pos = []
 
 
 
 
831
 
832
+ for x in range(1,out_passages+1):
833
+ try:
834
+ best_rank_index_pos.append(final_rank.index(x))
835
+ except IndexError: # catch the error
836
+ pass
837
 
838
+ # Adjust best_rank_index_pos to
839
 
840
+ best_rank_pos_series = pd.Series(best_rank_index_pos)
841
 
 
 
 
 
 
 
 
842
 
843
+ docs_keep_out = [docs_keep[i] for i in best_rank_index_pos]
844
+
845
+ # Keep only 'best' options
846
+ docs_keep_as_doc = [x[0] for x in docs_keep_out]
847
+
848
+ # Make df of best options
849
+ doc_df = create_doc_df(docs_keep_out)
850
+
851
+ return docs_keep_as_doc, doc_df, docs_keep_out
852
 
853
  def get_expanded_passages(vectorstore, docs, width):
854
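For context, a hedged usage sketch of the refactored retrieval path: argument values mirror the call made in generate_expanded_prompt above, and note that this commit flips the vec_score_cut_off filter from keeping scores below the threshold (old value 0.85) to keeping scores above it (new value 1). Module paths assume the new tools package layout.

# Usage sketch only: mirrors how generate_expanded_prompt() calls hybrid_retrieval()
# after this refactor; not a definitive API reference.
from tools.ingest import load_embeddings_model, get_faiss_store
from tools.chatfuncs import hybrid_retrieval

embeddings_model = load_embeddings_model("mixedbread-ai/mxbai-embed-xsmall-v1")
vectorstore = get_faiss_store(zip_file_path="faiss_embedding/faiss_embedding.zip",
                              embeddings_model=embeddings_model)

docs_keep_as_doc, doc_df, docs_keep_out = hybrid_retrieval(
    "What are the 2030 outcomes for Lambeth?",
    vectorstore,
    embeddings_model,
    k_val=25,               # candidate passages from the vector search
    out_passages=2,         # passages returned after BM25/SVM re-ranking
    vec_score_cut_off=1,    # similarity scores above this threshold are kept
    vec_weight=1,
    bm25_weight=1,
    svm_weight=1,
)
print(doc_df)  # DataFrame of the kept passages and their metadata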
 
{chatfuncs → tools}/config.py RENAMED
@@ -200,17 +200,18 @@ if LOAD_LARGE_MODEL == "1":
      default_model_choices.append(LARGE_MODEL_NAME)
 
  if RUN_AWS_FUNCTIONS == "1":
-     default_model_choices.extend(["anthropic.claude-3-haiku-20240307-v1:0", "anthropic.claude-3-sonnet-20240229-v1:0"])
+     default_model_choices.extend(["anthropic.claude-3-haiku-20240307-v1:0", "anthropic.claude-3-7-sonnet-20250219-v1:0"])
 
  if RUN_GEMINI_MODELS == "1":
-     default_model_choices.extend(["gemini-2.0-flash-001", "gemini-2.5-flash-preview-04-17", "models/gemini-2.5-pro-exp-03-25"])
+     GEMINI_MODELS = ["gemini-2.5-flash-lite", "gemini-2.5-flash", "gemini-2.5-pro"]
+     default_model_choices.extend(GEMINI_MODELS)
 
 
  DEFAULT_MODEL_CHOICES = get_or_create_env_var("DEFAULT_MODEL_CHOICES", str(default_model_choices))
 
- EMBEDDINGS_MODEL_NAME = get_or_create_env_var('EMBEDDINGS_MODEL_NAME', "BAAI/bge-base-en-v1.5") #"mixedbread-ai/mxbai-embed-xsmall-v1"
+ EMBEDDINGS_MODEL_NAME = get_or_create_env_var('EMBEDDINGS_MODEL_NAME', "mixedbread-ai/mxbai-embed-xsmall-v1")
 
- DEFAULT_EMBEDDINGS_LOCATION = get_or_create_env_var('DEFAULT_EMBEDDINGS_LOCATION', "faiss_embedding")
+ DEFAULT_EMBEDDINGS_LOCATION = get_or_create_env_var('DEFAULT_EMBEDDINGS_LOCATION', "faiss_embedding/faiss_embedding.zip")
 
  DEFAULT_DATA_SOURCE_NAME = get_or_create_env_var('DEFAULT_DATA_SOURCE_NAME', "Document redaction app documentation")
 
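A small, self-contained illustration of how these config lines compose. The helper below mirrors the repo's get_or_create_env_var in spirit only (the real one lives in tools/helper_functions.py and may differ in detail), and the local model name is a placeholder.

import os

def get_or_create_env_var(var_name: str, default_value: str) -> str:
    # Use the environment variable if it is already set, otherwise fall back to the default
    value = os.environ.get(var_name)
    if value is None:
        os.environ[var_name] = default_value
        value = default_value
    return value

default_model_choices = ["local-small-model"]                      # placeholder for SMALL_MODEL_NAME
if get_or_create_env_var("RUN_GEMINI_MODELS", "1") == "1":
    GEMINI_MODELS = ["gemini-2.5-flash-lite", "gemini-2.5-flash", "gemini-2.5-pro"]
    default_model_choices.extend(GEMINI_MODELS)

# Exporting DEFAULT_MODEL_CHOICES before start-up overrides the assembled list entirely
DEFAULT_MODEL_CHOICES = get_or_create_env_var("DEFAULT_MODEL_CHOICES", str(default_model_choices))
print(DEFAULT_MODEL_CHOICES)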
{chatfuncs β†’ tools}/helper_functions.py RENAMED
@@ -3,7 +3,7 @@ import gradio as gr
3
  import pandas as pd
4
  import boto3
5
  from botocore.exceptions import ClientError
6
- from chatfuncs.config import CUSTOM_HEADER_VALUE, CUSTOM_HEADER, OUTPUT_FOLDER, INPUT_FOLDER, SESSION_OUTPUT_FOLDER, AWS_USER_POOL_ID
7
 
8
  def get_or_create_env_var(var_name, default_value):
9
  # Get the environment variable if it exists
 
3
  import pandas as pd
4
  import boto3
5
  from botocore.exceptions import ClientError
6
+ from tools.config import CUSTOM_HEADER_VALUE, CUSTOM_HEADER, OUTPUT_FOLDER, INPUT_FOLDER, SESSION_OUTPUT_FOLDER, AWS_USER_POOL_ID
7
 
8
  def get_or_create_env_var(var_name, default_value):
9
  # Get the environment variable if it exists
{chatfuncs β†’ tools}/ingest.py RENAMED
@@ -6,19 +6,31 @@ import re
6
  import requests
7
  import pandas as pd
8
  import dateutil.parser
9
- from typing import Type, List
10
  import shutil
 
 
 
 
 
11
 
 
12
  #from langchain_community.embeddings import HuggingFaceEmbeddings # HuggingFaceInstructEmbeddings,
13
  from langchain_community.vectorstores.faiss import FAISS
14
  #from langchain_community.vectorstores import Chroma
15
  from langchain.text_splitter import RecursiveCharacterTextSplitter
16
  from langchain.docstore.document import Document
17
  #from chatfuncs.config import EMBEDDINGS_MODEL_NAME
18
-
 
 
 
19
  from bs4 import BeautifulSoup
20
  from docx import Document as Doc
21
  from pypdf import PdfReader
 
 
 
22
 
23
  PandasDataFrame = Type[pd.DataFrame]
24
 
@@ -558,22 +570,130 @@ def docs_elements_from_csv_save(docs_path="documents.csv"):
558
 
559
  # ## Create embeddings and save faiss vector store to the path specified in `save_to`
560
 
561
- # def load_embeddings_model(embeddings_model = EMBEDDINGS_MODEL_NAME):
 
 
 
562
 
563
- # embeddings_func = HuggingFaceEmbeddings(model_name=embeddings_model)
 
 
564
 
565
- # #global embeddings
 
566
 
567
- # #embeddings = embeddings_func
 
568
 
569
- # return embeddings_func
570
 
571
- def embed_faiss_save_to_zip(docs_out, save_folder, embeddings_model_object, save_to="faiss_embeddings", model_name="BAAI/bge-base-en-v1.5"):
572
- #load_embeddings(model_name=model_name)
 
 
 
 
 
 
 
 
573
 
574
  print(f"> Total split documents: {len(docs_out)}")
575
 
576
- vectorstore = FAISS.from_documents(documents=docs_out, embedding=embeddings_model_object)
 
 
577
 
578
  save_to_path = Path(save_folder, save_to)
579
  save_to_path.mkdir(parents=True, exist_ok=True)
@@ -603,14 +723,68 @@ def embed_faiss_save_to_zip(docs_out, save_folder, embeddings_model_object, save
603
  index_faiss.unlink(missing_ok=True)
604
  index_pkl.unlink(missing_ok=True)
605
 
606
- # Move ZIP inside the folder for easier reference
607
- #final_zip_path = save_to_path.with_suffix('.zip')
608
-
609
  print("> Archive complete")
610
  print(f"> Final ZIP path: {final_zip_path}")
611
 
612
- return "Document processing complete", vectorstore, final_zip_path
 
 
613
 
 
614
 
615
 
616
  # def sim_search_local_saved_vec(query, k_val, save_to="faiss_lambeth_census_embedding"):
 
6
  import requests
7
  import pandas as pd
8
  import dateutil.parser
9
+ from typing import Type, List, Tuple
10
  import shutil
11
+ import numpy as np
12
+ import gradio as gr
13
+ import zipfile
14
+ import tempfile
15
+ from pathlib import Path
16
 
17
+ from langchain_huggingface.embeddings import HuggingFaceEmbeddings
18
  #from langchain_community.embeddings import HuggingFaceEmbeddings # HuggingFaceInstructEmbeddings,
19
  from langchain_community.vectorstores.faiss import FAISS
20
  #from langchain_community.vectorstores import Chroma
21
  from langchain.text_splitter import RecursiveCharacterTextSplitter
22
  from langchain.docstore.document import Document
23
  #from chatfuncs.config import EMBEDDINGS_MODEL_NAME
24
+ from langchain_core.embeddings import Embeddings # Import Embeddings for type hinting
25
+ from tqdm import tqdm
26
+ from langchain_community.docstore.in_memory import InMemoryDocstore # To manually build the docstore
27
+ from uuid import uuid4 # To generate unique IDs for documents in the docstore
28
  from bs4 import BeautifulSoup
29
  from docx import Document as Doc
30
  from pypdf import PdfReader
31
+ import faiss # For directly creating the FAISS index
32
+
33
+ from tools.config import EMBEDDINGS_MODEL_NAME
34
 
35
  PandasDataFrame = Type[pd.DataFrame]
36
 
 
570
 
571
  # ## Create embeddings and save faiss vector store to the path specified in `save_to`
572
 
573
+ def load_embeddings_model(embeddings_model = EMBEDDINGS_MODEL_NAME):
574
+
575
+ embeddings_func = HuggingFaceEmbeddings(model_name=embeddings_model)
576
+
577
+ #global embeddings
578
+
579
+ #embeddings = embeddings_func
580
+
581
+ return embeddings_func
582
+
583
+ # def embed_faiss_save_to_zip(docs_out, save_folder, embeddings_model_object, save_to="faiss_embeddings", model_name="mixedbread-ai/mxbai-embed-xsmall-v1"):
584
+
585
+ # print(f"> Total split documents: {len(docs_out)}")
586
+
587
+ # vectorstore = FAISS.from_documents(documents=docs_out, embedding=embeddings_model_object)
588
+
589
+ # save_to_path = Path(save_folder, save_to)
590
+ # save_to_path.mkdir(parents=True, exist_ok=True)
591
+
592
+ # vectorstore.save_local(folder_path=str(save_to_path))
593
+
594
+ # print("> FAISS index saved")
595
+ # print(f"> Saved to: {save_to}")
596
+
597
+ # # Ensure files are written before archiving
598
+ # index_faiss = save_to_path / "index.faiss"
599
+ # index_pkl = save_to_path / "index.pkl"
600
+
601
+ # if not index_faiss.exists() or not index_pkl.exists():
602
+ # raise FileNotFoundError("Expected FAISS index files not found before zipping.")
603
+
604
+ # # Flush file system writes by forcing a sync (works best on Unix)
605
+ # try:
606
+ # os.sync()
607
+ # except AttributeError:
608
+ # pass # os.sync() not available on Windows
609
+
610
+ # # Create ZIP archive
611
+ # final_zip_path = shutil.make_archive(str(save_to_path), 'zip', root_dir=str(save_to_path))
612
 
613
+ # # Remove individual index files to avoid leaking large raw files
614
+ # index_faiss.unlink(missing_ok=True)
615
+ # index_pkl.unlink(missing_ok=True)
616
 
617
+ # # Move ZIP inside the folder for easier reference
618
+ # #final_zip_path = save_to_path.with_suffix('.zip')
619
 
620
+ # print("> Archive complete")
621
+ # print(f"> Final ZIP path: {final_zip_path}")
622
 
623
+ # return "Document processing complete", vectorstore, final_zip_path
624
 
625
+
626
+
627
+ def embed_faiss_save_to_zip(
628
+ docs_out: List[Document],
629
+ save_folder: str,
630
+ embeddings_model_object: Embeddings, # Type hint for clarity
631
+ save_to: str = "faiss_embeddings",
632
+ model_name: str = "mixedbread-ai/mxbai-embed-xsmall-v1", # This is a descriptive name, not directly used in FAISS build
633
+ progress: gr.Progress = gr.Progress(track_tqdm=True)
634
+ ) -> Tuple[str, FAISS, Path]:
635
 
636
  print(f"> Total split documents: {len(docs_out)}")
637
 
638
+ # --- Progress Bar Integration Starts Here ---
639
+ print("Starting embedding generation and FAISS index construction...")
640
+
641
+ texts = []
642
+ metadatas = []
643
+ vectors = []
644
+ docstore = InMemoryDocstore()
645
+ index_to_docstore_id = {} # Maps FAISS index position to docstore ID
646
+
647
+ if not docs_out:
648
+ print("No documents provided. Skipping FAISS index creation.")
649
+ return "No documents to process", None, None # Or handle as an error
650
+
651
+ # 1. Generate Embeddings and Populate Data Structures with tqdm
652
+ # Wrap the iteration over docs_out with tqdm for a progress bar
653
+ for i, doc in tqdm(enumerate(docs_out), desc="Generating Embeddings", total=len(docs_out)):
654
+ # Store text and metadata
655
+ texts.append(doc.page_content)
656
+ metadatas.append(doc.metadata)
657
+
658
+ # Generate embedding for the current document
659
+ # embeddings_model_object.embed_documents expects a list of strings
660
+ # and returns a list of lists (embeddings). We take the first element.
661
+ vector = embeddings_model_object.embed_documents([doc.page_content])[0]
662
+ vectors.append(vector)
663
+
664
+ # Populate the internal docstore that FAISS uses
665
+ doc_id = str(uuid4()) # Generate a unique ID for each document
666
+ docstore.add({doc_id: doc}) # Add the full Document object to the docstore
667
+ index_to_docstore_id[i] = doc_id # Map FAISS index position (i) to its doc_id
668
+
669
+ print("\nEmbedding generation complete. Building FAISS index...")
670
+
671
+ # 2. Build the Raw FAISS Index
672
+ # Ensure all embeddings are numpy float32, which FAISS expects.
673
+ # BGE models (like bge-base-en-v1.5) typically produce L2-normalized embeddings,
674
+ # which are ideal for Inner Product (IP) similarity, equivalent to cosine similarity.
675
+ # If your model *does not* output normalized vectors and you want cosine similarity,
676
+ # you must normalize them here: `np.array([v / np.linalg.norm(v) for v in vectors]).astype("float32")`
677
+ # Otherwise, you might use IndexFlatL2 for Euclidean distance.
678
+ # For common embedding models and cosine similarity, `IndexFlatIP` with pre-normalized vectors is standard.
679
+ embeddings_np = np.array(vectors).astype("float32")
680
+ embedding_dimension = embeddings_np.shape[1]
681
+
682
+ # Create a raw FAISS index (e.g., IndexFlatIP for cosine similarity)
683
+ raw_faiss_index = faiss.IndexFlatIP(embedding_dimension)
684
+ raw_faiss_index.add(embeddings_np) # Add all vectors to the raw FAISS index
685
+
686
+ # 3. Create the LangChain FAISS Vectorstore from the components
687
+ # The `embedding_function` is used for subsequent queries to the vectorstore,
688
+ # not for building the initial index here (as we've already done that).
689
+ vectorstore = FAISS(
690
+ embedding_function=embeddings_model_object.embed_query,
691
+ index=raw_faiss_index,
692
+ docstore=docstore,
693
+ index_to_docstore_id=index_to_docstore_id
694
+ # distance_strategy can be set to DistanceStrategy.MAX_INNER_PRODUCT to match IndexFlatIP (the default is Euclidean distance)
695
+ )
696
+ # --- Progress Bar Integration Ends Here ---
697
 
698
  save_to_path = Path(save_folder, save_to)
699
  save_to_path.mkdir(parents=True, exist_ok=True)
 
723
  index_faiss.unlink(missing_ok=True)
724
  index_pkl.unlink(missing_ok=True)
725
 
 
 
 
726
  print("> Archive complete")
727
  print(f"> Final ZIP path: {final_zip_path}")
728
 
729
+ return "Document processing complete", vectorstore, final_zip_path # Return Path object for consistency
730
+
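If the chosen embedding model does not already L2-normalize its outputs, the normalization described in the comments above can be applied before adding vectors to IndexFlatIP, so that inner-product scores behave like cosine similarity. A toy, self-contained sketch with made-up vectors:

import numpy as np
import faiss

vectors = [[0.3, 0.4, 0.0], [1.0, 0.0, 0.0], [0.0, 2.0, 0.0]]     # toy embeddings
embeddings_np = np.array(vectors, dtype="float32")

# Scale each row to unit length; guard against zero-length vectors
norms = np.linalg.norm(embeddings_np, axis=1, keepdims=True)
embeddings_np = embeddings_np / np.clip(norms, 1e-12, None)

index = faiss.IndexFlatIP(embeddings_np.shape[1])
index.add(embeddings_np)

scores, ids = index.search(embeddings_np[:1], 3)                   # inner product == cosine here
print(scores, ids)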
731
+ def get_faiss_store(zip_file_path: str, embeddings_model: Embeddings) -> FAISS:
732
+ """
733
+ Loads a FAISS vector store from a ZIP archive.
734
+
735
+ Args:
736
+ zip_file_path: The string path pointing to the .zip archive containing
737
+ index.faiss and index.pkl. This should be the
738
+ final_zip_path returned by embed_faiss_save_to_zip.
739
+ embeddings_model: The embeddings model object (e.g., OpenAIEmbeddings, HuggingFaceEmbeddings)
740
+ used to create the index. This is crucial for proper deserialization.
741
+
742
+ Returns:
743
+ A FAISS vector store object.
744
+ """
745
+
746
+ zip_file_path = Path(zip_file_path)
747
+
748
+ if not zip_file_path.exists():
749
+ raise FileNotFoundError(f"ZIP archive not found at: {zip_file_path}")
750
+ if not zip_file_path.suffix == '.zip':
751
+ raise ValueError(f"Expected a .zip file, but got: {zip_file_path}")
752
+
753
+ # Create a temporary directory to extract the FAISS index files
754
+ # tempfile.TemporaryDirectory() handles cleanup automatically when the 'with' block exits.
755
+ with tempfile.TemporaryDirectory() as temp_dir_str:
756
+ temp_extract_path = Path(temp_dir_str)
757
+
758
+ print(f"> Extracting {zip_file_path} to temporary directory: {temp_extract_path}")
759
+ with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
760
+ # The zip file contains 'index.faiss' and 'index.pkl' directly at its root.
761
+ # So, extracting to temp_extract_path will place them as temp_extract_path/index.faiss
762
+ zip_ref.extractall(temp_extract_path)
763
+
764
+ # Verify that the files were extracted successfully
765
+ extracted_faiss_file = temp_extract_path / "index.faiss"
766
+ extracted_pkl_file = temp_extract_path / "index.pkl"
767
+
768
+ if not extracted_faiss_file.exists() or not extracted_pkl_file.exists():
769
+ raise FileNotFoundError(
770
+ f"Required FAISS index files (index.faiss, index.pkl) not found "
771
+ f"in extracted location: {temp_extract_path}. "
772
+ f"ZIP content might be structured unexpectedly."
773
+ )
774
+
775
+ print("> Loading FAISS index from extracted files...")
776
+ faiss_vstore = FAISS.load_local(
777
+ folder_path=str(temp_extract_path), # FAISS.load_local expects a string path
778
+ embeddings=embeddings_model,
779
+ allow_dangerous_deserialization=True
780
+ )
781
+ print("> FAISS index loaded successfully.")
782
+
783
+ # The temporary directory and its contents are automatically removed here
784
+ # when the `with tempfile.TemporaryDirectory()` block exits.
785
+ # No need for manual os.remove() calls for index.faiss and index.pkl.
786
 
787
+ return faiss_vstore
788
 
789
 
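A hedged end-to-end sketch of the save/load round trip defined above. The function names come from this file; the document texts, output folder and query are illustrative, and the snippet assumes it runs alongside the imports and helpers already defined here (Document, load_embeddings_model).

from langchain.docstore.document import Document

docs_out = [
    Document(page_content="Chapter 1: housing targets for the borough ...", metadata={"page": 1}),
    Document(page_content="Chapter 2: transport and infrastructure ...", metadata={"page": 2}),
]

embeddings_model = load_embeddings_model()          # defaults to EMBEDDINGS_MODEL_NAME
status, vectorstore, zip_path = embed_faiss_save_to_zip(docs_out, "output", embeddings_model)

# Later, or in a fresh process: rebuild the store straight from the ZIP archive
reloaded_store = get_faiss_store(zip_path, embeddings_model)
print(reloaded_store.similarity_search("what are the housing targets?", k=1))

Note that embed_faiss_save_to_zip embeds documents one at a time so the progress bar can advance per document; batching the embed_documents calls would be faster on large corpora, at the cost of coarser progress reporting.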
790
  # def sim_search_local_saved_vec(query, k_val, save_to="faiss_lambeth_census_embedding"):
{chatfuncs β†’ tools}/llm_api_call.py RENAMED
@@ -12,8 +12,9 @@ from gradio import Progress
12
  from typing import List, Tuple
13
  from io import StringIO
14
 
15
- from chatfuncs.prompts import prompt1, prompt2, prompt3, system_prompt, summarise_system_prompt, summarise_prompt
16
- from chatfuncs.helper_functions import output_folder, detect_file_type, get_file_path_end, read_file, get_or_create_env_var
 
17
 
18
  # ResponseObject class for AWS Bedrock calls
19
  class ResponseObject:
@@ -171,33 +172,6 @@ def construct_gemini_generative_model(in_api_key: str, temperature: float, model
171
  #model = ai.GenerativeModel.from_cached_content(cached_content=cache, generation_config=config)
172
  model = ai.GenerativeModel(model_name='models/' + model_choice, system_instruction=system_prompt, generation_config=config)
173
 
174
- # Upload CSV file (replace with your actual file path)
175
- #file_id = ai.upload_file(upload_file_path)
176
-
177
-
178
- # if file_type == 'xlsx':
179
- # print("Running through all xlsx sheets")
180
- # #anon_xlsx = pd.ExcelFile(upload_file_path)
181
- # if not in_excel_sheets:
182
- # out_message.append("No Excel sheets selected. Please select at least one to anonymise.")
183
- # continue
184
-
185
- # anon_xlsx = pd.ExcelFile(upload_file_path)
186
-
187
- # # Create xlsx file:
188
- # anon_xlsx_export_file_name = output_folder + out_file_part + "_redacted.xlsx"
189
-
190
-
191
- ### QUERYING LARGE LANGUAGE MODEL ###
192
- # Prompt caching the table and system prompt. See here: https://ai.google.dev/gemini-api/docs/caching?lang=python
193
- # Create a cache with a 5 minute TTL. ONLY FOR CACHES OF AT LEAST 32k TOKENS!
194
- # cache = ai.caching.CachedContent.create(
195
- # model='models/' + model_choice,
196
- # display_name=out_file_part, # used to identify the cache
197
- # system_instruction=system_prompt_with_table,
198
- # ttl=datetime.timedelta(minutes=5),
199
- # )
200
-
201
  return model, config
202
 
203
  def call_aws_claude(prompt: str, system_prompt: str, temperature: float, max_tokens: int, model_choice: str) -> ResponseObject:
@@ -276,7 +250,7 @@ def send_request(prompt: str, conversation_history: List[dict], model: object, c
276
  #print("full_prompt:", full_prompt)
277
 
278
  # Generate the model's response
279
- if model_choice in ["gemini-1.5-flash-002", "gemini-1.5-pro-002"]:
280
  try:
281
  response = model.generate_content(contents=full_prompt, generation_config=config)
282
  except Exception as e:
@@ -701,7 +675,7 @@ def llm_query(file_data:pd.DataFrame, existing_topics_w_references_table:pd.Data
701
  #print("normalised_simple_markdown_table:", normalised_simple_markdown_table)
702
 
703
  # Prepare Gemini models before query
704
- if model_choice in ["gemini-1.5-flash-002", "gemini-1.5-pro-002"]:
705
  print("Using Gemini model:", model_choice)
706
  model, config = construct_gemini_generative_model(in_api_key=in_api_key, temperature=temperature, model_choice=model_choice, system_prompt=summarise_system_prompt, max_tokens=max_tokens)
707
  else:
@@ -772,17 +746,12 @@ def llm_query(file_data:pd.DataFrame, existing_topics_w_references_table:pd.Data
772
 
773
  summary_prompt_list = [formatted_summary_prompt]
774
 
775
- print("master_summary_prompt_list:", summary_prompt_list[0])
776
-
777
  summary_conversation_history = []
778
  summary_whole_conversation = []
779
 
780
  # Process requests to large language model
781
  master_summary_response, summary_conversation_history, whole_summary_conversation, whole_conversation_metadata = process_requests(summary_prompt_list, summarise_system_prompt, summary_conversation_history, summary_whole_conversation, whole_conversation_metadata, model, config, model_choice, temperature, reported_batch_no, master = True)
782
 
783
- print("master_summary_response:", master_summary_response[-1].text)
784
- print("Whole conversation metadata:", whole_conversation_metadata)
785
-
786
  new_topic_table_out_path, new_reference_table_out_path, new_unique_topics_df_out_path, new_topic_df, new_markdown_table, new_reference_df, new_unique_topics_df, master_batch_out_file_part, is_error = write_llm_output_and_logs(master_summary_response, whole_summary_conversation, whole_conversation_metadata, out_file_part, latest_batch_completed, start_row, end_row, model_choice_clean, temperature, log_files_output_paths, existing_reference_df, existing_unique_topics_df, first_run=False)
787
 
788
  # If error in table parsing, leave function
@@ -832,7 +801,7 @@ def llm_query(file_data:pd.DataFrame, existing_topics_w_references_table:pd.Data
832
  #system_prompt_with_table = system_prompt + normalised_simple_markdown_table
833
 
834
  # Prepare Gemini models before query
835
- if model_choice in ["gemini-1.5-flash-002", "gemini-1.5-pro-002"]:
836
  print("Using Gemini model:", model_choice)
837
  model, config = construct_gemini_generative_model(in_api_key=in_api_key, temperature=temperature, model_choice=model_choice, system_prompt=system_prompt, max_tokens=max_tokens)
838
  else:
@@ -857,9 +826,6 @@ def llm_query(file_data:pd.DataFrame, existing_topics_w_references_table:pd.Data
857
 
858
  #print("Whole conversation metadata before:", whole_conversation_metadata)
859
 
860
- print("responses:", responses[-1].text)
861
- print("Whole conversation metadata:", whole_conversation_metadata)
862
-
863
  topic_table_out_path, reference_table_out_path, unique_topics_df_out_path, topic_table_df, markdown_table, reference_df, new_unique_topics_df, batch_out_file_part, is_error = write_llm_output_and_logs(responses, whole_conversation, whole_conversation_metadata, out_file_part, latest_batch_completed, start_row, end_row, model_choice_clean, temperature, log_files_output_paths, existing_reference_df, existing_unique_topics_df, first_run=True)
864
 
865
  # If error in table parsing, leave function
@@ -879,8 +845,6 @@ def llm_query(file_data:pd.DataFrame, existing_topics_w_references_table:pd.Data
879
 
880
  new_unique_topics_df = pd.concat([new_unique_topics_df, existing_unique_topics_df]).drop_duplicates('Subtopic')
881
 
882
- print("new_unique_topics_df:", new_unique_topics_df)
883
-
884
  new_unique_topics_df.to_csv(unique_topics_df_out_path, index=None)
885
  out_file_paths.append(unique_topics_df_out_path)
886
 
 
12
  from typing import List, Tuple
13
  from io import StringIO
14
 
15
+ from tools.prompts import prompt1, prompt2, prompt3, system_prompt, summarise_system_prompt, summarise_prompt
16
+ from tools.helper_functions import output_folder, detect_file_type, get_file_path_end, read_file, get_or_create_env_var
17
+ from tools.config import GEMINI_MODELS
18
 
19
  # ResponseObject class for AWS Bedrock calls
20
  class ResponseObject:
 
172
  #model = ai.GenerativeModel.from_cached_content(cached_content=cache, generation_config=config)
173
  model = ai.GenerativeModel(model_name='models/' + model_choice, system_instruction=system_prompt, generation_config=config)
174
 
 
 
175
  return model, config
176
 
177
  def call_aws_claude(prompt: str, system_prompt: str, temperature: float, max_tokens: int, model_choice: str) -> ResponseObject:
 
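For reference, the Gemini branch above relies on the standard google.generativeai pattern that construct_gemini_generative_model wraps. A minimal standalone sketch with a placeholder API key, model name and prompt; the exact generation settings the app uses are an assumption here.

import google.generativeai as ai

ai.configure(api_key="YOUR_GEMINI_API_KEY")                         # placeholder key
config = ai.GenerationConfig(temperature=0.1, max_output_tokens=4096)

model = ai.GenerativeModel(
    model_name="models/gemini-2.5-flash",                           # any entry from GEMINI_MODELS
    system_instruction="You extract topics from tables of responses.",
    generation_config=config,
)

response = model.generate_content(contents="List the main topics in: apples, pears, buses.",
                                  generation_config=config)
print(response.text)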
250
  #print("full_prompt:", full_prompt)
251
 
252
  # Generate the model's response
253
+ if model_choice in GEMINI_MODELS:
254
  try:
255
  response = model.generate_content(contents=full_prompt, generation_config=config)
256
  except Exception as e:
 
675
  #print("normalised_simple_markdown_table:", normalised_simple_markdown_table)
676
 
677
  # Prepare Gemini models before query
678
+ if model_choice in GEMINI_MODELS:
679
  print("Using Gemini model:", model_choice)
680
  model, config = construct_gemini_generative_model(in_api_key=in_api_key, temperature=temperature, model_choice=model_choice, system_prompt=summarise_system_prompt, max_tokens=max_tokens)
681
  else:
 
746
 
747
  summary_prompt_list = [formatted_summary_prompt]
748
 
 
 
749
  summary_conversation_history = []
750
  summary_whole_conversation = []
751
 
752
  # Process requests to large language model
753
  master_summary_response, summary_conversation_history, whole_summary_conversation, whole_conversation_metadata = process_requests(summary_prompt_list, summarise_system_prompt, summary_conversation_history, summary_whole_conversation, whole_conversation_metadata, model, config, model_choice, temperature, reported_batch_no, master = True)
754
 
 
 
 
755
  new_topic_table_out_path, new_reference_table_out_path, new_unique_topics_df_out_path, new_topic_df, new_markdown_table, new_reference_df, new_unique_topics_df, master_batch_out_file_part, is_error = write_llm_output_and_logs(master_summary_response, whole_summary_conversation, whole_conversation_metadata, out_file_part, latest_batch_completed, start_row, end_row, model_choice_clean, temperature, log_files_output_paths, existing_reference_df, existing_unique_topics_df, first_run=False)
756
 
757
  # If error in table parsing, leave function
 
801
  #system_prompt_with_table = system_prompt + normalised_simple_markdown_table
802
 
803
  # Prepare Gemini models before query
804
+ if model_choice in GEMINI_MODELS:
805
  print("Using Gemini model:", model_choice)
806
  model, config = construct_gemini_generative_model(in_api_key=in_api_key, temperature=temperature, model_choice=model_choice, system_prompt=system_prompt, max_tokens=max_tokens)
807
  else:
 
826
 
827
  #print("Whole conversation metadata before:", whole_conversation_metadata)
828
 
 
 
 
829
  topic_table_out_path, reference_table_out_path, unique_topics_df_out_path, topic_table_df, markdown_table, reference_df, new_unique_topics_df, batch_out_file_part, is_error = write_llm_output_and_logs(responses, whole_conversation, whole_conversation_metadata, out_file_part, latest_batch_completed, start_row, end_row, model_choice_clean, temperature, log_files_output_paths, existing_reference_df, existing_unique_topics_df, first_run=True)
830
 
831
  # If error in table parsing, leave function
 
845
 
846
  new_unique_topics_df = pd.concat([new_unique_topics_df, existing_unique_topics_df]).drop_duplicates('Subtopic')
847
 
 
 
848
  new_unique_topics_df.to_csv(unique_topics_df_out_path, index=None)
849
  out_file_paths.append(unique_topics_df_out_path)
850
 
{chatfuncs β†’ tools}/model_load.py RENAMED
File without changes
{chatfuncs β†’ tools}/prompts.py RENAMED
File without changes