gaonkarrs committed on
Commit 5d34f4c · 0 Parent(s)

Proper root structure with app.py and requirements.txt

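The commit message also references a requirements.txt at the repository root, which is not shown in this diff. As a rough, assumed sketch only (package names inferred from the imports in app.py below, not the committed file), the dependency list would need to cover roughly:

gradio
datasets
transformers
torch
sentence-transformers
faiss-cpu
groq
scikit-learn
langchain
tqdm
numpy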
.gitattributes ADDED
@@ -0,0 +1,36 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.index filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,13 @@
1
+ ---
2
+ title: Rag Eval Dashboard
3
+ emoji: 🚀
4
+ colorFrom: gray
5
+ colorTo: pink
6
+ sdk: gradio
7
+ sdk_version: 5.36.2
8
+ app_file: app.py
9
+ pinned: false
10
+ short_description: RAGBench evaluation
11
+ ---
12
+
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,627 @@
1
+ from datasets import load_from_disk
2
+ from transformers import AutoTokenizer, AutoModel
3
+ import faiss
4
+ import numpy as np
5
+ import torch
6
+ import shutil  # used below to delete any previously built dataset directory
7
+ from tqdm import tqdm  # progress bar for the legal-domain embedding loop
8
+ from langchain.text_splitter import RecursiveCharacterTextSplitter  # assumed import path for the text splitter used below
9
+ import os
10
+ from datasets import load_dataset, Dataset, get_dataset_config_names
11
+ from sentence_transformers import SentenceTransformer
12
+ from groq import Groq
13
+ from sentence_transformers import CrossEncoder
14
+ import requests
15
+ import uuid
16
+ import re
17
+ import json
18
+ import gradio as gr
19
+ import io
20
+ import sys
21
+ import traceback
22
+
23
+ embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
24
+
25
+ def build_index_and_dataset(domain, subsets, embedder_type="sentence-transformers/all-MiniLM-L6-v2", legal=False):
26
+ dataset_path = f"{domain}_dataset"
27
+ index_path = f"{domain}_index/faiss.index"
28
+
29
+ # ❌ Always remove previous
30
+ if os.path.exists(dataset_path):
31
+ shutil.rmtree(dataset_path)
32
+ if os.path.exists(index_path):
33
+ os.remove(index_path)
34
+
35
+ print(f"🚀 Rebuilding dataset and index for domain: {domain}")
36
+
37
+ all_docs = []
38
+ for subset in subsets:
39
+ ds = load_dataset("rungalileo/ragbench", subset, split="test")
40
+ for item in ds:
41
+ if isinstance(item, dict) and "documents" in item and isinstance(item["documents"], list):
42
+ all_docs.extend(item["documents"])
43
+ elif isinstance(item, str):
44
+ all_docs.append(item)
45
+ all_docs = list(set(all_docs))
46
+
47
+ splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=50)
48
+ chunks = []
49
+ for doc in all_docs:
50
+ chunks.extend(splitter.split_text(doc))
51
+
52
+ if legal:
53
+ tokenizer = AutoTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")
54
+ model = AutoModel.from_pretrained("nlpaueb/legal-bert-base-uncased").to("cuda" if torch.cuda.is_available() else "cpu")
55
+ model.eval()
56
+ device = model.device
57
+ all_embeddings = []
58
+ for i in tqdm(range(0, len(chunks), 16), desc="Embedding Legal"):
59
+ batch = chunks[i:i+16]
60
+ inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=512)
61
+ inputs = {k: v.to(device) for k, v in inputs.items()}
62
+ with torch.no_grad():
63
+ outputs = model(**inputs)
64
+ batch_embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
65
+ all_embeddings.append(batch_embeddings)
66
+ embeddings = np.vstack(all_embeddings)
67
+ else:
68
+ embedder = SentenceTransformer(embedder_type, device="cuda" if torch.cuda.is_available() else "cpu")
69
+ embeddings = embedder.encode(chunks, convert_to_numpy=True, show_progress_bar=True)
70
+
71
+ hf_dataset = Dataset.from_dict({"text": chunks})
72
+ dim = embeddings.shape[1]
73
+ faiss_index = faiss.IndexFlatL2(dim)
74
+ faiss_index.add(embeddings.astype("float32"))
75
+
76
+ os.makedirs(dataset_path, exist_ok=True)
77
+ os.makedirs(os.path.dirname(index_path), exist_ok=True)
78
+
79
+ hf_dataset.save_to_disk(dataset_path)
80
+ faiss.write_index(faiss_index, index_path)
81
+
82
+ print(f"✅ Saved {domain} dataset at {dataset_path}, index at {index_path}")
83
+ return hf_dataset, faiss_index
84
+
85
+ # 🔁 Always regenerate these indices and datasets at app start
86
+ RAGBENCH_SUBSETS_BY_DOMAIN = {
87
+ "legal": ["cuad"],
88
+ "med": ["pubmedqa"],
89
+ "gk": ["hotpotqa"],
90
+ "cs": ["emanual"],
91
+ "fin": ["finqa"]
92
+ }
93
+
94
+ hf_dataset_cs, faiss_index_cs = build_index_and_dataset("cs", RAGBENCH_SUBSETS_BY_DOMAIN["cs"])
95
+ hf_dataset_med, faiss_index_med = build_index_and_dataset("med", RAGBENCH_SUBSETS_BY_DOMAIN["med"])
96
+ hf_dataset_gk, faiss_index_gk = build_index_and_dataset("gk", RAGBENCH_SUBSETS_BY_DOMAIN["gk"])
97
+ hf_dataset_fin, faiss_index_fin = build_index_and_dataset("fin", RAGBENCH_SUBSETS_BY_DOMAIN["fin"])
98
+ hf_dataset_legal, faiss_index_legal = build_index_and_dataset("legal", RAGBENCH_SUBSETS_BY_DOMAIN["legal"], legal=True)
99
+
100
+ # Now load Hugging Face RAGBench datasets for GT
101
+ legal_dataset = load_dataset("rungalileo/ragbench", "cuad", split="test")
102
+ med_dataset = load_dataset("rungalileo/ragbench", "pubmedqa", split="test")
103
+ gk_dataset = load_dataset("rungalileo/ragbench", "hotpotqa", split="test")
104
+ cs_dataset = load_dataset("rungalileo/ragbench", "emanual", split="test")
105
+ fin_dataset = load_dataset("rungalileo/ragbench", "finqa", split="test")
106
+
107
+ # Load BGE reranker
108
+ reranker = CrossEncoder("BAAI/bge-reranker-base", max_length=512)
109
+
110
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # define the device used by the legal-BERT model below
111
+ model_name = "nlpaueb/legal-bert-base-uncased"
112
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
113
+ model = AutoModel.from_pretrained(model_name).to(device)
114
+ model.eval()
115
+
116
+
117
+ def retrieve_top_k(query,domain='legal', model_name='nlpaueb/legal-bert-base-uncased', k=8):
118
+ # Load tokenizer and model
119
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
120
+ model = AutoModel.from_pretrained(model_name).to(device)
121
+ model.eval()
122
+
123
+ #print(f"In retrive_top_k Query:{query}")
124
+ # Tokenize and embed query using mean pooling
125
+ inputs = tokenizer(query, return_tensors="pt", padding=True, truncation=True, max_length=512)
126
+ inputs = {k: v.to(device) for k, v in inputs.items()}
127
+ with torch.no_grad():
128
+ outputs = model(**inputs)
129
+ query_embedding = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
130
+
131
+ # Load FAISS index and dataset
132
+ index_path = f"legal_index/faiss.index"
133
+ dataset_path = f"legal_dataset"
134
+
135
+ faiss_index = faiss.read_index(index_path)
136
+ dataset = load_from_disk(dataset_path)
137
+
138
+ # Perform FAISS search
139
+ D, I = faiss_index.search(query_embedding.astype('float32'), k)
140
+
141
+ # Retrieve top-k matching chunks
142
+ top_chunks = [dataset[int(idx)]['text'] for idx in I[0]]
143
+ return top_chunks
144
+
145
+
146
+
147
+ # Retrieval function using preloaded objects
148
+ def retrieve_top_c(query, domain, embedder, k=5):
149
+ if domain == "CS":
150
+ hf_dataset = hf_dataset_cs
151
+ faiss_index = faiss_index_cs
152
+ elif domain == "Medical":
153
+ hf_dataset = hf_dataset_med
154
+ faiss_index = faiss_index_med
155
+ elif domain == "GK":
156
+ hf_dataset = hf_dataset_gk
157
+ faiss_index = faiss_index_gk
158
+ elif domain == "Finance":
159
+ hf_dataset = hf_dataset_fin
160
+ faiss_index = faiss_index_fin
161
+ else:
162
+ raise ValueError(f"Unknown domain: {domain}")
163
+
164
+ # Encode query and search
165
+ query_embedding = embedder.encode([query]).astype('float32')
166
+ #query_embedding = embedder.encode([query], convert_to_numpy=True).astype('float32')
167
+ distances, indices = faiss_index.search(query_embedding, k)
168
+
169
+ return [hf_dataset[int(i)]["text"] for i in indices[0]]
170
+
171
+
172
+ client = Groq(
173
+ api_key=os.environ.get("GROQ_API_KEY"),  # read the API key from the environment instead of hard-coding a secret
174
+ )
175
+
176
+
177
+ def rerank_documents_bge(query, documents, top_n=5, return_scores=False):
178
+ """
179
+ Rerank documents using BAAI/bge-reranker-base CrossEncoder.
180
+
181
+ Args:
182
+ query (str): The query string.
183
+ documents (List[str]): List of candidate documents.
184
+ top_n (int): Number of top results to return.
185
+ return_scores (bool): Whether to return scores along with documents.
186
+
187
+ Returns:
188
+ List[str] or List[Tuple[str, float]]
189
+ """
190
+ if not documents:
191
+ return []
192
+
193
+ # Prepare (query, doc) pairs
194
+ pairs = [(query, doc) for doc in documents]
195
+
196
+ # Predict relevance scores
197
+ scores = reranker.predict(pairs, batch_size=16)
198
+
199
+ # Sort by score descending
200
+ reranked = sorted(zip(documents, scores), key=lambda x: x[1], reverse=True)
201
+
202
+ if return_scores:
203
+ return reranked[:top_n]
204
+ else:
205
+ return [doc for doc, _ in reranked[:top_n]]
206
+
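A minimal usage sketch for the reranker helper above (the query and documents are invented placeholders, and it assumes the module-level reranker CrossEncoder has already been loaded):

docs = [
    "The lease term is five years.",   # hypothetical candidate chunks
    "Payment is due within 30 days.",
]
top_docs = rerank_documents_bge("How long is the lease term?", docs, top_n=1)
print(top_docs)  # the single chunk the cross-encoder scores as most relevant
# with return_scores=True the helper returns (document, score) tuples instead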
207
+
208
+
209
+ def generate_response_rag(query,domain):
210
+ # Step 1: Retrieve top-k context chunks using your FAISS setup
211
+ if domain == "Legal":
212
+ top_chunks = retrieve_top_k(query,'Legal', model_name)
213
+ else:
214
+ top_chunks = retrieve_top_c(query, domain,embedder)
215
+
216
+ # Step 2: Rerank retrieved documents using cross-encoder
217
+ #reranked_chunks = rerank_documents(query, top_chunks, top_n=15)
218
+ #rerank_and_filter_chunks = filter_by_faithfulness(query, reranked_chunks)
219
+ #print("Retrieved Top chunks",top_chunks)
220
+
221
+ #reranked_chunks = rerank_and_filter_chunks
222
+ reranked_chunks_bge = rerank_documents_bge(query, top_chunks, top_n=5)
223
+ #sum_context = summarize_context("\n\n".join(reranked_chunks_bge))
224
+
225
+
226
+
227
+ final_context = reranked_chunks_bge
228
+ # Step 2: Prepare context and RAG-style prompt
229
+ context = "\n\n".join(final_context)
230
+
231
+ #print(f"Context:{context}")
232
+ prompt = f"""You are a helpful legal assistant.
233
+ Use the following context to answer the question.
234
+ Using only the information from the retrieved context, answer the following question. If the answer cannot be derived, say "I don't know." Always have answer with prefix **Answer:**
235
+
236
+ Context:{context}
237
+
238
+ Question: {query}
239
+ Answer:"""
240
+
241
+ # Step 3: Call the LLM (LLaMA3 or any chat model)
242
+ chat_completion = client.chat.completions.create(
243
+ messages=[
244
+ {"role": "user", "content": prompt}
245
+ ],
246
+ model="llama3-70b-8192",#"gemma2-9b-it"#"qwen/qwen3-32b"#deepseek-r1-distill-llama-70b",#"llama3-70b-8192", # mistral-saba-24b
247
+ temperature=0.0
248
+ )
249
+
250
+ return context,chat_completion.choices[0].message.content.strip()
251
+
252
+ '''response = openai.chat.completions.create(
253
+ model="gpt-3.5-turbo",
254
+ messages=[
255
+ {"role": "user", "content": prompt}
256
+ ],
257
+ temperature=0.0,
258
+ max_tokens=1024
259
+ )
260
+
261
+ return response.choices[0].message.content'''
262
+
263
+ #JUDGE LLM
264
+
265
+
266
+ def split_into_keyed_sentences(text, prefix):
267
+ """Splits text into sentences with keys like '0a.', '0b.', or 'a.', 'b.', etc."""
268
+ # Basic sentence tokenizer with keys
269
+ sentences = re.split(r'(?<=[.?!])\s+', text.strip())
270
+ keyed = {}
271
+ for i, s in enumerate(sentences):
272
+ key = f"{prefix}{chr(97 + i)}" # 'a', 'b', ...
273
+ if s:
274
+ keyed[key] = s.strip()
275
+ return keyed
276
+
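For reference, a quick sketch of the keying scheme this helper produces (example sentences are invented):

split_into_keyed_sentences("It rains. It pours.", "0")
# -> {"0a": "It rains.", "0b": "It pours."}   document sentences get the "0" prefix
split_into_keyed_sentences("I don't know.", "")
# -> {"a": "I don't know."}                   response sentences get bare letter keys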
277
+
278
+ def judge_response_rag(query, domain):
279
+
280
+ #top_chunks = retrieve_top_k(query)
281
+
282
+ #top_chunks = [chunk[0] if isinstance(chunk, tuple) else chunk for chunk in top_chunks]
283
+
284
+ # Step 2: Prepare context and RAG-style prompt
285
+ #context = "\n\n".join(top_chunks)
286
+
287
+ # Split context and dummy answer into keyed sentences
288
+ #document_keys = split_into_keyed_sentences(context, "0")
289
+
290
+ #print(f"Query:{query}\n====================================================================")
291
+ context,response = generate_response_rag(query,domain) #deepseek-r1-distill-llama-70b llama3-70b-8192
292
+
293
+ # Split context and dummy answer into keyed sentences
294
+ document_keys = split_into_keyed_sentences(context, "0")
295
+ #print(f"\n====================================\Generator Response:{response}")
296
+ #For deepseek
297
+ #print("Before Curated:",response)
298
+ response = response[response.find("**Answer"):].replace("**Answer", "")
299
+
300
+ print(f"Response for Generator LLM:{response}")
301
+
302
+ response_keys = split_into_keyed_sentences(response, "")
303
+ # Rebuild sections for prompt
304
+ documents_formatted = "\n".join([f"{k}. {v}" for k, v in document_keys.items()])
305
+ response_formatted = "\n".join([f"{k}. {v}" for k, v in response_keys.items()])
306
+
307
+ '''print(f"\n====================================================================")
308
+ print(f"documents_formatted:{documents_formatted}")
309
+ print(f"\n====================================================================")
310
+ print(f"response_formatted:{response_formatted}")
311
+ print(f"\n====================================================================")'''
312
+
313
+
314
+ prompt = f"""I asked someone to answer a question based on one or more documents.
315
+ Your task is to review their response and assess whether or not each sentence
316
+ in that response is supported by text in the documents. And if so, which
317
+ sentences in the documents provide that support. You will also tell me which
318
+ of the documents contain useful information for answering the question, and
319
+ which of the documents the answer was sourced from.
320
+ Here are the documents, each of which is split into sentences. Alongside each
321
+ sentence is an associated key, such as ’0a.’ or ’0b.’, that you can use to refer
322
+ to it:
323
+ '''
324
+ {documents_formatted}
325
+ '''
326
+ The question was:
327
+ '''
328
+ {query}
329
+ '''
330
+ Here is their response, split into sentences. Alongside each sentence is
331
+ an associated key, such as ’a.’ or ’b.’, that you can use to refer to it. Note
332
+ that these keys are unique to the response, and are not related to the keys
333
+ in the documents:
334
+ '''
335
+ {response_formatted}
336
+ '''
337
+ You must respond with a JSON object matching this schema:
338
+ '''
339
+ {{
340
+ "relevance_explanation": string,
341
+ "all_relevant_sentence_keys": [string],
342
+ "overall_supported_explanation": string,
343
+ "overall_supported": boolean,
344
+ "sentence_support_information": [
345
+ {{
346
+ "response_sentence_key": string,
347
+ "explanation": string,
348
+ "supporting_sentence_keys": [string],
349
+ "fully_supported": boolean
350
+ }},
351
+ ],
352
+ "all_utilized_sentence_keys": [string]
353
+ }}
354
+ '''
355
+ The relevance_explanation field is a string explaining which documents
356
+ contain useful information for answering the question. Provide a step-by-step
357
+ breakdown of information provided in the documents and how it is useful for
358
+ answering the question.
359
+ The all_relevant_sentence_keys field is a list of all document sentence keys
360
+ (e.g. ’0a’) that are relevant to the question. Include every sentence that is
361
+ useful and relevant to the question, even if it was not used in the response,
362
+ or if only parts of the sentence are useful. Ignore the provided response when
363
+ making this judgement and base your judgement solely on the provided documents
364
+ and question. Omit sentences that, if removed from the document, would not
365
+ impact someone’s ability to answer the question.
366
+ The overall_supported_explanation field is a string explaining why the response
367
+ *as a whole* is or is not supported by the documents. In this field, provide a
368
+ step-by-step breakdown of the claims made in the response and the support (or
369
+ lack thereof) for those claims in the documents. Begin by assessing each claim
370
+ separately, one by one; don’t make any remarks about the response as a whole
371
+ until you have assessed all the claims in isolation.
372
+ The overall_supported field is a boolean indicating whether the response as a
373
+ whole is supported by the documents. This value should reflect the conclusion
374
+ you drew at the end of your step-by-step breakdown in overall_supported_explanation.
375
+ In the sentence_support_information field, provide information about the support
376
+ *for each sentence* in the response.
377
+ The sentence_support_information field is a list of objects, one for each sentence
378
+ in the response. Each object MUST have the following fields:
379
+ - response_sentence_key: a string identifying the sentence in the response.
380
+ This key is the same as the one used in the response above.
381
+ - explanation: a string explaining why the sentence is or is not supported by the
382
+ documents.
383
+ - supporting_sentence_keys: keys (e.g. ’0a’) of sentences from the documents that
384
+ support the response sentence. If the sentence is not supported, this list MUST
385
+ be empty. If the sentence is supported, this list MUST contain one or more keys.
386
+ In special cases where the sentence is supported, but not by any specific sentence,
387
+ you can use the string "supported_without_sentence" to indicate that the sentence
388
+ is generally supported by the documents. Consider cases where the sentence is
389
+ expressing inability to answer the question due to lack of relevant information in
390
+ the provided context as "supported_without_sentence". In cases where the sentence
391
+ is making a general statement (e.g. outlining the steps to produce an answer, or
392
+ summarizing previously stated sentences, or a transition sentence), use the
393
+ string "general". In cases where the sentence is correctly stating a well-known fact,
394
+ like a mathematical formula, use the string "well_known_fact". In cases where the
395
+ sentence is performing numerical reasoning (e.g. addition, multiplication), use
396
+ the string "numerical_reasoning".
397
+ - fully_supported: a boolean indicating whether the sentence is fully supported by
398
+ the documents.
399
+ - This value should reflect the conclusion you drew at the end of your step-by-step
400
+ breakdown in explanation.
401
+ - If supporting_sentence_keys is an empty list, then fully_supported must be false.
403
+ - Otherwise, use fully_supported to clarify whether everything in the response
404
+ sentence is fully supported by the document text indicated in supporting_sentence_keys
405
+ (fully_supported = true), or whether the sentence is only partially or incompletely
406
+ supported by that document text (fully_supported = false).
407
+ The all_utilized_sentence_keys field is a list of all sentence keys (e.g. ’0a’) that
408
+ were used to construct the answer. Include every sentence that either directly supported
409
+ the answer, or was implicitly used to construct the answer, even if it was not used
410
+ in its entirety. Omit sentences that were not used, and could have been removed from
411
+ the documents without affecting the answer.
412
+ You must respond with a valid JSON string. Use escapes for quotes, e.g. ‘\\"‘, and
413
+ newlines, e.g. ‘\\n‘. Do not write anything before or after the JSON string. Do not
414
+ wrap the JSON string in backticks like ‘‘‘ or ‘‘‘json.
415
+ As a reminder: your task is to review the response and assess which documents contain
416
+ useful information pertaining to the question, and how each sentence in the response
417
+ is supported by the text in the documents.\
418
+ """
419
+
420
+ # Step 3: Call the LLM
421
+ chat_completion = client.chat.completions.create(
422
+ messages=[
423
+ {"role": "user", "content": prompt}
424
+ ],
425
+ model="meta-llama/llama-4-maverick-17b-128e-instruct", #deepseek-r1-distill-llama-70b llama3-70b-8192 meta-llama/llama-4-maverick-17b-128e-instruct
426
+ )
427
+
428
+ return documents_formatted,chat_completion.choices[0].message.content.strip()
429
+
430
+ '''chat_completion = openai.chat.completions.create(
431
+ messages=[
432
+ {"role":"user",
433
+ "content":prompt}
434
+ ],
435
+ model="gpt-4o",
436
+ max_tokens=1024,
437
+
438
+ )
439
+ return documents_formatted,chat_completion.choices[0].message.content'''
440
+
441
+
442
+ def extract_retrieved_sentence_keys(document_text: str) -> list[str]:
443
+ """
444
+ Extracts sentence keys like '0a.', '0b.', etc. from a formatted document string.
445
+
446
+ Parameters:
447
+ - document_text (str): full text of document with sentence keys
448
+
449
+ Returns:
450
+ - List of unique sentence keys in the order they appear
451
+ """
452
+ # Match pattern like 0a., 0b., 0z., 0{., 0|., etc.
453
+ pattern = r'\b0[\w\{\|\}~€‚]\.'
454
+
455
+ matches = re.findall(pattern, document_text)
456
+ return list(dict.fromkeys(matches)) # Removes duplicates while preserving order
457
+
458
+ def compute_ragbench_metrics(judge_response: dict, retrieved_sentence_keys: list[str]) -> dict:
459
+ """
460
+ Computes RAGBench-style metrics from Judge LLM response.
461
+
462
+ Parameters:
463
+ - judge_response (dict): JSON response from Judge LLM
464
+ - retrieved_sentence_keys (list of str): all sentence keys from the retrieved documents
465
+
466
+ Returns:
467
+ - Dictionary with Context Relevance, Context Utilization, Completeness, and Adherence
468
+ """
469
+
470
+ R = set(judge_response.get("all_relevant_sentence_keys", [])) # Relevant sentences
471
+ U = set(judge_response.get("all_utilized_sentence_keys", [])) # Utilized sentences
472
+ intersection_RU = R & U
473
+
474
+ total_retrieved = len(retrieved_sentence_keys)
475
+ len_R = len(R)
476
+ len_U = len(U)
477
+ len_intersection = len(intersection_RU)
478
+
479
+ # Context Relevance: fraction of retrieved context that is relevant
480
+ context_relevance = len_R / total_retrieved if total_retrieved else 0.0
481
+
482
+ # Context Utilization: fraction of retrieved context that was used
483
+ context_utilization = len_U / total_retrieved if total_retrieved else 0.0
484
+
485
+ # Completeness: fraction of relevant content that was used
486
+ completeness = len_intersection / len_R if len_R else 0.0
487
+
488
+ # Adherence: 1 if all response sentences are fully supported, else 0
489
+ is_fully_supported = all(s.get("fully_supported", False)
490
+ for s in judge_response.get("sentence_support_information", []))
491
+ adherence = 1.0 if is_fully_supported and judge_response.get("overall_supported", False) else 0.0
492
+
493
+ return {
494
+ "Context Relevance": round(context_relevance, 4),
495
+ "Context Utilization": round(context_utilization, 4),
496
+ "Completeness": round(completeness, 4),
497
+ "Adherence": adherence
498
+ }
499
+
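A small worked example of the metric arithmetic above (keys and counts are invented), which may help when reading the evaluation loop below:

judge = {
    "all_relevant_sentence_keys": ["0a", "0b"],          # R
    "all_utilized_sentence_keys": ["0b", "0c"],           # U
    "overall_supported": True,
    "sentence_support_information": [{"fully_supported": True}],
}
retrieved = ["0a.", "0b.", "0c.", "0d."]                   # 4 retrieved sentence keys
print(compute_ragbench_metrics(judge, retrieved))
# -> Context Relevance = |R|/4 = 0.5, Context Utilization = |U|/4 = 0.5,
#    Completeness = |R & U| / |R| = 0.5, Adherence = 1.0 (all sentences fully supported)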
500
+
501
+ def evaluate_rag_pipeline(domain, q_indices):
502
+ import torch
503
+ import numpy as np
504
+ from sklearn.metrics import mean_squared_error, roc_auc_score
505
+
506
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
507
+
508
+ def safe_append(gt_list, pred_list, gt_val, pred_val):
509
+ if gt_val is not None and pred_val is not None:
510
+ gt_list.append(gt_val)
511
+ pred_list.append(pred_val)
512
+
513
+ def clean_and_parse_json_block(text):
514
+ # Strip markdown-style code block if present
515
+ #text = text.strip().strip("`").strip()
516
+ code_block_match = re.search(r"```(?:json)?\s*([\s\S]*?)\s*```", text)
517
+ if code_block_match:
518
+ text = code_block_match.group(1).strip()
519
+
520
+ # Remove invalid/control characters that break decoding
521
+ text = re.sub(r"[^\x20-\x7E\n\t]", "", text)
522
+
523
+ try:
524
+ return json.loads(text)
525
+ except json.JSONDecodeError as e:
526
+ print("❌ JSON Decode Error:", e)
527
+ print("⚠️ Cleaned text:\n", text)
528
+ raise
529
+
530
+
531
+ gt_relevance, pred_relevance = [], []
532
+ gt_utilization, pred_utilization = [], []
533
+ gt_completeness, pred_completeness = [], []
534
+ gt_adherence, pred_adherence = [], []
535
+
536
+ if(domain=="Legal"):
537
+ dataset = legal_dataset
538
+ elif(domain=="Medical"):
539
+ dataset = med_dataset
540
+ elif(domain=="GK"):
541
+ dataset = gk_dataset
542
+ elif(domain=="CS"):
543
+ dataset = cs_dataset
544
+ elif(domain=="Finance"):
545
+ dataset = fin_dataset
546
+
547
+ for i in q_indices:
548
+ query = dataset[i]['question']
549
+ print(f"\n\n\nQuery:{i}.{query}\n====================================================================")
550
+ #print(f"\ndomain:{domain}====================================================================")
551
+ documents_formatted, response = judge_response_rag(query, domain)
552
+ judge_response = clean_and_parse_json_block(response)
553
+ print(f"\ndocuments_formatted:{documents_formatted}")
554
+ print(f"\n======================================================================\nResponse:{judge_response}")
555
+ retrieved_sentences = extract_retrieved_sentence_keys(documents_formatted)
556
+ predicted = compute_ragbench_metrics(judge_response, retrieved_sentences)
557
+
558
+ # GT values
559
+ gt_r = dataset[i].get('relevance_score')
560
+ gt_u = dataset[i].get('utilization_score')
561
+ gt_c = dataset[i].get('completeness_score')
562
+ gt_a = dataset[i].get('gpt3_adherence')
563
+
564
+ safe_append(gt_relevance, pred_relevance, gt_r, predicted['Context Relevance'])
565
+ safe_append(gt_utilization, pred_utilization, gt_u, predicted['Context Utilization'])
566
+ safe_append(gt_completeness, pred_completeness, gt_c, predicted['Completeness'])
567
+ if gt_a is not None and predicted['Adherence'] is not None:
568
+ safe_append(gt_adherence, pred_adherence, int(gt_a), int(predicted['Adherence']))
569
+
570
+ def compute_rmse(gt, pred):
571
+ return round(np.sqrt(np.mean((np.array(gt) - np.array(pred)) ** 2)), 4)
572
+
573
+ result = {
574
+ "Context Relevance": compute_rmse(gt_relevance, pred_relevance),
575
+ "Context Utilization": compute_rmse(gt_utilization, pred_utilization),
576
+ "Completeness": compute_rmse(gt_completeness, pred_completeness),
577
+ }
578
+
579
+ if len(set(gt_adherence)) == 2:
580
+ result["Adherence"] = compute_rmse(gt_adherence, pred_adherence)
581
+ result["AUC-ROC (Adherence)"] = round(roc_auc_score(gt_adherence, pred_adherence), 4)
582
+ else:
583
+ result["Adherence"] = compute_rmse(gt_adherence, pred_adherence)
584
+ result["AUC-ROC (Adherence)"] = "N/A - one class only"
585
+
586
+ return result
587
+
588
+
589
+
590
+ # Updated wrapper
591
+ def evaluate_rag_gradio(domain, q_indices_str):
592
+ # Capture logs
593
+ log_stream = io.StringIO()
594
+ sys.stdout = log_stream
595
+
596
+ try:
597
+ # Parse comma-separated indices
598
+ q_indices = [int(x.strip()) for x in q_indices_str.split(",") if x.strip().isdigit()]
599
+ results = evaluate_rag_pipeline(domain, q_indices)
600
+
601
+ logs = log_stream.getvalue()
602
+ return results, logs
603
+
604
+ except Exception as e:
605
+ traceback.print_exc()
606
+ return {"error": str(e)}, log_stream.getvalue()
607
+
608
+ finally:
609
+ sys.stdout = sys.__stdout__ # Restore stdout
610
+
611
+ # Gradio interface
612
+ iface = gr.Interface(
613
+ fn=evaluate_rag_gradio,
614
+ inputs=[
615
+ gr.Dropdown(choices=["Legal", "Medical", "GK", "CS", "Finance"], label="Domain"),
616
+ gr.Textbox(label="Comma-separated Query Indices (e.g. 89,121,245)", lines=1),
617
+ ],
618
+ outputs=[
619
+ gr.JSON(label="Evaluation Metrics (RMSE & AUC-ROC)"),
620
+ gr.Textbox(label="Execution Log", lines=10, interactive=True),
621
+ ],
622
+ title="RAG Evaluation Dashboard",
623
+ description="Evaluate your RAG pipeline across selected queries using GPT-based generation and judgment."
624
+ )
625
+
626
+ # Launch app
627
+ iface.launch(server_name="0.0.0.0", server_port=7860, debug=True)
bkp1_app.py ADDED
@@ -0,0 +1,567 @@
1
+ from datasets import load_from_disk
2
+ from transformers import AutoTokenizer, AutoModel
3
+ import faiss
4
+ import numpy as np
5
+ import torch
6
+ from datasets import load_from_disk
7
+ import faiss
8
+ import numpy as np
9
+ import os
10
+ from datasets import load_dataset, Dataset, get_dataset_config_names
11
+ from sentence_transformers import SentenceTransformer
12
+ from groq import Groq
13
+ from sentence_transformers import CrossEncoder
14
+ import requests
15
+ import uuid
16
+ import re
17
+ import json
18
+ import gradio as gr
19
+ import io
20
+ import sys
21
+ import traceback
22
+
23
+ embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
24
+ # Preload datasets and indices
25
+ hf_dataset_cs = load_from_disk("cs_dataset")
26
+ faiss_index_cs = faiss.read_index("cs_index/faiss.index")
27
+
28
+ hf_dataset_med = load_from_disk("med_dataset")
29
+ faiss_index_med = faiss.read_index("med_index/faiss.index")
30
+
31
+ hf_dataset_gk = load_from_disk("gk_dataset")
32
+ faiss_index_gk = faiss.read_index("gk_index/faiss.index")
33
+
34
+ hf_dataset_fin = load_from_disk("fin_dataset")
35
+ faiss_index_fin = faiss.read_index("fin_index/faiss.index")
36
+
37
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
38
+ print(device)
39
+
40
+
41
+ legal_dataset = load_dataset("rungalileo/ragbench", "cuad", split="test")
42
+ med_dataset = load_dataset("rungalileo/ragbench", "pubmedqa", split="test")
43
+ gk_dataset = load_dataset("rungalileo/ragbench", "hotpotqa", split="test")
44
+ cs_dataset = load_dataset("rungalileo/ragbench", "emanual", split="test")
45
+ fin_dataset = load_dataset("rungalileo/ragbench", "finqa", split="test")
46
+
47
+ # Load BGE reranker
48
+ reranker = CrossEncoder("BAAI/bge-reranker-base", max_length=512)
49
+
50
+ embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
51
+ model_name = "nlpaueb/legal-bert-base-uncased"
52
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
53
+ model = AutoModel.from_pretrained(model_name).to(device)
54
+ model.eval()
55
+
56
+
57
+ def retrieve_top_k(query,domain='legal', model_name='nlpaueb/legal-bert-base-uncased', k=8):
58
+ # Load tokenizer and model
59
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
60
+ model = AutoModel.from_pretrained(model_name).to(device)
61
+ model.eval()
62
+
63
+ #print(f"In retrive_top_k Query:{query}")
64
+ # Tokenize and embed query using mean pooling
65
+ inputs = tokenizer(query, return_tensors="pt", padding=True, truncation=True, max_length=512)
66
+ inputs = {k: v.to(device) for k, v in inputs.items()}
67
+ with torch.no_grad():
68
+ outputs = model(**inputs)
69
+ query_embedding = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
70
+
71
+ # Load FAISS index and dataset
72
+ index_path = f"legal_index/faiss.index"
73
+ dataset_path = f"legal_dataset"
74
+
75
+ faiss_index = faiss.read_index(index_path)
76
+ dataset = load_from_disk(dataset_path)
77
+
78
+ # Perform FAISS search
79
+ D, I = faiss_index.search(query_embedding.astype('float32'), k)
80
+
81
+ # Retrieve top-k matching chunks
82
+ top_chunks = [dataset[int(idx)]['text'] for idx in I[0]]
83
+ return top_chunks
84
+
85
+
86
+
87
+ # Retrieval function using preloaded objects
88
+ def retrieve_top_c(query, domain, embedder, k=5):
89
+ if domain == "CS":
90
+ hf_dataset = hf_dataset_cs
91
+ faiss_index = faiss_index_cs
92
+ elif domain == "Medical":
93
+ hf_dataset = hf_dataset_med
94
+ faiss_index = faiss_index_med
95
+ elif domain == "GK":
96
+ hf_dataset = hf_dataset_gk
97
+ faiss_index = faiss_index_gk
98
+ elif domain == "Finance":
99
+ hf_dataset = hf_dataset_fin
100
+ faiss_index = faiss_index_fin
101
+ else:
102
+ raise ValueError(f"Unknown domain: {domain}")
103
+
104
+ # Encode query and search
105
+ query_embedding = embedder.encode([query]).astype('float32')
106
+ #query_embedding = embedder.encode([query], convert_to_numpy=True).astype('float32')
107
+ distances, indices = faiss_index.search(query_embedding, k)
108
+
109
+ return [hf_dataset[int(i)]["text"] for i in indices[0]]
110
+
111
+
112
+ client = Groq(
113
+ api_key=os.environ.get("GROQ_API_KEY"),  # read the API key from the environment instead of hard-coding a secret
114
+ )
115
+
116
+
117
+ def rerank_documents_bge(query, documents, top_n=5, return_scores=False):
118
+ """
119
+ Rerank documents using BAAI/bge-reranker-base CrossEncoder.
120
+
121
+ Args:
122
+ query (str): The query string.
123
+ documents (List[str]): List of candidate documents.
124
+ top_n (int): Number of top results to return.
125
+ return_scores (bool): Whether to return scores along with documents.
126
+
127
+ Returns:
128
+ List[str] or List[Tuple[str, float]]
129
+ """
130
+ if not documents:
131
+ return []
132
+
133
+ # Prepare (query, doc) pairs
134
+ pairs = [(query, doc) for doc in documents]
135
+
136
+ # Predict relevance scores
137
+ scores = reranker.predict(pairs, batch_size=16)
138
+
139
+ # Sort by score descending
140
+ reranked = sorted(zip(documents, scores), key=lambda x: x[1], reverse=True)
141
+
142
+ if return_scores:
143
+ return reranked[:top_n]
144
+ else:
145
+ return [doc for doc, _ in reranked[:top_n]]
146
+
147
+
148
+
149
+ def generate_response_rag(query,domain):
150
+ # Step 1: Retrieve top-k context chunks using your FAISS setup
151
+ if domain == "Legal":
152
+ top_chunks = retrieve_top_k(query,'Legal', model_name)
153
+ else:
154
+ top_chunks = retrieve_top_c(query, domain,embedder)
155
+
156
+ # Step 2: Rerank retrieved documents using cross-encoder
157
+ #reranked_chunks = rerank_documents(query, top_chunks, top_n=15)
158
+ #rerank_and_filter_chunks = filter_by_faithfulness(query, reranked_chunks)
159
+ #print("Retrieved Top chunks",top_chunks)
160
+
161
+ #reranked_chunks = rerank_and_filter_chunks
162
+ reranked_chunks_bge = rerank_documents_bge(query, top_chunks, top_n=5)
163
+ #sum_context = summarize_context("\n\n".join(reranked_chunks_bge))
164
+
165
+
166
+
167
+ final_context = reranked_chunks_bge
168
+ # Step 2: Prepare context and RAG-style prompt
169
+ context = "\n\n".join(final_context)
170
+
171
+ #print(f"Context:{context}")
172
+ prompt = f"""You are a helpful legal assistant.
173
+ Use the following context to answer the question.
174
+ Using only the information from the retrieved context, answer the following question. If the answer cannot be derived, say "I don't know." Always have answer with prefix **Answer:**
175
+
176
+ Context:{context}
177
+
178
+ Question: {query}
179
+ Answer:"""
180
+
181
+ # Step 3: Call the LLM (LLaMA3 or any chat model)
182
+ chat_completion = client.chat.completions.create(
183
+ messages=[
184
+ {"role": "user", "content": prompt}
185
+ ],
186
+ model="llama3-70b-8192",#"gemma2-9b-it"#"qwen/qwen3-32b"#deepseek-r1-distill-llama-70b",#"llama3-70b-8192", # mistral-saba-24b
187
+ temperature=0.0
188
+ )
189
+
190
+ return context,chat_completion.choices[0].message.content.strip()
191
+
192
+ '''response = openai.chat.completions.create(
193
+ model="gpt-3.5-turbo",
194
+ messages=[
195
+ {"role": "user", "content": prompt}
196
+ ],
197
+ temperature=0.0,
198
+ max_tokens=1024
199
+ )
200
+
201
+ return response.choices[0].message.content'''
202
+
203
+ #JUDGE LLM
204
+
205
+
206
+ def split_into_keyed_sentences(text, prefix):
207
+ """Splits text into sentences with keys like '0a.', '0b.', or 'a.', 'b.', etc."""
208
+ # Basic sentence tokenizer with keys
209
+ sentences = re.split(r'(?<=[.?!])\s+', text.strip())
210
+ keyed = {}
211
+ for i, s in enumerate(sentences):
212
+ key = f"{prefix}{chr(97 + i)}" # 'a', 'b', ...
213
+ if s:
214
+ keyed[key] = s.strip()
215
+ return keyed
216
+
217
+
218
+ def jugde_response_rag(query, domain):
219
+
220
+ #top_chunks = retrieve_top_k(query)
221
+
222
+ #top_chunks = [chunk[0] if isinstance(chunk, tuple) else chunk for chunk in top_chunks]
223
+
224
+ # Step 2: Prepare context and RAG-style prompt
225
+ #context = "\n\n".join(top_chunks)
226
+
227
+ # Split context and dummy answer into keyed sentences
228
+ #document_keys = split_into_keyed_sentences(context, "0")
229
+
230
+ #print(f"Query:{query}\n====================================================================")
231
+ context,response = generate_response_rag(query,domain) #deepseek-r1-distill-llama-70b llama3-70b-8192
232
+
233
+ # Split context and dummy answer into keyed sentences
234
+ document_keys = split_into_keyed_sentences(context, "0")
235
+ #print(f"\n====================================\Generator Response:{response}")
236
+ #For deepseek
237
+ #print("Before Curated:",response)
238
+ response=response[response.find("**Answer"):].replace("**Answer","");
239
+
240
+ print(f"Response for Generator LLM:{response}")
241
+
242
+ response_keys = split_into_keyed_sentences(response, "")
243
+ # Rebuild sections for prompt
244
+ documents_formatted = "\n".join([f"{k}. {v}" for k, v in document_keys.items()])
245
+ response_formatted = "\n".join([f"{k}. {v}" for k, v in response_keys.items()])
246
+
247
+ '''print(f"\n====================================================================")
248
+ print(f"documents_formatted:{documents_formatted}")
249
+ print(f"\n====================================================================")
250
+ print(f"response_formatted:{response_formatted}")
251
+ print(f"\n====================================================================")'''
252
+
253
+
254
+ prompt = f"""I asked someone to answer a question based on one or more documents.
255
+ Your task is to review their response and assess whether or not each sentence
256
+ in that response is supported by text in the documents. And if so, which
257
+ sentences in the documents provide that support. You will also tell me which
258
+ of the documents contain useful information for answering the question, and
259
+ which of the documents the answer was sourced from.
260
+ Here are the documents, each of which is split into sentences. Alongside each
261
+ sentence is an associated key, such as ’0a.’ or ’0b.’, that you can use to refer
262
+ to it:
263
+ '''
264
+ {documents_formatted}
265
+ '''
266
+ The question was:
267
+ '''
268
+ {query}
269
+ '''
270
+ Here is their response, split into sentences. Alongside each sentence is
271
+ an associated key, such as ’a.’ or ’b.’, that you can use to refer to it. Note
272
+ that these keys are unique to the response, and are not related to the keys
273
+ in the documents:
274
+ '''
275
+ {response_formatted}
276
+ '''
277
+ You must respond with a JSON object matching this schema:
278
+ '''
279
+ {{
280
+ "relevance_explanation": string,
281
+ "all_relevant_sentence_keys": [string],
282
+ "overall_supported_explanation": string,
283
+ "overall_supported": boolean,
284
+ "sentence_support_information": [
285
+ {{
286
+ "response_sentence_key": string,
287
+ "explanation": string,
288
+ "supporting_sentence_keys": [string],
289
+ "fully_supported": boolean
290
+ }},
291
+ ],
292
+ "all_utilized_sentence_keys": [string]
293
+ }}
294
+ '''
295
+ The relevance_explanation field is a string explaining which documents
296
+ contain useful information for answering the question. Provide a step-by-step
297
+ breakdown of information provided in the documents and how it is useful for
298
+ answering the question.
299
+ The all_relevant_sentence_keys field is a list of all document sentence keys
300
+ (e.g. ’0a’) that are relevant to the question. Include every sentence that is
301
+ useful and relevant to the question, even if it was not used in the response,
302
+ or if only parts of the sentence are useful. Ignore the provided response when
303
+ making this judgement and base your judgement solely on the provided documents
304
+ and question. Omit sentences that, if removed from the document, would not
305
+ impact someone’s ability to answer the question.
306
+ The overall_supported_explanation field is a string explaining why the response
307
+ *as a whole* is or is not supported by the documents. In this field, provide a
308
+ step-by-step breakdown of the claims made in the response and the support (or
309
+ lack thereof) for those claims in the documents. Begin by assessing each claim
310
+ separately, one by one; don’t make any remarks about the response as a whole
311
+ until you have assessed all the claims in isolation.
312
+ The overall_supported field is a boolean indicating whether the response as a
313
+ whole is supported by the documents. This value should reflect the conclusion
314
+ you drew at the end of your step-by-step breakdown in overall_supported_explanation.
315
+ In the sentence_support_information field, provide information about the support
316
+ *for each sentence* in the response.
317
+ The sentence_support_information field is a list of objects, one for each sentence
318
+ in the response. Each object MUST have the following fields:
319
+ - response_sentence_key: a string identifying the sentence in the response.
320
+ This key is the same as the one used in the response above.
321
+ - explanation: a string explaining why the sentence is or is not supported by the
322
+ documents.
323
+ - supporting_sentence_keys: keys (e.g. ’0a’) of sentences from the documents that
324
+ support the response sentence. If the sentence is not supported, this list MUST
325
+ be empty. If the sentence is supported, this list MUST contain one or more keys.
326
+ In special cases where the sentence is supported, but not by any specific sentence,
327
+ you can use the string "supported_without_sentence" to indicate that the sentence
328
+ is generally supported by the documents. Consider cases where the sentence is
329
+ expressing inability to answer the question due to lack of relevant information in
330
+ the provided context as "supported_without_sentence". In cases where the sentence
331
+ is making a general statement (e.g. outlining the steps to produce an answer, or
332
+ summarizing previously stated sentences, or a transition sentence), use the
333
+ string "general". In cases where the sentence is correctly stating a well-known fact,
334
+ like a mathematical formula, use the string "well_known_fact". In cases where the
335
+ sentence is performing numerical reasoning (e.g. addition, multiplication), use
336
+ the string "numerical_reasoning".
337
+ - fully_supported: a boolean indicating whether the sentence is fully supported by
338
+ the documents.
339
+ - This value should reflect the conclusion you drew at the end of your step-by-step
340
+ breakdown in explanation.
341
+ - If supporting_sentence_keys is an empty list, then fully_supported must be false.
343
+ - Otherwise, use fully_supported to clarify whether everything in the response
344
+ sentence is fully supported by the document text indicated in supporting_sentence_keys
345
+ (fully_supported = true), or whether the sentence is only partially or incompletely
346
+ supported by that document text (fully_supported = false).
347
+ The all_utilized_sentence_keys field is a list of all sentences keys (e.g. ’0a’) that
348
+ were used to construct the answer. Include every sentence that either directly supported
349
+ the answer, or was implicitly used to construct the answer, even if it was not used
350
+ in its entirety. Omit sentences that were not used, and could have been removed from
351
+ the documents without affecting the answer.
352
+ You must respond with a valid JSON string. Use escapes for quotes, e.g. ‘\\"‘, and
353
+ newlines, e.g. ‘\\n‘. Do not write anything before or after the JSON string. Do not
354
+ wrap the JSON string in backticks like ‘‘‘ or ‘‘‘json.
355
+ As a reminder: your task is to review the response and assess which documents contain
356
+ useful information pertaining to the question, and how each sentence in the response
357
+ is supported by the text in the documents.\
358
+ """
359
+
360
+ # Step 3: Call the LLM
361
+ chat_completion = client.chat.completions.create(
362
+ messages=[
363
+ {"role": "user", "content": prompt}
364
+ ],
365
+ model="meta-llama/llama-4-maverick-17b-128e-instruct", #deepseek-r1-distill-llama-70b llama3-70b-8192 meta-llama/llama-4-maverick-17b-128e-instruct
366
+ )
367
+
368
+ return documents_formatted,chat_completion.choices[0].message.content.strip()
369
+
370
+ '''chat_completion = openai.chat.completions.create(
371
+ messages=[
372
+ {"role":"user",
373
+ "content":prompt}
374
+ ],
375
+ model="gpt-4o",
376
+ max_tokens=1024,
377
+
378
+ )
379
+ return documents_formatted,chat_completion.choices[0].message.content'''
380
+
381
+
382
+ def extract_retrieved_sentence_keys(document_text: str) -> list[str]:
383
+ """
384
+ Extracts sentence keys like '0a.', '0b.', etc. from a formatted document string.
385
+
386
+ Parameters:
387
+ - document_text (str): full text of document with sentence keys
388
+
389
+ Returns:
390
+ - List of unique sentence keys in the order they appear
391
+ """
392
+ # Match pattern like 0a., 0b., 0z., 0{., 0|., etc.
393
+ pattern = r'\b0[\w\{\|\}~€‚]\.'
394
+
395
+ matches = re.findall(pattern, document_text)
396
+ return list(dict.fromkeys(matches)) # Removes duplicates while preserving order
397
+
398
+ def compute_ragbench_metrics(judge_response: dict, retrieved_sentence_keys: list[str]) -> dict:
399
+ """
400
+ Computes RAGBench-style metrics from Judge LLM response.
401
+
402
+ Parameters:
403
+ - judge_response (dict): JSON response from Judge LLM
404
+ - retrieved_sentence_keys (list of str): all sentence keys from the retrieved documents
405
+
406
+ Returns:
407
+ - Dictionary with Context Relevance, Context Utilization, Completeness, and Adherence
408
+ """
409
+
410
+ R = set(judge_response.get("all_relevant_sentence_keys", [])) # Relevant sentences
411
+ U = set(judge_response.get("all_utilized_sentence_keys", [])) # Utilized sentences
412
+ intersection_RU = R & U
413
+
414
+ total_retrieved = len(retrieved_sentence_keys)
415
+ len_R = len(R)
416
+ len_U = len(U)
417
+ len_intersection = len(intersection_RU)
418
+
419
+ # Context Relevance: fraction of retrieved context that is relevant
420
+ context_relevance = len_R / total_retrieved if total_retrieved else 0.0
421
+
422
+ # Context Utilization: fraction of retrieved context that was used
423
+ context_utilization = len_U / total_retrieved if total_retrieved else 0.0
424
+
425
+ # Completeness: fraction of relevant content that was used
426
+ completeness = len_intersection / len_R if len_R else 0.0
427
+
428
+ # Adherence: 1 if all response sentences are fully supported, else 0
429
+ is_fully_supported = all(s.get("fully_supported", False)
430
+ for s in judge_response.get("sentence_support_information", []))
431
+ adherence = 1.0 if is_fully_supported and judge_response.get("overall_supported", False) else 0.0
432
+
433
+ return {
434
+ "Context Relevance": round(context_relevance, 4),
435
+ "Context Utilization": round(context_utilization, 4),
436
+ "Completeness": round(completeness, 4),
437
+ "Adherence": adherence
438
+ }
439
+
440
+
441
+ def evaluate_rag_pipeline(domain, q_indices):
442
+ import torch
443
+ import numpy as np
444
+ from sklearn.metrics import mean_squared_error, roc_auc_score
445
+
446
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
447
+
448
+ def safe_append(gt_list, pred_list, gt_val, pred_val):
449
+ if gt_val is not None and pred_val is not None:
450
+ gt_list.append(gt_val)
451
+ pred_list.append(pred_val)
452
+
453
+ def clean_and_parse_json_block(text):
454
+ # Strip markdown-style code block if present
455
+ #text = text.strip().strip("`").strip()
456
+ code_block_match = re.search(r"```(?:json)?\s*([\s\S]*?)\s*```", text)
457
+ if code_block_match:
458
+ text = code_block_match.group(1).strip()
459
+
460
+ # Remove invalid/control characters that break decoding
461
+ text = re.sub(r"[^\x20-\x7E\n\t]", "", text)
462
+
463
+ try:
464
+ return json.loads(text)
465
+ except json.JSONDecodeError as e:
466
+ print("❌ JSON Decode Error:", e)
467
+ print("⚠️ Cleaned text:\n", text)
468
+ raise
469
+
470
+
471
+ gt_relevance, pred_relevance = [], []
472
+ gt_utilization, pred_utilization = [], []
473
+ gt_completeness, pred_completeness = [], []
474
+ gt_adherence, pred_adherence = [], []
475
+
476
+ if(domain=="Legal"):
477
+ dataset = legal_dataset
478
+ elif(domain=="Medical"):
479
+ dataset = med_dataset
480
+ elif(domain=="GK"):
481
+ dataset = gk_dataset
482
+ elif(domain=="CS"):
483
+ dataset = cs_dataset
484
+ elif(domain=="Finance"):
485
+ dataset = fin_dataset
486
+
487
+ for i in q_indices:
488
+ query = dataset[i]['question']
489
+ print(f"\n\n\nQuery:{i}.{query}\n====================================================================")
490
+ #print(f"\ndomain:{domain}====================================================================")
491
+ documents_formatted, response = jugde_response_rag(query, domain)
492
+ judge_response = clean_and_parse_json_block(response)
493
+ print(f"\ndocuments_formatted:{documents_formatted}")
494
+ print(f"\n======================================================================\nResponse:{judge_response}")
495
+ retrieved_sentences = extract_retrieved_sentence_keys(documents_formatted)
496
+ predicted = compute_ragbench_metrics(judge_response, retrieved_sentences)
497
+
498
+ # GT values
499
+ gt_r = dataset[i].get('relevance_score')
500
+ gt_u = dataset[i].get('utilization_score')
501
+ gt_c = dataset[i].get('completeness_score')
502
+ gt_a = dataset[i].get('gpt3_adherence')
503
+
504
+ safe_append(gt_relevance, pred_relevance, gt_r, predicted['Context Relevance'])
505
+ safe_append(gt_utilization, pred_utilization, gt_u, predicted['Context Utilization'])
506
+ safe_append(gt_completeness, pred_completeness, gt_c, predicted['Completeness'])
507
+ if gt_a is not None and predicted['Adherence'] is not None:
508
+ safe_append(gt_adherence, pred_adherence, int(gt_a), int(predicted['Adherence']))
509
+
510
+ def compute_rmse(gt, pred):
511
+ return round(np.sqrt(np.mean((np.array(gt) - np.array(pred)) ** 2)), 4)
512
+
513
+ result = {
514
+ "Context Relevance": compute_rmse(gt_relevance, pred_relevance),
515
+ "Context Utilization": compute_rmse(gt_utilization, pred_utilization),
516
+ "Completeness": compute_rmse(gt_completeness, pred_completeness),
517
+ }
518
+
519
+ if len(set(gt_adherence)) == 2:
520
+ result["Adherence"] = compute_rmse(gt_adherence, pred_adherence)
521
+ result["AUC-ROC (Adherence)"] = round(roc_auc_score(gt_adherence, pred_adherence), 4)
522
+ else:
523
+ result["Adherence"] = compute_rmse(gt_adherence, pred_adherence)
524
+ result["AUC-ROC (Adherence)"] = "N/A - one class only"
525
+
526
+ return result
527
+
528
+
529
+
530
+ # Updated wrapper
531
+ def evaluate_rag_gradio(domain, q_indices_str):
532
+ # Capture logs
533
+ log_stream = io.StringIO()
534
+ sys.stdout = log_stream
535
+
536
+ try:
537
+ # Parse comma-separated indices
538
+ q_indices = [int(x.strip()) for x in q_indices_str.split(",") if x.strip().isdigit()]
539
+ results = evaluate_rag_pipeline(domain, q_indices)
540
+
541
+ logs = log_stream.getvalue()
542
+ return results, logs
543
+
544
+ except Exception as e:
545
+ traceback.print_exc()
546
+ return {"error": str(e)}, log_stream.getvalue()
547
+
548
+ finally:
549
+ sys.stdout = sys.__stdout__ # Restore stdout
550
+
551
+ # Gradio interface
552
+ iface = gr.Interface(
553
+ fn=evaluate_rag_gradio,
554
+ inputs=[
555
+ gr.Dropdown(choices=["Legal", "Medical", "GK", "CS", "Finance"], label="Domain"),
556
+ gr.Textbox(label="Comma-separated Query Indices (e.g. 89,121,245)", lines=1),
557
+ ],
558
+ outputs=[
559
+ gr.JSON(label="Evaluation Metrics (RMSE & AUC-ROC)"),
560
+ gr.Textbox(label="Execution Log", lines=10, interactive=True),
561
+ ],
562
+ title="RAG Evaluation Dashboard",
563
+ description="Evaluate your RAG pipeline across selected queries using GPT-based generation and judgment."
564
+ )
565
+
566
+ # Launch app
567
+ iface.launch(server_name="0.0.0.0", server_port=7860, debug=True)
bkp_app.py ADDED
@@ -0,0 +1,497 @@
1
+ # -*- coding: utf-8 -*-
2
+ """Deploy_CapstoneRagBench.ipynb
3
+
4
+ Automatically generated by Colab.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1OG-77VqKwz3509_osgNgSeOMJ9G6RvB4
8
+ """
9
+
10
+ # For Legal
11
+
12
+ from datasets import load_from_disk
13
+ from transformers import AutoTokenizer, AutoModel
14
+ import faiss
15
+ import numpy as np
16
+ import torch
17
+ from datasets import load_dataset, Dataset, get_dataset_config_names
18
+ import os
19
+ from groq import Groq
20
+ from sentence_transformers import CrossEncoder
21
+ import requests
22
+ import uuid
23
+ import re
24
+ import gradio as gr
25
+ import json
26
+ import torch
27
+ import numpy as np
28
+ from sklearn.metrics import mean_squared_error, roc_auc_score
29
+ import gradio as gr
30
+ import io
31
+ import sys
32
+ import traceback
33
+
34
+
35
+ def retrieve_top_k(query,domain='legal', model_name='nlpaueb/legal-bert-base-uncased', k=8):
36
+ # Load tokenizer and model
37
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
38
+ model = AutoModel.from_pretrained(model_name).to(device)
39
+ model.eval()
40
+
41
+ #print(f"In retrive_top_k Query:{query}")
42
+ # Tokenize and embed query using mean pooling
43
+ inputs = tokenizer(query, return_tensors="pt", padding=True, truncation=True, max_length=512)
44
+ inputs = {k: v.to(device) for k, v in inputs.items()}
45
+ with torch.no_grad():
46
+ outputs = model(**inputs)
47
+ query_embedding = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
48
+
49
+ # Load FAISS index and dataset
50
+ index_path = f"{domain}_index/faiss.index"
51
+ dataset_path = f"{domain}_dataset"
52
+
53
+ faiss_index = faiss.read_index(index_path)
54
+ dataset = load_from_disk(dataset_path)
55
+
56
+ # Perform FAISS search
57
+ D, I = faiss_index.search(query_embedding.astype('float32'), k)
58
+
59
+ # Retrieve top-k matching chunks
60
+ top_chunks = [dataset[int(idx)]['text'] for idx in I[0]]
61
+ return top_chunks
62
+
63
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
64
+ #print(device)
65
+
66
+ dataset = load_dataset("rungalileo/ragbench", "cuad", split="test")
67
+
68
+ client = Groq(
69
+ api_key=os.environ.get("GROQ_API_KEY"),  # read the key from the environment (e.g. a Space secret); never commit API keys
70
+ )
71
+
72
+ # Load BGE reranker
73
+ reranker = CrossEncoder("BAAI/bge-reranker-base", max_length=512)
74
+
75
+ def rerank_documents_bge(query, documents, top_n=5, return_scores=False):
76
+ """
77
+ Rerank documents using BAAI/bge-reranker-base CrossEncoder.
78
+
79
+ Args:
80
+ query (str): The query string.
81
+ documents (List[str]): List of candidate documents.
82
+ top_n (int): Number of top results to return.
83
+ return_scores (bool): Whether to return scores along with documents.
84
+
85
+ Returns:
86
+ List[str] or List[Tuple[str, float]]
87
+ """
88
+ if not documents:
89
+ return []
90
+
91
+ # Prepare (query, doc) pairs
92
+ pairs = [(query, doc) for doc in documents]
93
+
94
+ # Predict relevance scores
95
+ scores = reranker.predict(pairs, batch_size=16)
96
+
97
+ # Sort by score descending
98
+ reranked = sorted(zip(documents, scores), key=lambda x: x[1], reverse=True)
99
+
100
+ if return_scores:
101
+ return reranked[:top_n]
102
+ else:
103
+ return [doc for doc, _ in reranked[:top_n]]
104
+
105
+
106
+ def generate_response_rag(query,model,index_dir="legal_index"):
107
+ # Step 1: Retrieve top-k context chunks using your FAISS setup
108
+ top_chunks = retrieve_top_k(query,'legal', "nlpaueb/legal-bert-base-uncased")
109
+
110
+ # Step 2: Rerank retrieved documents using cross-encoder
111
+ #reranked_chunks = rerank_documents(query, top_chunks, top_n=15)
112
+ #rerank_and_filter_chunks = filter_by_faithfulness(query, reranked_chunks)
113
+
114
+ #reranked_chunks = rerank_and_filter_chunks
115
+ reranked_chunks_bge = rerank_documents_bge(query, top_chunks, top_n=5)
116
+ #sum_context = summarize_context("\n\n".join(reranked_chunks_bge))
117
+
118
+
119
+
120
+ final_context = reranked_chunks_bge
121
+ # Step 2: Prepare context and RAG-style prompt
122
+ context = "\n\n".join(final_context)
123
+
124
+ #print(f"Context:{context}")
125
+ prompt = f"""You are a helpful legal assistant.
126
+ Use the following context to answer the question.
127
+ Using only the information from the retrieved context, answer the following question. If the answer cannot be derived, say "I don't know." Always prefix your answer with **Answer:**
128
+
129
+ Context:{context}
130
+
131
+ Question: {query}
132
+ Answer:"""
133
+
134
+ # Step 3: Call the LLM (LLaMA3 or any chat model)
135
+ chat_completion = client.chat.completions.create(
136
+ messages=[
137
+ {"role": "user", "content": prompt}
138
+ ],
139
+ model=model,#"gemma2-9b-it"#"qwen/qwen3-32b"#deepseek-r1-distill-llama-70b",#"llama3-70b-8192", # mistral-saba-24b
140
+ temperature=0.0
141
+ )
142
+
143
+ return chat_completion.choices[0].message.content.strip()
144
+
145
+ '''response = openai.chat.completions.create(
146
+ model="gpt-3.5-turbo",
147
+ messages=[
148
+ {"role": "user", "content": prompt}
149
+ ],
150
+ temperature=0.0,
151
+ max_tokens=1024
152
+ )
153
+
154
+ return response.choices[0].message.content'''
155
+
156
+ #JUDGE LLM
157
+
158
+ def split_into_keyed_sentences(text, prefix):
159
+ """Splits text into sentences with keys like '0a.', '0b.', or 'a.', 'b.', etc."""
160
+ # Basic sentence tokenizer with keys
161
+ sentences = re.split(r'(?<=[.?!])\s+', text.strip())
162
+ keyed = {}
163
+ for i, s in enumerate(sentences):
164
+ key = f"{prefix}{chr(97 + i)}" # 'a', 'b', ...
165
+ if s:
166
+ keyed[key] = s.strip()
167
+ return keyed
168
+
169
+
170
+ def judge_response_rag(query, embedder="nlpaueb/legal-bert-base-uncased", domain="legal", k=5):
171
+
172
+ top_chunks = retrieve_top_k(query)
173
+
174
+ top_chunks = [chunk[0] if isinstance(chunk, tuple) else chunk for chunk in top_chunks]
175
+
176
+ # Step 2: Prepare context and RAG-style prompt
177
+ context = "\n\n".join(top_chunks)
178
+
179
+ # Split context and dummy answer into keyed sentences
180
+ document_keys = split_into_keyed_sentences(context, "0")
181
+
182
+ #print(f"Query:{query}\n====================================================================")
183
+ response = generate_response_rag(query,model="llama3-70b-8192") #deepseek-r1-distill-llama-70b llama3-70b-8192
184
+ #print(f"\n====================================\Generator Response:{response}")
185
+ #For deepseek
186
+ #print("Before Curated:",response)
187
+ response = response[response.find("**Answer"):].replace("**Answer", "")  # keep only the text from the **Answer marker onward
188
+
189
+ print(f"Response for Generator LLM:{response}")
190
+
191
+ response_keys = split_into_keyed_sentences(response, "")
192
+ # Rebuild sections for prompt
193
+ documents_formatted = "\n".join([f"{k}. {v}" for k, v in document_keys.items()])
194
+ response_formatted = "\n".join([f"{k}. {v}" for k, v in response_keys.items()])
195
+
196
+ '''print(f"\n====================================================================")
197
+ print(f"documents_formatted:{documents_formatted}")
198
+ print(f"\n====================================================================")
199
+ print(f"response_formatted:{response_formatted}")
200
+ print(f"\n====================================================================")'''
201
+
202
+
203
+ prompt = f"""I asked someone to answer a question based on one or more documents.
204
+ Your task is to review their response and assess whether or not each sentence
205
+ in that response is supported by text in the documents. And if so, which
206
+ sentences in the documents provide that support. You will also tell me which
207
+ of the documents contain useful information for answering the question, and
208
+ which of the documents the answer was sourced from.
209
+ Here are the documents, each of which is split into sentences. Alongside each
210
+ sentence is an associated key, such as ’0a.’ or ’0b.’, that you can use to refer
211
+ to it:
212
+ '''
213
+ {documents_formatted}
214
+ '''
215
+ The question was:
216
+ '''
217
+ {query}
218
+ '''
219
+ Here is their response, split into sentences. Alongside each sentence is
220
+ associated key, such as ’a.’ or ’b.’ that you can use to refer to it. Note
221
+ that these keys are unique to the response, and are not related to the keys
222
+ in the documents:
223
+ '''
224
+ {response_formatted}
225
+ '''
226
+ You must respond with a JSON object matching this schema:
227
+ '''
228
+ {{
229
+ "relevance_explanation": string,
230
+ "all_relevant_sentence_keys": [string],
231
+ "overall_supported_explanation": string,
232
+ "overall_supported": boolean,
233
+ "sentence_support_information": [
234
+ {{
235
+ "response_sentence_key": string,
236
+ "explanation": string,
237
+ "supporting_sentence_keys": [string],
238
+ "fully_supported": boolean
239
+ }},
240
+ ],
241
+ "all_utilized_sentence_keys": [string]
242
+ }}
243
+ '''
244
+ The relevance_explanation field is a string explaining which documents
245
+ contain useful information for answering the question. Provide a step-by-step
246
+ breakdown of information provided in the documents and how it is useful for
247
+ answering the question.
248
+ The all_relevant_sentence_keys field is a list of all document sentence keys
249
+ (e.g. ’0a’) that are relevant to the question. Include every sentence that is
250
+ useful and relevant to the question, even if it was not used in the response,
251
+ or if only parts of the sentence are useful. Ignore the provided response when
252
+ making this judgement and base your judgement solely on the provided documents
253
+ and question. Omit sentences that, if removed from the document, would not
254
+ impact someone’s ability to answer the question.
255
+ The overall_supported_explanation field is a string explaining why the response
256
+ *as a whole* is or is not supported by the documents. In this field, provide a
257
+ step-by-step breakdown of the claims made in the response and the support (or
258
+ lack thereof) for those claims in the documents. Begin by assessing each claim
259
+ separately, one by one; don’t make any remarks about the response as a whole
260
+ until you have assessed all the claims in isolation.
261
+ The overall_supported field is a boolean indicating whether the response as a
262
+ whole is supported by the documents. This value should reflect the conclusion
263
+ you drew at the end of your step-by-step breakdown in overall_supported_explanation.
264
+ In the sentence_support_information field, provide information about the support
265
+ *for each sentence* in the response.
266
+ The sentence_support_information field is a list of objects, one for each sentence
267
+ in the response. Each object MUST have the following fields:
268
+ - response_sentence_key: a string identifying the sentence in the response.
269
+ This key is the same as the one used in the response above.
270
+ - explanation: a string explaining why the sentence is or is not supported by the
271
+ documents.
272
+ - supporting_sentence_keys: keys (e.g. ’0a’) of sentences from the documents that
273
+ support the response sentence. If the sentence is not supported, this list MUST
274
+ be empty. If the sentence is supported, this list MUST contain one or more keys.
275
+ In special cases where the sentence is supported, but not by any specific sentence,
276
+ you can use the string "supported_without_sentence" to indicate that the sentence
277
+ is generally supported by the documents. Consider cases where the sentence is
278
+ expressing inability to answer the question due to lack of relevant information in
279
+ the provided context as "supported_without_sentence". In cases where the sentence
280
+ is making a general statement (e.g. outlining the steps to produce an answer, or
281
+ summarizing previously stated sentences, or a transition sentence), use the
282
+ sting "general".In cases where the sentence is correctly stating a well-known fact,
283
+ like a mathematical formula, use the string "well_known_fact". In cases where the
284
+ sentence is performing numerical reasoning (e.g. addition, multiplication), use
285
+ the string "numerical_reasoning".
286
+ - fully_supported: a boolean indicating whether the sentence is fully supported by
287
+ the documents.
288
+ - This value should reflect the conclusion you drew at the end of your step-by-step
289
+ breakdown in explanation.
290
+ - If supporting_sentence_keys is an empty list, then fully_supported must be false.
291
+
292
+ - Otherwise, use fully_supported to clarify whether everything in the response
293
+ sentence is fully supported by the document text indicated in supporting_sentence_keys
294
+ (fully_supported = true), or whether the sentence is only partially or incompletely
295
+ supported by that document text (fully_supported = false).
296
+ The all_utilized_sentence_keys field is a list of all sentence keys (e.g. ’0a’) that
297
+ were used to construct the answer. Include every sentence that either directly supported
298
+ the answer, or was implicitly used to construct the answer, even if it was not used
299
+ in its entirety. Omit sentences that were not used, and could have been removed from
300
+ the documents without affecting the answer.
301
+ You must respond with a valid JSON string. Use escapes for quotes, e.g. `\\"`, and
302
+ newlines, e.g. `\\n`. Do not write anything before or after the JSON string. Do not
303
+ wrap the JSON string in backticks like ``` or ```json.
304
+ As a reminder: your task is to review the response and assess which documents contain
305
+ useful information pertaining to the question, and how each sentence in the response
306
+ is supported by the text in the documents.\
307
+ """
308
+
309
+ # Step 3: Call the LLM
310
+ chat_completion = client.chat.completions.create(
311
+ messages=[
312
+ {"role": "user", "content": prompt}
313
+ ],
314
+ model="meta-llama/llama-4-maverick-17b-128e-instruct", #deepseek-r1-distill-llama-70b llama3-70b-8192 meta-llama/llama-4-maverick-17b-128e-instruct
315
+ )
316
+
317
+ return documents_formatted,chat_completion.choices[0].message.content.strip()
318
+
319
+ '''chat_completion = openai.chat.completions.create(
320
+ messages=[
321
+ {"role":"user",
322
+ "content":prompt}
323
+ ],
324
+ model="gpt-4o",
325
+ max_tokens=1024,
326
+
327
+ )
328
+ return documents_formatted,chat_completion.choices[0].message.content'''
329
+
330
+ def extract_retrieved_sentence_keys(document_text: str) -> list[str]:
331
+ """
332
+ Extracts sentence keys like '0a.', '0b.', etc. from a formatted document string.
333
+
334
+ Parameters:
335
+ - document_text (str): full text of document with sentence keys
336
+
337
+ Returns:
338
+ - List of unique sentence keys in the order they appear
339
+ """
340
+ # Match pattern like 0a., 0b., 0z., 0{., 0|., etc.
341
+ pattern = r'\b0[\w\{\|\}~€‚]\.'
342
+
343
+ matches = re.findall(pattern, document_text)
344
+ return list(dict.fromkeys(matches)) # Removes duplicates while preserving order
345
+
346
+ def compute_ragbench_metrics(judge_response: dict, retrieved_sentence_keys: list[str]) -> dict:
347
+ """
348
+ Computes RAGBench-style metrics from Judge LLM response.
349
+
350
+ Parameters:
351
+ - judge_response (dict): JSON response from Judge LLM
352
+ - retrieved_sentence_keys (list of str): all sentence keys from the retrieved documents
353
+
354
+ Returns:
355
+ - Dictionary with Context Relevance, Context Utilization, Completeness, and Adherence
356
+ """
357
+
358
+ R = set(judge_response.get("all_relevant_sentence_keys", [])) # Relevant sentences
359
+ U = set(judge_response.get("all_utilized_sentence_keys", [])) # Utilized sentences
360
+ intersection_RU = R & U
361
+
362
+ total_retrieved = len(retrieved_sentence_keys)
363
+ len_R = len(R)
364
+ len_U = len(U)
365
+ len_intersection = len(intersection_RU)
366
+
367
+ # Context Relevance: fraction of retrieved context that is relevant
368
+ context_relevance = len_R / total_retrieved if total_retrieved else 0.0
369
+
370
+ # Context Utilization: fraction of retrieved context that was used
371
+ context_utilization = len_U / total_retrieved if total_retrieved else 0.0
372
+
373
+ # Completeness: fraction of relevant content that was used
374
+ completeness = len_intersection / len_R if len_R else 0.0
375
+
376
+ # Adherence: 1 if all response sentences are fully supported, else 0
377
+ is_fully_supported = all(s.get("fully_supported", False)
378
+ for s in judge_response.get("sentence_support_information", []))
379
+ adherence = 1.0 if is_fully_supported and judge_response.get("overall_supported", False) else 0.0
380
+
381
+ return {
382
+ "Context Relevance": round(context_relevance, 4),
383
+ "Context Utilization": round(context_utilization, 4),
384
+ "Completeness": round(completeness, 4),
385
+ "Adherence": adherence
386
+ }
387
+
388
+
389
+ def compute_rmse(gt, pred):
390
+ return round(np.sqrt(np.mean((np.array(gt) - np.array(pred)) ** 2)), 4)
391
+
392
+
393
+ def evaluate_rag_pipeline(q_indices):
394
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
395
+
396
+ def safe_append(gt_list, pred_list, gt_val, pred_val):
397
+ if gt_val is not None and pred_val is not None:
398
+ gt_list.append(gt_val)
399
+ pred_list.append(pred_val)
400
+
401
+ def clean_and_parse_json_block(text):
402
+ # Strip markdown-style code block if present
403
+ #text = text.strip().strip("`").strip()
404
+ code_block_match = re.search(r"```(?:json)?\s*([\s\S]*?)\s*```", text)
405
+ if code_block_match:
406
+ text = code_block_match.group(1).strip()
407
+
408
+ # Remove invalid/control characters that break decoding
409
+ text = re.sub(r"[^\x20-\x7E\n\t]", "", text)
410
+
411
+ try:
412
+ return json.loads(text)
413
+ except json.JSONDecodeError as e:
414
+ print("❌ JSON Decode Error:", e)
415
+ print("⚠️ Cleaned text:\n", text)
416
+ raise
417
+
418
+
419
+ gt_relevance, pred_relevance = [], []
420
+ gt_utilization, pred_utilization = [], []
421
+ gt_completeness, pred_completeness = [], []
422
+ gt_adherence, pred_adherence = [], []
423
+
424
+ for i in q_indices:
425
+ query = dataset[i]['question']
426
+ print(f"\n\n\nQuery:{i}.{query}\n====================================================================")
427
+ documents_formatted, response = judge_response_rag(
428
+ query, embedder="nlpaueb/legal-bert-base-uncased", domain="legal")
429
+ judge_response = clean_and_parse_json_block(response)
430
+ print(f"\n======================================================================\nResponse:{judge_response}")
431
+ retrieved_sentences = extract_retrieved_sentence_keys(documents_formatted)
432
+ predicted = compute_ragbench_metrics(judge_response, retrieved_sentences)
433
+
434
+ # GT values
435
+ gt_r = dataset[i].get('relevance_score')
436
+ gt_u = dataset[i].get('utilization_score')
437
+ gt_c = dataset[i].get('completeness_score')
438
+ gt_a = dataset[i].get('gpt3_adherence')
439
+
440
+ safe_append(gt_relevance, pred_relevance, gt_r, predicted['Context Relevance'])
441
+ safe_append(gt_utilization, pred_utilization, gt_u, predicted['Context Utilization'])
442
+ safe_append(gt_completeness, pred_completeness, gt_c, predicted['Completeness'])
443
+ if gt_a is not None and predicted['Adherence'] is not None:
444
+ safe_append(gt_adherence, pred_adherence, int(gt_a), int(predicted['Adherence']))
445
+
446
+ def compute_rmse(gt, pred):
447
+ return round(np.sqrt(np.mean((np.array(gt) - np.array(pred)) ** 2)), 4)
448
+
449
+ result = {
450
+ "Context Relevance": compute_rmse(gt_relevance, pred_relevance),
451
+ "Context Utilization": compute_rmse(gt_utilization, pred_utilization),
452
+ "Completeness": compute_rmse(gt_completeness, pred_completeness),
453
+ }
454
+
455
+ if len(set(gt_adherence)) == 2:
456
+ result["Adherence"] = compute_rmse(gt_adherence, pred_adherence)
457
+ result["AUC-ROC (Adherence)"] = round(roc_auc_score(gt_adherence, pred_adherence), 4)
458
+ else:
459
+ result["Adherence"] = compute_rmse(gt_adherence, pred_adherence)
460
+ result["AUC-ROC (Adherence)"] = "N/A - one class only"
461
+
462
+ return result
463
+
464
+
465
+ # Wrapper to parse textbox input into list of ints
466
+ def evaluate_rag_gradio(q_indices_str):
467
+ # Capture printed logs
468
+ log_stream = io.StringIO()
469
+ sys.stdout = log_stream
470
+
471
+ try:
472
+ q_indices = [int(x.strip()) for x in q_indices_str.split(",") if x.strip().isdigit()]
473
+ results = evaluate_rag_pipeline(q_indices)
474
+
475
+ # Return metrics and logs
476
+ logs = log_stream.getvalue()
477
+ return results, logs
478
+
479
+ except Exception as e:
480
+ traceback.print_exc()
481
+ return {"error": str(e)}, log_stream.getvalue()
482
+
483
+ finally:
484
+ sys.stdout = sys.__stdout__
485
+
486
+ iface = gr.Interface(
487
+ fn=evaluate_rag_gradio,
488
+ inputs=gr.Textbox(label="Comma-separated Query Indices (e.g. 89,121,245)", lines=1),
489
+ outputs=[
490
+ gr.JSON(label="Evaluation Metrics (RMSE & AUC-ROC)"),
491
+ gr.Textbox(label="Execution Log", lines=5, interactive=True)
492
+ ],
493
+ title="RAG Evaluation Dashboard",
494
+ description="Evaluate your RAG pipeline across selected queries using GPT-based generation and judgment."
495
+ )
496
+
497
+ iface.launch(debug=True)
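compute_ragbench_metrics above reduces the judge output to four ratios: Context Relevance = |R| / retrieved, Context Utilization = |U| / retrieved, Completeness = |R ∩ U| / |R|, and a 0/1 Adherence flag. A small worked example with hypothetical sentence keys (four retrieved sentences 0a-0d; values are illustrative only):

# Hypothetical judge response used only to illustrate the ratios.
judge_response = {
    "all_relevant_sentence_keys": ["0a", "0b"],   # R: relevant to the question
    "all_utilized_sentence_keys": ["0a", "0c"],   # U: used to build the answer
    "overall_supported": True,
    "sentence_support_information": [{"response_sentence_key": "a", "fully_supported": True}],
}
retrieved = ["0a.", "0b.", "0c.", "0d."]

R = set(judge_response["all_relevant_sentence_keys"])
U = set(judge_response["all_utilized_sentence_keys"])
print(len(R) / len(retrieved))  # Context Relevance   = 0.5
print(len(U) / len(retrieved))  # Context Utilization = 0.5
print(len(R & U) / len(R))      # Completeness        = 0.5
# Adherence would be 1.0 here: every response sentence is fully supported and overall_supported is True.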
cs_dataset/data-00000-of-00001.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c81fcd283298c766efceed51005f94977eb042565a6d6e32a141af3516eddab
3
+ size 88920
cs_dataset/dataset_info.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "citation": "",
3
+ "description": "",
4
+ "features": {
5
+ "text": {
6
+ "dtype": "string",
7
+ "_type": "Value"
8
+ }
9
+ },
10
+ "homepage": "",
11
+ "license": ""
12
+ }
cs_dataset/state.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_data_files": [
3
+ {
4
+ "filename": "data-00000-of-00001.arrow"
5
+ }
6
+ ],
7
+ "_fingerprint": "ec44a3721c635a27",
8
+ "_format_columns": null,
9
+ "_format_kwargs": {},
10
+ "_format_type": null,
11
+ "_output_all_columns": false,
12
+ "_split": null
13
+ }
cs_index/faiss.index ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:781715fcdd5abaccf46e7df9e34cb8fe08cefa3f47fc4381c1530e83ad3d3cb6
3
+ size 370221
fin_dataset/data-00000-of-00001.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7b6bb5e029518500e5764893d7267aa86d93e7a0e8ceae7969c371f17b42e3fc
3
+ size 1504056
fin_dataset/dataset_info.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "citation": "",
3
+ "description": "",
4
+ "features": {
5
+ "text": {
6
+ "dtype": "string",
7
+ "_type": "Value"
8
+ }
9
+ },
10
+ "homepage": "",
11
+ "license": ""
12
+ }
fin_dataset/state.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_data_files": [
3
+ {
4
+ "filename": "data-00000-of-00001.arrow"
5
+ }
6
+ ],
7
+ "_fingerprint": "fe416e18cf3f19d0",
8
+ "_format_columns": null,
9
+ "_format_kwargs": {},
10
+ "_format_type": null,
11
+ "_output_all_columns": false,
12
+ "_split": null
13
+ }
fin_index/faiss.index ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fdb1e231b83001723586362b682f80487689ad9bb208a1c8dea3bade5d004cbd
3
+ size 6039597
gk_dataset/data-00000-of-00001.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2ff3d603dd8f386d4f55dfb696d9a486e29e2c948c7e4cb03291b3f1185e671
3
+ size 777424
gk_dataset/dataset_info.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "citation": "",
3
+ "description": "",
4
+ "features": {
5
+ "text": {
6
+ "dtype": "string",
7
+ "_type": "Value"
8
+ }
9
+ },
10
+ "homepage": "",
11
+ "license": ""
12
+ }
gk_dataset/state.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_data_files": [
3
+ {
4
+ "filename": "data-00000-of-00001.arrow"
5
+ }
6
+ ],
7
+ "_fingerprint": "00d8c8388a8ac73c",
8
+ "_format_columns": null,
9
+ "_format_kwargs": {},
10
+ "_format_type": null,
11
+ "_output_all_columns": false,
12
+ "_split": null
13
+ }
gk_index/faiss.index ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0c22862009c798b27b235f0af137915a98fad631649735dbf19a467e3f896be6
3
+ size 3526701
legal_dataset/data-00000-of-00001.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:85ff687742e949901491ed90e9babf78ced5c7dcd3a910986ab25bb5f390072b
3
+ size 4926576
legal_dataset/dataset_info.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "citation": "",
3
+ "description": "",
4
+ "features": {
5
+ "text": {
6
+ "dtype": "string",
7
+ "_type": "Value"
8
+ }
9
+ },
10
+ "homepage": "",
11
+ "license": ""
12
+ }
legal_dataset/state.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_data_files": [
3
+ {
4
+ "filename": "data-00000-of-00001.arrow"
5
+ }
6
+ ],
7
+ "_fingerprint": "f6b83182f5e3cfa3",
8
+ "_format_columns": null,
9
+ "_format_kwargs": {},
10
+ "_format_type": null,
11
+ "_output_all_columns": false,
12
+ "_split": null
13
+ }
legal_index/faiss.index ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:64c4157c3617e605acc8464d3cf1da1ea321c07a6a1bdb6c2675edec41d3a0ba
3
+ size 7978029
med_dataset/data-00000-of-00001.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eb7eaae975a43389ed3ccd143dfcfca1e61ad094e3064ec477f36f9cd47d11ad
3
+ size 2245824
med_dataset/dataset_info.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "citation": "",
3
+ "description": "",
4
+ "features": {
5
+ "text": {
6
+ "dtype": "string",
7
+ "_type": "Value"
8
+ }
9
+ },
10
+ "homepage": "",
11
+ "license": ""
12
+ }
med_dataset/state.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_data_files": [
3
+ {
4
+ "filename": "data-00000-of-00001.arrow"
5
+ }
6
+ ],
7
+ "_fingerprint": "e0ae8ccbcca935ea",
8
+ "_format_columns": null,
9
+ "_format_kwargs": {},
10
+ "_format_type": null,
11
+ "_output_all_columns": false,
12
+ "_split": null
13
+ }
med_index/faiss.index ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:678041534c1d3641eaaed36f5efca24094762a1454eb6bdd413c2973b94c5dff
3
+ size 11473965
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ gradio
2
+ transformers
3
+ sentence-transformers
4
+ faiss-cpu
5
+ torch
6
+ datasets
7
+ scikit-learn
8
+ groq
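With these dependencies installed (numpy arrives transitively; re, io and sys are standard library), the one thing the list cannot supply is the Groq credential. The client in the scripts should read it from the environment rather than from a hardcoded string; a minimal sketch, assuming a GROQ_API_KEY environment variable (set it in the shell locally, or as a Space secret of the same name):

import os
from groq import Groq

api_key = os.environ.get("GROQ_API_KEY")  # assumed variable name; set it before launching app.py
if not api_key:
    raise RuntimeError("GROQ_API_KEY is not set")
client = Groq(api_key=api_key)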