gaonkarrs committed on
Commit 5d34f4c · 0 Parent(s)

Proper root structure with app.py and requirements.txt

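The commit message also references a requirements.txt at the repository root, which is not shown in this diff. As a rough, assumed sketch only (package names inferred from the imports in app.py below, not the committed file), the dependency list would need to cover roughly:

gradio
datasets
transformers
torch
sentence-transformers
faiss-cpu
groq
scikit-learn
langchain
tqdm
numpy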
.gitattributes ADDED
@@ -0,0 +1,36 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.index filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,13 @@
1
+ ---
2
+ title: Rag Eval Dashboard
3
+ emoji: 🚀
4
+ colorFrom: gray
5
+ colorTo: pink
6
+ sdk: gradio
7
+ sdk_version: 5.36.2
8
+ app_file: app.py
9
+ pinned: false
10
+ short_description: RAGBench evaluation
11
+ ---
12
+
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,627 @@
1
+ from datasets import load_from_disk
2
+ from transformers import AutoTokenizer, AutoModel
3
+ import faiss
4
+ import numpy as np
5
+ import torch
6
+ import shutil  # used below to delete any previously built dataset directory
7
+ from tqdm import tqdm  # progress bar for the legal-domain embedding loop
8
+ from langchain.text_splitter import RecursiveCharacterTextSplitter  # assumed import path for the text splitter used below
9
+ import os
10
+ from datasets import load_dataset, Dataset, get_dataset_config_names
11
+ from sentence_transformers import SentenceTransformer
12
+ from groq import Groq
13
+ from sentence_transformers import CrossEncoder
14
+ import requests
15
+ import uuid
16
+ import re
17
+ import json
18
+ import gradio as gr
19
+ import io
20
+ import sys
21
+ import traceback
22
+
23
+ embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
24
+
25
+ def build_index_and_dataset(domain, subsets, embedder_type="sentence-transformers/all-MiniLM-L6-v2", legal=False):
26
+ dataset_path = f"{domain}_dataset"
27
+ index_path = f"{domain}_index/faiss.index"
28
+
29
+ # ❌ Always remove previous
30
+ if os.path.exists(dataset_path):
31
+ shutil.rmtree(dataset_path)
32
+ if os.path.exists(index_path):
33
+ os.remove(index_path)
34
+
35
+ print(f"🚀 Rebuilding dataset and index for domain: {domain}")
36
+
37
+ all_docs = []
38
+ for subset in subsets:
39
+ ds = load_dataset("rungalileo/ragbench", subset, split="test")
40
+ for item in ds:
41
+ if isinstance(item, dict) and "documents" in item and isinstance(item["documents"], list):
42
+ all_docs.extend(item["documents"])
43
+ elif isinstance(item, str):
44
+ all_docs.append(item)
45
+ all_docs = list(set(all_docs))
46
+
47
+ splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=50)
48
+ chunks = []
49
+ for doc in all_docs:
50
+ chunks.extend(splitter.split_text(doc))
51
+
52
+ if legal:
53
+ tokenizer = AutoTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")
54
+ model = AutoModel.from_pretrained("nlpaueb/legal-bert-base-uncased").to("cuda" if torch.cuda.is_available() else "cpu")
55
+ model.eval()
56
+ device = model.device
57
+ all_embeddings = []
58
+ for i in tqdm(range(0, len(chunks), 16), desc="Embedding Legal"):
59
+ batch = chunks[i:i+16]
60
+ inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=512)
61
+ inputs = {k: v.to(device) for k, v in inputs.items()}
62
+ with torch.no_grad():
63
+ outputs = model(**inputs)
64
+ batch_embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
65
+ all_embeddings.append(batch_embeddings)
66
+ embeddings = np.vstack(all_embeddings)
67
+ else:
68
+ embedder = SentenceTransformer(embedder_type, device="cuda" if torch.cuda.is_available() else "cpu")
69
+ embeddings = embedder.encode(chunks, convert_to_numpy=True, show_progress_bar=True)
70
+
71
+ hf_dataset = Dataset.from_dict({"text": chunks})
72
+ dim = embeddings.shape[1]
73
+ faiss_index = faiss.IndexFlatL2(dim)
74
+ faiss_index.add(embeddings.astype("float32"))
75
+
76
+ os.makedirs(dataset_path, exist_ok=True)
77
+ os.makedirs(os.path.dirname(index_path), exist_ok=True)
78
+
79
+ hf_dataset.save_to_disk(dataset_path)
80
+ faiss.write_index(faiss_index, index_path)
81
+
82
+ print(f"✅ Saved {domain} dataset at {dataset_path}, index at {index_path}")
83
+ return hf_dataset, faiss_index
84
+
85
+ # 🔁 Always regenerate these indices and datasets at app start
86
+ RAGBENCH_SUBSETS_BY_DOMAIN = {
87
+ "legal": ["cuad"],
88
+ "med": ["pubmedqa"],
89
+ "gk": ["hotpotqa"],
90
+ "cs": ["emanual"],
91
+ "fin": ["finqa"]
92
+ }
93
+
94
+ hf_dataset_cs, faiss_index_cs = build_index_and_dataset("cs", RAGBENCH_SUBSETS_BY_DOMAIN["cs"])
95
+ hf_dataset_med, faiss_index_med = build_index_and_dataset("med", RAGBENCH_SUBSETS_BY_DOMAIN["med"])
96
+ hf_dataset_gk, faiss_index_gk = build_index_and_dataset("gk", RAGBENCH_SUBSETS_BY_DOMAIN["gk"])
97
+ hf_dataset_fin, faiss_index_fin = build_index_and_dataset("fin", RAGBENCH_SUBSETS_BY_DOMAIN["fin"])
98
+ hf_dataset_legal, faiss_index_legal = build_index_and_dataset("legal", RAGBENCH_SUBSETS_BY_DOMAIN["legal"], legal=True)
99
+
100
+ # Now load Hugging Face RAGBench datasets for GT
101
+ legal_dataset = load_dataset("rungalileo/ragbench", "cuad", split="test")
102
+ med_dataset = load_dataset("rungalileo/ragbench", "pubmedqa", split="test")
103
+ gk_dataset = load_dataset("rungalileo/ragbench", "hotpotqa", split="test")
104
+ cs_dataset = load_dataset("rungalileo/ragbench", "emanual", split="test")
105
+ fin_dataset = load_dataset("rungalileo/ragbench", "finqa", split="test")
106
+
107
+ # Load BGE reranker
108
+ reranker = CrossEncoder("BAAI/bge-reranker-base", max_length=512)
109
+
110
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # define the device used by the legal-BERT model below
111
+ model_name = "nlpaueb/legal-bert-base-uncased"
112
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
113
+ model = AutoModel.from_pretrained(model_name).to(device)
114
+ model.eval()
115
+
116
+
117
+ def retrieve_top_k(query,domain='legal', model_name='nlpaueb/legal-bert-base-uncased', k=8):
118
+ # Load tokenizer and model
119
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
120
+ model = AutoModel.from_pretrained(model_name).to(device)
121
+ model.eval()
122
+
123
+ #print(f"In retrive_top_k Query:{query}")
124
+ # Tokenize and embed query using mean pooling
125
+ inputs = tokenizer(query, return_tensors="pt", padding=True, truncation=True, max_length=512)
126
+ inputs = {k: v.to(device) for k, v in inputs.items()}
127
+ with torch.no_grad():
128
+ outputs = model(**inputs)
129
+ query_embedding = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
130
+
131
+ # Load FAISS index and dataset
132
+ index_path = f"legal_index/faiss.index"
133
+ dataset_path = f"legal_dataset"
134
+
135
+ faiss_index = faiss.read_index(index_path)
136
+ dataset = load_from_disk(dataset_path)
137
+
138
+ # Perform FAISS search
139
+ D, I = faiss_index.search(query_embedding.astype('float32'), k)
140
+
141
+ # Retrieve top-k matching chunks
142
+ top_chunks = [dataset[int(idx)]['text'] for idx in I[0]]
143
+ return top_chunks
144
+
145
+
146
+
147
+ # Retrieval function using preloaded objects
148
+ def retrieve_top_c(query, domain, embedder, k=5):
149
+ if domain == "CS":
150
+ hf_dataset = hf_dataset_cs
151
+ faiss_index = faiss_index_cs
152
+ elif domain == "Medical":
153
+ hf_dataset = hf_dataset_med
154
+ faiss_index = faiss_index_med
155
+ elif domain == "GK":
156
+ hf_dataset = hf_dataset_gk
157
+ faiss_index = faiss_index_gk
158
+ elif domain == "Finance":
159
+ hf_dataset = hf_dataset_fin
160
+ faiss_index = faiss_index_fin
161
+ else:
162
+ raise ValueError(f"Unknown domain: {domain}")
163
+
164
+ # Encode query and search
165
+ query_embedding = embedder.encode([query]).astype('float32')
166
+ #query_embedding = embedder.encode([query], convert_to_numpy=True).astype('float32')
167
+ distances, indices = faiss_index.search(query_embedding, k)
168
+
169
+ return [hf_dataset[int(i)]["text"] for i in indices[0]]
170
+
171
+
172
+ client = Groq(
173
+ api_key=os.environ.get("GROQ_API_KEY"),  # read the API key from the environment instead of hard-coding a secret
174
+ )
175
+
176
+
177
+ def rerank_documents_bge(query, documents, top_n=5, return_scores=False):
178
+ """
179
+ Rerank documents using BAAI/bge-reranker-base CrossEncoder.
180
+
181
+ Args:
182
+ query (str): The query string.
183
+ documents (List[str]): List of candidate documents.
184
+ top_n (int): Number of top results to return.
185
+ return_scores (bool): Whether to return scores along with documents.
186
+
187
+ Returns:
188
+ List[str] or List[Tuple[str, float]]
189
+ """
190
+ if not documents:
191
+ return []
192
+
193
+ # Prepare (query, doc) pairs
194
+ pairs = [(query, doc) for doc in documents]
195
+
196
+ # Predict relevance scores
197
+ scores = reranker.predict(pairs, batch_size=16)
198
+
199
+ # Sort by score descending
200
+ reranked = sorted(zip(documents, scores), key=lambda x: x[1], reverse=True)
201
+
202
+ if return_scores:
203
+ return reranked[:top_n]
204
+ else:
205
+ return [doc for doc, _ in reranked[:top_n]]
206
+
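A minimal usage sketch for the reranker helper above (the query and documents are invented placeholders, and it assumes the module-level reranker CrossEncoder has already been loaded):

docs = [
    "The lease term is five years.",   # hypothetical candidate chunks
    "Payment is due within 30 days.",
]
top_docs = rerank_documents_bge("How long is the lease term?", docs, top_n=1)
print(top_docs)  # the single chunk the cross-encoder scores as most relevant
# with return_scores=True the helper returns (document, score) tuples instead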
207
+
208
+
209
+ def generate_response_rag(query,domain):
210
+ # Step 1: Retrieve top-k context chunks using your FAISS setup
211
+ if domain == "Legal":
212
+ top_chunks = retrieve_top_k(query,'Legal', model_name)
213
+ else:
214
+ top_chunks = retrieve_top_c(query, domain,embedder)
215
+
216
+ # Step 2: Rerank retrieved documents using cross-encoder
217
+ #reranked_chunks = rerank_documents(query, top_chunks, top_n=15)
218
+ #rerank_and_filter_chunks = filter_by_faithfulness(query, reranked_chunks)
219
+ #print("Retrieved Top chunks",top_chunks)
220
+
221
+ #reranked_chunks = rerank_and_filter_chunks
222
+ reranked_chunks_bge = rerank_documents_bge(query, top_chunks, top_n=5)
223
+ #sum_context = summarize_context("\n\n".join(reranked_chunks_bge))
224
+
225
+
226
+
227
+ final_context = reranked_chunks_bge
228
+ # Step 2: Prepare context and RAG-style prompt
229
+ context = "\n\n".join(final_context)
230
+
231
+ #print(f"Context:{context}")
232
+ prompt = f"""You are a helpful legal assistant.
233
+ Use the following context to answer the question.
234
+ Using only the information from the retrieved context, answer the following question. If the answer cannot be derived, say "I don't know." Always have answer with prefix **Answer:**
235
+
236
+ Context:{context}
237
+
238
+ Question: {query}
239
+ Answer:"""
240
+
241
+ # Step 3: Call the LLM (LLaMA3 or any chat model)
242
+ chat_completion = client.chat.completions.create(
243
+ messages=[
244
+ {"role": "user", "content": prompt}
245
+ ],
246
+ model="llama3-70b-8192",#"gemma2-9b-it"#"qwen/qwen3-32b"#deepseek-r1-distill-llama-70b",#"llama3-70b-8192", # mistral-saba-24b
247
+ temperature=0.0
248
+ )
249
+
250
+ return context,chat_completion.choices[0].message.content.strip()
251
+
252
+ '''response = openai.chat.completions.create(
253
+ model="gpt-3.5-turbo",
254
+ messages=[
255
+ {"role": "user", "content": prompt}
256
+ ],
257
+ temperature=0.0,
258
+ max_tokens=1024
259
+ )
260
+
261
+ return response.choices[0].message.content'''
262
+
263
+ #JUDGE LLM
264
+
265
+
266
+ def split_into_keyed_sentences(text, prefix):
267
+ """Splits text into sentences with keys like '0a.', '0b.', or 'a.', 'b.', etc."""
268
+ # Basic sentence tokenizer with keys
269
+ sentences = re.split(r'(?<=[.?!])\s+', text.strip())
270
+ keyed = {}
271
+ for i, s in enumerate(sentences):
272
+ key = f"{prefix}{chr(97 + i)}" # 'a', 'b', ...
273
+ if s:
274
+ keyed[key] = s.strip()
275
+ return keyed
276
+
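For reference, a quick sketch of the keying scheme this helper produces (example sentences are invented):

split_into_keyed_sentences("It rains. It pours.", "0")
# -> {"0a": "It rains.", "0b": "It pours."}   document sentences get the "0" prefix
split_into_keyed_sentences("I don't know.", "")
# -> {"a": "I don't know."}                   response sentences get bare letter keys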
277
+
278
+ def judge_response_rag(query, domain):
279
+
280
+ #top_chunks = retrieve_top_k(query)
281
+
282
+ #top_chunks = [chunk[0] if isinstance(chunk, tuple) else chunk for chunk in top_chunks]
283
+
284
+ # Step 2: Prepare context and RAG-style prompt
285
+ #context = "\n\n".join(top_chunks)
286
+
287
+ # Split context and dummy answer into keyed sentences
288
+ #document_keys = split_into_keyed_sentences(context, "0")
289
+
290
+ #print(f"Query:{query}\n====================================================================")
291
+ context,response = generate_response_rag(query,domain) #deepseek-r1-distill-llama-70b llama3-70b-8192
292
+
293
+ # Split context and dummy answer into keyed sentences
294
+ document_keys = split_into_keyed_sentences(context, "0")
295
+ #print(f"\n====================================\Generator Response:{response}")
296
+ #For deepseek
297
+ #print("Before Curated:",response)
298
+ response = response[response.find("**Answer"):].replace("**Answer", "")
299
+
300
+ print(f"Response for Generator LLM:{response}")
301
+
302
+ response_keys = split_into_keyed_sentences(response, "")
303
+ # Rebuild sections for prompt
304
+ documents_formatted = "\n".join([f"{k}. {v}" for k, v in document_keys.items()])
305
+ response_formatted = "\n".join([f"{k}. {v}" for k, v in response_keys.items()])
306
+
307
+ '''print(f"\n====================================================================")
308
+ print(f"documents_formatted:{documents_formatted}")
309
+ print(f"\n====================================================================")
310
+ print(f"response_formatted:{response_formatted}")
311
+ print(f"\n====================================================================")'''
312
+
313
+
314
+ prompt = f"""I asked someone to answer a question based on one or more documents.
315
+ Your task is to review their response and assess whether or not each sentence
316
+ in that response is supported by text in the documents. And if so, which
317
+ sentences in the documents provide that support. You will also tell me which
318
+ of the documents contain useful information for answering the question, and
319
+ which of the documents the answer was sourced from.
320
+ Here are the documents, each of which is split into sentences. Alongside each
321
+ sentence is an associated key, such as ’0a.’ or ’0b.’, that you can use to refer
322
+ to it:
323
+ '''
324
+ {documents_formatted}
325
+ '''
326
+ The question was:
327
+ '''
328
+ {query}
329
+ '''
330
+ Here is their response, split into sentences. Alongside each sentence is
331
+ an associated key, such as ’a.’ or ’b.’, that you can use to refer to it. Note
332
+ that these keys are unique to the response, and are not related to the keys
333
+ in the documents:
334
+ '''
335
+ {response_formatted}
336
+ '''
337
+ You must respond with a JSON object matching this schema:
338
+ '''
339
+ {{
340
+ "relevance_explanation": string,
341
+ "all_relevant_sentence_keys": [string],
342
+ "overall_supported_explanation": string,
343
+ "overall_supported": boolean,
344
+ "sentence_support_information": [
345
+ {{
346
+ "response_sentence_key": string,
347
+ "explanation": string,
348
+ "supporting_sentence_keys": [string],
349
+ "fully_supported": boolean
350
+ }},
351
+ ],
352
+ "all_utilized_sentence_keys": [string]
353
+ }}
354
+ '''
355
+ The relevance_explanation field is a string explaining which documents
356
+ contain useful information for answering the question. Provide a step-by-step
357
+ breakdown of information provided in the documents and how it is useful for
358
+ answering the question.
359
+ The all_relevant_sentence_keys field is a list of all document sentence keys
360
+ (e.g. ’0a’) that are relevant to the question. Include every sentence that is
361
+ useful and relevant to the question, even if it was not used in the response,
362
+ or if only parts of the sentence are useful. Ignore the provided response when
363
+ making this judgement and base your judgement solely on the provided documents
364
+ and question. Omit sentences that, if removed from the document, would not
365
+ impact someone’s ability to answer the question.
366
+ The overall_supported_explanation field is a string explaining why the response
367
+ *as a whole* is or is not supported by the documents. In this field, provide a
368
+ step-by-step breakdown of the claims made in the response and the support (or
369
+ lack thereof) for those claims in the documents. Begin by assessing each claim
370
+ separately, one by one; don’t make any remarks about the response as a whole
371
+ until you have assessed all the claims in isolation.
372
+ The overall_supported field is a boolean indicating whether the response as a
373
+ whole is supported by the documents. This value should reflect the conclusion
374
+ you drew at the end of your step-by-step breakdown in overall_supported_explanation.
375
+ In the sentence_support_information field, provide information about the support
376
+ *for each sentence* in the response.
377
+ The sentence_support_information field is a list of objects, one for each sentence
378
+ in the response. Each object MUST have the following fields:
379
+ - response_sentence_key: a string identifying the sentence in the response.
380
+ This key is the same as the one used in the response above.
381
+ - explanation: a string explaining why the sentence is or is not supported by the
382
+ documents.
383
+ - supporting_sentence_keys: keys (e.g. ’0a’) of sentences from the documents that
384
+ support the response sentence. If the sentence is not supported, this list MUST
385
+ be empty. If the sentence is supported, this list MUST contain one or more keys.
386
+ In special cases where the sentence is supported, but not by any specific sentence,
387
+ you can use the string "supported_without_sentence" to indicate that the sentence
388
+ is generally supported by the documents. Consider cases where the sentence is
389
+ expressing inability to answer the question due to lack of relevant information in
390
+ the provided context as "supported_without_sentence". In cases where the sentence
391
+ is making a general statement (e.g. outlining the steps to produce an answer, or
392
+ summarizing previously stated sentences, or a transition sentence), use the
393
+ string "general". In cases where the sentence is correctly stating a well-known fact,
394
+ like a mathematical formula, use the string "well_known_fact". In cases where the
395
+ sentence is performing numerical reasoning (e.g. addition, multiplication), use
396
+ the string "numerical_reasoning".
397
+ - fully_supported: a boolean indicating whether the sentence is fully supported by
398
+ the documents.
399
+ - This value should reflect the conclusion you drew at the end of your step-by-step
400
+ breakdown in explanation.
401
+ - If supporting_sentence_keys is an empty list, then fully_supported must be false.
403
+ - Otherwise, use fully_supported to clarify whether everything in the response
404
+ sentence is fully supported by the document text indicated in supporting_sentence_keys
405
+ (fully_supported = true), or whether the sentence is only partially or incompletely
406
+ supported by that document text (fully_supported = false).
407
+ The all_utilized_sentence_keys field is a list of all sentence keys (e.g. ’0a’) that
408
+ were used to construct the answer. Include every sentence that either directly supported
409
+ the answer, or was implicitly used to construct the answer, even if it was not used
410
+ in its entirety. Omit sentences that were not used, and could have been removed from
411
+ the documents without affecting the answer.
412
+ You must respond with a valid JSON string. Use escapes for quotes, e.g. ‘\\"‘, and
413
+ newlines, e.g. ‘\\n‘. Do not write anything before or after the JSON string. Do not
414
+ wrap the JSON string in backticks like ‘‘‘ or ‘‘‘json.
415
+ As a reminder: your task is to review the response and assess which documents contain
416
+ useful information pertaining to the question, and how each sentence in the response
417
+ is supported by the text in the documents.\
418
+ """
419
+
420
+ # Step 3: Call the LLM
421
+ chat_completion = client.chat.completions.create(
422
+ messages=[
423
+ {"role": "user", "content": prompt}
424
+ ],
425
+ model="meta-llama/llama-4-maverick-17b-128e-instruct", #deepseek-r1-distill-llama-70b llama3-70b-8192 meta-llama/llama-4-maverick-17b-128e-instruct
426
+ )
427
+
428
+ return documents_formatted,chat_completion.choices[0].message.content.strip()
429
+
430
+ '''chat_completion = openai.chat.completions.create(
431
+ messages=[
432
+ {"role":"user",
433
+ "content":prompt}
434
+ ],
435
+ model="gpt-4o",
436
+ max_tokens=1024,
437
+
438
+ )
439
+ return documents_formatted,chat_completion.choices[0].message.content'''
440
+
441
+
442
+ def extract_retrieved_sentence_keys(document_text: str) -> list[str]:
443
+ """
444
+ Extracts sentence keys like '0a.', '0b.', etc. from a formatted document string.
445
+
446
+ Parameters:
447
+ - document_text (str): full text of document with sentence keys
448
+
449
+ Returns:
450
+ - List of unique sentence keys in the order they appear
451
+ """
452
+ # Match pattern like 0a., 0b., 0z., 0{., 0|., etc.
453
+ pattern = r'\b0[\w\{\|\}~€‚]\.'
454
+
455
+ matches = re.findall(pattern, document_text)
456
+ return list(dict.fromkeys(matches)) # Removes duplicates while preserving order
457
+
458
+ def compute_ragbench_metrics(judge_response: dict, retrieved_sentence_keys: list[str]) -> dict:
459
+ """
460
+ Computes RAGBench-style metrics from Judge LLM response.
461
+
462
+ Parameters:
463
+ - judge_response (dict): JSON response from Judge LLM
464
+ - retrieved_sentence_keys (list of str): all sentence keys from the retrieved documents
465
+
466
+ Returns:
467
+ - Dictionary with Context Relevance, Context Utilization, Completeness, and Adherence
468
+ """
469
+
470
+ R = set(judge_response.get("all_relevant_sentence_keys", [])) # Relevant sentences
471
+ U = set(judge_response.get("all_utilized_sentence_keys", [])) # Utilized sentences
472
+ intersection_RU = R & U
473
+
474
+ total_retrieved = len(retrieved_sentence_keys)
475
+ len_R = len(R)
476
+ len_U = len(U)
477
+ len_intersection = len(intersection_RU)
478
+
479
+ # Context Relevance: fraction of retrieved context that is relevant
480
+ context_relevance = len_R / total_retrieved if total_retrieved else 0.0
481
+
482
+ # Context Utilization: fraction of retrieved context that was used
483
+ context_utilization = len_U / total_retrieved if total_retrieved else 0.0
484
+
485
+ # Completeness: fraction of relevant content that was used
486
+ completeness = len_intersection / len_R if len_R else 0.0
487
+
488
+ # Adherence: 1 if all response sentences are fully supported, else 0
489
+ is_fully_supported = all(s.get("fully_supported", False)
490
+ for s in judge_response.get("sentence_support_information", []))
491
+ adherence = 1.0 if is_fully_supported and judge_response.get("overall_supported", False) else 0.0
492
+
493
+ return {
494
+ "Context Relevance": round(context_relevance, 4),
495
+ "Context Utilization": round(context_utilization, 4),
496
+ "Completeness": round(completeness, 4),
497
+ "Adherence": adherence
498
+ }
499
+
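A small worked example of the metric arithmetic above (keys and counts are invented), which may help when reading the evaluation loop below:

judge = {
    "all_relevant_sentence_keys": ["0a", "0b"],          # R
    "all_utilized_sentence_keys": ["0b", "0c"],           # U
    "overall_supported": True,
    "sentence_support_information": [{"fully_supported": True}],
}
retrieved = ["0a.", "0b.", "0c.", "0d."]                   # 4 retrieved sentence keys
print(compute_ragbench_metrics(judge, retrieved))
# -> Context Relevance = |R|/4 = 0.5, Context Utilization = |U|/4 = 0.5,
#    Completeness = |R & U| / |R| = 0.5, Adherence = 1.0 (all sentences fully supported)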
500
+
501
+ def evaluate_rag_pipeline(domain, q_indices):
502
+ import torch
503
+ import numpy as np
504
+ from sklearn.metrics import mean_squared_error, roc_auc_score
505
+
506
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
507
+
508
+ def safe_append(gt_list, pred_list, gt_val, pred_val):
509
+ if gt_val is not None and pred_val is not None:
510
+ gt_list.append(gt_val)
511
+ pred_list.append(pred_val)
512
+
513
+ def clean_and_parse_json_block(text):
514
+ # Strip markdown-style code block if present
515
+ #text = text.strip().strip("`").strip()
516
+ code_block_match = re.search(r"```(?:json)?\s*([\s\S]*?)\s*```", text)
517
+ if code_block_match:
518
+ text = code_block_match.group(1).strip()
519
+
520
+ # Remove invalid/control characters that break decoding
521
+ text = re.sub(r"[^\x20-\x7E\n\t]", "", text)
522
+
523
+ try:
524
+ return json.loads(text)
525
+ except json.JSONDecodeError as e:
526
+ print("❌ JSON Decode Error:", e)
527
+ print("⚠️ Cleaned text:\n", text)
528
+ raise
529
+
530
+
531
+ gt_relevance, pred_relevance = [], []
532
+ gt_utilization, pred_utilization = [], []
533
+ gt_completeness, pred_completeness = [], []
534
+ gt_adherence, pred_adherence = [], []
535
+
536
+ if(domain=="Legal"):
537
+ dataset = legal_dataset
538
+ elif(domain=="Medical"):
539
+ dataset = med_dataset
540
+ elif(domain=="GK"):
541
+ dataset = gk_dataset
542
+ elif(domain=="CS"):
543
+ dataset = cs_dataset
544
+ elif(domain=="Finance"):
545
+ dataset = fin_dataset
546
+
547
+ for i in q_indices:
548
+ query = dataset[i]['question']
549
+ print(f"\n\n\nQuery:{i}.{query}\n====================================================================")
550
+ #print(f"\ndomain:{domain}====================================================================")
551
+ documents_formatted, response = judge_response_rag(query, domain)
552
+ judge_response = clean_and_parse_json_block(response)
553
+ print(f"\ndocuments_formatted:{documents_formatted}")
554
+ print(f"\n======================================================================\nResponse:{judge_response}")
555
+ retrieved_sentences = extract_retrieved_sentence_keys(documents_formatted)
556
+ predicted = compute_ragbench_metrics(judge_response, retrieved_sentences)
557
+
558
+ # GT values
559
+ gt_r = dataset[i].get('relevance_score')
560
+ gt_u = dataset[i].get('utilization_score')
561
+ gt_c = dataset[i].get('completeness_score')
562
+ gt_a = dataset[i].get('gpt3_adherence')
563
+
564
+ safe_append(gt_relevance, pred_relevance, gt_r, predicted['Context Relevance'])
565
+ safe_append(gt_utilization, pred_utilization, gt_u, predicted['Context Utilization'])
566
+ safe_append(gt_completeness, pred_completeness, gt_c, predicted['Completeness'])
567
+ if gt_a is not None and predicted['Adherence'] is not None:
568
+ safe_append(gt_adherence, pred_adherence, int(gt_a), int(predicted['Adherence']))
569
+
570
+ def compute_rmse(gt, pred):
571
+ return round(np.sqrt(np.mean((np.array(gt) - np.array(pred)) ** 2)), 4)
572
+
573
+ result = {
574
+ "Context Relevance": compute_rmse(gt_relevance, pred_relevance),
575
+ "Context Utilization": compute_rmse(gt_utilization, pred_utilization),
576
+ "Completeness": compute_rmse(gt_completeness, pred_completeness),
577
+ }
578
+
579
+ if len(set(gt_adherence)) == 2:
580
+ result["Adherence"] = compute_rmse(gt_adherence, pred_adherence)
581
+ result["AUC-ROC (Adherence)"] = round(roc_auc_score(gt_adherence, pred_adherence), 4)
582
+ else:
583
+ result["Adherence"] = compute_rmse(gt_adherence, pred_adherence)
584
+ result["AUC-ROC (Adherence)"] = "N/A - one class only"
585
+
586
+ return result
587
+
588
+
589
+
590
+ # Updated wrapper
591
+ def evaluate_rag_gradio(domain, q_indices_str):
592
+ # Capture logs
593
+ log_stream = io.StringIO()
594
+ sys.stdout = log_stream
595
+
596
+ try:
597
+ # Parse comma-separated indices
598
+ q_indices = [int(x.strip()) for x in q_indices_str.split(",") if x.strip().isdigit()]
599
+ results = evaluate_rag_pipeline(domain, q_indices)
600
+
601
+ logs = log_stream.getvalue()
602
+ return results, logs
603
+
604
+ except Exception as e:
605
+ traceback.print_exc()
606
+ return {"error": str(e)}, log_stream.getvalue()
607
+
608
+ finally:
609
+ sys.stdout = sys.__stdout__ # Restore stdout
610
+
611
+ # Gradio interface
612
+ iface = gr.Interface(
613
+ fn=evaluate_rag_gradio,
614
+ inputs=[
615
+ gr.Dropdown(choices=["Legal", "Medical", "GK", "CS", "Finance"], label="Domain"),
616
+ gr.Textbox(label="Comma-separated Query Indices (e.g. 89,121,245)", lines=1),
617
+ ],
618
+ outputs=[
619
+ gr.JSON(label="Evaluation Metrics (RMSE & AUC-ROC)"),
620
+ gr.Textbox(label="Execution Log", lines=10, interactive=True),
621
+ ],
622
+ title="RAG Evaluation Dashboard",
623
+ description="Evaluate your RAG pipeline across selected queries using GPT-based generation and judgment."
624
+ )
625
+
626
+ # Launch app
627
+ iface.launch(server_name="0.0.0.0", server_port=7860, debug=True)
bkp1_app.py ADDED
@@ -0,0 +1,567 @@
1
+ from datasets import load_from_disk
2
+ from transformers import AutoTokenizer, AutoModel
3
+ import faiss
4
+ import numpy as np
5
+ import torch
6
+ from datasets import load_from_disk
7
+ import faiss
8
+ import numpy as np
9
+ import os
10
+ from datasets import load_dataset, Dataset, get_dataset_config_names
11
+ from sentence_transformers import SentenceTransformer
12
+ from groq import Groq
13
+ from sentence_transformers import CrossEncoder
14
+ import requests
15
+ import uuid
16
+ import re
17
+ import json
18
+ import gradio as gr
19
+ import io
20
+ import sys
21
+ import traceback
22
+
23
+ embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
24
+ # Preload datasets and indices
25
+ hf_dataset_cs = load_from_disk("cs_dataset")
26
+ faiss_index_cs = faiss.read_index("cs_index/faiss.index")
27
+
28
+ hf_dataset_med = load_from_disk("med_dataset")
29
+ faiss_index_med = faiss.read_index("med_index/faiss.index")
30
+
31
+ hf_dataset_gk = load_from_disk("gk_dataset")
32
+ faiss_index_gk = faiss.read_index("gk_index/faiss.index")
33
+
34
+ hf_dataset_fin = load_from_disk("fin_dataset")
35
+ faiss_index_fin = faiss.read_index("fin_index/faiss.index")
36
+
37
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
38
+ print(device)
39
+
40
+
41
+ legal_dataset = load_dataset("rungalileo/ragbench", "cuad", split="test")
42
+ med_dataset = load_dataset("rungalileo/ragbench", "pubmedqa", split="test")
43
+ gk_dataset = load_dataset("rungalileo/ragbench", "hotpotqa", split="test")
44
+ cs_dataset = load_dataset("rungalileo/ragbench", "emanual", split="test")
45
+ fin_dataset = load_dataset("rungalileo/ragbench", "finqa", split="test")
46
+
47
+ # Load BGE reranker
48
+ reranker = CrossEncoder("BAAI/bge-reranker-base", max_length=512)
49
+
50
+ embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
51
+ model_name = "nlpaueb/legal-bert-base-uncased"
52
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
53
+ model = AutoModel.from_pretrained(model_name).to(device)
54
+ model.eval()
55
+
56
+
57
+ def retrieve_top_k(query,domain='legal', model_name='nlpaueb/legal-bert-base-uncased', k=8):
58
+ # Load tokenizer and model
59
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
60
+ model = AutoModel.from_pretrained(model_name).to(device)
61
+ model.eval()
62
+
63
+ #print(f"In retrive_top_k Query:{query}")
64
+ # Tokenize and embed query using mean pooling
65
+ inputs = tokenizer(query, return_tensors="pt", padding=True, truncation=True, max_length=512)
66
+ inputs = {k: v.to(device) for k, v in inputs.items()}
67
+ with torch.no_grad():
68
+ outputs = model(**inputs)
69
+ query_embedding = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
70
+
71
+ # Load FAISS index and dataset
72
+ index_path = f"legal_index/faiss.index"
73
+ dataset_path = f"legal_dataset"
74
+
75
+ faiss_index = faiss.read_index(index_path)
76
+ dataset = load_from_disk(dataset_path)
77
+
78
+ # Perform FAISS search
79
+ D, I = faiss_index.search(query_embedding.astype('float32'), k)
80
+
81
+ # Retrieve top-k matching chunks
82
+ top_chunks = [dataset[int(idx)]['text'] for idx in I[0]]
83
+ return top_chunks
84
+
85
+
86
+
87
+ # Retrieval function using preloaded objects
88
+ def retrieve_top_c(query, domain, embedder, k=5):
89
+ if domain == "CS":
90
+ hf_dataset = hf_dataset_cs
91
+ faiss_index = faiss_index_cs
92
+ elif domain == "Medical":
93
+ hf_dataset = hf_dataset_med
94
+ faiss_index = faiss_index_med
95
+ elif domain == "GK":
96
+ hf_dataset = hf_dataset_gk
97
+ faiss_index = faiss_index_gk
98
+ elif domain == "Finance":
99
+ hf_dataset = hf_dataset_fin
100
+ faiss_index = faiss_index_fin
101
+ else:
102
+ raise ValueError(f"Unknown domain: {domain}")
103
+
104
+ # Encode query and search
105
+ query_embedding = embedder.encode([query]).astype('float32')
106
+ #query_embedding = embedder.encode([query], convert_to_numpy=True).astype('float32')
107
+ distances, indices = faiss_index.search(query_embedding, k)
108
+
109
+ return [hf_dataset[int(i)]["text"] for i in indices[0]]
110
+
111
+
112
+ client = Groq(
113
+ api_key=os.environ.get("GROQ_API_KEY"),  # read the API key from the environment instead of hard-coding a secret
114
+ )
115
+
116
+
117
+ def rerank_documents_bge(query, documents, top_n=5, return_scores=False):
118
+ """
119
+ Rerank documents using BAAI/bge-reranker-base CrossEncoder.
120
+
121
+ Args:
122
+ query (str): The query string.
123
+ documents (List[str]): List of candidate documents.
124
+ top_n (int): Number of top results to return.
125
+ return_scores (bool): Whether to return scores along with documents.
126
+
127
+ Returns:
128
+ List[str] or List[Tuple[str, float]]
129
+ """
130
+ if not documents:
131
+ return []
132
+
133
+ # Prepare (query, doc) pairs
134
+ pairs = [(query, doc) for doc in documents]
135
+
136
+ # Predict relevance scores
137
+ scores = reranker.predict(pairs, batch_size=16)
138
+
139
+ # Sort by score descending
140
+ reranked = sorted(zip(documents, scores), key=lambda x: x[1], reverse=True)
141
+
142
+ if return_scores:
143
+ return reranked[:top_n]
144
+ else:
145
+ return [doc for doc, _ in reranked[:top_n]]
146
+
147
+
148
+
149
+ def generate_response_rag(query,domain):
150
+ # Step 1: Retrieve top-k context chunks using your FAISS setup
151
+ if domain == "Legal":
152
+ top_chunks = retrieve_top_k(query,'Legal', model_name)
153
+ else:
154
+ top_chunks = retrieve_top_c(query, domain,embedder)
155
+
156
+ # Step 2: Rerank retrieved documents using cross-encoder
157
+ #reranked_chunks = rerank_documents(query, top_chunks, top_n=15)
158
+ #rerank_and_filter_chunks = filter_by_faithfulness(query, reranked_chunks)
159
+ #print("Retrieved Top chunks",top_chunks)
160
+
161
+ #reranked_chunks = rerank_and_filter_chunks
162
+ reranked_chunks_bge = rerank_documents_bge(query, top_chunks, top_n=5)
163
+ #sum_context = summarize_context("\n\n".join(reranked_chunks_bge))
164
+
165
+
166
+
167
+ final_context = reranked_chunks_bge
168
+ # Step 2: Prepare context and RAG-style prompt
169
+ context = "\n\n".join(final_context)
170
+
171
+ #print(f"Context:{context}")
172
+ prompt = f"""You are a helpful legal assistant.
173
+ Use the following context to answer the question.
174
+ Using only the information from the retrieved context, answer the following question. If the answer cannot be derived, say "I don't know." Always have answer with prefix **Answer:**
175
+
176
+ Context:{context}
177
+
178
+ Question: {query}
179
+ Answer:"""
180
+
181
+ # Step 3: Call the LLM (LLaMA3 or any chat model)
182
+ chat_completion = client.chat.completions.create(
183
+ messages=[
184
+ {"role": "user", "content": prompt}
185
+ ],
186
+ model="llama3-70b-8192",#"gemma2-9b-it"#"qwen/qwen3-32b"#deepseek-r1-distill-llama-70b",#"llama3-70b-8192", # mistral-saba-24b
187
+ temperature=0.0
188
+ )
189
+
190
+ return context,chat_completion.choices[0].message.content.strip()
191
+
192
+ '''response = openai.chat.completions.create(
193
+ model="gpt-3.5-turbo",
194
+ messages=[
195
+ {"role": "user", "content": prompt}
196
+ ],
197
+ temperature=0.0,
198
+ max_tokens=1024
199
+ )
200
+
201
+ return response.choices[0].message.content'''
202
+
203
+ #JUDGE LLM
204
+
205
+
206
+ def split_into_keyed_sentences(text, prefix):
207
+ """Splits text into sentences with keys like '0a.', '0b.', or 'a.', 'b.', etc."""
208
+ # Basic sentence tokenizer with keys
209
+ sentences = re.split(r'(?<=[.?!])\s+', text.strip())
210
+ keyed = {}
211
+ for i, s in enumerate(sentences):
212
+ key = f"{prefix}{chr(97 + i)}" # 'a', 'b', ...
213
+ if s:
214
+ keyed[key] = s.strip()
215
+ return keyed
216
+
217
+
218
+ def jugde_response_rag(query, domain):
219
+
220
+ #top_chunks = retrieve_top_k(query)
221
+
222
+ #top_chunks = [chunk[0] if isinstance(chunk, tuple) else chunk for chunk in top_chunks]
223
+
224
+ # Step 2: Prepare context and RAG-style prompt
225
+ #context = "\n\n".join(top_chunks)
226
+
227
+ # Split context and dummy answer into keyed sentences
228
+ #document_keys = split_into_keyed_sentences(context, "0")
229
+
230
+ #print(f"Query:{query}\n====================================================================")
231
+ context,response = generate_response_rag(query,domain) #deepseek-r1-distill-llama-70b llama3-70b-8192
232
+
233
+ # Split context and dummy answer into keyed sentences
234
+ document_keys = split_into_keyed_sentences(context, "0")
235
+ #print(f"\n====================================\Generator Response:{response}")
236
+ #For deepseek
237
+ #print("Before Curated:",response)
238
+ response=response[response.find("**Answer"):].replace("**Answer","");
239
+
240
+ print(f"Response for Generator LLM:{response}")
241
+
242
+ response_keys = split_into_keyed_sentences(response, "")
243
+ # Rebuild sections for prompt
244
+ documents_formatted = "\n".join([f"{k}. {v}" for k, v in document_keys.items()])
245
+ response_formatted = "\n".join([f"{k}. {v}" for k, v in response_keys.items()])
246
+
247
+ '''print(f"\n====================================================================")
248
+ print(f"documents_formatted:{documents_formatted}")
249
+ print(f"\n====================================================================")
250
+ print(f"response_formatted:{response_formatted}")
251
+ print(f"\n====================================================================")'''
252
+
253
+
254
+ prompt = f"""I asked someone to answer a question based on one or more documents.
255
+ Your task is to review their response and assess whether or not each sentence
256
+ in that response is supported by text in the documents. And if so, which
257
+ sentences in the documents provide that support. You will also tell me which
258
+ of the documents contain useful information for answering the question, and
259
+ which of the documents the answer was sourced from.
260
+ Here are the documents, each of which is split into sentences. Alongside each
261
+ sentence is an associated key, such as ’0a.’ or ’0b.’, that you can use to refer
262
+ to it:
263
+ '''
264
+ {documents_formatted}
265
+ '''
266
+ The question was:
267
+ '''
268
+ {query}
269
+ '''
270
+ Here is their response, split into sentences. Alongside each sentence is
271
+ an associated key, such as ’a.’ or ’b.’, that you can use to refer to it. Note
272
+ that these keys are unique to the response, and are not related to the keys
273
+ in the documents:
274
+ '''
275
+ {response_formatted}
276
+ '''
277
+ You must respond with a JSON object matching this schema:
278
+ '''
279
+ {{
280
+ "relevance_explanation": string,
281
+ "all_relevant_sentence_keys": [string],
282
+ "overall_supported_explanation": string,
283
+ "overall_supported": boolean,
284
+ "sentence_support_information": [
285
+ {{
286
+ "response_sentence_key": string,
287
+ "explanation": string,
288
+ "supporting_sentence_keys": [string],
289
+ "fully_supported": boolean
290
+ }},
291
+ ],
292
+ "all_utilized_sentence_keys": [string]
293
+ }}
294
+ '''
295
+ The relevance_explanation field is a string explaining which documents
296
+ contain useful information for answering the question. Provide a step-by-step
297
+ breakdown of information provided in the documents and how it is useful for
298
+ answering the question.
299
+ The all_relevant_sentence_keys field is a list of all document sentence keys
300
+ (e.g. ’0a’) that are relevant to the question. Include every sentence that is
301
+ useful and relevant to the question, even if it was not used in the response,
302
+ or if only parts of the sentence are useful. Ignore the provided response when
303
+ making this judgement and base your judgement solely on the provided documents
304
+ and question. Omit sentences that, if removed from the document, would not
305
+ impact someone’s ability to answer the question.
306
+ The overall_supported_explanation field is a string explaining why the response
307
+ *as a whole* is or is not supported by the documents. In this field, provide a
308
+ step-by-step breakdown of the claims made in the response and the support (or
309
+ lack thereof) for those claims in the documents. Begin by assessing each claim
310
+ separately, one by one; don’t make any remarks about the response as a whole
311
+ until you have assessed all the claims in isolation.
312
+ The overall_supported field is a boolean indicating whether the response as a
313
+ whole is supported by the documents. This value should reflect the conclusion
314
+ you drew at the end of your step-by-step breakdown in overall_supported_explanation.
315
+ In the sentence_support_information field, provide information about the support
316
+ *for each sentence* in the response.
317
+ The sentence_support_information field is a list of objects, one for each sentence
318
+ in the response. Each object MUST have the following fields:
319
+ - response_sentence_key: a string identifying the sentence in the response.
320
+ This key is the same as the one used in the response above.
321
+ - explanation: a string explaining why the sentence is or is not supported by the
322
+ documents.
323
+ - supporting_sentence_keys: keys (e.g. ’0a’) of sentences from the documents that
324
+ support the response sentence. If the sentence is not supported, this list MUST
325
+ be empty. If the sentence is supported, this list MUST contain one or more keys.
326
+ In special cases where the sentence is supported, but not by any specific sentence,
327
+ you can use the string "supported_without_sentence" to indicate that the sentence
328
+ is generally supported by the documents. Consider cases where the sentence is
329
+ expressing inability to answer the question due to lack of relevant information in
330
+ the provided context as "supported_without_sentence". In cases where the sentence
331
+ is making a general statement (e.g. outlining the steps to produce an answer, or
332
+ summarizing previously stated sentences, or a transition sentence), use the
333
+ string "general". In cases where the sentence is correctly stating a well-known fact,
334
+ like a mathematical formula, use the string "well_known_fact". In cases where the
335
+ sentence is performing numerical reasoning (e.g. addition, multiplication), use
336
+ the string "numerical_reasoning".
337
+ - fully_supported: a boolean indicating whether the sentence is fully supported by
338
+ the documents.
339
+ - This value should reflect the conclusion you drew at the end of your step-by-step
340
+ breakdown in explanation.
341
+ - If supporting_sentence_keys is an empty list, then fully_supported must be false.
343
+ - Otherwise, use fully_supported to clarify whether everything in the response
344
+ sentence is fully supported by the document text indicated in supporting_sentence_keys
345
+ (fully_supported = true), or whether the sentence is only partially or incompletely
346
+ supported by that document text (fully_supported = false).
347
+ The all_utilized_sentence_keys field is a list of all sentences keys (e.g. ’0a’) that
348
+ were used to construct the answer. Include every sentence that either directly supported
349
+ the answer, or was implicitly used to construct the answer, even if it was not used
350
+ in its entirety. Omit sentences that were not used, and could have been removed from
351
+ the documents without affecting the answer.
352
+ You must respond with a valid JSON string. Use escapes for quotes, e.g. ‘\\"‘, and
353
+ newlines, e.g. ‘\\n‘. Do not write anything before or after the JSON string. Do not
354
+ wrap the JSON string in backticks like ‘‘‘ or ‘‘‘json.
355
+ As a reminder: your task is to review the response and assess which documents contain
356
+ useful information pertaining to the question, and how each sentence in the response
357
+ is supported by the text in the documents.\
358
+ """
359
+
360
+ # Step 3: Call the LLM
361
+ chat_completion = client.chat.completions.create(
362
+ messages=[
363
+ {"role": "user", "content": prompt}
364
+ ],
365
+ model="meta-llama/llama-4-maverick-17b-128e-instruct", #deepseek-r1-distill-llama-70b llama3-70b-8192 meta-llama/llama-4-maverick-17b-128e-instruct
366
+ )
367
+
368
+ return documents_formatted,chat_completion.choices[0].message.content.strip()
369
+
370
+ '''chat_completion = openai.chat.completions.create(
371
+ messages=[
372
+ {"role":"user",
373
+ "content":prompt}
374
+ ],
375
+ model="gpt-4o",
376
+ max_tokens=1024,
377
+
378
+ )
379
+ return documents_formatted,chat_completion.choices[0].message.content'''
380
+
381
+
382
+ def extract_retrieved_sentence_keys(document_text: str) -> list[str]:
383
+ """
384
+ Extracts sentence keys like '0a.', '0b.', etc. from a formatted document string.
385
+
386
+ Parameters:
387
+ - document_text (str): full text of document with sentence keys
388
+
389
+ Returns:
390
+ - List of unique sentence keys in the order they appear
391
+ """
392
+ # Match pattern like 0a., 0b., 0z., 0{., 0|., etc.
393
+ pattern = r'\b0[\w\{\|\}~€‚]\.'
394
+
395
+ matches = re.findall(pattern, document_text)
396
+ return list(dict.fromkeys(matches)) # Removes duplicates while preserving order
397
+
398
+ def compute_ragbench_metrics(judge_response: dict, retrieved_sentence_keys: list[str]) -> dict:
399
+ """
400
+ Computes RAGBench-style metrics from Judge LLM response.
401
+
402
+ Parameters:
403
+ - judge_response (dict): JSON response from Judge LLM
404
+ - retrieved_sentence_keys (list of str): all sentence keys from the retrieved documents
405
+
406
+ Returns:
407
+ - Dictionary with Context Relevance, Context Utilization, Completeness, and Adherence
408
+ """
409
+
410
+ R = set(judge_response.get("all_relevant_sentence_keys", [])) # Relevant sentences
411
+ U = set(judge_response.get("all_utilized_sentence_keys", [])) # Utilized sentences
412
+ intersection_RU = R & U
413
+
414
+ total_retrieved = len(retrieved_sentence_keys)
415
+ len_R = len(R)
416
+ len_U = len(U)
417
+ len_intersection = len(intersection_RU)
418
+
419
+ # Context Relevance: fraction of retrieved context that is relevant
420
+ context_relevance = len_R / total_retrieved if total_retrieved else 0.0
421
+
422
+ # Context Utilization: fraction of retrieved context that was used
423
+ context_utilization = len_U / total_retrieved if total_retrieved else 0.0
424
+
425
+ # Completeness: fraction of relevant content that was used
426
+ completeness = len_intersection / len_R if len_R else 0.0
427
+
428
+ # Adherence: 1 if all response sentences are fully supported, else 0
429
+ is_fully_supported = all(s.get("fully_supported", False)
430
+ for s in judge_response.get("sentence_support_information", []))
431
+ adherence = 1.0 if is_fully_supported and judge_response.get("overall_supported", False) else 0.0
432
+
433
+ return {
434
+ "Context Relevance": round(context_relevance, 4),
435
+ "Context Utilization": round(context_utilization, 4),
436
+ "Completeness": round(completeness, 4),
437
+ "Adherence": adherence
438
+ }
439
+
440
+
441
+ def evaluate_rag_pipeline(domain, q_indices):
442
+ import torch
443
+ import numpy as np
444
+ from sklearn.metrics import mean_squared_error, roc_auc_score
445
+
446
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
447
+
448
+ def safe_append(gt_list, pred_list, gt_val, pred_val):
449
+ if gt_val is not None and pred_val is not None:
450
+ gt_list.append(gt_val)
451
+ pred_list.append(pred_val)
452
+
453
+ def clean_and_parse_json_block(text):
454
+ # Strip markdown-style code block if present
455
+ #text = text.strip().strip("`").strip()
456
+ code_block_match = re.search(r"```(?:json)?\s*([\s\S]*?)\s*```", text)
457
+ if code_block_match:
458
+ text = code_block_match.group(1).strip()
459
+
460
+ # Remove invalid/control characters that break decoding
461
+ text = re.sub(r"[^\x20-\x7E\n\t]", "", text)
462
+
463
+ try:
464
+ return json.loads(text)
465
+ except json.JSONDecodeError as e:
466
+ print("❌ JSON Decode Error:", e)
467
+ print("⚠️ Cleaned text:\n", text)
468
+ raise
469
+
470
+
471
+ gt_relevance, pred_relevance = [], []
472
+ gt_utilization, pred_utilization = [], []
473
+ gt_completeness, pred_completeness = [], []
474
+ gt_adherence, pred_adherence = [], []
475
+
476
+ if(domain=="Legal"):
477
+ dataset = legal_dataset
478
+ elif(domain=="Medical"):
479
+ dataset = med_dataset
480
+ elif(domain=="GK"):
481
+ dataset = gk_dataset
482
+ elif(domain=="CS"):
483
+ dataset = cs_dataset
484
+ elif(domain=="Finance"):
485
+ dataset = fin_dataset
486
+
487
+ for i in q_indices:
488
+ query = dataset[i]['question']
489
+ print(f"\n\n\nQuery:{i}.{query}\n====================================================================")
490
+ #print(f"\ndomain:{domain}====================================================================")
491
+ documents_formatted, response = jugde_response_rag(query, domain)
492
+ judge_response = clean_and_parse_json_block(response)
493
+ print(f"\ndocuments_formatted:{documents_formatted}")
494
+ print(f"\n======================================================================\nResponse:{judge_response}")
495
+ retrieved_sentences = extract_retrieved_sentence_keys(documents_formatted)
496
+ predicted = compute_ragbench_metrics(judge_response, retrieved_sentences)
497
+
498
+ # GT values
499
+ gt_r = dataset[i].get('relevance_score')
500
+ gt_u = dataset[i].get('utilization_score')
501
+ gt_c = dataset[i].get('completeness_score')
502
+ gt_a = dataset[i].get('gpt3_adherence')
503
+
504
+ safe_append(gt_relevance, pred_relevance, gt_r, predicted['Context Relevance'])
505
+ safe_append(gt_utilization, pred_utilization, gt_u, predicted['Context Utilization'])
506
+ safe_append(gt_completeness, pred_completeness, gt_c, predicted['Completeness'])
507
+ if gt_a is not None and predicted['Adherence'] is not None:
508
+ safe_append(gt_adherence, pred_adherence, int(gt_a), int(predicted['Adherence']))
509
+
510
+ def compute_rmse(gt, pred):
511
+ return round(np.sqrt(np.mean((np.array(gt) - np.array(pred)) ** 2)), 4)
512
+
513
+ result = {
514
+ "Context Relevance": compute_rmse(gt_relevance, pred_relevance),
515
+ "Context Utilization": compute_rmse(gt_utilization, pred_utilization),
516
+ "Completeness": compute_rmse(gt_completeness, pred_completeness),
517
+ }
518
+
519
+ if len(set(gt_adherence)) == 2:
520
+ result["Adherence"] = compute_rmse(gt_adherence, pred_adherence)
521
+ result["AUC-ROC (Adherence)"] = round(roc_auc_score(gt_adherence, pred_adherence), 4)
522
+ else:
523
+ result["Adherence"] = compute_rmse(gt_adherence, pred_adherence)
524
+ result["AUC-ROC (Adherence)"] = "N/A - one class only"
525
+
526
+ return result
527
+
528
+
529
+
530
+ # Updated wrapper
531
+ def evaluate_rag_gradio(domain, q_indices_str):
532
+ # Capture logs
533
+ log_stream = io.StringIO()
534
+ sys.stdout = log_stream
535
+
536
+ try:
537
+ # Parse comma-separated indices
538
+ q_indices = [int(x.strip()) for x in q_indices_str.split(",") if x.strip().isdigit()]
539
+ results = evaluate_rag_pipeline(domain, q_indices)
540
+
541
+ logs = log_stream.getvalue()
542
+ return results, logs
543
+
544
+ except Exception as e:
545
+ traceback.print_exc()
546
+ return {"error": str(e)}, log_stream.getvalue()
547
+
548
+ finally:
549
+ sys.stdout = sys.__stdout__ # Restore stdout
550
+
551
+ # Gradio interface
552
+ iface = gr.Interface(
553
+ fn=evaluate_rag_gradio,
554
+ inputs=[
555
+ gr.Dropdown(choices=["Legal", "Medical", "GK", "CS", "Finance"], label="Domain"),
556
+ gr.Textbox(label="Comma-separated Query Indices (e.g. 89,121,245)", lines=1),
557
+ ],
558
+ outputs=[
559
+ gr.JSON(label="Evaluation Metrics (RMSE & AUC-ROC)"),
560
+ gr.Textbox(label="Execution Log", lines=10, interactive=True),
561
+ ],
562
+ title="RAG Evaluation Dashboard",
563
+ description="Evaluate your RAG pipeline across selected queries using GPT-based generation and judgment."
564
+ )
565
+
566
+ # Launch app
567
+ iface.launch(server_name="0.0.0.0", server_port=7860, debug=True)
bkp_app.py ADDED
@@ -0,0 +1,497 @@
1
+ # -*- coding: utf-8 -*-
2
+ """Deploy_CapstoneRagBench.ipynb
3
+
4
+ Automatically generated by Colab.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1OG-77VqKwz3509_osgNgSeOMJ9G6RvB4
8
+ """
9
+
10
+ # For Legal
11
+
12
+ from datasets import load_from_disk
13
+ from transformers import AutoTokenizer, AutoModel
14
+ import faiss
15
+ import numpy as np
16
+ import torch
17
+ from datasets import load_dataset, Dataset, get_dataset_config_names
18
+ import os
19
+ from groq import Groq
20
+ from sentence_transformers import CrossEncoder
21
+ import requests
22
+ import uuid
23
+ import re
24
+ import gradio as gr
25
+ import json
26
+ import torch
27
+ import numpy as np
28
+ from sklearn.metrics import mean_squared_error, roc_auc_score
29
+ import gradio as gr
30
+ import io
31
+ import sys
32
+ import traceback
33
+
34
+
35
+ def retrieve_top_k(query,domain='legal', model_name='nlpaueb/legal-bert-base-uncased', k=8):
36
+ # Load tokenizer and model
37
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
38
+ model = AutoModel.from_pretrained(model_name).to(device)
39
+ model.eval()
40
+
41
+ #print(f"In retrive_top_k Query:{query}")
42
+ # Tokenize and embed query using mean pooling
43
+ inputs = tokenizer(query, return_tensors="pt", padding=True, truncation=True, max_length=512)
44
+ inputs = {k: v.to(device) for k, v in inputs.items()}
45
+ with torch.no_grad():
46
+ outputs = model(**inputs)
47
+ query_embedding = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
48
+
49
+ # Load FAISS index and dataset
50
+ index_path = f"{domain}_index/faiss.index"
51
+ dataset_path = f"{domain}_dataset"
52
+
53
+ faiss_index = faiss.read_index(index_path)
54
+ dataset = load_from_disk(dataset_path)
55
+
56
+ # Perform FAISS search
57
+ D, I = faiss_index.search(query_embedding.astype('float32'), k)
58
+
59
+ # Retrieve top-k matching chunks
60
+ top_chunks = [dataset[int(idx)]['text'] for idx in I[0]]
61
+ return top_chunks
62
+
63
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
64
+ #print(device)
65
+
66
+ dataset = load_dataset("rungalileo/ragbench", "cuad", split="test")
67
+
68
+ client = Groq(
69
+ api_key=os.environ.get("GROQ_API_KEY"),  # read the key from the environment (e.g. a Space secret); never commit API keys
70
+ )
71
+
72
+ # Load BGE reranker
73
+ reranker = CrossEncoder("BAAI/bge-reranker-base", max_length=512)
74
+
75
+ def rerank_documents_bge(query, documents, top_n=5, return_scores=False):
76
+ """
77
+ Rerank documents using BAAI/bge-reranker-base CrossEncoder.
78
+
79
+ Args:
80
+ query (str): The query string.
81
+ documents (List[str]): List of candidate documents.
82
+ top_n (int): Number of top results to return.
83
+ return_scores (bool): Whether to return scores along with documents.
84
+
85
+ Returns:
86
+ List[str] or List[Tuple[str, float]]
87
+ """
88
+ if not documents:
89
+ return []
90
+
91
+ # Prepare (query, doc) pairs
92
+ pairs = [(query, doc) for doc in documents]
93
+
94
+ # Predict relevance scores
95
+ scores = reranker.predict(pairs, batch_size=16)
96
+
97
+ # Sort by score descending
98
+ reranked = sorted(zip(documents, scores), key=lambda x: x[1], reverse=True)
99
+
100
+ if return_scores:
101
+ return reranked[:top_n]
102
+ else:
103
+ return [doc for doc, _ in reranked[:top_n]]
104
+
105
+
106
+ def generate_response_rag(query,model,index_dir="legal_index"):
107
+ # Step 1: Retrieve top-k context chunks using your FAISS setup
108
+ top_chunks = retrieve_top_k(query,'legal', "nlpaueb/legal-bert-base-uncased")
109
+
110
+ # Step 2: Rerank retrieved documents using cross-encoder
111
+ #reranked_chunks = rerank_documents(query, top_chunks, top_n=15)
112
+ #rerank_and_filter_chunks = filter_by_faithfulness(query, reranked_chunks)
113
+
114
+ #reranked_chunks = rerank_and_filter_chunks
115
+ reranked_chunks_bge = rerank_documents_bge(query, top_chunks, top_n=5)
116
+ #sum_context = summarize_context("\n\n".join(reranked_chunks_bge))
117
+
118
+
119
+
120
+ final_context = reranked_chunks_bge
121
+ # Step 2: Prepare context and RAG-style prompt
122
+ context = "\n\n".join(final_context)
123
+
124
+ #print(f"Context:{context}")
125
+ prompt = f"""You are a helpful legal assistant.
126
+ Use the following context to answer the question.
127
+ Using only the information from the retrieved context, answer the following question. If the answer cannot be derived, say "I don't know." Always prefix your answer with **Answer:**
128
+
129
+ Context:{context}
130
+
131
+ Question: {query}
132
+ Answer:"""
133
+
134
+ # Step 3: Call the LLM (LLaMA3 or any chat model)
135
+ chat_completion = client.chat.completions.create(
136
+ messages=[
137
+ {"role": "user", "content": prompt}
138
+ ],
139
+ model=model,#"gemma2-9b-it"#"qwen/qwen3-32b"#deepseek-r1-distill-llama-70b",#"llama3-70b-8192", # mistral-saba-24b
140
+ temperature=0.0
141
+ )
142
+
143
+ return chat_completion.choices[0].message.content.strip()
144
+
145
+ '''response = openai.chat.completions.create(
146
+ model="gpt-3.5-turbo",
147
+ messages=[
148
+ {"role": "user", "content": prompt}
149
+ ],
150
+ temperature=0.0,
151
+ max_tokens=1024
152
+ )
153
+
154
+ return response.choices[0].message.content'''
155
+
156
+ #JUDGE LLM
157
+
158
+ def split_into_keyed_sentences(text, prefix):
159
+ """Splits text into sentences with keys like '0a.', '0b.', or 'a.', 'b.', etc."""
160
+ # Basic sentence tokenizer with keys
161
+ sentences = re.split(r'(?<=[.?!])\s+', text.strip())
162
+ keyed = {}
163
+ for i, s in enumerate(sentences):
164
+ key = f"{prefix}{chr(97 + i)}" # 'a', 'b', ...
165
+ if s:
166
+ keyed[key] = s.strip()
167
+ return keyed
168
+
169
+
170
+ def judge_response_rag(query, embedder="nlpaueb/legal-bert-base-uncased", domain="legal", k=5):
171
+
172
+ top_chunks = retrieve_top_k(query)
173
+
174
+ top_chunks = [chunk[0] if isinstance(chunk, tuple) else chunk for chunk in top_chunks]
175
+
176
+ # Step 2: Prepare context and RAG-style prompt
177
+ context = "\n\n".join(top_chunks)
178
+
179
+ # Split context and dummy answer into keyed sentences
180
+ document_keys = split_into_keyed_sentences(context, "0")
181
+
182
+ #print(f"Query:{query}\n====================================================================")
183
+ response = generate_response_rag(query,model="llama3-70b-8192") #deepseek-r1-distill-llama-70b llama3-70b-8192
184
+ #print(f"\n====================================\Generator Response:{response}")
185
+ #For deepseek
186
+ #print("Before Curated:",response)
187
+ response = response[response.find("**Answer"):].replace("**Answer", "")  # keep only the text from the **Answer marker onward
188
+
189
+ print(f"Response for Generator LLM:{response}")
190
+
191
+ response_keys = split_into_keyed_sentences(response, "")
192
+ # Rebuild sections for prompt
193
+ documents_formatted = "\n".join([f"{k}. {v}" for k, v in document_keys.items()])
194
+ response_formatted = "\n".join([f"{k}. {v}" for k, v in response_keys.items()])
195
+
196
+ '''print(f"\n====================================================================")
197
+ print(f"documents_formatted:{documents_formatted}")
198
+ print(f"\n====================================================================")
199
+ print(f"response_formatted:{response_formatted}")
200
+ print(f"\n====================================================================")'''
201
+
202
+
203
+ prompt = f"""I asked someone to answer a question based on one or more documents.
204
+ Your task is to review their response and assess whether or not each sentence
205
+ in that response is supported by text in the documents. And if so, which
206
+ sentences in the documents provide that support. You will also tell me which
207
+ of the documents contain useful information for answering the question, and
208
+ which of the documents the answer was sourced from.
209
+ Here are the documents, each of which is split into sentences. Alongside each
210
+ sentence is an associated key, such as ’0a.’ or ’0b.’, that you can use to refer
211
+ to it:
212
+ '''
213
+ {documents_formatted}
214
+ '''
215
+ The question was:
216
+ '''
217
+ {query}
218
+ '''
219
+ Here is their response, split into sentences. Alongside each sentence is
220
+ associated key, such as ’a.’ or ’b.’ that you can use to refer to it. Note
221
+ that these keys are unique to the response, and are not related to the keys
222
+ in the documents:
223
+ '''
224
+ {response_formatted}
225
+ '''
226
+ You must respond with a JSON object matching this schema:
227
+ '''
228
+ {{
229
+ "relevance_explanation": string,
230
+ "all_relevant_sentence_keys": [string],
231
+ "overall_supported_explanation": string,
232
+ "overall_supported": boolean,
233
+ "sentence_support_information": [
234
+ {{
235
+ "response_sentence_key": string,
236
+ "explanation": string,
237
+ "supporting_sentence_keys": [string],
238
+ "fully_supported": boolean
239
+ }},
240
+ ],
241
+ "all_utilized_sentence_keys": [string]
242
+ }}
243
+ '''
244
+ The relevance_explanation field is a string explaining which documents
245
+ contain useful information for answering the question. Provide a step-by-step
246
+ breakdown of information provided in the documents and how it is useful for
247
+ answering the question.
248
+ The all_relevant_sentence_keys field is a list of all document sentence keys
249
+ (e.g. ’0a’) that are relevant to the question. Include every sentence that is
250
+ useful and relevant to the question, even if it was not used in the response,
251
+ or if only parts of the sentence are useful. Ignore the provided response when
252
+ making this judgement and base your judgement solely on the provided documents
253
+ and question. Omit sentences that, if removed from the document, would not
254
+ impact someone’s ability to answer the question.
255
+ The overall_supported_explanation field is a string explaining why the response
256
+ *as a whole* is or is not supported by the documents. In this field, provide a
257
+ step-by-step breakdown of the claims made in the response and the support (or
258
+ lack thereof) for those claims in the documents. Begin by assessing each claim
259
+ separately, one by one; don’t make any remarks about the response as a whole
260
+ until you have assessed all the claims in isolation.
261
+ The overall_supported field is a boolean indicating whether the response as a
262
+ whole is supported by the documents. This value should reflect the conclusion
263
+ you drew at the end of your step-by-step breakdown in overall_supported_explanation.
264
+ In the sentence_support_information field, provide information about the support
265
+ *for each sentence* in the response.
266
+ The sentence_support_information field is a list of objects, one for each sentence
267
+ in the response. Each object MUST have the following fields:
268
+ - response_sentence_key: a string identifying the sentence in the response.
269
+ This key is the same as the one used in the response above.
270
+ - explanation: a string explaining why the sentence is or is not supported by the
271
+ documents.
272
+ - supporting_sentence_keys: keys (e.g. ’0a’) of sentences from the documents that
273
+ support the response sentence. If the sentence is not supported, this list MUST
274
+ be empty. If the sentence is supported, this list MUST contain one or more keys.
275
+ In special cases where the sentence is supported, but not by any specific sentence,
276
+ you can use the string "supported_without_sentence" to indicate that the sentence
277
+ is generally supported by the documents. Consider cases where the sentence is
278
+ expressing inability to answer the question due to lack of relevant information in
279
+ the provided context as "supported_without_sentence". In cases where the sentence
280
+ is making a general statement (e.g. outlining the steps to produce an answer, or
281
+ summarizing previously stated sentences, or a transition sentence), use the
282
+ sting "general".In cases where the sentence is correctly stating a well-known fact,
283
+ like a mathematical formula, use the string "well_known_fact". In cases where the
284
+ sentence is performing numerical reasoning (e.g. addition, multiplication), use
285
+ the string "numerical_reasoning".
286
+ - fully_supported: a boolean indicating whether the sentence is fully supported by
287
+ the documents.
288
+ - This value should reflect the conclusion you drew at the end of your step-by-step
289
+ breakdown in explanation.
290
+ - If supporting_sentence_keys is an empty list, then fully_supported must be false.
291
+
292
+ - Otherwise, use fully_supported to clarify whether everything in the response
293
+ sentence is fully supported by the document text indicated in supporting_sentence_keys
294
+ (fully_supported = true), or whether the sentence is only partially or incompletely
295
+ supported by that document text (fully_supported = false).
296
+ The all_utilized_sentence_keys field is a list of all sentence keys (e.g. ’0a’) that
297
+ were used to construct the answer. Include every sentence that either directly supported
298
+ the answer, or was implicitly used to construct the answer, even if it was not used
299
+ in its entirety. Omit sentences that were not used, and could have been removed from
300
+ the documents without affecting the answer.
301
+ You must respond with a valid JSON string. Use escapes for quotes, e.g. `\\"`, and
302
+ newlines, e.g. `\\n`. Do not write anything before or after the JSON string. Do not
303
+ wrap the JSON string in backticks like ``` or ```json.
304
+ As a reminder: your task is to review the response and assess which documents contain
305
+ useful information pertaining to the question, and how each sentence in the response
306
+ is supported by the text in the documents.\
307
+ """
308
+
309
+ # Step 3: Call the LLM
310
+ chat_completion = client.chat.completions.create(
311
+ messages=[
312
+ {"role": "user", "content": prompt}
313
+ ],
314
+ model="meta-llama/llama-4-maverick-17b-128e-instruct", #deepseek-r1-distill-llama-70b llama3-70b-8192 meta-llama/llama-4-maverick-17b-128e-instruct
315
+ )
316
+
317
+ return documents_formatted,chat_completion.choices[0].message.content.strip()
318
+
319
+ '''chat_completion = openai.chat.completions.create(
320
+ messages=[
321
+ {"role":"user",
322
+ "content":prompt}
323
+ ],
324
+ model="gpt-4o",
325
+ max_tokens=1024,
326
+
327
+ )
328
+ return documents_formatted,chat_completion.choices[0].message.content'''
329
+
330
+ def extract_retrieved_sentence_keys(document_text: str) -> list[str]:
331
+ """
332
+ Extracts sentence keys like '0a.', '0b.', etc. from a formatted document string.
333
+
334
+ Parameters:
335
+ - document_text (str): full text of document with sentence keys
336
+
337
+ Returns:
338
+ - List of unique sentence keys in the order they appear
339
+ """
340
+ # Match pattern like 0a., 0b., 0z., 0{., 0|., etc.
341
+ pattern = r'\b0[\w\{\|\}~€‚]\.'
342
+
343
+ matches = re.findall(pattern, document_text)
344
+ return list(dict.fromkeys(matches)) # Removes duplicates while preserving order
345
+
346
+ def compute_ragbench_metrics(judge_response: dict, retrieved_sentence_keys: list[str]) -> dict:
347
+ """
348
+ Computes RAGBench-style metrics from Judge LLM response.
349
+
350
+ Parameters:
351
+ - judge_response (dict): JSON response from Judge LLM
352
+ - retrieved_sentence_keys (list of str): all sentence keys from the retrieved documents
353
+
354
+ Returns:
355
+ - Dictionary with Context Relevance, Context Utilization, Completeness, and Adherence
356
+ """
357
+
358
+ R = set(judge_response.get("all_relevant_sentence_keys", [])) # Relevant sentences
359
+ U = set(judge_response.get("all_utilized_sentence_keys", [])) # Utilized sentences
360
+ intersection_RU = R & U
361
+
362
+ total_retrieved = len(retrieved_sentence_keys)
363
+ len_R = len(R)
364
+ len_U = len(U)
365
+ len_intersection = len(intersection_RU)
366
+
367
+ # Context Relevance: fraction of retrieved context that is relevant
368
+ context_relevance = len_R / total_retrieved if total_retrieved else 0.0
369
+
370
+ # Context Utilization: fraction of retrieved context that was used
371
+ context_utilization = len_U / total_retrieved if total_retrieved else 0.0
372
+
373
+ # Completeness: fraction of relevant content that was used
374
+ completeness = len_intersection / len_R if len_R else 0.0
375
+
376
+ # Adherence: 1 if all response sentences are fully supported, else 0
377
+ is_fully_supported = all(s.get("fully_supported", False)
378
+ for s in judge_response.get("sentence_support_information", []))
379
+ adherence = 1.0 if is_fully_supported and judge_response.get("overall_supported", False) else 0.0
380
+
381
+ return {
382
+ "Context Relevance": round(context_relevance, 4),
383
+ "Context Utilization": round(context_utilization, 4),
384
+ "Completeness": round(completeness, 4),
385
+ "Adherence": adherence
386
+ }
387
+
388
+
389
+ def compute_rmse(gt, pred):
390
+ return round(np.sqrt(np.mean((np.array(gt) - np.array(pred)) ** 2)), 4)
391
+
392
+
393
+ def evaluate_rag_pipeline(q_indices):
394
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
395
+
396
+ def safe_append(gt_list, pred_list, gt_val, pred_val):
397
+ if gt_val is not None and pred_val is not None:
398
+ gt_list.append(gt_val)
399
+ pred_list.append(pred_val)
400
+
401
+ def clean_and_parse_json_block(text):
402
+ # Strip markdown-style code block if present
403
+ #text = text.strip().strip("`").strip()
404
+ code_block_match = re.search(r"```(?:json)?\s*([\s\S]*?)\s*```", text)
405
+ if code_block_match:
406
+ text = code_block_match.group(1).strip()
407
+
408
+ # Remove invalid/control characters that break decoding
409
+ text = re.sub(r"[^\x20-\x7E\n\t]", "", text)
410
+
411
+ try:
412
+ return json.loads(text)
413
+ except json.JSONDecodeError as e:
414
+ print("❌ JSON Decode Error:", e)
415
+ print("⚠️ Cleaned text:\n", text)
416
+ raise
417
+
418
+
419
+ gt_relevance, pred_relevance = [], []
420
+ gt_utilization, pred_utilization = [], []
421
+ gt_completeness, pred_completeness = [], []
422
+ gt_adherence, pred_adherence = [], []
423
+
424
+ for i in q_indices:
425
+ query = dataset[i]['question']
426
+ print(f"\n\n\nQuery:{i}.{query}\n====================================================================")
427
+ documents_formatted, response = judge_response_rag(
428
+ query, embedder="nlpaueb/legal-bert-base-uncased", domain="legal")
429
+ judge_response = clean_and_parse_json_block(response)
430
+ print(f"\n======================================================================\nResponse:{judge_response}")
431
+ retrieved_sentences = extract_retrieved_sentence_keys(documents_formatted)
432
+ predicted = compute_ragbench_metrics(judge_response, retrieved_sentences)
433
+
434
+ # GT values
435
+ gt_r = dataset[i].get('relevance_score')
436
+ gt_u = dataset[i].get('utilization_score')
437
+ gt_c = dataset[i].get('completeness_score')
438
+ gt_a = dataset[i].get('gpt3_adherence')
439
+
440
+ safe_append(gt_relevance, pred_relevance, gt_r, predicted['Context Relevance'])
441
+ safe_append(gt_utilization, pred_utilization, gt_u, predicted['Context Utilization'])
442
+ safe_append(gt_completeness, pred_completeness, gt_c, predicted['Completeness'])
443
+ if gt_a is not None and predicted['Adherence'] is not None:
444
+ safe_append(gt_adherence, pred_adherence, int(gt_a), int(predicted['Adherence']))
445
+
446
+ def compute_rmse(gt, pred):
447
+ return round(np.sqrt(np.mean((np.array(gt) - np.array(pred)) ** 2)), 4)
448
+
449
+ result = {
450
+ "Context Relevance": compute_rmse(gt_relevance, pred_relevance),
451
+ "Context Utilization": compute_rmse(gt_utilization, pred_utilization),
452
+ "Completeness": compute_rmse(gt_completeness, pred_completeness),
453
+ }
454
+
455
+ if len(set(gt_adherence)) == 2:
456
+ result["Adherence"] = compute_rmse(gt_adherence, pred_adherence)
457
+ result["AUC-ROC (Adherence)"] = round(roc_auc_score(gt_adherence, pred_adherence), 4)
458
+ else:
459
+ result["Adherence"] = compute_rmse(gt_adherence, pred_adherence)
460
+ result["AUC-ROC (Adherence)"] = "N/A - one class only"
461
+
462
+ return result
463
+
464
+
465
+ # Wrapper to parse textbox input into list of ints
466
+ def evaluate_rag_gradio(q_indices_str):
467
+ # Capture printed logs
468
+ log_stream = io.StringIO()
469
+ sys.stdout = log_stream
470
+
471
+ try:
472
+ q_indices = [int(x.strip()) for x in q_indices_str.split(",") if x.strip().isdigit()]
473
+ results = evaluate_rag_pipeline(q_indices)
474
+
475
+ # Return metrics and logs
476
+ logs = log_stream.getvalue()
477
+ return results, logs
478
+
479
+ except Exception as e:
480
+ traceback.print_exc()
481
+ return {"error": str(e)}, log_stream.getvalue()
482
+
483
+ finally:
484
+ sys.stdout = sys.__stdout__
485
+
486
+ iface = gr.Interface(
487
+ fn=evaluate_rag_gradio,
488
+ inputs=gr.Textbox(label="Comma-separated Query Indices (e.g. 89,121,245)", lines=1),
489
+ outputs=[
490
+ gr.JSON(label="Evaluation Metrics (RMSE & AUC-ROC)"),
491
+ gr.Textbox(label="Execution Log", lines=5, interactive=True)
492
+ ],
493
+ title="RAG Evaluation Dashboard",
494
+ description="Evaluate your RAG pipeline across selected queries using GPT-based generation and judgment."
495
+ )
496
+
497
+ iface.launch(debug=True)
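compute_ragbench_metrics above reduces the judge output to four ratios: Context Relevance = |R| / retrieved, Context Utilization = |U| / retrieved, Completeness = |R ∩ U| / |R|, and a 0/1 Adherence flag. A small worked example with hypothetical sentence keys (four retrieved sentences 0a-0d; values are illustrative only):

# Hypothetical judge response used only to illustrate the ratios.
judge_response = {
    "all_relevant_sentence_keys": ["0a", "0b"],   # R: relevant to the question
    "all_utilized_sentence_keys": ["0a", "0c"],   # U: used to build the answer
    "overall_supported": True,
    "sentence_support_information": [{"response_sentence_key": "a", "fully_supported": True}],
}
retrieved = ["0a.", "0b.", "0c.", "0d."]

R = set(judge_response["all_relevant_sentence_keys"])
U = set(judge_response["all_utilized_sentence_keys"])
print(len(R) / len(retrieved))  # Context Relevance   = 0.5
print(len(U) / len(retrieved))  # Context Utilization = 0.5
print(len(R & U) / len(R))      # Completeness        = 0.5
# Adherence would be 1.0 here: every response sentence is fully supported and overall_supported is True.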
cs_dataset/data-00000-of-00001.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c81fcd283298c766efceed51005f94977eb042565a6d6e32a141af3516eddab
3
+ size 88920
cs_dataset/dataset_info.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "citation": "",
3
+ "description": "",
4
+ "features": {
5
+ "text": {
6
+ "dtype": "string",
7
+ "_type": "Value"
8
+ }
9
+ },
10
+ "homepage": "",
11
+ "license": ""
12
+ }
cs_dataset/state.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_data_files": [
3
+ {
4
+ "filename": "data-00000-of-00001.arrow"
5
+ }
6
+ ],
7
+ "_fingerprint": "ec44a3721c635a27",
8
+ "_format_columns": null,
9
+ "_format_kwargs": {},
10
+ "_format_type": null,
11
+ "_output_all_columns": false,
12
+ "_split": null
13
+ }
cs_index/faiss.index ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:781715fcdd5abaccf46e7df9e34cb8fe08cefa3f47fc4381c1530e83ad3d3cb6
3
+ size 370221
fin_dataset/data-00000-of-00001.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7b6bb5e029518500e5764893d7267aa86d93e7a0e8ceae7969c371f17b42e3fc
3
+ size 1504056
fin_dataset/dataset_info.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "citation": "",
3
+ "description": "",
4
+ "features": {
5
+ "text": {
6
+ "dtype": "string",
7
+ "_type": "Value"
8
+ }
9
+ },
10
+ "homepage": "",
11
+ "license": ""
12
+ }
fin_dataset/state.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_data_files": [
3
+ {
4
+ "filename": "data-00000-of-00001.arrow"
5
+ }
6
+ ],
7
+ "_fingerprint": "fe416e18cf3f19d0",
8
+ "_format_columns": null,
9
+ "_format_kwargs": {},
10
+ "_format_type": null,
11
+ "_output_all_columns": false,
12
+ "_split": null
13
+ }
fin_index/faiss.index ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fdb1e231b83001723586362b682f80487689ad9bb208a1c8dea3bade5d004cbd
3
+ size 6039597
gk_dataset/data-00000-of-00001.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2ff3d603dd8f386d4f55dfb696d9a486e29e2c948c7e4cb03291b3f1185e671
3
+ size 777424
gk_dataset/dataset_info.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "citation": "",
3
+ "description": "",
4
+ "features": {
5
+ "text": {
6
+ "dtype": "string",
7
+ "_type": "Value"
8
+ }
9
+ },
10
+ "homepage": "",
11
+ "license": ""
12
+ }
gk_dataset/state.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_data_files": [
3
+ {
4
+ "filename": "data-00000-of-00001.arrow"
5
+ }
6
+ ],
7
+ "_fingerprint": "00d8c8388a8ac73c",
8
+ "_format_columns": null,
9
+ "_format_kwargs": {},
10
+ "_format_type": null,
11
+ "_output_all_columns": false,
12
+ "_split": null
13
+ }
gk_index/faiss.index ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0c22862009c798b27b235f0af137915a98fad631649735dbf19a467e3f896be6
3
+ size 3526701
legal_dataset/data-00000-of-00001.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:85ff687742e949901491ed90e9babf78ced5c7dcd3a910986ab25bb5f390072b
3
+ size 4926576
legal_dataset/dataset_info.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "citation": "",
3
+ "description": "",
4
+ "features": {
5
+ "text": {
6
+ "dtype": "string",
7
+ "_type": "Value"
8
+ }
9
+ },
10
+ "homepage": "",
11
+ "license": ""
12
+ }
legal_dataset/state.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_data_files": [
3
+ {
4
+ "filename": "data-00000-of-00001.arrow"
5
+ }
6
+ ],
7
+ "_fingerprint": "f6b83182f5e3cfa3",
8
+ "_format_columns": null,
9
+ "_format_kwargs": {},
10
+ "_format_type": null,
11
+ "_output_all_columns": false,
12
+ "_split": null
13
+ }
legal_index/faiss.index ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:64c4157c3617e605acc8464d3cf1da1ea321c07a6a1bdb6c2675edec41d3a0ba
3
+ size 7978029
med_dataset/data-00000-of-00001.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eb7eaae975a43389ed3ccd143dfcfca1e61ad094e3064ec477f36f9cd47d11ad
3
+ size 2245824
med_dataset/dataset_info.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "citation": "",
3
+ "description": "",
4
+ "features": {
5
+ "text": {
6
+ "dtype": "string",
7
+ "_type": "Value"
8
+ }
9
+ },
10
+ "homepage": "",
11
+ "license": ""
12
+ }
med_dataset/state.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_data_files": [
3
+ {
4
+ "filename": "data-00000-of-00001.arrow"
5
+ }
6
+ ],
7
+ "_fingerprint": "e0ae8ccbcca935ea",
8
+ "_format_columns": null,
9
+ "_format_kwargs": {},
10
+ "_format_type": null,
11
+ "_output_all_columns": false,
12
+ "_split": null
13
+ }
med_index/faiss.index ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:678041534c1d3641eaaed36f5efca24094762a1454eb6bdd413c2973b94c5dff
3
+ size 11473965
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ gradio
2
+ transformers
3
+ sentence-transformers
4
+ faiss-cpu
5
+ torch
6
+ datasets
7
+ scikit-learn
8
+ groq
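With these dependencies installed (numpy arrives transitively; re, io and sys are standard library), the one thing the list cannot supply is the Groq credential. The client in the scripts should read it from the environment rather than from a hardcoded string; a minimal sketch, assuming a GROQ_API_KEY environment variable (set it in the shell locally, or as a Space secret of the same name):

import os
from groq import Groq

api_key = os.environ.get("GROQ_API_KEY")  # assumed variable name; set it before launching app.py
if not api_key:
    raise RuntimeError("GROQ_API_KEY is not set")
client = Groq(api_key=api_key)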