arjunanand13 committed · verified
Commit 1d4a990 · 1 Parent(s): 1596101

Update app.py

Files changed (1)
  1. app.py +31 -24
app.py CHANGED
@@ -3,10 +3,12 @@ import os
 from typing import List, Dict
 from ragas import evaluate
 from ragas.metrics import (
-    ContextRecall,
-    ContextRelevancy,
-    Faithfulness,
-    AnswerRelevancy
+    answer_relevancy,
+    faithfulness,
+    context_recall,
+    context_precision,
+    answer_correctness,
+    answer_similarity
 )
 from datasets import load_dataset
 from langchain.text_splitter import (
@@ -87,12 +89,11 @@ def load_evaluation_dataset():
     dataset = load_dataset("explodinggradients/fiqa", split="test")
     return dataset
 
-def evaluate_rag_pipeline(qa_chain, dataset):
+def prepare_ragas_dataset(qa_chain, dataset):
     # Sample a few examples for evaluation
     eval_samples = dataset.select(range(5))
 
-    # Prepare data for RAGAS evaluation
-    eval_data = []
+    ragas_dataset = []
     for sample in eval_samples:
         question = sample["question"]
 
@@ -102,33 +103,39 @@ def evaluate_rag_pipeline(qa_chain, dataset):
             "chat_history": []
         })
 
-        eval_data.append({
+        ragas_dataset.append({
             "question": question,
             "answer": response["answer"],
-            "ground_truth": sample["answer"],
-            "contexts": [doc.page_content for doc in response["source_documents"]]
+            "contexts": [doc.page_content for doc in response["source_documents"]],
+            "ground_truth": sample["answer"]
         })
 
-    # Initialize RAGAS metrics
-    metrics = [
-        ContextRecall(),
-        ContextRelevancy(),
-        Faithfulness(),
-        AnswerRelevancy()
-    ]
+    return ragas_dataset
+
+def evaluate_rag_pipeline(qa_chain, dataset):
+    ragas_dataset = prepare_ragas_dataset(qa_chain, dataset)
 
-    # Run evaluation
+    # Run RAGAS evaluation
     results = evaluate(
-        eval_data,
-        metrics=metrics
+        ragas_dataset,
+        metrics=[
+            context_precision,
+            faithfulness,
+            answer_relevancy,
+            context_recall,
+            answer_correctness,
+            answer_similarity
+        ]
     )
 
-    # Convert results to dictionary
+    # Convert results to a dictionary
     return {
-        "context_recall": float(results["context_recall"]),
-        "context_relevancy": float(results["context_relevancy"]),
+        "context_precision": float(results["context_precision"]),
         "faithfulness": float(results["faithfulness"]),
-        "answer_relevancy": float(results["answer_relevancy"])
+        "answer_relevancy": float(results["answer_relevancy"]),
+        "context_recall": float(results["context_recall"]),
+        "answer_correctness": float(results["answer_correctness"]),
+        "answer_similarity": float(results["answer_similarity"])
     }
 
 # Initialize langchain LLM chain
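
Note on the revised evaluation path (an observation, not part of the commit): ragas' evaluate() normally takes a datasets.Dataset with question / answer / contexts / ground_truth columns rather than a plain Python list, so the records built by prepare_ragas_dataset may need to be wrapped before evaluation. A minimal sketch of such a variant, assuming the functions above are in scope and that ragas can reach its default LLM/embeddings through the usual environment configuration; the function name evaluate_rag_pipeline_as_dataset is introduced here only for illustration:

from datasets import Dataset
from ragas import evaluate
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
)

def evaluate_rag_pipeline_as_dataset(qa_chain, dataset):
    # Build the list of {question, answer, contexts, ground_truth} records
    # with the prepare_ragas_dataset helper added in this commit.
    records = prepare_ragas_dataset(qa_chain, dataset)

    # Wrap the list in a datasets.Dataset before handing it to ragas.
    ragas_dataset = Dataset.from_list(records)

    results = evaluate(
        ragas_dataset,
        metrics=[context_precision, faithfulness, answer_relevancy, context_recall],
    )

    # Aggregate scores are readable by metric name, as in the committed code.
    return {name: float(results[name]) for name in
            ("context_precision", "faithfulness", "answer_relevancy", "context_recall")}

The answer_correctness and answer_similarity metrics from the commit can be appended to the metrics list in the same way, and recent ragas versions also expose per-sample scores through results.to_pandas() if a detailed breakdown is wanted.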
 
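A quick, LLM-free smoke test for the new prepare_ragas_dataset helper (again, illustration only: the stub chain, toy dataset, and assertion are invented here and are not in app.py). It relies only on the chain interface the code above uses, namely being called with a dict holding question and chat_history and returning answer plus source_documents whose items carry page_content, and checks that the helper yields records with the four keys ragas expects:

from datasets import Dataset

class FakeDoc:
    # Stands in for a LangChain Document; only .page_content is read.
    def __init__(self, text):
        self.page_content = text

class FakeChain:
    # Mimics the qa_chain call signature relied on in prepare_ragas_dataset.
    def __call__(self, inputs):
        return {
            "answer": f"stub answer to: {inputs['question']}",
            "source_documents": [FakeDoc("stub retrieved passage")],
        }

# Toy dataset with the two columns prepare_ragas_dataset reads.
toy = Dataset.from_dict({
    "question": [f"q{i}" for i in range(5)],
    "answer": [f"reference answer {i}" for i in range(5)],
})

records = prepare_ragas_dataset(FakeChain(), toy)  # helper defined in app.py above
assert set(records[0]) == {"question", "answer", "contexts", "ground_truth"}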