Update app.py
app.py CHANGED
@@ -3,10 +3,12 @@ import os
 from typing import List, Dict
 from ragas import evaluate
 from ragas.metrics import (
-
-
-
-
+    answer_relevancy,
+    faithfulness,
+    context_recall,
+    context_precision,
+    answer_correctness,
+    answer_similarity
 )
 from datasets import load_dataset
 from langchain.text_splitter import (
@@ -87,12 +89,11 @@ def load_evaluation_dataset():
     dataset = load_dataset("explodinggradients/fiqa", split="test")
     return dataset

-def
+def prepare_ragas_dataset(qa_chain, dataset):
     # Sample a few examples for evaluation
     eval_samples = dataset.select(range(5))

-
-    eval_data = []
+    ragas_dataset = []
     for sample in eval_samples:
         question = sample["question"]

@@ -102,33 +103,39 @@ def evaluate_rag_pipeline(qa_chain, dataset):
             "chat_history": []
         })

-
+        ragas_dataset.append({
             "question": question,
             "answer": response["answer"],
-            "
-            "
+            "contexts": [doc.page_content for doc in response["source_documents"]],
+            "ground_truth": sample["answer"]
         })

-
-
-
-
-        Faithfulness(),
-        AnswerRelevancy()
-    ]
+    return ragas_dataset
+
+def evaluate_rag_pipeline(qa_chain, dataset):
+    ragas_dataset = prepare_ragas_dataset(qa_chain, dataset)

-    # Run evaluation
+    # Run RAGAS evaluation
     results = evaluate(
-
-        metrics=
+        ragas_dataset,
+        metrics=[
+            context_precision,
+            faithfulness,
+            answer_relevancy,
+            context_recall,
+            answer_correctness,
+            answer_similarity
+        ]
     )

-    # Convert results to dictionary
+    # Convert results to a dictionary
     return {
-        "
-        "context_relevancy": float(results["context_relevancy"]),
+        "context_precision": float(results["context_precision"]),
         "faithfulness": float(results["faithfulness"]),
-        "answer_relevancy": float(results["answer_relevancy"])
+        "answer_relevancy": float(results["answer_relevancy"]),
+        "context_recall": float(results["context_recall"]),
+        "answer_correctness": float(results["answer_correctness"]),
+        "answer_similarity": float(results["answer_similarity"])
     }

 # Initialize langchain LLM chain
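
For context, a minimal usage sketch of the evaluation path added in this commit (an illustration, not part of the diff): it assumes the qa_chain initialized later in app.py returns "answer" and "source_documents", reuses prepare_ragas_dataset() from this commit, and wraps the collected records in a datasets.Dataset via Dataset.from_list(), since ragas' evaluate() generally expects a Dataset rather than a plain Python list. The wrapper name run_ragas_evaluation is hypothetical.

# Illustrative sketch only -- not part of the commit above.
# Assumes: `qa_chain` is the chain initialized later in app.py, and
# prepare_ragas_dataset() plus the metric imports come from the updated app.py.
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
)

def run_ragas_evaluation(qa_chain, fiqa_dataset):
    # Collect question / answer / contexts / ground_truth records with the
    # helper added in this commit.
    records = prepare_ragas_dataset(qa_chain, fiqa_dataset)

    # ragas' evaluate() generally expects a datasets.Dataset, so wrap the
    # list of dicts before scoring.
    ragas_dataset = Dataset.from_list(records)

    # The default ragas metrics call an LLM and an embedding model (OpenAI by
    # default), so the usual OPENAI_API_KEY environment variable must be set.
    results = evaluate(
        ragas_dataset,
        metrics=[context_precision, faithfulness, answer_relevancy, context_recall],
    )

    return {
        "context_precision": float(results["context_precision"]),
        "faithfulness": float(results["faithfulness"]),
        "answer_relevancy": float(results["answer_relevancy"]),
        "context_recall": float(results["context_recall"]),
    }

Depending on the installed ragas release, the ground-truth column may need to be named ground_truths (a list of strings) rather than ground_truth, so it is worth checking the record format against the version pinned in the Space's requirements.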