Spaces:
Build error
Build error
update example and solve installation issues for Spaces
Browse files
examples/fine-tune-modernbert-rag.ipynb
CHANGED
|
@@ -8,7 +8,7 @@
|
|
| 8 |
"\n",
|
| 9 |
"This notebook demonstrates the fine-tuning process of `modernbert-embed-base` using synthetic data tailored for the Retrieval-Augmented Generation (RAG) model.\n",
|
| 10 |
"\n",
|
| 11 |
-
"It provides a complete walkthrough of the fine-tuning process after generating synthetic data using the Synthetic Data Generator. For a comprehensive explanation of the methodology and additional details, refer to the blog post: [Fine-tune ModernBERT with Synthetic Data](https://huggingface.co/blog/fine-tune-modernbert-for-rag-with-synthetic-data)."
|
| 12 |
]
|
| 13 |
},
|
| 14 |
{
|
|
@@ -47,7 +47,7 @@
|
|
| 47 |
},
|
| 48 |
{
|
| 49 |
"cell_type": "code",
|
| 50 |
-
"execution_count":
|
| 51 |
"metadata": {},
|
| 52 |
"outputs": [],
|
| 53 |
"source": [
|
|
@@ -97,7 +97,7 @@
|
|
| 97 |
},
|
| 98 |
{
|
| 99 |
"cell_type": "code",
|
| 100 |
-
"execution_count":
|
| 101 |
"metadata": {},
|
| 102 |
"outputs": [],
|
| 103 |
"source": [
|
|
@@ -140,7 +140,7 @@
|
|
| 140 |
},
|
| 141 |
{
|
| 142 |
"cell_type": "code",
|
| 143 |
-
"execution_count":
|
| 144 |
"metadata": {},
|
| 145 |
"outputs": [
|
| 146 |
{
|
|
@@ -152,7 +152,7 @@
|
|
| 152 |
"})"
|
| 153 |
]
|
| 154 |
},
|
| 155 |
-
"execution_count":
|
| 156 |
"metadata": {},
|
| 157 |
"output_type": "execute_result"
|
| 158 |
}
|
|
@@ -531,7 +531,7 @@
|
|
| 531 |
"# Remember to adjust the training arguments according to your requirements\n",
|
| 532 |
"\n",
|
| 533 |
"trainer = SentenceTransformerTrainer(\n",
|
| 534 |
-
" model=
|
| 535 |
" args=training_args,\n",
|
| 536 |
" train_dataset=dataset_rag_biencoder[\"train\"],\n",
|
| 537 |
" eval_dataset=dataset_rag_biencoder[\"eval\"],\n",
|
|
@@ -701,7 +701,7 @@
|
|
| 701 |
},
|
| 702 |
{
|
| 703 |
"cell_type": "code",
|
| 704 |
-
"execution_count":
|
| 705 |
"metadata": {},
|
| 706 |
"outputs": [],
|
| 707 |
"source": [
|
|
@@ -710,7 +710,7 @@
|
|
| 710 |
"\n",
|
| 711 |
"df = combined_rag_dataset.to_pandas()\n",
|
| 712 |
"df = df.drop_duplicates(subset=[\"context\"]) # drop duplicates based on \"context\" column\n",
|
| 713 |
-
"
|
| 714 |
"dataset = Dataset.from_pandas(df)\n",
|
| 715 |
"\n",
|
| 716 |
"docs = [Document(content=doc[\"context\"]) for doc in dataset]"
|
|
|
|
| 8 |
"\n",
|
| 9 |
"This notebook demonstrates the fine-tuning process of `modernbert-embed-base` using synthetic data tailored for the Retrieval-Augmented Generation (RAG) model.\n",
|
| 10 |
"\n",
|
| 11 |
+
"It provides a complete walkthrough of the fine-tuning process after generating synthetic data using the Synthetic Data Generator. For a comprehensive explanation of the methodology and additional details, refer to the blog post: [Fine-tune ModernBERT for RAG with Synthetic Data](https://huggingface.co/blog/fine-tune-modernbert-for-rag-with-synthetic-data)."
|
| 12 |
]
|
| 13 |
},
|
| 14 |
{
|
|
|
|
| 47 |
},
|
| 48 |
{
|
| 49 |
"cell_type": "code",
|
| 50 |
+
"execution_count": 1,
|
| 51 |
"metadata": {},
|
| 52 |
"outputs": [],
|
| 53 |
"source": [
|
|
|
|
| 97 |
},
|
| 98 |
{
|
| 99 |
"cell_type": "code",
|
| 100 |
+
"execution_count": 2,
|
| 101 |
"metadata": {},
|
| 102 |
"outputs": [],
|
| 103 |
"source": [
|
|
|
|
| 140 |
},
|
| 141 |
{
|
| 142 |
"cell_type": "code",
|
| 143 |
+
"execution_count": 3,
|
| 144 |
"metadata": {},
|
| 145 |
"outputs": [
|
| 146 |
{
|
|
|
|
| 152 |
"})"
|
| 153 |
]
|
| 154 |
},
|
| 155 |
+
"execution_count": 3,
|
| 156 |
"metadata": {},
|
| 157 |
"output_type": "execute_result"
|
| 158 |
}
|
|
|
|
| 531 |
"# Remember to adjust the training arguments according to your requirements\n",
|
| 532 |
"\n",
|
| 533 |
"trainer = SentenceTransformerTrainer(\n",
|
| 534 |
+
" model=model_biencoder,\n",
|
| 535 |
" args=training_args,\n",
|
| 536 |
" train_dataset=dataset_rag_biencoder[\"train\"],\n",
|
| 537 |
" eval_dataset=dataset_rag_biencoder[\"eval\"],\n",
|
|
|
|
| 701 |
},
|
| 702 |
{
|
| 703 |
"cell_type": "code",
|
| 704 |
+
"execution_count": 4,
|
| 705 |
"metadata": {},
|
| 706 |
"outputs": [],
|
| 707 |
"source": [
|
|
|
|
| 710 |
"\n",
|
| 711 |
"df = combined_rag_dataset.to_pandas()\n",
|
| 712 |
"df = df.drop_duplicates(subset=[\"context\"]) # drop duplicates based on \"context\" column\n",
|
| 713 |
+
"df = df.sample(n=10, random_state=42) # optional: sample a subset of the dataset\n",
|
| 714 |
"dataset = Dataset.from_pandas(df)\n",
|
| 715 |
"\n",
|
| 716 |
"docs = [Document(content=doc[\"context\"]) for doc in dataset]"
|
requirements.txt
CHANGED
|
@@ -1 +1,2 @@
|
|
| 1 |
-e git+https://github.com/argilla-io/synthetic-data-generator.git#egg=synthetic-dataset-generator
|
|
|
|
|
|
| 1 |
-e git+https://github.com/argilla-io/synthetic-data-generator.git#egg=synthetic-dataset-generator
|
| 2 |
+
# NOTE(review): '!apt-get install -y poppler-utils' is IPython shell-magic syntax, not valid pip
# requirements syntax — pip will fail on this line (likely the "Build error" above). On Hugging Face
# Spaces, declare the system package by adding a line 'poppler-utils' to packages.txt instead.
|
src/synthetic_dataset_generator/apps/rag.py
CHANGED
|
@@ -5,6 +5,7 @@ from typing import Union
|
|
| 5 |
|
| 6 |
import argilla as rg
|
| 7 |
import gradio as gr
|
|
|
|
| 8 |
import pandas as pd
|
| 9 |
from datasets import (
|
| 10 |
Dataset,
|
|
@@ -50,7 +51,7 @@ from synthetic_dataset_generator.utils import (
|
|
| 50 |
get_random_repo_name,
|
| 51 |
swap_visibility,
|
| 52 |
)
|
| 53 |
-
|
| 54 |
|
| 55 |
def _get_valid_columns(dataframe: pd.DataFrame):
|
| 56 |
doc_valid_columns = []
|
|
|
|
| 5 |
|
| 6 |
import argilla as rg
|
| 7 |
import gradio as gr
|
| 8 |
+
import nltk
|
| 9 |
import pandas as pd
|
| 10 |
from datasets import (
|
| 11 |
Dataset,
|
|
|
|
| 51 |
get_random_repo_name,
|
| 52 |
swap_visibility,
|
| 53 |
)
|
| 54 |
+
nltk.download("punkt_tab")
|
| 55 |
|
| 56 |
def _get_valid_columns(dataframe: pd.DataFrame):
|
| 57 |
doc_valid_columns = []
|