Spaces:
Build error
Build error
update example and solve installation issues for Spaces
Browse files
examples/fine-tune-modernbert-rag.ipynb
CHANGED
|
@@ -8,7 +8,7 @@
|
|
| 8 |
"\n",
|
| 9 |
"This notebook demonstrates the fine-tuning process of `modernbert-embed-base` using synthetic data tailored for the Retrieval-Augmented Generation (RAG) model.\n",
|
| 10 |
"\n",
|
| 11 |
-
"It provides a complete walkthrough of the fine-tuning process after generating synthetic data using the Synthetic Data Generator. For a comprehensive explanation of the methodology and additional details, refer to the blog post: [Fine-tune ModernBERT with Synthetic Data](https://huggingface.co/blog/fine-tune-modernbert-for-rag-with-synthetic-data)."
|
| 12 |
]
|
| 13 |
},
|
| 14 |
{
|
|
@@ -47,7 +47,7 @@
|
|
| 47 |
},
|
| 48 |
{
|
| 49 |
"cell_type": "code",
|
| 50 |
-
"execution_count":
|
| 51 |
"metadata": {},
|
| 52 |
"outputs": [],
|
| 53 |
"source": [
|
|
@@ -97,7 +97,7 @@
|
|
| 97 |
},
|
| 98 |
{
|
| 99 |
"cell_type": "code",
|
| 100 |
-
"execution_count":
|
| 101 |
"metadata": {},
|
| 102 |
"outputs": [],
|
| 103 |
"source": [
|
|
@@ -140,7 +140,7 @@
|
|
| 140 |
},
|
| 141 |
{
|
| 142 |
"cell_type": "code",
|
| 143 |
-
"execution_count":
|
| 144 |
"metadata": {},
|
| 145 |
"outputs": [
|
| 146 |
{
|
|
@@ -152,7 +152,7 @@
|
|
| 152 |
"})"
|
| 153 |
]
|
| 154 |
},
|
| 155 |
-
"execution_count":
|
| 156 |
"metadata": {},
|
| 157 |
"output_type": "execute_result"
|
| 158 |
}
|
|
@@ -531,7 +531,7 @@
|
|
| 531 |
"# Remember to adjust the training arguments according to your requirements\n",
|
| 532 |
"\n",
|
| 533 |
"trainer = SentenceTransformerTrainer(\n",
|
| 534 |
-
" model=
|
| 535 |
" args=training_args,\n",
|
| 536 |
" train_dataset=dataset_rag_biencoder[\"train\"],\n",
|
| 537 |
" eval_dataset=dataset_rag_biencoder[\"eval\"],\n",
|
|
@@ -701,7 +701,7 @@
|
|
| 701 |
},
|
| 702 |
{
|
| 703 |
"cell_type": "code",
|
| 704 |
-
"execution_count":
|
| 705 |
"metadata": {},
|
| 706 |
"outputs": [],
|
| 707 |
"source": [
|
|
@@ -710,7 +710,7 @@
|
|
| 710 |
"\n",
|
| 711 |
"df = combined_rag_dataset.to_pandas()\n",
|
| 712 |
"df = df.drop_duplicates(subset=[\"context\"]) # drop duplicates based on \"context\" column\n",
|
| 713 |
-
"
|
| 714 |
"dataset = Dataset.from_pandas(df)\n",
|
| 715 |
"\n",
|
| 716 |
"docs = [Document(content=doc[\"context\"]) for doc in dataset]"
|
|
|
|
| 8 |
"\n",
|
| 9 |
"This notebook demonstrates the fine-tuning process of `modernbert-embed-base` using synthetic data tailored for the Retrieval-Augmented Generation (RAG) model.\n",
|
| 10 |
"\n",
|
| 11 |
+
"It provides a complete walkthrough of the fine-tuning process after generating synthetic data using the Synthetic Data Generator. For a comprehensive explanation of the methodology and additional details, refer to the blog post: [Fine-tune ModernBERT for RAG with Synthetic Data](https://huggingface.co/blog/fine-tune-modernbert-for-rag-with-synthetic-data)."
|
| 12 |
]
|
| 13 |
},
|
| 14 |
{
|
|
|
|
| 47 |
},
|
| 48 |
{
|
| 49 |
"cell_type": "code",
|
| 50 |
+
"execution_count": 1,
|
| 51 |
"metadata": {},
|
| 52 |
"outputs": [],
|
| 53 |
"source": [
|
|
|
|
| 97 |
},
|
| 98 |
{
|
| 99 |
"cell_type": "code",
|
| 100 |
+
"execution_count": 2,
|
| 101 |
"metadata": {},
|
| 102 |
"outputs": [],
|
| 103 |
"source": [
|
|
|
|
| 140 |
},
|
| 141 |
{
|
| 142 |
"cell_type": "code",
|
| 143 |
+
"execution_count": 3,
|
| 144 |
"metadata": {},
|
| 145 |
"outputs": [
|
| 146 |
{
|
|
|
|
| 152 |
"})"
|
| 153 |
]
|
| 154 |
},
|
| 155 |
+
"execution_count": 3,
|
| 156 |
"metadata": {},
|
| 157 |
"output_type": "execute_result"
|
| 158 |
}
|
|
|
|
| 531 |
"# Remember to adjust the training arguments according to your requirements\n",
|
| 532 |
"\n",
|
| 533 |
"trainer = SentenceTransformerTrainer(\n",
|
| 534 |
+
" model=model_biencoder,\n",
|
| 535 |
" args=training_args,\n",
|
| 536 |
" train_dataset=dataset_rag_biencoder[\"train\"],\n",
|
| 537 |
" eval_dataset=dataset_rag_biencoder[\"eval\"],\n",
|
|
|
|
| 701 |
},
|
| 702 |
{
|
| 703 |
"cell_type": "code",
|
| 704 |
+
"execution_count": 4,
|
| 705 |
"metadata": {},
|
| 706 |
"outputs": [],
|
| 707 |
"source": [
|
|
|
|
| 710 |
"\n",
|
| 711 |
"df = combined_rag_dataset.to_pandas()\n",
|
| 712 |
"df = df.drop_duplicates(subset=[\"context\"]) # drop duplicates based on \"context\" column\n",
|
| 713 |
+
"df = df.sample(n=10, random_state=42) # optional: sample a subset of the dataset\n",
|
| 714 |
"dataset = Dataset.from_pandas(df)\n",
|
| 715 |
"\n",
|
| 716 |
"docs = [Document(content=doc[\"context\"]) for doc in dataset]"
|
requirements.txt
CHANGED
|
@@ -1 +1,2 @@
|
|
| 1 |
-e git+https://github.com/argilla-io/synthetic-data-generator.git#egg=synthetic-dataset-generator
|
|
|
|
|
|
| 1 |
-e git+https://github.com/argilla-io/synthetic-data-generator.git#egg=synthetic-dataset-generator
|
| 2 |
+
# NOTE(review): '!apt-get install -y poppler-utils' is IPython shell-magic syntax, not valid pip
# requirements syntax — pip will fail on this line (likely the "Build error" above). On Hugging Face
# Spaces, declare the system package by adding a line 'poppler-utils' to packages.txt instead.
|
src/synthetic_dataset_generator/apps/rag.py
CHANGED
|
@@ -5,6 +5,7 @@ from typing import Union
|
|
| 5 |
|
| 6 |
import argilla as rg
|
| 7 |
import gradio as gr
|
|
|
|
| 8 |
import pandas as pd
|
| 9 |
from datasets import (
|
| 10 |
Dataset,
|
|
@@ -50,7 +51,7 @@ from synthetic_dataset_generator.utils import (
|
|
| 50 |
get_random_repo_name,
|
| 51 |
swap_visibility,
|
| 52 |
)
|
| 53 |
-
|
| 54 |
|
| 55 |
def _get_valid_columns(dataframe: pd.DataFrame):
|
| 56 |
doc_valid_columns = []
|
|
|
|
| 5 |
|
| 6 |
import argilla as rg
|
| 7 |
import gradio as gr
|
| 8 |
+
import nltk
|
| 9 |
import pandas as pd
|
| 10 |
from datasets import (
|
| 11 |
Dataset,
|
|
|
|
| 51 |
get_random_repo_name,
|
| 52 |
swap_visibility,
|
| 53 |
)
|
| 54 |
+
nltk.download("punkt_tab")
|
| 55 |
|
| 56 |
def _get_valid_columns(dataframe: pd.DataFrame):
|
| 57 |
doc_valid_columns = []
|