Spaces:
				
			
			
	
			
			
		Build error
		
	
	
	
			
			
	
	
	
	
		
		
		Build error
		
	update example and solve installation issues for Spaces
Browse files
    	
        examples/fine-tune-modernbert-rag.ipynb
    CHANGED
    
    | @@ -8,7 +8,7 @@ | |
| 8 | 
             
                "\n",
         | 
| 9 | 
             
                "This notebook demonstrates the fine-tuning process of `modernbert-embed-base` using synthetic data tailored for the Retrieval-Augmented Generation (RAG) model.\n",
         | 
| 10 | 
             
                "\n",
         | 
| 11 | 
            -
                "It provides a complete walkthrough of the fine-tuning process after generating synthetic data using the Synthetic Data Generator. For a comprehensive explanation of the methodology and additional details, refer to the blog post: [Fine-tune ModernBERT with Synthetic Data | 
| 12 | 
             
               ]
         | 
| 13 | 
             
              },
         | 
| 14 | 
             
              {
         | 
| @@ -47,7 +47,7 @@ | |
| 47 | 
             
              },
         | 
| 48 | 
             
              {
         | 
| 49 | 
             
               "cell_type": "code",
         | 
| 50 | 
            -
               "execution_count":  | 
| 51 | 
             
               "metadata": {},
         | 
| 52 | 
             
               "outputs": [],
         | 
| 53 | 
             
               "source": [
         | 
| @@ -97,7 +97,7 @@ | |
| 97 | 
             
              },
         | 
| 98 | 
             
              {
         | 
| 99 | 
             
               "cell_type": "code",
         | 
| 100 | 
            -
               "execution_count":  | 
| 101 | 
             
               "metadata": {},
         | 
| 102 | 
             
               "outputs": [],
         | 
| 103 | 
             
               "source": [
         | 
| @@ -140,7 +140,7 @@ | |
| 140 | 
             
              },
         | 
| 141 | 
             
              {
         | 
| 142 | 
             
               "cell_type": "code",
         | 
| 143 | 
            -
               "execution_count":  | 
| 144 | 
             
               "metadata": {},
         | 
| 145 | 
             
               "outputs": [
         | 
| 146 | 
             
                {
         | 
| @@ -152,7 +152,7 @@ | |
| 152 | 
             
                   "})"
         | 
| 153 | 
             
                  ]
         | 
| 154 | 
             
                 },
         | 
| 155 | 
            -
                 "execution_count":  | 
| 156 | 
             
                 "metadata": {},
         | 
| 157 | 
             
                 "output_type": "execute_result"
         | 
| 158 | 
             
                }
         | 
| @@ -531,7 +531,7 @@ | |
| 531 | 
             
                "# Remember to adjust the training arguments according to your requirements\n",
         | 
| 532 | 
             
                "\n",
         | 
| 533 | 
             
                "trainer = SentenceTransformerTrainer(\n",
         | 
| 534 | 
            -
                "    model= | 
| 535 | 
             
                "    args=training_args,\n",
         | 
| 536 | 
             
                "    train_dataset=dataset_rag_biencoder[\"train\"],\n",
         | 
| 537 | 
             
                "    eval_dataset=dataset_rag_biencoder[\"eval\"],\n",
         | 
| @@ -701,7 +701,7 @@ | |
| 701 | 
             
              },
         | 
| 702 | 
             
              {
         | 
| 703 | 
             
               "cell_type": "code",
         | 
| 704 | 
            -
               "execution_count":  | 
| 705 | 
             
               "metadata": {},
         | 
| 706 | 
             
               "outputs": [],
         | 
| 707 | 
             
               "source": [
         | 
| @@ -710,7 +710,7 @@ | |
| 710 | 
             
                "\n",
         | 
| 711 | 
             
                "df = combined_rag_dataset.to_pandas()\n",
         | 
| 712 | 
             
                "df = df.drop_duplicates(subset=[\"context\"]) # drop duplicates based on \"context\" column\n",
         | 
| 713 | 
            -
                " | 
| 714 | 
             
                "dataset = Dataset.from_pandas(df)\n",
         | 
| 715 | 
             
                "\n",
         | 
| 716 | 
             
                "docs = [Document(content=doc[\"context\"]) for doc in dataset]"
         | 
|  | |
| 8 | 
             
                "\n",
         | 
| 9 | 
             
                "This notebook demonstrates the fine-tuning process of `modernbert-embed-base` using synthetic data tailored for the Retrieval-Augmented Generation (RAG) model.\n",
         | 
| 10 | 
             
                "\n",
         | 
| 11 | 
            +
                "It provides a complete walkthrough of the fine-tuning process after generating synthetic data using the Synthetic Data Generator. For a comprehensive explanation of the methodology and additional details, refer to the blog post: [Fine-tune ModernBERT for RAG with Synthetic Data](https://huggingface.co/blog/fine-tune-modernbert-for-rag-with-synthetic-data)."
         | 
| 12 | 
             
               ]
         | 
| 13 | 
             
              },
         | 
| 14 | 
             
              {
         | 
|  | |
| 47 | 
             
              },
         | 
| 48 | 
             
              {
         | 
| 49 | 
             
               "cell_type": "code",
         | 
| 50 | 
            +
               "execution_count": 1,
         | 
| 51 | 
             
               "metadata": {},
         | 
| 52 | 
             
               "outputs": [],
         | 
| 53 | 
             
               "source": [
         | 
|  | |
| 97 | 
             
              },
         | 
| 98 | 
             
              {
         | 
| 99 | 
             
               "cell_type": "code",
         | 
| 100 | 
            +
               "execution_count": 2,
         | 
| 101 | 
             
               "metadata": {},
         | 
| 102 | 
             
               "outputs": [],
         | 
| 103 | 
             
               "source": [
         | 
|  | |
| 140 | 
             
              },
         | 
| 141 | 
             
              {
         | 
| 142 | 
             
               "cell_type": "code",
         | 
| 143 | 
            +
               "execution_count": 3,
         | 
| 144 | 
             
               "metadata": {},
         | 
| 145 | 
             
               "outputs": [
         | 
| 146 | 
             
                {
         | 
|  | |
| 152 | 
             
                   "})"
         | 
| 153 | 
             
                  ]
         | 
| 154 | 
             
                 },
         | 
| 155 | 
            +
                 "execution_count": 3,
         | 
| 156 | 
             
                 "metadata": {},
         | 
| 157 | 
             
                 "output_type": "execute_result"
         | 
| 158 | 
             
                }
         | 
|  | |
| 531 | 
             
                "# Remember to adjust the training arguments according to your requirements\n",
         | 
| 532 | 
             
                "\n",
         | 
| 533 | 
             
                "trainer = SentenceTransformerTrainer(\n",
         | 
| 534 | 
            +
                "    model=model_biencoder,\n",
         | 
| 535 | 
             
                "    args=training_args,\n",
         | 
| 536 | 
             
                "    train_dataset=dataset_rag_biencoder[\"train\"],\n",
         | 
| 537 | 
             
                "    eval_dataset=dataset_rag_biencoder[\"eval\"],\n",
         | 
|  | |
| 701 | 
             
              },
         | 
| 702 | 
             
              {
         | 
| 703 | 
             
               "cell_type": "code",
         | 
| 704 | 
            +
               "execution_count": 4,
         | 
| 705 | 
             
               "metadata": {},
         | 
| 706 | 
             
               "outputs": [],
         | 
| 707 | 
             
               "source": [
         | 
|  | |
| 710 | 
             
                "\n",
         | 
| 711 | 
             
                "df = combined_rag_dataset.to_pandas()\n",
         | 
| 712 | 
             
                "df = df.drop_duplicates(subset=[\"context\"]) # drop duplicates based on \"context\" column\n",
         | 
| 713 | 
            +
                "df = df.sample(n=10, random_state=42) # optional: sample a subset of the dataset\n",
         | 
| 714 | 
             
                "dataset = Dataset.from_pandas(df)\n",
         | 
| 715 | 
             
                "\n",
         | 
| 716 | 
             
                "docs = [Document(content=doc[\"context\"]) for doc in dataset]"
         | 
    	
        requirements.txt
    CHANGED
    
    | @@ -1 +1,2 @@ | |
| 1 | 
             
            -e git+https://github.com/argilla-io/synthetic-data-generator.git#egg=synthetic-dataset-generator
         | 
|  | 
|  | |
| 1 | 
             
            -e git+https://github.com/argilla-io/synthetic-data-generator.git#egg=synthetic-dataset-generator
         | 
| 2 | 
            +
            !apt-get install -y poppler-utils
         | 
    	
        src/synthetic_dataset_generator/apps/rag.py
    CHANGED
    
    | @@ -5,6 +5,7 @@ from typing import Union | |
| 5 |  | 
| 6 | 
             
            import argilla as rg
         | 
| 7 | 
             
            import gradio as gr
         | 
|  | |
| 8 | 
             
            import pandas as pd
         | 
| 9 | 
             
            from datasets import (
         | 
| 10 | 
             
                Dataset,
         | 
| @@ -50,7 +51,7 @@ from synthetic_dataset_generator.utils import ( | |
| 50 | 
             
                get_random_repo_name,
         | 
| 51 | 
             
                swap_visibility,
         | 
| 52 | 
             
            )
         | 
| 53 | 
            -
             | 
| 54 |  | 
| 55 | 
             
            def _get_valid_columns(dataframe: pd.DataFrame):
         | 
| 56 | 
             
                doc_valid_columns = []
         | 
|  | |
| 5 |  | 
| 6 | 
             
            import argilla as rg
         | 
| 7 | 
             
            import gradio as gr
         | 
| 8 | 
            +
            import nltk
         | 
| 9 | 
             
            import pandas as pd
         | 
| 10 | 
             
            from datasets import (
         | 
| 11 | 
             
                Dataset,
         | 
|  | |
| 51 | 
             
                get_random_repo_name,
         | 
| 52 | 
             
                swap_visibility,
         | 
| 53 | 
             
            )
         | 
| 54 | 
            +
            nltk.download("punkt_tab")
         | 
| 55 |  | 
| 56 | 
             
            def _get_valid_columns(dataframe: pd.DataFrame):
         | 
| 57 | 
             
                doc_valid_columns = []
         | 
 
			
