Spaces:

joelg
/

discover_rag

Sleeping

joelg committed on Oct 8

Commit

0cc549f

1 Parent(s): 0157c73

- better default corpus

- better corpus presentation in the interface
- embedding model choice earlier
- see processed chunks
- better results order

Files changed (8) hide show

app.py +95 -78
documents/Archivage electronique-des raisons d'etre optimiste.pdf +3 -0
documents/CGU_LetempsLongdelArchive_2019.pdf +3 -0
documents/CIDE23_Presentation.pdf +3 -0
documents/Le concept d'archives-vdiffuséeHAL.pdf +3 -0
documents/Les sources numériques.pdf +3 -0
documents/guyon_celine_reprisaaf.pdf +3 -0
rag_system.py +64 -12

app.py CHANGED Viewed

@@ -6,38 +6,26 @@ from i18n import get_text
 # Initialize RAG system
 rag = RAGSystem()
-# Language state
-language = "en"
-def switch_language(lang):
-    global language
-    language = lang
-    return update_interface()
-def update_interface():
-    t = lambda key: get_text(key, language)
-    return {
-        # Update all interface elements with new language
-    }
 @spaces.GPU
-def process_pdf(pdf_file, chunk_size, chunk_overlap):
     """Process uploaded PDF and create embeddings"""
-    t = lambda key: get_text(key, language)
     try:
         if pdf_file is None:
             # Load default corpus
-            status = rag.load_default_corpus(chunk_size, chunk_overlap)
         else:
-            status = rag.process_document(pdf_file.name, chunk_size, chunk_overlap)
-        return status
     except Exception as e:
-        return f"{t('error')}: {str(e)}"
 @spaces.GPU
 def perform_query(
     query,
-    embedding_model,
     top_k,
     similarity_threshold,
     llm_model,
@@ -45,21 +33,18 @@ def perform_query(
     max_tokens
 ):
     """Perform RAG query and return results"""
-    t = lambda key: get_text(key, language)
     if not rag.is_ready():
-        return t("no_corpus"), "", "", ""
     try:
-        # Set models and parameters
-        rag.set_embedding_model(embedding_model)
         rag.set_llm_model(llm_model)
         # Retrieve relevant chunks
         results = rag.retrieve(query, top_k, similarity_threshold)
         # Format retrieved chunks display
-        chunks_display = format_chunks(results, t)
         # Generate answer
         answer, prompt = rag.generate(
@@ -69,42 +54,67 @@ def perform_query(
             max_tokens
         )
-        return answer, chunks_display, prompt, ""
     except Exception as e:
-        return "", "", "", f"{t('error')}: {str(e)}"
-def format_chunks(results, t):
     """Format retrieved chunks with scores for display"""
-    output = f"### {t('retrieved_chunks')}\n\n"
     for i, (chunk, score) in enumerate(results, 1):
-        output += f"**Chunk {i}** - {t('similarity_score')}: {score:.4f}\n"
         output += f"```\n{chunk}\n```\n\n"
     return output
 def create_interface():
-    t = lambda key: get_text(key, language)
     with gr.Blocks(title="RAG Pedagogical Demo", theme=gr.themes.Soft()) as demo:
         # Header with language selector
         with gr.Row():
             gr.Markdown("# 🎓 RAG Pedagogical Demo / Démo Pédagogique RAG")
-            lang_radio = gr.Radio(
-                choices=["en", "fr"],
-                value="en",
-                label="Language / Langue"
-            )
         with gr.Tabs() as tabs:
             # Tab 1: Corpus Management
             with gr.Tab(label="📚 Corpus"):
-                gr.Markdown(f"## {t('corpus_management')}")
-                gr.Markdown(t('corpus_description'))
                 pdf_upload = gr.File(
-                    label=t('upload_pdf'),
                     file_types=[".pdf"]
                 )
@@ -114,38 +124,39 @@ def create_interface():
                         maximum=1000,
                         value=500,
                         step=50,
-                        label=t('chunk_size')
                     )
                     chunk_overlap = gr.Slider(
                         minimum=0,
                         maximum=200,
                         value=50,
                         step=10,
-                        label=t('chunk_overlap')
                     )
-                process_btn = gr.Button(t('process_corpus'), variant="primary")
-                corpus_status = gr.Textbox(label=t('status'), interactive=False)
                 process_btn.click(
                     fn=process_pdf,
-                    inputs=[pdf_upload, chunk_size, chunk_overlap],
-                    outputs=corpus_status
                 )
             # Tab 2: Retrieval Configuration
             with gr.Tab(label="🔍 Retrieval"):
-                gr.Markdown(f"## {t('retrieval_config')}")
-                embedding_model = gr.Dropdown(
-                    choices=[
-                        "sentence-transformers/all-MiniLM-L6-v2",
-                        "sentence-transformers/all-mpnet-base-v2",
-                        "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
-                    ],
-                    value="sentence-transformers/all-MiniLM-L6-v2",
-                    label=t('embedding_model')
-                )
                 with gr.Row():
                     top_k = gr.Slider(
@@ -153,19 +164,20 @@ def create_interface():
                         maximum=10,
                         value=3,
                         step=1,
-                        label=t('top_k')
                     )
                     similarity_threshold = gr.Slider(
                         minimum=0.0,
                         maximum=1.0,
                         value=0.0,
                         step=0.05,
-                        label=t('similarity_threshold')
                     )
             # Tab 3: Generation Configuration
             with gr.Tab(label="🤖 Generation"):
-                gr.Markdown(f"## {t('generation_config')}")
                 llm_model = gr.Dropdown(
                     choices=[
@@ -174,7 +186,7 @@ def create_interface():
                         "ibm-granite/granite-4.0-micro",
                     ],
                     value="meta-llama/Llama-3.2-1B-Instruct",
-                    label=t('llm_model')
                 )
                 with gr.Row():
@@ -183,23 +195,23 @@ def create_interface():
                         maximum=2.0,
                         value=0.7,
                         step=0.1,
-                        label=t('temperature')
                     )
                     max_tokens = gr.Slider(
                         minimum=50,
                         maximum=1000,
                         value=300,
                         step=50,
-                        label=t('max_tokens')
                     )
             # Tab 4: Query & Results
             with gr.Tab(label="💬 Query"):
-                gr.Markdown(f"## {t('ask_question')}")
                 query_input = gr.Textbox(
-                    label=t('your_question'),
-                    placeholder=t('question_placeholder'),
                     lines=3
                 )
@@ -208,46 +220,51 @@ def create_interface():
                         ["What is Retrieval Augmented Generation?"],
                         ["How does RAG improve language models?"],
                         ["What are the main components of a RAG system?"],
                     ],
                     inputs=query_input,
-                    label=t('example_questions')
                 )
-                query_btn = gr.Button(t('submit_query'), variant="primary")
-                gr.Markdown(f"### {t('answer')}")
-                answer_output = gr.Markdown()
-                with gr.Accordion(t('retrieved_chunks'), open=True):
                     chunks_output = gr.Markdown()
-                with gr.Accordion(t('prompt_sent'), open=False):
                     prompt_output = gr.Textbox(lines=10, max_lines=20, show_copy_button=True)
-                error_output = gr.Textbox(label=t('errors'), visible=False)
                 query_btn.click(
                     fn=perform_query,
                     inputs=[
                         query_input,
-                        embedding_model,
                         top_k,
                         similarity_threshold,
                         llm_model,
                         temperature,
                         max_tokens
                     ],
-                    outputs=[answer_output, chunks_output, prompt_output, error_output]
                 )
         # Footer
         gr.Markdown("""
         ---
         **Note**: This is a pedagogical demonstration of RAG systems.
-        Models run on HuggingFace ZeroGPU infrastructure.
         **Note** : Ceci est une démonstration pédagogique des systèmes RAG.
-        Les modèles tournent sur l'infrastructure HuggingFace ZeroGPU.
         """)
     return demo

 # Initialize RAG system
 rag = RAGSystem()
 @spaces.GPU
 def process_pdf(pdf_file, embedding_model, chunk_size, chunk_overlap):
     """Build the searchable corpus with the selected embedding model.

     Args:
         pdf_file: The uploaded PDF, or None to index the default corpus.
             Either an object exposing the path as ``.name`` or a plain
             path string is accepted.
         embedding_model: Name of the embedding model to use for indexing.
         chunk_size: Chunk size forwarded to the RAG system.
         chunk_overlap: Chunk overlap forwarded to the RAG system.

     Returns:
         A ``(status, chunks_display, corpus_text)`` tuple. When anything
         raises, the status is ``"Error: <message>"`` and the other two
         values are empty strings.
     """
     try:
         # The embedding model has to be set before any chunk is embedded.
         rag.set_embedding_model(embedding_model)
         if pdf_file is None:
             # No upload: fall back to the default corpus.
             status, chunks_display, corpus_text = rag.load_default_corpus(chunk_size, chunk_overlap)
         else:
             # Uploads may arrive as a file-like object with ``.name`` or as a
             # bare path string; use whichever form is present.
             pdf_path = getattr(pdf_file, "name", pdf_file)
             status, chunks_display, corpus_text = rag.process_document(pdf_path, chunk_size, chunk_overlap)
         return status, chunks_display, corpus_text
     except Exception as e:
         # UI boundary: report the failure in the status box instead of crashing.
         return f"Error: {str(e)}", "", ""
 @spaces.GPU
 def perform_query(
     query,
     top_k,
     similarity_threshold,
     llm_model,
     max_tokens
 ):
     """Perform RAG query and return results"""
     if not rag.is_ready():
+        return "", "⚠️ Please process a corpus first in the Corpus tab.", "", ""
     try:
+        # Set LLM model
         rag.set_llm_model(llm_model)
         # Retrieve relevant chunks
         results = rag.retrieve(query, top_k, similarity_threshold)
         # Format retrieved chunks display
+        chunks_display = format_chunks(results)
         # Generate answer
         answer, prompt = rag.generate(
             max_tokens
         )
+        return chunks_display, prompt, answer, ""
     except Exception as e:
+        return "", "", "", f"Error: {str(e)}"
+def format_chunks(results):
     """Format retrieved chunks with scores for display"""
+    if not results:
+        return "No relevant chunks found."
+    output = "### 📄 Retrieved Chunks\n\n"
     for i, (chunk, score) in enumerate(results, 1):
+        output += f"**Chunk {i}** - Similarity Score: `{score:.4f}`\n"
         output += f"```\n{chunk}\n```\n\n"
     return output
 def create_interface():
     with gr.Blocks(title="RAG Pedagogical Demo", theme=gr.themes.Soft()) as demo:
+        # State for language
+        lang_state = gr.State("en")
         # Header with language selector
         with gr.Row():
             gr.Markdown("# 🎓 RAG Pedagogical Demo / Démo Pédagogique RAG")
+            with gr.Column(scale=1):
+                lang_dropdown = gr.Dropdown(
+                    choices=[("English", "en"), ("Français", "fr")],
+                    value="en",
+                    label="Language / Langue",
+                    interactive=True
+                )
         with gr.Tabs() as tabs:
             # Tab 1: Corpus Management
             with gr.Tab(label="📚 Corpus"):
+                gr.Markdown("## Corpus Management")
+                gr.Markdown("""
+                **Default corpus:** Multiple PDF documents from the `documents/` folder.
+                **Or:** Upload your own PDF document to use instead.
+                1. Select your embedding model
+                2. Adjust chunking parameters if needed
+                3. Click "Process Corpus"
+                """)
+                # Embedding model selection FIRST
+                embedding_model = gr.Dropdown(
+                    choices=[
+                        "sentence-transformers/all-MiniLM-L6-v2",
+                        "sentence-transformers/all-mpnet-base-v2",
+                        "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
+                    ],
+                    value="sentence-transformers/all-MiniLM-L6-v2",
+                    label="🔤 Embedding Model (select before processing)"
+                )
                 pdf_upload = gr.File(
+                    label="📄 Upload PDF (optional - leave empty to use default corpus from documents/ folder)",
                     file_types=[".pdf"]
                 )
                         maximum=1000,
                         value=500,
                         step=50,
+                        label="Chunk Size (characters)"
                     )
                     chunk_overlap = gr.Slider(
                         minimum=0,
                         maximum=200,
                         value=50,
                         step=10,
+                        label="Chunk Overlap (characters)"
                     )
+                process_btn = gr.Button("🚀 Process Corpus", variant="primary", size="lg")
+                corpus_status = gr.Textbox(label="Status", interactive=False)
+                # Display default corpus info
+                with gr.Accordion("📖 Corpus Information", open=False):
+                    default_corpus_display = gr.Markdown()
+                # Display processed chunks
+                with gr.Accordion("📑 Processed Chunks", open=False):
+                    processed_chunks_display = gr.Markdown()
                 process_btn.click(
                     fn=process_pdf,
+                    inputs=[pdf_upload, embedding_model, chunk_size, chunk_overlap],
+                    outputs=[corpus_status, processed_chunks_display, default_corpus_display]
                 )
             # Tab 2: Retrieval Configuration
             with gr.Tab(label="🔍 Retrieval"):
+                gr.Markdown("## Retrieval Configuration")
+                gr.Markdown("Configure how relevant chunks are retrieved from the corpus.")
+                gr.Markdown(f"**Current Embedding Model:** The model selected in the Corpus tab is used.")
                 with gr.Row():
                     top_k = gr.Slider(
                         maximum=10,
                         value=3,
                         step=1,
+                        label="Top K (number of chunks to retrieve)"
                     )
                     similarity_threshold = gr.Slider(
                         minimum=0.0,
                         maximum=1.0,
                         value=0.0,
                         step=0.05,
+                        label="Similarity Threshold (minimum score)"
                     )
             # Tab 3: Generation Configuration
             with gr.Tab(label="🤖 Generation"):
+                gr.Markdown("## Generation Configuration")
+                gr.Markdown("Select the language model and configure generation parameters.")
                 llm_model = gr.Dropdown(
                     choices=[
                         "ibm-granite/granite-4.0-micro",
                     ],
                     value="meta-llama/Llama-3.2-1B-Instruct",
+                    label="Language Model"
                 )
                 with gr.Row():
                         maximum=2.0,
                         value=0.7,
                         step=0.1,
+                        label="Temperature (creativity)"
                     )
                     max_tokens = gr.Slider(
                         minimum=50,
                         maximum=1000,
                         value=300,
                         step=50,
+                        label="Max Tokens (response length)"
                     )
             # Tab 4: Query & Results
             with gr.Tab(label="💬 Query"):
+                gr.Markdown("## Ask a Question")
                 query_input = gr.Textbox(
+                    label="Your Question",
+                    placeholder="Enter your question here...",
                     lines=3
                 )
                         ["What is Retrieval Augmented Generation?"],
                         ["How does RAG improve language models?"],
                         ["What are the main components of a RAG system?"],
+                        ["Explain the role of embeddings in RAG."],
+                        ["What are the advantages of using RAG?"],
                     ],
                     inputs=query_input,
+                    label="Example Questions"
                 )
+                query_btn = gr.Button("🔍 Submit Query", variant="primary", size="lg")
+                # Results in order: chunks → prompt → answer
+                gr.Markdown("---")
+                gr.Markdown("### 📊 Results")
+                with gr.Accordion("1️⃣ Retrieved Chunks", open=True):
                     chunks_output = gr.Markdown()
+                with gr.Accordion("2️⃣ Prompt Sent to LLM", open=True):
                     prompt_output = gr.Textbox(lines=10, max_lines=20, show_copy_button=True)
+                with gr.Accordion("3️⃣ Generated Answer", open=True):
+                    answer_output = gr.Markdown()
+                error_output = gr.Textbox(label="Errors", visible=False)
                 query_btn.click(
                     fn=perform_query,
                     inputs=[
                         query_input,
                         top_k,
                         similarity_threshold,
                         llm_model,
                         temperature,
                         max_tokens
                     ],
+                    outputs=[chunks_output, prompt_output, answer_output, error_output]
                 )
         # Footer
         gr.Markdown("""
         ---
         **Note**: This is a pedagogical demonstration of RAG systems.
+        Models run on HuggingFace infrastructure.
         **Note** : Ceci est une démonstration pédagogique des systèmes RAG.
+        Les modèles tournent sur l'infrastructure HuggingFace.
         """)
     return demo

documents/Archivage electronique-des raisons d'etre optimiste.pdf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8081e76db463322efc2807a92d7c11427cbeb4951498d5305fe6d59c28002fbe
+size 67803

documents/CGU_LetempsLongdelArchive_2019.pdf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0802d8a3916c7599f280e2d9ad73f66ad60d32c8c33625a43728b52ea024ff47
+size 680219

documents/CIDE23_Presentation.pdf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:68a26ff11c3f57b7d2707b54e2c09f6e4a5aa22948b58fb8559fbbcfeca18d4a
+size 206335

documents/Le concept d'archives-vdiffuséeHAL.pdf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3c633f5ce93555c25e9ff5ef265aaff337c51950519d0e448925296cc79d5fe3
+size 398379

documents/Les sources numériques.pdf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9411c8cf2e74a29df94dbb3ba809d3c460e80db332f10c9401abf3d71c3bb779
+size 661328

documents/guyon_celine_reprisaaf.pdf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4231980c9a2845b337acbc46a3e222444cb6a962d8badc7b1b79cae667128f33
+size 523452

rag_system.py CHANGED Viewed

@@ -1,6 +1,7 @@
 """Core RAG system implementation"""
 import os
 from typing import List, Tuple, Optional
 import PyPDF2
 import faiss
@@ -24,13 +25,57 @@ class RAGSystem:
         """Check if the system is ready to process queries"""
         return self.ready and self.index is not None
-    def load_default_corpus(self, chunk_size: int = 500, chunk_overlap: int = 50) -> str:
-        """Load the default corpus"""
-        default_path = "default_corpus.pdf"
-        if os.path.exists(default_path):
-            return self.process_document(default_path, chunk_size, chunk_overlap)
-        else:
-            return "Default corpus not found. Please upload a PDF."
     def extract_text_from_pdf(self, pdf_path: str) -> str:
         """Extract text from PDF file"""
@@ -89,20 +134,20 @@ class RAGSystem:
         faiss.normalize_L2(embeddings)
         self.index.add(embeddings)
-    def process_document(self, pdf_path: str, chunk_size: int = 500, chunk_overlap: int = 50) -> str:
         """Process a PDF document and create searchable index"""
         try:
             # Extract text
             text = self.extract_text_from_pdf(pdf_path)
             if not text.strip():
-                return "Error: No text could be extracted from the PDF."
             # Chunk text
             self.chunks = self.chunk_text(text, chunk_size, chunk_overlap)
             if not self.chunks:
-                return "Error: No valid chunks created from the document."
             # Create embeddings
             self.embeddings = self.create_embeddings(self.chunks)
@@ -111,11 +156,18 @@ class RAGSystem:
             self.build_index(self.embeddings)
             self.ready = True
-            return f"Success! Processed {len(self.chunks)} chunks from the document."
         except Exception as e:
             self.ready = False
-            return f"Error processing document: {str(e)}"
     def set_embedding_model(self, model_name: str):
         """Set or change the embedding model"""

 """Core RAG system implementation"""
 import os
+import glob
 from typing import List, Tuple, Optional
 import PyPDF2
 import faiss
         """Check if the system is ready to process queries"""
         return self.ready and self.index is not None
+    def load_default_corpus(self, chunk_size: int = 500, chunk_overlap: int = 50):
+        """Load the default corpus from documents folder"""
+        documents_dir = "documents"
+        if not os.path.exists(documents_dir):
+            return "Documents folder not found. Please upload a PDF.", "", ""
+        # Get all PDFs in documents folder
+        pdf_files = glob.glob(os.path.join(documents_dir, "*.pdf"))
+        if not pdf_files:
+            return "No PDF files found in documents folder. Please upload a PDF.", "", ""
+        try:
+            # Extract text from all PDFs
+            all_text = ""
+            corpus_summary = f"📚 **Loading {len(pdf_files)} documents:**\n\n"
+            for pdf_path in pdf_files:
+                filename = os.path.basename(pdf_path)
+                corpus_summary += f"- {filename}\n"
+                text = self.extract_text_from_pdf(pdf_path)
+                all_text += f"\n\n=== {filename} ===\n\n{text}"
+            corpus_summary += f"\n**Total text length:** {len(all_text)} characters\n"
+            # Chunk the combined text
+            self.chunks = self.chunk_text(all_text, chunk_size, chunk_overlap)
+            if not self.chunks:
+                return "Error: No valid chunks created from the documents.", "", ""
+            # Create embeddings
+            self.embeddings = self.create_embeddings(self.chunks)
+            # Build index
+            self.build_index(self.embeddings)
+            self.ready = True
+            # Format chunks for display
+            chunks_display = "### Processed Chunks\n\n"
+            for i, chunk in enumerate(self.chunks, 1):
+                chunks_display += f"**Chunk {i}** ({len(chunk)} chars)\n```\n{chunk[:200]}{'...' if len(chunk) > 200 else ''}\n```\n\n"
+            status = f"✅ Success! Processed {len(pdf_files)} documents into {len(self.chunks)} chunks."
+            return status, chunks_display, corpus_summary
+        except Exception as e:
+            self.ready = False
+            return f"Error loading default corpus: {str(e)}", "", ""
     def extract_text_from_pdf(self, pdf_path: str) -> str:
         """Extract text from PDF file"""
         faiss.normalize_L2(embeddings)
         self.index.add(embeddings)
+    def process_document(self, pdf_path: str, chunk_size: int = 500, chunk_overlap: int = 50):
         """Process a PDF document and create searchable index"""
         try:
             # Extract text
             text = self.extract_text_from_pdf(pdf_path)
             if not text.strip():
+                return "Error: No text could be extracted from the PDF.", "", ""
             # Chunk text
             self.chunks = self.chunk_text(text, chunk_size, chunk_overlap)
             if not self.chunks:
+                return "Error: No valid chunks created from the document.", "", ""
             # Create embeddings
             self.embeddings = self.create_embeddings(self.chunks)
             self.build_index(self.embeddings)
             self.ready = True
+            # Format chunks for display
+            chunks_display = "### Processed Chunks\n\n"
+            for i, chunk in enumerate(self.chunks, 1):
+                chunks_display += f"**Chunk {i}** ({len(chunk)} chars)\n```\n{chunk}\n```\n\n"
+            status = f"✅ Success! Processed {len(self.chunks)} chunks from the document."
+            return status, chunks_display, text[:5000]  # Return first 5000 chars of original text
         except Exception as e:
             self.ready = False
+            return f"Error processing document: {str(e)}", "", ""
     def set_embedding_model(self, model_name: str):
         """Set or change the embedding model"""