joelg committed on
Commit
0cc549f
·
1 Parent(s): 0157c73

- better default corpus

Browse files

- better corpus presentation in the interface
- embedding model choice earlier
- see processed chunks
- better results order

app.py CHANGED
@@ -6,38 +6,26 @@ from i18n import get_text
6
  # Initialize RAG system
7
  rag = RAGSystem()
8
 
9
- # Language state
10
- language = "en"
11
-
12
- def switch_language(lang):
13
- global language
14
- language = lang
15
- return update_interface()
16
-
17
- def update_interface():
18
- t = lambda key: get_text(key, language)
19
- return {
20
- # Update all interface elements with new language
21
- }
22
-
23
  @spaces.GPU
24
- def process_pdf(pdf_file, chunk_size, chunk_overlap):
25
  """Process uploaded PDF and create embeddings"""
26
- t = lambda key: get_text(key, language)
27
  try:
 
 
 
28
  if pdf_file is None:
29
  # Load default corpus
30
- status = rag.load_default_corpus(chunk_size, chunk_overlap)
31
  else:
32
- status = rag.process_document(pdf_file.name, chunk_size, chunk_overlap)
33
- return status
 
34
  except Exception as e:
35
- return f"{t('error')}: {str(e)}"
36
 
37
  @spaces.GPU
38
  def perform_query(
39
  query,
40
- embedding_model,
41
  top_k,
42
  similarity_threshold,
43
  llm_model,
@@ -45,21 +33,18 @@ def perform_query(
45
  max_tokens
46
  ):
47
  """Perform RAG query and return results"""
48
- t = lambda key: get_text(key, language)
49
-
50
  if not rag.is_ready():
51
- return t("no_corpus"), "", "", ""
52
 
53
  try:
54
- # Set models and parameters
55
- rag.set_embedding_model(embedding_model)
56
  rag.set_llm_model(llm_model)
57
 
58
  # Retrieve relevant chunks
59
  results = rag.retrieve(query, top_k, similarity_threshold)
60
 
61
  # Format retrieved chunks display
62
- chunks_display = format_chunks(results, t)
63
 
64
  # Generate answer
65
  answer, prompt = rag.generate(
@@ -69,42 +54,67 @@ def perform_query(
69
  max_tokens
70
  )
71
 
72
- return answer, chunks_display, prompt, ""
73
 
74
  except Exception as e:
75
- return "", "", "", f"{t('error')}: {str(e)}"
76
 
77
- def format_chunks(results, t):
78
  """Format retrieved chunks with scores for display"""
79
- output = f"### {t('retrieved_chunks')}\n\n"
 
 
 
80
  for i, (chunk, score) in enumerate(results, 1):
81
- output += f"**Chunk {i}** - {t('similarity_score')}: {score:.4f}\n"
82
  output += f"```\n{chunk}\n```\n\n"
83
  return output
84
 
85
  def create_interface():
86
- t = lambda key: get_text(key, language)
87
-
88
  with gr.Blocks(title="RAG Pedagogical Demo", theme=gr.themes.Soft()) as demo:
89
 
 
 
 
90
  # Header with language selector
91
  with gr.Row():
92
  gr.Markdown("# 🎓 RAG Pedagogical Demo / Démo Pédagogique RAG")
93
- lang_radio = gr.Radio(
94
- choices=["en", "fr"],
95
- value="en",
96
- label="Language / Langue"
97
- )
 
 
98
 
99
  with gr.Tabs() as tabs:
100
 
101
  # Tab 1: Corpus Management
102
  with gr.Tab(label="📚 Corpus"):
103
- gr.Markdown(f"## {t('corpus_management')}")
104
- gr.Markdown(t('corpus_description'))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
 
106
  pdf_upload = gr.File(
107
- label=t('upload_pdf'),
108
  file_types=[".pdf"]
109
  )
110
 
@@ -114,38 +124,39 @@ def create_interface():
114
  maximum=1000,
115
  value=500,
116
  step=50,
117
- label=t('chunk_size')
118
  )
119
  chunk_overlap = gr.Slider(
120
  minimum=0,
121
  maximum=200,
122
  value=50,
123
  step=10,
124
- label=t('chunk_overlap')
125
  )
126
 
127
- process_btn = gr.Button(t('process_corpus'), variant="primary")
128
- corpus_status = gr.Textbox(label=t('status'), interactive=False)
 
 
 
 
 
 
 
 
129
 
130
  process_btn.click(
131
  fn=process_pdf,
132
- inputs=[pdf_upload, chunk_size, chunk_overlap],
133
- outputs=corpus_status
134
  )
135
 
136
  # Tab 2: Retrieval Configuration
137
  with gr.Tab(label="🔍 Retrieval"):
138
- gr.Markdown(f"## {t('retrieval_config')}")
 
139
 
140
- embedding_model = gr.Dropdown(
141
- choices=[
142
- "sentence-transformers/all-MiniLM-L6-v2",
143
- "sentence-transformers/all-mpnet-base-v2",
144
- "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
145
- ],
146
- value="sentence-transformers/all-MiniLM-L6-v2",
147
- label=t('embedding_model')
148
- )
149
 
150
  with gr.Row():
151
  top_k = gr.Slider(
@@ -153,19 +164,20 @@ def create_interface():
153
  maximum=10,
154
  value=3,
155
  step=1,
156
- label=t('top_k')
157
  )
158
  similarity_threshold = gr.Slider(
159
  minimum=0.0,
160
  maximum=1.0,
161
  value=0.0,
162
  step=0.05,
163
- label=t('similarity_threshold')
164
  )
165
 
166
  # Tab 3: Generation Configuration
167
  with gr.Tab(label="🤖 Generation"):
168
- gr.Markdown(f"## {t('generation_config')}")
 
169
 
170
  llm_model = gr.Dropdown(
171
  choices=[
@@ -174,7 +186,7 @@ def create_interface():
174
  "ibm-granite/granite-4.0-micro",
175
  ],
176
  value="meta-llama/Llama-3.2-1B-Instruct",
177
- label=t('llm_model')
178
  )
179
 
180
  with gr.Row():
@@ -183,23 +195,23 @@ def create_interface():
183
  maximum=2.0,
184
  value=0.7,
185
  step=0.1,
186
- label=t('temperature')
187
  )
188
  max_tokens = gr.Slider(
189
  minimum=50,
190
  maximum=1000,
191
  value=300,
192
  step=50,
193
- label=t('max_tokens')
194
  )
195
 
196
  # Tab 4: Query & Results
197
  with gr.Tab(label="💬 Query"):
198
- gr.Markdown(f"## {t('ask_question')}")
199
 
200
  query_input = gr.Textbox(
201
- label=t('your_question'),
202
- placeholder=t('question_placeholder'),
203
  lines=3
204
  )
205
 
@@ -208,46 +220,51 @@ def create_interface():
208
  ["What is Retrieval Augmented Generation?"],
209
  ["How does RAG improve language models?"],
210
  ["What are the main components of a RAG system?"],
 
 
211
  ],
212
  inputs=query_input,
213
- label=t('example_questions')
214
  )
215
 
216
- query_btn = gr.Button(t('submit_query'), variant="primary")
217
 
218
- gr.Markdown(f"### {t('answer')}")
219
- answer_output = gr.Markdown()
 
220
 
221
- with gr.Accordion(t('retrieved_chunks'), open=True):
222
  chunks_output = gr.Markdown()
223
 
224
- with gr.Accordion(t('prompt_sent'), open=False):
225
  prompt_output = gr.Textbox(lines=10, max_lines=20, show_copy_button=True)
226
 
227
- error_output = gr.Textbox(label=t('errors'), visible=False)
 
 
 
228
 
229
  query_btn.click(
230
  fn=perform_query,
231
  inputs=[
232
  query_input,
233
- embedding_model,
234
  top_k,
235
  similarity_threshold,
236
  llm_model,
237
  temperature,
238
  max_tokens
239
  ],
240
- outputs=[answer_output, chunks_output, prompt_output, error_output]
241
  )
242
 
243
  # Footer
244
  gr.Markdown("""
245
  ---
246
  **Note**: This is a pedagogical demonstration of RAG systems.
247
- Models run on HuggingFace ZeroGPU infrastructure.
248
 
249
  **Note** : Ceci est une démonstration pédagogique des systèmes RAG.
250
- Les modèles tournent sur l'infrastructure HuggingFace ZeroGPU.
251
  """)
252
 
253
  return demo
 
6
  # Initialize RAG system
7
  rag = RAGSystem()
8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  @spaces.GPU
10
+ def process_pdf(pdf_file, embedding_model, chunk_size, chunk_overlap):
11
  """Process uploaded PDF and create embeddings"""
 
12
  try:
13
+ # Set embedding model BEFORE processing
14
+ rag.set_embedding_model(embedding_model)
15
+
16
  if pdf_file is None:
17
  # Load default corpus
18
+ status, chunks_display, corpus_text = rag.load_default_corpus(chunk_size, chunk_overlap)
19
  else:
20
+ status, chunks_display, corpus_text = rag.process_document(pdf_file.name, chunk_size, chunk_overlap)
21
+
22
+ return status, chunks_display, corpus_text
23
  except Exception as e:
24
+ return f"Error: {str(e)}", "", ""
25
 
26
  @spaces.GPU
27
  def perform_query(
28
  query,
 
29
  top_k,
30
  similarity_threshold,
31
  llm_model,
 
33
  max_tokens
34
  ):
35
  """Perform RAG query and return results"""
 
 
36
  if not rag.is_ready():
37
+ return "", "⚠️ Please process a corpus first in the Corpus tab.", "", ""
38
 
39
  try:
40
+ # Set LLM model
 
41
  rag.set_llm_model(llm_model)
42
 
43
  # Retrieve relevant chunks
44
  results = rag.retrieve(query, top_k, similarity_threshold)
45
 
46
  # Format retrieved chunks display
47
+ chunks_display = format_chunks(results)
48
 
49
  # Generate answer
50
  answer, prompt = rag.generate(
 
54
  max_tokens
55
  )
56
 
57
+ return chunks_display, prompt, answer, ""
58
 
59
  except Exception as e:
60
+ return "", "", "", f"Error: {str(e)}"
61
 
62
def format_chunks(results):
    """Format retrieved chunks with their similarity scores as Markdown.

    Args:
        results: Sequence of ``(chunk, score)`` pairs. ``score`` is rendered
            with the ``.4f`` format spec; ``chunk`` is placed verbatim inside
            a fenced code block.

    Returns:
        A Markdown string made of a heading followed by one numbered,
        fenced block per chunk, or ``"No relevant chunks found."`` when
        ``results`` is empty or ``None``.
    """
    if not results:
        return "No relevant chunks found."

    # Collect the fragments and join once rather than growing a string
    # with "+=" on every iteration.
    parts = ["### 📄 Retrieved Chunks\n\n"]
    for i, (chunk, score) in enumerate(results, 1):
        parts.append(f"**Chunk {i}** - Similarity Score: `{score:.4f}`\n")
        parts.append(f"```\n{chunk}\n```\n\n")
    return "".join(parts)
72
 
73
  def create_interface():
 
 
74
  with gr.Blocks(title="RAG Pedagogical Demo", theme=gr.themes.Soft()) as demo:
75
 
76
+ # State for language
77
+ lang_state = gr.State("en")
78
+
79
  # Header with language selector
80
  with gr.Row():
81
  gr.Markdown("# 🎓 RAG Pedagogical Demo / Démo Pédagogique RAG")
82
+ with gr.Column(scale=1):
83
+ lang_dropdown = gr.Dropdown(
84
+ choices=[("English", "en"), ("Français", "fr")],
85
+ value="en",
86
+ label="Language / Langue",
87
+ interactive=True
88
+ )
89
 
90
  with gr.Tabs() as tabs:
91
 
92
  # Tab 1: Corpus Management
93
  with gr.Tab(label="📚 Corpus"):
94
+ gr.Markdown("## Corpus Management")
95
+ gr.Markdown("""
96
+ **Default corpus:** Multiple PDF documents from the `documents/` folder.
97
+
98
+ **Or:** Upload your own PDF document to use instead.
99
+
100
+ 1. Select your embedding model
101
+ 2. Adjust chunking parameters if needed
102
+ 3. Click "Process Corpus"
103
+ """)
104
+
105
+ # Embedding model selection FIRST
106
+ embedding_model = gr.Dropdown(
107
+ choices=[
108
+ "sentence-transformers/all-MiniLM-L6-v2",
109
+ "sentence-transformers/all-mpnet-base-v2",
110
+ "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
111
+ ],
112
+ value="sentence-transformers/all-MiniLM-L6-v2",
113
+ label="🔤 Embedding Model (select before processing)"
114
+ )
115
 
116
  pdf_upload = gr.File(
117
+ label="📄 Upload PDF (optional - leave empty to use default corpus from documents/ folder)",
118
  file_types=[".pdf"]
119
  )
120
 
 
124
  maximum=1000,
125
  value=500,
126
  step=50,
127
+ label="Chunk Size (characters)"
128
  )
129
  chunk_overlap = gr.Slider(
130
  minimum=0,
131
  maximum=200,
132
  value=50,
133
  step=10,
134
+ label="Chunk Overlap (characters)"
135
  )
136
 
137
+ process_btn = gr.Button("🚀 Process Corpus", variant="primary", size="lg")
138
+ corpus_status = gr.Textbox(label="Status", interactive=False)
139
+
140
+ # Display default corpus info
141
+ with gr.Accordion("📖 Corpus Information", open=False):
142
+ default_corpus_display = gr.Markdown()
143
+
144
+ # Display processed chunks
145
+ with gr.Accordion("📑 Processed Chunks", open=False):
146
+ processed_chunks_display = gr.Markdown()
147
 
148
  process_btn.click(
149
  fn=process_pdf,
150
+ inputs=[pdf_upload, embedding_model, chunk_size, chunk_overlap],
151
+ outputs=[corpus_status, processed_chunks_display, default_corpus_display]
152
  )
153
 
154
  # Tab 2: Retrieval Configuration
155
  with gr.Tab(label="🔍 Retrieval"):
156
+ gr.Markdown("## Retrieval Configuration")
157
+ gr.Markdown("Configure how relevant chunks are retrieved from the corpus.")
158
 
159
+ gr.Markdown(f"**Current Embedding Model:** The model selected in the Corpus tab is used.")
 
 
 
 
 
 
 
 
160
 
161
  with gr.Row():
162
  top_k = gr.Slider(
 
164
  maximum=10,
165
  value=3,
166
  step=1,
167
+ label="Top K (number of chunks to retrieve)"
168
  )
169
  similarity_threshold = gr.Slider(
170
  minimum=0.0,
171
  maximum=1.0,
172
  value=0.0,
173
  step=0.05,
174
+ label="Similarity Threshold (minimum score)"
175
  )
176
 
177
  # Tab 3: Generation Configuration
178
  with gr.Tab(label="🤖 Generation"):
179
+ gr.Markdown("## Generation Configuration")
180
+ gr.Markdown("Select the language model and configure generation parameters.")
181
 
182
  llm_model = gr.Dropdown(
183
  choices=[
 
186
  "ibm-granite/granite-4.0-micro",
187
  ],
188
  value="meta-llama/Llama-3.2-1B-Instruct",
189
+ label="Language Model"
190
  )
191
 
192
  with gr.Row():
 
195
  maximum=2.0,
196
  value=0.7,
197
  step=0.1,
198
+ label="Temperature (creativity)"
199
  )
200
  max_tokens = gr.Slider(
201
  minimum=50,
202
  maximum=1000,
203
  value=300,
204
  step=50,
205
+ label="Max Tokens (response length)"
206
  )
207
 
208
  # Tab 4: Query & Results
209
  with gr.Tab(label="💬 Query"):
210
+ gr.Markdown("## Ask a Question")
211
 
212
  query_input = gr.Textbox(
213
+ label="Your Question",
214
+ placeholder="Enter your question here...",
215
  lines=3
216
  )
217
 
 
220
  ["What is Retrieval Augmented Generation?"],
221
  ["How does RAG improve language models?"],
222
  ["What are the main components of a RAG system?"],
223
+ ["Explain the role of embeddings in RAG."],
224
+ ["What are the advantages of using RAG?"],
225
  ],
226
  inputs=query_input,
227
+ label="Example Questions"
228
  )
229
 
230
+ query_btn = gr.Button("🔍 Submit Query", variant="primary", size="lg")
231
 
232
+ # Results in order: chunks → prompt → answer
233
+ gr.Markdown("---")
234
+ gr.Markdown("### 📊 Results")
235
 
236
+ with gr.Accordion("1️⃣ Retrieved Chunks", open=True):
237
  chunks_output = gr.Markdown()
238
 
239
+ with gr.Accordion("2️⃣ Prompt Sent to LLM", open=True):
240
  prompt_output = gr.Textbox(lines=10, max_lines=20, show_copy_button=True)
241
 
242
+ with gr.Accordion("3️⃣ Generated Answer", open=True):
243
+ answer_output = gr.Markdown()
244
+
245
+ error_output = gr.Textbox(label="Errors", visible=False)
246
 
247
  query_btn.click(
248
  fn=perform_query,
249
  inputs=[
250
  query_input,
 
251
  top_k,
252
  similarity_threshold,
253
  llm_model,
254
  temperature,
255
  max_tokens
256
  ],
257
+ outputs=[chunks_output, prompt_output, answer_output, error_output]
258
  )
259
 
260
  # Footer
261
  gr.Markdown("""
262
  ---
263
  **Note**: This is a pedagogical demonstration of RAG systems.
264
+ Models run on HuggingFace infrastructure.
265
 
266
  **Note** : Ceci est une démonstration pédagogique des systèmes RAG.
267
+ Les modèles tournent sur l'infrastructure HuggingFace.
268
  """)
269
 
270
  return demo
documents/Archivage electronique-des raisons d'etre optimiste.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8081e76db463322efc2807a92d7c11427cbeb4951498d5305fe6d59c28002fbe
3
+ size 67803
documents/CGU_LetempsLongdelArchive_2019.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0802d8a3916c7599f280e2d9ad73f66ad60d32c8c33625a43728b52ea024ff47
3
+ size 680219
documents/CIDE23_Presentation.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:68a26ff11c3f57b7d2707b54e2c09f6e4a5aa22948b58fb8559fbbcfeca18d4a
3
+ size 206335
documents/Le concept d'archives-vdiffuséeHAL.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3c633f5ce93555c25e9ff5ef265aaff337c51950519d0e448925296cc79d5fe3
3
+ size 398379
documents/Les sources numériques.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9411c8cf2e74a29df94dbb3ba809d3c460e80db332f10c9401abf3d71c3bb779
3
+ size 661328
documents/guyon_celine_reprisaaf.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4231980c9a2845b337acbc46a3e222444cb6a962d8badc7b1b79cae667128f33
3
+ size 523452
rag_system.py CHANGED
@@ -1,6 +1,7 @@
1
  """Core RAG system implementation"""
2
 
3
  import os
 
4
  from typing import List, Tuple, Optional
5
  import PyPDF2
6
  import faiss
@@ -24,13 +25,57 @@ class RAGSystem:
24
  """Check if the system is ready to process queries"""
25
  return self.ready and self.index is not None
26
 
27
- def load_default_corpus(self, chunk_size: int = 500, chunk_overlap: int = 50) -> str:
28
- """Load the default corpus"""
29
- default_path = "default_corpus.pdf"
30
- if os.path.exists(default_path):
31
- return self.process_document(default_path, chunk_size, chunk_overlap)
32
- else:
33
- return "Default corpus not found. Please upload a PDF."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
  def extract_text_from_pdf(self, pdf_path: str) -> str:
36
  """Extract text from PDF file"""
@@ -89,20 +134,20 @@ class RAGSystem:
89
  faiss.normalize_L2(embeddings)
90
  self.index.add(embeddings)
91
 
92
- def process_document(self, pdf_path: str, chunk_size: int = 500, chunk_overlap: int = 50) -> str:
93
  """Process a PDF document and create searchable index"""
94
  try:
95
  # Extract text
96
  text = self.extract_text_from_pdf(pdf_path)
97
 
98
  if not text.strip():
99
- return "Error: No text could be extracted from the PDF."
100
 
101
  # Chunk text
102
  self.chunks = self.chunk_text(text, chunk_size, chunk_overlap)
103
 
104
  if not self.chunks:
105
- return "Error: No valid chunks created from the document."
106
 
107
  # Create embeddings
108
  self.embeddings = self.create_embeddings(self.chunks)
@@ -111,11 +156,18 @@ class RAGSystem:
111
  self.build_index(self.embeddings)
112
 
113
  self.ready = True
114
- return f"Success! Processed {len(self.chunks)} chunks from the document."
 
 
 
 
 
 
 
115
 
116
  except Exception as e:
117
  self.ready = False
118
- return f"Error processing document: {str(e)}"
119
 
120
  def set_embedding_model(self, model_name: str):
121
  """Set or change the embedding model"""
 
1
  """Core RAG system implementation"""
2
 
3
  import os
4
+ import glob
5
  from typing import List, Tuple, Optional
6
  import PyPDF2
7
  import faiss
 
25
  """Check if the system is ready to process queries"""
26
  return self.ready and self.index is not None
27
 
28
def load_default_corpus(self, chunk_size: int = 500, chunk_overlap: int = 50) -> Tuple[str, str, str]:
    """Load every PDF in the ``documents`` folder as the default corpus.

    The text of all PDFs is concatenated (each document preceded by a
    ``=== filename ===`` banner), then chunked, embedded and indexed.

    Args:
        chunk_size: Forwarded to ``self.chunk_text``.
        chunk_overlap: Forwarded to ``self.chunk_text``.

    Returns:
        A ``(status, chunks_display, corpus_summary)`` tuple of strings.
        On any failure ``status`` carries the error message and the other
        two elements are empty strings.
    """
    documents_dir = "documents"

    if not os.path.exists(documents_dir):
        return "Documents folder not found. Please upload a PDF.", "", ""

    # glob yields files in arbitrary order; sort so that the summary and
    # the chunk numbering are the same on every run.
    pdf_files = sorted(glob.glob(os.path.join(documents_dir, "*.pdf")))

    if not pdf_files:
        return "No PDF files found in documents folder. Please upload a PDF.", "", ""

    try:
        # Extract text from all PDFs, collecting pieces and joining once.
        summary_parts = [f"📚 **Loading {len(pdf_files)} documents:**\n\n"]
        text_parts = []
        for pdf_path in pdf_files:
            filename = os.path.basename(pdf_path)
            summary_parts.append(f"- {filename}\n")
            text = self.extract_text_from_pdf(pdf_path)
            text_parts.append(f"\n\n=== {filename} ===\n\n{text}")

        all_text = "".join(text_parts)
        summary_parts.append(f"\n**Total text length:** {len(all_text)} characters\n")
        corpus_summary = "".join(summary_parts)

        # Chunk the combined text
        self.chunks = self.chunk_text(all_text, chunk_size, chunk_overlap)

        if not self.chunks:
            # self.chunks was just replaced, so any previously built index
            # no longer corresponds to it; do not report the system as ready.
            self.ready = False
            return "Error: No valid chunks created from the documents.", "", ""

        # Create embeddings
        self.embeddings = self.create_embeddings(self.chunks)

        # Build index
        self.build_index(self.embeddings)

        self.ready = True

        # Format chunks for display; each preview is capped at 200 characters.
        display_parts = ["### Processed Chunks\n\n"]
        for i, chunk in enumerate(self.chunks, 1):
            preview = chunk[:200] + ("..." if len(chunk) > 200 else "")
            display_parts.append(f"**Chunk {i}** ({len(chunk)} chars)\n```\n{preview}\n```\n\n")
        chunks_display = "".join(display_parts)

        status = f"✅ Success! Processed {len(pdf_files)} documents into {len(self.chunks)} chunks."
        return status, chunks_display, corpus_summary

    except Exception as e:
        # Boundary handler: the caller shows the message in the UI.
        self.ready = False
        return f"Error loading default corpus: {str(e)}", "", ""
79
 
80
  def extract_text_from_pdf(self, pdf_path: str) -> str:
81
  """Extract text from PDF file"""
 
134
  faiss.normalize_L2(embeddings)
135
  self.index.add(embeddings)
136
 
137
+ def process_document(self, pdf_path: str, chunk_size: int = 500, chunk_overlap: int = 50):
138
  """Process a PDF document and create searchable index"""
139
  try:
140
  # Extract text
141
  text = self.extract_text_from_pdf(pdf_path)
142
 
143
  if not text.strip():
144
+ return "Error: No text could be extracted from the PDF.", "", ""
145
 
146
  # Chunk text
147
  self.chunks = self.chunk_text(text, chunk_size, chunk_overlap)
148
 
149
  if not self.chunks:
150
+ return "Error: No valid chunks created from the document.", "", ""
151
 
152
  # Create embeddings
153
  self.embeddings = self.create_embeddings(self.chunks)
 
156
  self.build_index(self.embeddings)
157
 
158
  self.ready = True
159
+
160
+ # Format chunks for display
161
+ chunks_display = "### Processed Chunks\n\n"
162
+ for i, chunk in enumerate(self.chunks, 1):
163
+ chunks_display += f"**Chunk {i}** ({len(chunk)} chars)\n```\n{chunk}\n```\n\n"
164
+
165
+ status = f"✅ Success! Processed {len(self.chunks)} chunks from the document."
166
+ return status, chunks_display, text[:5000] # Return first 5000 chars of original text
167
 
168
  except Exception as e:
169
  self.ready = False
170
+ return f"Error processing document: {str(e)}", "", ""
171
 
172
  def set_embedding_model(self, model_name: str):
173
  """Set or change the embedding model"""