Master-warrier committed on
Commit
685adc8
·
1 Parent(s): 87c1568

Add requirements gathering flow

Browse files
app.py ADDED
@@ -0,0 +1,394 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ from utils.google_genai_llm import get_response
4
+ from prompts.requirements_gathering import requirements_gathering_system_prompt
5
+ from PIL import Image
6
+ import os
7
+ import tempfile
8
+ import traceback
9
+ import hashlib
10
+
11
+ # Import Marker for document processing
12
+ try:
13
+ from marker.converters.pdf import PdfConverter
14
+ from marker.models import create_model_dict
15
+ from marker.output import text_from_rendered
16
+ MARKER_AVAILABLE = True
17
+ except ImportError:
18
+ MARKER_AVAILABLE = False
19
+ print("Warning: Marker library not available. PDF, PPT, and DOCX processing will be limited.")
20
+
21
def get_file_hash(file_path):
    """Return the MD5 hex digest of the file at *file_path* (used as a cache key).

    Reads the file in fixed-size chunks so large uploads do not have to be
    loaded into memory at once.  Returns None when the file cannot be read
    (missing path, permission error, ...), so callers can fall back to
    treating the file as uncached.
    """
    try:
        digest = hashlib.md5()
        with open(file_path, 'rb') as f:
            # 1 MiB chunks keep memory bounded even for multi-GB uploads.
            for chunk in iter(lambda: f.read(1024 * 1024), b''):
                digest.update(chunk)
        return digest.hexdigest()
    except Exception:
        # Hashing is best-effort: a None hash simply disables caching.
        return None
29
+
30
def extract_text_with_marker(file_path):
    """Run the Marker pipeline over a PDF/PPT/DOCX file.

    Returns a (stats, text) pair: a short human-readable summary string and
    the extracted plain text.  When Marker is unavailable or conversion
    fails, the summary describes the problem and the text is empty.
    """
    if not MARKER_AVAILABLE:
        return "Marker library not available for document processing.", ""

    try:
        # Build a converter backed by Marker's pre-trained model artifacts.
        pdf_converter = PdfConverter(
            artifact_dict=create_model_dict(),
        )

        # Convert the document, then pull plain text out of the rendering.
        document = pdf_converter(file_path)
        extracted, _, _ = text_from_rendered(document)

        summary = (
            f"Extracted text ({len(extracted.split())} words, "
            f"{len(extracted)} characters)"
        )
        return summary, extracted

    except Exception as e:
        return f"Error processing document: {str(e)}", ""
58
+
59
def process_user_input(message, history, uploaded_files, file_cache):
    """Build the requirements-gathering prompt and return the AI reply.

    Parameters:
        message: the user's latest chat message.
        history: list of (user_msg, ai_msg) tuples from the Gradio chatbot.
        uploaded_files: list of file paths uploaded in the session (may be empty).
        file_cache: dict keyed by "name_hash" holding previously extracted
            document text, so expensive Marker runs are not repeated.

    Returns:
        (ai_response, file_cache) — the model's reply and the (possibly
        updated) cache dict.
    """

    # Flatten the chat history into a plain-text transcript for the prompt.
    conversation_history = ""
    if history:
        for user_msg, ai_msg in history:
            conversation_history += f"User: {user_msg}\n"
            if ai_msg:
                conversation_history += f"Assistant: {ai_msg}\n"

    # Describe every uploaded file so the model knows what data is available.
    if uploaded_files:
        file_info = "\n[UPLOADED_FILES]\n"
        new_file_cache = file_cache.copy() if file_cache else {}

        for file_path in uploaded_files:
            try:
                # basename() is portable, unlike split('/') (breaks on Windows paths).
                file_name = os.path.basename(file_path)
                file_extension = os.path.splitext(file_name)[1].lower()
                file_hash = get_file_hash(file_path)
                cache_key = f"{file_name}_{file_hash}"

                # Tabular data: report shape and column names.
                if file_extension == '.csv':
                    df = pd.read_csv(file_path)
                    file_info += f"- {file_name}: CSV file with {len(df)} rows and {len(df.columns)} columns\n"
                    file_info += f"  Columns: {', '.join(df.columns.tolist())}\n"

                elif file_extension in ['.xlsx', '.xls']:
                    df = pd.read_excel(file_path)
                    file_info += f"- {file_name}: Excel file with {len(df)} rows and {len(df.columns)} columns\n"
                    file_info += f"  Columns: {', '.join(df.columns.tolist())}\n"

                # Documents: extract text with Marker, cached by content hash so
                # re-sending the same file does not re-run the model pipeline.
                elif file_extension in ['.pdf', '.ppt', '.pptx', '.doc', '.docx']:
                    file_size = os.path.getsize(file_path)
                    file_size_mb = round(file_size / (1024 * 1024), 2)

                    if cache_key in new_file_cache:
                        # Use cached extraction results.
                        extraction_stats = new_file_cache[cache_key]['stats']
                        extracted_text = new_file_cache[cache_key]['text']
                        status = "(cached)"
                    else:
                        # Process new file with Marker and cache the results.
                        extraction_stats, extracted_text = extract_text_with_marker(file_path)
                        new_file_cache[cache_key] = {
                            'stats': extraction_stats,
                            'text': extracted_text,
                            'file_name': file_name,
                            'file_path': file_path,
                        }
                        status = "(newly processed)"

                    # Human-readable document type for the prompt.
                    if file_extension == '.pdf':
                        doc_type = "PDF document"
                    elif file_extension in ['.ppt', '.pptx']:
                        doc_type = "PowerPoint presentation"
                    else:
                        doc_type = "Word document"

                    file_info += f"- {file_name}: {doc_type}, Size: {file_size_mb} MB {status}\n"
                    file_info += f"  Content: {extraction_stats}\n"

                    # Include extracted text in the context for the model.
                    if extracted_text and len(extracted_text.strip()) > 0:
                        # Truncate very long texts (keep first 200,000 chars).
                        text_preview = extracted_text[:200000] + "..." if len(extracted_text) > 200000 else extracted_text
                        file_info += f"  Text Preview: {text_preview}\n"

                # Images: report dimensions and size (no content extraction).
                elif file_extension in ['.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff', '.webp']:
                    with Image.open(file_path) as img:
                        width, height = img.size
                        mode = img.mode
                    file_size = os.path.getsize(file_path)
                    file_size_mb = round(file_size / (1024 * 1024), 2)
                    file_info += f"- {file_name}: {file_extension.upper()[1:]} image file\n"
                    file_info += f"  Dimensions: {width}x{height} pixels, Mode: {mode}, Size: {file_size_mb} MB\n"

                elif file_extension == '.json':
                    file_size = os.path.getsize(file_path)
                    file_size_kb = round(file_size / 1024, 2)
                    file_info += f"- {file_name}: JSON file, Size: {file_size_kb} KB\n"

                elif file_extension == '.txt':
                    with open(file_path, 'r', encoding='utf-8') as f:
                        lines = len(f.readlines())
                    file_size = os.path.getsize(file_path)
                    file_size_kb = round(file_size / 1024, 2)
                    file_info += f"- {file_name}: Text file with {lines} lines, Size: {file_size_kb} KB\n"

                # Anything else: just acknowledge the upload.
                else:
                    file_size = os.path.getsize(file_path)
                    file_size_kb = round(file_size / 1024, 2)
                    file_info += f"- {file_name}: File uploaded, Size: {file_size_kb} KB\n"

            except Exception as e:
                # Never let one bad file break the chat turn; note it and move on.
                file_info += f"- {os.path.basename(file_path)}: File uploaded (unable to preview: {str(e)})\n"
                print(f"Error processing file {file_path}: {traceback.format_exc()}")

        conversation_history += file_info

        # Persist newly extracted documents back into the shared cache.
        # Guarded so a None cache (e.g. fresh session state) cannot crash.
        if file_cache is not None:
            file_cache.update(new_file_cache)
        else:
            file_cache = new_file_cache

    # Fill the system prompt template and query the LLM.
    formatted_prompt = requirements_gathering_system_prompt.format(
        conversation_history=conversation_history,
        query=message,
    )
    ai_response = get_response(formatted_prompt)

    return ai_response, file_cache
183
+
184
def chat_interface(message, history, uploaded_files, file_cache):
    """Handle one chat turn: query the model and record the exchange.

    Returns (chatbot_value, state_history, cleared_textbox, file_cache),
    matching the Gradio output wiring.
    """
    # Delegate the heavy lifting (file summaries + LLM call).
    reply, cache = process_user_input(message, history, uploaded_files, file_cache)

    # Append the turn; the same list backs both the Chatbot and the State.
    history.append((message, reply))

    return history, history, "", cache
194
+
195
def clear_chat():
    """Reset the chatbot display, the stored history, and the file cache."""
    # Fresh containers for: chatbot value, chat_history state, file_cache state.
    return list(), list(), dict()
198
+
199
def upload_file_handler(files):
    """Pass uploaded files through, normalising a falsy value (None/empty) to []."""
    return files if files else []
204
+
205
# Custom CSS for a sleek design.
# NOTE(review): class hooks (.chat-container, .upload-area, .btn-primary,
# .btn-secondary) are attached via elem_classes on the Gradio components below;
# .title/.subtitle are used by the raw gr.HTML header.
custom_css = """
.gradio-container {
    max-width: 900px !important;
    margin: auto !important;
}

.chat-container {
    height: 600px !important;
}

#component-0 {
    height: 100vh;
}

.message {
    padding: 15px !important;
    margin: 10px 0 !important;
    border-radius: 15px !important;
}

.user-message {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
    color: white !important;
    margin-left: 20% !important;
}

.bot-message {
    background: linear-gradient(135deg, #f093fb 0%, #f5576c 100%) !important;
    color: white !important;
    margin-right: 20% !important;
}

.upload-area {
    border: 2px dashed #4f46e5 !important;
    border-radius: 10px !important;
    padding: 20px !important;
    text-align: center !important;
    background: linear-gradient(135deg, #f0f4ff 0%, #e0e7ff 100%) !important;
}

.btn-primary {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
    border: none !important;
    border-radius: 25px !important;
    padding: 10px 25px !important;
    font-weight: bold !important;
}

.btn-secondary {
    background: linear-gradient(135deg, #ffeaa7 0%, #fab1a0 100%) !important;
    border: none !important;
    border-radius: 25px !important;
    padding: 10px 25px !important;
    font-weight: bold !important;
    color: #2d3436 !important;
}

.title {
    text-align: center !important;
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
    -webkit-background-clip: text !important;
    -webkit-text-fill-color: transparent !important;
    font-size: 2.5em !important;
    font-weight: bold !important;
    margin-bottom: 20px !important;
}

.subtitle {
    text-align: center !important;
    color: #6c757d !important;
    font-size: 1.2em !important;
    margin-bottom: 30px !important;
}
"""
280
+
281
# Create the Gradio interface.
# Layout: left column = chatbot + message box + send/clear buttons;
# right column = file upload widgets + usage instructions.
with gr.Blocks(css=custom_css, title="Data Science Requirements Gathering Agent") as app:

    # Header
    gr.HTML("""
    <div class="title">🔬 Data Science Requirements Agent</div>
    <div class="subtitle">
        Transform your vague ideas into fully specified, actionable data science tasks
    </div>
    """)

    with gr.Row():
        with gr.Column(scale=3):
            # Chat interface
            chatbot = gr.Chatbot(
                label="Requirements Gathering Conversation",
                height=500,
                show_copy_button=True,
                bubble_full_width=False,
                elem_classes=["chat-container"]
            )

            with gr.Row():
                with gr.Column(scale=4):
                    msg = gr.Textbox(
                        placeholder="Describe your data science project or ask a question...",
                        label="Your Message",
                        lines=2,
                        max_lines=5
                    )
                with gr.Column(scale=1):
                    send_btn = gr.Button("Send 📤", variant="primary", elem_classes=["btn-primary"])

            with gr.Row():
                clear_btn = gr.Button("Clear Chat 🗑️", variant="secondary", elem_classes=["btn-secondary"])

        with gr.Column(scale=1):
            # File upload section
            gr.HTML("<h3 style='text-align: center; color: #4f46e5;'>📁 Upload Data Files</h3>")

            file_upload = gr.File(
                label="Upload your files (CSV, Excel, PDF, PPT, DOCX, Images, etc.)",
                file_count="multiple",
                file_types=[".csv", ".xlsx", ".xls", ".json", ".txt", ".pdf", ".ppt", ".pptx", ".doc", ".docx", ".png", ".jpg", ".jpeg", ".gif", ".bmp", ".tiff", ".webp"],
                elem_classes=["upload-area"]
            )

            # Read-only mirror of the uploads; this (not file_upload) is what
            # the send handlers read, populated by file_upload.change below.
            uploaded_files_display = gr.File(
                label="Uploaded Files",
                file_count="multiple",
                interactive=False,
                visible=True
            )

            # Instructions
            gr.HTML("""
            <div style="padding: 15px; background: linear-gradient(135deg, #e3f2fd 0%, #f3e5f5 100%);
                        border-radius: 10px; margin-top: 20px;">
                <h4 style="color: #4f46e5; margin-bottom: 10px;">💡 How it works:</h4>
                <ol style="color: #555; font-size: 14px; line-height: 1.6;">
                    <li>Describe your data science project</li>
                    <li>Upload your files (data, documents, images)</li>
                    <li>Answer clarifying questions</li>
                    <li>Get a complete task specification</li>
                </ol>
                <p style="color: #666; font-size: 12px; margin-top: 10px;">
                    📄 Supports: CSV, Excel, PDF, PowerPoint, Word docs, Images, JSON, Text files
                </p>
            </div>
            """)

    # State for conversation history and the per-session document-text cache.
    chat_history = gr.State([])
    file_cache = gr.State({})

    # Event handlers
    def handle_send(message, history, files, cache):
        # Ignore blank messages; otherwise delegate one turn to chat_interface.
        if message.strip():
            new_history, updated_history, cleared_input, updated_cache = chat_interface(message, history, files, cache)
            return new_history, updated_history, cleared_input, updated_cache
        return history, history, message, cache

    # Wire up the interface: button click and textbox submit share one handler.
    send_btn.click(
        handle_send,
        inputs=[msg, chat_history, uploaded_files_display, file_cache],
        outputs=[chatbot, chat_history, msg, file_cache]
    )

    msg.submit(
        handle_send,
        inputs=[msg, chat_history, uploaded_files_display, file_cache],
        outputs=[chatbot, chat_history, msg, file_cache]
    )

    clear_btn.click(
        clear_chat,
        outputs=[chatbot, chat_history, file_cache]
    )

    # Mirror new uploads into the read-only display component.
    file_upload.change(
        lambda files: files,
        inputs=[file_upload],
        outputs=[uploaded_files_display]
    )

    # Welcome message shown when the page first loads.
    app.load(
        lambda: [(None, "👋 Hello! I'm your Data Science Project Agent. I'll help you transform your project ideas into reality .\n\n🚀 **Let's get started!** Tell me about your data science project or what you're trying to achieve.")],
        outputs=[chatbot]
    )

if __name__ == "__main__":
    # share=True exposes a public Gradio link; show_error surfaces tracebacks in the UI.
    app.launch(share=True, show_error=True)
configs.py ADDED
File without changes
constants.py ADDED
File without changes
prompts/__pycache__/requirements_gathering.cpython-310.pyc ADDED
Binary file (3.25 kB). View file
 
prompts/requirements_gathering.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# System prompt template for the scoping agent.  Placeholders filled via
# .format(): {conversation_history} (flattened transcript + file summaries)
# and {query} (the user's original request).
# Fix: section label was misspelled "[ORGINAL_USER_QUERY]".
requirements_gathering_system_prompt = """[SYSTEM_ROLE]
You are an expert Data Science Scoping Agent. Your single purpose is to interact with a user to transform their vague request into a fully specified, actionable data science task. You are methodical, precise, and never make assumptions. Your job includes collecting the required data file from the user.

[PRIMARY_DIRECTIVE]
Your goal is to gather all necessary information and the data file by asking targeted, clarifying questions and prompts. You must continue this process until the task is completely defined. Do not attempt to answer the user's request or perform the task yourself; your only job is to define it and collect the necessary data.

[AREAS_OF_INQUIRY_CHECKLIST]
You must ensure you have clear answers and all necessary materials for the following areas before you conclude the clarification process:
1. **Project Objective:** What is the primary business or research goal? (e.g., "predict employee churn," "classify customer feedback," "forecast next quarter's sales").
2. **Data Source:** Has the user attached the data file? (After understanding the objective, your next step should be to ask for the file).
3. **Target Variable:** Exactly which column in the provided data is to be predicted or is the focus of the analysis?
4. **Input Features:** Exactly which columns from the data should be used as inputs to influence the outcome?
5. **Evaluation Metric:** How will the success of the final model or analysis be measured? (e.g., Accuracy, Precision, Recall for classification; RMSE, MAE for regression; or a business KPI like "reduction in churn rate").
6. **Deliverable:** What is the desired final output? (e.g., a summary report, a visualization, a trained model file, a prediction API).

[OPERATING_PROCEDURE]
You must follow these steps in every interaction:
1. Analyze the complete `[CONVERSATION_HISTORY]`.
2. Compare the user's answers and provided files against the `[AREAS_OF_INQUIRY_CHECKLIST]`.
3. **If details or files are missing:**
    * Identify the single most critical piece of missing information or the required file.
    * Ask ONE clear, concise question or make a single request (e.g., to attach the data).
    * Do NOT ask multiple questions at once. Acknowledge the user's last answer briefly before asking the new question.
4. **If ALL checklist items are answered and files received:**
    * Do NOT ask any more questions.
    * State that you have all the necessary information.
    * Provide a final, structured summary of the task specification under the heading "### Final Task Specification".

----------------------------------------------------
[CONVERSATION_HISTORY]
{conversation_history}
----------------------------------------------------
[ORIGINAL_USER_QUERY]
{query}
----------------------------------------------------
[CURRENT_TASK]
Based on the `[OPERATING_PROCEDURE]` and the provided `[CONVERSATION_HISTORY]`, perform your next action: either ask your next clarifying question, request a file, or provide the final task summary.
"""
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ google-genai
2
+ gradio
3
+ pandas
4
+ python-dotenv==1.0.1
5
+ openpyxl
6
+ Pillow
7
+ marker-pdf[full]
utils/__pycache__/google_genai_llm.cpython-310.pyc ADDED
Binary file (601 Bytes). View file
 
utils/google_genai_llm.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from google import genai
2
+ from dotenv import load_dotenv
3
+ import os
4
+ load_dotenv()
5
+
6
+ client = genai.Client(api_key=os.getenv("GOOGLE_API_KEY"))
7
+
8
def get_response(prompt: str) -> str:
    """Send *prompt* to the Gemini model and return the text of the reply.

    Uses the module-level `client`, which is configured from the
    GOOGLE_API_KEY environment variable at import time.
    """
    response = client.models.generate_content(
        model="gemini-2.5-flash-preview-05-20",
        contents=prompt,
    )
    # google-genai responses expose the concatenated text parts via `.text`.
    return response.text