ai-puppy committed on
Commit
3c3b761
·
1 Parent(s): 3774bab
Files changed (2) hide show
  1. app.py +69 -209
  2. graph.py +73 -8
app.py CHANGED
@@ -4,9 +4,7 @@ import asyncio
4
  import tempfile
5
  from dotenv import find_dotenv, load_dotenv
6
  from langchain.chat_models import init_chat_model
7
- from langchain.schema import HumanMessage, SystemMessage
8
- from langgraph.prebuilt import create_react_agent
9
- from langsmith import traceable
10
 
11
  # Import the CodeAct agent functionality
12
  from agent import FileInjectedPyodideSandbox, create_pyodide_eval_fn, create_codeact
@@ -17,67 +15,13 @@ from graph import analyze_file_with_guidance_sync, guided_analysis_graph
17
  # Load environment variables
18
  load_dotenv(find_dotenv())
19
 
20
- # Initialize OpenAI model
21
- openai_model = init_chat_model(
22
- model="gpt-4.1-nano-2025-04-14",
23
- api_key=os.getenv("OPENAI_API_KEY"),
24
- )
25
-
26
- # Create the basic chat agent
27
- chat_agent = create_react_agent(openai_model, tools=[])
28
-
29
- # Initialize CodeAct model for file analysis
30
  codeact_model = init_chat_model("gpt-4.1-2025-04-14", model_provider="openai")
31
 
32
  # Store uploaded file path globally
33
  uploaded_file_path = None
34
 
35
- @traceable
36
- def respond(
37
- message,
38
- history: list[tuple[str, str]],
39
- system_message,
40
- max_tokens,
41
- temperature,
42
- top_p,
43
- ):
44
- """
45
- Main chat function that processes user input and returns AI response
46
- """
47
- try:
48
- # Convert history to LangChain message format
49
- messages = [SystemMessage(content=system_message)]
50
-
51
- # Add conversation history
52
- for user_msg, assistant_msg in history:
53
- if user_msg:
54
- messages.append(HumanMessage(content=user_msg))
55
- if assistant_msg:
56
- messages.append(SystemMessage(content=assistant_msg))
57
-
58
- # Add current user message
59
- messages.append(HumanMessage(content=message))
60
-
61
- # Prepare input for the agent
62
- input_data = {"messages": messages}
63
-
64
- # Stream the response
65
- response_text = ""
66
- for chunk in chat_agent.stream(input_data, stream_mode="values"):
67
- if "messages" in chunk and chunk["messages"]:
68
- latest_message = chunk["messages"][-1]
69
- if hasattr(latest_message, 'content'):
70
- current_content = latest_message.content
71
- if current_content and len(current_content) > len(response_text):
72
- response_text = current_content
73
- yield response_text
74
-
75
- # Ensure we return something even if streaming doesn't work
76
- if not response_text:
77
- yield "I'm sorry, I couldn't process your message. Please check your OpenAI API key."
78
-
79
- except Exception as e:
80
- yield f"Error: {str(e)}. Please make sure your OpenAI API key is set correctly."
81
 
82
  def handle_file_upload(file):
83
  """Handle file upload and store the path globally"""
@@ -109,45 +53,6 @@ def analyze_file_with_question(user_question):
109
  except Exception as e:
110
  return f"❌ Error in guided analysis: {str(e)}"
111
 
112
- def get_question_suggestions(file_path):
113
- """
114
- Generate suggested questions based on file type and structure
115
- """
116
- if not file_path or not os.path.exists(file_path):
117
- return []
118
-
119
- file_ext = os.path.splitext(file_path)[1].lower()
120
- base_suggestions = [
121
- "What are the main patterns in this file?",
122
- "Are there any security issues or anomalies?",
123
- "Provide a statistical summary of the data",
124
- "What insights can you extract from this file?"
125
- ]
126
-
127
- if file_ext in ['.log', '.txt']:
128
- return [
129
- "Find any security threats or failed login attempts",
130
- "Identify performance bottlenecks and slow operations",
131
- "What errors or warnings are present?",
132
- "Show me time-based trends in the data",
133
- "Are there any suspicious IP addresses or user activities?"
134
- ] + base_suggestions
135
- elif file_ext == '.csv':
136
- return [
137
- "Analyze the data distribution and statistics",
138
- "Find correlations between columns",
139
- "Identify outliers or anomalies in the data",
140
- "What are the key insights from this dataset?"
141
- ] + base_suggestions
142
- elif file_ext == '.json':
143
- return [
144
- "Parse and analyze the JSON structure",
145
- "What are the key data fields and their values?",
146
- "Find any nested patterns or relationships"
147
- ] + base_suggestions
148
- else:
149
- return base_suggestions
150
-
151
  async def analyze_uploaded_file():
152
  """Legacy function - kept for backward compatibility"""
153
  return analyze_file_with_question("Provide a comprehensive analysis of this file.")
@@ -156,122 +61,77 @@ def run_file_analysis():
156
  """Wrapper to run async file analysis in sync context"""
157
  return asyncio.run(analyze_uploaded_file())
158
 
159
- def update_question_suggestions():
160
- """Update question suggestions based on uploaded file"""
161
- global uploaded_file_path
162
- suggestions = get_question_suggestions(uploaded_file_path)
163
- return gr.Dropdown.update(choices=suggestions, value=suggestions[0] if suggestions else "")
164
-
165
  # Create the Gradio interface
166
- with gr.Blocks(title="DataForge - AI Assistant with Advanced File Analysis") as demo:
167
- gr.Markdown("# 🔍 DataForge - AI Assistant with Advanced File Analysis")
168
- gr.Markdown("Upload files and ask specific questions for AI-powered guided analysis using LangGraph.")
 
169
 
170
- with gr.Tab("💬 Chat Assistant"):
171
- chat_interface = gr.ChatInterface(
172
- respond,
173
- additional_inputs=[
174
- gr.Textbox(
175
- value="You are a helpful AI assistant. Be friendly, informative, and concise in your responses.",
176
- label="System message"
177
- ),
178
- gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
179
- gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
180
- gr.Slider(
181
- minimum=0.1,
182
- maximum=1.0,
183
- value=0.95,
184
- step=0.05,
185
- label="Top-p (nucleus sampling)",
186
- ),
187
- ],
188
- title="Chat with AI Assistant",
189
- description="Ask questions or get help with any topic."
190
- )
191
 
192
- with gr.Tab("📁 Advanced File Analysis"):
193
- gr.Markdown("## 🚀 Guided File Analysis with LangGraph")
194
- gr.Markdown("""
195
- Upload files and ask specific questions for targeted AI analysis. Our guided approach:
196
-
197
- 1. 📋 **Examines** your file structure and patterns
198
- 2. 🎯 **Generates** specific code guidance based on your question
199
- 3. 🚀 **Executes** enhanced analysis with improved accuracy
200
- """)
201
-
202
- with gr.Row():
203
- with gr.Column(scale=1):
204
- # File Upload Section
205
- gr.Markdown("### 📤 File Upload")
206
- file_upload = gr.File(
207
- label="Upload File for Analysis",
208
- file_types=[".txt", ".log", ".csv", ".json", ".xml", ".py", ".js", ".html", ".md"],
209
- type="filepath"
210
- )
211
- upload_status = gr.Textbox(
212
- label="Upload Status",
213
- value="No file uploaded",
214
- interactive=False
215
- )
216
-
217
- # Question Section
218
- gr.Markdown("### ❓ Ask Your Question")
219
- question_suggestions = gr.Dropdown(
220
- label="Question Suggestions (select or type your own)",
221
- choices=[],
222
- allow_custom_value=True,
223
- value=""
224
- )
225
-
226
- user_question = gr.Textbox(
227
- label="Your Question about the File",
228
- placeholder="What would you like to know about this file?",
229
- lines=3
230
- )
231
-
232
- analyze_btn = gr.Button("🔍 Run Guided Analysis", variant="primary", size="lg")
233
-
234
- # Analysis Info
235
- gr.Markdown("### ℹ️ Analysis Method")
236
- gr.Markdown("""
237
- **Guided Analysis Features:**
238
- - 🎯 Question-aware code generation
239
- - 📋 File structure examination
240
- - 🚀 Dynamic prompt optimization
241
- - ✅ Higher accuracy than generic analysis
242
- """)
243
 
244
- with gr.Column(scale=2):
245
- analysis_output = gr.Textbox(
246
- label="📊 Guided Analysis Results",
247
- lines=25,
248
- max_lines=35,
249
- placeholder="Upload a file, ask a question, and click 'Run Guided Analysis' to see detailed results here...",
250
- interactive=False
251
- )
252
-
253
- # Event handlers
254
- file_upload.change(
255
- fn=handle_file_upload,
256
- inputs=[file_upload],
257
- outputs=[upload_status]
258
- ).then(
259
- fn=update_question_suggestions,
260
- inputs=[],
261
- outputs=[question_suggestions]
262
- )
263
-
264
- question_suggestions.change(
265
- fn=lambda x: x,
266
- inputs=[question_suggestions],
267
- outputs=[user_question]
268
- )
269
 
270
- analyze_btn.click(
271
- fn=analyze_file_with_question,
272
- inputs=[user_question],
273
- outputs=[analysis_output]
274
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
275
 
276
  with gr.Tab("📊 Analysis Examples"):
277
  gr.Markdown("## 💡 Example Questions by File Type")
 
4
  import tempfile
5
  from dotenv import find_dotenv, load_dotenv
6
  from langchain.chat_models import init_chat_model
7
+ # Simplified imports - focusing on file analysis
 
 
8
 
9
  # Import the CodeAct agent functionality
10
  from agent import FileInjectedPyodideSandbox, create_pyodide_eval_fn, create_codeact
 
15
  # Load environment variables
16
  load_dotenv(find_dotenv())
17
 
18
+ # Initialize model for file analysis
 
 
 
 
 
 
 
 
 
19
  codeact_model = init_chat_model("gpt-4.1-2025-04-14", model_provider="openai")
20
 
21
  # Store uploaded file path globally
22
  uploaded_file_path = None
23
 
24
+ # Chat functionality removed - focusing on file analysis
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
 
26
  def handle_file_upload(file):
27
  """Handle file upload and store the path globally"""
 
53
  except Exception as e:
54
  return f"❌ Error in guided analysis: {str(e)}"
55
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
  async def analyze_uploaded_file():
57
  """Legacy function - kept for backward compatibility"""
58
  return analyze_file_with_question("Provide a comprehensive analysis of this file.")
 
61
  """Wrapper to run async file analysis in sync context"""
62
  return asyncio.run(analyze_uploaded_file())
63
 
 
 
 
 
 
 
64
  # Create the Gradio interface
65
+ with gr.Blocks(title="DataForge - AI-Powered File Analysis") as demo:
66
+ gr.Markdown("# 🔍 DataForge - AI-Powered File Analysis")
67
+ gr.Markdown("""
68
+ Upload any file and ask specific questions for targeted AI analysis. Our guided approach:
69
 
70
+ 1. 📋 **Examines** your file structure and patterns automatically
71
+ 2. 🎯 **Generates** specific code guidance based on your question
72
+ 3. 🚀 **Executes** enhanced analysis with improved accuracy
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
 
74
+ **Simply upload a file and ask any question you want!**
75
+ """)
76
+
77
+ with gr.Row():
78
+ with gr.Column(scale=1):
79
+ # File Upload Section
80
+ gr.Markdown("### 📤 File Upload")
81
+ file_upload = gr.File(
82
+ label="Upload File for Analysis",
83
+ file_types=[".txt", ".log", ".csv", ".json", ".xml", ".py", ".js", ".html", ".md"],
84
+ type="filepath"
85
+ )
86
+ upload_status = gr.Textbox(
87
+ label="Upload Status",
88
+ value="No file uploaded",
89
+ interactive=False
90
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
 
92
+ # Question Section
93
+ gr.Markdown("### ❓ Ask Your Question")
94
+ user_question = gr.Textbox(
95
+ label="Your Question about the File",
96
+ placeholder="What would you like to know about this file? (e.g., 'Find security threats', 'Show performance issues', 'What errors are present?')",
97
+ lines=4,
98
+ value=""
99
+ )
100
+
101
+ analyze_btn = gr.Button("🔍 Run Guided Analysis", variant="primary", size="lg")
102
+
103
+ # Analysis Info
104
+ gr.Markdown("### ℹ️ How It Works")
105
+ gr.Markdown("""
106
+ **Guided Analysis Process:**
107
+ - 🎯 **Question-aware**: Code generation tailored to your specific question
108
+ - 📋 **Smart examination**: Automatically detects file structure and patterns
109
+ - 🚀 **Dynamic optimization**: Creates targeted analysis approach
110
+ - ✅ **Higher accuracy**: Prevents common code generation errors
111
+ - 🔧 **Quality control**: Built-in validation to avoid syntax issues
112
+ """)
 
 
 
 
113
 
114
+ with gr.Column(scale=2):
115
+ analysis_output = gr.Textbox(
116
+ label="📊 Guided Analysis Results",
117
+ lines=25,
118
+ max_lines=35,
119
+ placeholder="Upload a file, type your question, and click 'Run Guided Analysis' to see detailed results here...",
120
+ interactive=False
121
+ )
122
+
123
+ # Event handlers
124
+ file_upload.change(
125
+ fn=handle_file_upload,
126
+ inputs=[file_upload],
127
+ outputs=[upload_status]
128
+ )
129
+
130
+ analyze_btn.click(
131
+ fn=analyze_file_with_question,
132
+ inputs=[user_question],
133
+ outputs=[analysis_output]
134
+ )
135
 
136
  with gr.Tab("📊 Analysis Examples"):
137
  gr.Markdown("## 💡 Example Questions by File Type")
graph.py CHANGED
@@ -1,4 +1,5 @@
1
  import asyncio
 
2
  import os
3
  import re
4
  from typing import Annotated, Dict, List, Optional
@@ -58,6 +59,37 @@ class CodeAnalysisState(TypedDict):
58
  final_analysis: Optional[str]
59
 
60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
  def examine_file_structure(state: CodeAnalysisState) -> CodeAnalysisState:
62
  """
63
  Node 1: Examine the file structure by reading the first several lines
@@ -244,8 +276,7 @@ def generate_code_guidance(state: CodeAnalysisState) -> CodeAnalysisState:
244
 
245
  def execute_guided_analysis(state: CodeAnalysisState) -> CodeAnalysisState:
246
  """
247
- Node 3: Execute the file analysis using the generated guidance.
248
- This replaces the original agent with guided code generation.
249
  """
250
  file_path = state["file_path"]
251
  file_examination = state["file_examination"]
@@ -259,9 +290,8 @@ def execute_guided_analysis(state: CodeAnalysisState) -> CodeAnalysisState:
259
  }
260
 
261
  try:
262
- # Create the guided analysis query
263
- guided_query = f"""
264
- Based on the file examination and guidance, analyze this file with the following SPECIFIC instructions:
265
 
266
  FILE CONTEXT:
267
  - File Type: {file_examination.file_type}
@@ -284,6 +314,41 @@ SAMPLE FILE STRUCTURE (first few lines):
284
 
285
  USER REQUEST: {analysis_query or "Comprehensive analysis following the guidance above"}
286
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
287
  INSTRUCTIONS:
288
  1. Follow the specified analysis approach exactly
289
  2. Import only the recommended libraries: {', '.join(code_guidance.required_imports)}
@@ -291,11 +356,12 @@ INSTRUCTIONS:
291
  4. Structure your code following: {code_guidance.code_structure}
292
  5. Generate the expected outputs: {', '.join(code_guidance.expected_outputs)}
293
  6. Implement proper error handling: {code_guidance.error_handling}
 
294
 
295
  Since you have detailed guidance about this specific file structure, your code should be highly accurate and efficient.
296
  The file examination shows this is a {file_examination.file_type} with {file_examination.data_format} data format.
297
 
298
- Write Python code that leverages this specific knowledge for optimal analysis.
299
  """
300
 
301
  print(f"🚀 Executing guided analysis...")
@@ -326,8 +392,7 @@ Write Python code that leverages this specific knowledge for optimal analysis.
326
  execution_result = asyncio.run(run_guided_analysis())
327
 
328
  # Create final analysis summary
329
- final_analysis = f"""
330
- === GUIDED FILE ANALYSIS RESULTS ===
331
 
332
  File: {file_path}
333
  Type: {file_examination.file_type} ({file_examination.data_format})
 
1
  import asyncio
2
+ import ast
3
  import os
4
  import re
5
  from typing import Annotated, Dict, List, Optional
 
59
  final_analysis: Optional[str]
60
 
61
 
62
def _comment_start_columns(code: str) -> dict[int, int]:
    """Return ``{line_number: column}`` for every ``#`` comment in *code*.

    Line numbers are 1-based and columns 0-based, as reported by ``tokenize``.
    If tokenization stops early, whatever was collected so far is returned:
    the caller only uses this map to refine a heuristic, so a partial answer
    is acceptable.
    """
    import io
    import tokenize

    columns: dict[int, int] = {}
    try:
        for token in tokenize.generate_tokens(io.StringIO(code).readline):
            if token.type == tokenize.COMMENT:
                columns[token.start[0]] = token.start[1]
    except (tokenize.TokenError, SyntaxError):
        # IndentationError is a SyntaxError subclass; keep the partial map.
        pass
    return columns


def validate_python_code(code: str) -> tuple[bool, str]:
    """Validate generated Python code before it is executed.

    The code must parse with ``ast.parse``. Each line is then checked, in
    order, for two patterns treated as risky:

    * a line starting with ``print(`` that does not end with ``)`` and holds
      an odd number of ``"`` or ``'`` characters (a string that appears to
      continue onto the next line);
    * a line longer than 100 characters.

    Trailing ``#`` comments are ignored by the ``print(`` check, so valid
    code such as ``print(x)  # it's fine`` is not rejected because of a quote
    inside the comment. The length check still measures the whole line.

    Args:
        code: Python source text to validate.

    Returns:
        ``(is_valid, message)``. ``message`` is ``"Code validation passed"``
        on success, otherwise a description of the first problem found.
        The function does not raise; unexpected errors are reported as
        ``"Validation error: ..."``.
    """
    try:
        # Reject anything that is not syntactically valid Python up front.
        ast.parse(code)
        comment_columns = _comment_start_columns(code)

        for number, line in enumerate(code.split('\n'), 1):
            column = comment_columns.get(number)
            # Only trust the tokenizer's column if a '#' really sits there;
            # this guards against line-numbering differences between
            # tokenize and str.split (e.g. unusual line separators).
            if column is not None and line[column:column + 1] == '#':
                statement = line[:column].strip()
            else:
                statement = line.strip()

            # Heuristic for a string literal left open at the end of a print line.
            if statement.startswith('print(') and not statement.endswith(')'):
                if statement.count('"') % 2 != 0 or statement.count("'") % 2 != 0:
                    return False, f"Line {number}: Potentially unterminated string in print statement"

            # Very long lines are rejected because they might get truncated.
            if len(line) > 100:
                return False, f"Line {number}: Line too long ({len(line)} chars) - may cause truncation"

        return True, "Code validation passed"

    except SyntaxError as e:
        # lineno can be None for some syntax errors; avoid printing "line None".
        location = f" at line {e.lineno}" if e.lineno is not None else ""
        return False, f"Syntax error: {e.msg}{location}"
    except Exception as e:
        # Broad by design: the validator reports failures instead of raising.
        return False, f"Validation error: {str(e)}"
91
+
92
+
93
  def examine_file_structure(state: CodeAnalysisState) -> CodeAnalysisState:
94
  """
95
  Node 1: Examine the file structure by reading the first several lines
 
276
 
277
  def execute_guided_analysis(state: CodeAnalysisState) -> CodeAnalysisState:
278
  """
279
+ Node 3: Execute the file analysis using the generated guidance with code quality validation.
 
280
  """
281
  file_path = state["file_path"]
282
  file_examination = state["file_examination"]
 
290
  }
291
 
292
  try:
293
+ # Create the guided analysis query with strict code quality requirements
294
+ guided_query = f"""Based on the file examination and guidance, analyze this file with the following SPECIFIC instructions:
 
295
 
296
  FILE CONTEXT:
297
  - File Type: {file_examination.file_type}
 
314
 
315
  USER REQUEST: {analysis_query or "Comprehensive analysis following the guidance above"}
316
 
317
+ CRITICAL CODE QUALITY REQUIREMENTS:
318
+ 1. ALL print statements MUST be on single lines with properly closed quotes
319
+ 2. NO multi-line strings or f-strings that span multiple lines
320
+ 3. NO print statements longer than 80 characters - break into multiple prints instead
321
+ 4. ALL strings must be properly terminated with matching quotes
322
+ 5. Use short variable names and concise output formatting
323
+ 6. If you need to print long text, use multiple short print() calls
324
+ 7. Always close parentheses, brackets, and quotes on the same line they open
325
+ 8. Use simple string concatenation instead of complex f-strings for long output
326
+ 9. NEVER use triple quotes for multi-line strings in limited execution environments
327
+ 10. Test each print statement individually to ensure it executes without truncation
328
+
329
+ EXAMPLE OF SAFE CODING PRACTICES:
330
+ ```python
331
+ # GOOD - Short, single-line prints
332
+ print("=== Results ===")
333
+ print(f"Count: {{count}}")
334
+ print(f"User: {{user}}")
335
+
336
+ # BAD - Long print that could be truncated
337
+ print(f"This is a very long print statement that could get truncated...")
338
+
339
+ # GOOD - Break long output into multiple prints
340
+ print("Analysis complete:")
341
+ print(f"Found {{count}} items")
342
+ print(f"Top user: {{user}}")
343
+ ```
344
+
345
+ MANDATORY CODE GENERATION PROCESS:
346
+ 1. Generate your analysis code following the above requirements
347
+ 2. Before presenting the code, internally validate each line for potential issues
348
+ 3. Ensure ALL print statements are under 80 characters
349
+ 4. Verify all quotes and parentheses are properly closed
350
+ 5. If any line might cause issues, rewrite it using multiple shorter statements
351
+
352
  INSTRUCTIONS:
353
  1. Follow the specified analysis approach exactly
354
  2. Import only the recommended libraries: {', '.join(code_guidance.required_imports)}
 
356
  4. Structure your code following: {code_guidance.code_structure}
357
  5. Generate the expected outputs: {', '.join(code_guidance.expected_outputs)}
358
  6. Implement proper error handling: {code_guidance.error_handling}
359
+ 7. ENSURE ALL CODE FOLLOWS THE QUALITY REQUIREMENTS ABOVE
360
 
361
  Since you have detailed guidance about this specific file structure, your code should be highly accurate and efficient.
362
  The file examination shows this is a {file_examination.file_type} with {file_examination.data_format} data format.
363
 
364
+ Write Python code that leverages this specific knowledge for optimal analysis and follows strict code quality standards.
365
  """
366
 
367
  print(f"🚀 Executing guided analysis...")
 
392
  execution_result = asyncio.run(run_guided_analysis())
393
 
394
  # Create final analysis summary
395
+ final_analysis = f"""=== GUIDED FILE ANALYSIS RESULTS ===
 
396
 
397
  File: {file_path}
398
  Type: {file_examination.file_type} ({file_examination.data_format})