Spaces:
Running
Running
Merge branch 'main' of https://huggingface.co/spaces/Agents-MCP-Hackathon/DataForge
Browse files
app.py
CHANGED
@@ -4,9 +4,7 @@ import asyncio
|
|
4 |
import tempfile
|
5 |
from dotenv import find_dotenv, load_dotenv
|
6 |
from langchain.chat_models import init_chat_model
|
7 |
-
|
8 |
-
from langgraph.prebuilt import create_react_agent
|
9 |
-
from langsmith import traceable
|
10 |
|
11 |
# Import the CodeAct agent functionality
|
12 |
from agent import FileInjectedPyodideSandbox, create_pyodide_eval_fn, create_codeact
|
@@ -17,77 +15,29 @@ from graph import analyze_file_with_guidance_sync, guided_analysis_graph
|
|
17 |
# Load environment variables
|
18 |
load_dotenv(find_dotenv())
|
19 |
|
20 |
-
# Initialize
|
21 |
-
openai_model = init_chat_model(
|
22 |
-
model="gpt-4.1-nano-2025-04-14",
|
23 |
-
api_key=os.getenv("OPENAI_API_KEY"),
|
24 |
-
)
|
25 |
-
|
26 |
-
# Create the basic chat agent
|
27 |
-
chat_agent = create_react_agent(openai_model, tools=[])
|
28 |
-
|
29 |
-
# Initialize CodeAct model for file analysis
|
30 |
codeact_model = init_chat_model("gpt-4.1-2025-04-14", model_provider="openai")
|
31 |
|
32 |
# Store uploaded file path globally
|
33 |
uploaded_file_path = None
|
34 |
|
35 |
-
|
36 |
-
def respond(
|
37 |
-
message,
|
38 |
-
history: list[tuple[str, str]],
|
39 |
-
system_message,
|
40 |
-
max_tokens,
|
41 |
-
temperature,
|
42 |
-
top_p,
|
43 |
-
):
|
44 |
-
"""
|
45 |
-
Main chat function that processes user input and returns AI response
|
46 |
-
"""
|
47 |
-
try:
|
48 |
-
# Convert history to LangChain message format
|
49 |
-
messages = [SystemMessage(content=system_message)]
|
50 |
-
|
51 |
-
# Add conversation history
|
52 |
-
for user_msg, assistant_msg in history:
|
53 |
-
if user_msg:
|
54 |
-
messages.append(HumanMessage(content=user_msg))
|
55 |
-
if assistant_msg:
|
56 |
-
messages.append(SystemMessage(content=assistant_msg))
|
57 |
-
|
58 |
-
# Add current user message
|
59 |
-
messages.append(HumanMessage(content=message))
|
60 |
-
|
61 |
-
# Prepare input for the agent
|
62 |
-
input_data = {"messages": messages}
|
63 |
-
|
64 |
-
# Stream the response
|
65 |
-
response_text = ""
|
66 |
-
for chunk in chat_agent.stream(input_data, stream_mode="values"):
|
67 |
-
if "messages" in chunk and chunk["messages"]:
|
68 |
-
latest_message = chunk["messages"][-1]
|
69 |
-
if hasattr(latest_message, 'content'):
|
70 |
-
current_content = latest_message.content
|
71 |
-
if current_content and len(current_content) > len(response_text):
|
72 |
-
response_text = current_content
|
73 |
-
yield response_text
|
74 |
-
|
75 |
-
# Ensure we return something even if streaming doesn't work
|
76 |
-
if not response_text:
|
77 |
-
yield "I'm sorry, I couldn't process your message. Please check your OpenAI API key."
|
78 |
-
|
79 |
-
except Exception as e:
|
80 |
-
yield f"Error: {str(e)}. Please make sure your OpenAI API key is set correctly."
|
81 |
|
82 |
def handle_file_upload(file):
|
83 |
"""Handle file upload and store the path globally"""
|
84 |
global uploaded_file_path
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
89 |
uploaded_file_path = None
|
90 |
-
return "❌
|
91 |
|
92 |
def analyze_file_with_question(user_question):
|
93 |
"""
|
@@ -95,13 +45,13 @@ def analyze_file_with_question(user_question):
|
|
95 |
"""
|
96 |
global uploaded_file_path
|
97 |
|
98 |
-
if not uploaded_file_path or not os.path.exists(uploaded_file_path):
|
99 |
-
return "❌ No file uploaded or file not found. Please upload a file first."
|
100 |
-
|
101 |
-
if not user_question or user_question.strip() == "":
|
102 |
-
user_question = "Provide a comprehensive analysis of this file including security, performance, and data insights."
|
103 |
-
|
104 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
105 |
# Use the new guided analysis approach
|
106 |
result = analyze_file_with_guidance_sync(uploaded_file_path, user_question)
|
107 |
return result
|
@@ -109,45 +59,6 @@ def analyze_file_with_question(user_question):
|
|
109 |
except Exception as e:
|
110 |
return f"❌ Error in guided analysis: {str(e)}"
|
111 |
|
112 |
-
def get_question_suggestions(file_path):
|
113 |
-
"""
|
114 |
-
Generate suggested questions based on file type and structure
|
115 |
-
"""
|
116 |
-
if not file_path or not os.path.exists(file_path):
|
117 |
-
return []
|
118 |
-
|
119 |
-
file_ext = os.path.splitext(file_path)[1].lower()
|
120 |
-
base_suggestions = [
|
121 |
-
"What are the main patterns in this file?",
|
122 |
-
"Are there any security issues or anomalies?",
|
123 |
-
"Provide a statistical summary of the data",
|
124 |
-
"What insights can you extract from this file?"
|
125 |
-
]
|
126 |
-
|
127 |
-
if file_ext in ['.log', '.txt']:
|
128 |
-
return [
|
129 |
-
"Find any security threats or failed login attempts",
|
130 |
-
"Identify performance bottlenecks and slow operations",
|
131 |
-
"What errors or warnings are present?",
|
132 |
-
"Show me time-based trends in the data",
|
133 |
-
"Are there any suspicious IP addresses or user activities?"
|
134 |
-
] + base_suggestions
|
135 |
-
elif file_ext == '.csv':
|
136 |
-
return [
|
137 |
-
"Analyze the data distribution and statistics",
|
138 |
-
"Find correlations between columns",
|
139 |
-
"Identify outliers or anomalies in the data",
|
140 |
-
"What are the key insights from this dataset?"
|
141 |
-
] + base_suggestions
|
142 |
-
elif file_ext == '.json':
|
143 |
-
return [
|
144 |
-
"Parse and analyze the JSON structure",
|
145 |
-
"What are the key data fields and their values?",
|
146 |
-
"Find any nested patterns or relationships"
|
147 |
-
] + base_suggestions
|
148 |
-
else:
|
149 |
-
return base_suggestions
|
150 |
-
|
151 |
async def analyze_uploaded_file():
|
152 |
"""Legacy function - kept for backward compatibility"""
|
153 |
return analyze_file_with_question("Provide a comprehensive analysis of this file.")
|
@@ -156,167 +67,122 @@ def run_file_analysis():
|
|
156 |
"""Wrapper to run async file analysis in sync context"""
|
157 |
return asyncio.run(analyze_uploaded_file())
|
158 |
|
159 |
-
def update_question_suggestions():
|
160 |
-
"""Update question suggestions based on uploaded file"""
|
161 |
-
global uploaded_file_path
|
162 |
-
suggestions = get_question_suggestions(uploaded_file_path)
|
163 |
-
return gr.Dropdown.update(choices=suggestions, value=suggestions[0] if suggestions else "")
|
164 |
-
|
165 |
# Create the Gradio interface
|
166 |
-
with gr.Blocks(title="DataForge - AI
|
167 |
-
gr.Markdown("# 🔍 DataForge - AI
|
168 |
-
gr.Markdown("
|
|
|
169 |
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
-
additional_inputs=[
|
174 |
-
gr.Textbox(
|
175 |
-
value="You are a helpful AI assistant. Be friendly, informative, and concise in your responses.",
|
176 |
-
label="System message"
|
177 |
-
),
|
178 |
-
gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
|
179 |
-
gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
|
180 |
-
gr.Slider(
|
181 |
-
minimum=0.1,
|
182 |
-
maximum=1.0,
|
183 |
-
value=0.95,
|
184 |
-
step=0.05,
|
185 |
-
label="Top-p (nucleus sampling)",
|
186 |
-
),
|
187 |
-
],
|
188 |
-
title="Chat with AI Assistant",
|
189 |
-
description="Ask questions or get help with any topic."
|
190 |
-
)
|
191 |
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
-
|
199 |
-
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
-
|
204 |
-
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
-
file_types=[".txt", ".log", ".csv", ".json", ".xml", ".py", ".js", ".html", ".md"],
|
209 |
-
type="filepath"
|
210 |
-
)
|
211 |
-
upload_status = gr.Textbox(
|
212 |
-
label="Upload Status",
|
213 |
-
value="No file uploaded",
|
214 |
-
interactive=False
|
215 |
-
)
|
216 |
-
|
217 |
-
# Question Section
|
218 |
-
gr.Markdown("### ❓ Ask Your Question")
|
219 |
-
question_suggestions = gr.Dropdown(
|
220 |
-
label="Question Suggestions (select or type your own)",
|
221 |
-
choices=[],
|
222 |
-
allow_custom_value=True,
|
223 |
-
value=""
|
224 |
-
)
|
225 |
-
|
226 |
-
user_question = gr.Textbox(
|
227 |
-
label="Your Question about the File",
|
228 |
-
placeholder="What would you like to know about this file?",
|
229 |
-
lines=3
|
230 |
-
)
|
231 |
-
|
232 |
-
analyze_btn = gr.Button("🔍 Run Guided Analysis", variant="primary", size="lg")
|
233 |
-
|
234 |
-
# Analysis Info
|
235 |
-
gr.Markdown("### ℹ️ Analysis Method")
|
236 |
-
gr.Markdown("""
|
237 |
-
**Guided Analysis Features:**
|
238 |
-
- 🎯 Question-aware code generation
|
239 |
-
- 📋 File structure examination
|
240 |
-
- 🚀 Dynamic prompt optimization
|
241 |
-
- ✅ Higher accuracy than generic analysis
|
242 |
-
""")
|
243 |
|
244 |
-
|
245 |
-
|
246 |
-
|
247 |
-
|
248 |
-
|
249 |
-
|
250 |
-
|
251 |
-
|
252 |
-
|
253 |
-
# Event handlers
|
254 |
-
file_upload.change(
|
255 |
-
fn=handle_file_upload,
|
256 |
-
inputs=[file_upload],
|
257 |
-
outputs=[upload_status]
|
258 |
-
).then(
|
259 |
-
fn=update_question_suggestions,
|
260 |
-
inputs=[],
|
261 |
-
outputs=[question_suggestions]
|
262 |
-
)
|
263 |
-
|
264 |
-
question_suggestions.change(
|
265 |
-
fn=lambda x: x,
|
266 |
-
inputs=[question_suggestions],
|
267 |
-
outputs=[user_question]
|
268 |
-
)
|
269 |
-
|
270 |
-
analyze_btn.click(
|
271 |
-
fn=analyze_file_with_question,
|
272 |
-
inputs=[user_question],
|
273 |
-
outputs=[analysis_output]
|
274 |
-
)
|
275 |
-
|
276 |
-
with gr.Tab("📊 Analysis Examples"):
|
277 |
-
gr.Markdown("## 💡 Example Questions by File Type")
|
278 |
-
|
279 |
-
with gr.Accordion("🔐 Security Analysis Questions", open=False):
|
280 |
-
gr.Markdown("""
|
281 |
-
**For Log Files:**
|
282 |
-
- "Find any failed login attempts and suspicious IP addresses"
|
283 |
-
- "Identify potential security threats or anomalies"
|
284 |
-
- "Show me authentication errors and user access patterns"
|
285 |
-
- "Are there any brute force attacks or repeated failures?"
|
286 |
|
287 |
-
|
288 |
-
- "Detect unusual access patterns or potential intrusions"
|
289 |
-
- "Find requests with suspicious user agents or payloads"
|
290 |
-
- "Identify high-frequency requests from single IPs"
|
291 |
-
""")
|
292 |
-
|
293 |
-
with gr.Accordion("⚡ Performance Analysis Questions", open=False):
|
294 |
-
gr.Markdown("""
|
295 |
-
**For Application Logs:**
|
296 |
-
- "Which API endpoints are slowest and why?"
|
297 |
-
- "Find performance bottlenecks and response time issues"
|
298 |
-
- "Show me timeout errors and failed requests"
|
299 |
-
- "What are the peak usage times and load patterns?"
|
300 |
|
301 |
-
|
302 |
-
|
303 |
-
- "Find database query performance problems"
|
304 |
-
- "Show me error rates and system health indicators"
|
305 |
-
""")
|
306 |
-
|
307 |
-
with gr.Accordion("📈 Data Analysis Questions", open=False):
|
308 |
gr.Markdown("""
|
309 |
-
**
|
310 |
-
-
|
311 |
-
-
|
312 |
-
-
|
313 |
-
-
|
314 |
-
|
315 |
-
**For JSON Files:**
|
316 |
-
- "Parse the structure and extract key information"
|
317 |
-
- "Find patterns in nested data and relationships"
|
318 |
-
- "Summarize the main data points and values"
|
319 |
""")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
320 |
|
321 |
if __name__ == "__main__":
|
|
|
322 |
demo.launch()
|
|
|
4 |
import tempfile
|
5 |
from dotenv import find_dotenv, load_dotenv
|
6 |
from langchain.chat_models import init_chat_model
|
7 |
+
# Simplified imports - focusing on file analysis
|
|
|
|
|
8 |
|
9 |
# Import the CodeAct agent functionality
|
10 |
from agent import FileInjectedPyodideSandbox, create_pyodide_eval_fn, create_codeact
|
|
|
15 |
# Load environment variables
|
16 |
load_dotenv(find_dotenv())
|
17 |
|
18 |
+
# Initialize model for file analysis
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
19 |
codeact_model = init_chat_model("gpt-4.1-2025-04-14", model_provider="openai")
|
20 |
|
21 |
# Store uploaded file path globally
|
22 |
uploaded_file_path = None
|
23 |
|
24 |
+
# Chat functionality removed - focusing on file analysis
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
25 |
|
26 |
def handle_file_upload(file):
    """
    Record the uploaded file's path in the module-level ``uploaded_file_path``.

    Args:
        file: Value delivered by the upload component. With
            ``type="filepath"`` this is the path of the uploaded file as a
            string, or ``None`` when the upload is cleared.

    Returns:
        A status message: success (with the file's base name), "no file",
        or an upload error description. The function never raises for a
        value that is not path-like; it reports the problem instead.
    """
    global uploaded_file_path

    # None, "" and other empty values all mean "nothing was uploaded".
    # An empty path must not be stored: it would be reported as a
    # successful upload with a blank file name.
    if not file:
        uploaded_file_path = None
        return "❌ No file uploaded"

    try:
        # os.fspath() is the only step that can fail here; it raises
        # TypeError when the value is not a str/bytes/os.PathLike object.
        filename = os.path.basename(os.fspath(file))
    except TypeError as e:
        uploaded_file_path = None
        return f"❌ Upload error: {str(e)}"

    # Only publish the path once it is known to be usable.
    uploaded_file_path = file
    return f"✅ File uploaded successfully: {filename}"
|
41 |
|
42 |
def analyze_file_with_question(user_question):
|
43 |
"""
|
|
|
45 |
"""
|
46 |
global uploaded_file_path
|
47 |
|
|
|
|
|
|
|
|
|
|
|
|
|
48 |
try:
|
49 |
+
if not uploaded_file_path or not os.path.exists(uploaded_file_path):
|
50 |
+
return "❌ No file uploaded or file not found. Please upload a file first."
|
51 |
+
|
52 |
+
if not user_question or user_question.strip() == "":
|
53 |
+
user_question = "Provide a comprehensive analysis of this file including security, performance, and data insights."
|
54 |
+
|
55 |
# Use the new guided analysis approach
|
56 |
result = analyze_file_with_guidance_sync(uploaded_file_path, user_question)
|
57 |
return result
|
|
|
59 |
except Exception as e:
|
60 |
return f"❌ Error in guided analysis: {str(e)}"
|
61 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
62 |
async def analyze_uploaded_file():
|
63 |
"""Legacy function - kept for backward compatibility"""
|
64 |
return analyze_file_with_question("Provide a comprehensive analysis of this file.")
|
|
|
67 |
"""Wrapper to run async file analysis in sync context"""
|
68 |
return asyncio.run(analyze_uploaded_file())
|
69 |
|
|
|
|
|
|
|
|
|
|
|
|
|
70 |
# Create the Gradio interface
|
71 |
+
with gr.Blocks(title="DataForge - AI-Powered File Analysis") as demo:
|
72 |
+
gr.Markdown("# 🔍 DataForge - AI-Powered File Analysis")
|
73 |
+
gr.Markdown("""
|
74 |
+
Upload any file and ask specific questions for targeted AI analysis. Our guided approach:
|
75 |
|
76 |
+
1. 📋 **Examines** your file structure and patterns automatically
|
77 |
+
2. 🎯 **Generates** specific code guidance based on your question
|
78 |
+
3. 🚀 **Executes** enhanced analysis with improved accuracy
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
79 |
|
80 |
+
**Simply upload a file and ask any question you want!**
|
81 |
+
""")
|
82 |
+
|
83 |
+
with gr.Row():
|
84 |
+
with gr.Column(scale=1):
|
85 |
+
# File Upload Section
|
86 |
+
gr.Markdown("### 📤 File Upload")
|
87 |
+
file_upload = gr.File(
|
88 |
+
label="Upload File for Analysis",
|
89 |
+
type="filepath"
|
90 |
+
)
|
91 |
+
upload_status = gr.Textbox(
|
92 |
+
label="Upload Status",
|
93 |
+
value="No file uploaded",
|
94 |
+
interactive=False
|
95 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
96 |
|
97 |
+
# Question Section
|
98 |
+
gr.Markdown("### ❓ Ask Your Question")
|
99 |
+
user_question = gr.Textbox(
|
100 |
+
label="Your Question about the File",
|
101 |
+
placeholder="What would you like to know about this file? (e.g., 'Find security threats', 'Show performance issues', 'What errors are present?')",
|
102 |
+
lines=4,
|
103 |
+
value=""
|
104 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
105 |
|
106 |
+
analyze_btn = gr.Button("🔍 Run Guided Analysis", variant="primary", size="lg")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
107 |
|
108 |
+
# Analysis Info
|
109 |
+
gr.Markdown("### ℹ️ How It Works")
|
|
|
|
|
|
|
|
|
|
|
110 |
gr.Markdown("""
|
111 |
+
**Guided Analysis Process:**
|
112 |
+
- 🎯 **Question-aware**: Code generation tailored to your specific question
|
113 |
+
- 📋 **Smart examination**: Automatically detects file structure and patterns
|
114 |
+
- 🚀 **Dynamic optimization**: Creates targeted analysis approach
|
115 |
+
- ✅ **Higher accuracy**: Prevents common code generation errors
|
116 |
+
- 🔧 **Quality control**: Built-in validation to avoid syntax issues
|
|
|
|
|
|
|
|
|
117 |
""")
|
118 |
+
|
119 |
+
with gr.Column(scale=2):
|
120 |
+
analysis_output = gr.Textbox(
|
121 |
+
label="📊 Guided Analysis Results",
|
122 |
+
lines=25,
|
123 |
+
max_lines=35,
|
124 |
+
placeholder="Upload a file, type your question, and click 'Run Guided Analysis' to see detailed results here...",
|
125 |
+
interactive=False
|
126 |
+
)
|
127 |
+
|
128 |
+
# Event handlers
|
129 |
+
file_upload.change(
|
130 |
+
fn=handle_file_upload,
|
131 |
+
inputs=[file_upload],
|
132 |
+
outputs=[upload_status]
|
133 |
+
)
|
134 |
+
|
135 |
+
analyze_btn.click(
|
136 |
+
fn=analyze_file_with_question,
|
137 |
+
inputs=[user_question],
|
138 |
+
outputs=[analysis_output]
|
139 |
+
)
|
140 |
+
|
141 |
+
gr.Markdown("---")
|
142 |
+
gr.Markdown("## 💡 Example Questions by File Type")
|
143 |
+
|
144 |
+
with gr.Accordion("🔐 Security Analysis Questions", open=False):
|
145 |
+
gr.Markdown("""
|
146 |
+
**For Log Files:**
|
147 |
+
- "Find any failed login attempts and suspicious IP addresses"
|
148 |
+
- "Identify potential security threats or anomalies"
|
149 |
+
- "Show me authentication errors and user access patterns"
|
150 |
+
- "Are there any brute force attacks or repeated failures?"
|
151 |
+
|
152 |
+
**For Access Logs:**
|
153 |
+
- "Detect unusual access patterns or potential intrusions"
|
154 |
+
- "Find requests with suspicious user agents or payloads"
|
155 |
+
- "Identify high-frequency requests from single IPs"
|
156 |
+
""")
|
157 |
+
|
158 |
+
with gr.Accordion("⚡ Performance Analysis Questions", open=False):
|
159 |
+
gr.Markdown("""
|
160 |
+
**For Application Logs:**
|
161 |
+
- "Which API endpoints are slowest and why?"
|
162 |
+
- "Find performance bottlenecks and response time issues"
|
163 |
+
- "Show me timeout errors and failed requests"
|
164 |
+
- "What are the peak usage times and load patterns?"
|
165 |
+
|
166 |
+
**For System Logs:**
|
167 |
+
- "Identify resource usage spikes and memory issues"
|
168 |
+
- "Find database query performance problems"
|
169 |
+
- "Show me error rates and system health indicators"
|
170 |
+
""")
|
171 |
+
|
172 |
+
with gr.Accordion("📈 Data Analysis Questions", open=False):
|
173 |
+
gr.Markdown("""
|
174 |
+
**For CSV/Data Files:**
|
175 |
+
- "Analyze data distribution and find statistical insights"
|
176 |
+
- "Identify outliers and anomalies in the dataset"
|
177 |
+
- "What correlations exist between different columns?"
|
178 |
+
- "Generate a comprehensive data quality report"
|
179 |
+
|
180 |
+
**For JSON Files:**
|
181 |
+
- "Parse the structure and extract key information"
|
182 |
+
- "Find patterns in nested data and relationships"
|
183 |
+
- "Summarize the main data points and values"
|
184 |
+
""")
|
185 |
|
186 |
if __name__ == "__main__":
|
187 |
+
print("Starting DataForge application...")
|
188 |
demo.launch()
|
graph.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1 |
import asyncio
|
|
|
2 |
import os
|
3 |
import re
|
4 |
from typing import Annotated, Dict, List, Optional
|
@@ -58,6 +59,37 @@ class CodeAnalysisState(TypedDict):
|
|
58 |
final_analysis: Optional[str]
|
59 |
|
60 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
61 |
def examine_file_structure(state: CodeAnalysisState) -> CodeAnalysisState:
|
62 |
"""
|
63 |
Node 1: Examine the file structure by reading the first several lines
|
@@ -244,8 +276,7 @@ def generate_code_guidance(state: CodeAnalysisState) -> CodeAnalysisState:
|
|
244 |
|
245 |
def execute_guided_analysis(state: CodeAnalysisState) -> CodeAnalysisState:
|
246 |
"""
|
247 |
-
Node 3: Execute the file analysis using the generated guidance.
|
248 |
-
This replaces the original agent with guided code generation.
|
249 |
"""
|
250 |
file_path = state["file_path"]
|
251 |
file_examination = state["file_examination"]
|
@@ -259,9 +290,8 @@ def execute_guided_analysis(state: CodeAnalysisState) -> CodeAnalysisState:
|
|
259 |
}
|
260 |
|
261 |
try:
|
262 |
-
# Create the guided analysis query
|
263 |
-
guided_query = f"""
|
264 |
-
Based on the file examination and guidance, analyze this file with the following SPECIFIC instructions:
|
265 |
|
266 |
FILE CONTEXT:
|
267 |
- File Type: {file_examination.file_type}
|
@@ -284,6 +314,41 @@ SAMPLE FILE STRUCTURE (first few lines):
|
|
284 |
|
285 |
USER REQUEST: {analysis_query or "Comprehensive analysis following the guidance above"}
|
286 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
287 |
INSTRUCTIONS:
|
288 |
1. Follow the specified analysis approach exactly
|
289 |
2. Import only the recommended libraries: {', '.join(code_guidance.required_imports)}
|
@@ -291,11 +356,12 @@ INSTRUCTIONS:
|
|
291 |
4. Structure your code following: {code_guidance.code_structure}
|
292 |
5. Generate the expected outputs: {', '.join(code_guidance.expected_outputs)}
|
293 |
6. Implement proper error handling: {code_guidance.error_handling}
|
|
|
294 |
|
295 |
Since you have detailed guidance about this specific file structure, your code should be highly accurate and efficient.
|
296 |
The file examination shows this is a {file_examination.file_type} with {file_examination.data_format} data format.
|
297 |
|
298 |
-
Write Python code that leverages this specific knowledge for optimal analysis.
|
299 |
"""
|
300 |
|
301 |
print(f"🚀 Executing guided analysis...")
|
@@ -326,8 +392,7 @@ Write Python code that leverages this specific knowledge for optimal analysis.
|
|
326 |
execution_result = asyncio.run(run_guided_analysis())
|
327 |
|
328 |
# Create final analysis summary
|
329 |
-
final_analysis = f"""
|
330 |
-
=== GUIDED FILE ANALYSIS RESULTS ===
|
331 |
|
332 |
File: {file_path}
|
333 |
Type: {file_examination.file_type} ({file_examination.data_format})
|
|
|
1 |
import asyncio
|
2 |
+
import ast
|
3 |
import os
|
4 |
import re
|
5 |
from typing import Annotated, Dict, List, Optional
|
|
|
59 |
final_analysis: Optional[str]
|
60 |
|
61 |
|
62 |
+
def validate_python_code(
    code: str, *, max_line_length: int = 100
) -> tuple[bool, str]:
    """
    Validate Python source for syntax errors and risky output patterns.

    The code is first parsed with ``ast.parse``. If it parses, each line is
    checked against two heuristics: a ``print(`` line that is left open and
    has an odd number of quote characters, and a line longer than
    ``max_line_length`` characters.

    Args:
        code: Python source text to validate.
        max_line_length: Longest line (excluding the line terminator) that
            is still accepted. Defaults to 100.

    Returns:
        ``(is_valid, message)``. ``message`` describes the first problem
        found, or confirms that validation passed.
    """
    # ast.parse() also accepts bytes and AST objects, but the line-based
    # checks below only make sense for text.
    if not isinstance(code, str):
        return False, (
            f"Validation error: expected source code as str, "
            f"got {type(code).__name__}"
        )

    try:
        ast.parse(code)
    except SyntaxError as e:
        return False, f"Syntax error: {e.msg} at line {e.lineno}"
    except Exception as e:
        # Validation boundary: any other parser failure is reported to the
        # caller as an invalid result instead of propagating.
        return False, f"Validation error: {str(e)}"

    # Normalise line terminators first so a trailing carriage return from
    # CRLF input is not counted towards the line length.
    normalised = code.replace('\r\n', '\n').replace('\r', '\n')

    for i, line in enumerate(normalised.split('\n'), 1):
        line_stripped = line.strip()

        # Heuristic: a print( line that is not closed on the same line and
        # has an odd number of either quote character.
        if line_stripped.startswith('print(') and not line_stripped.endswith(')'):
            odd_double = line_stripped.count('"') % 2 != 0
            odd_single = line_stripped.count("'") % 2 != 0
            if odd_double or odd_single:
                return False, f"Line {i}: Potentially unterminated string in print statement"

        # Very long lines might get truncated.
        if len(line) > max_line_length:
            return False, f"Line {i}: Line too long ({len(line)} chars) - may cause truncation"

    return True, "Code validation passed"
|
91 |
+
|
92 |
+
|
93 |
def examine_file_structure(state: CodeAnalysisState) -> CodeAnalysisState:
|
94 |
"""
|
95 |
Node 1: Examine the file structure by reading the first several lines
|
|
|
276 |
|
277 |
def execute_guided_analysis(state: CodeAnalysisState) -> CodeAnalysisState:
|
278 |
"""
|
279 |
+
Node 3: Execute the file analysis using the generated guidance with code quality validation.
|
|
|
280 |
"""
|
281 |
file_path = state["file_path"]
|
282 |
file_examination = state["file_examination"]
|
|
|
290 |
}
|
291 |
|
292 |
try:
|
293 |
+
# Create the guided analysis query with strict code quality requirements
|
294 |
+
guided_query = f"""Based on the file examination and guidance, analyze this file with the following SPECIFIC instructions:
|
|
|
295 |
|
296 |
FILE CONTEXT:
|
297 |
- File Type: {file_examination.file_type}
|
|
|
314 |
|
315 |
USER REQUEST: {analysis_query or "Comprehensive analysis following the guidance above"}
|
316 |
|
317 |
+
CRITICAL CODE QUALITY REQUIREMENTS:
|
318 |
+
1. ALL print statements MUST be on single lines with properly closed quotes
|
319 |
+
2. NO multi-line strings or f-strings that span multiple lines
|
320 |
+
3. NO print statements longer than 80 characters - break into multiple prints instead
|
321 |
+
4. ALL strings must be properly terminated with matching quotes
|
322 |
+
5. Use short variable names and concise output formatting
|
323 |
+
6. If you need to print long text, use multiple short print() calls
|
324 |
+
7. Always close parentheses, brackets, and quotes on the same line they open
|
325 |
+
8. Use simple string concatenation instead of complex f-strings for long output
|
326 |
+
9. NEVER use triple quotes for multi-line strings in limited execution environments
|
327 |
+
10. Test each print statement individually to ensure it executes without truncation
|
328 |
+
|
329 |
+
EXAMPLE OF SAFE CODING PRACTICES:
|
330 |
+
```python
|
331 |
+
# GOOD - Short, single-line prints
|
332 |
+
print("=== Results ===")
|
333 |
+
print(f"Count: {{count}}")
|
334 |
+
print(f"User: {{user}}")
|
335 |
+
|
336 |
+
# BAD - Long print that could be truncated
|
337 |
+
print(f"This is a very long print statement that could get truncated...")
|
338 |
+
|
339 |
+
# GOOD - Break long output into multiple prints
|
340 |
+
print("Analysis complete:")
|
341 |
+
print(f"Found {{count}} items")
|
342 |
+
print(f"Top user: {{user}}")
|
343 |
+
```
|
344 |
+
|
345 |
+
MANDATORY CODE GENERATION PROCESS:
|
346 |
+
1. Generate your analysis code following the above requirements
|
347 |
+
2. Before presenting the code, internally validate each line for potential issues
|
348 |
+
3. Ensure ALL print statements are under 80 characters
|
349 |
+
4. Verify all quotes and parentheses are properly closed
|
350 |
+
5. If any line might cause issues, rewrite it using multiple shorter statements
|
351 |
+
|
352 |
INSTRUCTIONS:
|
353 |
1. Follow the specified analysis approach exactly
|
354 |
2. Import only the recommended libraries: {', '.join(code_guidance.required_imports)}
|
|
|
356 |
4. Structure your code following: {code_guidance.code_structure}
|
357 |
5. Generate the expected outputs: {', '.join(code_guidance.expected_outputs)}
|
358 |
6. Implement proper error handling: {code_guidance.error_handling}
|
359 |
+
7. ENSURE ALL CODE FOLLOWS THE QUALITY REQUIREMENTS ABOVE
|
360 |
|
361 |
Since you have detailed guidance about this specific file structure, your code should be highly accurate and efficient.
|
362 |
The file examination shows this is a {file_examination.file_type} with {file_examination.data_format} data format.
|
363 |
|
364 |
+
Write Python code that leverages this specific knowledge for optimal analysis and follows strict code quality standards.
|
365 |
"""
|
366 |
|
367 |
print(f"🚀 Executing guided analysis...")
|
|
|
392 |
execution_result = asyncio.run(run_guided_analysis())
|
393 |
|
394 |
# Create final analysis summary
|
395 |
+
final_analysis = f"""=== GUIDED FILE ANALYSIS RESULTS ===
|
|
|
396 |
|
397 |
File: {file_path}
|
398 |
Type: {file_examination.file_type} ({file_examination.data_format})
|