siyah1 committed on
Commit
569dd78
·
verified ·
1 Parent(s): 52d8119

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +203 -270
app.py CHANGED
@@ -1,33 +1,75 @@
1
  import streamlit as st
2
- import numpy as np
3
- import pandas as pd
4
- from smolagents import CodeAgent, tool
5
- from typing import Union, List, Dict, Optional
6
- import matplotlib.pyplot as plt
7
- import seaborn as sns
8
- import os
9
  from groq import Groq
10
- from dataclasses import dataclass
11
- import tempfile
12
- import base64
13
- import io
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
  class GroqLLM:
16
- """Compatible LLM interface for smolagents CodeAgent"""
 
 
 
17
  def __init__(self, model_name="llama-3.1-8B-Instant"):
18
  self.client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
19
  self.model_name = model_name
20
 
21
  def __call__(self, prompt: Union[str, dict, List[Dict]]) -> str:
22
- """Make the class callable as required by smolagents"""
23
  try:
24
- # Handle different prompt formats
25
- if isinstance(prompt, (dict, list)):
26
- prompt_str = str(prompt)
27
- else:
28
- prompt_str = str(prompt)
29
 
30
- # Create a properly formatted message
31
  completion = self.client.chat.completions.create(
32
  model=self.model_name,
33
  messages=[{
@@ -40,274 +82,165 @@ class GroqLLM:
40
  )
41
 
42
  return completion.choices[0].message.content if completion.choices else "Error: No response generated"
43
-
44
  except Exception as e:
45
  error_msg = f"Error generating response: {str(e)}"
46
- print(error_msg)
47
  return error_msg
48
-
49
- def generate(self, prompt: Union[str, dict, List[Dict]], **kwargs) -> object:
50
- """Add generate method to make compatible with smolagents CodeAgent
51
-
52
- Args:
53
- prompt: The prompt to send to the model
54
- **kwargs: Additional keyword arguments to support CodeAgent API
55
- (stop_sequences, etc.) - these are ignored in the Groq implementation
56
-
57
- Returns:
58
- An object with a 'content' attribute containing the response text
59
- """
60
- response_text = self.__call__(prompt)
61
-
62
- # Create a simple object with a content attribute
63
- class Response:
64
- def __init__(self, content):
65
- self.content = content
66
-
67
- return Response(response_text)
68
-
69
class DataAnalysisAgent(CodeAgent):
    """CodeAgent subclass that carries a DataFrame and describes it in every prompt."""

    def __init__(self, dataset: pd.DataFrame, *args, **kwargs):
        # All remaining arguments are forwarded untouched to CodeAgent.
        super().__init__(*args, **kwargs)
        self._dataset = dataset

    @property
    def dataset(self) -> pd.DataFrame:
        """Return the DataFrame handed to the constructor."""
        return self._dataset

    def run(self, prompt: str) -> str:
        """Prefix *prompt* with the dataset's shape, columns and dtypes, then delegate to CodeAgent.run."""
        frame = self.dataset
        column_list = ', '.join(frame.columns)
        dataset_info = (
            "\n"
            f"Dataset Shape: {frame.shape}\n"
            f"Columns: {column_list}\n"
            f"Data Types: {frame.dtypes.to_dict()}\n"
        )
        enhanced_prompt = (
            "\n"
            "Analyze the following dataset:\n"
            f"{dataset_info}\n"
            "\n"
            f"Task: {prompt}\n"
            "\n"
            "Use the provided tools to analyze this specific dataset and return detailed results.\n"
        )
        return super().run(enhanced_prompt)
96
-
97
- @tool
98
def analyze_basic_stats(data: pd.DataFrame) -> str:
    """Summarise every numeric column of the dataset.

    For each column with a numeric dtype the mean, median, standard deviation,
    skewness and number of missing values are computed.

    Args:
        data: A pandas DataFrame to summarise. Columns that are not numeric are ignored.

    Returns:
        str: The string form of a dict that maps each numeric column name to its
            'mean', 'median', 'std', 'skew' and 'missing' values.
    """
    # NOTE(review): `tool` is the decorator imported from smolagents; whether it
    # exposes an `.agent` attribute is not visible here -- confirm this fallback.
    if data is None:
        data = tool.agent.dataset

    def _describe(series):
        # Cast to built-in float/int so the printed dict has plain Python numbers.
        return {
            'mean': float(series.mean()),
            'median': float(series.median()),
            'std': float(series.std()),
            'skew': float(series.skew()),
            'missing': int(series.isnull().sum()),
        }

    numeric_names = data.select_dtypes(include=[np.number]).columns
    summary = {name: _describe(data[name]) for name in numeric_names}
    return str(summary)
130
-
131
- @tool
132
def generate_correlation_matrix(data: pd.DataFrame) -> str:
    """Render the correlation heatmap of the numeric columns as a base64 PNG.

    Args:
        data: A pandas DataFrame to analyse. Only columns with a numeric dtype
            take part in the correlation.

    Returns:
        str: The PNG image of the annotated heatmap, base64 encoded.
    """
    # NOTE(review): `tool` is the decorator imported from smolagents; whether it
    # exposes an `.agent` attribute is not visible here -- confirm this fallback.
    if data is None:
        data = tool.agent.dataset

    numeric_only = data.select_dtypes(include=[np.number])

    # Draw onto a fresh figure; it is closed again once the PNG has been written.
    plt.figure(figsize=(10, 8))
    correlations = numeric_only.corr()
    sns.heatmap(correlations, annot=True, cmap='coolwarm')
    plt.title('Correlation Matrix')

    png_buffer = io.BytesIO()
    plt.savefig(png_buffer, format='png')
    plt.close()

    encoded = base64.b64encode(png_buffer.getvalue())
    return encoded.decode()
161
-
162
- @tool
163
def analyze_categorical_columns(data: pd.DataFrame) -> str:
    """Profile the categorical columns of the dataset.

    Columns whose dtype is 'object' or 'category' are inspected for their number
    of distinct values, their five most frequent values and their missing count.

    Args:
        data: A pandas DataFrame to analyse. Columns of any other dtype are ignored.

    Returns:
        str: The string form of a dict that maps each categorical column name to
            its 'unique_values', 'top_categories' and 'missing' entries.
    """
    # NOTE(review): `tool` is the decorator imported from smolagents; whether it
    # exposes an `.agent` attribute is not visible here -- confirm this fallback.
    if data is None:
        data = tool.agent.dataset

    report = {}
    for name in data.select_dtypes(include=['object', 'category']).columns:
        column = data[name]
        frequencies = column.value_counts()
        report[name] = {
            'unique_values': int(column.nunique()),
            'top_categories': frequencies.head(5).to_dict(),
            'missing': int(column.isnull().sum()),
        }
    return str(report)
192
 
193
- @tool
194
def suggest_features(data: pd.DataFrame) -> str:
    """Propose feature-engineering steps from simple properties of the dataset.

    Args:
        data: A pandas DataFrame to inspect. It may mix numeric and categorical columns.

    Returns:
        str: One suggestion per line; an empty string when nothing applies.
    """
    # NOTE(review): `tool` is the decorator imported from smolagents; whether it
    # exposes an `.agent` attribute is not visible here -- confirm this fallback.
    if data is None:
        data = tool.agent.dataset

    numeric_names = data.select_dtypes(include=[np.number]).columns
    categorical_names = data.select_dtypes(include=['object', 'category']).columns

    hints = []
    if len(numeric_names) >= 2:
        hints.append("Consider creating interaction terms between numerical features")
    if not categorical_names.empty:
        hints.append("Consider one-hot encoding for categorical variables")

    # A column counts as skewed when its skewness lies outside [-1, 1].
    hints.extend(
        f"Consider log transformation for {name} due to skewness"
        for name in numeric_names
        if abs(data[name].skew()) > 1
    )
    return '\n'.join(hints)
227
 
228
# Canned requests: selectbox label -> (spinner text, prompt sent to the agent).
_ANALYSIS_REQUESTS = {
    "Basic Statistics": (
        'Analyzing basic statistics...',
        "Use the analyze_basic_stats tool to analyze this dataset and "
        "provide insights about the numerical distributions.",
    ),
    "Correlation Analysis": (
        'Generating correlation matrix...',
        "Use the generate_correlation_matrix tool to analyze correlations "
        "and explain any strong relationships found.",
    ),
    "Categorical Analysis": (
        'Analyzing categorical columns...',
        "Use the analyze_categorical_columns tool to examine the "
        "categorical variables and explain the distributions.",
    ),
    "Feature Engineering": (
        'Generating feature suggestions...',
        "Use the suggest_features tool to recommend potential "
        "feature engineering steps for this dataset.",
    ),
}


def _decode_png_payload(candidate):
    """Return the PNG bytes encoded in *candidate*, or None when it is not a base64 PNG.

    Accepts either bare base64 text or a ``data:image...,<base64>`` URI.
    """
    import base64

    if not isinstance(candidate, str):
        return None
    payload = candidate.strip()
    if payload.startswith('data:image'):
        # Keep only what follows the first comma of the data URI.
        payload = payload.partition(',')[2]
    try:
        raw = base64.b64decode(payload, validate=True)
    except ValueError:  # binascii.Error is a ValueError subclass
        return None
    return raw if raw.startswith(b'\x89PNG\r\n\x1a\n') else None


def _show_result(result, expect_image=False):
    """Write an agent result to the page, rendering it as an image when it really is one."""
    import io

    png_bytes = _decode_png_payload(result) if expect_image else None
    if png_bytes is None:
        st.write(result)
    else:
        st.image(io.BytesIO(png_bytes))


def main():
    """Streamlit entry point: upload a CSV, build the agent, run the chosen analysis.

    The loaded DataFrame and the agent are kept in ``st.session_state`` under
    the keys 'data' and 'agent'. Any exception raised while loading or
    analysing is reported on the page with ``st.error`` instead of propagating.
    """
    st.title("Data Analysis Assistant")
    st.write("Upload your dataset and get automated analysis with natural language interaction.")

    # Initialize session state
    for key in ('data', 'agent'):
        if key not in st.session_state:
            st.session_state[key] = None

    uploaded_file = st.file_uploader("Choose a CSV file", type="csv")

    try:
        if uploaded_file is not None:
            with st.spinner('Loading and processing your data...'):
                data = pd.read_csv(uploaded_file)
                st.session_state['data'] = data

                # The agent is given the dataset so its prompts can describe it.
                st.session_state['agent'] = DataAnalysisAgent(
                    dataset=data,
                    tools=[analyze_basic_stats, generate_correlation_matrix,
                           analyze_categorical_columns, suggest_features],
                    model=GroqLLM(),
                    additional_authorized_imports=["pandas", "numpy", "matplotlib", "seaborn"]
                )

            st.success(f'Successfully loaded dataset with {data.shape[0]} rows and {data.shape[1]} columns')
            st.subheader("Data Preview")
            st.dataframe(data.head())

        if st.session_state['data'] is not None:
            analysis_type = st.selectbox(
                "Choose analysis type",
                list(_ANALYSIS_REQUESTS) + ["Custom Question"]
            )
            agent = st.session_state['agent']

            if analysis_type in _ANALYSIS_REQUESTS:
                spinner_text, request = _ANALYSIS_REQUESTS[analysis_type]
                with st.spinner(spinner_text):
                    result = agent.run(request)
                    # Only the correlation request can yield an encoded plot.
                    # The previous test `A and B or ',' in result` treated any
                    # text containing a comma as an image and raised TypeError
                    # for non-string results.
                    _show_result(result, expect_image=(analysis_type == "Correlation Analysis"))

            elif analysis_type == "Custom Question":
                question = st.text_input("What would you like to know about your data?")
                if question:
                    with st.spinner('Analyzing...'):
                        result = agent.run(question)
                        _show_result(result)

    except Exception as e:
        st.error(f"An error occurred: {str(e)}")


if __name__ == "__main__":
    main()
 
 
 
 
 
1
  import streamlit as st
2
+ import pandas as pd
3
+ from typing import Union, List, Dict
 
 
 
 
 
4
  from groq import Groq
5
+ import os
6
+ from duckduckgo_search import DDGS
7
+
8
class DuckDuckGoSearch:
    """
    Callable news search built on the duckduckgo_search ``DDGS`` client.
    Every outcome, including failures, is reported as a text string.
    """

    def __init__(self):
        # One DDGS object is created here and reused by every call.
        self.ddgs = DDGS()

    @staticmethod
    def _format_entry(position, entry):
        """Render one search hit as a numbered text paragraph with fallbacks for absent fields."""
        title = entry.get('title', 'No title available')
        snippet = entry.get('body', entry.get('snippet', 'No description available'))
        source = entry.get('source', 'Unknown source')
        url = entry.get('url', entry.get('link', 'No link available'))
        date = entry.get('date', 'Date not available')
        return (
            f"{position}. Title: {title}\n"
            f" Date: {date}\n"
            f" Source: {source}\n"
            f" Summary: {snippet}\n"
            f" URL: {url}\n"
        )

    def __call__(self, query: str, max_results: int = 5) -> str:
        """Search news for *query* and return up to *max_results* hits as formatted text.

        Returns a "No results found..." message for an empty result set and a
        "Search error: ..." message when anything in the search or formatting raises.
        """
        try:
            # News search, worldwide region, safe search enabled.
            hits = list(self.ddgs.news(
                query,
                max_results=max_results,
                region='wt-wt',
                safesearch='on'
            ))

            if not hits:
                return "No results found. Try modifying your search query."

            return "\n".join(
                self._format_entry(position, hit)
                for position, hit in enumerate(hits, 1)
            )

        except Exception as e:
            error_msg = f"Search error: {str(e)}\nTry again with a different search term or check your internet connection."
            print(f"DuckDuckGo search error: {str(e)}")  # For logging
            return error_msg
57
 
58
  class GroqLLM:
59
+ """
60
+ LLM interface using Groq's LLama model.
61
+ Handles API communication and response processing.
62
+ """
63
  def __init__(self, model_name="llama-3.1-8B-Instant"):
64
  self.client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
65
  self.model_name = model_name
66
 
67
  def __call__(self, prompt: Union[str, dict, List[Dict]]) -> str:
 
68
  try:
69
+ # Convert prompt to string if it's a complex structure
70
+ prompt_str = str(prompt) if isinstance(prompt, (dict, list)) else prompt
 
 
 
71
 
72
+ # Make API call to Groq
73
  completion = self.client.chat.completions.create(
74
  model=self.model_name,
75
  messages=[{
 
82
  )
83
 
84
  return completion.choices[0].message.content if completion.choices else "Error: No response generated"
 
85
  except Exception as e:
86
  error_msg = f"Error generating response: {str(e)}"
87
+ print(error_msg) # For logging
88
  return error_msg
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
 
90
def create_analysis_prompt(topic: str, search_results: str) -> str:
    """
    Build the LLM prompt for a news analysis: the topic and the raw search
    results, followed by five numbered sections the answer must cover and a
    closing formatting instruction.
    """
    sections = (
        ("Key Points Summary", (
            "Main events and developments",
            "Critical updates and changes",
        )),
        ("Stakeholder Analysis", (
            "Primary parties involved",
            "Their roles and positions",
        )),
        ("Impact Assessment", (
            "Immediate implications",
            "Potential long-term effects",
            "Broader context and significance",
        )),
        ("Multiple Perspectives", (
            "Different viewpoints on the issue",
            "Areas of agreement and contention",
        )),
        ("Fact Check & Reliability", (
            "Verification of major claims",
            "Consistency across sources",
            "Source credibility assessment",
        )),
    )

    lines = [
        f"Analyze the following news information about {topic}.",
        f"Search Results: {search_results}",
        "",
        "Please provide a comprehensive analysis including:",
    ]
    for number, (heading, bullets) in enumerate(sections, 1):
        lines.append(f"{number}. {heading}:")
        lines.extend(f"- {bullet}" for bullet in bullets)
        lines.append("")  # blank line after every section
    lines.append("Please format the analysis in a clear, journalistic style with section headers.")
    return "\n".join(lines)
 
 
 
 
 
 
 
122
 
123
def log_agent_activity(prompt: str, result: str, agent_name: str):
    """
    Show the prompt sent to an agent and the text it produced inside a
    collapsible "View Agent Activity Log" section of the Streamlit page.
    """
    log_panel = st.expander("View Agent Activity Log")
    with log_panel:
        st.write(f"### Agent Activity ({agent_name}):")
        # Each entry is a caption followed by its payload as a plain-text code block.
        for caption, payload in (
            ("**Input Prompt:**", prompt),
            ("**Analysis Output:**", result),
        ):
            st.write(caption)
            st.code(payload, language="text")
 
 
 
 
 
 
 
 
 
 
134
 
135
# Initialize Streamlit app
st.set_page_config(page_title="News Analysis Tool", layout="wide")

# Title and description
st.title("🔍 AI News Analysis Tool")
st.write(
    "\n"
    "This tool combines the power of Groq's LLama 3.1 8B Instant model with DuckDuckGo\n"
    "search to provide in-depth news analysis. Get comprehensive insights and multiple\n"
    "perspectives on any news topic.\n"
)

# Extra instruction appended to the LLM prompt for each "Analysis Type" choice,
# so that the selection actually influences the generated analysis.
ANALYSIS_TYPE_INSTRUCTIONS = {
    "Comprehensive": "Cover every section above in full detail.",
    "Quick Summary": "Keep the whole analysis brief: at most two short bullet points per section.",
    "Technical": "Write for a specialist audience: keep precise terminology, figures and technical detail.",
    "Simplified": "Write for a general audience: use plain language and explain any jargon.",
}

# GroqLLM.__call__ returns these strings instead of raising when a request fails.
LLM_ERROR_PREFIXES = ("Error generating response:", "Error: No response generated")

# Initialize the components
try:
    # Initialize LLM and search tool
    llm = GroqLLM()
    search_tool = DuckDuckGoSearch()

    # Input section
    news_topic = st.text_input(
        "Enter News Topic or Query:",
        placeholder="E.g., Recent developments in renewable energy"
    )

    # Analysis options
    col1, col2 = st.columns(2)
    with col1:
        search_depth = st.slider(
            "Search Depth (number of results)",
            min_value=3,
            max_value=10,
            value=5
        )
    with col2:
        analysis_type = st.selectbox(
            "Analysis Type",
            list(ANALYSIS_TYPE_INSTRUCTIONS)
        )

    # Generate analysis button
    if st.button("Analyze News"):
        # A topic made only of whitespace is treated like an empty one.
        topic = news_topic.strip()
        if topic:
            with st.spinner("Gathering information and analyzing..."):
                # Placeholder for progress messages; always cleared in `finally`.
                search_placeholder = st.empty()
                try:
                    search_placeholder.info("Searching for recent news...")

                    # Perform search
                    search_results = search_tool(
                        f"Latest news about {topic} last 7 days",
                        max_results=search_depth
                    )

                    # DuckDuckGoSearch reports failures as text with these prefixes.
                    if search_results.startswith(("Search error", "No results")):
                        st.error(search_results)
                    else:
                        search_placeholder.info("Analyzing search results...")

                        # Create analysis prompt, tailored to the selected analysis type
                        analysis_prompt = (
                            create_analysis_prompt(topic, search_results)
                            + f"\n\nRequested analysis type: {analysis_type}. "
                            + ANALYSIS_TYPE_INSTRUCTIONS.get(analysis_type, "")
                        )

                        # Get analysis from LLM
                        analysis_result = llm(analysis_prompt)

                        if analysis_result.startswith(LLM_ERROR_PREFIXES):
                            # Show an LLM failure as an error, not as an analysis.
                            st.error(analysis_result)
                        else:
                            st.subheader("📊 Analysis Results")
                            st.markdown(analysis_result)

                        # Log the activity (also on failure, to help debugging)
                        log_agent_activity(
                            analysis_prompt,
                            analysis_result,
                            "News Analysis Agent"
                        )

                except Exception as e:
                    st.error(f"An error occurred during analysis: {str(e)}")
                finally:
                    # Clear progress messages on every path, including errors.
                    search_placeholder.empty()
        else:
            st.warning("Please enter a news topic to analyze.")

    # Add helpful tips
    with st.expander("💡 Tips for Better Results"):
        st.write(
            "\n"
            "- Be specific with your topic for more focused analysis\n"
            "- Use keywords related to recent events for timely information\n"
            "- Consider including timeframes in your query\n"
            "- Try different analysis types for various perspectives\n"
            "- For complex topics, start with a broader search and then narrow down\n"
        )

except Exception as e:
    st.error(
        "\n"
        f"Failed to initialize the application: {str(e)}\n"
        "\n"
        "Please ensure:\n"
        "1. Your GROQ_API_KEY is properly set in environment variables\n"
        "2. All required packages are installed:\n"
        "- pip install streamlit groq duckduckgo-search\n"
        "3. You have internet connectivity for DuckDuckGo searches\n"
    )

# Footer
st.markdown("---")
st.caption(
    "Powered by Groq LLama 3.1 8B Instant, DuckDuckGo, and Streamlit | "
    "Created for news analysis and research purposes"
)