Spaces:
Running
Running
Update insight_and_tasks/agents/follower_agent.py
Browse files
insight_and_tasks/agents/follower_agent.py
CHANGED
@@ -0,0 +1,502 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# agents/follower_agent.py
|
2 |
+
import pandas as pd
|
3 |
+
from typing import Dict, List, Any, Optional
|
4 |
+
import logging
|
5 |
+
import pandasai as pai # Assuming pandasai is imported as pai globally or configured
|
6 |
+
|
7 |
+
from google.adk.agents import LlmAgent # Assuming this is the correct import path
|
8 |
+
|
9 |
+
# Project-specific imports
|
10 |
+
from utils.retry_mechanism import RetryMechanism
|
11 |
+
from data_models.metrics import AgentMetrics, TimeSeriesMetric
|
12 |
+
|
13 |
+
# Configure logger for this module
|
14 |
+
logger = logging.getLogger(__name__)
|
15 |
+
|
16 |
+
# Define the model globally or pass it as a parameter. For now, using a constant.
|
17 |
+
# Consider moving this to a shared config or environment variable.
|
18 |
+
DEFAULT_AGENT_MODEL = "gemini-1.5-flash-latest" # Or your specific model like "gemini-1.5-flash-preview-05-20"
|
19 |
+
|
20 |
+
class EnhancedFollowerAnalysisAgent:
    """
    Enhanced follower analysis agent with proper handling of different follower count types
    and structured metric extraction.

    The agent splits a raw follower-statistics DataFrame by its
    'follower_count_type' column, derives structured metrics (time series,
    aggregates, demographic distributions) with pandas, and asks PandasAI for
    a free-text summary of the same data.
    """

    AGENT_NAME = "follower_analyst"
    AGENT_DESCRIPTION = "Expert analyst specializing in follower growth patterns and demographic analysis."
    AGENT_INSTRUCTION = """
    You are a specialized LinkedIn follower analytics expert focused on temporal patterns and demographic trends.

    Your role includes:

    1. FOLLOWER TREND ANALYSIS:
    - Analyze follower growth trends over time (monthly data from 'follower_gains_monthly' type).
    - Identify growth acceleration/deceleration periods.
    - Calculate growth rates and velocity changes.
    - Detect seasonal patterns and anomalies.
    - Analyze organic vs paid follower counts over time.

    2. DEMOGRAPHIC ANALYSIS (based on 'follower_industry', 'follower_seniority', etc.):
    - Analyze follower distribution by industry, seniority, function, and geography.
    - Compare organic vs paid followers across these demographic segments.
    - Identify high-value audience segments based on counts and potential engagement.

    3. TIME-BASED INSIGHTS:
    - Provide month-over-month comparisons for growth data.
    - Identify critical inflection points in follower growth.
    - Calculate trend momentum and acceleration.

    4. METRIC EXTRACTION (for the AgentMetrics structure):
    - Extract time-series data for total, organic, and paid follower counts, and growth rates.
    - Provide aggregate metrics like average monthly growth, total organic/paid followers.
    - Provide demographic breakdowns as categorical metrics (e.g., top N industries by follower count).

    Focus on separating temporal analysis (monthly) from demographic analysis.
    When analyzing demographics, consider the top N segments (e.g., top 10 industries) for conciseness.
    Ensure your analysis summary is comprehensive and insightful.
    """

    # 'follower_count_type' value carrying the monthly (time-series) rows.
    _MONTHLY_TYPE = 'follower_gains_monthly'
    # 'follower_count_type' values carrying demographic breakdown rows.
    # These should match the 'follower_count_type' values in the Bubble data.
    _DEMOGRAPHIC_TYPES = (
        'follower_industry',
        'follower_seniority',
        'follower_function',
        'follower_geo',
    )
    # Columns holding the organic / paid follower counts.
    _COUNT_COLUMNS = ('follower_count_organic', 'follower_count_paid')

    def __init__(self, api_key: str, model_name: Optional[str] = None):
        """
        Initializes the Follower Analysis Agent.

        Args:
            api_key: API key for LLM and potentially PandasAI.
            model_name: Name of the language model to use. Defaults to DEFAULT_AGENT_MODEL.
        """
        self.api_key = api_key  # Also used to configure PandasAI lazily if it is not configured yet
        self.model_name = model_name or DEFAULT_AGENT_MODEL

        self.agent = LlmAgent(
            name=self.AGENT_NAME,
            model=self.model_name,
            description=self.AGENT_DESCRIPTION,
            instruction=self.AGENT_INSTRUCTION
        )
        self.retry_mechanism = RetryMechanism()
        logger.info(f"{self.AGENT_NAME} initialized with model {self.model_name}.")

    @staticmethod
    def _count_series(df: pd.DataFrame, column: str) -> pd.Series:
        """Return `column` of `df` as a numeric Series, with NaN/unparseable values as 0.

        A missing column yields an all-zero Series aligned to `df.index`, so
        callers never have to special-case absent count columns.
        """
        if column in df.columns:
            return pd.to_numeric(df[column], errors='coerce').fillna(0)
        return pd.Series(0, index=df.index)

    def _separate_follower_data_by_type(self, df: pd.DataFrame) -> Dict[str, pd.DataFrame]:
        """Separate follower data by follower_count_type and process appropriately.

        Returns:
            Mapping from follower_count_type to its processed rows. Types with
            no rows are omitted. Empty when the input is None/empty or lacks
            the 'follower_count_type' column.
        """
        separated_data: Dict[str, pd.DataFrame] = {}

        if df is None or df.empty or 'follower_count_type' not in df.columns:
            logger.warning("Input DataFrame is empty or 'follower_count_type' column is missing.")
            return separated_data

        for ftype in (self._MONTHLY_TYPE, *self._DEMOGRAPHIC_TYPES):
            type_data = df[df['follower_count_type'] == ftype].copy()
            if type_data.empty:
                logger.info(f"No data found for follower_count_type: {ftype}")
                continue

            if ftype == self._MONTHLY_TYPE:
                # Time-series rows: parse dates and sort chronologically.
                separated_data[ftype] = self._process_monthly_data(type_data)
            else:
                # Demographic rows: keep only the largest segments.
                separated_data[ftype] = self._get_top_demographic_segments(type_data, top_n=10)

        return separated_data

    def _get_top_demographic_segments(self, demo_df: pd.DataFrame, top_n: int = 10) -> pd.DataFrame:
        """Get top N demographic segments by total follower count (organic + paid).

        Count columns are coerced to numeric (missing columns are added as
        zeros). If 'category_name' is absent, all rows are returned with only
        that coercion applied. The input frame is never modified.
        """
        if demo_df.empty:
            return demo_df

        demo_df = demo_df.copy()  # Work on a copy
        for col in self._COUNT_COLUMNS:
            # Previously a missing column crashed here: DataFrame.get() returned
            # None and .fillna() was then called on a scalar.
            demo_df[col] = self._count_series(demo_df, col)

        # 'category_name' usually holds the demographic label (e.g., industry name)
        if 'category_name' not in demo_df.columns:
            logger.warning("'_get_top_demographic_segments' expects 'category_name' column for grouping.")
            return demo_df

        # Temporary ranking column; removed again before returning.
        demo_df['total_followers'] = demo_df['follower_count_organic'] + demo_df['follower_count_paid']
        top_segments = demo_df.nlargest(top_n, 'total_followers')
        return top_segments.drop(columns=['total_followers'], errors='ignore')

    def _process_monthly_data(self, monthly_df: pd.DataFrame) -> pd.DataFrame:
        """Process monthly follower data: parse dates, sort.

        Adds 'date_for_analysis' (datetime), 'year_month' ('YYYY-MM') and
        'month_name' ('Month YYYY') columns, drops rows whose 'category_name'
        cannot be parsed as a date, and makes the count columns numeric.
        """
        if monthly_df.empty or 'category_name' not in monthly_df.columns:
            logger.warning("Monthly data DataFrame is empty or 'category_name' column is missing.")
            return monthly_df

        df_processed = monthly_df.copy()

        # 'category_name' for monthly data is expected to be a date string like 'YYYY-MM-DD'
        df_processed['date_for_analysis'] = pd.to_datetime(df_processed['category_name'], errors='coerce')

        # Drop rows where date conversion failed
        df_processed.dropna(subset=['date_for_analysis'], inplace=True)

        if df_processed.empty:
            logger.warning("No valid dates found in 'category_name' for monthly data after processing.")
            return df_processed

        df_processed['year_month'] = df_processed['date_for_analysis'].dt.strftime('%Y-%m')
        df_processed['month_name'] = df_processed['date_for_analysis'].dt.strftime('%B %Y')

        # Ensure numeric follower counts; absent columns become zeros so later
        # calculations do not fail.
        for col in self._COUNT_COLUMNS:
            df_processed[col] = self._count_series(df_processed, col)

        return df_processed.sort_values('date_for_analysis')

    def _extract_time_series_metrics(self, monthly_df: pd.DataFrame) -> "List[TimeSeriesMetric]":
        """Extract time-series metrics from processed monthly follower data.

        Produces total/organic/paid count series and, when there are at least
        two data points, a month-over-month growth-rate series for the total.
        """
        ts_metrics = []
        if monthly_df.empty or 'date_for_analysis' not in monthly_df.columns:
            logger.info("Cannot extract time-series metrics: monthly DataFrame is empty or lacks 'date_for_analysis'.")
            return ts_metrics

        # Ensure data is sorted by date for correct growth rate calculation
        monthly_df_sorted = monthly_df.sort_values('date_for_analysis')
        timestamps = monthly_df_sorted['year_month'].tolist()

        organic = self._count_series(monthly_df_sorted, 'follower_count_organic')
        paid = self._count_series(monthly_df_sorted, 'follower_count_paid')
        total = organic + paid

        metric_definitions = {
            "total_follower_count": total,
            "organic_follower_count": organic,
            "paid_follower_count": paid,
        }
        for name, values_series in metric_definitions.items():
            ts_metrics.append(TimeSeriesMetric(
                metric_name=name,
                values=values_series.tolist(),
                timestamps=timestamps,
                metric_type="time_series",
                time_granularity="monthly"
            ))

        if len(monthly_df_sorted) > 1:
            # pct_change gives NaN for the first element (and for 0 -> 0) and
            # +/-inf when the previous value is 0; report all of these as 0 so
            # the series stays finite.
            # NOTE(review): values are fractional changes (0.1 means 10%)
            # although the metric is labelled with unit "%" - confirm which
            # one downstream consumers expect.
            growth_rates = (
                total.pct_change()
                .replace([float('inf'), float('-inf')], float('nan'))
                .fillna(0)
                .tolist()
            )
            ts_metrics.append(TimeSeriesMetric(
                metric_name="total_follower_growth_rate",
                values=growth_rates,
                timestamps=timestamps,  # Timestamps align; first growth rate has no previous point (so 0)
                metric_type="time_series",
                time_granularity="monthly",
                unit="%"
            ))
        else:
            logger.info("Not enough data points (<=1) to calculate growth rate.")

        return ts_metrics

    def _calculate_aggregate_metrics(self, separated_data: Dict[str, pd.DataFrame]) -> Dict[str, float]:
        """Calculate aggregate metrics from all follower data.

        The DataFrames in `separated_data` are only read, never modified.
        """
        agg_metrics: Dict[str, float] = {}

        monthly_df = separated_data.get(self._MONTHLY_TYPE)
        if monthly_df is not None and not monthly_df.empty:
            # Work on derived Series instead of adding helper columns to the
            # caller's frame.
            organic = self._count_series(monthly_df, 'follower_count_organic')
            paid = self._count_series(monthly_df, 'follower_count_paid')

            total_organic = organic.sum()
            total_paid = paid.sum()
            total_all_followers = total_organic + total_paid

            agg_metrics['total_organic_followers_gained_period'] = float(total_organic)
            agg_metrics['total_paid_followers_gained_period'] = float(total_paid)
            agg_metrics['overall_total_followers_gained_period'] = float(total_all_followers)

            if total_all_followers > 0:
                agg_metrics['overall_organic_follower_ratio_gained'] = float(total_organic / total_all_followers)
                agg_metrics['overall_paid_follower_ratio_gained'] = float(total_paid / total_all_followers)

            # Assumes the counts in 'follower_gains_monthly' rows are GAINS for
            # that month, not cumulative totals.
            monthly_total_gain = organic + paid
            agg_metrics['avg_monthly_follower_gain'] = float(monthly_total_gain.mean())
            agg_metrics['max_monthly_follower_gain'] = float(monthly_total_gain.max())
            agg_metrics['min_monthly_follower_gain'] = float(monthly_total_gain.min())

        # Number of rows kept (top N) for each demographic type
        for demo_type in self._DEMOGRAPHIC_TYPES:
            demo_df = separated_data.get(demo_type)
            if demo_df is not None and not demo_df.empty:
                agg_metrics[f'distinct_{demo_type}_segments_analyzed'] = float(len(demo_df))

        return agg_metrics

    def _extract_demographic_metrics(self, separated_data: Dict[str, pd.DataFrame]) -> Dict[str, Any]:
        """Extract demographic distributions (categorical metrics).

        For every demographic type with data, emits '<name>_distribution'
        (per-category counts, sorted by total descending) and
        '<name>_distribution_summary'. Rows sharing a 'category_name'
        overwrite each other; the last one wins.
        """
        cat_metrics: Dict[str, Any] = {}
        demographic_types_map = {
            'follower_industry': 'industry_distribution',
            'follower_seniority': 'seniority_distribution',
            'follower_function': 'function_distribution',
            'follower_geo': 'geographic_distribution'
        }

        for demo_type_key, metric_name_prefix in demographic_types_map.items():
            demo_df = separated_data.get(demo_type_key)
            if demo_df is None or demo_df.empty or 'category_name' not in demo_df.columns:
                continue

            distribution = {}
            for _, row in demo_df.iterrows():
                organic = float(row.get('follower_count_organic', 0))
                paid = float(row.get('follower_count_paid', 0))
                total = organic + paid
                distribution[row['category_name']] = {
                    'total_followers': total,
                    'organic_followers': organic,
                    'paid_followers': paid,
                    'organic_ratio': organic / total if total > 0 else 0.0
                }

            # Sort by total followers descending for the distribution
            sorted_distribution = dict(
                sorted(distribution.items(), key=lambda item: item[1]['total_followers'], reverse=True)
            )
            cat_metrics[metric_name_prefix] = sorted_distribution

            # Summary for this demographic type
            cat_metrics[f'{metric_name_prefix}_summary'] = {
                'total_followers_in_top_segments': sum(
                    item['total_followers'] for item in distribution.values()
                ),
                'number_of_segments_reported': len(distribution),
                'top_segment': next(iter(sorted_distribution), "N/A")
            }
        return cat_metrics

    def _extract_time_periods(self, monthly_df: Optional[pd.DataFrame]) -> List[str]:
        """Extract unique year-month time periods covered by the monthly data.

        Returns at most the 12 most recent periods, newest first, or a single
        placeholder entry when no period information is available.
        """
        if monthly_df is None or monthly_df.empty or 'year_month' not in monthly_df.columns:
            return ["Data period not available or N/A"]

        periods = sorted(monthly_df['year_month'].dropna().unique().tolist(), reverse=True)
        return periods[:12]  # Return up to the last 12 months if available

    def _generate_text_analysis(self, pandas_ai_df: Any) -> str:
        """Ask PandasAI for a textual follower analysis, retrying on failure.

        Never raises: on failure the returned text describes the error.
        """
        analysis_query = """
        Analyze the provided LinkedIn follower statistics. The DataFrame contains various 'follower_count_type' values.
        Focus on:
        1. For 'follower_gains_monthly': Analyze monthly follower growth trends (total, organic, paid). Identify key periods of growth or decline.
        2. For demographic types (industry, seniority, function, geo): Describe the distribution of followers. Which are the top segments? How do organic vs paid compare?
        3. Synthesize these findings into an overall summary of follower dynamics.

        Consider the data structure: 'category_name' holds the date for monthly data or the demographic label.
        'follower_count_organic' and 'follower_count_paid' are the key metrics.
        """

        def chat_operation():
            # PandasAI needs an LLM configured before .chat(); configure it
            # on demand if nobody has done so yet.
            if not pai.config.llm:
                logger.warning("PandasAI LLM not configured. Attempting to configure now.")
                # This assumes configure_pandasai is available and sets pai.config.llm
                from utils.pandasai_setup import configure_pandasai
                configure_pandasai(self.api_key, self.model_name)
                if not pai.config.llm:
                    raise RuntimeError("PandasAI LLM could not be configured for chat operation.")

            logger.info(f"Executing PandasAI chat for follower analysis with LLM: {pai.config.llm}")
            return pandas_ai_df.chat(analysis_query)

        try:
            analysis_result_raw = self.retry_mechanism.retry_with_backoff(
                func=chat_operation,
                max_retries=2,
                base_delay=2.0,
                exceptions=(Exception,)  # Catch broader exceptions for PandasAI calls
            )
        except Exception as e:
            logger.error(f"Follower analysis with PandasAI failed after retries: {e}", exc_info=True)
            return f"Follower analysis using PandasAI failed. Error: {str(e)[:200]}"

        logger.info("Follower analysis via PandasAI completed.")
        return str(analysis_result_raw) if analysis_result_raw else "No textual analysis generated by PandasAI."

    def analyze_follower_data(self, follower_stats_df: pd.DataFrame) -> "AgentMetrics":
        """
        Generate comprehensive follower analysis using PandasAI and structured metric extraction.

        Args:
            follower_stats_df: Raw follower statistics with 'follower_count_type',
                'category_name', 'follower_count_organic' and 'follower_count_paid' columns.

        Returns:
            AgentMetrics with the textual summary (truncated to 2000 characters)
            and the structured metrics. Failures of the PandasAI step are
            reported in the summary rather than raised.
        """
        if follower_stats_df is None or follower_stats_df.empty:
            logger.warning("Follower statistics DataFrame is empty. Returning empty metrics.")
            return AgentMetrics(
                agent_name=self.AGENT_NAME,
                analysis_summary="No follower data provided for analysis.",
                time_periods_covered=["N/A"]
            )

        # 1. Pre-process and separate data
        separated_data = self._separate_follower_data_by_type(follower_stats_df)

        # The original (unseparated) frame is handed to PandasAI: it contains
        # all types and the description tells the LLM how to differentiate them.
        df_description = "LinkedIn follower statistics. Contains 'follower_count_type' indicating data category (e.g., 'follower_gains_monthly', 'follower_industry'), 'category_name' (e.g., date for monthly, industry name for industry type), 'follower_count_organic', 'follower_count_paid'."

        try:
            pandas_ai_df = pai.DataFrame(follower_stats_df, description=df_description)
        except Exception as e:
            logger.error(f"Failed to create PandasAI DataFrame: {e}", exc_info=True)
            return AgentMetrics(
                agent_name=self.AGENT_NAME,
                analysis_summary=f"Error initializing PandasAI: {e}",
                time_periods_covered=self._extract_time_periods(separated_data.get(self._MONTHLY_TYPE))
            )

        # 2. Generate textual analysis using PandasAI directly (not via LlmAgent)
        analysis_result_text = self._generate_text_analysis(pandas_ai_df)

        # 3. Extract structured metrics using the separated and processed data
        monthly_data_for_metrics = separated_data.get(self._MONTHLY_TYPE, pd.DataFrame())

        return AgentMetrics(
            agent_name=self.AGENT_NAME,
            analysis_summary=analysis_result_text[:2000],  # Truncate if too long
            time_series_metrics=self._extract_time_series_metrics(monthly_data_for_metrics),
            aggregate_metrics=self._calculate_aggregate_metrics(separated_data),
            categorical_metrics=self._extract_demographic_metrics(separated_data),
            time_periods_covered=self._extract_time_periods(monthly_data_for_metrics),
            data_sources_used=[f"follower_stats_df (shape: {follower_stats_df.shape})"]
        )
|
407 |
+
|
408 |
+
if __name__ == '__main__':
    # Manual smoke test: runs the agent against a small in-memory sample and
    # prints the resulting metrics. Not a substitute for unit tests.
    import os  # Needed for os.environ below; the module itself does not import it.

    try:
        from utils.logging_config import setup_logging
        setup_logging()
        logger.info("Logging setup for EnhancedFollowerAnalysisAgent test.")
    except ImportError:
        logging.basicConfig(level=logging.INFO)
        logger.warning("Could not import setup_logging. Using basicConfig.")

    # Mock API Key and Model for testing
    # IMPORTANT: For PandasAI to run, a valid API key and model setup are needed.
    # This example might not fully execute PandasAI chat without proper environment setup.
    MOCK_API_KEY = os.environ.get("GOOGLE_API_KEY", "test_api_key_followers")
    MODEL_NAME = DEFAULT_AGENT_MODEL

    class MockPandasAIDataFrame:
        """Stand-in for pai.DataFrame used when PandasAI cannot be configured."""

        def __init__(self, df, description):
            self.df = df
            self.description = description

        def chat(self, query):
            return f"Mock PandasAI response to: {query}"

    # Configure PandasAI (essential for the .chat() part)
    try:
        from utils.pandasai_setup import configure_pandasai
        if MOCK_API_KEY != "test_api_key_followers":  # Only configure if a real key might be present
            configure_pandasai(MOCK_API_KEY, MODEL_NAME)
            logger.info("PandasAI configured for testing EnhancedFollowerAnalysisAgent.")
        else:
            logger.warning("Using mock API key. PandasAI chat will likely fail or use a default/mock LLM if available.")
            # Mock pai.DataFrame if pandasai is not fully set up to avoid errors
            pai.DataFrame = MockPandasAIDataFrame
    except ImportError:
        logger.error("utils.pandasai_setup not found. PandasAI will not be configured.")
        pai.DataFrame = MockPandasAIDataFrame

    # Sample Data
    sample_follower_data = {
        'follower_count_type': [
            'follower_gains_monthly', 'follower_gains_monthly', 'follower_gains_monthly',
            'follower_industry', 'follower_industry', 'follower_industry', 'follower_industry',
            'follower_seniority', 'follower_seniority'
        ],
        'category_name': [  # Dates for monthly, names for demographics
            '2023-01-01', '2023-02-01', '2023-03-01',
            'Technology', 'Finance', 'Healthcare', 'Retail',
            'Senior', 'Entry-Level'
        ],
        'follower_count_organic': [
            100, 120, 110,       # Monthly gains
            500, 300, 200, 150,  # Industry organic
            600, 400             # Seniority organic
        ],
        'follower_count_paid': [
            10, 15, 12,          # Monthly gains
            50, 30, 20, 10,      # Industry paid
            60, 40               # Seniority paid
        ]
    }
    sample_df = pd.DataFrame(sample_follower_data)

    # Initialize agent
    follower_agent = EnhancedFollowerAnalysisAgent(api_key=MOCK_API_KEY, model_name=MODEL_NAME)

    logger.info("Analyzing sample follower data...")
    metrics_result = follower_agent.analyze_follower_data(sample_df)

    print("\n--- EnhancedFollowerAnalysisAgent Results ---")
    print(f"Agent Name: {metrics_result.agent_name}")
    print(f"Analysis Summary: {metrics_result.analysis_summary}")
    print("\nTime Series Metrics:")
    for ts_metric in metrics_result.time_series_metrics:
        print(f" - {ts_metric.metric_name}: {len(ts_metric.values)} data points, e.g., {ts_metric.values[:3]} for ts {ts_metric.timestamps[:3]}")
    print("\nAggregate Metrics:")
    for key, value in metrics_result.aggregate_metrics.items():
        print(f" - {key}: {value}")
    print("\nCategorical Metrics:")
    for key, value in metrics_result.categorical_metrics.items():
        print(f" - {key}: (details below)")
        if isinstance(value, dict):
            for sub_key, sub_value in list(value.items())[:2]:  # Print first 2 items for brevity
                print(f" - {sub_key}: {sub_value}")
        else:
            print(f" {value}")

    print(f"\nTime Periods Covered: {metrics_result.time_periods_covered}")
    print(f"Data Sources Used: {metrics_result.data_sources_used}")
    print(f"Generated Timestamp: {metrics_result.generation_timestamp}")

    # Test with empty DataFrame
    logger.info("\n--- Testing with empty DataFrame ---")
    empty_metrics_result = follower_agent.analyze_follower_data(pd.DataFrame())
    print(f"Empty DF Analysis Summary: {empty_metrics_result.analysis_summary}")
|