Spaces:

GuglielmoTor
/

LinkedinMonitor

Running

File size: 44,869 Bytes

import json
import requests
import html
import time # Added for potential rate limiting if needed
from datetime import datetime
from collections import defaultdict
from urllib.parse import quote # Added for URL encoding
from transformers import pipeline

from sessions import create_session
from error_handling import display_error
from posts_categorization import batch_summarize_and_classify
import logging


logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

API_V2_BASE = 'https://api.linkedin.com/v2'
API_REST_BASE = "https://api.linkedin.com/rest"

# Initialize sentiment pipeline (loaded once globally)
sentiment_pipeline = pipeline("text-classification", model="tabularisai/multilingual-sentiment-analysis")

# --- Utility Function ---
def extract_text_from_mention_commentary(commentary):
    """
    Extracts clean text from a commentary string, removing potential placeholders like {mention}.
    """
    import re 
    if not commentary:
        return ""
    return re.sub(r"{.*?}", "", commentary).strip()

# --- Core Sentiment Analysis Helper ---
def _get_sentiment_from_text(text_to_analyze):
    """
    Analyzes a single piece of text and returns its sentiment label and raw counts.
    Returns a dict: {"label": "Sentiment Label", "counts": defaultdict(int)}
    """
    sentiment_counts = defaultdict(int)
    dominant_sentiment_label = "Neutral 😐" # Default
    
    if not text_to_analyze or not text_to_analyze.strip():
        return {"label": dominant_sentiment_label, "counts": sentiment_counts}

    try:
        # Truncate to avoid issues with very long texts for the model
        analysis_result = sentiment_pipeline(str(text_to_analyze)[:512]) 
        label = analysis_result[0]['label'].upper()
        
        if label in ['POSITIVE', 'VERY POSITIVE']:
            dominant_sentiment_label = 'Positive 👍'
            sentiment_counts['Positive 👍'] += 1
        elif label in ['NEGATIVE', 'VERY NEGATIVE']:
            dominant_sentiment_label = 'Negative 👎'
            sentiment_counts['Negative 👎'] += 1
        elif label == 'NEUTRAL':
            dominant_sentiment_label = 'Neutral 😐' # Already default, but for clarity
            sentiment_counts['Neutral 😐'] += 1
        else:
            dominant_sentiment_label = 'Unknown' # Catch any other labels from the model
            sentiment_counts['Unknown'] += 1
        
    except Exception as e:
        # Log the error with more context if possible
        logging.error(f"Sentiment analysis failed for text snippet '{str(text_to_analyze)[:50]}...'. Error: {e}")
        sentiment_counts['Error'] += 1
        dominant_sentiment_label = "Error" # Indicate error in sentiment

    return {"label": dominant_sentiment_label, "counts": sentiment_counts}

def get_post_media_category(post_content):
    """
    Determines the media category from the post's content object.
    Args:
        post_content (dict or None): The content dictionary of the post.
    Returns:
        str: The determined media category (e.g., "Video", "Article", "Document", "Image", "Multi-Image", "NONE").
    """
    if not post_content:
        return "NONE"

    # 1. Check for specific LinkedIn Video Component (from your original logic)
    # You might want to refine this if 'mediaCategory' within the video component is more specific
    if "com.linkedin.voyager.feed.render.LinkedInVideoComponent" in post_content:
        # video_component_data = post_content.get("com.linkedin.voyager.feed.render.LinkedInVideoComponent", {})
        # return video_component_data.get("mediaCategory", "Video") # Example if you want to use its specific category
        return "Video"

    # 2. Check for Article (based on your "old code" and examples)
    if 'article' in post_content:
        return "Article"

    # 3. Check for Multi-Image (based on your "old code")
    if 'multiImage' in post_content:
        return "Multi-Image"

    # 4. Check for Media (Document or Image - based on your "old code" and examples)
    if 'media' in post_content:
        media_item = post_content['media']
        # Heuristic from your "old code": if 'title' is present, it's likely a Document.
        if 'title' in media_item:
            # Example: "content": {"media": {"title": "...", "id": "urn:li:document:..."}}
            return "Document"
        # Else, if 'id' is present (and no title was found for Document), assume Image.
        elif 'id' in media_item:
            # Example: "content": {"media": {"altText": "", "id": "urn:li:image:..."}}
            return "Image"

    return "NONE"
# --- Post Retrieval Functions ---
def fetch_linkedin_posts_core(comm_client_id, community_token, org_urn, count):
    """
    Fetches raw posts, their basic statistics, and performs summarization/categorization.
    Does NOT fetch comments or analyze sentiment of comments here.
    """
    token_dict = community_token if isinstance(community_token, dict) else {'access_token': community_token, 'token_type': 'Bearer'}
    session = create_session(comm_client_id, token=token_dict)
    session.headers.update({
        "LinkedIn-Version": "202502" 
    })

    posts_url = f"{API_REST_BASE}/posts?author={org_urn}&q=author&count={count}&sortBy=LAST_MODIFIED"
    logging.info(f"Fetching posts from URL: {posts_url}")
    try:
        resp = session.get(posts_url)
        resp.raise_for_status()
        raw_posts_api = resp.json().get("elements", [])
        logging.info(f"Fetched {len(raw_posts_api)} raw posts from API.")
    except requests.exceptions.RequestException as e:
        status = getattr(e.response, 'status_code', 'N/A')
        text = getattr(e.response, 'text', 'No response text')
        logging.error(f"Failed to fetch posts (Status: {status}): {e}. Response: {text}")
        raise ValueError(f"Failed to fetch posts (Status: {status})") from e
    except json.JSONDecodeError as e:
        logging.error(f"Failed to decode JSON from posts response: {e}. Response text: {resp.text if resp else 'No response object'}")
        raise ValueError("Failed to decode JSON from posts response") from e

    if not raw_posts_api:
        logging.info("No raw posts found.")
        return [], {}, "DefaultOrgName" 

    post_urns_for_stats = [p["id"] for p in raw_posts_api if p.get("id")]

    post_texts_for_nlp = []
    for p in raw_posts_api:
        text_content = p.get("commentary") or \
                       p.get("specificContent", {}).get("com.linkedin.ugc.ShareContent", {}).get("shareCommentaryV2", {}).get("text", "") or \
                       "[No text content]"
        post_texts_for_nlp.append({"text": text_content, "id": p.get("id")})

    logging.info(f"Prepared {len(post_texts_for_nlp)} posts for NLP (summarization/classification).")
    if 'batch_summarize_and_classify' in globals():
        structured_results_list = batch_summarize_and_classify(post_texts_for_nlp)
    else: 
        logging.warning("batch_summarize_and_classify not found, using fallback.")
        structured_results_list = [{"id": p["id"], "summary": "N/A", "category": "N/A"} for p in post_texts_for_nlp]

    structured_results_map = {res["id"]: res for res in structured_results_list if "id" in res}

    stats_map = {}
    if post_urns_for_stats:
        batch_size_stats = 20
        for i in range(0, len(post_urns_for_stats), batch_size_stats):
            batch_urns = post_urns_for_stats[i:i+batch_size_stats]
            params = {'q': 'organizationalEntity', 'organizationalEntity': org_urn}
            
            share_idx = 0 # Index for share URNs in the current batch's params
            ugc_idx = 0   # Index for ugcPost URNs in the current batch's params
            
            # Keep track of URNs actually added to this batch's parameters for logging
            urns_in_current_api_call = []
    
            for urn_str in batch_urns:
                if ":share:" in urn_str:
                    params[f"shares[{share_idx}]"] = urn_str
                    share_idx += 1
                    urns_in_current_api_call.append(urn_str)
                elif ":ugcPost:" in urn_str:
                    params[f"ugcPosts[{ugc_idx}]"] = urn_str
                    ugc_idx += 1
                    urns_in_current_api_call.append(urn_str)
                else:
                    logging.warning(f"URN {urn_str} is not a recognized share or ugcPost type for stats. Skipping.")
                    continue
            
            # If no valid URNs were prepared for this batch, skip the API call
            if not share_idx and not ugc_idx: # or check 'if not urns_in_current_api_call:'
                logging.info(f"Skipping API call for an empty or invalid batch of URNs (original batch segment size: {len(batch_urns)}).")
                continue
    
            try:
                # Log the URNs being sent in this specific API call
                logging.info(f"Fetching stats for batch of {len(urns_in_current_api_call)} URNs. First URN in call: {urns_in_current_api_call[0] if urns_in_current_api_call else 'N/A'}")
                
                # Actual API call
                stat_resp = session.get(f"{API_REST_BASE}/organizationalEntityShareStatistics", params=params)
                stat_resp.raise_for_status() # Raises an HTTPError for bad responses (4XX or 5XX)
                stats_data = stat_resp.json()
    
                # --- Corrected Parsing Logic ---
                # LinkedIn API for batch stats often returns an "elements" list.
                elements_from_api = stats_data.get("elements")
    
                if isinstance(elements_from_api, list):
                    if not elements_from_api:
                        logging.info(f"API returned 'elements' but it's an empty list for the URNs in this call.")
                    
                    processed_urns_in_batch = 0
                    for item in elements_from_api:
                        urn_in_item = None
                        # Determine the URN key (e.g., 'share' or 'ugcPost')
                        if "share" in item:
                            urn_in_item = item.get("share")
                        elif "ugcPost" in item:
                            urn_in_item = item.get("ugcPost")
                        # Add other URN types if necessary, e.g., elif "article" in item: ...
                        
                        if urn_in_item:
                            stats_values = item.get("totalShareStatistics", {})
                            if stats_values: # Only add if there are actual stats
                                stats_map[urn_in_item] = stats_values
                                processed_urns_in_batch +=1
                            else:
                                # It's possible an URN is returned without stats, or with empty stats
                                logging.debug(f"No 'totalShareStatistics' data found for URN: {urn_in_item} in API item: {item}")
                                stats_map[urn_in_item] = {} # Store empty stats if URN was processed but had no data
                        else:
                            logging.warning(f"Could not extract a recognized URN key from API element: {item}")
                    logging.info(f"Successfully processed {processed_urns_in_batch} URNs with stats from the API response for this batch. Current total stats_map size: {len(stats_map)}")
    
                elif elements_from_api is None and "results" in stats_data:
                    # Fallback or alternative check if your API version *does* use "results"
                    # This was your original attempt. If "elements" is consistently missing,
                    # you might need to debug the exact structure of "results".
                    logging.warning(f"API response does not contain 'elements' key, but 'results' key is present. Attempting to parse 'results'. Response keys: {stats_data.keys()}")
                    results_dict = stats_data.get("results", {})
                    if isinstance(results_dict, dict):
                        for urn_key, stat_element_values in results_dict.items():
                            stats_map[urn_key] = stat_element_values.get("totalShareStatistics", {})
                        logging.info(f"Processed stats from 'results' dictionary. Current stats_map size: {len(stats_map)}")
                    else:
                        logging.error(f"'results' key found but is not a dictionary. Type: {type(results_dict)}")
    
                else:
                    # Neither "elements" (as list) nor "results" (as dict) found as expected
                    logging.error(f"API response structure not recognized. Expected 'elements' (list) or 'results' (dict). Got keys: {stats_data.keys()}. Full response sample: {str(stats_data)[:500]}")
    
                # --- End Corrected Parsing Logic ---
    
                # Check for specific errors reported by the API within the JSON response
                if stats_data.get("errors"):
                    for urn_errored, error_detail in stats_data.get("errors", {}).items():
                        logging.warning(f"API reported error for URN {urn_errored}: {error_detail.get('message', 'Unknown API error message')}")
                
                # This log might be slightly misleading if parsing failed but no exception occurred.
                # The more specific log after parsing 'elements' is better.
                # logging.info(f"Successfully processed stats response for {len(urns_in_current_api_call)} URNs. Current stats_map size: {len(stats_map)}")
    
    
            except requests.exceptions.HTTPError as e:
                # Specific handling for HTTP errors (4xx, 5xx)
                status_code = e.response.status_code
                response_text = e.response.text
                logging.warning(f"HTTP error fetching stats for a batch (Status: {status_code}): {e}. Params: {params}. Response: {response_text[:500]}") # Log first 500 chars of response
            except requests.exceptions.RequestException as e:
                # Catch other requests-related errors (e.g., connection issues)
                status_code = getattr(e.response, 'status_code', 'N/A')
                response_text = getattr(e.response, 'text', 'No response text')
                logging.warning(f"Request failed for stats batch (Status: {status_code}): {e}. Params: {params}. Response: {response_text[:500]}")
            except json.JSONDecodeError as e:
                # Handle cases where the response is not valid JSON
                response_text_for_json_error = stat_resp.text if 'stat_resp' in locals() and hasattr(stat_resp, 'text') else 'Response object not available or no text attribute'
                logging.warning(f"Failed to decode JSON from stats response: {e}. Response text: {response_text_for_json_error[:500]}") # Log first 500 chars
            except Exception as e:
                # Catch any other unexpected errors during the batch processing
                logging.error(f"An unexpected error occurred processing stats batch: {e}", exc_info=True)
    
    
        logging.info(f"Finished processing all URN batches. Final stats_map size: {len(stats_map)}")

    processed_raw_posts = []
    for p in raw_posts_api:
        post_id = p.get("id")
        if not post_id:
            logging.warning("Skipping raw post due to missing ID.")
            continue

        text_content = p.get("commentary") or \
                       p.get("specificContent", {}).get("com.linkedin.ugc.ShareContent", {}).get("shareCommentaryV2", {}).get("text", "") or \
                       "[No text content]"
        
        timestamp = p.get("publishedAt") or p.get("createdAt") or p.get("firstPublishedAt") 
        published_at_iso = datetime.fromtimestamp(timestamp / 1000).isoformat() if timestamp else None
        
        structured_res = structured_results_map.get(post_id, {"summary": "N/A", "category": "N/A"})

        processed_raw_posts.append({
            "id": post_id,
            "raw_text": text_content,
            "summary": structured_res["summary"],
            "category": structured_res["category"],
            "published_at_timestamp": timestamp,
            "published_at_iso": published_at_iso,
            "organization_urn": p.get("author", f"urn:li:organization:{org_urn.split(':')[-1]}"), 
            "is_ad": 'adContext' in p,
            "media_category": get_post_media_category(p.get("content")),
        })
    logging.info(f"Processed {len(processed_raw_posts)} posts with core data.")
    return processed_raw_posts, stats_map, "DefaultOrgName"


def fetch_comments(comm_client_id, community_token, post_urns, stats_map):
    """
    Fetches comments for a list of post URNs using the socialActions endpoint.
    Uses stats_map to potentially skip posts with 0 comments.
    """
    # Ensure community_token is in the expected dictionary format for create_session
    if isinstance(community_token, str):
        token_dict = {'access_token': community_token, 'token_type': 'Bearer'}
    elif isinstance(community_token, dict) and 'access_token' in community_token:
        token_dict = community_token
    else:
        logging.error("Invalid community_token format. Expected a string or a dict with 'access_token'.")
        return {urn: [] for urn in post_urns} # Return empty for all if token is bad

    linkedin_session = create_session(comm_client_id, token=token_dict)
    
    # Set the LinkedIn API version header
    # This is crucial for API compatibility.
    linkedin_session.headers.update({
        'LinkedIn-Version': "202502" # Or your target version
    })

    all_comments_by_post = {}
    logging.info(f"Fetching comments for {len(post_urns)} posts.")

    for post_urn in post_urns:
        post_stats = stats_map.get(post_urn, {})
        # Try to get comment count from "commentSummary" first, then fallback to "commentCount"
        comment_summary = post_stats.get("commentSummary", {})
        comment_count_from_stats = comment_summary.get("totalComments", post_stats.get('commentCount', 0))

        if comment_count_from_stats == 0:
            logging.info(f"Skipping comment fetch for {post_urn} as commentCount is 0 in stats_map.")
            all_comments_by_post[post_urn] = []
            continue
        
        try:
            # IMPORTANT: Use the correct endpoint structure from your working code.
            # The post_urn goes directly into the path and should NOT be URL-encoded here.
            url = f"{API_REST_BASE}/socialActions/{post_urn}/comments"
            # If you want to add other parameters like 'count' or 'start', append them, e.g.,
            # url = f"{API_REST_BASE}/socialActions/{post_urn}/comments?sortOrder=CHRONOLOGICAL&count=10"
            
            logging.debug(f"Fetching comments from URL: {url} for post URN: {post_urn}")
            response = linkedin_session.get(url)
            
            if response.status_code == 200:
                elements = response.json().get('elements', [])
                comments_texts = []
                for c in elements:
                    # Extracting comment text. Adjust if the structure is different.
                    # The original working code stored `data.get('elements', [])`
                    # If you need the full comment object, store 'c' instead of 'comment_text'.
                    message_obj = c.get('message', {})
                    if isinstance(message_obj, dict): # Ensure message is a dict before .get('text')
                        comment_text = message_obj.get('text')
                        if comment_text: 
                            comments_texts.append(comment_text)
                    elif isinstance(message_obj, str): # Sometimes message might be just a string
                        comments_texts.append(message_obj)

                all_comments_by_post[post_urn] = comments_texts
                logging.info(f"Fetched {len(comments_texts)} comments for {post_urn}.")
            elif response.status_code == 403:
                logging.warning(f"Forbidden (403) to fetch comments for {post_urn}. URL: {url}. Response: {response.text}. Check permissions or API version.")
                all_comments_by_post[post_urn] = [] # Or some error indicator
            elif response.status_code == 404:
                logging.warning(f"Comments not found (404) for {post_urn}. URL: {url}. Response: {response.text}")
                all_comments_by_post[post_urn] = []
            else:
                logging.error(f"Error fetching comments for {post_urn}. Status: {response.status_code}. URL: {url}. Response: {response.text}")
                all_comments_by_post[post_urn] = [] # Or some error indicator
        except requests.exceptions.RequestException as e:
            logging.error(f"RequestException fetching comments for {post_urn}: {e}")
            all_comments_by_post[post_urn] = []
        except json.JSONDecodeError as e:
            # Log the response text if it's available and JSON decoding fails
            response_text_for_log = 'N/A'
            if 'response' in locals() and hasattr(response, 'text'):
                response_text_for_log = response.text
            logging.error(f"JSONDecodeError fetching comments for {post_urn}. Response: {response_text_for_log}. Error: {e}")
            all_comments_by_post[post_urn] = []
        except Exception as e:
            # Catch any other unexpected errors
            logging.error(f"Unexpected error fetching comments for {post_urn}: {e}", exc_info=True) # exc_info=True for traceback
            all_comments_by_post[post_urn] = []
            
    return all_comments_by_post

def analyze_sentiment(all_comments_data):
    """
    Analyzes sentiment for comments grouped by post_urn using the helper function.
    all_comments_data is a dict: {post_urn: [comment_text_1, comment_text_2,...]}
    Returns a dict: {post_urn: {"sentiment": "DominantOverallSentiment", "percentage": X.X, "details": {aggregated_counts}}}
    """
    results_by_post = {}
    logging.info(f"Analyzing aggregated sentiment for comments from {len(all_comments_data)} posts.")
    for post_urn, comments_list in all_comments_data.items():
        aggregated_sentiment_counts = defaultdict(int)
        total_valid_comments_for_post = 0
        
        if not comments_list:
            results_by_post[post_urn] = {"sentiment": "Neutral 😐", "percentage": 0.0, "details": dict(aggregated_sentiment_counts)}
            continue

        for comment_text in comments_list:
            if not comment_text or not comment_text.strip():
                continue
            
            # Use the helper for individual comment sentiment
            single_comment_sentiment = _get_sentiment_from_text(comment_text)
            
            # Aggregate counts
            for label, count in single_comment_sentiment["counts"].items():
                aggregated_sentiment_counts[label] += count
            
            if single_comment_sentiment["label"] != "Error": # Count valid analyses
                total_valid_comments_for_post +=1
        
        dominant_overall_sentiment = "Neutral 😐" # Default
        percentage = 0.0

        if total_valid_comments_for_post > 0:
            # Determine dominant sentiment from aggregated_sentiment_counts
            # Exclude 'Error' from being a dominant sentiment unless it's the only category with counts
            valid_sentiments = {k: v for k, v in aggregated_sentiment_counts.items() if k != 'Error' and v > 0}
            if not valid_sentiments:
                dominant_overall_sentiment = 'Error' if aggregated_sentiment_counts['Error'] > 0 else 'Neutral 😐'
            else:
                # Simple max count logic for dominance
                dominant_overall_sentiment = max(valid_sentiments, key=valid_sentiments.get)
            
            if dominant_overall_sentiment != 'Error':
                percentage = round((aggregated_sentiment_counts[dominant_overall_sentiment] / total_valid_comments_for_post) * 100, 1)
            else: # if dominant is 'Error' or only errors were found
                percentage = 0.0
        elif aggregated_sentiment_counts['Error'] > 0 : # No valid comments, but errors occurred
             dominant_overall_sentiment = 'Error'


        results_by_post[post_urn] = {
            "sentiment": dominant_overall_sentiment, 
            "percentage": percentage,
            "details": dict(aggregated_sentiment_counts) # Store aggregated counts
        }
        logging.debug(f"Aggregated sentiment for post {post_urn}: {results_by_post[post_urn]}")
            
    return results_by_post


def compile_detailed_posts(processed_raw_posts, stats_map, sentiments_per_post):
    """
    Combines processed raw post data with their statistics and overall comment sentiment.
    """
    detailed_post_list = []
    logging.info(f"Compiling detailed data for {len(processed_raw_posts)} posts.")
    for proc_post in processed_raw_posts:
        post_id = proc_post["id"]
        stats = stats_map.get(post_id, {})

        likes = stats.get("likeCount", 0)
        comments_stat_count = stats.get("commentSummary", {}).get("totalComments", stats.get("commentCount", 0))
        
        clicks = stats.get("clickCount", 0)
        shares = stats.get("shareCount", 0)
        impressions = stats.get("impressionCount", 0)
        unique_impressions = stats.get("uniqueImpressionsCount", stats.get("impressionCount", 0)) 

        engagement_numerator = likes + comments_stat_count + clicks + shares
        engagement_rate = (engagement_numerator / impressions * 100) if impressions and impressions > 0 else 0.0
        
        sentiment_info = sentiments_per_post.get(post_id, {"sentiment": "Neutral 😐", "percentage": 0.0, "details": {}})
        
        display_text = html.escape(proc_post["raw_text"][:250]).replace("\n", "<br>") + \
                       ("..." if len(proc_post["raw_text"]) > 250 else "")
        
        when_formatted = datetime.fromtimestamp(proc_post["published_at_timestamp"] / 1000).strftime("%Y-%m-%d %H:%M") \
            if proc_post["published_at_timestamp"] else "Unknown"

        detailed_post_list.append({
            "id": post_id,
            "when": when_formatted,
            "text_for_display": display_text,
            "raw_text": proc_post["raw_text"],
            "likes": likes,
            "comments_stat_count": comments_stat_count,
            "clicks": clicks,
            "shares": shares,
            "impressions": impressions,
            "uniqueImpressionsCount": unique_impressions,
            "engagement": f"{engagement_rate:.2f}%",
            "engagement_raw": engagement_rate,
            "sentiment": sentiment_info["sentiment"],
            "sentiment_percent": sentiment_info["percentage"],
            "sentiment_details": sentiment_info.get("details", {}),
            "summary": proc_post["summary"],
            "category": proc_post["category"],
            "organization_urn": proc_post["organization_urn"],
            "is_ad": proc_post["is_ad"],
            "media_category": proc_post.get("media_category", "NONE"),
            "published_at": proc_post["published_at_iso"]
        })
    logging.info(f"Compiled {len(detailed_post_list)} detailed posts.")
    return detailed_post_list


def prepare_data_for_bubble(detailed_posts, all_actual_comments_data):
    """
    Prepares data lists for uploading to Bubble.
    - detailed_posts: List of comprehensively compiled post objects.
    - all_actual_comments_data: Dict of {post_urn: [comment_texts]} from fetch_comments.
    """
    li_posts = []
    li_post_stats = []
    li_post_comments = []
    logging.info("Preparing posts data for Bubble.")

    if not detailed_posts:
        logging.warning("No detailed posts to prepare for Bubble.")
        return [], [], []

    org_urn_default = detailed_posts[0]["organization_urn"] if detailed_posts else "urn:li:organization:UNKNOWN"

    for post_data in detailed_posts:
        li_posts.append({
            "organization_urn": post_data["organization_urn"],
            "id": post_data["id"],
            "is_ad": post_data["is_ad"],
            "media_type": post_data.get("media_category", "NONE"),
            "published_at": post_data["published_at"],
            "sentiment": post_data["sentiment"], 
            "text": post_data["raw_text"],
            #"summary_text": post_data["summary"], 
            "li_eb_label": post_data["category"] 
        })

        li_post_stats.append({
            "clickCount": post_data["clicks"],
            "commentCount": post_data["comments_stat_count"],
            "engagement": post_data["engagement_raw"], 
            "impressionCount": post_data["impressions"],
            "likeCount": post_data["likes"],
            "shareCount": post_data["shares"],
            "uniqueImpressionsCount": post_data["uniqueImpressionsCount"],
            "post_id": post_data["id"],
            "organization_urn": post_data["organization_urn"] 
        })

    for post_urn, comments_text_list in all_actual_comments_data.items():
        current_post_org_urn = org_urn_default 
        for p in detailed_posts: 
            if p["id"] == post_urn:
                current_post_org_urn = p["organization_urn"]
                break

        for single_comment_text in comments_text_list:
            if single_comment_text and single_comment_text.strip():
                li_post_comments.append({
                    "comment_text": single_comment_text,
                    "post_id": post_urn,
                    "organization_urn": current_post_org_urn
                })
    
    logging.info(f"Prepared {len(li_posts)} posts, {len(li_post_stats)} stats entries, and {len(li_post_comments)} comments for Bubble.")
    return li_posts, li_post_stats, li_post_comments

# --- Mentions Retrieval Functions ---

def fetch_linkedin_mentions_core(comm_client_id, community_token, org_urn, count=20):
    """
    Fetches raw mention notifications and the details of the posts where the organization was mentioned.
    Returns a list of processed mention data (internal structure).
    """
    token_dict = community_token if isinstance(community_token, dict) else {'access_token': community_token, 'token_type': 'Bearer'}
    session = create_session(comm_client_id, token=token_dict)
    session.headers.update({
        "X-Restli-Protocol-Version": "2.0.0",
        "LinkedIn-Version": "202502" 
    })

    encoded_org_urn = quote(org_urn, safe='')
    
    notifications_url_base = (
        f"{API_REST_BASE}/organizationalEntityNotifications"
        f"?q=criteria"
        f"&actions=List(SHARE_MENTION)" 
        f"&organizationalEntity={encoded_org_urn}"
        f"&count={count}" 
    )

    all_notifications = []
    start_index = 0
    processed_mentions_internal = [] 
    page_count = 0 
    max_pages = 10 

    while page_count < max_pages:
        current_url = f"{notifications_url_base}&start={start_index}"
        logging.info(f"Fetching notifications page {page_count + 1} from URL: {current_url}")
        try:
            resp = session.get(current_url)
            resp.raise_for_status()
            data = resp.json()
            elements = data.get("elements", [])
            
            if not elements: 
                logging.info(f"No more notifications found on page {page_count + 1}. Total notifications fetched: {len(all_notifications)}.")
                break 
            
            all_notifications.extend(elements)
            
            paging = data.get("paging", {})
            if 'start' not in paging or 'count' not in paging or len(elements) < paging.get('count', count):
                logging.info(f"Last page of notifications fetched. Total notifications: {len(all_notifications)}.")
                break

            start_index = paging['start'] + paging['count'] 
            page_count += 1

        except requests.exceptions.RequestException as e:
            status = getattr(e.response, 'status_code', 'N/A')
            text = getattr(e.response, 'text', 'No response text')
            logging.error(f"Failed to fetch notifications (Status: {status}): {e}. Response: {text}")
            break 
        except json.JSONDecodeError as e:
            logging.error(f"Failed to decode JSON from notifications response: {e}. Response: {resp.text if resp else 'No resp obj'}")
            break
        if page_count >= max_pages:
            logging.info(f"Reached max_pages ({max_pages}) for fetching notifications.")
            break

    if not all_notifications:
        logging.info("No mention notifications found after fetching.")
        return []

    mention_share_urns = list(set([ 
        n.get("generatedActivity") for n in all_notifications 
        if n.get("action") == "SHARE_MENTION" and n.get("generatedActivity")
    ]))
    
    logging.info(f"Found {len(mention_share_urns)} unique share URNs from SHARE_MENTION notifications.")

    # for share_urn in mention_share_urns: 
    #     encoded_share_urn = quote(share_urn, safe='')
    #     post_detail_url = f"{API_REST_BASE}/posts/{encoded_share_urn}" 
    #     logging.info(f"Fetching details for mentioned post: {post_detail_url}")
    #     try:
    #         post_resp = session.get(post_detail_url)
    #         post_resp.raise_for_status()
    #         post_data = post_resp.json()

    #         commentary_raw = post_data.get("commentary") 
    #         if not commentary_raw and "specificContent" in post_data: 
    #              share_content = post_data.get("specificContent", {}).get("com.linkedin.ugc.ShareContent", {})
    #              commentary_raw = share_content.get("shareCommentaryV2", {}).get("text", "")

    #         if not commentary_raw:
    #             logging.warning(f"No commentary found for share URN {share_urn}. Skipping.")
    #             continue

    #         mention_text_cleaned = extract_text_from_mention_commentary(commentary_raw)
    #         timestamp = post_data.get("publishedAt") or post_data.get("createdAt") or post_data.get("firstPublishedAt")
    #         published_at_iso = datetime.fromtimestamp(timestamp / 1000).isoformat() if timestamp else None
    #         author_urn = post_data.get("author", "urn:li:unknown") 

    #         processed_mentions_internal.append({
    #             "mention_id": f"mention_{share_urn}", 
    #             "share_urn": share_urn, 
    #             "mention_text_raw": commentary_raw, 
    #             "mention_text_cleaned": mention_text_cleaned, 
    #             "published_at_timestamp": timestamp,
    #             "published_at_iso": published_at_iso, 
    #             "mentioned_by_author_urn": author_urn,
    #             "organization_urn_mentioned": org_urn 
    #         })
    #     except requests.exceptions.RequestException as e:
    #         status = getattr(e.response, 'status_code', 'N/A')
    #         text = getattr(e.response, 'text', 'No response text')
    #         logging.warning(f"Failed to fetch post details for share URN {share_urn} (Status: {status}): {e}. Response: {text}")
    #     except json.JSONDecodeError as e:
    #         logging.warning(f"Failed to decode JSON for post details {share_urn}: {e}. Response: {post_resp.text if post_resp else 'No resp obj'}")

    if mention_share_urns:
        # Encode URNs for the batch request URL
        encoded_urns = [quote(urn, safe='') for urn in mention_share_urns]
        formatted_urns = ",".join(encoded_urns)
        
        # Construct the URL for batch fetching post details
        # API_REST_BASE should be the base URL like "https://api.linkedin.com/rest"
        batch_posts_url = f"{API_REST_BASE}/posts?ids=List({formatted_urns})"
        logging.info(f"Fetching details for {len(mention_share_urns)} posts in a batch: {batch_posts_url}")

        try:
            batch_resp = session.get(batch_posts_url)
            batch_resp.raise_for_status() # Raise an exception for HTTP errors
            batch_data = batch_resp.json()
            
            results = batch_data.get("results", {}) # Contains post details keyed by URN
            errors = batch_data.get("errors", {})   # Contains errors for specific URNs
            statuses = batch_data.get("statuses", {}) # Contains HTTP statuses for specific URNs

            # Process each share URN using the data from the batch response
            for share_urn in mention_share_urns:
                if share_urn not in results:
                    # Log if a URN was requested but not found in the results
                    logging.warning(
                        f"Post details for share URN {share_urn} not found in batch response. "
                        f"Status: {statuses.get(share_urn)}, Error: {errors.get(share_urn)}"
                    )
                    continue

                post_data = results[share_urn]
                
                # Extract commentary - try direct 'commentary' field first, then fallback
                commentary_raw = post_data.get("commentary")
                if not commentary_raw and "specificContent" in post_data:
                    # Fallback for older structures or specific share types if 'commentary' is not top-level
                    share_content = post_data.get("specificContent", {}).get("com.linkedin.ugc.ShareContent", {})
                    commentary_raw = share_content.get("shareCommentaryV2", {}).get("text", "")

                if not commentary_raw:
                    logging.warning(f"No commentary found for share URN {share_urn} in batch data. Skipping.")
                    continue
                
                # Clean the commentary text (assuming this function is defined)
                mention_text_cleaned = extract_text_from_mention_commentary(commentary_raw)
                
                # Extract timestamp and convert to ISO format
                timestamp = post_data.get("publishedAt") or post_data.get("createdAt") or post_data.get("firstPublishedAt")
                published_at_iso = datetime.fromtimestamp(timestamp / 1000).isoformat() if timestamp else None
                
                # Extract author URN
                author_urn = post_data.get("author", "urn:li:unknown") # Default if author is not found
                
                # Append processed mention data
                processed_mentions_internal.append({
                    "mention_id": f"mention_{share_urn}", # Create a unique ID for the mention
                    "share_urn": share_urn,
                    "mention_text_raw": commentary_raw,
                    "mention_text_cleaned": mention_text_cleaned,
                    "published_at_timestamp": timestamp,
                    "published_at_iso": published_at_iso,
                    "mentioned_by_author_urn": author_urn,
                    "organization_urn_mentioned": org_urn # The URN of the organization that was mentioned
                })

        except requests.exceptions.RequestException as e:
            status = getattr(e.response, 'status_code', 'N/A')
            text = getattr(e.response, 'text', 'No response text')
            logging.error(f"Failed to fetch batch post details (Status: {status}): {e}. Response: {text}")
        except json.JSONDecodeError as e:
            # Log error if JSON decoding fails for the batch response
            logging.error(f"Failed to decode JSON from batch posts response: {e}. Response: {batch_resp.text if batch_resp else 'No resp obj'}")    

    
    logging.info(f"Processed {len(processed_mentions_internal)} mentions with their post details.")
    return processed_mentions_internal


def analyze_mentions_sentiment(processed_mentions_list):
    """
    Analyzes sentiment for the text of each processed mention using the helper function.
    Input: list of processed_mention dicts (internal structure from fetch_linkedin_mentions_core).
    Returns: a dict {mention_id: {"sentiment_label": "DominantSentiment", "percentage": 100.0, "details": {counts}}}
    """
    mention_sentiments_map = {}
    logging.info(f"Analyzing individual sentiment for {len(processed_mentions_list)} mentions.")

    for mention_data in processed_mentions_list:
        mention_internal_id = mention_data["mention_id"] # Internal ID from fetch_linkedin_mentions_core
        text_to_analyze = mention_data.get("mention_text_cleaned", "")
        
        sentiment_result = _get_sentiment_from_text(text_to_analyze)
        
        # For single text, percentage is 100% for the dominant label if not error
        percentage = 0.0
        if sentiment_result["label"] != "Error" and any(sentiment_result["counts"].values()):
            percentage = 100.0

        mention_sentiments_map[mention_internal_id] = {
            "sentiment_label": sentiment_result["label"], # The dominant sentiment label
            "percentage": percentage, 
            "details": dict(sentiment_result["counts"]) # Raw counts for this specific mention
        }
        logging.debug(f"Individual sentiment for mention {mention_internal_id}: {mention_sentiments_map[mention_internal_id]}")
            
    return mention_sentiments_map


def compile_detailed_mentions(processed_mentions_list, mention_sentiments_map):
    """
    Combines processed mention data (internal structure) with their sentiment analysis 
    into the user-specified output format.
    processed_mentions_list: list of dicts from fetch_linkedin_mentions_core
    mention_sentiments_map: dict from analyze_mentions_sentiment, keyed by "mention_id" (internal)
                           and contains "sentiment_label".
    """
    detailed_mentions_output = []
    logging.info(f"Compiling detailed data for {len(processed_mentions_list)} mentions into specified format.")

    for mention_core_data in processed_mentions_list:
        mention_internal_id = mention_core_data["mention_id"] 
        sentiment_info = mention_sentiments_map.get(mention_internal_id, {"sentiment_label": "Neutral 😐"}) 
        
        date_formatted = "Unknown"
        if mention_core_data["published_at_timestamp"]:
            try:
                date_formatted = datetime.fromtimestamp(mention_core_data["published_at_timestamp"] / 1000).strftime("%Y-%m-%d %H:%M")
            except TypeError: 
                logging.warning(f"Could not format timestamp for mention_id {mention_internal_id}")

        detailed_mentions_output.append({
            "date": date_formatted, # User-specified field name
            "id": mention_core_data["share_urn"], # User-specified field name (URN of the post with mention)
            "mention_text": mention_core_data["mention_text_cleaned"], # User-specified field name
            "organization_urn": mention_core_data["organization_urn_mentioned"], # User-specified field name
            "sentiment_label": sentiment_info["sentiment_label"] # User-specified field name
        })
    logging.info(f"Compiled {len(detailed_mentions_output)} detailed mentions with specified fields.")
    return detailed_mentions_output


def prepare_mentions_for_bubble(compiled_detailed_mentions_list):
    """
    Prepares mention data for uploading to a Bubble table.
    The input `compiled_detailed_mentions_list` is already in the user-specified format:
    [{"date": ..., "id": ..., "mention_text": ..., "organization_urn": ..., "sentiment_label": ...}, ...]
    This function directly uses these fields as per user's selection for Bubble upload.
    """
    li_mentions_bubble = []
    logging.info(f"Preparing {len(compiled_detailed_mentions_list)} compiled mentions for Bubble upload.")

    if not compiled_detailed_mentions_list:
        return []

    for mention_data in compiled_detailed_mentions_list:
        # The mention_data dictionary already has the keys:
        # "date", "id", "mention_text", "organization_urn", "sentiment_label"
        # These are used directly for the Bubble upload list.
        li_mentions_bubble.append({
            "date": mention_data["date"],
            "id": mention_data["id"], 
            "mention_text": mention_data["mention_text"],
            "organization_urn": mention_data["organization_urn"],
            "sentiment_label": mention_data["sentiment_label"]
            # If Bubble table has different field names, mapping would be done here.
            # Example: "bubble_mention_date": mention_data["date"],
            # For now, using direct mapping as per user's selected code for the append.
        })
    
    logging.info(f"Prepared {len(li_mentions_bubble)} mention entries for Bubble, using direct field names from compiled data.")
    return li_mentions_bubble