Spaces:

GuglielmoTor
/

LinkedinMonitor

Running

App Files Community

GuglielmoTor committed on May 9

Commit

75a5661

verified ·

1 Parent(s): fff82df

Update posts_categorization.py

Browse files

Files changed (1) hide show

posts_categorization.py +120 -45

posts_categorization.py CHANGED Viewed

@@ -4,34 +4,59 @@ import instructor
 from pydantic import BaseModel
 import os
 api_key = os.getenv('GROQ_API_KEY')
 # Create single patched Groq client with instructor for structured output
 client = instructor.from_groq(Groq(api_key=api_key), mode=instructor.Mode.JSON)
 class SummaryOutput(BaseModel):
     summary: str
-# Define pydantic schema for classification output
 class ClassificationOutput(BaseModel):
     category: str
 PRIMARY_SUMMARIZER_MODEL = "deepseek-r1-distill-llama-70b"
 FALLBACK_SUMMARIZER_MODEL = "llama-3.3-70b-versatile"
-# Summarize post text
-def summarize_post(text):
-    if pd.isna(text) or text is None:
         return None
-    text = str(text)[:500]  # truncate to avoid token overflow
     prompt = f"""
     Summarize the following LinkedIn post in 5 to 10 words.
     Only return the summary inside a JSON field called 'summary'.
     Post Text:
-    \"\"\"{text}\"\"\"
     """
     try:
@@ -43,7 +68,7 @@ def summarize_post(text):
             messages=[
                 {"role": "system", "content": "You are a precise summarizer. Only return a JSON object with a 'summary' string."},
                 {"role": "user", "content": prompt}
-                ],
             temperature=0.3
         )
         return response.summary
@@ -57,8 +82,8 @@ def summarize_post(text):
                 messages=[
                     {"role": "system", "content": "You are a precise summarizer. Only return a JSON object with a 'summary' string."},
                     {"role": "user", "content": prompt}
-                    ],
-                temperature=0.3 # Keep temperature consistent or adjust as needed for fallback
             )
             print(f"Summarization successful with fallback model: {FALLBACK_SUMMARIZER_MODEL}")
             return response.summary
@@ -70,63 +95,113 @@ def summarize_post(text):
             return None
     except Exception as e_primary:
         print(f"Error during summarization with primary model ({PRIMARY_SUMMARIZER_MODEL}): {e_primary}")
-        # You could also try fallback here for non-rate-limit errors if desired
         return None
-# Classify post summary into structured categories
-def classify_post(summary, labels):
-    if pd.isna(summary) or summary is None:
-        return None
     prompt = f"""
     Post Summary: "{summary}"
     Available Categories:
-    {', '.join(labels)}
-    Task: Choose the single most relevant category from the list above that applies to this summary. Return only one category in a structured JSON format under the field 'category'.
-    If no category applies, return 'None'.
     """
     try:
         result = client.chat.completions.create(
-            model="meta-llama/llama-4-maverick-17b-128e-instruct",
             response_model=ClassificationOutput,
             messages=[
-                {"role": "system", "content": "You are a strict classifier. Return only one matching category name under the field 'category'."},
                 {"role": "user", "content": prompt}
             ],
-            temperature=0
         )
-        return result.category
     except Exception as e:
-        print(f"Classification error: {e}")
-        return None
-def summarize_and_classify_post(text, labels):
-    summary = summarize_post(text)
-    category = classify_post(summary, labels) if summary else None
     return {
-        "summary": summary,
-        "category": category
     }
-def batch_summarize_and_classify(posts):
-    labels = [
-    "Company Culture and Values",
-    "Employee Stories and Spotlights",
-    "Work-Life Balance, Flexibility, and Well-being",
-    "Diversity, Equity, and Inclusion (DEI)",
-    "Professional Development and Growth Opportunities",
-    "Mission, Vision, and Social Responsibility",
-    "None"
-    ]
     results = []
-    for post in posts:
-        text = post.get("text")
-        result = summarize_and_classify_post(text, labels)
-        results.append(result)
-    return results

 from pydantic import BaseModel
 import os
+# Ensure GROQ_API_KEY is set in your environment variables
 api_key = os.getenv('GROQ_API_KEY')
+if not api_key:
+    raise ValueError("GROQ_API_KEY environment variable not set.")
 # Create single patched Groq client with instructor for structured output
+# Using Mode.JSON for structured output based on Pydantic models
 client = instructor.from_groq(Groq(api_key=api_key), mode=instructor.Mode.JSON)
+# Pydantic model for summarization output
 class SummaryOutput(BaseModel):
     summary: str
+# Pydantic model for classification output
 class ClassificationOutput(BaseModel):
     category: str
+# Define model names (as per your original code)
 PRIMARY_SUMMARIZER_MODEL = "deepseek-r1-distill-llama-70b"
 FALLBACK_SUMMARIZER_MODEL = "llama-3.3-70b-versatile"
+CLASSIFICATION_MODEL = "meta-llama/llama-4-maverick-17b-128e-instruct" # Or your preferred classification model
+# Define the standard list of categories, including "None"
+CLASSIFICATION_LABELS = [
+    "Company Culture and Values",
+    "Employee Stories and Spotlights",
+    "Work-Life Balance, Flexibility, and Well-being",
+    "Diversity, Equity, and Inclusion (DEI)",
+    "Professional Development and Growth Opportunities",
+    "Mission, Vision, and Social Responsibility",
+    "None"  # Represents no applicable category or cases where classification isn't possible
+]
+def summarize_post(text: str) -> str | None:
+    """
+    Summarizes the given post text using a primary model with a fallback.
+    Returns the summary string or None if summarization fails or input is invalid.
+    """
+    # Check for NaN, None, or empty/whitespace-only string
+    if pd.isna(text) or text is None or not str(text).strip():
+        print("Summarizer: Input text is empty or None. Returning None.")
         return None
+    # Truncate text to a reasonable length to avoid token overflow and reduce costs
+    processed_text = str(text)[:500]
     prompt = f"""
     Summarize the following LinkedIn post in 5 to 10 words.
     Only return the summary inside a JSON field called 'summary'.
     Post Text:
+    \"\"\"{processed_text}\"\"\"
     """
     try:
             messages=[
                 {"role": "system", "content": "You are a precise summarizer. Only return a JSON object with a 'summary' string."},
                 {"role": "user", "content": prompt}
+            ],
             temperature=0.3
         )
         return response.summary
                 messages=[
                     {"role": "system", "content": "You are a precise summarizer. Only return a JSON object with a 'summary' string."},
                     {"role": "user", "content": prompt}
+                ],
+                temperature=0.3
             )
             print(f"Summarization successful with fallback model: {FALLBACK_SUMMARIZER_MODEL}")
             return response.summary
             return None
     except Exception as e_primary:
         print(f"Error during summarization with primary model ({PRIMARY_SUMMARIZER_MODEL}): {e_primary}")
+        # Consider if fallback should be attempted for other errors too, or just return None
         return None
+def classify_post(summary: str | None, labels: list[str]) -> str:
+    """
+    Classifies the post summary into one of the provided labels.
+    Ensures the returned category is one of the labels, defaulting to "None".
+    """
+    # If the summary is None (e.g., from a failed summarization or empty input),
+    # or if the summary is an empty string after stripping, classify as "None".
+    if pd.isna(summary) or summary is None or not str(summary).strip():
+        print("Classifier: Input summary is empty or None. Returning 'None' category.")
+        return "None" # Return the string "None" to match the label
+    # Join labels for the prompt to ensure the LLM knows the exact expected strings
+    labels_string = "', '".join(labels)
     prompt = f"""
     Post Summary: "{summary}"
     Available Categories:
+    '{labels_string}'
+    Task: Choose the single most relevant category from the list above that applies to this summary.
+    Return ONLY ONE category string in a structured JSON format under the field 'category'.
+    The category MUST be one of the following: '{labels_string}'.
+    If no specific category applies, or if you are unsure, return "None".
     """
     try:
+        system_message = (
+            f"You are a very strict classifier. Your ONLY job is to return a JSON object "
+            f"with a 'category' field. The value of 'category' MUST be one of these "
+            f"exact strings: '{labels_string}'."
+        )
         result = client.chat.completions.create(
+            model=CLASSIFICATION_MODEL,
             response_model=ClassificationOutput,
             messages=[
+                {"role": "system", "content": system_message},
                 {"role": "user", "content": prompt}
             ],
+            temperature=0 # Temperature 0 for deterministic classification
         )
+        returned_category = result.category
+        # Validate the output against the provided labels
+        if returned_category not in labels:
+            print(f"Warning: Classifier returned '{returned_category}', which is not in the predefined labels. Forcing to 'None'. Summary: '{summary}'")
+            return "None" # Force to "None" if the LLM returns an unexpected category
+        return returned_category
     except Exception as e:
+        print(f"Classification error: {e}. Summary: '{summary}'. Defaulting to 'None' category.")
+        return "None" # Default to "None" on any exception during classification
def summarize_and_classify_post(text: str | None, labels: list[str]) -> dict:
    """
    Summarize one post and classify the resulting summary.

    Args:
        text: Raw post text; passed unchanged to summarize_post.
        labels: Category names forwarded to classify_post.

    Returns:
        A dict with two keys: "summary" (the summarizer's result, possibly
        None) and "category" (the classifier's result, or the string "None"
        when there is no usable summary).
    """
    summary = summarize_post(text)  # May be None on empty input or failure

    # Only a non-blank summary is worth sending to the classifier.
    usable = summary is not None and bool(summary.strip())
    category = classify_post(summary, labels) if usable else "None"

    return {"summary": summary, "category": category}
+def batch_summarize_and_classify(posts_data: list[dict]) -> list[dict]:
+    """
+    Processes a batch of posts, performing summarization and classification for each.
+    Expects posts_data to be a list of dictionaries, each with at least 'id' and 'text' keys.
+    Returns a list of dictionaries, each with 'id', 'summary', and 'category'.
+    """
     results = []
+    if not posts_data:
+        print("Input 'posts_data' is empty. Returning empty results.")
+        return results
+    for i, post_item in enumerate(posts_data):
+        if not isinstance(post_item, dict):
+            print(f"Warning: Item at index {i} is not a dictionary. Skipping.")
+            continue
+        post_id = post_item.get("id")
+        text_to_process = post_item.get("text") # This text is passed to summarize_and_classify_post
+        print(f"\nProcessing Post ID: {post_id if post_id else 'N/A (ID missing)'}, Text: '{str(text_to_process)[:50]}...'")
+        # summarize_and_classify_post will handle None/empty text internally
+        # and ensure category is "None" in such cases.
+        summary_and_category_result = summarize_and_classify_post(text_to_process, CLASSIFICATION_LABELS)
+        results.append({
+            "id": post_id, # Include the ID for mapping back to original data
+            "summary": summary_and_category_result["summary"],
+            "category": summary_and_category_result["category"] # This is now validated
+        })
+        print(f"Result for Post ID {post_id}: Summary='{summary_and_category_result['summary']}', Category='{summary_and_category_result['category']}'")
+    return results