File size: 4,813 Bytes
16353a0
87232cf
16353a0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1ef06de
 
 
16353a0
 
 
 
 
4004355
16353a0
 
 
 
 
 
 
 
 
 
1ef06de
 
16353a0
1ef06de
16353a0
8fc987e
482c776
 
 
16353a0
 
 
1ef06de
 
 
 
 
 
 
8fc987e
482c776
 
 
1ef06de
 
 
 
 
 
 
 
 
 
 
 
 
16353a0
 
 
1ef06de
16353a0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1d55124
16353a0
 
 
 
 
 
 
 
 
 
 
 
 
 
ae41008
16353a0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import pandas as pd
from groq import Groq, RateLimitError
import instructor
from pydantic import BaseModel
import os

api_key = os.getenv('GROQ_API_KEY')  # None when unset; Groq() will then fail at request time

# Create single patched Groq client with instructor for structured output
# (JSON mode makes instructor parse responses into the pydantic models below)
client = instructor.from_groq(Groq(api_key=api_key), mode=instructor.Mode.JSON)

class SummaryOutput(BaseModel):
    """Structured LLM output for summarization: a single 'summary' string."""
    summary: str

# Define pydantic schema for classification output
class ClassificationOutput(BaseModel):
    """Structured LLM output for classification: a single 'category' string."""
    category: str

# Reasoning model tried first; summarize_post falls back to the plain
# instruct model when the primary hits a rate limit.
PRIMARY_SUMMARIZER_MODEL = "deepseek-r1-distill-llama-70b" 
FALLBACK_SUMMARIZER_MODEL = "llama-3.3-70b-versatile"

# Summarize post text
def _request_summary(model, prompt):
    """Issue one structured summarization request against *model*.

    Returns a SummaryOutput. Exceptions (including RateLimitError) propagate
    to the caller so it can decide whether to fall back to another model.
    """
    return client.chat.completions.create(
        model=model,
        response_model=SummaryOutput,
        messages=[
            {"role": "system", "content": "You are a precise summarizer. Only return a JSON object with a 'summary' string."},
            {"role": "user", "content": prompt},
        ],
        temperature=0.3,
    )


def summarize_post(text):
    """Summarize a LinkedIn post in 5 to 10 words via the Groq API.

    Tries PRIMARY_SUMMARIZER_MODEL first; on a rate limit, retries once with
    FALLBACK_SUMMARIZER_MODEL. Returns the summary string, or None when the
    input is missing/NaN or every attempt fails.
    """
    if pd.isna(text) or text is None:
        return None

    text = str(text)[:500]  # truncate to avoid token overflow

    prompt = f"""
    Summarize the following LinkedIn post in 5 to 10 words.
    Only return the summary inside a JSON field called 'summary'.

    Post Text:
    \"\"\"{text}\"\"\"
    """

    try:
        # Attempt with primary model
        print(f"Attempting summarization with primary model: {PRIMARY_SUMMARIZER_MODEL}")
        return _request_summary(PRIMARY_SUMMARIZER_MODEL, prompt).summary
    except RateLimitError:
        print(f"Rate limit hit for primary summarizer model: {PRIMARY_SUMMARIZER_MODEL}. Trying fallback: {FALLBACK_SUMMARIZER_MODEL}")
        try:
            # Attempt with fallback model (same request, different model id)
            response = _request_summary(FALLBACK_SUMMARIZER_MODEL, prompt)
            print(f"Summarization successful with fallback model: {FALLBACK_SUMMARIZER_MODEL}")
            return response.summary
        except RateLimitError as rle_fallback:
            print(f"Rate limit hit for fallback summarizer model ({FALLBACK_SUMMARIZER_MODEL}): {rle_fallback}. Summarization failed.")
            return None
        except Exception as e_fallback:
            print(f"Error during summarization with fallback model ({FALLBACK_SUMMARIZER_MODEL}): {e_fallback}")
            return None
    except Exception as e_primary:
        print(f"Error during summarization with primary model ({PRIMARY_SUMMARIZER_MODEL}): {e_primary}")
        # You could also try fallback here for non-rate-limit errors if desired
        return None



# Classify post summary into structured categories
def classify_post(summary, labels, model="meta-llama/llama-4-maverick-17b-128e-instruct"):
    """Classify a post summary into exactly one category from *labels*.

    Parameters
    ----------
    summary : str or None
        Short post summary; None/NaN short-circuits to None.
    labels : list[str]
        Candidate category names offered to the model.
    model : str, optional
        Groq model id to use; defaults to the previous hard-coded classifier.

    Returns the chosen category string, or None on missing input or API error.
    """
    if pd.isna(summary) or summary is None:
        return None

    prompt = f"""
    Post Summary: "{summary}"

    Available Categories:
    {', '.join(labels)}

    Task: Choose the single most relevant category from the list above that applies to this summary. Return only one category in a structured JSON format under the field 'category'.
    If no category applies, return 'None'.
    """
    try:
        result = client.chat.completions.create(
            model=model,
            response_model=ClassificationOutput,
            messages=[
                {"role": "system", "content": "You are a strict classifier. Return only one matching category name under the field 'category'."},
                {"role": "user", "content": prompt}
            ],
            temperature=0  # deterministic: pick the single best label
        )
        return result.category
    except Exception as e:
        print(f"Classification error: {e}")
        return None

def summarize_and_classify_post(text, labels):
    """Summarize *text*, then classify the summary against *labels*.

    Returns a dict with 'summary' and 'category' keys; both are None when
    the post could not be summarized.
    """
    summary = summarize_post(text)
    if summary:
        category = classify_post(summary, labels)
    else:
        category = None
    return {
        "summary": summary,
        "category": category
    }

def batch_summarize_and_classify(posts, labels=None):
    """Summarize and classify a batch of post dicts.

    Parameters
    ----------
    posts : iterable of dict
        Each post may carry a 'text' key; a missing or None text yields a
        result with None summary and category.
    labels : list[str], optional
        Candidate categories. Defaults to the employer-branding label set
        this pipeline was built for (backward-compatible with the previous
        hard-coded list).

    Returns a list of {'summary', 'category'} dicts, one per post, in order.
    """
    if labels is None:
        labels = [
            "Company Culture and Values",
            "Employee Stories and Spotlights",
            "Work-Life Balance, Flexibility, and Well-being",
            "Diversity, Equity, and Inclusion (DEI)",
            "Professional Development and Growth Opportunities",
            "Mission, Vision, and Social Responsibility",
            "None",
        ]

    return [
        summarize_and_classify_post(post.get("text"), labels)
        for post in posts
    ]