LinkedinMonitor / posts_categorization.py
GuglielmoTor's picture
Update posts_categorization.py
8fc987e verified
raw
history blame
4.81 kB
import pandas as pd
from groq import Groq, RateLimitError
import instructor
from pydantic import BaseModel
import os
# Groq credential taken from the environment (None when GROQ_API_KEY is unset).
api_key = os.environ.get('GROQ_API_KEY')
# Single shared Groq client, wrapped by instructor in JSON mode so that
# completions can be requested with a pydantic `response_model`.
client = instructor.from_groq(
    Groq(api_key=api_key),
    mode=instructor.Mode.JSON,
)
# Structured-output schema for summarization: one `summary` string.
# Documented with comments rather than a docstring on purpose: a pydantic
# model's docstring becomes the `description` of its generated JSON schema,
# which would change what is sent along with the request.
class SummaryOutput(BaseModel):
    summary: str
# Structured-output schema for classification: one `category` string.
# Documented with comments rather than a docstring on purpose: a pydantic
# model's docstring becomes the `description` of its generated JSON schema,
# which would change what is sent along with the request.
class ClassificationOutput(BaseModel):
    category: str
# Model that summarize_post tries first.
PRIMARY_SUMMARIZER_MODEL = "deepseek-r1-distill-llama-70b"
# Model that summarize_post retries with when the primary model raises
# RateLimitError.
FALLBACK_SUMMARIZER_MODEL = "llama-3.3-70b-versatile"
# Summarize post text
def _request_summary(model, prompt):
    """Request a structured summary of *prompt* from *model*.

    Returns the ``summary`` string of the parsed ``SummaryOutput``. Any
    exception raised by the client propagates to the caller.
    """
    response = client.chat.completions.create(
        model=model,
        response_model=SummaryOutput,
        messages=[
            {"role": "system", "content": "You are a precise summarizer. Only return a JSON object with a 'summary' string."},
            {"role": "user", "content": prompt},
        ],
        temperature=0.3,
    )
    return response.summary


def summarize_post(text):
    """Summarize a LinkedIn post in 5 to 10 words.

    The primary model is tried first; if it raises ``RateLimitError`` the
    same request is retried once with the fallback model. Any other error
    on the primary model is reported and not retried.

    Args:
        text: The post text. ``None``, NaN-like scalars and empty or
            whitespace-only text are treated as "nothing to summarize".
            Other values are converted with ``str()`` and truncated to
            500 characters.

    Returns:
        The summary string, or ``None`` when there is nothing to summarize
        or every attempted request failed.
    """
    # Test None first and only apply pd.isna to scalars: on list-like input
    # pd.isna returns an array whose truth value is ambiguous.
    if text is None or (pd.api.types.is_scalar(text) and pd.isna(text)):
        return None
    text = str(text)
    if not text.strip():
        # Blank post: skip the API call instead of asking the model to
        # summarize nothing.
        return None
    text = text[:500]  # truncate to avoid token overflow
    prompt = f"""
Summarize the following LinkedIn post in 5 to 10 words.
Only return the summary inside a JSON field called 'summary'.
Post Text:
\"\"\"{text}\"\"\"
"""
    # NOTE(review): assumes the instructor-patched client surfaces groq's
    # RateLimitError unchanged -- confirm, otherwise the fallback never runs.
    try:
        print(f"Attempting summarization with primary model: {PRIMARY_SUMMARIZER_MODEL}")
        return _request_summary(PRIMARY_SUMMARIZER_MODEL, prompt)
    except RateLimitError:
        print(f"Rate limit hit for primary summarizer model: {PRIMARY_SUMMARIZER_MODEL}. Trying fallback: {FALLBACK_SUMMARIZER_MODEL}")
    except Exception as e_primary:
        print(f"Error during summarization with primary model ({PRIMARY_SUMMARIZER_MODEL}): {e_primary}")
        return None

    # Only reached after a rate limit on the primary model.
    try:
        summary = _request_summary(FALLBACK_SUMMARIZER_MODEL, prompt)
    except RateLimitError as rle_fallback:
        print(f"Rate limit hit for fallback summarizer model ({FALLBACK_SUMMARIZER_MODEL}): {rle_fallback}. Summarization failed.")
        return None
    except Exception as e_fallback:
        print(f"Error during summarization with fallback model ({FALLBACK_SUMMARIZER_MODEL}): {e_fallback}")
        return None
    print(f"Summarization successful with fallback model: {FALLBACK_SUMMARIZER_MODEL}")
    return summary
# Classify post summary into structured categories
def classify_post(summary, labels):
    """Pick the single most relevant label for a post summary.

    Args:
        summary: Short summary of the post. ``None`` or a NaN-like value
            means there is nothing to classify.
        labels: Iterable of category-name strings offered to the model.

    Returns:
        The ``category`` string returned by the model, or ``None`` when the
        summary is missing or the request raised an exception.
    """
    if summary is None or pd.isna(summary):
        return None

    prompt = f"""
Post Summary: "{summary}"
Available Categories:
{', '.join(labels)}
Task: Choose the single most relevant category from the list above that applies to this summary. Return only one category in a structured JSON format under the field 'category'.
If no category applies, return 'None'.
"""
    chat_messages = [
        {"role": "system", "content": "You are a strict classifier. Return only one matching category name under the field 'category'."},
        {"role": "user", "content": prompt},
    ]
    try:
        classification = client.chat.completions.create(
            model="meta-llama/llama-4-maverick-17b-128e-instruct",
            response_model=ClassificationOutput,
            messages=chat_messages,
            temperature=0,
        )
        return classification.category
    except Exception as err:
        # Best effort: report the failure and leave the post unclassified.
        print(f"Classification error: {err}")
        return None
def summarize_and_classify_post(text, labels):
    """Summarize one post and classify the resulting summary.

    Args:
        text: Raw post text, passed to ``summarize_post``.
        labels: Category names, passed to ``classify_post``.

    Returns:
        A dict with keys ``"summary"`` and ``"category"``. ``"category"``
        is ``None`` whenever the summary is falsy (``None`` or empty).
    """
    summary = summarize_post(text)
    if not summary:
        # No usable summary, so classification is skipped.
        return {"summary": summary, "category": None}
    return {"summary": summary, "category": classify_post(summary, labels)}
# Categories used when the caller does not supply its own label list.
DEFAULT_POST_LABELS = (
    "Company Culture and Values",
    "Employee Stories and Spotlights",
    "Work-Life Balance, Flexibility, and Well-being",
    "Diversity, Equity, and Inclusion (DEI)",
    "Professional Development and Growth Opportunities",
    "Mission, Vision, and Social Responsibility",
    "None",
)


def batch_summarize_and_classify(posts, labels=None):
    """Summarize and classify every post in *posts*, in order.

    Args:
        posts: Iterable of mapping-like objects; each post's text is read
            with ``post.get("text")``.
        labels: Optional iterable of category names. When omitted, the
            built-in ``DEFAULT_POST_LABELS`` are used, which matches the
            previous hard-coded behavior.

    Returns:
        A list with one ``{"summary": ..., "category": ...}`` dict per post,
        as produced by ``summarize_and_classify_post``.
    """
    # Materialize once so a one-shot iterable of labels can be reused for
    # every post, and so the shared default tuple is never handed out.
    active_labels = list(DEFAULT_POST_LABELS) if labels is None else list(labels)
    return [
        summarize_and_classify_post(post.get("text"), active_labels)
        for post in posts
    ]