Spaces:

GuglielmoTor
/

LinkedinMonitor

Running

App Files Files Community

LinkedinMonitor / posts_categorization.py

GuglielmoTor

Update posts_categorization.py

8fc987e verified 7 months ago

raw

history blame

4.81 kB

	import pandas as pd
	from groq import Groq, RateLimitError
	import instructor
	from pydantic import BaseModel
	import os

	# Groq credentials are taken from the environment; the lookup gives None
	# when GROQ_API_KEY is not set.
	api_key = os.environ.get('GROQ_API_KEY')

	# One shared Groq client, wrapped by instructor in JSON mode so that the
	# completion calls below can pass a response_model and read typed fields
	# from the result.
	client = instructor.from_groq(
	    Groq(api_key=api_key),
	    mode=instructor.Mode.JSON,
	)

	# Structured response requested from the summarization calls: a single
	# string field, `summary`, holding the short summary text.
	class SummaryOutput(BaseModel):
	    summary: str

	# Pydantic schema for the classification output: a single string field,
	# `category`, carrying the category name returned by the model.
	class ClassificationOutput(BaseModel):
	    category: str

	# Model identifier that summarize_post tries first.
	PRIMARY_SUMMARIZER_MODEL = "deepseek-r1-distill-llama-70b"
	# Backup model identifier that summarize_post can fall back to after the primary.
	FALLBACK_SUMMARIZER_MODEL = "llama-3.3-70b-versatile"

	# Send one summarization request to a given model
	def _request_summary(model, prompt):
	    """Request a structured summary of `prompt` from `model`.

	    Uses the module-level `client` with `SummaryOutput` as the response
	    model and returns the `summary` field of the result. Any exception
	    raised by the client call propagates to the caller.
	    """
	    response = client.chat.completions.create(
	        model=model,
	        response_model=SummaryOutput,
	        messages=[
	            {"role": "system", "content": "You are a precise summarizer. Only return a JSON object with a 'summary' string."},
	            {"role": "user", "content": prompt},
	        ],
	        temperature=0.3,
	    )
	    return response.summary


	# Summarize post text
	def summarize_post(text):
	    """Return a 5-10 word summary of a LinkedIn post, or None.

	    Args:
	        text: The post text. None, NaN/NA values and blank strings are
	            treated as "nothing to summarize".

	    Returns:
	        The summary string produced by the model, or None when the input
	        is missing/blank or when no model produced a summary.

	    The primary model is tried first. The fallback model is tried only
	    when the primary raises RateLimitError; any other primary failure is
	    logged and yields None. Errors are printed, never raised.
	    """
	    if text is None or pd.isna(text):
	        return None

	    text = str(text)[:500]  # truncate to avoid token overflow
	    if not text.strip():
	        # An empty post would only make the model invent a summary, so
	        # skip the request entirely.
	        return None

	    prompt = f"""
	Summarize the following LinkedIn post in 5 to 10 words.
	Only return the summary inside a JSON field called 'summary'.

	Post Text:
	\"\"\"{text}\"\"\"
	"""

	    try:
	        # Attempt with primary model
	        print(f"Attempting summarization with primary model: {PRIMARY_SUMMARIZER_MODEL}")
	        return _request_summary(PRIMARY_SUMMARIZER_MODEL, prompt)
	    except RateLimitError:
	        # Only a rate limit on the primary model leads to the fallback below.
	        print(f"Rate limit hit for primary summarizer model: {PRIMARY_SUMMARIZER_MODEL}. Trying fallback: {FALLBACK_SUMMARIZER_MODEL}")
	    except Exception as e_primary:
	        print(f"Error during summarization with primary model ({PRIMARY_SUMMARIZER_MODEL}): {e_primary}")
	        # You could also try fallback here for non-rate-limit errors if desired
	        return None

	    try:
	        # Attempt with fallback model
	        summary = _request_summary(FALLBACK_SUMMARIZER_MODEL, prompt)
	    except RateLimitError as rle_fallback:
	        print(f"Rate limit hit for fallback summarizer model ({FALLBACK_SUMMARIZER_MODEL}): {rle_fallback}. Summarization failed.")
	        return None
	    except Exception as e_fallback:
	        print(f"Error during summarization with fallback model ({FALLBACK_SUMMARIZER_MODEL}): {e_fallback}")
	        return None

	    print(f"Summarization successful with fallback model: {FALLBACK_SUMMARIZER_MODEL}")
	    return summary



	# Classify post summary into structured categories
	def classify_post(summary, labels):
	    """Pick the single best-matching category for a post summary.

	    Args:
	        summary: Short summary text of a post. None, NaN/NA values and
	            blank strings are treated as "nothing to classify".
	        labels: Iterable of candidate category names.

	    Returns:
	        The category string returned by the model (the prompt asks for
	        'None' when nothing applies), or Python None when the summary is
	        missing/blank, when there are no labels, or when the request
	        fails. Request errors are printed, never raised.
	    """
	    if summary is None or pd.isna(summary):
	        return None

	    labels = list(labels) if labels is not None else []
	    if not labels or not str(summary).strip():
	        # Without a summary or any candidate there is nothing to ask.
	        return None

	    # Labels may themselves contain commas, so a comma-joined list would
	    # blur the category boundaries; list one category per line instead.
	    category_list = "\n".join(f"- {label}" for label in labels)

	    prompt = f"""
	Post Summary: "{summary}"

	Available Categories:
	{category_list}

	Task: Choose the single most relevant category from the list above that applies to this summary. Return only one category in a structured JSON format under the field 'category'.
	If no category applies, return 'None'.
	"""
	    try:
	        result = client.chat.completions.create(
	            model="meta-llama/llama-4-maverick-17b-128e-instruct",
	            response_model=ClassificationOutput,
	            messages=[
	                {"role": "system", "content": "You are a strict classifier. Return only one matching category name under the field 'category'."},
	                {"role": "user", "content": prompt},
	            ],
	            temperature=0,
	        )
	        return result.category
	    except Exception as e:
	        print(f"Classification error: {e}")
	        return None

	def summarize_and_classify_post(text, labels):
	    """Summarize one post and classify the summary when there is one.

	    Returns a dict with the keys "summary" and "category". The category
	    is None whenever the summary is falsy (None or an empty string), in
	    which case classify_post is not called.
	    """
	    post_summary = summarize_post(text)
	    if post_summary:
	        post_category = classify_post(post_summary, labels)
	    else:
	        post_category = None
	    return {"summary": post_summary, "category": post_category}

	# Categories offered to the classifier when the caller supplies none.
	_DEFAULT_LABELS = (
	    "Company Culture and Values",
	    "Employee Stories and Spotlights",
	    "Work-Life Balance, Flexibility, and Well-being",
	    "Diversity, Equity, and Inclusion (DEI)",
	    "Professional Development and Growth Opportunities",
	    "Mission, Vision, and Social Responsibility",
	    "None",
	)


	def batch_summarize_and_classify(posts, labels=None):
	    """Summarize and classify every post in `posts`.

	    Args:
	        posts: Iterable of mapping-like objects; each post's text is read
	            with `post.get("text")`.
	        labels: Optional iterable of category names. When omitted, the
	            built-in default category list is used.

	    Returns:
	        A list with one result per post, in input order, each being the
	        dict returned by summarize_and_classify_post.
	    """
	    # Materialize once so the same list is handed to every classification.
	    chosen_labels = list(_DEFAULT_LABELS if labels is None else labels)
	    return [
	        summarize_and_classify_post(post.get("text"), chosen_labels)
	        for post in posts
	    ]