|
from types import ModuleType
|
|
import math
|
|
from langchain.llms import OpenAI
|
|
import requests
|
|
import requests.models
|
|
|
|
|
|
import requests
|
|
from hamilton.function_modifiers import config
|
|
|
|
|
|
|
|
|
|
def enforcement_prompt(user_input: str, violation_context: dict) -> str:
    """Build the prompt sent to the LLM to pick an enforcement action.

    Combines the user's content with moderation context (DIO match, extremism
    flag, aspect sentiment) and the platform's DIO policy summary, then asks
    the model to choose one of four enforcement actions.

    Parameters
    ----------
    user_input:
        The raw user content under review.
    violation_context:
        Moderation context dict. Recognized keys (all optional, with
        fallbacks): ``entity_name``, ``entity_info`` (a dict with ``Summary``
        and ``Policy Category``), ``extremism_detected``, ``aspect_sentiment``.

    Returns
    -------
    str
        The fully assembled prompt text.
    """
    entity_info = violation_context.get("entity_info", {})
    dio_name = violation_context.get("entity_name", "None")
    # NOTE: fetched for completeness/parity with the context schema; not
    # currently interpolated into the prompt text.
    dio_details = entity_info.get("Summary", "Unknown")
    dio_category = entity_info.get("Policy Category", "Unknown")
    radicalization_flag = violation_context.get("extremism_detected", False)
    dio_sentiment = violation_context.get("aspect_sentiment", "None")

    prompt = (
        # Bug fix: these segments previously used "\\n" (a literal
        # backslash-n in the output) instead of real newlines.
        f"Based on the following content and context, determine the appropriate enforcement action:\n\n"
        f"Content: '{user_input}'\n"
        f"Radicalization detected: {radicalization_flag}\n"
        f"DIO Mentioned: {dio_name} (Category: {dio_category})\n"
        # Bug fix: a separating blank line was missing here, so the sentiment
        # value ran directly into "Policy Summary:".
        f"System's Classification of Content Sentiment Towards DIO: {dio_sentiment}\n\n"
        "Policy Summary: Any entity mentioned in the 'DIO Mentioned' field is from the Dangerous Individuals & Organizations (DIO) blacklist. "
        "If this entity is mentioned in the user content in a supportive, glorifying, or representative manner, it constitutes a violation of our platform's community standards. "
        "Our platform does not allow content that glorifies, supports, or represents individuals or organizations involved in violence, hate, or terrorism. "
        "These entities include terrorist groups, hate groups, violent non-state actors, and others who advocate or engage in violent activities. "
        "Any content that endorses or supports these entities or their actions is a policy violation. "
        "Users may discuss these topics neutrally or in a condemnatory manner, but any positive portrayal is prohibited. "
        "Based on this policy, choose one of the following enforcement actions: Warning, Limited Visibility, Temporary Suspension, Permanent Ban. "
        "Provide a brief explanation for your choice."
    )
    return prompt
|
|
|
|
def get_enforcement_decision(
    enforcement_prompt: str,
    mistral_public_url: str,
    *,
    timeout: float = 60.0,
) -> dict:
    """Send the enforcement prompt to the Mistral model server and return its decision.

    Parameters
    ----------
    enforcement_prompt:
        The prompt text produced by :func:`enforcement_prompt`.
    mistral_public_url:
        Base URL of the Mistral inference server (no trailing slash expected).
    timeout:
        Seconds to wait for the server before failing. Bug fix: the original
        call had no timeout and could hang the pipeline indefinitely on a
        stalled server.

    Returns
    -------
    dict
        ``{"enforcement_action": <raw model reply, stripped>, "prompt": <prompt sent>}``.

    Raises
    ------
    requests.exceptions.RequestException
        On connection failure or timeout.
    """
    input_text = {
        "context": enforcement_prompt,
        "question": "What is the appropriate enforcement action?",
    }

    response = requests.post(
        f"{mistral_public_url}/mistral-inference",
        json=input_text,
        stream=False,
        timeout=timeout,
    )

    # NOTE(review): non-2xx responses are not raised on — the error body would
    # be returned as the "action". Consider response.raise_for_status(); left
    # unchanged to preserve existing caller behavior.
    return {
        "enforcement_action": response.text.strip(),
        "prompt": enforcement_prompt,
    }
|
|
|
|
|
|
|