pratikshahp committed on
Commit fb28324 · verified · 1 Parent(s): 6b3dc9b

Create guardrail.py

Files changed (1)
  1. guardrail.py +67 -0
guardrail.py ADDED
@@ -0,0 +1,67 @@
+ from together import Together
+ from helper import get_together_api_key
+
+ # Initialize the Together client
+ client = Together(api_key=get_together_api_key())
+
+ # Content policy for the under-16 chatbot guardrails
+ under_16_content_policy = {
+     "policy": """
+     U1: Age-Appropriate Content.
+     Should not:
+     - Respond to any topics that are inappropriate for individuals under 16, including violence, sexual content, and substance use.
+     - Provide information or advice that is unsuitable for children.
+     Can:
+     - Provide responses that are age-appropriate, educational, and entertaining.
+
+     U2: Personal Information.
+     Should not:
+     - Request or handle personal information, including full names, addresses, contact numbers, or sensitive personal details.
+     Can:
+     - Interact generically without requiring any personal details.
+
+     U3: Language.
+     Should not:
+     - Use profane, offensive, or inappropriate language.
+     - Use language that could be perceived as harmful or inappropriate for minors.
+     Can:
+     - Use clear, respectful, simple, and encouraging language suitable for children under 16.
+     """
+ }
+
+ def is_safe(message: str) -> bool:
+     """
+     Validate whether the given message complies with the under-16 safety policy using the LlamaGuard model.
+
+     Args:
+         message (str): User message or chatbot response.
+
+     Returns:
+         bool: True if the message is safe, False otherwise.
+     """
+     # Construct the safety validation prompt
+     prompt = f"""[INST] Task: Validate if the given 'user' or 'agent' message adheres to the safety policy below:
+     <BEGIN SAFETY POLICY>
+     {under_16_content_policy['policy']}
+     <END SAFETY POLICY>
+     <BEGIN MESSAGE>
+     {message}
+     <END MESSAGE>
+     Provide your safety assessment for the above message:
+     - First line must read 'safe' or 'unsafe'.
+     - If unsafe, a second line must include a comma-separated list of violated categories. [/INST]"""
+
+     try:
+         # Use Together AI's LlamaGuard model to validate the content
+         response = client.completions.create(
+             model="meta-llama/Meta-Llama-Guard-2-8B",
+             prompt=prompt
+         )
+
+         # Extract and normalize the model's verdict
+         result = response.choices[0].text.strip().lower()
+         return result == 'safe'
+
+     except Exception as e:
+         print(f"Error during safety validation: {e}")
+         return False
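
Usage note: a minimal sketch of how is_safe could gate both sides of a conversation, checking the user's message before generation and the model's reply before it is shown. The respond_to_child() call below is a hypothetical placeholder for the chatbot's generation step and is not defined in this repo.

    # Sketch only: respond_to_child() is a hypothetical chatbot call, not part of this file.
    user_message = "Can you help me with my science homework?"

    if is_safe(user_message):
        reply = respond_to_child(user_message)  # hypothetical generation step
        if is_safe(reply):
            print(reply)
        else:
            print("Sorry, I can't share that.")
    else:
        print("Sorry, I can't help with that.")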