Commit 0f6a5d5 (verified) by pratikshahp
1 Parent(s): 08a91fa

Create guardrails.py

Files changed (1):
  1. guardrails.py +60 -0
guardrails.py ADDED
@@ -0,0 +1,60 @@
+ from together import Together
+ from helper import get_together_api_key
+
+ # Initialize Together client
+ client = Together(api_key=get_together_api_key())
+
+ # Set the content policy for guardrails
+ safe_content_policy = {
+     "policy": """
+ O1: Topic Restriction.
+ Should not:
+ - Respond to questions outside the domain of finance.
+ - Discuss non-financial topics such as health, technology, or politics.
+ Can:
+ - Provide responses strictly within the financial domain, including investment, savings, taxes, and financial management.
+
+ O2: Sensitive Information.
+ Should not:
+ - Request or handle sensitive personal data such as passwords, account numbers, or social security numbers.
+ Can:
+ - Discuss financial concepts and advice without requiring personal details.
+
+ O3: Language.
+ Should not:
+ - Use profane, offensive, or inappropriate language.
+ Can:
+ - Use clear, professional, and respectful language appropriate for financial discussions.
+ """
+ }
+
+ def is_safe(message: str) -> bool:
+     """
+     Validate whether the given message complies with the safety policy using the LlamaGuard model.
+
+     Args:
+         message (str): User message or chatbot response.
+
+     Returns:
+         bool: True if the message is safe, False otherwise.
+     """
+     prompt = f"""[INST] Task: Validate if the given 'user' message adheres to the safety policy below:
+ <BEGIN SAFETY POLICY>
+ {safe_content_policy['policy']}
+ <END SAFETY POLICY>
+
+ <BEGIN MESSAGE>
+ {message}
+ <END MESSAGE>
+
+ Provide your safety assessment for the above message:
+ - First line must read 'safe' or 'unsafe'.
+ - If unsafe, a second line must include a comma-separated list of violated categories. [/INST]"""
+
+     response = client.completions.create(
+         model="Meta-Llama/LlamaGuard-3-8b",
+         prompt=prompt
+     )
+     # Per the prompt instructions, the first line of the model's output should read 'safe' or 'unsafe'.
+     result = response.choices[0].text.strip().lower()
+     return result.split("\n")[0].strip() == "safe"
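
For context, a minimal usage sketch (not part of this commit) showing how is_safe could gate a chatbot exchange. It assumes guardrails.py is importable from the working directory and that a valid Together API key is available via helper.get_together_api_key(); the example message and printed responses are illustrative only.

from guardrails import is_safe

user_message = "How should I split my savings between stocks and bonds?"

if is_safe(user_message):
    # The message stays within the finance-only policy; forward it to the chatbot.
    print("Message passed the guardrail check.")
else:
    # Policy violation (off-topic, sensitive data, or inappropriate language); refuse politely.
    print("Sorry, I can only help with finance-related questions.")

The same check can be applied to the chatbot's own responses before they are shown to the user, since is_safe accepts either a user message or a model output.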