import streamlit as st
from io import BytesIO
import time
import re
from report_gen import generate_pdf

# Set page config
st.set_page_config(
    page_title="AI Trust and Opacity Evaluation",
    page_icon="🤖",
    initial_sidebar_state="collapsed"
)
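
# NOTE: the app password is assumed to come from Streamlit secrets, e.g. an entry in
# .streamlit/secrets.toml (or the hosting platform's secrets settings) such as:
#   app_password = "choose-a-password"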

# Password protection
def check_password():
    """Return True once the correct password has been entered; otherwise render the login prompt."""

    def password_entered():
        # Validate the entered password against the app secret and clear the plaintext on success
        if st.session_state["password"] == st.secrets["app_password"]:
            st.session_state["password_correct"] = True
            del st.session_state["password"]
        else:
            st.session_state["password_correct"] = False

    if "password_correct" not in st.session_state:
        # First visit: show the password prompt
        st.markdown("\n\n")
        st.text_input("Enter the password", type="password", on_change=password_entered, key="password")
        st.divider()
        st.info("Developed by Milan Mrdenovic © IBM Norway 2024")
        return False
    elif not st.session_state["password_correct"]:
        # Wrong password: show the prompt again with an error
        st.markdown("\n\n")
        st.text_input("Enter the password", type="password", on_change=password_entered, key="password")
        st.divider()
        st.info("Developed by Milan Mrdenovic © IBM Norway 2024")
        st.error("😕 Password incorrect")
        return False
    else:
        return True

if not check_password():
    st.stop()

# Initialize session state
if 'current_page' not in st.session_state:
    st.session_state.current_page = 0
if 'answers' not in st.session_state:
    st.session_state.answers = {
        'what_evaluating': '',
        'stakeholder_groups': '',
        'intentional_concealment': '',
        'technological_literacy': '',
        'cognitive_mismatch': '',
        'trust_focus': '',
        'trust_source': '',
        'trust_warranted': '',
        'trust_conclusion': '',
        'trust_improvement': ''
    }
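
# Pages with input_type 'combined' also store a free-text answer under the key
# f"{input_key}_conclusion", which is added to st.session_state.answers at render time.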

def format_text(text):
    # Make text before a colon bold
    text = re.sub(r'(^|[\n\r])([^:\n\r]+):', r'\1**\2**:', text)
    # Make text in parentheses italic
    text = re.sub(r'\(([^)]+)\)', r'*(\1)*', text)
    return text
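
# Illustrative example: format_text("Group 1: End-users (attorneys)")
# returns "**Group 1**: End-users *(attorneys)*".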

# Define the content for each page.
# Each page dict supports: 'title', 'content' (markdown), and optionally 'input_key',
# 'input_type' ('radio' | 'text_area' | 'combined'), 'options', and 'example'.
pages = [
    {
        'title': "What Are We Evaluating?",
        'content': """
Are we evaluating the overall solution, or the role that the LLM itself plays within it?
(If you want to do both, it is best to start with the solution and then run a separate evaluation for the model as part of it.)
""",
        'input_key': 'what_evaluating',
        'input_type': 'radio',
        'options': ['Overall Solution', 'Foundation Model'],
        'example': """
Solution: Evaluate a gen.ai chatbot for extracting precedent from legal documents for a law firm.
Foundation Model: Evaluate what part Llama2, as a model in the chatbot, may play in the risks.
"""
    },
    {
        'title': "Identifying Main Stakeholder Groups",
        'content': """
We need to discover the most pressing governance barriers for our client.
Specify up to three main stakeholder groups whose trust in the solution must be established for long-term success.
""",
        'input_key': 'stakeholder_groups',
        'input_type': 'text_area',
        'example': """
Gen.ai chatbot solution for law firm example:
Group 1 - End-users (attorneys/legal counsellors)
Group 2 - IT Department (providing the tooling)
Group 3 - End-clients (the plaintiffs or defendants)
"""
    },
    {
        'title': "What is Opacity?",
        'content': """
Through this process we will look out for factors known as "Opacity". They are limiters that prevent us from making informed decisions, knowing why things happen, or taking accountability for issues that come up.
They are not just technological problems, but societal ones as well.
There are three main types we will focus on (the most common and among the most dangerous):
Intentional Concealment / Technological Literacy / Cognitive Mismatch
"""
    },
    {
        'title': "Intentional Concealment",
        'content': """
Intentional Concealment is a form of opacity that occurs when organizations prevent or reduce access to information
in order to protect their trade secrets or competitive advantages. It is always present in some shape or form with
proprietary technologies: perhaps they won't declare where they get their data, perhaps they won't declare the techniques
they use to train their models, and so on.
""",
        'input_key': 'intentional_concealment',
        'input_type': 'text_area',
        'example': """
Gen.ai chatbot solution for law firm example:
Concealment 1: They don't let people audit the training data.
Concealment 2: We can't deploy the model ourselves; we can only consume it from them via an endpoint.
Transparency 1: They published an academic paper on how the model was developed.
"""
    },
    {
        'title': "Technological Literacy",
        'content': """
Technological Literacy (often termed technological illiteracy in the literature) is the form
of opacity that most affects the general population, because understanding these technologies is not a common or easily accessible
skill. Often what a user experiences is very different from what the technology actually does. It is similar
to how a person may know how to use a smartphone, but not how to code an app for one, or engineer a phone themselves.
""",
        'input_key': 'technological_literacy',
        'input_type': 'text_area',
        'example': """
Gen.ai chatbot solution for law firm example:
Group 1 (Attorneys) - Yes, Yes: Their domain expertise should let them evaluate the output if it seems abnormal, but they may not understand how the solution works in the background.
Group 2 (IT Department) - Yes, Yes: They have used foundation models for other tasks before to create other solutions.
Group 3 (Plaintiffs/Defendants) - Yes, No: They aren't familiar with legal precedents; that's why they hired our client to help them. They need to trust the law firm's expertise, but less so the solution.
"""
    },
    {
        'title': "Cognitive Mismatch",
        'content': """
Cognitive Mismatch is the most difficult form of opacity: it is something we can only moderately protect ourselves from,
but never get rid of. It happens when we are dealing with systems, technologies or scales that are far beyond our
human point of reference. In AI systems it most often appears when we need to know exactly how a system came up
with a decision. This problem is inherent to both Predictive and Generative AI, and it is the cause of the "AI black box" you may have heard of before.
""",
        'input_key': 'cognitive_mismatch',
        'input_type': 'text_area',
        'example': """
Gen.ai chatbot solution for law firm example:
Group 1 - Yes: The chatbot invents a new type of precedent by hallucinating, but it turns out to be a real thing that we never expected or noticed!?
Group 2 - Yes: The government wants us to explain why the model said what it said to a client and ruined their life; how do I do that when it has 70 billion parameters!?
Group 3 - Yes: In the meeting, the chatbot starts to tell the client that they will suffer 40 years of jail time for causing the death of the Dalai Lama because they didn't pay their taxes.
"""
    },
    {
        'title': "Intrinsic and Extrinsic Trust",
        'content': """
Intrinsic trust comes from us: we believe we understand something well enough, or have enough
experience with it, so we trust ourselves and the way we engage with the technology.
Extrinsic trust comes from outside: we rely on someone's or something's reputation, the
recommendations of experts, reports about their credibility, and so forth. But it is also much more
fragile than intrinsic trust.
Which form of trust do we want to focus on, and what is the source of that trust? Can it be further strengthened?
""",
        'input_key': 'trust_focus',
        'input_type': 'combined',
        'options': ['Intrinsic trust', 'Extrinsic trust'],
        'example': """
Gen.ai chatbot solution for law firm example:
We choose to rely on Intrinsic Trust in this case, because our client's customers come to them because they trust them, the IT department trusts that they can handle this solution, and we trust the expertise of our lawyers to handle any anomalies. We can also provide the IT department with tools to help them maintain their intrinsic trust in their work by monitoring it.
Source of Trust: The intrinsic trust comes from the lawyers' expertise in their field and the IT department's familiarity with similar systems. We can further strengthen this by providing regular training sessions on the AI system and its limitations, as well as implementing a robust feedback mechanism for continuous improvement.
"""
    },
    {
        'title': "Warranted or Unwarranted Trust",
        'content': """
Warranted trust means that we have enough arguments to say that we firmly believe
we can trust ourselves to handle the use case alongside our client, their customers and the
solution itself.
Unwarranted trust means that we still choose to trust our approach and case even though we
don't have enough arguments, and may be doing so for irrational reasons such as "it feels like it can do
it well" or "the results are the only thing that matters, even if we can't make sure the tech is
sound."
Based on your answers and thought process above, determine whether our trust in this case is warranted or unwarranted.
Then write a short conclusion on why the trust is or isn't warranted, based on the answers from the previous sections.
""",
        'input_key': 'trust_warranted',
        'input_type': 'combined',
        'options': ['Trust is warranted', 'Trust is unwarranted'],
        'example': """
Gen.ai chatbot solution for law firm example:
Conclusion: We believe that trust in our solution is Warranted when we consider the subject matter expertise of our client's internal legal and IT experts, as well as our recommendation to use watsonx.governance tooling to help bolster the IT department's intrinsic trust in the solution and reduce cognitive-mismatch opacity.
This trust is warranted because:
1. The legal experts (end-users) have the domain knowledge to critically evaluate the chatbot's outputs.
2. The IT department has experience with similar AI systems and will be equipped with additional monitoring tools.
3. We've identified potential opacity issues and have plans to address them, such as using explainable AI techniques.
4. The solution will be continuously monitored and improved based on feedback from all stakeholder groups.
"""
    },
    {
        'title': "Improving Trust",
        'content': """
Based on your evaluation, how can we improve upon this trust?
Consider the different types of opacity discussed earlier and the specific needs of each stakeholder group.
Provide concrete suggestions for strengthening trust in the AI solution.
""",
        'input_key': 'trust_improvement',
        'input_type': 'text_area',
        'example': """
To improve trust in our example gen.ai chatbot for the law firm:
1. Transparency: Develop a clear, non-technical explanation of how the AI works for all stakeholders.
2. Training: Provide regular training sessions for attorneys on how to effectively use and critically evaluate the chatbot's outputs.
3. Feedback Loop: Implement a robust feedback mechanism where users can flag incorrect or concerning outputs, feeding into continuous improvement.
4. Explainability: Integrate explainable AI techniques to provide rationale for the chatbot's recommendations, addressing cognitive mismatch.
5. Auditing: Establish regular audits of the system's performance and decision-making processes, sharing results with stakeholders.
6. Ethical Guidelines: Develop and prominently display clear ethical guidelines for the AI's use in legal contexts.
7. Collaboration: Foster ongoing collaboration between legal experts and IT teams to bridge the technological literacy gap.
8. Gradual Rollout: Implement the solution in phases, allowing time for trust to build and for refinements based on real-world use.
"""
    },
    {
        'title': "Generate Evaluation Report",
        'content': "You have completed the AI Trust and Opacity Evaluation. Click the button below to generate and download your PDF report.",
        'input_key': None
    }
]

# Streamlit app
st.title("AI Trust and Opacity Evaluation")

# Navigation buttons
col1, col2, col3 = st.columns([1, 2, 1])
with col1:
    if st.session_state.current_page > 0:
        if st.button("Back"):
            st.session_state.current_page -= 1
            st.rerun()
with col3:
    if st.session_state.current_page < len(pages) - 1:  # Show "Next" on every page except the last
        if st.button("Next", use_container_width=True):
            st.session_state.current_page += 1
            st.rerun()

# Display current page
current_page = pages[st.session_state.current_page]
st.header(current_page['title'])
st.markdown(current_page['content'])

# Input fields (only for pages that need input)
if 'input_key' in current_page and current_page['input_key'] is not None:
    if current_page['input_type'] == 'radio':
        st.session_state.answers[current_page['input_key']] = st.radio(
            "Select an option:",
            current_page['options'],
            key=current_page['input_key']
        )
    elif current_page['input_type'] == 'text_area':
        st.session_state.answers[current_page['input_key']] = st.text_area(
            "Your answer:",
            value=st.session_state.answers.get(current_page['input_key'], ""),
            key=current_page['input_key'],
            height=300
        )
    elif current_page['input_type'] == 'combined':
        # 'combined' pages capture both a radio choice and a free-text conclusion
        st.session_state.answers[current_page['input_key']] = st.radio(
            "Select an option:",
            current_page['options'],
            key=f"{current_page['input_key']}_radio"
        )
        st.session_state.answers[f"{current_page['input_key']}_conclusion"] = st.text_area(
            "Provide your conclusion:",
            value=st.session_state.answers.get(f"{current_page['input_key']}_conclusion", ""),
            key=f"{current_page['input_key']}_text_area",
            height=200
        )

# Add example in an expander
if 'example' in current_page:
    with st.expander("Reveal Example"):
        st.markdown(current_page['example'])
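
# Note: generate_pdf (imported from report_gen) is assumed to return the PDF as bytes
# or a BytesIO-like object that st.download_button can serve directly via `data`.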

# Generate PDF button (only on the last page)
if st.session_state.current_page == len(pages) - 1:
    if st.button("Generate and Download PDF", use_container_width=True):
        pdf = generate_pdf(pages, st.session_state.answers)
        st.download_button(
            label="Download PDF",
            data=pdf,
            file_name="AI_Trust_and_Opacity_Evaluation.pdf",
            mime="application/pdf",
            use_container_width=True
        )

# Display progress
st.progress((st.session_state.current_page + 1) / len(pages))

# st.divider()
# st.info("Developed by Milan Mrdenovic © IBM Norway 2024")