Ryan committed · Commit e5c73cc · 1 Parent: 27f83f8 · update
README.md
CHANGED
@@ -21,14 +21,39 @@ Here is a link to the video demo also:
 
 # Introduction
 
-This is a Gradio app that allows you to compare the responses of two different LLMs (Language Models) to the same input prompt. The app provides a simple interface where you can enter a prompt and
-
-
-
+This is a Gradio app that allows you to compare the responses of two different LLMs (large language models) to the same input prompt. The app provides a simple interface where you can enter a prompt and responses yourself, or select from the built-in dataset. It is built with the Gradio library, which provides a user-friendly way to create web applications in Python. Comparing responses side by side can be useful for researchers and developers who want to evaluate the performance of different LLMs on the same task.
+
+There are four main tabs:
+
+- Dataset Input
+- Analysis
+- RoBERTa Sentiment
+- Summary
+
+
 
 # Usage
 
+## Dataset Input
+
+The Dataset Input tab allows you to choose between the built-in datasets and your own data: select a dataset from the dropdown menu, or enter your own prompt and responses in the text boxes.
+
+## Analysis
+
+Once you have loaded a dataset, you have four analysis options:
+
+- Bag of Words
+- N-grams
+- Bias Detection
+- Classifier
+
+## RoBERTa Sentiment
+
+
+
+## Summary
 
+The Summary tab provides a summary of two of the prompts, the Trump and Harris prompts; these are hard-coded in the app as text files selectable from a dropdown menu. The first part of each file is a summary produced by an LLM (one by Claude 3.7, the other by ChatGPT o4). The second part, below it, is the actual data given to the LLM: the results from the various analysis options. You can copy and paste these into an LLM to get a similar summary for your own datasets.
 
 # Documentation
 
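For readers new to Gradio, the comparison pattern the README describes (one prompt, two model responses, analyzed side by side) can be sketched in a few lines. This is a minimal illustrative sketch of that description, not the Space's actual code; all component and function names here are hypothetical.

import gradio as gr

# Hypothetical, minimal version of the comparison UI described in the README.
def compare(prompt, response_a, response_b):
    # The real app runs analyses (Bag of Words, N-grams, etc.);
    # this stub just echoes the inputs to show the wiring.
    return f"Prompt: {prompt}\n\nModel 1: {response_a}\n\nModel 2: {response_b}"

with gr.Blocks() as demo:
    with gr.Tab("Dataset Input"):
        prompt = gr.Textbox(label="Prompt")
        with gr.Row():
            resp_a = gr.Textbox(label="Model 1 response")
            resp_b = gr.Textbox(label="Model 2 response")
        output = gr.Markdown()
        gr.Button("Compare").click(compare, inputs=[prompt, resp_a, resp_b], outputs=output)

demo.launch()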
app.py
CHANGED
@@ -7,6 +7,9 @@ from visualization.roberta_visualizer import process_and_visualize_sentiment_ana
 import nltk
 import os
 import json
+import matplotlib.pyplot as plt
+import io
+import base64
 
 # Download necessary NLTK resources function remains unchanged
 def download_nltk_resources():
@@ -760,6 +763,185 @@ def create_app():
             inputs=[summary_dropdown],
             outputs=[summary_content, summary_status]
         )
+        # Add a Visuals tab for plotting graphs
+        with gr.Tab("Visuals"):
+            gr.Markdown("## Visualization Graphs")
+
+            with gr.Row():
+                with gr.Column(scale=1):
+                    # Dropdown for selecting visualization type
+                    viz_type = gr.Dropdown(
+                        choices=["N-gram Comparison", "Word Frequency", "Sentiment Analysis"],
+                        label="Visualization Type",
+                        info="Select the type of visualization to display",
+                        value="N-gram Comparison"
+                    )
+
+                    # Button to generate visualization
+                    generate_viz_btn = gr.Button("Generate Visualization", variant="primary")
+
+                with gr.Column(scale=3):
+                    # Image component to display the plot
+                    viz_output = gr.Image(
+                        label="Visualization",
+                        type="pil",
+                        height=500
+                    )
+
+                    viz_status = gr.Markdown("*No visualization generated*")
+
+            # Function to generate and display visualizations
+            def generate_visualization(viz_type, dataset, analysis_results):
+                try:
+                    if not dataset or "entries" not in dataset or not dataset["entries"]:
+                        return None, "❌ **Error:** No dataset loaded. Please create or load a dataset first."
+
+                    # Example data (fallback when no real data is available)
+                    ex_data = {
+                        'attorney general': 3,
+                        'social justice': 3,
+                        'centrist approach': 2,
+                        'climate change': 2,
+                        'criminal justice': 2,
+                        'gun control': 2,
+                        'human rights': 2,
+                        'justice issues': 2,
+                        'measures like': 2,
+                        'middle class': 2
+                    }
+
+                    gran_data = {
+                        'political views': 3,
+                        'vice president': 3,
+                        'criminal justice': 2,
+                        'democratic party': 2,
+                        'foreign policy': 2,
+                        'harris advocated': 2,
+                        'lgbtq rights': 2,
+                        'president harris': 2,
+                        'social issues': 2,
+                        '2019 proposed': 1
+                    }
+
+                    # Use real data if available in analysis_results
+                    model1_data = {}
+                    model2_data = {}
+                    model1_name = "Model 1"
+                    model2_name = "Model 2"
+
+                    # Extract actual model names from dataset
+                    if dataset and "entries" in dataset and len(dataset["entries"]) >= 2:
+                        model1_name = dataset["entries"][0].get("model", "Model 1")
+                        model2_name = dataset["entries"][1].get("model", "Model 2")
+
+                    # Try to get real data from analysis_results
+                    if analysis_results and "analyses" in analysis_results:
+                        for prompt, analyses in analysis_results["analyses"].items():
+                            if viz_type == "N-gram Comparison" and "ngram_analysis" in analyses:
+                                ngram_results = analyses["ngram_analysis"]
+                                important_ngrams = ngram_results.get("important_ngrams", {})
+
+                                if model1_name in important_ngrams:
+                                    model1_data = {item["ngram"]: item["count"] for item in important_ngrams[model1_name]}
+
+                                if model2_name in important_ngrams:
+                                    model2_data = {item["ngram"]: item["count"] for item in important_ngrams[model2_name]}
+
+                            elif viz_type == "Word Frequency" and "bag_of_words" in analyses:
+                                bow_results = analyses["bag_of_words"]
+                                important_words = bow_results.get("important_words", {})
+
+                                if model1_name in important_words:
+                                    model1_data = {item["word"]: item["count"] for item in important_words[model1_name]}
+
+                                if model2_name in important_words:
+                                    model2_data = {item["word"]: item["count"] for item in important_words[model2_name]}
+
+                    # If we couldn't get real data, use example data
+                    if not model1_data:
+                        model1_data = ex_data
+                    if not model2_data:
+                        model2_data = gran_data
+
+                    # Create the visualization
+                    plt.figure(figsize=(10, 6))
+
+                    if viz_type == "N-gram Comparison" or viz_type == "Word Frequency":
+                        # Plot for the first model
+                        plt.subplot(1, 2, 1)
+                        sorted_data1 = sorted(model1_data.items(), key=lambda x: x[1], reverse=True)[:10]  # Top 10
+                        terms1, counts1 = zip(*sorted_data1) if sorted_data1 else ([], [])
+
+                        # Create horizontal bar chart
+                        plt.barh([t[:20] + '...' if len(t) > 20 else t for t in terms1[::-1]], counts1[::-1])
+                        plt.xlabel('Frequency')
+                        plt.title(f'Top {viz_type.split()[0]}s Used by {model1_name}')
+                        plt.tight_layout()
+
+                        # Plot for the second model
+                        plt.subplot(1, 2, 2)
+                        sorted_data2 = sorted(model2_data.items(), key=lambda x: x[1], reverse=True)[:10]  # Top 10
+                        terms2, counts2 = zip(*sorted_data2) if sorted_data2 else ([], [])
+
+                        # Create horizontal bar chart
+                        plt.barh([t[:20] + '...' if len(t) > 20 else t for t in terms2[::-1]], counts2[::-1])
+                        plt.xlabel('Frequency')
+                        plt.title(f'Top {viz_type.split()[0]}s Used by {model2_name}')
+                        plt.tight_layout()
+
+                    elif viz_type == "Sentiment Analysis":
+                        # Generate sentiment comparison visualization
+                        # This would be populated with real data when available
+                        sentiment_scores = {
+                            model1_name: 0.75,  # Example score
+                            model2_name: 0.25   # Example score
+                        }
+
+                        # Extract real sentiment scores if available
+                        if "roberta_results_state" in analysis_results:
+                            roberta_results = analysis_results["roberta_results_state"]
+                            if "analyses" in roberta_results:
+                                for prompt, analyses in roberta_results["analyses"].items():
+                                    if "roberta_sentiment" in analyses:
+                                        sentiment_result = analyses["roberta_sentiment"]
+                                        sentiment_analysis = sentiment_result.get("sentiment_analysis", {})
+
+                                        if model1_name in sentiment_analysis:
+                                            sentiment_scores[model1_name] = sentiment_analysis[model1_name].get("sentiment_score", 0)
+
+                                        if model2_name in sentiment_analysis:
+                                            sentiment_scores[model2_name] = sentiment_analysis[model2_name].get("sentiment_score", 0)
+
+                        # Create sentiment bar chart
+                        plt.bar(list(sentiment_scores.keys()), list(sentiment_scores.values()))
+                        plt.ylim(-1, 1)
+                        plt.ylabel('Sentiment Score (-1 to 1)')
+                        plt.title('Sentiment Analysis Comparison')
+                        plt.axhline(y=0, color='r', linestyle='-', alpha=0.3)  # Add a zero line
+
+                    # Save the plot to a bytes buffer
+                    buf = io.BytesIO()
+                    plt.savefig(buf, format='png')
+                    buf.seek(0)
+
+                    # Convert plot to PIL Image
+                    from PIL import Image
+                    image = Image.open(buf)
+
+                    return image, f"✅ **Generated {viz_type} visualization**"
+
+                except Exception as e:
+                    import traceback
+                    error_msg = f"Error generating visualization: {str(e)}\n{traceback.format_exc()}"
+                    print(error_msg)
+                    return None, f"❌ **Error:** {str(e)}"
+
+            # Connect the generate button to the function
+            generate_viz_btn.click(
+                fn=generate_visualization,
+                inputs=[viz_type, dataset_state, analysis_results_state],
+                outputs=[viz_output, viz_status]
+            )
 
         # Run analysis with proper parameters
         run_analysis_btn.click(