Spaces:
Sleeping
Sleeping
Ryan
commited on
Commit
·
5ba1ab4
1
Parent(s):
5110d3f
update
Browse files- .idea/525GradioApp.iml +12 -0
- .idea/inspectionProfiles/profiles_settings.xml +6 -0
- .idea/modules.xml +8 -0
- .idea/vcs.xml +7 -0
- .idea/workspace.xml +61 -0
- _archive/app.py +63 -0
- app.py +5 -13
- processors/bias_detection.py +0 -241
- processors/diff_highlighter.py +0 -298
- processors/metrics.py +0 -258
- processors/ngram_analysis.py +0 -208
- processors/topic_modeling.py +0 -183
- ui/analysis_screen.py +5 -5
.idea/525GradioApp.iml
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
2 |
+
<module type="PYTHON_MODULE" version="4">
|
3 |
+
<component name="NewModuleRootManager">
|
4 |
+
<content url="file://$MODULE_DIR$" />
|
5 |
+
<orderEntry type="inheritedJdk" />
|
6 |
+
<orderEntry type="sourceFolder" forTests="false" />
|
7 |
+
</component>
|
8 |
+
<component name="PyDocumentationSettings">
|
9 |
+
<option name="format" value="GOOGLE" />
|
10 |
+
<option name="myDocStringFormat" value="Google" />
|
11 |
+
</component>
|
12 |
+
</module>
|
.idea/inspectionProfiles/profiles_settings.xml
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<component name="InspectionProjectProfileManager">
|
2 |
+
<settings>
|
3 |
+
<option name="USE_PROJECT_PROFILE" value="false" />
|
4 |
+
<version value="1.0" />
|
5 |
+
</settings>
|
6 |
+
</component>
|
.idea/modules.xml
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
2 |
+
<project version="4">
|
3 |
+
<component name="ProjectModuleManager">
|
4 |
+
<modules>
|
5 |
+
<module fileurl="file://$PROJECT_DIR$/.idea/525GradioApp.iml" filepath="$PROJECT_DIR$/.idea/525GradioApp.iml" />
|
6 |
+
</modules>
|
7 |
+
</component>
|
8 |
+
</project>
|
.idea/vcs.xml
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
2 |
+
<project version="4">
|
3 |
+
<component name="VcsDirectoryMappings">
|
4 |
+
<mapping directory="" vcs="Git" />
|
5 |
+
<mapping directory="$PROJECT_DIR$" vcs="Git" />
|
6 |
+
</component>
|
7 |
+
</project>
|
.idea/workspace.xml
ADDED
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
2 |
+
<project version="4">
|
3 |
+
<component name="ChangeListManager">
|
4 |
+
<list default="true" id="8e67814c-7f04-433c-ab7a-2b65a1106d4c" name="Changes" comment="">
|
5 |
+
<change beforePath="$PROJECT_DIR$/processors/bias_detection.py" beforeDir="false" />
|
6 |
+
<change beforePath="$PROJECT_DIR$/processors/diff_highlighter.py" beforeDir="false" />
|
7 |
+
<change beforePath="$PROJECT_DIR$/processors/metrics.py" beforeDir="false" />
|
8 |
+
<change beforePath="$PROJECT_DIR$/processors/ngram_analysis.py" beforeDir="false" />
|
9 |
+
<change beforePath="$PROJECT_DIR$/processors/topic_modeling.py" beforeDir="false" />
|
10 |
+
</list>
|
11 |
+
<option name="SHOW_DIALOG" value="false" />
|
12 |
+
<option name="HIGHLIGHT_CONFLICTS" value="true" />
|
13 |
+
<option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
|
14 |
+
<option name="LAST_RESOLUTION" value="IGNORE" />
|
15 |
+
</component>
|
16 |
+
<component name="Git.Settings">
|
17 |
+
<option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$" />
|
18 |
+
</component>
|
19 |
+
<component name="ProjectColorInfo"><![CDATA[{
|
20 |
+
"associatedIndex": 6
|
21 |
+
}]]></component>
|
22 |
+
<component name="ProjectId" id="2w0Fnz09BnFZ6wle8bfjI0kAU9r" />
|
23 |
+
<component name="ProjectViewState">
|
24 |
+
<option name="hideEmptyMiddlePackages" value="true" />
|
25 |
+
<option name="showLibraryContents" value="true" />
|
26 |
+
</component>
|
27 |
+
<component name="PropertiesComponent"><![CDATA[{
|
28 |
+
"keyToString": {
|
29 |
+
"ModuleVcsDetector.initialDetectionPerformed": "true",
|
30 |
+
"RunOnceActivity.ShowReadmeOnStart": "true",
|
31 |
+
"RunOnceActivity.git.unshallow": "true",
|
32 |
+
"git-widget-placeholder": "main",
|
33 |
+
"last_opened_file_path": "/Users/ryan/GitHub/525GradioApp/525GradioApp",
|
34 |
+
"nodejs_package_manager_path": "npm",
|
35 |
+
"settings.editor.selected.configurable": "preferences.pluginManager",
|
36 |
+
"vue.rearranger.settings.migration": "true"
|
37 |
+
}
|
38 |
+
}]]></component>
|
39 |
+
<component name="SharedIndexes">
|
40 |
+
<attachedChunks>
|
41 |
+
<set>
|
42 |
+
<option value="bundled-js-predefined-d6986cc7102b-f27c65a3e318-JavaScript-PY-251.23774.444" />
|
43 |
+
<option value="bundled-python-sdk-890ed5b35930-d9c5bdb153f4-com.jetbrains.pycharm.pro.sharedIndexes.bundled-PY-251.23774.444" />
|
44 |
+
</set>
|
45 |
+
</attachedChunks>
|
46 |
+
</component>
|
47 |
+
<component name="TaskManager">
|
48 |
+
<task active="true" id="Default" summary="Default task">
|
49 |
+
<changelist id="8e67814c-7f04-433c-ab7a-2b65a1106d4c" name="Changes" comment="" />
|
50 |
+
<created>1745170754325</created>
|
51 |
+
<option name="number" value="Default" />
|
52 |
+
<option name="presentableId" value="Default" />
|
53 |
+
<updated>1745170754325</updated>
|
54 |
+
<workItem from="1745170755404" duration="245000" />
|
55 |
+
</task>
|
56 |
+
<servers />
|
57 |
+
</component>
|
58 |
+
<component name="TypeScriptGeneratedFilesManager">
|
59 |
+
<option name="version" value="3" />
|
60 |
+
</component>
|
61 |
+
</project>
|
_archive/app.py
ADDED
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import os
|
3 |
+
from ui.dataset_input import create_dataset_input, load_example_dataset
|
4 |
+
from ui.analysis_screen import process_analysis_request
|
5 |
+
|
6 |
+
def create_app():
|
7 |
+
"""
|
8 |
+
Create a streamlined Gradio app for dataset input and Bag of Words analysis.
|
9 |
+
|
10 |
+
Returns:
|
11 |
+
gr.Blocks: The Gradio application
|
12 |
+
"""
|
13 |
+
with gr.Blocks(title="LLM Response Comparator") as app:
|
14 |
+
# Application state to share data between tabs
|
15 |
+
dataset_state = gr.State({})
|
16 |
+
analysis_results_state = gr.State({})
|
17 |
+
|
18 |
+
# Dataset Input Tab
|
19 |
+
with gr.Tab("Dataset Input"):
|
20 |
+
dataset_inputs, example_dropdown, load_example_btn, create_btn, prompt, response1, model1, response2, model2 = create_dataset_input()
|
21 |
+
|
22 |
+
# Load example dataset
|
23 |
+
load_example_btn.click(
|
24 |
+
fn=load_example_dataset,
|
25 |
+
inputs=[example_dropdown],
|
26 |
+
outputs=[dataset_inputs] # Ensure `load_example_dataset` returns compatible data
|
27 |
+
)
|
28 |
+
|
29 |
+
# Save dataset to state
|
30 |
+
create_btn.click(
|
31 |
+
fn=lambda p, r1, m1, r2, m2: {
|
32 |
+
"entries": [
|
33 |
+
{"prompt": p, "response": r1, "model": m1},
|
34 |
+
{"prompt": p, "response": r2, "model": m2}
|
35 |
+
]
|
36 |
+
},
|
37 |
+
inputs=[prompt, response1, model1, response2, model2], # Ensure these are valid Gradio components
|
38 |
+
outputs=[dataset_state] # Ensure `dataset_state` is correctly defined
|
39 |
+
)
|
40 |
+
|
41 |
+
# Analysis Tab
|
42 |
+
with gr.Tab("Analysis"):
|
43 |
+
analysis_options = gr.CheckboxGroup(
|
44 |
+
choices=["Bag of Words"],
|
45 |
+
value=["Bag of Words"],
|
46 |
+
label="Select Analyses to Run"
|
47 |
+
)
|
48 |
+
run_analysis_btn = gr.Button("Run Analysis", variant="primary")
|
49 |
+
analysis_output = gr.JSON(label="Analysis Results", visible=False)
|
50 |
+
|
51 |
+
# Run analysis
|
52 |
+
run_analysis_btn.click(
|
53 |
+
fn=process_analysis_request,
|
54 |
+
inputs=[dataset_state, analysis_options], # Removed None
|
55 |
+
outputs=[analysis_results_state, analysis_output]
|
56 |
+
)
|
57 |
+
|
58 |
+
return app
|
59 |
+
|
60 |
+
if __name__ == "__main__":
|
61 |
+
# Create and launch the app
|
62 |
+
app = create_app()
|
63 |
+
app.launch()
|
app.py
CHANGED
@@ -1,5 +1,4 @@
|
|
1 |
import gradio as gr
|
2 |
-
import os
|
3 |
from ui.dataset_input import create_dataset_input, load_example_dataset
|
4 |
from ui.analysis_screen import process_analysis_request
|
5 |
|
@@ -23,7 +22,7 @@ def create_app():
|
|
23 |
load_example_btn.click(
|
24 |
fn=load_example_dataset,
|
25 |
inputs=[example_dropdown],
|
26 |
-
outputs=[dataset_inputs]
|
27 |
)
|
28 |
|
29 |
# Save dataset to state
|
@@ -34,30 +33,23 @@ def create_app():
|
|
34 |
{"prompt": p, "response": r2, "model": m2}
|
35 |
]
|
36 |
},
|
37 |
-
inputs=[prompt, response1, model1, response2, model2],
|
38 |
-
outputs=[dataset_state]
|
39 |
)
|
40 |
|
41 |
# Analysis Tab
|
42 |
with gr.Tab("Analysis"):
|
43 |
-
analysis_options =
|
44 |
-
choices=["Bag of Words"],
|
45 |
-
value=["Bag of Words"],
|
46 |
-
label="Select Analyses to Run"
|
47 |
-
)
|
48 |
-
run_analysis_btn = gr.Button("Run Analysis", variant="primary")
|
49 |
-
analysis_output = gr.JSON(label="Analysis Results", visible=False)
|
50 |
|
51 |
# Run analysis
|
52 |
run_analysis_btn.click(
|
53 |
fn=process_analysis_request,
|
54 |
-
inputs=[dataset_state, analysis_options],
|
55 |
outputs=[analysis_results_state, analysis_output]
|
56 |
)
|
57 |
|
58 |
return app
|
59 |
|
60 |
if __name__ == "__main__":
|
61 |
-
# Create and launch the app
|
62 |
app = create_app()
|
63 |
app.launch()
|
|
|
1 |
import gradio as gr
|
|
|
2 |
from ui.dataset_input import create_dataset_input, load_example_dataset
|
3 |
from ui.analysis_screen import process_analysis_request
|
4 |
|
|
|
22 |
load_example_btn.click(
|
23 |
fn=load_example_dataset,
|
24 |
inputs=[example_dropdown],
|
25 |
+
outputs=[dataset_inputs]
|
26 |
)
|
27 |
|
28 |
# Save dataset to state
|
|
|
33 |
{"prompt": p, "response": r2, "model": m2}
|
34 |
]
|
35 |
},
|
36 |
+
inputs=[prompt, response1, model1, response2, model2],
|
37 |
+
outputs=[dataset_state]
|
38 |
)
|
39 |
|
40 |
# Analysis Tab
|
41 |
with gr.Tab("Analysis"):
|
42 |
+
analysis_options, analysis_params, run_analysis_btn, analysis_output = process_analysis_request()
|
|
|
|
|
|
|
|
|
|
|
|
|
43 |
|
44 |
# Run analysis
|
45 |
run_analysis_btn.click(
|
46 |
fn=process_analysis_request,
|
47 |
+
inputs=[dataset_state, analysis_options],
|
48 |
outputs=[analysis_results_state, analysis_output]
|
49 |
)
|
50 |
|
51 |
return app
|
52 |
|
53 |
if __name__ == "__main__":
|
|
|
54 |
app = create_app()
|
55 |
app.launch()
|
processors/bias_detection.py
DELETED
@@ -1,241 +0,0 @@
|
|
1 |
-
import nltk
|
2 |
-
from nltk.sentiment import SentimentIntensityAnalyzer
|
3 |
-
import re
|
4 |
-
import numpy as np
|
5 |
-
from collections import Counter
|
6 |
-
|
7 |
-
# Download necessary NLTK data
|
8 |
-
try:
|
9 |
-
nltk.data.find('sentiment/vader_lexicon.zip')
|
10 |
-
except LookupError:
|
11 |
-
nltk.download('vader_lexicon')
|
12 |
-
|
13 |
-
# Political leaning lexicons (simplified for demonstration)
|
14 |
-
# In a real implementation, these would be much more comprehensive and nuanced
|
15 |
-
LIBERAL_TERMS = {
|
16 |
-
'progressive', 'equity', 'climate change', 'social justice', 'regulation', 'equality',
|
17 |
-
'diversity', 'inclusion', 'workers rights', 'universal healthcare', 'welfare', 'public',
|
18 |
-
'government program', 'marginalized', 'underrepresented', 'systemic', 'racism',
|
19 |
-
'discrimination', 'gun control', 'green new deal', 'carbon tax', 'reproductive rights',
|
20 |
-
'pro-choice', 'labor union', 'living wage', 'wealth tax', 'police reform'
|
21 |
-
}
|
22 |
-
|
23 |
-
CONSERVATIVE_TERMS = {
|
24 |
-
'traditional', 'free market', 'deregulation', 'individual responsibility', 'liberty',
|
25 |
-
'freedom', 'private sector', 'family values', 'law and order', 'tax cuts', 'limited government',
|
26 |
-
'fiscal responsibility', 'national security', 'defense spending', 'second amendment',
|
27 |
-
'religious freedom', 'pro-life', 'states rights', 'border security', 'merit-based',
|
28 |
-
'job creators', 'free enterprise', 'strong military', 'patriotism', 'constitutional'
|
29 |
-
}
|
30 |
-
|
31 |
-
# Framing lexicons
|
32 |
-
ECONOMIC_FRAMING = {
|
33 |
-
'economy', 'economic', 'cost', 'money', 'financial', 'revenue', 'tax', 'budget',
|
34 |
-
'fiscal', 'deficit', 'inflation', 'growth', 'investment', 'market', 'trade', 'profit',
|
35 |
-
'wage', 'income', 'gdp', 'business', 'corporation', 'industry', 'job', 'unemployment'
|
36 |
-
}
|
37 |
-
|
38 |
-
MORAL_FRAMING = {
|
39 |
-
'moral', 'ethical', 'right', 'wrong', 'good', 'bad', 'value', 'principle', 'fair',
|
40 |
-
'unfair', 'justice', 'dignity', 'integrity', 'honest', 'corrupt', 'compassion',
|
41 |
-
'respect', 'responsibility', 'duty', 'virtue', 'vice', 'sin', 'sacred', 'character'
|
42 |
-
}
|
43 |
-
|
44 |
-
SECURITY_FRAMING = {
|
45 |
-
'security', 'safety', 'threat', 'danger', 'risk', 'fear', 'protect', 'defend',
|
46 |
-
'attack', 'crisis', 'emergency', 'invasion', 'violence', 'crime', 'terrorism',
|
47 |
-
'defense', 'military', 'police', 'law', 'order', 'stability', 'chaos', 'conflict'
|
48 |
-
}
|
49 |
-
|
50 |
-
def detect_sentiment(text):
|
51 |
-
"""
|
52 |
-
Detect overall sentiment of text
|
53 |
-
|
54 |
-
Args:
|
55 |
-
text (str): Input text
|
56 |
-
|
57 |
-
Returns:
|
58 |
-
dict: Sentiment analysis results
|
59 |
-
"""
|
60 |
-
# Use VADER for sentiment analysis
|
61 |
-
sid = SentimentIntensityAnalyzer()
|
62 |
-
sentiment_scores = sid.polarity_scores(text)
|
63 |
-
|
64 |
-
# Classify based on compound score
|
65 |
-
if sentiment_scores['compound'] >= 0.05:
|
66 |
-
classification = "Positive"
|
67 |
-
elif sentiment_scores['compound'] <= -0.05:
|
68 |
-
classification = "Negative"
|
69 |
-
else:
|
70 |
-
classification = "Neutral"
|
71 |
-
|
72 |
-
return {
|
73 |
-
"compound": sentiment_scores['compound'],
|
74 |
-
"positive": sentiment_scores['pos'],
|
75 |
-
"neutral": sentiment_scores['neu'],
|
76 |
-
"negative": sentiment_scores['neg'],
|
77 |
-
"classification": classification
|
78 |
-
}
|
79 |
-
|
80 |
-
def detect_partisan_lean(text):
|
81 |
-
"""
|
82 |
-
Detect political leaning of text
|
83 |
-
|
84 |
-
Args:
|
85 |
-
text (str): Input text
|
86 |
-
|
87 |
-
Returns:
|
88 |
-
dict: Political leaning analysis
|
89 |
-
"""
|
90 |
-
# Normalize text
|
91 |
-
text_lower = text.lower()
|
92 |
-
|
93 |
-
# Count occurrences of politically-charged terms
|
94 |
-
liberal_count = 0
|
95 |
-
conservative_count = 0
|
96 |
-
|
97 |
-
for term in LIBERAL_TERMS:
|
98 |
-
liberal_count += len(re.findall(r'\b' + term + r'\b', text_lower))
|
99 |
-
|
100 |
-
for term in CONSERVATIVE_TERMS:
|
101 |
-
conservative_count += len(re.findall(r'\b' + term + r'\b', text_lower))
|
102 |
-
|
103 |
-
# Calculate total and political lean
|
104 |
-
total_partisan_terms = liberal_count + conservative_count
|
105 |
-
|
106 |
-
if total_partisan_terms > 0:
|
107 |
-
# Scale from -1 (liberal) to 1 (conservative)
|
108 |
-
lean_score = (conservative_count - liberal_count) / total_partisan_terms
|
109 |
-
else:
|
110 |
-
lean_score = 0 # Neutral if no partisan terms found
|
111 |
-
|
112 |
-
# Classify based on score
|
113 |
-
if lean_score < -0.2:
|
114 |
-
classification = "Liberal Leaning"
|
115 |
-
elif lean_score > 0.2:
|
116 |
-
classification = "Conservative Leaning"
|
117 |
-
else:
|
118 |
-
classification = "Politically Balanced"
|
119 |
-
|
120 |
-
return {
|
121 |
-
"lean_score": lean_score,
|
122 |
-
"liberal_terms": liberal_count,
|
123 |
-
"conservative_terms": conservative_count,
|
124 |
-
"total_partisan_terms": total_partisan_terms,
|
125 |
-
"classification": classification
|
126 |
-
}
|
127 |
-
|
128 |
-
def detect_framing_bias(text):
|
129 |
-
"""
|
130 |
-
Detect framing bias in political context
|
131 |
-
|
132 |
-
Args:
|
133 |
-
text (str): Input text
|
134 |
-
|
135 |
-
Returns:
|
136 |
-
dict: Framing analysis
|
137 |
-
"""
|
138 |
-
# Normalize text
|
139 |
-
text_lower = text.lower()
|
140 |
-
|
141 |
-
# Count framing terms
|
142 |
-
economic_count = 0
|
143 |
-
moral_count = 0
|
144 |
-
security_count = 0
|
145 |
-
|
146 |
-
for term in ECONOMIC_FRAMING:
|
147 |
-
economic_count += len(re.findall(r'\b' + term + r'\b', text_lower))
|
148 |
-
|
149 |
-
for term in MORAL_FRAMING:
|
150 |
-
moral_count += len(re.findall(r'\b' + term + r'\b', text_lower))
|
151 |
-
|
152 |
-
for term in SECURITY_FRAMING:
|
153 |
-
security_count += len(re.findall(r'\b' + term + r'\b', text_lower))
|
154 |
-
|
155 |
-
# Calculate total framing terms
|
156 |
-
total_framing_terms = economic_count + moral_count + security_count
|
157 |
-
|
158 |
-
# Calculate percentages
|
159 |
-
if total_framing_terms > 0:
|
160 |
-
economic_pct = economic_count / total_framing_terms
|
161 |
-
moral_pct = moral_count / total_framing_terms
|
162 |
-
security_pct = security_count / total_framing_terms
|
163 |
-
else:
|
164 |
-
economic_pct = moral_pct = security_pct = 0
|
165 |
-
|
166 |
-
# Determine primary framing
|
167 |
-
if total_framing_terms > 0:
|
168 |
-
max_count = max(economic_count, moral_count, security_count)
|
169 |
-
if max_count == economic_count:
|
170 |
-
primary_frame = "Economic"
|
171 |
-
elif max_count == moral_count:
|
172 |
-
primary_frame = "Moral/Ethical"
|
173 |
-
else:
|
174 |
-
primary_frame = "Security/Safety"
|
175 |
-
else:
|
176 |
-
primary_frame = "No clear framing"
|
177 |
-
|
178 |
-
return {
|
179 |
-
"economic_framing": economic_pct,
|
180 |
-
"moral_framing": moral_pct,
|
181 |
-
"security_framing": security_pct,
|
182 |
-
"total_framing_terms": total_framing_terms,
|
183 |
-
"primary_frame": primary_frame
|
184 |
-
}
|
185 |
-
|
186 |
-
def compare_bias(texts, model_names, bias_methods=None):
|
187 |
-
"""
|
188 |
-
Compare bias metrics across texts
|
189 |
-
|
190 |
-
Args:
|
191 |
-
texts (list): List of text responses
|
192 |
-
model_names (list): Names of models corresponding to responses
|
193 |
-
bias_methods (list): List of bias detection methods to apply
|
194 |
-
|
195 |
-
Returns:
|
196 |
-
dict: Comparative bias analysis
|
197 |
-
"""
|
198 |
-
if bias_methods is None:
|
199 |
-
bias_methods = ["Sentiment Analysis", "Partisan Leaning", "Framing Analysis"]
|
200 |
-
|
201 |
-
results = {
|
202 |
-
"models": model_names
|
203 |
-
}
|
204 |
-
|
205 |
-
# Run selected bias analyses
|
206 |
-
if "Sentiment Analysis" in bias_methods:
|
207 |
-
sentiment_results = {}
|
208 |
-
for i, (text, model) in enumerate(zip(texts, model_names)):
|
209 |
-
sentiment_results[model] = detect_sentiment(text)
|
210 |
-
results["sentiment"] = sentiment_results
|
211 |
-
|
212 |
-
if "Partisan Leaning" in bias_methods:
|
213 |
-
partisan_results = {}
|
214 |
-
for i, (text, model) in enumerate(zip(texts, model_names)):
|
215 |
-
partisan_results[model] = detect_partisan_lean(text)
|
216 |
-
results["partisan_leaning"] = partisan_results
|
217 |
-
|
218 |
-
if "Framing Analysis" in bias_methods:
|
219 |
-
framing_results = {}
|
220 |
-
for i, (text, model) in enumerate(zip(texts, model_names)):
|
221 |
-
framing_results[model] = detect_framing_bias(text)
|
222 |
-
results["framing"] = framing_results
|
223 |
-
|
224 |
-
# Add summary statistics
|
225 |
-
if "Sentiment Analysis" in bias_methods:
|
226 |
-
avg_sentiment = np.mean([results["sentiment"][model]["compound"] for model in model_names])
|
227 |
-
sentiment_variance = np.var([results["sentiment"][model]["compound"] for model in model_names])
|
228 |
-
results["sentiment_summary"] = {
|
229 |
-
"average_compound": avg_sentiment,
|
230 |
-
"variance": sentiment_variance
|
231 |
-
}
|
232 |
-
|
233 |
-
if "Partisan Leaning" in bias_methods:
|
234 |
-
avg_lean = np.mean([results["partisan_leaning"][model]["lean_score"] for model in model_names])
|
235 |
-
lean_variance = np.var([results["partisan_leaning"][model]["lean_score"] for model in model_names])
|
236 |
-
results["partisan_summary"] = {
|
237 |
-
"average_lean": avg_lean,
|
238 |
-
"variance": lean_variance
|
239 |
-
}
|
240 |
-
|
241 |
-
return results
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
processors/diff_highlighter.py
DELETED
@@ -1,298 +0,0 @@
|
|
1 |
-
import difflib
|
2 |
-
import nltk
|
3 |
-
from nltk.tokenize import sent_tokenize, word_tokenize
|
4 |
-
import re
|
5 |
-
from sklearn.feature_extraction.text import TfidfVectorizer
|
6 |
-
from scipy.spatial.distance import cosine
|
7 |
-
import numpy as np
|
8 |
-
import html
|
9 |
-
|
10 |
-
# Download necessary NLTK data
|
11 |
-
try:
|
12 |
-
nltk.data.find('tokenizers/punkt')
|
13 |
-
except LookupError:
|
14 |
-
nltk.download('punkt')
|
15 |
-
|
16 |
-
def highlight_differences(text1, text2):
|
17 |
-
"""
|
18 |
-
Find and highlight textual differences
|
19 |
-
|
20 |
-
Args:
|
21 |
-
text1 (str): First text
|
22 |
-
text2 (str): Second text
|
23 |
-
|
24 |
-
Returns:
|
25 |
-
dict: Differences analysis
|
26 |
-
"""
|
27 |
-
# Tokenize into sentences
|
28 |
-
sentences1 = sent_tokenize(text1)
|
29 |
-
sentences2 = sent_tokenize(text2)
|
30 |
-
|
31 |
-
# Compare sentence by sentence
|
32 |
-
matcher = difflib.SequenceMatcher(None, sentences1, sentences2)
|
33 |
-
|
34 |
-
# Track different and similar content
|
35 |
-
unique_to_1 = []
|
36 |
-
unique_to_2 = []
|
37 |
-
similar_content = []
|
38 |
-
|
39 |
-
for tag, i1, i2, j1, j2 in matcher.get_opcodes():
|
40 |
-
if tag == 'equal':
|
41 |
-
for i in range(i1, i2):
|
42 |
-
similar_content.append(sentences1[i])
|
43 |
-
elif tag == 'delete':
|
44 |
-
for i in range(i1, i2):
|
45 |
-
unique_to_1.append(sentences1[i])
|
46 |
-
elif tag == 'insert':
|
47 |
-
for j in range(j1, j2):
|
48 |
-
unique_to_2.append(sentences2[j])
|
49 |
-
elif tag == 'replace':
|
50 |
-
for i in range(i1, i2):
|
51 |
-
unique_to_1.append(sentences1[i])
|
52 |
-
for j in range(j1, j2):
|
53 |
-
unique_to_2.append(sentences2[j])
|
54 |
-
|
55 |
-
# Calculate percentage of unique content
|
56 |
-
total_sentences1 = len(sentences1)
|
57 |
-
total_sentences2 = len(sentences2)
|
58 |
-
|
59 |
-
pct_unique_1 = len(unique_to_1) / total_sentences1 if total_sentences1 > 0 else 0
|
60 |
-
pct_unique_2 = len(unique_to_2) / total_sentences2 if total_sentences2 > 0 else 0
|
61 |
-
|
62 |
-
# Calculate content overlap
|
63 |
-
if total_sentences1 + total_sentences2 > 0:
|
64 |
-
overlap = 2 * len(similar_content) / (total_sentences1 + total_sentences2)
|
65 |
-
else:
|
66 |
-
overlap = 0
|
67 |
-
|
68 |
-
# Analyze word-level differences
|
69 |
-
# First, get common sentences
|
70 |
-
vectorizer = TfidfVectorizer(min_df=1)
|
71 |
-
|
72 |
-
# Extract significant words unique to each text
|
73 |
-
significant_words_1 = []
|
74 |
-
significant_words_2 = []
|
75 |
-
|
76 |
-
if unique_to_1 and unique_to_2:
|
77 |
-
# Combine unique sentences for each text
|
78 |
-
combined_1 = ' '.join(unique_to_1)
|
79 |
-
combined_2 = ' '.join(unique_to_2)
|
80 |
-
|
81 |
-
# Create TF-IDF vectors
|
82 |
-
try:
|
83 |
-
tfidf_matrix = vectorizer.fit_transform([combined_1, combined_2])
|
84 |
-
feature_names = vectorizer.get_feature_names_out()
|
85 |
-
|
86 |
-
# Extract weights for each document
|
87 |
-
weights_1 = tfidf_matrix[0].toarray()[0]
|
88 |
-
weights_2 = tfidf_matrix[1].toarray()[0]
|
89 |
-
|
90 |
-
# Get top 10 words unique to each text
|
91 |
-
for i in range(len(feature_names)):
|
92 |
-
if weights_1[i] > weights_2[i] * 2: # Significantly higher in text 1
|
93 |
-
significant_words_1.append((feature_names[i], weights_1[i]))
|
94 |
-
elif weights_2[i] > weights_1[i] * 2: # Significantly higher in text 2
|
95 |
-
significant_words_2.append((feature_names[i], weights_2[i]))
|
96 |
-
|
97 |
-
# Sort by weight and take top 10
|
98 |
-
significant_words_1 = sorted(significant_words_1, key=lambda x: x[1], reverse=True)[:10]
|
99 |
-
significant_words_2 = sorted(significant_words_2, key=lambda x: x[1], reverse=True)[:10]
|
100 |
-
|
101 |
-
# Convert to list of words only
|
102 |
-
significant_words_1 = [word for word, _ in significant_words_1]
|
103 |
-
significant_words_2 = [word for word, _ in significant_words_2]
|
104 |
-
except:
|
105 |
-
# Fallback if TF-IDF fails
|
106 |
-
significant_words_1 = []
|
107 |
-
significant_words_2 = []
|
108 |
-
|
109 |
-
return {
|
110 |
-
"unique_to_first": unique_to_1,
|
111 |
-
"unique_to_second": unique_to_2,
|
112 |
-
"similar_content": similar_content,
|
113 |
-
"pct_unique_first": pct_unique_1,
|
114 |
-
"pct_unique_second": pct_unique_2,
|
115 |
-
"content_overlap": overlap,
|
116 |
-
"significant_words_first": significant_words_1,
|
117 |
-
"significant_words_second": significant_words_2
|
118 |
-
}
|
119 |
-
|
120 |
-
def extract_unique_content(texts):
|
121 |
-
"""
|
122 |
-
Extract content unique to each text
|
123 |
-
|
124 |
-
Args:
|
125 |
-
texts (list): List of texts
|
126 |
-
|
127 |
-
Returns:
|
128 |
-
dict: Unique content for each text
|
129 |
-
"""
|
130 |
-
n = len(texts)
|
131 |
-
unique_content = [[] for _ in range(n)]
|
132 |
-
|
133 |
-
# Compare each text with all others
|
134 |
-
for i in range(n):
|
135 |
-
sentences_i = sent_tokenize(texts[i])
|
136 |
-
|
137 |
-
# For each sentence in this text
|
138 |
-
for sentence in sentences_i:
|
139 |
-
# Check if it appears in any other text
|
140 |
-
is_unique = True
|
141 |
-
for j in range(n):
|
142 |
-
if i != j and sentence in texts[j]:
|
143 |
-
is_unique = False
|
144 |
-
break
|
145 |
-
|
146 |
-
if is_unique:
|
147 |
-
unique_content[i].append(sentence)
|
148 |
-
|
149 |
-
return {f"text_{i+1}_unique": content for i, content in enumerate(unique_content)}
|
150 |
-
|
151 |
-
def generate_html_diff(text1, text2):
|
152 |
-
"""
|
153 |
-
Generate HTML with highlighted differences
|
154 |
-
|
155 |
-
Args:
|
156 |
-
text1 (str): First text
|
157 |
-
text2 (str): Second text
|
158 |
-
|
159 |
-
Returns:
|
160 |
-
str: HTML with highlighted differences
|
161 |
-
"""
|
162 |
-
# Split into sentences
|
163 |
-
sentences1 = sent_tokenize(text1)
|
164 |
-
sentences2 = sent_tokenize(text2)
|
165 |
-
|
166 |
-
# Compare sentence by sentence
|
167 |
-
matcher = difflib.SequenceMatcher(None, sentences1, sentences2)
|
168 |
-
|
169 |
-
# Create HTML with highlighted differences
|
170 |
-
html_output = []
|
171 |
-
|
172 |
-
html_output.append('<div style="display: flex; width: 100%;">')
|
173 |
-
|
174 |
-
# First text column
|
175 |
-
html_output.append('<div style="flex: 1; padding: 10px; border-right: 1px solid #ccc;">')
|
176 |
-
html_output.append(f'<h3>Text 1</h3>')
|
177 |
-
|
178 |
-
for tag, i1, i2, j1, j2 in matcher.get_opcodes():
|
179 |
-
if tag in ('delete', 'replace'):
|
180 |
-
# Unique to text 1 - highlight in red
|
181 |
-
for i in range(i1, i2):
|
182 |
-
html_output.append(f'<p style="background-color: #ffdddd;">{html.escape(sentences1[i])}</p>')
|
183 |
-
else:
|
184 |
-
# Common or not in text 1
|
185 |
-
for i in range(i1, i2):
|
186 |
-
html_output.append(f'<p>{html.escape(sentences1[i])}</p>')
|
187 |
-
|
188 |
-
html_output.append('</div>')
|
189 |
-
|
190 |
-
# Second text column
|
191 |
-
html_output.append('<div style="flex: 1; padding: 10px;">')
|
192 |
-
html_output.append(f'<h3>Text 2</h3>')
|
193 |
-
|
194 |
-
for tag, i1, i2, j1, j2 in matcher.get_opcodes():
|
195 |
-
if tag in ('insert', 'replace'):
|
196 |
-
# Unique to text 2 - highlight in green
|
197 |
-
for j in range(j1, j2):
|
198 |
-
html_output.append(f'<p style="background-color: #ddffdd;">{html.escape(sentences2[j])}</p>')
|
199 |
-
else:
|
200 |
-
# Common or not in text 2
|
201 |
-
for j in range(j1, j2):
|
202 |
-
html_output.append(f'<p>{html.escape(sentences2[j])}</p>')
|
203 |
-
|
204 |
-
html_output.append('</div>')
|
205 |
-
html_output.append('</div>')
|
206 |
-
|
207 |
-
return ''.join(html_output)
|
208 |
-
|
209 |
-
def highlight_text_differences(diff_results, model_pair=None):
    """
    Generate an HTML report from difference-analysis results

    Args:
        diff_results (dict): Maps "<model1> vs <model2>" keys to analysis
            dicts (per the original docstring, the results of
            highlight_differences). Each analysis dict must provide
            "content_overlap", "pct_unique_first" and "pct_unique_second"
            (numbers that are multiplied by 100 for display) plus
            "significant_words_first", "significant_words_second",
            "unique_to_first" and "unique_to_second" (iterables of str).
        model_pair (str): Key of the pair to render (e.g., "GPT-4 vs Claude-3").
            When omitted, the first entry of diff_results is rendered and the
            two sides are labelled "Text 1" and "Text 2".

    Returns:
        str: HTML fragment with statistics, key terms and unique content, or
            a short <p> message when there is nothing to render.
    """
    if model_pair:
        if model_pair not in diff_results:
            # The requested key is echoed back, so it must be escaped.
            return (
                f"<p>Model pair '{html.escape(model_pair)}' "
                "not found in difference analysis.</p>"
            )
        analysis = diff_results[model_pair]
        # partition() never raises: a name that itself contains " vs " no
        # longer breaks tuple unpacking the way split(" vs ") did.
        model1, separator, model2 = model_pair.partition(" vs ")
        if not separator:
            model1, model2 = "Text 1", "Text 2"
    else:
        if not diff_results:
            return "<p>No difference analysis available.</p>"
        analysis = next(iter(diff_results.values()))
        model1, model2 = "Text 1", "Text 2"

    # Model names are interpolated into markup below; escape them once here.
    label1 = html.escape(model1)
    label2 = html.escape(model2)

    def _terms_column(label, words):
        """Build the 'key terms' column for one model."""
        column = [
            '<div style="flex: 1; padding: 10px;">',
            f'<h4>Key terms unique to {label}:</h4>',
        ]
        if words:
            column.append('<ul>')
            column.extend(f'<li>{html.escape(word)}</li>' for word in words)
            column.append('</ul>')
        else:
            column.append('<p>No significant unique terms found.</p>')
        column.append('</div>')
        return column

    def _content_column(label, sentences, color, container_style):
        """Build the 'unique content' column for one model."""
        column = [
            f'<div style="{container_style}">',
            f'<h4>Content unique to {label}:</h4>',
        ]
        if sentences:
            column.extend(
                f'<p style="background-color: {color};">{html.escape(sentence)}</p>'
                for sentence in sentences
            )
        else:
            column.append('<p>No unique content found.</p>')
        column.append('</div>')
        return column

    html_output = []

    # Overall statistics
    html_output.append('<div style="margin-bottom: 20px;">')
    html_output.append('<h3>Difference Analysis</h3>')
    html_output.append(f'<p><b>Content Overlap:</b> {analysis["content_overlap"]*100:.1f}%</p>')
    html_output.append(f'<p><b>Unique to {label1}:</b> {analysis["pct_unique_first"]*100:.1f}%</p>')
    html_output.append(f'<p><b>Unique to {label2}:</b> {analysis["pct_unique_second"]*100:.1f}%</p>')
    html_output.append('</div>')

    # Significant words
    html_output.append('<div style="display: flex; margin-bottom: 20px;">')
    html_output.extend(_terms_column(label1, analysis["significant_words_first"]))
    html_output.extend(_terms_column(label2, analysis["significant_words_second"]))
    html_output.append('</div>')

    # Unique content sections (red for the first text, green for the second)
    html_output.append('<div style="display: flex;">')
    html_output.extend(_content_column(
        label1,
        analysis["unique_to_first"],
        '#ffdddd',
        'flex: 1; padding: 10px; border-right: 1px solid #ccc;',
    ))
    html_output.extend(_content_column(
        label2,
        analysis["unique_to_second"],
        '#ddffdd',
        'flex: 1; padding: 10px;',
    ))
    html_output.append('</div>')

    return ''.join(html_output)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
processors/metrics.py
DELETED
@@ -1,258 +0,0 @@
|
|
1 |
-
import nltk
|
2 |
-
from nltk.tokenize import word_tokenize
|
3 |
-
from nltk.util import ngrams
|
4 |
-
from sklearn.feature_extraction.text import TfidfVectorizer
|
5 |
-
from sklearn.metrics.pairwise import cosine_similarity
|
6 |
-
import numpy as np
|
7 |
-
import re
|
8 |
-
from collections import Counter
|
9 |
-
|
10 |
-
# Download necessary NLTK data on first use.
# Recent NLTK releases (3.9+) load the tokenizer tables from 'punkt_tab',
# while older releases use 'punkt'; check both so tokenization works either way.
for _resource_path, _package in (
    ('tokenizers/punkt', 'punkt'),
    ('tokenizers/punkt_tab', 'punkt_tab'),
):
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        nltk.download(_package)
|
15 |
-
|
16 |
-
def preprocess_text(text):
    """
    Normalize text before similarity calculations

    The text is lowercased, every character that is neither an ASCII letter
    nor whitespace is dropped (not replaced by a space), and whitespace runs
    are collapsed to single spaces.

    Args:
        text (str): Input text

    Returns:
        str: Normalized text without leading or trailing whitespace
    """
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    return re.sub(r'\s+', ' ', letters_only).strip()
|
36 |
-
|
37 |
-
def calculate_cosine_similarity(text1, text2):
    """
    Calculate TF-IDF cosine similarity between two texts

    Args:
        text1 (str): First text
        text2 (str): Second text

    Returns:
        float: Cosine similarity score; 0.0 when neither text yields any
            token the vectorizer can use
    """
    documents = [preprocess_text(text1), preprocess_text(text2)]

    try:
        tfidf_matrix = TfidfVectorizer().fit_transform(documents)
    except ValueError:
        # TfidfVectorizer raises ValueError for an empty vocabulary (e.g. both
        # texts empty or made only of one-character tokens). Report "no
        # similarity" instead, matching the 0.0 the other metrics return.
        return 0.0

    score = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
    return float(score)
|
60 |
-
|
61 |
-
def calculate_jaccard_similarity(text1, text2):
    """
    Calculate Jaccard similarity between the word sets of two texts

    Args:
        text1 (str): First text
        text2 (str): Second text

    Returns:
        float: Shared distinct words divided by all distinct words, or 0.0
            when neither text contains a word
    """
    words_a = set(word_tokenize(preprocess_text(text1)))
    words_b = set(word_tokenize(preprocess_text(text2)))

    combined = words_a | words_b
    if not combined:
        return 0.0

    return len(words_a & words_b) / len(combined)
|
88 |
-
|
89 |
-
def calculate_ngram_overlap(text1, text2, n=2):
    """
    Calculate n-gram overlap between two texts

    Args:
        text1 (str): First text
        text2 (str): Second text
        n (int): Size of n-grams

    Returns:
        float: Jaccard ratio of the two sets of distinct n-grams, or 0.0
            when neither text produces an n-gram
    """
    def _gram_set(raw_text):
        """Return the distinct space-joined n-grams of one text."""
        tokens = word_tokenize(preprocess_text(raw_text))
        return {' '.join(gram) for gram in ngrams(tokens, n)}

    grams_a = _gram_set(text1)
    grams_b = _gram_set(text2)

    combined = grams_a | grams_b
    if not combined:
        return 0.0

    return len(grams_a & grams_b) / len(combined)
|
121 |
-
|
122 |
-
def calculate_semantic_similarity(text1, text2):
    """
    Approximate semantic similarity between two texts

    Note: This is a simplified stand-in for demonstration purposes. No
    language model or embedding is involved; the score is a fixed weighted
    blend of the lexical metrics defined in this module.

    Args:
        text1 (str): First text
        text2 (str): Second text

    Returns:
        float: 0.5 * cosine + 0.3 * Jaccard + 0.2 * bigram overlap
    """
    tfidf_score = calculate_cosine_similarity(text1, text2)
    token_score = calculate_jaccard_similarity(text1, text2)
    bigram_score = calculate_ngram_overlap(text1, text2, 2)

    # Placeholder weighting until an embedding-based similarity replaces it.
    return float((0.5 * tfidf_score) + (0.3 * token_score) + (0.2 * bigram_score))
|
145 |
-
|
146 |
-
def calculate_lexical_diversity(text):
    """
    Calculate lexical diversity (type-token ratio)

    Args:
        text (str): Input text

    Returns:
        float: Distinct tokens divided by total tokens, or 0.0 for text
            without tokens
    """
    tokens = word_tokenize(preprocess_text(text))
    return len(set(tokens)) / len(tokens) if tokens else 0.0
|
167 |
-
|
168 |
-
def calculate_complexity(text):
    """
    Calculate linguistic complexity metrics

    Args:
        text (str): Input text

    Returns:
        dict: "avg_sentence_length" (words per sentence), "avg_word_length"
            (characters per word) and "lexical_diversity" (type-token
            ratio), all floats; each is 0.0 when it cannot be computed
    """
    # Only lowercase here so sentence boundaries stay intact for the tokenizer.
    text_lower = text.lower()

    sentences = nltk.sent_tokenize(text_lower)
    # word_tokenize emits punctuation marks as separate tokens; counting them
    # as words would inflate sentence length and deflate word length, so keep
    # only tokens that contain at least one letter or digit.
    words = [
        token for token in word_tokenize(text_lower)
        if any(char.isalnum() for char in token)
    ]

    avg_sentence_length = len(words) / len(sentences) if sentences else 0
    avg_word_length = sum(len(word) for word in words) / len(words) if words else 0
    lexical_diversity = calculate_lexical_diversity(text)

    return {
        "avg_sentence_length": float(avg_sentence_length),
        "avg_word_length": float(avg_word_length),
        "lexical_diversity": float(lexical_diversity)
    }
|
199 |
-
|
200 |
-
def calculate_similarity(text1, text2, methods=None):
    """
    Run the requested similarity methods on a pair of texts

    Args:
        text1 (str): First text
        text2 (str): Second text
        methods (list): Names of the methods to apply. Recognized names are
            "Cosine Similarity", "Jaccard Similarity", "N-gram Overlap",
            "Semantic Similarity" and "Complexity Comparison". Defaults to
            cosine similarity only.

    Returns:
        dict: One entry per computed metric; "N-gram Overlap" contributes
            three entries (1-, 2- and 3-gram overlap)
    """
    selected = ["Cosine Similarity"] if methods is None else methods
    scores = {}

    if "Cosine Similarity" in selected:
        scores["cosine_similarity"] = calculate_cosine_similarity(text1, text2)

    if "Jaccard Similarity" in selected:
        scores["jaccard_similarity"] = calculate_jaccard_similarity(text1, text2)

    if "N-gram Overlap" in selected:
        for size in (1, 2, 3):
            scores[f"{size}-gram_overlap"] = calculate_ngram_overlap(text1, text2, size)

    if "Semantic Similarity" in selected:
        scores["semantic_similarity"] = calculate_semantic_similarity(text1, text2)

    if "Complexity Comparison" in selected:
        first = calculate_complexity(text1)
        second = calculate_complexity(text2)
        compared_metrics = ("avg_sentence_length", "avg_word_length", "lexical_diversity")

        scores["complexity_comparison"] = {
            "text1_complexity": first,
            "text2_complexity": second,
            # Signed difference: positive means text1 scores higher.
            "complexity_difference": {
                metric: first[metric] - second[metric] for metric in compared_metrics
            },
        }

    return scores
|
246 |
-
|
247 |
-
def calculate_diversity(text):
    """
    Calculate lexical diversity and other metrics

    Thin alias that delegates to calculate_complexity.

    Args:
        text (str): Input text

    Returns:
        dict: The metrics returned by calculate_complexity
    """
    return calculate_complexity(text)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
processors/ngram_analysis.py
DELETED
@@ -1,208 +0,0 @@
|
|
1 |
-
from sklearn.feature_extraction.text import CountVectorizer
|
2 |
-
import numpy as np
|
3 |
-
from collections import Counter
|
4 |
-
import re
|
5 |
-
import nltk
|
6 |
-
from nltk.corpus import stopwords
|
7 |
-
from nltk.util import ngrams
|
8 |
-
from nltk.tokenize import word_tokenize
|
9 |
-
|
10 |
-
# Download necessary NLTK data on first use.
# Recent NLTK releases (3.9+) load the tokenizer tables from 'punkt_tab',
# while older releases use 'punkt'; check both so tokenization works either way.
for _resource_path, _package in (
    ('tokenizers/punkt', 'punkt'),
    ('tokenizers/punkt_tab', 'punkt_tab'),
    ('corpora/stopwords', 'stopwords'),
):
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        nltk.download(_package)
|
20 |
-
|
21 |
-
def preprocess_text(text):
    """
    Tokenize text for n-gram analysis

    The text is lowercased and stripped of everything except ASCII letters,
    whitespace and the punctuation marks . , ! ? before tokenization.

    Args:
        text (str): Input text

    Returns:
        list: List of preprocessed tokens
    """
    cleaned = re.sub(r'[^a-zA-Z\s.,!?]', '', text.lower())

    # Stop words are deliberately kept: they carry context inside n-grams.
    return word_tokenize(cleaned)
|
45 |
-
|
46 |
-
def extract_ngrams(text, n=2):
    """
    Count the n-grams that occur in a text

    Args:
        text (str): Input text
        n (int): Size of n-grams to extract

    Returns:
        dict: Maps each n-gram (tokens joined by single spaces) to its
            number of occurrences
    """
    tokens = preprocess_text(text)

    # Join each token tuple into a string so it can serve as a plain dict key.
    joined_grams = (' '.join(gram) for gram in ngrams(tokens, n))

    return dict(Counter(joined_grams))
|
70 |
-
|
71 |
-
def compare_ngrams(texts, model_names, n=2, top_n=10):
    """
    Compare n-grams across different texts

    All rankings are deterministic: n-grams with equal scores keep the order
    in which they first occur in the texts, rather than depending on set
    iteration order (which varies between runs for strings).

    Args:
        texts (list): List of text responses
        model_names (list): Names of models corresponding to responses
        n (int): Size of n-grams to extract
        top_n (int): Number of top n-grams to consider

    Returns:
        dict: Comparative analysis with the keys "n", "top_ngrams",
            "unique_ngrams", "similarities", "differential_ngrams",
            "ngram_matrix" and "models"
    """
    # N-gram counts per model; dict order follows first occurrence in the text.
    model_ngrams = {
        model: extract_ngrams(text, n) for text, model in zip(texts, model_names)
    }

    def _top_entries(pairs):
        """Return the top_n (ngram, count) pairs as dicts, highest count first."""
        # sorted() is stable even with reverse=True, so ties keep input order.
        ranked = sorted(pairs, key=lambda item: item[1], reverse=True)
        return [{"ngram": ngram, "count": count} for ngram, count in ranked[:top_n]]

    # Most frequent n-grams for each model
    top_ngrams = {
        model: _top_entries(counts.items()) for model, counts in model_ngrams.items()
    }

    # N-grams that appear in one model's text and in no other model's text
    unique_by_model = {}
    for i, model1 in enumerate(model_names):
        other_ngrams = set()
        for j, model2 in enumerate(model_names):
            if i != j:
                other_ngrams.update(model_ngrams[model2])

        # Iterate the model's own dict (not a set difference) to keep a
        # reproducible order among equally frequent n-grams.
        unique_by_model[model1] = _top_entries(
            (ngram, count)
            for ngram, count in model_ngrams[model1].items()
            if ngram not in other_ngrams
        )

    # Pairwise Jaccard similarity between models
    similarities = {}
    for i, model1 in enumerate(model_names):
        for j, model2 in enumerate(model_names):
            if j <= i:  # Avoid duplicate comparisons
                continue

            counts1 = model_ngrams[model1]
            counts2 = model_ngrams[model2]

            # Shared n-grams in the order they occur in the first model's text
            common = [ngram for ngram in counts1 if ngram in counts2]
            union_size = len(counts1) + len(counts2) - len(common)

            jaccard = len(common) / union_size if union_size > 0 else 0

            similarities[f"{model1} vs {model2}"] = {
                "jaccard_similarity": jaccard,
                "common_ngrams": common[:top_n]
            }

    # Every n-gram seen in any model, in first-seen order
    all_ngrams = {}
    for counts in model_ngrams.values():
        all_ngrams.update(dict.fromkeys(counts))

    # Variance of the counts across models: high variance marks n-grams whose
    # usage differs most. Needs at least two models to be meaningful.
    ngram_variances = {}
    if len(model_ngrams) > 1:
        for ngram in all_ngrams:
            ngram_variances[ngram] = np.var(
                [counts.get(ngram, 0) for counts in model_ngrams.values()]
            )

    top_diff_ngrams = sorted(
        ngram_variances.items(), key=lambda item: item[1], reverse=True
    )[:top_n]
    differential_ngrams = [ngram for ngram, _ in top_diff_ngrams]

    # Per-model counts for the most differential n-grams
    ngram_matrix = {
        ngram: {model: counts.get(ngram, 0) for model, counts in model_ngrams.items()}
        for ngram in differential_ngrams
    }

    return {
        "n": n,
        "top_ngrams": top_ngrams,
        "unique_ngrams": unique_by_model,
        "similarities": similarities,
        "differential_ngrams": differential_ngrams,
        "ngram_matrix": ngram_matrix,
        "models": model_names
    }
|
171 |
-
|
172 |
-
def unique_ngrams(text1, text2, n=2, top_n=10):
    """
    Find unique n-grams in one text vs another

    Args:
        text1 (str): First text
        text2 (str): Second text
        n (int): Size of n-grams
        top_n (int): Maximum number of n-grams reported per text

    Returns:
        dict: "unique_to_first" and "unique_to_second", each a list of
            {"ngram", "count"} dicts sorted by descending count; equally
            frequent n-grams keep their order of first occurrence
    """
    ngrams1 = extract_ngrams(text1, n)
    ngrams2 = extract_ngrams(text2, n)

    def _exclusive(own, other):
        """Rank the n-grams of `own` that never occur in `other`."""
        # Filtering the dict (instead of a set difference) keeps tie order
        # reproducible between runs; the stable sort preserves it.
        pairs = [(ngram, count) for ngram, count in own.items() if ngram not in other]
        pairs.sort(key=lambda item: item[1], reverse=True)
        return [{"ngram": ngram, "count": count} for ngram, count in pairs[:top_n]]

    return {
        "unique_to_first": _exclusive(ngrams1, ngrams2),
        "unique_to_second": _exclusive(ngrams2, ngrams1)
    }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
processors/topic_modeling.py
DELETED
@@ -1,183 +0,0 @@
|
|
1 |
-
from sklearn.feature_extraction.text import TfidfVectorizer
|
2 |
-
from sklearn.decomposition import LatentDirichletAllocation, NMF
|
3 |
-
import numpy as np
|
4 |
-
import re
|
5 |
-
import nltk
|
6 |
-
from nltk.corpus import stopwords
|
7 |
-
from nltk.stem import WordNetLemmatizer
|
8 |
-
from nltk.tokenize import word_tokenize
|
9 |
-
|
10 |
-
# Download necessary NLTK data on first use.
# Recent NLTK releases (3.9+) load the tokenizer tables from 'punkt_tab',
# while older releases use 'punkt'; check both so tokenization works either way.
for _resource_path, _package in (
    ('tokenizers/punkt', 'punkt'),
    ('tokenizers/punkt_tab', 'punkt_tab'),
    ('corpora/stopwords', 'stopwords'),
    ('corpora/wordnet', 'wordnet'),
):
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        nltk.download(_package)
|
25 |
-
|
26 |
-
def preprocess_text(text):
    """
    Clean and lemmatize text for topic modeling

    Args:
        text (str): Input text

    Returns:
        str: Lowercased, letters-only text with English stop words removed
            and the remaining tokens lemmatized, joined by single spaces
    """
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    raw_tokens = word_tokenize(letters_only)

    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    return ' '.join(
        lemmatizer.lemmatize(token) for token in raw_tokens if token not in stop_words
    )
|
55 |
-
|
56 |
-
def extract_topics(texts, num_topics=3, method='lda'):
    """
    Extract main topics using topic modeling

    Args:
        texts (list): List of text documents
        num_topics (int): Number of topics to extract
        method (str): Method to use ('lda' or 'nmf'); any value other than
            'nmf' selects LDA

    Returns:
        dict: "topics" maps "Topic_<k>" to its top keywords (at most 10) and
            total weight; "document_topics" holds one {"Topic_<k>": share}
            dict per input document, in input order
    """
    preprocessed_texts = [preprocess_text(text) for text in texts]

    n_docs = len(preprocessed_texts)
    if n_docs == 0:
        return {"topics": {}, "document_topics": []}

    # min_df=2 together with max_df=0.8 is unsatisfiable for fewer than three
    # documents (0.8 * n_docs < 2) and makes TfidfVectorizer raise ValueError.
    # Small corpora, such as a single pair of responses, keep every term.
    if n_docs >= 3:
        min_df, max_df = 2, 0.8
    else:
        min_df, max_df = 1, 1.0

    vectorizer = TfidfVectorizer(max_features=1000, min_df=min_df, max_df=max_df)
    tfidf_matrix = vectorizer.fit_transform(preprocessed_texts)
    feature_names = vectorizer.get_feature_names_out()

    if method == 'nmf':
        # Non-negative Matrix Factorization (often works better for short texts)
        model = NMF(n_components=num_topics, random_state=42)
    else:
        # Latent Dirichlet Allocation
        model = LatentDirichletAllocation(n_components=num_topics, random_state=42)

    model.fit(tfidf_matrix)

    # Extract topics and keywords
    topics = {}
    for topic_idx, topic in enumerate(model.components_):
        # Indices of the 10 highest-weighted terms, largest first
        top_keyword_indices = topic.argsort()[:-11:-1]
        top_keywords = [feature_names[i] for i in top_keyword_indices]

        topics[f"Topic_{topic_idx+1}"] = {
            "keywords": top_keywords,
            "weight": float(topic.sum())  # Convert to float for JSON serialization
        }

    # Topic distribution for each document (same call for both model types)
    doc_topic_dist = model.transform(tfidf_matrix)

    # Convert to list of dictionaries for JSON serialization
    doc_topics = []
    for doc_dist in doc_topic_dist:
        # Normalize to sum to 1; an all-zero row is left untouched
        total = doc_dist.sum()
        if total > 0:
            doc_dist = doc_dist / total

        doc_topics.append({
            f"Topic_{topic_idx+1}": float(weight)
            for topic_idx, weight in enumerate(doc_dist)
        })

    return {
        "topics": topics,
        "document_topics": doc_topics
    }
|
121 |
-
|
122 |
-
def compare_topics(texts, model_names, num_topics=3):
    """
    Compare topics across different model responses

    Args:
        texts (list): List of text responses
        model_names (list): List of model names corresponding to responses
        num_topics (int): Number of topics to extract

    Returns:
        dict: "topics" (shared topic definitions), "model_topics" (topic
            distribution per model), "primary_topics" (highest-weighted
            topic per model) and "models"
    """
    extracted = extract_topics(texts, num_topics)

    # The i-th document distribution belongs to the i-th model name.
    model_topics = {
        name: extracted["document_topics"][position]
        for position, name in enumerate(model_names)
    }

    # Highest-weighted topic per model; the first topic wins a tie.
    primary_topics = {}
    for name, distribution in model_topics.items():
        dominant = max(distribution, key=distribution.get)
        primary_topics[name] = {
            "topic": dominant,
            "weight": distribution[dominant]
        }

    return {
        "topics": extracted["topics"],
        "model_topics": model_topics,
        "primary_topics": primary_topics,
        "models": model_names
    }
|
160 |
-
|
161 |
-
def topic_similarity(topic1, topic2):
    """
    Calculate the Jaccard similarity of two topics' keyword sets

    Args:
        topic1 (dict): First topic; its "keywords" entry is read
        topic2 (dict): Second topic; its "keywords" entry is read

    Returns:
        float: Shared keywords divided by all distinct keywords, or 0.0 when
            both topics have no keywords
    """
    first_terms = set(topic1["keywords"])
    second_terms = set(topic2["keywords"])

    combined = first_terms | second_terms
    if not combined:
        return 0.0

    return len(first_terms & second_terms) / len(combined)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ui/analysis_screen.py
CHANGED
@@ -2,12 +2,12 @@ import gradio as gr
|
|
2 |
import json
|
3 |
|
4 |
# Import analysis modules
|
5 |
-
from processors.topic_modeling import extract_topics, compare_topics
|
6 |
-
from processors.ngram_analysis import compare_ngrams
|
7 |
-
from processors.bias_detection import compare_bias
|
8 |
from processors.bow_analysis import compare_bow
|
9 |
-
from processors.metrics import calculate_similarity
|
10 |
-
from processors.diff_highlighter import highlight_differences
|
11 |
|
12 |
def create_analysis_screen():
|
13 |
"""
|
|
|
2 |
import json
|
3 |
|
4 |
# Import analysis modules
|
5 |
+
#from processors.topic_modeling import extract_topics, compare_topics
|
6 |
+
#from processors.ngram_analysis import compare_ngrams
|
7 |
+
#from processors.bias_detection import compare_bias
|
8 |
from processors.bow_analysis import compare_bow
|
9 |
+
#from processors.metrics import calculate_similarity
|
10 |
+
#from processors.diff_highlighter import highlight_differences
|
11 |
|
12 |
def create_analysis_screen():
|
13 |
"""
|