Ryan committed · Commit 1a44569 · 1 Parent(s): e633a26

update

Browse files:
- .idea/workspace.xml +5 -2
- processors/bow_analysis.py +110 -29
.idea/workspace.xml
CHANGED
@@ -4,7 +4,10 @@
     <option name="autoReloadType" value="SELECTIVE" />
   </component>
   <component name="ChangeListManager">
-    <list default="true" id="8e67814c-7f04-433c-ab7a-2b65a1106d4c" name="Changes" comment=""
+    <list default="true" id="8e67814c-7f04-433c-ab7a-2b65a1106d4c" name="Changes" comment="">
+      <change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/processors/bow_analysis.py" beforeDir="false" afterPath="$PROJECT_DIR$/processors/bow_analysis.py" afterDir="false" />
+    </list>
     <option name="SHOW_DIALOG" value="false" />
     <option name="HIGHLIGHT_CONFLICTS" value="true" />
     <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
@@ -53,7 +56,7 @@
     <option name="presentableId" value="Default" />
     <updated>1745170754325</updated>
     <workItem from="1745170755404" duration="245000" />
-    <workItem from="1745172030020" duration="
+    <workItem from="1745172030020" duration="4469000" />
   </task>
   <servers />
 </component>
processors/bow_analysis.py
CHANGED
@@ -11,10 +11,114 @@ from nltk.stem import WordNetLemmatizer
 from nltk.tokenize import word_tokenize
 from processors.metrics import calculate_similarity
 
-
-
-"""
-
+# Define the compare_bow_across_texts function directly in this file
+def compare_bow_across_texts(texts, model_names, top_n=25):
+    """
+    Compare bag of words representations across multiple texts.
+
+    Args:
+        texts (list): List of text responses to compare
+        model_names (list): Names of models corresponding to responses
+        top_n (int): Number of top words to consider
+
+    Returns:
+        dict: Bag of words analysis results
+    """
+    # Initialize the results dictionary
+    result = {
+        "models": model_names,
+        "important_words": {},
+        "word_count_matrix": {},
+        "differential_words": []
+    }
+
+    # Make sure we have texts to analyze
+    if not texts or len(texts) < 1:
+        return result
+
+    # Preprocess texts (tokenize, remove stopwords, etc.)
+    preprocessed_texts = []
+    stop_words = set(stopwords.words('english'))
+    lemmatizer = WordNetLemmatizer()
+
+    for text in texts:
+        # Convert to lowercase and tokenize
+        tokens = word_tokenize(text.lower())
+
+        # Remove stopwords, punctuation, and lemmatize
+        filtered_tokens = []
+        for token in tokens:
+            if token.isalpha() and token not in stop_words and len(token) > 2:
+                filtered_tokens.append(lemmatizer.lemmatize(token))
+
+        preprocessed_texts.append(" ".join(filtered_tokens))
+
+    # Create bag of words representations using CountVectorizer
+    vectorizer = CountVectorizer(max_features=1000)
+    X = vectorizer.fit_transform(preprocessed_texts)
+
+    # Get feature names (words)
+    feature_names = vectorizer.get_feature_names_out()
+
+    # Create word count matrix
+    word_counts = {}
+    for i, model in enumerate(model_names):
+        counts = X[i].toarray()[0]
+        word_counts[model] = {}
+
+        # Store word frequencies for this model
+        for j, word in enumerate(feature_names):
+            if counts[j] > 0:  # Only store words that appear
+                word_counts[model][word] = int(counts[j])
+
+                # Add to word count matrix
+                if word not in result["word_count_matrix"]:
+                    result["word_count_matrix"][word] = {}
+                result["word_count_matrix"][word][model] = int(counts[j])
+
+    # Find important words for each model
+    for model, word_freq in word_counts.items():
+        # Sort by frequency
+        sorted_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
+
+        # Store top N words
+        result["important_words"][model] = [
+            {"word": word, "count": count}
+            for word, count in sorted_words[:top_n]
+        ]
+
+    # Calculate differential words (words with biggest frequency difference between models)
+    if len(model_names) >= 2:
+        model1, model2 = model_names[0], model_names[1]
+
+        # Calculate differences
+        diff_scores = {}
+        for word in result["word_count_matrix"]:
+            count1 = result["word_count_matrix"][word].get(model1, 0)
+            count2 = result["word_count_matrix"][word].get(model2, 0)
+
+            # Absolute difference
+            diff_scores[word] = abs(count1 - count2)
+
+        # Sort by difference
+        sorted_diffs = sorted(diff_scores.items(), key=lambda x: x[1], reverse=True)
+        result["differential_words"] = [word for word, _ in sorted_diffs[:top_n]]
+
+        # Calculate overlap statistics
+        model1_words = set(word_counts.get(model1, {}).keys())
+        model2_words = set(word_counts.get(model2, {}).keys())
+        common_words = model1_words.intersection(model2_words)
+
+        # Initialize comparisons if needed
+        if "comparisons" not in result:
+            result["comparisons"] = {}
+
+        comparison_key = f"{model1} vs {model2}"
+        result["comparisons"][comparison_key] = {
+            "common_word_count": len(common_words)
+        }
+
+    return result
 
 def add_similarity_metrics(bow_results, response_texts, model_names):
     """
@@ -69,8 +173,6 @@ def add_similarity_metrics(bow_results, response_texts, model_names):
 
     return bow_results
 
-# All existing imports and preprocessing functions remain unchanged
-
 def compare_bow(texts, model_names, top_n=25):
     """
     Compare bag of words between different texts
@@ -87,27 +189,6 @@ def compare_bow(texts, model_names, top_n=25):
 
     # Add similarity metrics to the results
     if len(texts) >= 2 and len(model_names) >= 2:
-
-        model1, model2 = model_names[0], model_names[1]
-        comparison_key = f"{model1} vs {model2}"
-
-        # Initialize comparisons dict if needed
-        if "comparisons" not in bow_results:
-            bow_results["comparisons"] = {}
-
-        # Initialize comparison entry if needed
-        if comparison_key not in bow_results["comparisons"]:
-            bow_results["comparisons"][comparison_key] = {}
-
-        # Calculate similarity metrics
-        text1, text2 = texts[0], texts[1]
-        metrics = calculate_similarity(text1, text2)
-
-        # Add metrics to the comparison
-        bow_results["comparisons"][comparison_key].update({
-            "cosine_similarity": metrics.get("cosine_similarity", 0),
-            "jaccard_similarity": metrics.get("jaccard_similarity", 0),
-            "semantic_similarity": metrics.get("semantic_similarity", 0)
-        })
+        bow_results = add_similarity_metrics(bow_results, texts, model_names)
 
-    return bow_results
+    return bow_results
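For reference, a minimal usage sketch of the new compare_bow_across_texts helper added in this commit. The sample texts and model names are illustrative, and it assumes the NLTK data the module relies on (punkt tokenizer models, stopwords, wordnet) is already downloaded; none of this is part of the commit itself:

# Illustrative usage sketch (inputs are hypothetical, not from the commit).
# Assumes the NLTK data used by the module (punkt tokenizer models,
# stopwords, wordnet) has already been downloaded.
from processors.bow_analysis import compare_bow_across_texts

texts = [
    "The quick brown fox jumps over the lazy dog.",     # hypothetical response from model_a
    "A quick brown dog sleeps while the fox watches.",  # hypothetical response from model_b
]
model_names = ["model_a", "model_b"]

result = compare_bow_across_texts(texts, model_names, top_n=10)
print(result["important_words"]["model_a"])         # top lemmas per model as {"word", "count"} dicts
print(result["differential_words"])                 # words with the largest count gap between the models
print(result["comparisons"]["model_a vs model_b"])  # {"common_word_count": ...}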