Commit 2d9e425 · Parent(s): 2c58f4e
Ryan committed
update
Browse files
- .DS_Store +0 -0
- README.md +6 -0
- processors/bow_analysis.py +9 -5
- processors/ngram_analysis.py +5 -2
- processors/roberta_analysis.py +1 -1
.DS_Store CHANGED
Binary files a/.DS_Store and b/.DS_Store differ
README.md CHANGED
@@ -125,6 +125,8 @@ The RoBERTa sentiment analysis classifier will output which model is more positi
+The results are shown below.
+
@@ -133,6 +135,8 @@ The summary tab provides a summary of two of the prompts: the Trump and Harris p
+Below is the summary area filled in after clicking the button with the YOUR DATASET RESULTS selected.
+
@@ -142,6 +146,8 @@ This is a hard-coded tab that displays some basic graphs. The first one is a ba
+Below is the chart.
+
processors/bow_analysis.py CHANGED
@@ -1,16 +1,20 @@
 """
-Updated bow_analysis.py to include similarity metrics
+Updated bow_analysis.py to include similarity metrics.
+Preprocessing here is more advanced than in the n-gram version:
+lowercasing, tokenization, stopword removal, non-alphabetic character removal, short-word removal, and lemmatization.
 """
 from sklearn.feature_extraction.text import CountVectorizer
-import numpy as np
-from collections import Counter
-import re
-import nltk
 from nltk.corpus import stopwords
 from nltk.stem import WordNetLemmatizer
 from nltk.tokenize import word_tokenize
 from processors.metrics import calculate_similarity
 
+# Currently unused imports, left in case they are needed again.
+import numpy as np
+from collections import Counter
+import re
+import nltk
+
 # Define the compare_bow_across_texts function directly in this file
 def compare_bow_across_texts(texts, model_names, top_n=25):
     """
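For context, a minimal sketch of the preprocessing pipeline the new docstring describes. The helper name preprocess_for_bow and the min_len threshold are hypothetical; the actual implementation inside compare_bow_across_texts is not shown in this diff.

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# One-time downloads of the NLTK data these steps rely on.
nltk.download("punkt", quiet=True)
nltk.download("stopwords", quiet=True)
nltk.download("wordnet", quiet=True)

def preprocess_for_bow(text, min_len=3):
    # Lowercase and tokenize.
    tokens = word_tokenize(text.lower())
    # Keep alphabetic tokens; drop stopwords and short words.
    stops = set(stopwords.words("english"))
    kept = [t for t in tokens if t.isalpha() and len(t) >= min_len and t not in stops]
    # Lemmatize what remains.
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(t) for t in kept]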
processors/ngram_analysis.py CHANGED
@@ -1,9 +1,12 @@
 """
-N-gram analysis for comparing text responses
+N-gram analysis for comparing text responses.
+Minimal preprocessing is done here: essentially just stop-word removal and tokenization, which my research suggests is a good combination for n-gram analysis.
 """
 from sklearn.feature_extraction.text import CountVectorizer
-
+
+# These imports aren't currently used; they supported test versions whose code has since been removed, but they are left in case they are needed again.
 from collections import Counter
+import numpy as np
 import nltk
 from nltk.util import ngrams
 from nltk.tokenize import word_tokenize
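A sketch of what this minimal preprocessing plus n-gram counting might look like with the imports above. The helper name top_ngrams is hypothetical, and the punctuation filter (isalpha) is my addition, not necessarily what the module does.

from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.util import ngrams

def top_ngrams(text, n=2, top_n=10):
    # Minimal preprocessing: tokenize, then drop stopwords (and punctuation tokens).
    stops = set(stopwords.words("english"))
    tokens = [t for t in word_tokenize(text.lower()) if t.isalpha() and t not in stops]
    # Count and rank the most frequent n-grams.
    return Counter(ngrams(tokens, n)).most_common(top_n)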
processors/roberta_analysis.py CHANGED
@@ -2,7 +2,7 @@
 RoBERTa-based sentiment analysis for comparing LLM responses
 """
 import torch
-import numpy as np
+import numpy as np  # Ended up unused, but left in case it is needed later.
 from transformers import RobertaTokenizer, RobertaForSequenceClassification
 import nltk
 from nltk.tokenize import sent_tokenize
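A sketch of sentence-level sentiment scoring with these imports. The checkpoint name is an assumption (a common 3-class RoBERTa sentiment model); the diff does not show which model the app actually loads.

import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from nltk.tokenize import sent_tokenize

# Assumed checkpoint with negative/neutral/positive labels; swap in the app's actual model.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = RobertaTokenizer.from_pretrained(MODEL)
model = RobertaForSequenceClassification.from_pretrained(MODEL)

def sentence_sentiment(text):
    # Score each sentence; returns (sentence, [p_negative, p_neutral, p_positive]) pairs.
    results = []
    for sent in sent_tokenize(text):
        inputs = tokenizer(sent, return_tensors="pt", truncation=True)
        with torch.no_grad():
            logits = model(**inputs).logits
        probs = torch.softmax(logits, dim=-1).squeeze(0).tolist()
        results.append((sent, probs))
    return results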