Spaces:
Sleeping
Sleeping
Ryan
committed on
Commit
·
41e3754
1
Parent(s):
6d70959
update
Browse files- processors/ngram_analysis.py +24 -15
processors/ngram_analysis.py
CHANGED
@@ -9,6 +9,22 @@ from nltk.util import ngrams
|
|
9 |
from nltk.tokenize import word_tokenize
|
10 |
from nltk.corpus import stopwords
|
11 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
|
13 |
def compare_ngrams(texts, model_names, n=2, top_n=25):
|
14 |
"""
|
@@ -52,23 +68,16 @@ def compare_ngrams(texts, model_names, n=2, top_n=25):
|
|
52 |
stop_words='english'
|
53 |
)
|
54 |
|
55 |
-
#
|
56 |
processed_texts = []
|
57 |
for text in texts:
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
flat_text.extend([str(subitem) for subitem in item])
|
66 |
-
else:
|
67 |
-
flat_text.append(str(item))
|
68 |
-
text = ' '.join(flat_text)
|
69 |
-
else:
|
70 |
-
text = str(text) # Convert to string if it's another type
|
71 |
-
processed_texts.append(text)
|
72 |
|
73 |
X = vectorizer.fit_transform(processed_texts)
|
74 |
|
|
|
9 |
from nltk.tokenize import word_tokenize
|
10 |
from nltk.corpus import stopwords
|
11 |
|
12 |
+
# Helper function to flatten arbitrarily nested lists.
def flatten_list(nested_list):
    """
    Recursively flatten a nested list, yielding leaf items lazily.

    Args:
        nested_list (list): A potentially nested list.

    Yields:
        The non-list leaf items of ``nested_list``, in depth-first order.

    Note:
        This is a generator function — it returns a generator, not a
        list. Wrap the call in ``list(...)`` or ``' '.join(map(str, ...))``
        to materialize the result. Only ``list`` instances are recursed
        into; tuples and other iterables are yielded as-is.
    """
    for item in nested_list:
        if isinstance(item, list):
            # Recurse into sub-lists and re-yield their leaves.
            yield from flatten_list(item)
        else:
            yield item
|
28 |
|
29 |
def compare_ngrams(texts, model_names, n=2, top_n=25):
|
30 |
"""
|
|
|
68 |
stop_words='english'
|
69 |
)
|
70 |
|
71 |
+
# Ensure all texts are strings and handle nested lists
|
72 |
processed_texts = []
|
73 |
for text in texts:
|
74 |
+
if isinstance(text, list):
|
75 |
+
# Flatten nested lists and join into a single string
|
76 |
+
flat_text = ' '.join(map(str, flatten_list(text)))
|
77 |
+
processed_texts.append(flat_text)
|
78 |
+
else:
|
79 |
+
# Convert non-string objects to strings
|
80 |
+
processed_texts.append(str(text))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
81 |
|
82 |
X = vectorizer.fit_transform(processed_texts)
|
83 |
|