Ryan committed on
Commit
41e3754
·
1 Parent(s): 6d70959
Files changed (1) hide show
  1. processors/ngram_analysis.py +24 -15
processors/ngram_analysis.py CHANGED
@@ -9,6 +9,22 @@ from nltk.util import ngrams
9
  from nltk.tokenize import word_tokenize
10
  from nltk.corpus import stopwords
11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
  def compare_ngrams(texts, model_names, n=2, top_n=25):
14
  """
@@ -52,23 +68,16 @@ def compare_ngrams(texts, model_names, n=2, top_n=25):
52
  stop_words='english'
53
  )
54
 
55
- # Make sure texts are strings before processing
56
  processed_texts = []
57
  for text in texts:
58
- # If text is not a string (e.g., it's a list), convert it to a string
59
- if not isinstance(text, str):
60
- if isinstance(text, list):
61
- # Handle potentially nested lists by flattening
62
- flat_text = []
63
- for item in text:
64
- if isinstance(item, list):
65
- flat_text.extend([str(subitem) for subitem in item])
66
- else:
67
- flat_text.append(str(item))
68
- text = ' '.join(flat_text)
69
- else:
70
- text = str(text) # Convert to string if it's another type
71
- processed_texts.append(text)
72
 
73
  X = vectorizer.fit_transform(processed_texts)
74
 
 
9
  from nltk.tokenize import word_tokenize
10
  from nltk.corpus import stopwords
11
 
12
+ # Helper function to flatten nested lists
13
def flatten_list(nested_list):
    """
    Lazily flatten an arbitrarily nested list.

    Only ``list`` instances are descended into; every other object
    (strings, tuples, numbers, ...) is yielded unchanged, in the original
    left-to-right, depth-first order.

    Note that this is a generator function: it returns an iterator, not a
    ``list``. Wrap the call in ``list(...)`` if a concrete list is needed.
    Self-referential (cyclic) lists are not supported.

    Args:
        nested_list (list): A potentially nested list.

    Yields:
        Each non-list item found in ``nested_list``.
    """
    # An explicit stack of iterators replaces recursion so that very deeply
    # nested input cannot exhaust the interpreter's recursion limit.
    pending = [iter(nested_list)]
    while pending:
        for item in pending[-1]:
            if isinstance(item, list):
                # Descend: the parent iterator stays on the stack and is
                # resumed exactly where it stopped once the child is done.
                pending.append(iter(item))
                break
            yield item
        else:
            # The innermost iterator is exhausted; return to its parent.
            pending.pop()
28
 
29
  def compare_ngrams(texts, model_names, n=2, top_n=25):
30
  """
 
68
  stop_words='english'
69
  )
70
 
71
+ # Ensure all texts are strings and handle nested lists
72
  processed_texts = []
73
  for text in texts:
74
+ if isinstance(text, list):
75
+ # Flatten nested lists and join into a single string
76
+ flat_text = ' '.join(map(str, flatten_list(text)))
77
+ processed_texts.append(flat_text)
78
+ else:
79
+ # Convert non-string objects to strings
80
+ processed_texts.append(str(text))
 
 
 
 
 
 
 
81
 
82
  X = vectorizer.fit_transform(processed_texts)
83