Ryan committed on
Commit
c0e1d59
·
1 Parent(s): 1f1253e
Files changed (1) hide show
  1. processors/ngram_analysis.py +4 -23
processors/ngram_analysis.py CHANGED
@@ -67,29 +67,10 @@ def compare_ngrams(texts, model_names, n=2, top_n=25):
67
  max_features=1000,
68
  stop_words='english'
69
  )
70
-
71
- # Ensure all texts are strings and handle nested lists
72
- processed_texts = []
73
- for text in texts:
74
- try:
75
- if isinstance(text, list):
76
- # More thoroughly flatten and ensure we have a string
77
- flat_items = list(flatten_list(text))
78
- # Convert each item to string and join
79
- flat_text = ' '.join([str(item) for item in flat_items])
80
- processed_texts.append(flat_text)
81
- else:
82
- # Convert non-string objects to strings
83
- processed_texts.append(str(text))
84
-
85
- # Verify we have a valid string
86
- if not isinstance(processed_texts[-1], str):
87
- processed_texts[-1] = str(processed_texts[-1])
88
- except Exception as e:
89
- # Handle problematic text by adding empty string
90
- print(f"Warning: Error processing text: {e}")
91
- processed_texts.append("")
92
-
93
  X = vectorizer.fit_transform(processed_texts)
94
 
95
  # Get feature names (n-grams)
 
67
  max_features=1000,
68
  stop_words='english'
69
  )
70
+
71
+ # Ensure each text is a string, without attempting complex preprocessing
72
+ processed_texts = [str(text) if not isinstance(text, str) else text for text in texts]
73
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  X = vectorizer.fit_transform(processed_texts)
75
 
76
  # Get feature names (n-grams)