rodrigomasini committed on
Commit
21f2eeb
·
verified ·
1 Parent(s): 4044da4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +18 -2
app.py CHANGED
@@ -224,15 +224,31 @@ from textstat import textstat
224
  # return 0.4 * ((words / sentences_count) + 100 * (complex_words / words))
225
 
226
  def pre_process_text(text):
 
 
 
 
 
 
 
 
 
 
 
 
227
  sentences_list = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
 
228
  # Split the elements of the list by newline characters
229
  split_sentences = []
230
  for sentence in sentences_list:
231
  split_sentences.extend(re.split(r'\n+', sentence))
 
232
  # Remove empty elements
233
  cleaned_sentences = [sentence for sentence in split_sentences if sentence.strip()]
234
- string_sentences = (" ".join(cleaned_sentences))
235
- return string_sentences
 
 
236
 
237
  def flesch_kincaid_grade_level(text):
238
  sentences = pre_process_text(text)
 
224
  # return 0.4 * ((words / sentences_count) + 100 * (complex_words / words))
225
 
226
  def pre_process_text(text):
227
+ # Normalize line breaks and whitespace
228
+ text = re.sub(r'\n\s*\n', '\n\n', text.strip())
229
+
230
+ # Split the text into sections
231
+ sections = re.split(r'\n{2,}', text)
232
+
233
+ # Remove empty strings from the split result
234
+ sections = [section.strip() for section in sections if section.strip()]
235
+
236
+ # Combine sections into a single string
237
+ combined_text = '\n\n'.join(sections)
238
+
239
  sentences_list = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
240
+ print(sentences_list)
241
  # Split the elements of the list by newline characters
242
  split_sentences = []
243
  for sentence in sentences_list:
244
  split_sentences.extend(re.split(r'\n+', sentence))
245
+
246
  # Remove empty elements
247
  cleaned_sentences = [sentence for sentence in split_sentences if sentence.strip()]
248
+
249
+ combined_text = (" ".join(cleaned_sentences))
250
+
251
+ return combined_text
252
 
253
  def flesch_kincaid_grade_level(text):
254
  sentences = pre_process_text(text)