hugpv commited on
Commit
601fcdb
·
1 Parent(s): 025d629

synced with dev version. Mostly OCR related

Browse files
app.py CHANGED
@@ -22,6 +22,11 @@ import zipfile
22
  from matplotlib import font_manager
23
  import os
24
 
 
 
 
 
 
25
  from multi_proc_funcs import (
26
  ALL_FIX_MEASURES,
27
  COLORS,
@@ -236,7 +241,7 @@ COLNAME_CANDIDATES_CUSTOM_CSV_FIX = {
236
  "trial_id_col_name_fix": ["trial_id", "trialid", "trial", "trial_num", "id"],
237
  "subject_col_name_fix": ["subject", "sub", "subid", "sub_id"],
238
  "time_start_col_name_fix": ["start", "start_time", "ts", "t_start", "starttime"],
239
- "time_stop_col_name_fix": ["stop", "stop_time", "te", "t_end", "t_stop", "stoptime"],
240
  }
241
  COLNAME_CANDIDATES_CUSTOM_CSV_FIX_DEFAULT = {k: v[0] for k, v in COLNAME_CANDIDATES_CUSTOM_CSV_FIX.items()}
242
 
@@ -923,6 +928,8 @@ def make_trial_from_stimulus_df(
923
  stim_plot_df,
924
  filename,
925
  trial_id,
 
 
926
  ):
927
  chars_list = []
928
  words_list = []
@@ -931,6 +938,12 @@ def make_trial_from_stimulus_df(
931
  chars_list.append(char_dict)
932
 
933
  words_list, chars_list = ut.add_words(chars_list)
 
 
 
 
 
 
934
  letter_width_avg = np.mean([x["char_xmax"] - x["char_xmin"] for x in chars_list if x["char_xmax"] > x["char_xmin"]])
935
  line_heights = [x["char_ymax"] - x["char_ymin"] for x in chars_list]
936
  line_xcoords_all = [x["char_x_center"] for x in chars_list]
@@ -1034,12 +1047,15 @@ def get_fixations_file_trials_list(dffix, stimulus):
1034
  for trial_id, subdf in stqdm(enum, desc="Creating trials"):
1035
  if isinstance(stimulus, pd.DataFrame):
1036
  stim_df = stimulus[stimulus.trial_id == subdf["trial_id"].iloc[0]]
 
 
1037
 
1038
  stim_df = stim_df.dropna(axis=0, how="all")
1039
  subdf = subdf.dropna(axis=0, how="all")
1040
  stim_df = stim_df.dropna(axis=1, how="all")
1041
  subdf = subdf.dropna(axis=1, how="all")
1042
  if subdf.empty:
 
1043
  continue
1044
  subdf = subdf.reset_index(drop=True).copy()
1045
  stim_df = stim_df.reset_index(drop=True).copy()
@@ -1048,6 +1064,8 @@ def get_fixations_file_trials_list(dffix, stimulus):
1048
  stim_df,
1049
  st.session_state["single_csv_file_stim"].name,
1050
  trial_id,
 
 
1051
  )
1052
  else:
1053
  if "trial_id" in stimulus.keys() and (
@@ -1072,20 +1090,25 @@ def get_fixations_file_trials_list(dffix, stimulus):
1072
 
1073
  return trials_by_ids, trial_keys
1074
 
 
 
 
 
 
1075
 
1076
  def load_csv_delim_agnostic(file_path):
1077
  try:
1078
  df = pd.read_csv(file_path)
1079
  if df.shape[1] > 1:
1080
- return df
1081
  else:
1082
  dec_file = get_decoded_input_from_file(file_path)
1083
  df = pd.read_csv(StringIO(dec_file.replace(";", ",").replace("\t", ",")))
1084
- return df
1085
  except Exception as e:
1086
  dec_file = get_decoded_input_from_file(file_path)
1087
  df = pd.read_csv(StringIO(dec_file.replace(";", ",").replace("\t", ",")))
1088
- return df
1089
 
1090
 
1091
  def find_col_name_suggestions(cols, candidates_dict):
@@ -1374,6 +1397,7 @@ def main():
1374
  trial_choices_single_asc, trials_by_ids, lines, asc_file, trials_dict = ut.get_trials_list(
1375
  st.session_state["single_asc_file_asc"],
1376
  close_gap_between_words=st.session_state["close_gap_between_words_single_asc"],
 
1377
  paragraph_trials_only=st.session_state["paragraph_trials_only_single_asc"],
1378
  ias_files=st.session_state["single_asc_file_ias_files"],
1379
  trial_start_keyword=trial_start_keyword,
@@ -1981,10 +2005,22 @@ def main():
1981
  "Select .csv or .json file containing the stimulus data",
1982
  accept_multiple_files=False,
1983
  key="single_csv_file_stim_uploaded",
1984
- type={"json", "csv", "txt", "dat"},
1985
- help="Drag and drop or select a single .json, .csv, .txt or .dat file that you wish to process as the stimulus file for the uploaded fixation data. This can be left blank if you chose to use the examples.",
1986
  )
1987
 
 
 
 
 
 
 
 
 
 
 
 
 
1988
  use_example_or_uploaded_file_choice = st.radio(
1989
  "Should the uploaded files be used or some example files?",
1990
  index=1,
@@ -2024,6 +2060,11 @@ def main():
2024
  trial = json.loads(decoded_input)
2025
  st.session_state["stimdf_single_csv"] = trial
2026
  colnames_stim = list(st.session_state["stimdf_single_csv"].keys())
 
 
 
 
 
2027
  else:
2028
  st.session_state["stimdf_single_csv"] = load_csv_delim_agnostic(single_csv_stim_file)
2029
  colnames_stim = st.session_state["stimdf_single_csv"].columns
@@ -2407,6 +2448,7 @@ def main():
2407
  use_corrected_fixations=True,
2408
  correction_algo=st.session_state["algo_choice_custom_eyekit"],
2409
  save_to_csv=True,
 
2410
  )
2411
  st.dataframe(own_word_measures, use_container_width=True, hide_index=True, height=200)
2412
  own_word_measures_csv = convert_df(own_word_measures)
@@ -3056,6 +3098,7 @@ def main():
3056
  use_corrected_fixations=True,
3057
  correction_algo=st.session_state["algo_choice_multi_asc_eyekit"],
3058
  save_to_csv=True,
 
3059
  )
3060
  if "sentence_measures_multi_asc" in st.session_state:
3061
  sent_measures_multi = st.session_state["sentence_measures_multi_asc"]
@@ -3219,6 +3262,12 @@ def show_file_parsing_settings(suffix: str):
3219
  key=f"close_gap_between_words{suffix}",
3220
  help="If this is selected, each word bounding box will include half the spaces between adjacent words. If not, the word bounding boxes will simply be the combined bounding boxes of the letters making up the word.", # TODO check if this affects analysis
3221
  )
 
 
 
 
 
 
3222
  st.markdown("### Trial filtering settings")
3223
 
3224
  st.checkbox(
 
22
  from matplotlib import font_manager
23
  import os
24
 
25
+ try:
26
+ from create_interest_areas_from_image import recognize_text
27
+ except Exception as e:
28
+ print(e)
29
+
30
  from multi_proc_funcs import (
31
  ALL_FIX_MEASURES,
32
  COLORS,
 
241
  "trial_id_col_name_fix": ["trial_id", "trialid", "trial", "trial_num", "id"],
242
  "subject_col_name_fix": ["subject", "sub", "subid", "sub_id"],
243
  "time_start_col_name_fix": ["start", "start_time", "ts", "t_start", "starttime"],
244
+ "time_stop_col_name_fix": ["end","stop", "stop_time", "te", "t_end", "t_stop", "stoptime"],
245
  }
246
  COLNAME_CANDIDATES_CUSTOM_CSV_FIX_DEFAULT = {k: v[0] for k, v in COLNAME_CANDIDATES_CUSTOM_CSV_FIX.items()}
247
 
 
928
  stim_plot_df,
929
  filename,
930
  trial_id,
931
+ close_gaps_between_words:bool,
932
+ close_gaps_between_lines:bool,
933
  ):
934
  chars_list = []
935
  words_list = []
 
938
  chars_list.append(char_dict)
939
 
940
  words_list, chars_list = ut.add_words(chars_list)
941
+ if close_gaps_between_words:
942
+ words_list = ut.close_gaps_in_words_list(words_list)
943
+ if close_gaps_between_lines:
944
+ chars_list = ut.close_gaps_between_lines(chars_list,prefix='char')
945
+ words_list = ut.close_gaps_between_lines(words_list,prefix='word')
946
+
947
  letter_width_avg = np.mean([x["char_xmax"] - x["char_xmin"] for x in chars_list if x["char_xmax"] > x["char_xmin"]])
948
  line_heights = [x["char_ymax"] - x["char_ymin"] for x in chars_list]
949
  line_xcoords_all = [x["char_x_center"] for x in chars_list]
 
1047
  for trial_id, subdf in stqdm(enum, desc="Creating trials"):
1048
  if isinstance(stimulus, pd.DataFrame):
1049
  stim_df = stimulus[stimulus.trial_id == subdf["trial_id"].iloc[0]]
1050
+ if stim_df.empty:
1051
+ st.session_state["logger"].warning(f"stim_df dataframe is empty because trial_id {trial_id} not in stimulus trial ids:\n{stimulus.trial_id.unique()}")
1052
 
1053
  stim_df = stim_df.dropna(axis=0, how="all")
1054
  subdf = subdf.dropna(axis=0, how="all")
1055
  stim_df = stim_df.dropna(axis=1, how="all")
1056
  subdf = subdf.dropna(axis=1, how="all")
1057
  if subdf.empty:
1058
+ st.session_state["logger"].warning(f"Sub dataframe is empty for trial_id {trial_id}")
1059
  continue
1060
  subdf = subdf.reset_index(drop=True).copy()
1061
  stim_df = stim_df.reset_index(drop=True).copy()
 
1064
  stim_df,
1065
  st.session_state["single_csv_file_stim"].name,
1066
  trial_id,
1067
+ close_gaps_between_words=st.session_state["close_gap_between_words_single_csv"],
1068
+ close_gaps_between_lines=st.session_state["close_gap_between_lines_single_csv"],
1069
  )
1070
  else:
1071
  if "trial_id" in stimulus.keys() and (
 
1090
 
1091
  return trials_by_ids, trial_keys
1092
 
1093
+ def make_ints_float(df):
1094
+ for col in df.columns:
1095
+ if 'int' in str(df[col].dtype).lower():
1096
+ df[col] = pd.to_numeric(df[col], downcast='float')
1097
+ return df
1098
 
1099
  def load_csv_delim_agnostic(file_path):
1100
  try:
1101
  df = pd.read_csv(file_path)
1102
  if df.shape[1] > 1:
1103
+ return make_ints_float(df)
1104
  else:
1105
  dec_file = get_decoded_input_from_file(file_path)
1106
  df = pd.read_csv(StringIO(dec_file.replace(";", ",").replace("\t", ",")))
1107
+ return make_ints_float(df)
1108
  except Exception as e:
1109
  dec_file = get_decoded_input_from_file(file_path)
1110
  df = pd.read_csv(StringIO(dec_file.replace(";", ",").replace("\t", ",")))
1111
+ return make_ints_float(df)
1112
 
1113
 
1114
  def find_col_name_suggestions(cols, candidates_dict):
 
1397
  trial_choices_single_asc, trials_by_ids, lines, asc_file, trials_dict = ut.get_trials_list(
1398
  st.session_state["single_asc_file_asc"],
1399
  close_gap_between_words=st.session_state["close_gap_between_words_single_asc"],
1400
+ close_gap_between_lines=st.session_state["close_gap_between_lines_single_asc"],
1401
  paragraph_trials_only=st.session_state["paragraph_trials_only_single_asc"],
1402
  ias_files=st.session_state["single_asc_file_ias_files"],
1403
  trial_start_keyword=trial_start_keyword,
 
2005
  "Select .csv or .json file containing the stimulus data",
2006
  accept_multiple_files=False,
2007
  key="single_csv_file_stim_uploaded",
2008
+ type={"json", "csv", "txt", "dat","jpeg","png"},
2009
+ help="Drag and drop or select a single .json, .csv, .txt, .dat, jpeg or png file that you wish to process as the stimulus file for the uploaded fixation data. If an image is uploaded OCR will be attempted to extract the character bounding boxes. This can be left blank if you chose to use the examples.",
2010
  )
2011
 
2012
+ st.checkbox(
2013
+ label="Should spaces between words be included in word bounding box?",
2014
+ value=get_default_val("close_gap_between_words_csv", True),
2015
+ key="close_gap_between_words_single_csv",
2016
+ help="If this is selected, each word bounding box will include half the spaces between adjacent words. If not, the word bounding boxes will simply be the combined bounding boxes of the letters making up the word.", # TODO check if this affects analysis
2017
+ )
2018
+ st.checkbox(
2019
+ label="Should spaces between lines be included in word and character bounding boxes?",
2020
+ value=get_default_val("close_gap_between_lines_single_csv", True),
2021
+ key="close_gap_between_lines_single_csv",
2022
+ help="If this is selected, each word and char bounding box will include half the spaces between adjacent lines.", # TODO check if this affects analysis
2023
+ )
2024
  use_example_or_uploaded_file_choice = st.radio(
2025
  "Should the uploaded files be used or some example files?",
2026
  index=1,
 
2060
  trial = json.loads(decoded_input)
2061
  st.session_state["stimdf_single_csv"] = trial
2062
  colnames_stim = list(st.session_state["stimdf_single_csv"].keys())
2063
+ elif any([".png" in single_csv_stim_file.name, ".jpeg" in single_csv_stim_file.name]):
2064
+ stimdf_single_csv = recognize_text(single_csv_stim_file)
2065
+ stimdf_single_csv.to_csv(RESULTS_FOLDER / f"{single_csv_stim_file.name}_stimdf_single_from_OCR.csv")
2066
+ st.session_state["stimdf_single_csv"] = stimdf_single_csv
2067
+ colnames_stim = st.session_state["stimdf_single_csv"].columns
2068
  else:
2069
  st.session_state["stimdf_single_csv"] = load_csv_delim_agnostic(single_csv_stim_file)
2070
  colnames_stim = st.session_state["stimdf_single_csv"].columns
 
2448
  use_corrected_fixations=True,
2449
  correction_algo=st.session_state["algo_choice_custom_eyekit"],
2450
  save_to_csv=True,
2451
+ measures_to_calculate = ALL_MEASURES_OWN
2452
  )
2453
  st.dataframe(own_word_measures, use_container_width=True, hide_index=True, height=200)
2454
  own_word_measures_csv = convert_df(own_word_measures)
 
3098
  use_corrected_fixations=True,
3099
  correction_algo=st.session_state["algo_choice_multi_asc_eyekit"],
3100
  save_to_csv=True,
3101
+ measures_to_calculate = ALL_MEASURES_OWN
3102
  )
3103
  if "sentence_measures_multi_asc" in st.session_state:
3104
  sent_measures_multi = st.session_state["sentence_measures_multi_asc"]
 
3262
  key=f"close_gap_between_words{suffix}",
3263
  help="If this is selected, each word bounding box will include half the spaces between adjacent words. If not, the word bounding boxes will simply be the combined bounding boxes of the letters making up the word.", # TODO check if this affects analysis
3264
  )
3265
+ st.checkbox(
3266
+ label="Should spaces between lines be included in word and character bounding boxes?",
3267
+ value=get_default_val(f"close_gap_between_lines{suffix}", True),
3268
+ key=f"close_gap_between_lines{suffix}",
3269
+ help="If this is selected, each word and char bounding box will include half the spaces between adjacent lines.", # TODO check if this affects analysis
3270
+ )
3271
  st.markdown("### Trial filtering settings")
3272
 
3273
  st.checkbox(
create_interest_areas_from_image.py ADDED
@@ -0,0 +1,237 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from PIL import Image, ImageDraw
2
+ import pandas as pd
3
+ import io
4
+ import csv
5
+ import os
6
+
7
+ if os.environ.get('TESSDATA_PREFIX') is None and os.name == 'nt':
8
+ os.environ['TESSDATA_PREFIX'] = 'C:/Program Files/Tesseract-OCR/tessdata/'
9
+ tessdata_prefix = 'C:/Program Files/Tesseract-OCR/tessdata/'
10
+ if os.environ.get('TESSDATA_PREFIX') is None and os.name != 'nt':
11
+ os.environ['TESSDATA_PREFIX'] = '/usr/share/tesseract-ocr/4.00/tessdata'
12
+ tessdata_prefix = '/usr/share/tesseract-ocr/4.00/tessdata'
13
+
14
+ import pytesseract
15
+ if os.name == 'nt':
16
+ pytesseract.pytesseract.tesseract_cmd = r'c:/Program Files/Tesseract-OCR/tesseract.exe'
17
+ else:
18
+ pytesseract.pytesseract.tesseract_cmd =r'/usr/bin/tesseract'
19
+
20
+ def recognize_text(image_path, tesseract_config='--psm 6 -l spa'):
21
+ """
22
+ Performs OCR on an image and returns a DataFrame with character bounding boxes
23
+ and associated information.
24
+
25
+ Args:
26
+ image_path: Path to the image file.
27
+ tesseract_config: Configuration string for pytesseract (e.g., '--psm 6 -l spa').
28
+
29
+ Returns:
30
+ pandas.DataFrame: DataFrame containing character-level data (df_word_chars).
31
+ """
32
+ # if os.environ['TESSDATA_PREFIX'] is not None:
33
+ # tesseract_config = f'--tessdata-dir "{tessdata_prefix}"' + tesseract_config
34
+ image = Image.open(image_path).convert('RGB')
35
+ if hasattr(image_path,'name'):
36
+ im_name = image_path.name
37
+ else:
38
+ im_name = image_path
39
+ image_height = image.height
40
+
41
+ # Extract filename for trial_id
42
+ trial_id = os.path.splitext(os.path.basename(im_name))[0]
43
+
44
+ # Use pytesseract to extract data for words and characters
45
+ data_words = pytesseract.image_to_data(image, config=tesseract_config)
46
+ data_chars = pytesseract.image_to_boxes(image, config=tesseract_config)
47
+
48
+ df_words = pd.read_csv(io.StringIO(data_words), sep='\t', quoting=csv.QUOTE_NONE)
49
+ df_chars = pd.read_csv(io.StringIO(data_chars), sep=' ', header=None, names=['char', 'left', 'top', 'right', 'bottom', 'unknown'])
50
+
51
+ # Fix character coordinates
52
+ for index, row in df_chars.iterrows():
53
+ original_top = int(row['top'])
54
+ original_bottom = int(row['bottom'])
55
+ df_chars.at[index, 'top'] = image_height - original_bottom
56
+ df_chars.at[index, 'bottom'] = image_height - original_top
57
+
58
+ # Create DataFrame to store spaces
59
+ df_spaces = pd.DataFrame(columns=['level', 'page_num', 'block_num', 'par_num', 'line_num', 'word_num', 'left', 'top', 'width', 'height', 'conf', 'text'])
60
+
61
+ # Group words by line, block, and paragraph
62
+ grouped_lines = df_words.groupby(['block_num', 'par_num', 'line_num'])
63
+
64
+ for (block_num, par_num, line_num), line_words_df in grouped_lines:
65
+ sorted_words = line_words_df.sort_values(by='left')
66
+ previous_word = None
67
+ for index, current_word in sorted_words.iterrows():
68
+ if previous_word is not None:
69
+ space_left = int(previous_word['left']) + int(previous_word['width'])
70
+ space_width = int(current_word['left']) - space_left
71
+ if space_width > 0:
72
+ space_top = int(previous_word['top'])
73
+ space_height = int(previous_word['height'])
74
+ space_data = {
75
+ 'level': 5,
76
+ 'page_num': int(current_word['page_num']),
77
+ 'block_num': int(current_word['block_num']),
78
+ 'par_num': int(current_word['par_num']),
79
+ 'line_num': int(current_word['line_num']),
80
+ 'word_num': int(previous_word['word_num']),
81
+ 'left': space_left,
82
+ 'top': space_top,
83
+ 'width': space_width,
84
+ 'height': space_height,
85
+ 'conf': 0,
86
+ 'text': ' '
87
+ }
88
+ df_spaces = pd.concat([df_spaces, pd.DataFrame(space_data, index=[0])], ignore_index=True)
89
+ previous_word = current_word
90
+
91
+ # Create DataFrame for characters within words (and spaces)
92
+ df_word_chars = pd.DataFrame(columns=['char', 'char_xmin', 'char_ymin', 'char_xmax', 'char_ymax',
93
+ 'block', 'paragraph', 'line_number',
94
+ 'word_nr', 'letter_nr', 'word',
95
+ 'char_x_center', 'char_y_center', 'assigned_line', 'trial_id'])
96
+
97
+
98
+ for index_word, row_word in df_words.iterrows():
99
+ if isinstance(row_word['text'], str) and row_word['text'].strip() and row_word['level'] == 5:
100
+ word_left = int(row_word['left'])
101
+ word_top = int(row_word['top'])
102
+ word_width = int(row_word['width'])
103
+ word_height = int(row_word['height'])
104
+ word_right = word_left + word_width
105
+ word_bottom = word_top + word_height
106
+ word_text = row_word['text']
107
+
108
+ char_index_in_word = 0
109
+ relevant_chars = df_chars[
110
+ (df_chars['left'] >= word_left) & (df_chars['right'] <= word_right) &
111
+ (df_chars['top'] >= word_top) & (df_chars['bottom'] <= word_bottom)
112
+ ]
113
+ relevant_chars = relevant_chars.sort_values(by='left')
114
+ previous_char_right = word_left
115
+
116
+ for index_char, row_char in relevant_chars.iterrows():
117
+ char_text = row_char['char']
118
+ char_left = previous_char_right
119
+ char_right = int(row_char['right'])
120
+ char_right = min(char_right, word_right)
121
+ if char_left > char_right:
122
+ char_right = int(row_char['right'])
123
+ char_top = word_top
124
+ char_bottom = word_bottom
125
+
126
+ char_data = {
127
+ 'char': char_text,
128
+ 'char_xmin': int(round(char_left)), # Round and convert to int
129
+ 'char_ymin': int(round(char_top)), # Round and convert to int
130
+ 'char_xmax': int(round(char_right)), # Round and convert to int
131
+ 'char_ymax': int(round(char_bottom)), # Round and convert to int
132
+ 'block': int(row_word['block_num']),
133
+ 'paragraph': int(row_word['par_num']),
134
+ 'line_number': int(row_word['line_num']),
135
+ 'word_nr': int(row_word['word_num']),
136
+ 'letter_nr': int(char_index_in_word), #already an int
137
+ 'word': word_text,
138
+ 'char_x_center': int(round((char_left + char_right) / 2)), # Round and convert
139
+ 'char_y_center': int(round((char_top + char_bottom) / 2)), # Round and convert
140
+ 'assigned_line': None,
141
+ 'trial_id': trial_id
142
+ }
143
+ df_word_chars = pd.concat([df_word_chars, pd.DataFrame(char_data, index=[0])], ignore_index=True)
144
+ char_index_in_word += 1
145
+ previous_char_right = char_right
146
+
147
+ spaces_following_word = df_spaces[
148
+ (df_spaces['word_num'] == int(row_word['word_num'])) &
149
+ (df_spaces['line_num'] == int(row_word['line_num'])) &
150
+ (df_spaces['block_num'] == int(row_word['block_num'])) &
151
+ (df_spaces['par_num'] == int(row_word['par_num']))
152
+ ]
153
+
154
+ for index_space, row_space in spaces_following_word.iterrows():
155
+ space_data = {
156
+ 'char': ' ',
157
+ 'char_xmin': int(round(row_space['left'])), # Round and convert
158
+ 'char_ymin': int(round(row_space['top'])), # Round and convert
159
+ 'char_xmax': int(round(row_space['left'] + row_space['width'])), # Round and convert
160
+ 'char_ymax': int(round(row_space['top'] + row_space['height'])), # Round and convert
161
+ 'block': int(row_space['block_num']),
162
+ 'paragraph': int(row_space['par_num']),
163
+ 'line_number': int(row_space['line_num']),
164
+ 'word_nr': int(row_space['word_num']),
165
+ 'letter_nr': int(char_index_in_word), # Already int
166
+ 'word': word_text,
167
+ 'char_x_center': int(round((row_space['left'] + row_space['left'] + row_space['width']) / 2)), # Round
168
+ 'char_y_center': int(round((row_space['top'] + row_space['top'] + row_space['height']) / 2)), # Round
169
+ 'assigned_line': None,
170
+ 'trial_id': trial_id
171
+ }
172
+ df_word_chars = pd.concat([df_word_chars, pd.DataFrame(space_data, index=[0])], ignore_index=True)
173
+ char_index_in_word += 1
174
+
175
+ # Create 'assigned_line' column
176
+ df_word_chars['assigned_line'] = 0
177
+ line_counter = 1
178
+ for block_num in sorted(df_word_chars['block'].unique()):
179
+ for par_num in sorted(df_word_chars.loc[df_word_chars['block'] == block_num, 'paragraph'].unique()):
180
+ for line_num in sorted(df_word_chars.loc[(df_word_chars['block'] == block_num) & (df_word_chars['paragraph'] == par_num), 'line_number'].unique()):
181
+ line_mask = (df_word_chars['line_number'] == line_num) & (df_word_chars['paragraph'] == par_num) & (df_word_chars['block'] == block_num)
182
+ df_word_chars.loc[line_mask, 'assigned_line'] = line_counter
183
+ line_counter += 1
184
+
185
+ # Adjust Y_Start, Y_End, and char_y_center, converting to integers
186
+ for assigned_line in df_word_chars['assigned_line'].unique():
187
+ line_mask = (df_word_chars['assigned_line'] == assigned_line)
188
+ min_top = df_word_chars.loc[line_mask, 'char_ymin'].min()
189
+ max_bottom = df_word_chars.loc[line_mask, 'char_ymax'].max()
190
+ new_y_center = (min_top + max_bottom) / 2
191
+ df_word_chars.loc[line_mask, 'char_ymin'] = int(round(min_top)) # Round and convert
192
+ df_word_chars.loc[line_mask, 'char_ymax'] = int(round(max_bottom)) # Round and convert
193
+ df_word_chars.loc[line_mask, 'char_y_center'] = int(round(new_y_center)) # Round and convert
194
+
195
+ # Convert relevant columns to integers
196
+ int_columns = ['char_xmin', 'char_ymin', 'char_xmax', 'char_ymax', 'block', 'paragraph',
197
+ 'line_number', 'word_nr', 'letter_nr', 'char_x_center', 'char_y_center', 'assigned_line']
198
+ for col in int_columns:
199
+ df_word_chars[col] = df_word_chars[col].astype(int)
200
+
201
+ return df_word_chars
202
+
203
+
204
+ def draw_char_boxes(image_path, df_word_chars, output_path='output_boxes_combined.png'):
205
+ """
206
+ Draws bounding boxes around characters on the image.
207
+
208
+ Args:
209
+ image_path: Path to the image file.
210
+ df_word_chars: DataFrame containing character bounding box data.
211
+ output_path: Path to save the image with bounding boxes. Defaults to 'output_boxes_combined.png'.
212
+ """
213
+ image = Image.open(image_path).convert('RGB')
214
+ draw = ImageDraw.Draw(image)
215
+
216
+ # Draw bounding boxes for characters (purple)
217
+ for index, row in df_word_chars.iterrows():
218
+ left = int(row['char_xmin'])
219
+ top = int(row['char_ymin'])
220
+ right = int(row['char_xmax'])
221
+ bottom = int(row['char_ymax'])
222
+ draw.rectangle([(left, top), (right, bottom)], outline='purple', width=1)
223
+
224
+ # Display or save the image
225
+ image.save(output_path)
226
+
227
+
228
+ # Example usage
229
+ if __name__ == '__main__':
230
+ # image_path = 'testfiles/testim_ocr.png'
231
+ image_path = 'testfiles/newplot.png'
232
+ # Example with default tesseract config
233
+ df_chars = recognize_text(image_path)
234
+ draw_char_boxes(image_path, df_chars)
235
+ df_chars.to_csv('testim_ocr_df_word_chars_test.csv', index=False)
236
+ print("\nDataFrame of Characters within Words (df_word_chars) - Default Config:")
237
+ print(df_chars)
eyekit_measures.py CHANGED
@@ -17,7 +17,6 @@ MEASURES_DICT = {
17
  "second_pass_duration": [],
18
  "initial_landing_position": [],
19
  "initial_landing_distance": [],
20
- "landing_distances": [],
21
  "number_of_regressions_in": [],
22
  }
23
 
 
17
  "second_pass_duration": [],
18
  "initial_landing_position": [],
19
  "initial_landing_distance": [],
 
20
  "number_of_regressions_in": [],
21
  }
22
 
multi_proc_funcs.py CHANGED
@@ -16,6 +16,7 @@ from matplotlib.font_manager import FontProperties
16
  from matplotlib.patches import Rectangle
17
  from tqdm.auto import tqdm
18
  import torch as t
 
19
  import plotly.express as px
20
  import copy
21
 
@@ -1992,8 +1993,12 @@ def add_popEye_cols_to_dffix(dffix, algo_choice, chars_df, trial, xcol, cols_to_
1992
  ].reset_index()
1993
  selected_stimmat.loc[:, "letword"] = selected_stimmat.groupby("in_word_number")["letternum"].rank()
1994
  letters_on_line = selected_stimmat.shape[0]
1995
- out = dffix.loc[i, xcol] - selected_stimmat["char_x_center"]
1996
- min_idx = out.abs().idxmin()
 
 
 
 
1997
  dffix.loc[i, f"letternum_{algo_choice}"] = selected_stimmat.loc[min_idx, "letternum"]
1998
  dffix.loc[i, f"letter_{algo_choice}"] = selected_stimmat.loc[min_idx, "char"]
1999
  dffix.loc[i, f"line_let_{algo_choice}"] = selected_stimmat.loc[min_idx, "letline"]
 
16
  from matplotlib.patches import Rectangle
17
  from tqdm.auto import tqdm
18
  import torch as t
19
+ t.classes.__path__ = [] # https://discuss.streamlit.io/t/error-in-torch-with-streamlit/90908/3
20
  import plotly.express as px
21
  import copy
22
 
 
1993
  ].reset_index()
1994
  selected_stimmat.loc[:, "letword"] = selected_stimmat.groupby("in_word_number")["letternum"].rank()
1995
  letters_on_line = selected_stimmat.shape[0]
1996
+ if len(selected_stimmat["char_x_center"])>0:
1997
+ out = dffix.loc[i, xcol] - selected_stimmat["char_x_center"]
1998
+ min_idx = out.abs().idxmin()
1999
+ # ic(selected_stimmat)
2000
+ else:
2001
+ min_idx = 0
2002
  dffix.loc[i, f"letternum_{algo_choice}"] = selected_stimmat.loc[min_idx, "letternum"]
2003
  dffix.loc[i, f"letter_{algo_choice}"] = selected_stimmat.loc[min_idx, "char"]
2004
  dffix.loc[i, f"line_let_{algo_choice}"] = selected_stimmat.loc[min_idx, "letline"]
requirements.txt CHANGED
@@ -22,4 +22,5 @@ pycairo
22
  eyekit
23
  stqdm
24
  jellyfish
25
- icecream
 
 
22
  eyekit
23
  stqdm
24
  jellyfish
25
+ icecream
26
+ pytesseract
utils.py CHANGED
@@ -1,6 +1,7 @@
1
  import pickle
2
  from io import StringIO
3
  import re
 
4
  import zipfile
5
  import os
6
  import plotly.graph_objects as go
@@ -17,6 +18,8 @@ from tqdm.auto import tqdm
17
  import time
18
  import requests
19
  from icecream import ic
 
 
20
  from matplotlib import font_manager
21
  from multi_proc_funcs import (
22
  COLORS,
@@ -99,13 +102,14 @@ def download_url(url, target_filename):
99
 
100
 
101
  def asc_to_trial_ids(
102
- asc_file, close_gap_between_words, paragraph_trials_only, ias_files, trial_start_keyword, end_trial_at_keyword
103
  ):
104
  asc_encoding = ["ISO-8859-15", "UTF-8"][0]
105
  trials_dict, lines = file_to_trials_and_lines(
106
  asc_file,
107
  asc_encoding,
108
  close_gap_between_words=close_gap_between_words,
 
109
  paragraph_trials_only=paragraph_trials_only,
110
  uploaded_ias_files=ias_files,
111
  trial_start_keyword=trial_start_keyword,
@@ -122,7 +126,7 @@ def asc_to_trial_ids(
122
 
123
 
124
  def get_trials_list(
125
- asc_file, close_gap_between_words, paragraph_trials_only, ias_files, trial_start_keyword, end_trial_at_keyword
126
  ):
127
  if hasattr(asc_file, "name"):
128
  savename = pl.Path(asc_file.name).stem
@@ -132,6 +136,7 @@ def get_trials_list(
132
  trials_by_ids, lines, trials_dict = asc_to_trial_ids(
133
  asc_file,
134
  close_gap_between_words=close_gap_between_words,
 
135
  paragraph_trials_only=paragraph_trials_only,
136
  ias_files=ias_files,
137
  trial_start_keyword=trial_start_keyword,
@@ -238,9 +243,9 @@ def add_words(chars_list):
238
  word_ymin = chars_list_reconstructed[word_start_idx]["char_ymin"]
239
  word_ymax = chars_list_reconstructed[word_start_idx]["char_ymax"]
240
  word_x_center = round((word_xmax - word_xmin) / 2 + word_xmin, ndigits=2)
241
- word_y_center = round((word_ymax - word_ymin) / 2 + word_ymin, ndigits=2)
242
  word_length = len(word)
243
- assigned_line = chars_list_reconstructed[word_start_idx]["assigned_line"]
244
  word_dict = dict(
245
  word_number=len(words_list),
246
  word=word,
@@ -473,6 +478,7 @@ def asc_lines_to_trials_by_trail_id(
473
  paragraph_trials_only=True,
474
  filename: str = "",
475
  close_gap_between_words=True,
 
476
  ias_files=[],
477
  start_trial_at_keyword="START",
478
  end_trial_at_keyword="END",
@@ -884,21 +890,189 @@ def asc_lines_to_trials_by_trail_id(
884
  words_list = words_list_from_func
885
 
886
  if close_gap_between_words: # TODO this may need to change the "in_word" col for the chars_df
887
- for widx in range(1, len(words_list)):
888
- if words_list[widx]["assigned_line"] == words_list[widx - 1]["assigned_line"]:
889
- word_sep_half_width = (words_list[widx]["word_xmin"] - words_list[widx - 1]["word_xmax"]) / 2
890
- words_list[widx - 1]["word_xmax"] = words_list[widx - 1]["word_xmax"] + word_sep_half_width
891
- words_list[widx]["word_xmin"] = words_list[widx]["word_xmin"] - word_sep_half_width
892
  else:
893
  chars_df = pd.DataFrame(chars_list_reconstructed)
894
  chars_df.loc[
895
  chars_df["char"] == " ", ["in_word", "in_word_number", "num_letters_from_start_of_word"]
896
  ] = pd.NA
897
  chars_list_reconstructed = chars_df.to_dict("records")
 
 
 
898
  trials_dict[trial_idx]["words_list"] = words_list
899
  trials_dict[trial_idx]["chars_list"] = chars_list_reconstructed
900
  return trials_dict
901
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
902
 
903
  def get_lines_from_file(uploaded_file, asc_encoding="ISO-8859-15"):
904
  if isinstance(uploaded_file, str) or isinstance(uploaded_file, pl.Path):
@@ -915,6 +1089,7 @@ def file_to_trials_and_lines(
915
  uploaded_file,
916
  asc_encoding: str = "ISO-8859-15",
917
  close_gap_between_words=True,
 
918
  paragraph_trials_only=True,
919
  uploaded_ias_files=[],
920
  trial_start_keyword="START",
@@ -926,6 +1101,7 @@ def file_to_trials_and_lines(
926
  paragraph_trials_only,
927
  uploaded_file,
928
  close_gap_between_words=close_gap_between_words,
 
929
  ias_files=uploaded_ias_files,
930
  start_trial_at_keyword=trial_start_keyword,
931
  end_trial_at_keyword=end_trial_at_keyword,
@@ -1007,7 +1183,7 @@ def plotly_plot_with_image(
1007
 
1008
  if lines_in_plot == "Both":
1009
  uncorrected_plot_mode = "markers+lines+text"
1010
- corrected_plot_mode = "markers+lines+text"
1011
 
1012
  fig = go.Figure()
1013
  fig.add_trace(
 
1
  import pickle
2
  from io import StringIO
3
  import re
4
+ from typing import Dict, List
5
  import zipfile
6
  import os
7
  import plotly.graph_objects as go
 
18
  import time
19
  import requests
20
  from icecream import ic
21
+ import collections
22
+ import statistics
23
  from matplotlib import font_manager
24
  from multi_proc_funcs import (
25
  COLORS,
 
102
 
103
 
104
  def asc_to_trial_ids(
105
+ asc_file, close_gap_between_words,close_gap_between_lines, paragraph_trials_only, ias_files, trial_start_keyword, end_trial_at_keyword
106
  ):
107
  asc_encoding = ["ISO-8859-15", "UTF-8"][0]
108
  trials_dict, lines = file_to_trials_and_lines(
109
  asc_file,
110
  asc_encoding,
111
  close_gap_between_words=close_gap_between_words,
112
+ close_gap_between_lines=close_gap_between_lines,
113
  paragraph_trials_only=paragraph_trials_only,
114
  uploaded_ias_files=ias_files,
115
  trial_start_keyword=trial_start_keyword,
 
126
 
127
 
128
  def get_trials_list(
129
+ asc_file, close_gap_between_words,close_gap_between_lines, paragraph_trials_only, ias_files, trial_start_keyword, end_trial_at_keyword
130
  ):
131
  if hasattr(asc_file, "name"):
132
  savename = pl.Path(asc_file.name).stem
 
136
  trials_by_ids, lines, trials_dict = asc_to_trial_ids(
137
  asc_file,
138
  close_gap_between_words=close_gap_between_words,
139
+ close_gap_between_lines=close_gap_between_lines,
140
  paragraph_trials_only=paragraph_trials_only,
141
  ias_files=ias_files,
142
  trial_start_keyword=trial_start_keyword,
 
243
  word_ymin = chars_list_reconstructed[word_start_idx]["char_ymin"]
244
  word_ymax = chars_list_reconstructed[word_start_idx]["char_ymax"]
245
  word_x_center = round((word_xmax - word_xmin) / 2 + word_xmin, ndigits=2)
246
+ word_y_center = chars_list_reconstructed[word_start_idx]["char_y_center"]
247
  word_length = len(word)
248
+ assigned_line = int(chars_list_reconstructed[word_start_idx]["assigned_line"])
249
  word_dict = dict(
250
  word_number=len(words_list),
251
  word=word,
 
478
  paragraph_trials_only=True,
479
  filename: str = "",
480
  close_gap_between_words=True,
481
+ close_gap_between_lines=True,
482
  ias_files=[],
483
  start_trial_at_keyword="START",
484
  end_trial_at_keyword="END",
 
890
  words_list = words_list_from_func
891
 
892
  if close_gap_between_words: # TODO this may need to change the "in_word" col for the chars_df
893
+ words_list = close_gaps_in_words_list(words_list)
 
 
 
 
894
  else:
895
  chars_df = pd.DataFrame(chars_list_reconstructed)
896
  chars_df.loc[
897
  chars_df["char"] == " ", ["in_word", "in_word_number", "num_letters_from_start_of_word"]
898
  ] = pd.NA
899
  chars_list_reconstructed = chars_df.to_dict("records")
900
+ if close_gap_between_lines:
901
+ chars_list_reconstructed = close_gaps_between_lines(chars_list_reconstructed,prefix='char')
902
+ words_list = close_gaps_between_lines(words_list,prefix='word')
903
  trials_dict[trial_idx]["words_list"] = words_list
904
  trials_dict[trial_idx]["chars_list"] = chars_list_reconstructed
905
  return trials_dict
906
 
907
+ def close_gaps_between_lines(data, prefix):
908
+ """
909
+ Adjusts {prefix}_ymin and {prefix}_ymax for lines in a list of dictionaries based on average y-centers.
910
+
911
+ Args:
912
+ data: A list of dictionaries, where each dictionary must have
913
+ 'assigned_line', '{prefix}_ymin', and '{prefix}_ymax' keys.
+ prefix: Key prefix selecting which coordinate keys to adjust, e.g. 'word' or 'char'.
914
+
915
+ Returns:
916
+ A new list of dictionaries with adjusted '{prefix}_ymin' and '{prefix}_ymax' values.
917
+ Returns an empty list if the input is empty.
918
+ Returns the original list if there's only one unique line number.
919
+ """
920
+ if not data:
921
+ return []
922
+
923
+ # --- Step 1: Calculate ycenter and group by assigned_line ---
924
+ line_centers = collections.defaultdict(list)
925
+ # Keep track of original min/max for single line case plotting
926
+ original_coords = collections.defaultdict(
927
+ lambda: {f"{prefix}_ymin": float("inf"), f"{prefix}_ymax": float("-inf")}
928
+ )
929
+
930
+ for item in data:
931
+ if f"{prefix}_ymin" in item and f"{prefix}_ymax" in item:
932
+ ycenter = (item[f"{prefix}_ymin"] + item[f"{prefix}_ymax"]) / 2
933
+ line_num = item["assigned_line"]
934
+ line_centers[line_num].append(ycenter)
935
+ # Track overall min/max for original data per line
936
+ original_coords[line_num][f"{prefix}_ymin"] = min(
937
+ original_coords[line_num][f"{prefix}_ymin"], item[f"{prefix}_ymin"]
938
+ )
939
+ original_coords[line_num][f"{prefix}_ymax"] = max(
940
+ original_coords[line_num][f"{prefix}_ymax"], item[f"{prefix}_ymax"]
941
+ )
942
+
943
+ # --- Step 2: Calculate average ycenter for each assigned_line ---
944
+ avg_centers = {}
945
+ for line_num, centers in line_centers.items():
946
+ if centers: # Avoid division by zero if an assigned_line had no valid entries
947
+ avg_centers[line_num] = statistics.mean(centers)
948
+
949
+ # Handle case with 0 or 1 unique line numbers - no adjustments needed/possible
950
+ if len(avg_centers) <= 1:
951
+ print(
952
+ "Only one unique line number found or no valid lines. No adjustments made."
953
+ )
954
+ # No adjustments are needed when there is at most one line, so the
955
+ # original list object is returned unchanged (note: no copy is made).
956
+ return data
957
+
958
+ # --- Step 3: Sort line numbers based on average ycenter ---
959
+ # Creates a list of tuples: (assigned_line, avg_ycenter) sorted by avg_ycenter
960
+ sorted_lines = sorted(avg_centers.items(), key=lambda item: item[1])
961
+ # Extract sorted line numbers and their average centers
962
+ sorted_line_nums = [item[0] for item in sorted_lines]
963
+ sorted_avg_centers = [item[1] for item in sorted_lines]
964
+
965
+ # --- Step 4: Calculate boundaries ---
966
+ num_lines = len(sorted_avg_centers)
967
+ boundaries = {} # Store boundaries between line i and line i+1
968
+
969
+ # Calculate boundaries between adjacent lines
970
+ for i in range(num_lines - 1):
971
+ midpoint = (sorted_avg_centers[i] + sorted_avg_centers[i + 1]) / 2
972
+ boundaries[i] = midpoint
973
+
974
+ # --- Step 5: Determine new word_ymin and word_ymax for each assigned_line ---
975
+ new_coords = {} # Stores {assigned_line: {'word_ymin': new_ymin, 'word_ymax': new_ymax}}
976
+
977
+ # Handle the first line
978
+ first_line_num = sorted_line_nums[0]
979
+ # Estimate boundary before the first line by extrapolating
980
+ # Use max(0, ...) to prevent negative word_ymin if lines are very close to 0
981
+ # Ensure extrapolation doesn't create negative boundary if first line is near 0
982
+ extrapolated_start_boundary = max(
983
+ 0, sorted_avg_centers[0] - (sorted_avg_centers[1] - sorted_avg_centers[0]) / 2
984
+ )
985
+ # The new word_ymin should start 1 pixel *after* the rounded boundary
986
+ # The boundary itself is the dividing line.
987
+ new_ymin_first = round(extrapolated_start_boundary) + 1
988
+ new_ymax_first = round(boundaries[0])
989
+ # Ensure word_ymin is not greater than word_ymax, adjust if necessary
990
+ if new_ymin_first > new_ymax_first:
991
+ print(
992
+ f"Warning: Calculated word_ymin ({new_ymin_first}) > word_ymax ({new_ymax_first}) for first line ({first_line_num}). Adjusting word_ymin."
993
+ )
994
+ new_ymin_first = new_ymax_first # Set word_ymin = word_ymax, resulting in a height of 0
995
+ new_coords[first_line_num] = {f"{prefix}_ymin": new_ymin_first, f"{prefix}_ymax": new_ymax_first}
996
+
997
+ # Handle intermediate lines
998
+ for i in range(1, num_lines - 1):
999
+ line_num = sorted_line_nums[i]
1000
+ # word_ymin starts 1 pixel after the previous boundary
1001
+ new_ymin = round(boundaries[i - 1]) + 1
1002
+ # word_ymax is at the current boundary
1003
+ new_ymax = round(boundaries[i])
1004
+ # Ensure word_ymin is not greater than word_ymax
1005
+ if new_ymin > new_ymax:
1006
+ print(
1007
+ f"Warning: Calculated word_ymin ({new_ymin}) > word_ymax ({new_ymax}) for intermediate line ({line_num}). Adjusting word_ymin."
1008
+ )
1009
+ new_ymin = new_ymax # Adjust word_ymin to be equal to word_ymax
1010
+ new_coords[line_num] = {f"{prefix}_ymin": new_ymin, f"{prefix}_ymax": new_ymax}
1011
+
1012
+ # Handle the last line
1013
+ last_line_num = sorted_line_nums[-1]
1014
+ # Estimate boundary after the last line by extrapolating
1015
+ extrapolated_end_boundary = (
1016
+ sorted_avg_centers[-1] + (sorted_avg_centers[-1] - sorted_avg_centers[-2]) / 2
1017
+ )
1018
+ # word_ymin starts 1 pixel after the previous boundary
1019
+ new_ymin_last = round(boundaries[num_lines - 2]) + 1
1020
+ # word_ymax is at the extrapolated end boundary
1021
+ new_ymax_last = round(extrapolated_end_boundary)
1022
+ # Ensure word_ymin is not greater than word_ymax
1023
+ if new_ymin_last > new_ymax_last:
1024
+ print(
1025
+ f"Warning: Calculated word_ymin ({new_ymin_last}) > word_ymax ({new_ymax_last}) for last line ({last_line_num}). Adjusting word_ymax."
1026
+ )
1027
+ new_ymax_last = new_ymin_last # Adjust word_ymax to be equal to word_ymin
1028
+ new_coords[last_line_num] = {f"{prefix}_ymin": new_ymin_last, f"{prefix}_ymax": new_ymax_last}
1029
+
1030
+ # --- Step 6: Update the original data structure ---
1031
+ # Create a new list to store results, preserving other keys
1032
+ adjusted_data = []
1033
+ for item in data:
1034
+ new_item = (
1035
+ item.copy()
1036
+ ) # Create a copy to avoid modifying original dicts directly if they are reused
1037
+ line_num = new_item.get("assigned_line")
1038
+ if line_num in new_coords:
1039
+ new_item[f"{prefix}_ymin"] = new_coords[line_num][f"{prefix}_ymin"]
1040
+ new_item[f"{prefix}_ymax"] = new_coords[line_num][f"{prefix}_ymax"]
1041
+ adjusted_data.append(new_item)
1042
+
1043
+ return adjusted_data
1044
+
1045
+ def close_gaps_in_words_list(words_list:List[Dict]):
1046
+ """
1047
+ Adjusts the positions of words in a list to close gaps between consecutive words
1048
+ that belong to the same assigned line. The function modifies the input list in place.
1049
+
1050
+ Args:
1051
+ words_list (list of dict): A list of dictionaries where each dictionary represents
1052
+ a word with the following keys:
1053
+ - "assigned_line" (int): The line number to which the word is assigned.
1054
+ - "word_xmin" (float): The minimum x-coordinate of the word's bounding box.
1055
+ - "word_xmax" (float): The maximum x-coordinate of the word's bounding box.
1056
+
1057
+ Behavior:
1058
+ - For each pair of consecutive words in the list that belong to the same line
1059
+ (i.e., have the same "assigned_line"), the function calculates the gap between
1060
+ their bounding boxes.
1061
+ - The gap is split equally between the two words, and their "word_xmin" and
1062
+ "word_xmax" values are adjusted accordingly to close the gap.
1063
+
1064
+ Note:
1065
+ - The input dictionaries are modified in place, and the same list is also returned.
1066
+ - It is assumed that the input list is sorted by "assigned_line" and the x-coordinates
1067
+ of the words.
1068
+
1069
+ """
1070
+ for widx in range(1, len(words_list)):
1071
+ if words_list[widx]["assigned_line"] == words_list[widx - 1]["assigned_line"]:
1072
+ word_sep_half_width = (words_list[widx]["word_xmin"] - words_list[widx - 1]["word_xmax"]) / 2
1073
+ words_list[widx - 1]["word_xmax"] = words_list[widx - 1]["word_xmax"] + word_sep_half_width
1074
+ words_list[widx]["word_xmin"] = words_list[widx]["word_xmin"] - word_sep_half_width
1075
+ return words_list
1076
 
1077
  def get_lines_from_file(uploaded_file, asc_encoding="ISO-8859-15"):
1078
  if isinstance(uploaded_file, str) or isinstance(uploaded_file, pl.Path):
 
1089
  uploaded_file,
1090
  asc_encoding: str = "ISO-8859-15",
1091
  close_gap_between_words=True,
1092
+ close_gap_between_lines=True,
1093
  paragraph_trials_only=True,
1094
  uploaded_ias_files=[],
1095
  trial_start_keyword="START",
 
1101
  paragraph_trials_only,
1102
  uploaded_file,
1103
  close_gap_between_words=close_gap_between_words,
1104
+ close_gap_between_lines=close_gap_between_lines,
1105
  ias_files=uploaded_ias_files,
1106
  start_trial_at_keyword=trial_start_keyword,
1107
  end_trial_at_keyword=end_trial_at_keyword,
 
1183
 
1184
  if lines_in_plot == "Both":
1185
  uncorrected_plot_mode = "markers+lines+text"
1186
+ corrected_plot_mode = "markers+text"
1187
 
1188
  fig = go.Figure()
1189
  fig.add_trace(