synced with dev version. Mostly OCR related
Browse files- app.py +55 -6
- create_interest_areas_from_image.py +237 -0
- eyekit_measures.py +0 -1
- multi_proc_funcs.py +7 -2
- requirements.txt +2 -1
- utils.py +186 -10
app.py
CHANGED
@@ -22,6 +22,11 @@ import zipfile
|
|
22 |
from matplotlib import font_manager
|
23 |
import os
|
24 |
|
|
|
|
|
|
|
|
|
|
|
25 |
from multi_proc_funcs import (
|
26 |
ALL_FIX_MEASURES,
|
27 |
COLORS,
|
@@ -236,7 +241,7 @@ COLNAME_CANDIDATES_CUSTOM_CSV_FIX = {
|
|
236 |
"trial_id_col_name_fix": ["trial_id", "trialid", "trial", "trial_num", "id"],
|
237 |
"subject_col_name_fix": ["subject", "sub", "subid", "sub_id"],
|
238 |
"time_start_col_name_fix": ["start", "start_time", "ts", "t_start", "starttime"],
|
239 |
-
"time_stop_col_name_fix": ["stop", "stop_time", "te", "t_end", "t_stop", "stoptime"],
|
240 |
}
|
241 |
COLNAME_CANDIDATES_CUSTOM_CSV_FIX_DEFAULT = {k: v[0] for k, v in COLNAME_CANDIDATES_CUSTOM_CSV_FIX.items()}
|
242 |
|
@@ -923,6 +928,8 @@ def make_trial_from_stimulus_df(
|
|
923 |
stim_plot_df,
|
924 |
filename,
|
925 |
trial_id,
|
|
|
|
|
926 |
):
|
927 |
chars_list = []
|
928 |
words_list = []
|
@@ -931,6 +938,12 @@ def make_trial_from_stimulus_df(
|
|
931 |
chars_list.append(char_dict)
|
932 |
|
933 |
words_list, chars_list = ut.add_words(chars_list)
|
|
|
|
|
|
|
|
|
|
|
|
|
934 |
letter_width_avg = np.mean([x["char_xmax"] - x["char_xmin"] for x in chars_list if x["char_xmax"] > x["char_xmin"]])
|
935 |
line_heights = [x["char_ymax"] - x["char_ymin"] for x in chars_list]
|
936 |
line_xcoords_all = [x["char_x_center"] for x in chars_list]
|
@@ -1034,12 +1047,15 @@ def get_fixations_file_trials_list(dffix, stimulus):
|
|
1034 |
for trial_id, subdf in stqdm(enum, desc="Creating trials"):
|
1035 |
if isinstance(stimulus, pd.DataFrame):
|
1036 |
stim_df = stimulus[stimulus.trial_id == subdf["trial_id"].iloc[0]]
|
|
|
|
|
1037 |
|
1038 |
stim_df = stim_df.dropna(axis=0, how="all")
|
1039 |
subdf = subdf.dropna(axis=0, how="all")
|
1040 |
stim_df = stim_df.dropna(axis=1, how="all")
|
1041 |
subdf = subdf.dropna(axis=1, how="all")
|
1042 |
if subdf.empty:
|
|
|
1043 |
continue
|
1044 |
subdf = subdf.reset_index(drop=True).copy()
|
1045 |
stim_df = stim_df.reset_index(drop=True).copy()
|
@@ -1048,6 +1064,8 @@ def get_fixations_file_trials_list(dffix, stimulus):
|
|
1048 |
stim_df,
|
1049 |
st.session_state["single_csv_file_stim"].name,
|
1050 |
trial_id,
|
|
|
|
|
1051 |
)
|
1052 |
else:
|
1053 |
if "trial_id" in stimulus.keys() and (
|
@@ -1072,20 +1090,25 @@ def get_fixations_file_trials_list(dffix, stimulus):
|
|
1072 |
|
1073 |
return trials_by_ids, trial_keys
|
1074 |
|
|
|
|
|
|
|
|
|
|
|
1075 |
|
1076 |
def load_csv_delim_agnostic(file_path):
|
1077 |
try:
|
1078 |
df = pd.read_csv(file_path)
|
1079 |
if df.shape[1] > 1:
|
1080 |
-
return df
|
1081 |
else:
|
1082 |
dec_file = get_decoded_input_from_file(file_path)
|
1083 |
df = pd.read_csv(StringIO(dec_file.replace(";", ",").replace("\t", ",")))
|
1084 |
-
return df
|
1085 |
except Exception as e:
|
1086 |
dec_file = get_decoded_input_from_file(file_path)
|
1087 |
df = pd.read_csv(StringIO(dec_file.replace(";", ",").replace("\t", ",")))
|
1088 |
-
return df
|
1089 |
|
1090 |
|
1091 |
def find_col_name_suggestions(cols, candidates_dict):
|
@@ -1374,6 +1397,7 @@ def main():
|
|
1374 |
trial_choices_single_asc, trials_by_ids, lines, asc_file, trials_dict = ut.get_trials_list(
|
1375 |
st.session_state["single_asc_file_asc"],
|
1376 |
close_gap_between_words=st.session_state["close_gap_between_words_single_asc"],
|
|
|
1377 |
paragraph_trials_only=st.session_state["paragraph_trials_only_single_asc"],
|
1378 |
ias_files=st.session_state["single_asc_file_ias_files"],
|
1379 |
trial_start_keyword=trial_start_keyword,
|
@@ -1981,10 +2005,22 @@ def main():
|
|
1981 |
"Select .csv or .json file containing the stimulus data",
|
1982 |
accept_multiple_files=False,
|
1983 |
key="single_csv_file_stim_uploaded",
|
1984 |
-
type={"json", "csv", "txt", "dat"},
|
1985 |
-
help="Drag and drop or select a single .json, .csv, .txt
|
1986 |
)
|
1987 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1988 |
use_example_or_uploaded_file_choice = st.radio(
|
1989 |
"Should the uploaded files be used or some example files?",
|
1990 |
index=1,
|
@@ -2024,6 +2060,11 @@ def main():
|
|
2024 |
trial = json.loads(decoded_input)
|
2025 |
st.session_state["stimdf_single_csv"] = trial
|
2026 |
colnames_stim = list(st.session_state["stimdf_single_csv"].keys())
|
|
|
|
|
|
|
|
|
|
|
2027 |
else:
|
2028 |
st.session_state["stimdf_single_csv"] = load_csv_delim_agnostic(single_csv_stim_file)
|
2029 |
colnames_stim = st.session_state["stimdf_single_csv"].columns
|
@@ -2407,6 +2448,7 @@ def main():
|
|
2407 |
use_corrected_fixations=True,
|
2408 |
correction_algo=st.session_state["algo_choice_custom_eyekit"],
|
2409 |
save_to_csv=True,
|
|
|
2410 |
)
|
2411 |
st.dataframe(own_word_measures, use_container_width=True, hide_index=True, height=200)
|
2412 |
own_word_measures_csv = convert_df(own_word_measures)
|
@@ -3056,6 +3098,7 @@ def main():
|
|
3056 |
use_corrected_fixations=True,
|
3057 |
correction_algo=st.session_state["algo_choice_multi_asc_eyekit"],
|
3058 |
save_to_csv=True,
|
|
|
3059 |
)
|
3060 |
if "sentence_measures_multi_asc" in st.session_state:
|
3061 |
sent_measures_multi = st.session_state["sentence_measures_multi_asc"]
|
@@ -3219,6 +3262,12 @@ def show_file_parsing_settings(suffix: str):
|
|
3219 |
key=f"close_gap_between_words{suffix}",
|
3220 |
help="If this is selected, each word bounding box will include half the spaces between adjacent words. If not, the word bounding boxes will simply be the combined bounding boxes of the letters making up the word.", # TODO check if this affects analysis
|
3221 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
3222 |
st.markdown("### Trial filtering settings")
|
3223 |
|
3224 |
st.checkbox(
|
|
|
22 |
from matplotlib import font_manager
|
23 |
import os
|
24 |
|
25 |
+
try:
|
26 |
+
from create_interest_areas_from_image import recognize_text
|
27 |
+
except Exception as e:
|
28 |
+
print(e)
|
29 |
+
|
30 |
from multi_proc_funcs import (
|
31 |
ALL_FIX_MEASURES,
|
32 |
COLORS,
|
|
|
241 |
"trial_id_col_name_fix": ["trial_id", "trialid", "trial", "trial_num", "id"],
|
242 |
"subject_col_name_fix": ["subject", "sub", "subid", "sub_id"],
|
243 |
"time_start_col_name_fix": ["start", "start_time", "ts", "t_start", "starttime"],
|
244 |
+
"time_stop_col_name_fix": ["end","stop", "stop_time", "te", "t_end", "t_stop", "stoptime"],
|
245 |
}
|
246 |
COLNAME_CANDIDATES_CUSTOM_CSV_FIX_DEFAULT = {k: v[0] for k, v in COLNAME_CANDIDATES_CUSTOM_CSV_FIX.items()}
|
247 |
|
|
|
928 |
stim_plot_df,
|
929 |
filename,
|
930 |
trial_id,
|
931 |
+
close_gaps_between_words:bool,
|
932 |
+
close_gaps_between_lines:bool,
|
933 |
):
|
934 |
chars_list = []
|
935 |
words_list = []
|
|
|
938 |
chars_list.append(char_dict)
|
939 |
|
940 |
words_list, chars_list = ut.add_words(chars_list)
|
941 |
+
if close_gaps_between_words:
|
942 |
+
words_list = ut.close_gaps_in_words_list(words_list)
|
943 |
+
if close_gaps_between_lines:
|
944 |
+
chars_list = ut.close_gaps_between_lines(chars_list,prefix='char')
|
945 |
+
words_list = ut.close_gaps_between_lines(words_list,prefix='word')
|
946 |
+
|
947 |
letter_width_avg = np.mean([x["char_xmax"] - x["char_xmin"] for x in chars_list if x["char_xmax"] > x["char_xmin"]])
|
948 |
line_heights = [x["char_ymax"] - x["char_ymin"] for x in chars_list]
|
949 |
line_xcoords_all = [x["char_x_center"] for x in chars_list]
|
|
|
1047 |
for trial_id, subdf in stqdm(enum, desc="Creating trials"):
|
1048 |
if isinstance(stimulus, pd.DataFrame):
|
1049 |
stim_df = stimulus[stimulus.trial_id == subdf["trial_id"].iloc[0]]
|
1050 |
+
if stim_df.empty:
|
1051 |
+
st.session_state["logger"].warning(f"stim_df dataframe is empty because trial_id {trial_id} not in stimulus trial ids:\n{stimulus.trial_id.unique()}")
|
1052 |
|
1053 |
stim_df = stim_df.dropna(axis=0, how="all")
|
1054 |
subdf = subdf.dropna(axis=0, how="all")
|
1055 |
stim_df = stim_df.dropna(axis=1, how="all")
|
1056 |
subdf = subdf.dropna(axis=1, how="all")
|
1057 |
if subdf.empty:
|
1058 |
+
st.session_state["logger"].warning(f"Sub dataframe is empty for trial_id {trial_id}")
|
1059 |
continue
|
1060 |
subdf = subdf.reset_index(drop=True).copy()
|
1061 |
stim_df = stim_df.reset_index(drop=True).copy()
|
|
|
1064 |
stim_df,
|
1065 |
st.session_state["single_csv_file_stim"].name,
|
1066 |
trial_id,
|
1067 |
+
close_gaps_between_words=st.session_state["close_gap_between_words_single_csv"],
|
1068 |
+
close_gaps_between_lines=st.session_state["close_gap_between_lines_single_csv"],
|
1069 |
)
|
1070 |
else:
|
1071 |
if "trial_id" in stimulus.keys() and (
|
|
|
1090 |
|
1091 |
return trials_by_ids, trial_keys
|
1092 |
|
1093 |
+
def make_ints_float(df):
|
1094 |
+
for col in df.columns:
|
1095 |
+
if 'int' in str(df[col].dtype).lower():
|
1096 |
+
df[col] = pd.to_numeric(df[col], downcast='float')
|
1097 |
+
return df
|
1098 |
|
1099 |
def load_csv_delim_agnostic(file_path):
|
1100 |
try:
|
1101 |
df = pd.read_csv(file_path)
|
1102 |
if df.shape[1] > 1:
|
1103 |
+
return make_ints_float(df)
|
1104 |
else:
|
1105 |
dec_file = get_decoded_input_from_file(file_path)
|
1106 |
df = pd.read_csv(StringIO(dec_file.replace(";", ",").replace("\t", ",")))
|
1107 |
+
return make_ints_float(df)
|
1108 |
except Exception as e:
|
1109 |
dec_file = get_decoded_input_from_file(file_path)
|
1110 |
df = pd.read_csv(StringIO(dec_file.replace(";", ",").replace("\t", ",")))
|
1111 |
+
return make_ints_float(df)
|
1112 |
|
1113 |
|
1114 |
def find_col_name_suggestions(cols, candidates_dict):
|
|
|
1397 |
trial_choices_single_asc, trials_by_ids, lines, asc_file, trials_dict = ut.get_trials_list(
|
1398 |
st.session_state["single_asc_file_asc"],
|
1399 |
close_gap_between_words=st.session_state["close_gap_between_words_single_asc"],
|
1400 |
+
close_gap_between_lines=st.session_state["close_gap_between_lines_single_asc"],
|
1401 |
paragraph_trials_only=st.session_state["paragraph_trials_only_single_asc"],
|
1402 |
ias_files=st.session_state["single_asc_file_ias_files"],
|
1403 |
trial_start_keyword=trial_start_keyword,
|
|
|
2005 |
"Select .csv or .json file containing the stimulus data",
|
2006 |
accept_multiple_files=False,
|
2007 |
key="single_csv_file_stim_uploaded",
|
2008 |
+
type={"json", "csv", "txt", "dat","jpeg","png"},
|
2009 |
+
help="Drag and drop or select a single .json, .csv, .txt, .dat, jpeg or png file that you wish to process as the stimulus file for the uploaded fixation data. If an image is uploaded OCR will be attempted to extract the character bounding boxes. This can be left blank if you chose to use the examples.",
|
2010 |
)
|
2011 |
|
2012 |
+
st.checkbox(
|
2013 |
+
label="Should spaces between words be included in word bounding box?",
|
2014 |
+
value=get_default_val("close_gap_between_words_csv", True),
|
2015 |
+
key="close_gap_between_words_single_csv",
|
2016 |
+
help="If this is selected, each word bounding box will include half the spaces between adjacent words. If not, the word bounding boxes will simply be the combined bounding boxes of the letters making up the word.", # TODO check if this affects analysis
|
2017 |
+
)
|
2018 |
+
st.checkbox(
|
2019 |
+
label="Should spaces between lines be included in word and character bounding boxes?",
|
2020 |
+
value=get_default_val("close_gap_between_lines_single_csv", True),
|
2021 |
+
key="close_gap_between_lines_single_csv",
|
2022 |
+
help="If this is selected, each word and char bounding box will include half the spaces between adjacent lines.", # TODO check if this affects analysis
|
2023 |
+
)
|
2024 |
use_example_or_uploaded_file_choice = st.radio(
|
2025 |
"Should the uploaded files be used or some example files?",
|
2026 |
index=1,
|
|
|
2060 |
trial = json.loads(decoded_input)
|
2061 |
st.session_state["stimdf_single_csv"] = trial
|
2062 |
colnames_stim = list(st.session_state["stimdf_single_csv"].keys())
|
2063 |
+
elif any([".png" in single_csv_stim_file.name, ".jpeg" in single_csv_stim_file.name]):
|
2064 |
+
stimdf_single_csv = recognize_text(single_csv_stim_file)
|
2065 |
+
stimdf_single_csv.to_csv(RESULTS_FOLDER / f"{single_csv_stim_file.name}_stimdf_single_from_OCR.csv")
|
2066 |
+
st.session_state["stimdf_single_csv"] = stimdf_single_csv
|
2067 |
+
colnames_stim = st.session_state["stimdf_single_csv"].columns
|
2068 |
else:
|
2069 |
st.session_state["stimdf_single_csv"] = load_csv_delim_agnostic(single_csv_stim_file)
|
2070 |
colnames_stim = st.session_state["stimdf_single_csv"].columns
|
|
|
2448 |
use_corrected_fixations=True,
|
2449 |
correction_algo=st.session_state["algo_choice_custom_eyekit"],
|
2450 |
save_to_csv=True,
|
2451 |
+
measures_to_calculate = ALL_MEASURES_OWN
|
2452 |
)
|
2453 |
st.dataframe(own_word_measures, use_container_width=True, hide_index=True, height=200)
|
2454 |
own_word_measures_csv = convert_df(own_word_measures)
|
|
|
3098 |
use_corrected_fixations=True,
|
3099 |
correction_algo=st.session_state["algo_choice_multi_asc_eyekit"],
|
3100 |
save_to_csv=True,
|
3101 |
+
measures_to_calculate = ALL_MEASURES_OWN
|
3102 |
)
|
3103 |
if "sentence_measures_multi_asc" in st.session_state:
|
3104 |
sent_measures_multi = st.session_state["sentence_measures_multi_asc"]
|
|
|
3262 |
key=f"close_gap_between_words{suffix}",
|
3263 |
help="If this is selected, each word bounding box will include half the spaces between adjacent words. If not, the word bounding boxes will simply be the combined bounding boxes of the letters making up the word.", # TODO check if this affects analysis
|
3264 |
)
|
3265 |
+
st.checkbox(
|
3266 |
+
label="Should spaces between lines be included in word and character bounding boxes?",
|
3267 |
+
value=get_default_val(f"close_gap_between_lines{suffix}", True),
|
3268 |
+
key=f"close_gap_between_lines{suffix}",
|
3269 |
+
help="If this is selected, each word and char bounding box will include half the spaces between adjacent lines.", # TODO check if this affects analysis
|
3270 |
+
)
|
3271 |
st.markdown("### Trial filtering settings")
|
3272 |
|
3273 |
st.checkbox(
|
create_interest_areas_from_image.py
ADDED
@@ -0,0 +1,237 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from PIL import Image, ImageDraw
|
2 |
+
import pandas as pd
|
3 |
+
import io
|
4 |
+
import csv
|
5 |
+
import os
|
6 |
+
|
7 |
+
if os.environ.get('TESSDATA_PREFIX') is None and os.name == 'nt':
|
8 |
+
os.environ['TESSDATA_PREFIX'] = 'C:/Program Files/Tesseract-OCR/tessdata/'
|
9 |
+
tessdata_prefix = 'C:/Program Files/Tesseract-OCR/tessdata/'
|
10 |
+
if os.environ.get('TESSDATA_PREFIX') is None and os.name != 'nt':
|
11 |
+
os.environ['TESSDATA_PREFIX'] = '/usr/share/tesseract-ocr/4.00/tessdata'
|
12 |
+
tessdata_prefix = '/usr/share/tesseract-ocr/4.00/tessdata'
|
13 |
+
|
14 |
+
import pytesseract
|
15 |
+
if os.name == 'nt':
|
16 |
+
pytesseract.pytesseract.tesseract_cmd = r'c:/Program Files/Tesseract-OCR/tesseract.exe'
|
17 |
+
else:
|
18 |
+
pytesseract.pytesseract.tesseract_cmd =r'/usr/bin/tesseract'
|
19 |
+
|
20 |
+
def recognize_text(image_path, tesseract_config='--psm 6 -l spa'):
|
21 |
+
"""
|
22 |
+
Performs OCR on an image and returns a DataFrame with character bounding boxes
|
23 |
+
and associated information.
|
24 |
+
|
25 |
+
Args:
|
26 |
+
image_path: Path to the image file.
|
27 |
+
tesseract_config: Configuration string for pytesseract (e.g., '--psm 6 -l spa').
|
28 |
+
|
29 |
+
Returns:
|
30 |
+
pandas.DataFrame: DataFrame containing character-level data (df_word_chars).
|
31 |
+
"""
|
32 |
+
# if os.environ['TESSDATA_PREFIX'] is not None:
|
33 |
+
# tesseract_config = f'--tessdata-dir "{tessdata_prefix}"' + tesseract_config
|
34 |
+
image = Image.open(image_path).convert('RGB')
|
35 |
+
if hasattr(image_path,'name'):
|
36 |
+
im_name = image_path.name
|
37 |
+
else:
|
38 |
+
im_name = image_path
|
39 |
+
image_height = image.height
|
40 |
+
|
41 |
+
# Extract filename for trial_id
|
42 |
+
trial_id = os.path.splitext(os.path.basename(im_name))[0]
|
43 |
+
|
44 |
+
# Use pytesseract to extract data for words and characters
|
45 |
+
data_words = pytesseract.image_to_data(image, config=tesseract_config)
|
46 |
+
data_chars = pytesseract.image_to_boxes(image, config=tesseract_config)
|
47 |
+
|
48 |
+
df_words = pd.read_csv(io.StringIO(data_words), sep='\t', quoting=csv.QUOTE_NONE)
|
49 |
+
df_chars = pd.read_csv(io.StringIO(data_chars), sep=' ', header=None, names=['char', 'left', 'top', 'right', 'bottom', 'unknown'])
|
50 |
+
|
51 |
+
# Fix character coordinates
|
52 |
+
for index, row in df_chars.iterrows():
|
53 |
+
original_top = int(row['top'])
|
54 |
+
original_bottom = int(row['bottom'])
|
55 |
+
df_chars.at[index, 'top'] = image_height - original_bottom
|
56 |
+
df_chars.at[index, 'bottom'] = image_height - original_top
|
57 |
+
|
58 |
+
# Create DataFrame to store spaces
|
59 |
+
df_spaces = pd.DataFrame(columns=['level', 'page_num', 'block_num', 'par_num', 'line_num', 'word_num', 'left', 'top', 'width', 'height', 'conf', 'text'])
|
60 |
+
|
61 |
+
# Group words by line, block, and paragraph
|
62 |
+
grouped_lines = df_words.groupby(['block_num', 'par_num', 'line_num'])
|
63 |
+
|
64 |
+
for (block_num, par_num, line_num), line_words_df in grouped_lines:
|
65 |
+
sorted_words = line_words_df.sort_values(by='left')
|
66 |
+
previous_word = None
|
67 |
+
for index, current_word in sorted_words.iterrows():
|
68 |
+
if previous_word is not None:
|
69 |
+
space_left = int(previous_word['left']) + int(previous_word['width'])
|
70 |
+
space_width = int(current_word['left']) - space_left
|
71 |
+
if space_width > 0:
|
72 |
+
space_top = int(previous_word['top'])
|
73 |
+
space_height = int(previous_word['height'])
|
74 |
+
space_data = {
|
75 |
+
'level': 5,
|
76 |
+
'page_num': int(current_word['page_num']),
|
77 |
+
'block_num': int(current_word['block_num']),
|
78 |
+
'par_num': int(current_word['par_num']),
|
79 |
+
'line_num': int(current_word['line_num']),
|
80 |
+
'word_num': int(previous_word['word_num']),
|
81 |
+
'left': space_left,
|
82 |
+
'top': space_top,
|
83 |
+
'width': space_width,
|
84 |
+
'height': space_height,
|
85 |
+
'conf': 0,
|
86 |
+
'text': ' '
|
87 |
+
}
|
88 |
+
df_spaces = pd.concat([df_spaces, pd.DataFrame(space_data, index=[0])], ignore_index=True)
|
89 |
+
previous_word = current_word
|
90 |
+
|
91 |
+
# Create DataFrame for characters within words (and spaces)
|
92 |
+
df_word_chars = pd.DataFrame(columns=['char', 'char_xmin', 'char_ymin', 'char_xmax', 'char_ymax',
|
93 |
+
'block', 'paragraph', 'line_number',
|
94 |
+
'word_nr', 'letter_nr', 'word',
|
95 |
+
'char_x_center', 'char_y_center', 'assigned_line', 'trial_id'])
|
96 |
+
|
97 |
+
|
98 |
+
for index_word, row_word in df_words.iterrows():
|
99 |
+
if isinstance(row_word['text'], str) and row_word['text'].strip() and row_word['level'] == 5:
|
100 |
+
word_left = int(row_word['left'])
|
101 |
+
word_top = int(row_word['top'])
|
102 |
+
word_width = int(row_word['width'])
|
103 |
+
word_height = int(row_word['height'])
|
104 |
+
word_right = word_left + word_width
|
105 |
+
word_bottom = word_top + word_height
|
106 |
+
word_text = row_word['text']
|
107 |
+
|
108 |
+
char_index_in_word = 0
|
109 |
+
relevant_chars = df_chars[
|
110 |
+
(df_chars['left'] >= word_left) & (df_chars['right'] <= word_right) &
|
111 |
+
(df_chars['top'] >= word_top) & (df_chars['bottom'] <= word_bottom)
|
112 |
+
]
|
113 |
+
relevant_chars = relevant_chars.sort_values(by='left')
|
114 |
+
previous_char_right = word_left
|
115 |
+
|
116 |
+
for index_char, row_char in relevant_chars.iterrows():
|
117 |
+
char_text = row_char['char']
|
118 |
+
char_left = previous_char_right
|
119 |
+
char_right = int(row_char['right'])
|
120 |
+
char_right = min(char_right, word_right)
|
121 |
+
if char_left > char_right:
|
122 |
+
char_right = int(row_char['right'])
|
123 |
+
char_top = word_top
|
124 |
+
char_bottom = word_bottom
|
125 |
+
|
126 |
+
char_data = {
|
127 |
+
'char': char_text,
|
128 |
+
'char_xmin': int(round(char_left)), # Round and convert to int
|
129 |
+
'char_ymin': int(round(char_top)), # Round and convert to int
|
130 |
+
'char_xmax': int(round(char_right)), # Round and convert to int
|
131 |
+
'char_ymax': int(round(char_bottom)), # Round and convert to int
|
132 |
+
'block': int(row_word['block_num']),
|
133 |
+
'paragraph': int(row_word['par_num']),
|
134 |
+
'line_number': int(row_word['line_num']),
|
135 |
+
'word_nr': int(row_word['word_num']),
|
136 |
+
'letter_nr': int(char_index_in_word), #already an int
|
137 |
+
'word': word_text,
|
138 |
+
'char_x_center': int(round((char_left + char_right) / 2)), # Round and convert
|
139 |
+
'char_y_center': int(round((char_top + char_bottom) / 2)), # Round and convert
|
140 |
+
'assigned_line': None,
|
141 |
+
'trial_id': trial_id
|
142 |
+
}
|
143 |
+
df_word_chars = pd.concat([df_word_chars, pd.DataFrame(char_data, index=[0])], ignore_index=True)
|
144 |
+
char_index_in_word += 1
|
145 |
+
previous_char_right = char_right
|
146 |
+
|
147 |
+
spaces_following_word = df_spaces[
|
148 |
+
(df_spaces['word_num'] == int(row_word['word_num'])) &
|
149 |
+
(df_spaces['line_num'] == int(row_word['line_num'])) &
|
150 |
+
(df_spaces['block_num'] == int(row_word['block_num'])) &
|
151 |
+
(df_spaces['par_num'] == int(row_word['par_num']))
|
152 |
+
]
|
153 |
+
|
154 |
+
for index_space, row_space in spaces_following_word.iterrows():
|
155 |
+
space_data = {
|
156 |
+
'char': ' ',
|
157 |
+
'char_xmin': int(round(row_space['left'])), # Round and convert
|
158 |
+
'char_ymin': int(round(row_space['top'])), # Round and convert
|
159 |
+
'char_xmax': int(round(row_space['left'] + row_space['width'])), # Round and convert
|
160 |
+
'char_ymax': int(round(row_space['top'] + row_space['height'])), # Round and convert
|
161 |
+
'block': int(row_space['block_num']),
|
162 |
+
'paragraph': int(row_space['par_num']),
|
163 |
+
'line_number': int(row_space['line_num']),
|
164 |
+
'word_nr': int(row_space['word_num']),
|
165 |
+
'letter_nr': int(char_index_in_word), # Already int
|
166 |
+
'word': word_text,
|
167 |
+
'char_x_center': int(round((row_space['left'] + row_space['left'] + row_space['width']) / 2)), # Round
|
168 |
+
'char_y_center': int(round((row_space['top'] + row_space['top'] + row_space['height']) / 2)), # Round
|
169 |
+
'assigned_line': None,
|
170 |
+
'trial_id': trial_id
|
171 |
+
}
|
172 |
+
df_word_chars = pd.concat([df_word_chars, pd.DataFrame(space_data, index=[0])], ignore_index=True)
|
173 |
+
char_index_in_word += 1
|
174 |
+
|
175 |
+
# Create 'assigned_line' column
|
176 |
+
df_word_chars['assigned_line'] = 0
|
177 |
+
line_counter = 1
|
178 |
+
for block_num in sorted(df_word_chars['block'].unique()):
|
179 |
+
for par_num in sorted(df_word_chars.loc[df_word_chars['block'] == block_num, 'paragraph'].unique()):
|
180 |
+
for line_num in sorted(df_word_chars.loc[(df_word_chars['block'] == block_num) & (df_word_chars['paragraph'] == par_num), 'line_number'].unique()):
|
181 |
+
line_mask = (df_word_chars['line_number'] == line_num) & (df_word_chars['paragraph'] == par_num) & (df_word_chars['block'] == block_num)
|
182 |
+
df_word_chars.loc[line_mask, 'assigned_line'] = line_counter
|
183 |
+
line_counter += 1
|
184 |
+
|
185 |
+
# Adjust Y_Start, Y_End, and char_y_center, converting to integers
|
186 |
+
for assigned_line in df_word_chars['assigned_line'].unique():
|
187 |
+
line_mask = (df_word_chars['assigned_line'] == assigned_line)
|
188 |
+
min_top = df_word_chars.loc[line_mask, 'char_ymin'].min()
|
189 |
+
max_bottom = df_word_chars.loc[line_mask, 'char_ymax'].max()
|
190 |
+
new_y_center = (min_top + max_bottom) / 2
|
191 |
+
df_word_chars.loc[line_mask, 'char_ymin'] = int(round(min_top)) # Round and convert
|
192 |
+
df_word_chars.loc[line_mask, 'char_ymax'] = int(round(max_bottom)) # Round and convert
|
193 |
+
df_word_chars.loc[line_mask, 'char_y_center'] = int(round(new_y_center)) # Round and convert
|
194 |
+
|
195 |
+
# Convert relevant columns to integers
|
196 |
+
int_columns = ['char_xmin', 'char_ymin', 'char_xmax', 'char_ymax', 'block', 'paragraph',
|
197 |
+
'line_number', 'word_nr', 'letter_nr', 'char_x_center', 'char_y_center', 'assigned_line']
|
198 |
+
for col in int_columns:
|
199 |
+
df_word_chars[col] = df_word_chars[col].astype(int)
|
200 |
+
|
201 |
+
return df_word_chars
|
202 |
+
|
203 |
+
|
204 |
+
def draw_char_boxes(image_path, df_word_chars, output_path='output_boxes_combined.png'):
|
205 |
+
"""
|
206 |
+
Draws bounding boxes around characters on the image.
|
207 |
+
|
208 |
+
Args:
|
209 |
+
image_path: Path to the image file.
|
210 |
+
df_word_chars: DataFrame containing character bounding box data.
|
211 |
+
output_path: Path to save the image with bounding boxes. Defaults to 'output_boxes_combined.png'.
|
212 |
+
"""
|
213 |
+
image = Image.open(image_path).convert('RGB')
|
214 |
+
draw = ImageDraw.Draw(image)
|
215 |
+
|
216 |
+
# Draw bounding boxes for characters (purple)
|
217 |
+
for index, row in df_word_chars.iterrows():
|
218 |
+
left = int(row['char_xmin'])
|
219 |
+
top = int(row['char_ymin'])
|
220 |
+
right = int(row['char_xmax'])
|
221 |
+
bottom = int(row['char_ymax'])
|
222 |
+
draw.rectangle([(left, top), (right, bottom)], outline='purple', width=1)
|
223 |
+
|
224 |
+
# Display or save the image
|
225 |
+
image.save(output_path)
|
226 |
+
|
227 |
+
|
228 |
+
# Example usage
|
229 |
+
if __name__ == '__main__':
|
230 |
+
# image_path = 'testfiles/testim_ocr.png'
|
231 |
+
image_path = 'testfiles/newplot.png'
|
232 |
+
# Example with default tesseract config
|
233 |
+
df_chars = recognize_text(image_path)
|
234 |
+
draw_char_boxes(image_path, df_chars)
|
235 |
+
df_chars.to_csv('testim_ocr_df_word_chars_test.csv', index=False)
|
236 |
+
print("\nDataFrame of Characters within Words (df_word_chars) - Default Config:")
|
237 |
+
print(df_chars)
|
eyekit_measures.py
CHANGED
@@ -17,7 +17,6 @@ MEASURES_DICT = {
|
|
17 |
"second_pass_duration": [],
|
18 |
"initial_landing_position": [],
|
19 |
"initial_landing_distance": [],
|
20 |
-
"landing_distances": [],
|
21 |
"number_of_regressions_in": [],
|
22 |
}
|
23 |
|
|
|
17 |
"second_pass_duration": [],
|
18 |
"initial_landing_position": [],
|
19 |
"initial_landing_distance": [],
|
|
|
20 |
"number_of_regressions_in": [],
|
21 |
}
|
22 |
|
multi_proc_funcs.py
CHANGED
@@ -16,6 +16,7 @@ from matplotlib.font_manager import FontProperties
|
|
16 |
from matplotlib.patches import Rectangle
|
17 |
from tqdm.auto import tqdm
|
18 |
import torch as t
|
|
|
19 |
import plotly.express as px
|
20 |
import copy
|
21 |
|
@@ -1992,8 +1993,12 @@ def add_popEye_cols_to_dffix(dffix, algo_choice, chars_df, trial, xcol, cols_to_
|
|
1992 |
].reset_index()
|
1993 |
selected_stimmat.loc[:, "letword"] = selected_stimmat.groupby("in_word_number")["letternum"].rank()
|
1994 |
letters_on_line = selected_stimmat.shape[0]
|
1995 |
-
|
1996 |
-
|
|
|
|
|
|
|
|
|
1997 |
dffix.loc[i, f"letternum_{algo_choice}"] = selected_stimmat.loc[min_idx, "letternum"]
|
1998 |
dffix.loc[i, f"letter_{algo_choice}"] = selected_stimmat.loc[min_idx, "char"]
|
1999 |
dffix.loc[i, f"line_let_{algo_choice}"] = selected_stimmat.loc[min_idx, "letline"]
|
|
|
16 |
from matplotlib.patches import Rectangle
|
17 |
from tqdm.auto import tqdm
|
18 |
import torch as t
|
19 |
+
t.classes.__path__ = [] # https://discuss.streamlit.io/t/error-in-torch-with-streamlit/90908/3
|
20 |
import plotly.express as px
|
21 |
import copy
|
22 |
|
|
|
1993 |
].reset_index()
|
1994 |
selected_stimmat.loc[:, "letword"] = selected_stimmat.groupby("in_word_number")["letternum"].rank()
|
1995 |
letters_on_line = selected_stimmat.shape[0]
|
1996 |
+
if len(selected_stimmat["char_x_center"])>0:
|
1997 |
+
out = dffix.loc[i, xcol] - selected_stimmat["char_x_center"]
|
1998 |
+
min_idx = out.abs().idxmin()
|
1999 |
+
# ic(selected_stimmat)
|
2000 |
+
else:
|
2001 |
+
min_idx = 0
|
2002 |
dffix.loc[i, f"letternum_{algo_choice}"] = selected_stimmat.loc[min_idx, "letternum"]
|
2003 |
dffix.loc[i, f"letter_{algo_choice}"] = selected_stimmat.loc[min_idx, "char"]
|
2004 |
dffix.loc[i, f"line_let_{algo_choice}"] = selected_stimmat.loc[min_idx, "letline"]
|
requirements.txt
CHANGED
@@ -22,4 +22,5 @@ pycairo
|
|
22 |
eyekit
|
23 |
stqdm
|
24 |
jellyfish
|
25 |
-
icecream
|
|
|
|
22 |
eyekit
|
23 |
stqdm
|
24 |
jellyfish
|
25 |
+
icecream
|
26 |
+
pytesseract
|
utils.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1 |
import pickle
|
2 |
from io import StringIO
|
3 |
import re
|
|
|
4 |
import zipfile
|
5 |
import os
|
6 |
import plotly.graph_objects as go
|
@@ -17,6 +18,8 @@ from tqdm.auto import tqdm
|
|
17 |
import time
|
18 |
import requests
|
19 |
from icecream import ic
|
|
|
|
|
20 |
from matplotlib import font_manager
|
21 |
from multi_proc_funcs import (
|
22 |
COLORS,
|
@@ -99,13 +102,14 @@ def download_url(url, target_filename):
|
|
99 |
|
100 |
|
101 |
def asc_to_trial_ids(
|
102 |
-
asc_file, close_gap_between_words, paragraph_trials_only, ias_files, trial_start_keyword, end_trial_at_keyword
|
103 |
):
|
104 |
asc_encoding = ["ISO-8859-15", "UTF-8"][0]
|
105 |
trials_dict, lines = file_to_trials_and_lines(
|
106 |
asc_file,
|
107 |
asc_encoding,
|
108 |
close_gap_between_words=close_gap_between_words,
|
|
|
109 |
paragraph_trials_only=paragraph_trials_only,
|
110 |
uploaded_ias_files=ias_files,
|
111 |
trial_start_keyword=trial_start_keyword,
|
@@ -122,7 +126,7 @@ def asc_to_trial_ids(
|
|
122 |
|
123 |
|
124 |
def get_trials_list(
|
125 |
-
asc_file, close_gap_between_words, paragraph_trials_only, ias_files, trial_start_keyword, end_trial_at_keyword
|
126 |
):
|
127 |
if hasattr(asc_file, "name"):
|
128 |
savename = pl.Path(asc_file.name).stem
|
@@ -132,6 +136,7 @@ def get_trials_list(
|
|
132 |
trials_by_ids, lines, trials_dict = asc_to_trial_ids(
|
133 |
asc_file,
|
134 |
close_gap_between_words=close_gap_between_words,
|
|
|
135 |
paragraph_trials_only=paragraph_trials_only,
|
136 |
ias_files=ias_files,
|
137 |
trial_start_keyword=trial_start_keyword,
|
@@ -238,9 +243,9 @@ def add_words(chars_list):
|
|
238 |
word_ymin = chars_list_reconstructed[word_start_idx]["char_ymin"]
|
239 |
word_ymax = chars_list_reconstructed[word_start_idx]["char_ymax"]
|
240 |
word_x_center = round((word_xmax - word_xmin) / 2 + word_xmin, ndigits=2)
|
241 |
-
word_y_center =
|
242 |
word_length = len(word)
|
243 |
-
assigned_line = chars_list_reconstructed[word_start_idx]["assigned_line"]
|
244 |
word_dict = dict(
|
245 |
word_number=len(words_list),
|
246 |
word=word,
|
@@ -473,6 +478,7 @@ def asc_lines_to_trials_by_trail_id(
|
|
473 |
paragraph_trials_only=True,
|
474 |
filename: str = "",
|
475 |
close_gap_between_words=True,
|
|
|
476 |
ias_files=[],
|
477 |
start_trial_at_keyword="START",
|
478 |
end_trial_at_keyword="END",
|
@@ -884,21 +890,189 @@ def asc_lines_to_trials_by_trail_id(
|
|
884 |
words_list = words_list_from_func
|
885 |
|
886 |
if close_gap_between_words: # TODO this may need to change the "in_word" col for the chars_df
|
887 |
-
|
888 |
-
if words_list[widx]["assigned_line"] == words_list[widx - 1]["assigned_line"]:
|
889 |
-
word_sep_half_width = (words_list[widx]["word_xmin"] - words_list[widx - 1]["word_xmax"]) / 2
|
890 |
-
words_list[widx - 1]["word_xmax"] = words_list[widx - 1]["word_xmax"] + word_sep_half_width
|
891 |
-
words_list[widx]["word_xmin"] = words_list[widx]["word_xmin"] - word_sep_half_width
|
892 |
else:
|
893 |
chars_df = pd.DataFrame(chars_list_reconstructed)
|
894 |
chars_df.loc[
|
895 |
chars_df["char"] == " ", ["in_word", "in_word_number", "num_letters_from_start_of_word"]
|
896 |
] = pd.NA
|
897 |
chars_list_reconstructed = chars_df.to_dict("records")
|
|
|
|
|
|
|
898 |
trials_dict[trial_idx]["words_list"] = words_list
|
899 |
trials_dict[trial_idx]["chars_list"] = chars_list_reconstructed
|
900 |
return trials_dict
|
901 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
902 |
|
903 |
def get_lines_from_file(uploaded_file, asc_encoding="ISO-8859-15"):
|
904 |
if isinstance(uploaded_file, str) or isinstance(uploaded_file, pl.Path):
|
@@ -915,6 +1089,7 @@ def file_to_trials_and_lines(
|
|
915 |
uploaded_file,
|
916 |
asc_encoding: str = "ISO-8859-15",
|
917 |
close_gap_between_words=True,
|
|
|
918 |
paragraph_trials_only=True,
|
919 |
uploaded_ias_files=[],
|
920 |
trial_start_keyword="START",
|
@@ -926,6 +1101,7 @@ def file_to_trials_and_lines(
|
|
926 |
paragraph_trials_only,
|
927 |
uploaded_file,
|
928 |
close_gap_between_words=close_gap_between_words,
|
|
|
929 |
ias_files=uploaded_ias_files,
|
930 |
start_trial_at_keyword=trial_start_keyword,
|
931 |
end_trial_at_keyword=end_trial_at_keyword,
|
@@ -1007,7 +1183,7 @@ def plotly_plot_with_image(
|
|
1007 |
|
1008 |
if lines_in_plot == "Both":
|
1009 |
uncorrected_plot_mode = "markers+lines+text"
|
1010 |
-
corrected_plot_mode = "markers+
|
1011 |
|
1012 |
fig = go.Figure()
|
1013 |
fig.add_trace(
|
|
|
1 |
import pickle
|
2 |
from io import StringIO
|
3 |
import re
|
4 |
+
from typing import Dict, List
|
5 |
import zipfile
|
6 |
import os
|
7 |
import plotly.graph_objects as go
|
|
|
18 |
import time
|
19 |
import requests
|
20 |
from icecream import ic
|
21 |
+
import collections
|
22 |
+
import statistics
|
23 |
from matplotlib import font_manager
|
24 |
from multi_proc_funcs import (
|
25 |
COLORS,
|
|
|
102 |
|
103 |
|
104 |
def asc_to_trial_ids(
|
105 |
+
asc_file, close_gap_between_words,close_gap_between_lines, paragraph_trials_only, ias_files, trial_start_keyword, end_trial_at_keyword
|
106 |
):
|
107 |
asc_encoding = ["ISO-8859-15", "UTF-8"][0]
|
108 |
trials_dict, lines = file_to_trials_and_lines(
|
109 |
asc_file,
|
110 |
asc_encoding,
|
111 |
close_gap_between_words=close_gap_between_words,
|
112 |
+
close_gap_between_lines=close_gap_between_lines,
|
113 |
paragraph_trials_only=paragraph_trials_only,
|
114 |
uploaded_ias_files=ias_files,
|
115 |
trial_start_keyword=trial_start_keyword,
|
|
|
126 |
|
127 |
|
128 |
def get_trials_list(
|
129 |
+
asc_file, close_gap_between_words,close_gap_between_lines, paragraph_trials_only, ias_files, trial_start_keyword, end_trial_at_keyword
|
130 |
):
|
131 |
if hasattr(asc_file, "name"):
|
132 |
savename = pl.Path(asc_file.name).stem
|
|
|
136 |
trials_by_ids, lines, trials_dict = asc_to_trial_ids(
|
137 |
asc_file,
|
138 |
close_gap_between_words=close_gap_between_words,
|
139 |
+
close_gap_between_lines=close_gap_between_lines,
|
140 |
paragraph_trials_only=paragraph_trials_only,
|
141 |
ias_files=ias_files,
|
142 |
trial_start_keyword=trial_start_keyword,
|
|
|
243 |
word_ymin = chars_list_reconstructed[word_start_idx]["char_ymin"]
|
244 |
word_ymax = chars_list_reconstructed[word_start_idx]["char_ymax"]
|
245 |
word_x_center = round((word_xmax - word_xmin) / 2 + word_xmin, ndigits=2)
|
246 |
+
word_y_center = chars_list_reconstructed[word_start_idx]["char_y_center"]
|
247 |
word_length = len(word)
|
248 |
+
assigned_line = int(chars_list_reconstructed[word_start_idx]["assigned_line"])
|
249 |
word_dict = dict(
|
250 |
word_number=len(words_list),
|
251 |
word=word,
|
|
|
478 |
paragraph_trials_only=True,
|
479 |
filename: str = "",
|
480 |
close_gap_between_words=True,
|
481 |
+
close_gap_between_lines=True,
|
482 |
ias_files=[],
|
483 |
start_trial_at_keyword="START",
|
484 |
end_trial_at_keyword="END",
|
|
|
890 |
words_list = words_list_from_func
|
891 |
|
892 |
if close_gap_between_words: # TODO this may need to change the "in_word" col for the chars_df
|
893 |
+
words_list = close_gaps_in_words_list(words_list)
|
|
|
|
|
|
|
|
|
894 |
else:
|
895 |
chars_df = pd.DataFrame(chars_list_reconstructed)
|
896 |
chars_df.loc[
|
897 |
chars_df["char"] == " ", ["in_word", "in_word_number", "num_letters_from_start_of_word"]
|
898 |
] = pd.NA
|
899 |
chars_list_reconstructed = chars_df.to_dict("records")
|
900 |
+
if close_gap_between_lines:
|
901 |
+
chars_list_reconstructed = close_gaps_between_lines(chars_list_reconstructed,prefix='char')
|
902 |
+
words_list = close_gaps_between_lines(words_list,prefix='word')
|
903 |
trials_dict[trial_idx]["words_list"] = words_list
|
904 |
trials_dict[trial_idx]["chars_list"] = chars_list_reconstructed
|
905 |
return trials_dict
|
906 |
|
907 |
+
def close_gaps_between_lines(data, prefix):
    """
    Close vertical gaps between text lines by stretching each line's y-extent.

    For every unique ``assigned_line`` the average y-center of its items is
    computed; the midpoint between adjacent line centers becomes the boundary
    separating those lines, and each item's ``{prefix}_ymin`` /
    ``{prefix}_ymax`` is rewritten so consecutive lines touch.  The top of the
    first line and the bottom of the last line are extrapolated from the
    neighbouring inter-line spacing.

    Args:
        data: List of dicts, each carrying an 'assigned_line' key plus
            f'{prefix}_ymin' and f'{prefix}_ymax' keys (e.g. prefix='char'
            for character boxes, prefix='word' for word boxes).
        prefix: Key prefix used to read and write the y-coordinate fields.

    Returns:
        A new list of dicts (shallow copies) with adjusted ymin/ymax values.
        Returns [] for empty input, and the input list unchanged when fewer
        than two distinct lines are present (no between-line gap to close).
    """
    if not data:
        return []

    ymin_key = f"{prefix}_ymin"
    ymax_key = f"{prefix}_ymax"

    # --- Step 1: collect y-centers grouped by assigned_line ---
    line_centers = collections.defaultdict(list)
    for item in data:
        if ymin_key in item and ymax_key in item:
            ycenter = (item[ymin_key] + item[ymax_key]) / 2
            line_centers[item["assigned_line"]].append(ycenter)

    # --- Step 2: average y-center per assigned_line ---
    avg_centers = {
        line_num: statistics.mean(centers)
        for line_num, centers in line_centers.items()
        if centers  # avoid mean() on an empty list
    }

    # With 0 or 1 unique lines there is no between-line gap to close.
    if len(avg_centers) <= 1:
        print(
            "Only one unique line number found or no valid lines. No adjustments made."
        )
        return data

    # --- Step 3: order the lines top-to-bottom by average y-center ---
    sorted_lines = sorted(avg_centers.items(), key=lambda kv: kv[1])
    sorted_line_nums = [kv[0] for kv in sorted_lines]
    sorted_avg_centers = [kv[1] for kv in sorted_lines]

    # --- Step 4: boundary between adjacent lines is the midpoint of centers ---
    num_lines = len(sorted_avg_centers)
    boundaries = {
        i: (sorted_avg_centers[i] + sorted_avg_centers[i + 1]) / 2
        for i in range(num_lines - 1)
    }

    # --- Step 5: derive new ymin/ymax for each assigned_line ---
    new_coords = {}  # {assigned_line: {ymin_key: ..., ymax_key: ...}}

    # First line: extrapolate the top boundary from the first inter-line gap;
    # clamp at 0 so the extent never becomes negative.  The new ymin starts
    # 1 pixel *after* the rounded boundary (the boundary is the dividing line).
    first_line_num = sorted_line_nums[0]
    extrapolated_start_boundary = max(
        0, sorted_avg_centers[0] - (sorted_avg_centers[1] - sorted_avg_centers[0]) / 2
    )
    new_ymin_first = round(extrapolated_start_boundary) + 1
    new_ymax_first = round(boundaries[0])
    if new_ymin_first > new_ymax_first:
        print(
            f"Warning: Calculated word_ymin ({new_ymin_first}) > word_ymax ({new_ymax_first}) for first line ({first_line_num}). Adjusting word_ymin."
        )
        new_ymin_first = new_ymax_first  # collapse to zero height rather than invert
    new_coords[first_line_num] = {ymin_key: new_ymin_first, ymax_key: new_ymax_first}

    # Intermediate lines span from the previous boundary (+1 px) to the next.
    for i in range(1, num_lines - 1):
        line_num = sorted_line_nums[i]
        new_ymin = round(boundaries[i - 1]) + 1
        new_ymax = round(boundaries[i])
        if new_ymin > new_ymax:
            print(
                f"Warning: Calculated word_ymin ({new_ymin}) > word_ymax ({new_ymax}) for intermediate line ({line_num}). Adjusting word_ymin."
            )
            new_ymin = new_ymax
        new_coords[line_num] = {ymin_key: new_ymin, ymax_key: new_ymax}

    # Last line: extrapolate the bottom boundary from the last inter-line gap.
    last_line_num = sorted_line_nums[-1]
    extrapolated_end_boundary = (
        sorted_avg_centers[-1] + (sorted_avg_centers[-1] - sorted_avg_centers[-2]) / 2
    )
    new_ymin_last = round(boundaries[num_lines - 2]) + 1
    new_ymax_last = round(extrapolated_end_boundary)
    if new_ymin_last > new_ymax_last:
        print(
            f"Warning: Calculated word_ymin ({new_ymin_last}) > word_ymax ({new_ymax_last}) for last line ({last_line_num}). Adjusting word_ymax."
        )
        new_ymax_last = new_ymin_last
    new_coords[last_line_num] = {ymin_key: new_ymin_last, ymax_key: new_ymax_last}

    # --- Step 6: write the new extents back onto shallow copies so the
    # caller's original dicts stay untouched ---
    adjusted_data = []
    for item in data:
        new_item = item.copy()
        line_num = new_item.get("assigned_line")
        if line_num in new_coords:
            new_item[ymin_key] = new_coords[line_num][ymin_key]
            new_item[ymax_key] = new_coords[line_num][ymax_key]
        adjusted_data.append(new_item)

    return adjusted_data
1044 |
+
|
1045 |
+
def close_gaps_in_words_list(words_list: List[Dict]):
    """
    Close horizontal gaps between consecutive words on the same line.

    For each pair of consecutive entries sharing the same ``assigned_line``,
    the gap between the first word's ``word_xmax`` and the second word's
    ``word_xmin`` is split equally: the left word's ``word_xmax`` grows by
    half the gap and the right word's ``word_xmin`` shrinks by the same
    amount, so the two bounding boxes meet in the middle.

    Args:
        words_list: List of word dicts, each with the keys
            - "assigned_line" (int): line number the word belongs to,
            - "word_xmin" (float): left edge of the word's bounding box,
            - "word_xmax" (float): right edge of the word's bounding box.
            Assumed to be ordered by line and by x-coordinate within a line.

    Returns:
        The same list object, with its dicts mutated in place.
    """
    for widx in range(1, len(words_list)):
        prev_word, cur_word = words_list[widx - 1], words_list[widx]
        # Only close gaps within a line; never bridge a line break.
        if cur_word["assigned_line"] == prev_word["assigned_line"]:
            word_sep_half_width = (cur_word["word_xmin"] - prev_word["word_xmax"]) / 2
            prev_word["word_xmax"] = prev_word["word_xmax"] + word_sep_half_width
            cur_word["word_xmin"] = cur_word["word_xmin"] - word_sep_half_width
    return words_list
|
1076 |
|
1077 |
def get_lines_from_file(uploaded_file, asc_encoding="ISO-8859-15"):
|
1078 |
if isinstance(uploaded_file, str) or isinstance(uploaded_file, pl.Path):
|
|
|
1089 |
uploaded_file,
|
1090 |
asc_encoding: str = "ISO-8859-15",
|
1091 |
close_gap_between_words=True,
|
1092 |
+
close_gap_between_lines=True,
|
1093 |
paragraph_trials_only=True,
|
1094 |
uploaded_ias_files=[],
|
1095 |
trial_start_keyword="START",
|
|
|
1101 |
paragraph_trials_only,
|
1102 |
uploaded_file,
|
1103 |
close_gap_between_words=close_gap_between_words,
|
1104 |
+
close_gap_between_lines=close_gap_between_lines,
|
1105 |
ias_files=uploaded_ias_files,
|
1106 |
start_trial_at_keyword=trial_start_keyword,
|
1107 |
end_trial_at_keyword=end_trial_at_keyword,
|
|
|
1183 |
|
1184 |
if lines_in_plot == "Both":
|
1185 |
uncorrected_plot_mode = "markers+lines+text"
|
1186 |
+
corrected_plot_mode = "markers+text"
|
1187 |
|
1188 |
fig = go.Figure()
|
1189 |
fig.add_trace(
|