MilanM commited on
Commit
e5d0cdb
·
verified ·
1 Parent(s): ed4a452

Upload 3 files

Browse files
helper_functions/helper_functions.py CHANGED
@@ -7,14 +7,15 @@ import glob
7
  import io
8
  import os
9
 
 
10
  def get_cred_value(key, creds_var_name="baked_in_creds", default=""):
11
  """
12
  Helper function to safely get a value from a credentials dictionary.
13
-
14
  Searches for credentials in:
15
  1. Global variables with the specified variable name
16
  2. Imported modules containing the specified variable name
17
-
18
  Args:
19
  key: The key to look up in the credentials dictionary.
20
  creds_var_name: The variable name of the credentials dictionary.
@@ -28,38 +29,44 @@ def get_cred_value(key, creds_var_name="baked_in_creds", default=""):
28
  creds_dict = globals()[creds_var_name]
29
  if isinstance(creds_dict, dict) and key in creds_dict:
30
  return creds_dict[key]
31
-
32
  # Check if credentials are in an imported module
33
  import sys
 
34
  for module_name, module_obj in sys.modules.items():
35
  if hasattr(module_obj, creds_var_name):
36
  creds_dict = getattr(module_obj, creds_var_name)
37
  if isinstance(creds_dict, dict) and key in creds_dict:
38
  return creds_dict[key]
39
-
40
  return default
41
 
 
42
  def get_key_by_value(dictionary, value):
43
  for key, val in dictionary.items():
44
  if val == value:
45
  return key
46
  return None
47
 
 
48
  def markdown_spacing(number):
49
  """Convert a number to that many ' ' characters."""
50
- return ' ' * number
 
51
 
52
  def wrap_with_spaces(text_to_wrap, prefix_spaces=2, suffix_spaces=2):
53
  """Wrap text with non-breaking spaces on either side."""
54
  prefix = markdown_spacing(prefix_spaces) if prefix_spaces > 0 else ""
55
  suffix = markdown_spacing(suffix_spaces) if suffix_spaces > 0 else ""
56
  return f"{prefix}{text_to_wrap}{suffix}"
57
-
58
 
59
- def load_file_dataframe(file, file_extension, sheet_selector=None, excel_data=None, header_row=0):
 
 
 
60
  """
61
  Load a dataframe from an uploaded file with customizable header and row skipping.
62
-
63
  Parameters:
64
  -----------
65
  file : marimo.ui.file object
@@ -73,37 +80,41 @@ def load_file_dataframe(file, file_extension, sheet_selector=None, excel_data=No
73
  header_row : int, optional
74
  Row index to use as column headers (0-based). Default is 0 (first row).
75
  Use None to have pandas generate default column names.
76
-
77
  Returns:
78
  --------
79
  tuple
80
  (pandas.DataFrame, list) - The loaded dataframe and list of column names
81
  """
82
-
83
  dataframe = pd.DataFrame([])
84
  column_names = []
85
-
86
  if file.contents():
87
  # Handle different file types
88
- if file_extension in ['.xlsx', '.xls'] and sheet_selector is not None and sheet_selector.value:
 
 
 
 
89
  # For Excel files - now we can safely access sheet_selector.value
90
  excel_data.seek(0) # Reset buffer position
91
  dataframe = pd.read_excel(
92
- excel_data,
93
  sheet_name=sheet_selector.value,
94
  header=header_row,
95
- engine="openpyxl" if file_extension == '.xlsx' else "xlrd"
96
  )
97
  column_names = list(dataframe.columns)
98
- elif file_extension == '.csv':
99
  # For CSV files
100
- csv_data = io.StringIO(file.contents().decode('utf-8'))
101
  dataframe = pd.read_csv(csv_data, header=header_row)
102
  column_names = list(dataframe.columns)
103
- elif file_extension == '.json':
104
  # For JSON files
105
  try:
106
- json_data = json.loads(file.contents().decode('utf-8'))
107
  # Handle different JSON structures
108
  if isinstance(json_data, list):
109
  dataframe = pd.DataFrame(json_data)
@@ -118,12 +129,17 @@ def load_file_dataframe(file, file_extension, sheet_selector=None, excel_data=No
118
  column_names = list(dataframe.columns)
119
  except Exception as e:
120
  print(f"Error parsing JSON: {e}")
121
-
122
  return dataframe, column_names
123
 
124
 
125
- def create_parameter_table(input_list, column_name="Active Options", label="Select the Parameters to set to Active",
126
- selection_type="multi-cell", text_justify="center"):
 
 
 
 
 
127
  """
128
  Creates a marimo table for parameter selection.
129
 
@@ -150,13 +166,14 @@ def create_parameter_table(input_list, column_name="Active Options", label="Sele
150
  # Create the table
151
  parameter_table = mo.ui.table(
152
  label=f"**{label}**",
153
- data={column_name: input_list},
154
  selection=selection_type,
155
- text_justify_columns={column_name: text_justify}
156
  )
157
 
158
  return parameter_table
159
 
 
160
  def get_cell_values(parameter_options):
161
  """
162
  Extract active parameter values from a mo.ui.table.
@@ -171,11 +188,11 @@ def get_cell_values(parameter_options):
171
  all_params = set()
172
 
173
  # Use the data property to get all options from the table
174
- if hasattr(parameter_options, 'data'):
175
  table_data = parameter_options.data
176
 
177
  # Handle DataFrame-like structure
178
- if hasattr(table_data, 'shape') and hasattr(table_data, 'iloc'):
179
  for i in range(table_data.shape[0]):
180
  # Get value from first column
181
  if table_data.shape[1] > 0:
@@ -196,20 +213,21 @@ def get_cell_values(parameter_options):
196
  result = {param: False for param in all_params}
197
 
198
  # Get the selected cells
199
- if hasattr(parameter_options, 'value') and parameter_options.value is not None:
200
  selected_cells = parameter_options.value
201
 
202
  # Process selected cells
203
  for cell in selected_cells:
204
- if hasattr(cell, 'value') and cell.value in result:
205
  result[cell.value] = True
206
- elif isinstance(cell, dict) and 'value' in cell and cell['value'] in result:
207
- result[cell['value']] = True
208
  elif isinstance(cell, str) and cell in result:
209
  result[cell] = True
210
 
211
  return result
212
 
 
213
  def convert_table_to_json_docs(df, selected_columns=None):
214
  """
215
  Convert a pandas DataFrame or dictionary to a list of JSON documents.
@@ -232,9 +250,9 @@ def convert_table_to_json_docs(df, selected_columns=None):
232
  if not isinstance(key, str):
233
  return str(key).lower()
234
  # Replace spaces with underscores and convert to lowercase
235
- key = key.lower().replace(' ', '_')
236
  # Remove special characters (keeping alphanumeric and underscores)
237
- return re.sub(r'[^\w]', '', key)
238
 
239
  # Handle case when input is a dictionary
240
  if isinstance(df, dict):
@@ -266,7 +284,11 @@ def convert_table_to_json_docs(df, selected_columns=None):
266
  selected_columns = [col for col, include in selected_columns.items() if include]
267
 
268
  # If no columns are specifically selected, use all available columns
269
- if not selected_columns or not isinstance(selected_columns, list) or len(selected_columns) == 0:
 
 
 
 
270
  selected_columns = list(df.columns)
271
 
272
  # Determine which columns exist in the DataFrame
@@ -291,11 +313,114 @@ def convert_table_to_json_docs(df, selected_columns=None):
291
  value = row[col]
292
  # Standardize the column name when adding to document
293
  std_col = standardize_key(col)
294
- doc[std_col] = None if pd.isna(value) else value
 
 
 
 
 
 
 
 
 
 
 
295
  json_docs.append(doc)
296
 
297
  return json_docs
298
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
299
  def filter_models_by_function(resources, function_type="prompt_chat"):
300
  """
301
  Filter model IDs from resources list that have a specific function type
@@ -317,7 +442,7 @@ def filter_models_by_function(resources, function_type="prompt_chat"):
317
  if "functions" in model and isinstance(model["functions"], list):
318
  # Check if any function has the matching id
319
  has_function = any(
320
- func.get("id") == function_type
321
  for func in model["functions"]
322
  if isinstance(func, dict)
323
  )
@@ -328,25 +453,30 @@ def filter_models_by_function(resources, function_type="prompt_chat"):
328
  return filtered_model_ids
329
 
330
 
331
- def get_model_selection_table(client=None, model_type="all", filter_functionality=None, selection_mode="single-cell"):
 
 
 
 
 
332
  """
333
  Creates and displays a table for model selection based on specified parameters.
334
-
335
  Args:
336
  client: The client object for API calls. If None, returns default models.
337
  model_type (str): Type of models to display. Options: "all", "chat", "embedding".
338
  filter_functionality (str, optional): Filter models by functionality type.
339
- Options include: "image_chat", "text_chat", "autoai_rag",
340
  "text_generation", "multilingual", etc.
341
  selection_mode (str): Mode for selecting table entries. Options: "single", "single-cell".
342
  Defaults to "single-cell".
343
-
344
  Returns:
345
  The selected model ID from the displayed table.
346
  """
347
  # Default model list if client is None
348
- default_models = ['mistralai/mistral-large']
349
-
350
  if client is None:
351
  # If no client, use default models
352
  available_models = default_models
@@ -357,7 +487,7 @@ def get_model_selection_table(client=None, model_type="all", filter_functionalit
357
  page_size=30,
358
  )
359
  return selection
360
-
361
  # Get appropriate model specs based on model_type
362
  if model_type == "chat":
363
  model_specs = client.foundation_models.get_chat_model_specs()
@@ -365,49 +495,51 @@ def get_model_selection_table(client=None, model_type="all", filter_functionalit
365
  model_specs = client.foundation_models.get_embeddings_model_specs()
366
  else:
367
  model_specs = client.foundation_models.get_model_specs()
368
-
369
  # Extract resources from model specs
370
  resources = model_specs.get("resources", [])
371
-
372
  # Filter by functionality if specified
373
  if filter_functionality and resources:
374
  model_id_list = filter_models_by_function(resources, filter_functionality)
375
  else:
376
  # Create list of model IDs if no filtering
377
  model_id_list = [resource["model_id"] for resource in resources]
378
-
379
  # If no models available after filtering, use defaults
380
  if not model_id_list:
381
  model_id_list = default_models
382
-
383
  # Create and display selection table
384
  model_selector = mo.ui.table(
385
  model_id_list,
386
  selection=selection_mode,
387
  label="Select a model to use.",
388
  page_size=30,
389
- initial_selection = [("0", "value")] if selection_mode == "single-cell" else [0]
390
  ### For single-cell it must have [("<row_nr as a string>","column_name string")] to work as initial value
391
  )
392
-
393
  return model_selector, resources, model_id_list
394
 
 
395
  def _enforce_model_selection(model_selection, model_id_list):
396
  # If nothing is selected (empty list) or value is None
397
  if not model_selection.value:
398
  # Reset to first item
399
- model = 0
400
- model_selection._value = model_id_list[model]
401
- print(model_selection.value)
402
- return model_selection.value
403
-
 
404
  def update_max_tokens_limit(model_selection, resources, model_id_list):
405
  # Default value
406
  default_max_tokens = 4096
407
 
408
  try:
409
  # Check if we have a selection and resources
410
- if model_selection.value is None or not hasattr(model_selection, 'value'):
411
  print("No model selection or selection has no value")
412
  return default_max_tokens
413
 
@@ -421,10 +553,14 @@ def update_max_tokens_limit(model_selection, resources, model_id_list):
421
 
422
  # If it's an array with indices
423
  if isinstance(selected_value, list) and len(selected_value) > 0:
424
- if isinstance(selected_value[0], int) and 0 <= selected_value[0] < len(model_id_list):
 
 
425
  selected_model_id = model_id_list[selected_value[0]]
426
  else:
427
- selected_model_id = str(selected_value[0]) # Convert to string if needed
 
 
428
  else:
429
  selected_model_id = str(selected_value) # Direct value
430
 
@@ -434,7 +570,10 @@ def update_max_tokens_limit(model_selection, resources, model_id_list):
434
  for model in resources:
435
  model_id = model.get("model_id")
436
  if model_id == selected_model_id:
437
- if "model_limits" in model and "max_output_tokens" in model["model_limits"]:
 
 
 
438
  return model["model_limits"]["max_output_tokens"]
439
  break
440
 
@@ -447,7 +586,7 @@ def update_max_tokens_limit(model_selection, resources, model_id_list):
447
  def load_templates(
448
  folder_path: str,
449
  file_extensions: Optional[List[str]] = None,
450
- strip_whitespace: bool = True
451
  ) -> Dict[str, str]:
452
  """
453
  Load template files from a specified folder into a dictionary.
@@ -462,15 +601,17 @@ def load_templates(
462
  """
463
  # Default extensions if none provided
464
  if file_extensions is None:
465
- file_extensions = ['.txt', '.md']
466
 
467
  # Ensure extensions start with a dot
468
- file_extensions = [ext if ext.startswith('.') else f'.{ext}' for ext in file_extensions]
 
 
469
 
470
  templates = {"empty": " "} # Default empty template
471
 
472
  # Create glob patterns for each extension
473
- patterns = [os.path.join(folder_path, f'*{ext}') for ext in file_extensions]
474
 
475
  # Find all matching files
476
  for pattern in patterns:
@@ -481,7 +622,7 @@ def load_templates(
481
  template_name = os.path.splitext(filename)[0]
482
 
483
  # Read file content
484
- with open(file_path, 'r', encoding='utf-8') as file:
485
  content = file.read()
486
 
487
  # Strip whitespace if specified
 
7
  import io
8
  import os
9
 
10
+
11
  def get_cred_value(key, creds_var_name="baked_in_creds", default=""):
12
  """
13
  Helper function to safely get a value from a credentials dictionary.
14
+
15
  Searches for credentials in:
16
  1. Global variables with the specified variable name
17
  2. Imported modules containing the specified variable name
18
+
19
  Args:
20
  key: The key to look up in the credentials dictionary.
21
  creds_var_name: The variable name of the credentials dictionary.
 
29
  creds_dict = globals()[creds_var_name]
30
  if isinstance(creds_dict, dict) and key in creds_dict:
31
  return creds_dict[key]
32
+
33
  # Check if credentials are in an imported module
34
  import sys
35
+
36
  for module_name, module_obj in sys.modules.items():
37
  if hasattr(module_obj, creds_var_name):
38
  creds_dict = getattr(module_obj, creds_var_name)
39
  if isinstance(creds_dict, dict) and key in creds_dict:
40
  return creds_dict[key]
41
+
42
  return default
43
 
44
+
45
def get_key_by_value(dictionary, value):
    """Return the first key in *dictionary* mapped to *value*, or None.

    Performs a linear scan in insertion order, so with duplicate values the
    earliest-inserted matching key wins.
    """
    return next((k for k, v in dictionary.items() if v == value), None)
50
 
51
+
52
def markdown_spacing(number):
    """Convert a number to that many '&nbsp;' characters."""
    # Accumulate one HTML non-breaking-space entity per count, then join.
    # A non-positive count naturally yields the empty string.
    chunks = []
    for _ in range(number):
        chunks.append("&nbsp;")
    return "".join(chunks)
55
+
56
 
57
def wrap_with_spaces(text_to_wrap, prefix_spaces=2, suffix_spaces=2):
    """Wrap text with non-breaking spaces on either side."""

    def _pad(count):
        # Non-positive counts contribute nothing, mirroring the guard the
        # caller would otherwise need.
        if count > 0:
            return markdown_spacing(count)
        return ""

    return f"{_pad(prefix_spaces)}{text_to_wrap}{_pad(suffix_spaces)}"
 
62
 
63
+
64
+ def load_file_dataframe(
65
+ file, file_extension, sheet_selector=None, excel_data=None, header_row=0
66
+ ):
67
  """
68
  Load a dataframe from an uploaded file with customizable header and row skipping.
69
+
70
  Parameters:
71
  -----------
72
  file : marimo.ui.file object
 
80
  header_row : int, optional
81
  Row index to use as column headers (0-based). Default is 0 (first row).
82
  Use None to have pandas generate default column names.
83
+
84
  Returns:
85
  --------
86
  tuple
87
  (pandas.DataFrame, list) - The loaded dataframe and list of column names
88
  """
89
+
90
  dataframe = pd.DataFrame([])
91
  column_names = []
92
+
93
  if file.contents():
94
  # Handle different file types
95
+ if (
96
+ file_extension in [".xlsx", ".xls"]
97
+ and sheet_selector is not None
98
+ and sheet_selector.value
99
+ ):
100
  # For Excel files - now we can safely access sheet_selector.value
101
  excel_data.seek(0) # Reset buffer position
102
  dataframe = pd.read_excel(
103
+ excel_data,
104
  sheet_name=sheet_selector.value,
105
  header=header_row,
106
+ engine="openpyxl" if file_extension == ".xlsx" else "xlrd",
107
  )
108
  column_names = list(dataframe.columns)
109
+ elif file_extension == ".csv":
110
  # For CSV files
111
+ csv_data = io.StringIO(file.contents().decode("utf-8"))
112
  dataframe = pd.read_csv(csv_data, header=header_row)
113
  column_names = list(dataframe.columns)
114
+ elif file_extension == ".json":
115
  # For JSON files
116
  try:
117
+ json_data = json.loads(file.contents().decode("utf-8"))
118
  # Handle different JSON structures
119
  if isinstance(json_data, list):
120
  dataframe = pd.DataFrame(json_data)
 
129
  column_names = list(dataframe.columns)
130
  except Exception as e:
131
  print(f"Error parsing JSON: {e}")
132
+
133
  return dataframe, column_names
134
 
135
 
136
+ def create_parameter_table(
137
+ input_list,
138
+ column_name="Active Options",
139
+ label="Select the Parameters to set to Active",
140
+ selection_type="multi-cell",
141
+ text_justify="center",
142
+ ):
143
  """
144
  Creates a marimo table for parameter selection.
145
 
 
166
  # Create the table
167
  parameter_table = mo.ui.table(
168
  label=f"**{label}**",
169
+ data={column_name: input_list},
170
  selection=selection_type,
171
+ text_justify_columns={column_name: text_justify},
172
  )
173
 
174
  return parameter_table
175
 
176
+
177
  def get_cell_values(parameter_options):
178
  """
179
  Extract active parameter values from a mo.ui.table.
 
188
  all_params = set()
189
 
190
  # Use the data property to get all options from the table
191
+ if hasattr(parameter_options, "data"):
192
  table_data = parameter_options.data
193
 
194
  # Handle DataFrame-like structure
195
+ if hasattr(table_data, "shape") and hasattr(table_data, "iloc"):
196
  for i in range(table_data.shape[0]):
197
  # Get value from first column
198
  if table_data.shape[1] > 0:
 
213
  result = {param: False for param in all_params}
214
 
215
  # Get the selected cells
216
+ if hasattr(parameter_options, "value") and parameter_options.value is not None:
217
  selected_cells = parameter_options.value
218
 
219
  # Process selected cells
220
  for cell in selected_cells:
221
+ if hasattr(cell, "value") and cell.value in result:
222
  result[cell.value] = True
223
+ elif isinstance(cell, dict) and "value" in cell and cell["value"] in result:
224
+ result[cell["value"]] = True
225
  elif isinstance(cell, str) and cell in result:
226
  result[cell] = True
227
 
228
  return result
229
 
230
+
231
  def convert_table_to_json_docs(df, selected_columns=None):
232
  """
233
  Convert a pandas DataFrame or dictionary to a list of JSON documents.
 
250
  if not isinstance(key, str):
251
  return str(key).lower()
252
  # Replace spaces with underscores and convert to lowercase
253
+ key = key.lower().replace(" ", "_")
254
  # Remove special characters (keeping alphanumeric and underscores)
255
+ return re.sub(r"[^\w]", "", key)
256
 
257
  # Handle case when input is a dictionary
258
  if isinstance(df, dict):
 
284
  selected_columns = [col for col, include in selected_columns.items() if include]
285
 
286
  # If no columns are specifically selected, use all available columns
287
+ if (
288
+ not selected_columns
289
+ or not isinstance(selected_columns, list)
290
+ or len(selected_columns) == 0
291
+ ):
292
  selected_columns = list(df.columns)
293
 
294
  # Determine which columns exist in the DataFrame
 
313
  value = row[col]
314
  # Standardize the column name when adding to document
315
  std_col = standardize_key(col)
316
+ try:
317
+ is_na = pd.isna(value)
318
+ if hasattr(is_na, "__len__") and not isinstance(is_na, str):
319
+ # Handle case where pd.isna returns an array
320
+ doc[std_col] = None if is_na.all() else value
321
+ else:
322
+ # Handle scalar case
323
+ doc[std_col] = None if is_na else value
324
+ except (ValueError, TypeError) as e:
325
+ # Output the error and re-raise
326
+ print(f"Error processing column '{col}' with value {value}: {e}")
327
+ raise
328
  json_docs.append(doc)
329
 
330
  return json_docs
331
 
332
+
333
+ # def convert_table_to_json_docs(df, selected_columns=None):
334
+ # """
335
+ # Convert a pandas DataFrame or dictionary to a list of JSON documents.
336
+ # Dynamically includes columns based on user selection.
337
+ # Column names are standardized to lowercase with underscores instead of spaces
338
+ # and special characters removed.
339
+
340
+ # Args:
341
+ # df: The DataFrame or dictionary to process
342
+ # selected_columns: List of column names to include in the output documents
343
+
344
+ # Returns:
345
+ # list: A list of dictionaries, each representing a row as a JSON document
346
+ # """
347
+ # import pandas as pd
348
+ # import re
349
+
350
+ # def standardize_key(key):
351
+ # """Convert a column name to lowercase with underscores instead of spaces and no special characters"""
352
+ # if not isinstance(key, str):
353
+ # return str(key).lower()
354
+ # # Replace spaces with underscores and convert to lowercase
355
+ # key = key.lower().replace(" ", "_")
356
+ # # Remove special characters (keeping alphanumeric and underscores)
357
+ # return re.sub(r"[^\w]", "", key)
358
+
359
+ # # Handle case when input is a dictionary
360
+ # if isinstance(df, dict):
361
+ # # Filter the dictionary to include only selected columns
362
+ # if selected_columns:
363
+ # return [{standardize_key(k): df.get(k, None) for k in selected_columns}]
364
+ # else:
365
+ # # If no columns selected, return all key-value pairs with standardized keys
366
+ # return [{standardize_key(k): v for k, v in df.items()}]
367
+
368
+ # # Handle case when df is None
369
+ # if df is None:
370
+ # return []
371
+
372
+ # # Ensure df is a DataFrame
373
+ # if not isinstance(df, pd.DataFrame):
374
+ # try:
375
+ # df = pd.DataFrame(df)
376
+ # except:
377
+ # return [] # Return empty list if conversion fails
378
+
379
+ # # Now check if DataFrame is empty
380
+ # if df.empty:
381
+ # return []
382
+
383
+ # # Process selected_columns if it's a dictionary of true/false values
384
+ # if isinstance(selected_columns, dict):
385
+ # # Extract keys where value is True
386
+ # selected_columns = [col for col, include in selected_columns.items() if include]
387
+
388
+ # # If no columns are specifically selected, use all available columns
389
+ # if (
390
+ # not selected_columns
391
+ # or not isinstance(selected_columns, list)
392
+ # or len(selected_columns) == 0
393
+ # ):
394
+ # selected_columns = list(df.columns)
395
+
396
+ # # Determine which columns exist in the DataFrame
397
+ # available_columns = []
398
+ # columns_lower = {col.lower(): col for col in df.columns if isinstance(col, str)}
399
+
400
+ # for col in selected_columns:
401
+ # if col in df.columns:
402
+ # available_columns.append(col)
403
+ # elif isinstance(col, str) and col.lower() in columns_lower:
404
+ # available_columns.append(columns_lower[col.lower()])
405
+
406
+ # # If no valid columns found, return empty list
407
+ # if not available_columns:
408
+ # return []
409
+
410
+ # # Process rows
411
+ # json_docs = []
412
+ # for _, row in df.iterrows():
413
+ # doc = {}
414
+ # for col in available_columns:
415
+ # value = row[col]
416
+ # # Standardize the column name when adding to document
417
+ # std_col = standardize_key(col)
418
+ # doc[std_col] = None if pd.isna(value) else value
419
+ # json_docs.append(doc)
420
+
421
+ # return json_docs
422
+
423
+
424
  def filter_models_by_function(resources, function_type="prompt_chat"):
425
  """
426
  Filter model IDs from resources list that have a specific function type
 
442
  if "functions" in model and isinstance(model["functions"], list):
443
  # Check if any function has the matching id
444
  has_function = any(
445
+ func.get("id") == function_type
446
  for func in model["functions"]
447
  if isinstance(func, dict)
448
  )
 
453
  return filtered_model_ids
454
 
455
 
456
+ def get_model_selection_table(
457
+ client=None,
458
+ model_type="all",
459
+ filter_functionality=None,
460
+ selection_mode="single-cell",
461
+ ):
462
  """
463
  Creates and displays a table for model selection based on specified parameters.
464
+
465
  Args:
466
  client: The client object for API calls. If None, returns default models.
467
  model_type (str): Type of models to display. Options: "all", "chat", "embedding".
468
  filter_functionality (str, optional): Filter models by functionality type.
469
+ Options include: "image_chat", "text_chat", "autoai_rag",
470
  "text_generation", "multilingual", etc.
471
  selection_mode (str): Mode for selecting table entries. Options: "single", "single-cell".
472
  Defaults to "single-cell".
473
+
474
  Returns:
475
  The selected model ID from the displayed table.
476
  """
477
  # Default model list if client is None
478
+ default_models = ["mistralai/mistral-large"]
479
+
480
  if client is None:
481
  # If no client, use default models
482
  available_models = default_models
 
487
  page_size=30,
488
  )
489
  return selection
490
+
491
  # Get appropriate model specs based on model_type
492
  if model_type == "chat":
493
  model_specs = client.foundation_models.get_chat_model_specs()
 
495
  model_specs = client.foundation_models.get_embeddings_model_specs()
496
  else:
497
  model_specs = client.foundation_models.get_model_specs()
498
+
499
  # Extract resources from model specs
500
  resources = model_specs.get("resources", [])
501
+
502
  # Filter by functionality if specified
503
  if filter_functionality and resources:
504
  model_id_list = filter_models_by_function(resources, filter_functionality)
505
  else:
506
  # Create list of model IDs if no filtering
507
  model_id_list = [resource["model_id"] for resource in resources]
508
+
509
  # If no models available after filtering, use defaults
510
  if not model_id_list:
511
  model_id_list = default_models
512
+
513
  # Create and display selection table
514
  model_selector = mo.ui.table(
515
  model_id_list,
516
  selection=selection_mode,
517
  label="Select a model to use.",
518
  page_size=30,
519
+ initial_selection=[("0", "value")] if selection_mode == "single-cell" else [0],
520
  ### For single-cell it must have [("<row_nr as a string>","column_name string")] to work as initial value
521
  )
522
+
523
  return model_selector, resources, model_id_list
524
 
525
+
526
def _enforce_model_selection(model_selection, model_id_list):
    """Ensure the model-selection table always has a selected value.

    If nothing is selected (empty list or None), resets the selection to the
    first entry of *model_id_list*; otherwise leaves it untouched.

    Args:
        model_selection: marimo table UI element; its ``value`` reflects the
            current selection (assumed to be backed by the private ``_value``
            attribute — TODO confirm against the marimo version in use).
        model_id_list: Non-empty list of available model IDs; index 0 is the
            fallback selection.

    Returns:
        The current (possibly just-reset) selection value.
    """
    # If nothing is selected (empty list) or value is None
    if not model_selection.value:
        # Reset to first item. Writing the private ``_value`` attribute is a
        # workaround for the table not exposing a public setter.
        model_selection._value = model_id_list[0]
        print(model_selection.value)
    # Bug fix: the original only returned inside the empty-selection branch,
    # so callers got ``None`` whenever a selection already existed. Always
    # return the enforced selection.
    return model_selection.value
534
+
535
+
536
  def update_max_tokens_limit(model_selection, resources, model_id_list):
537
  # Default value
538
  default_max_tokens = 4096
539
 
540
  try:
541
  # Check if we have a selection and resources
542
+ if model_selection.value is None or not hasattr(model_selection, "value"):
543
  print("No model selection or selection has no value")
544
  return default_max_tokens
545
 
 
553
 
554
  # If it's an array with indices
555
  if isinstance(selected_value, list) and len(selected_value) > 0:
556
+ if isinstance(selected_value[0], int) and 0 <= selected_value[0] < len(
557
+ model_id_list
558
+ ):
559
  selected_model_id = model_id_list[selected_value[0]]
560
  else:
561
+ selected_model_id = str(
562
+ selected_value[0]
563
+ ) # Convert to string if needed
564
  else:
565
  selected_model_id = str(selected_value) # Direct value
566
 
 
570
  for model in resources:
571
  model_id = model.get("model_id")
572
  if model_id == selected_model_id:
573
+ if (
574
+ "model_limits" in model
575
+ and "max_output_tokens" in model["model_limits"]
576
+ ):
577
  return model["model_limits"]["max_output_tokens"]
578
  break
579
 
 
586
  def load_templates(
587
  folder_path: str,
588
  file_extensions: Optional[List[str]] = None,
589
+ strip_whitespace: bool = True,
590
  ) -> Dict[str, str]:
591
  """
592
  Load template files from a specified folder into a dictionary.
 
601
  """
602
  # Default extensions if none provided
603
  if file_extensions is None:
604
+ file_extensions = [".txt", ".md"]
605
 
606
  # Ensure extensions start with a dot
607
+ file_extensions = [
608
+ ext if ext.startswith(".") else f".{ext}" for ext in file_extensions
609
+ ]
610
 
611
  templates = {"empty": " "} # Default empty template
612
 
613
  # Create glob patterns for each extension
614
+ patterns = [os.path.join(folder_path, f"*{ext}") for ext in file_extensions]
615
 
616
  # Find all matching files
617
  for pattern in patterns:
 
622
  template_name = os.path.splitext(filename)[0]
623
 
624
  # Read file content
625
+ with open(file_path, "r", encoding="utf-8") as file:
626
  content = file.read()
627
 
628
  # Strip whitespace if specified
helper_functions/table_helper_functions.py CHANGED
@@ -1,4 +1,6 @@
1
- def process_with_llm(fields_to_process, prompt_template, inf_model, params, batch_size=10):
 
 
2
  """
3
  Process documents with LLM using a prompt template with dynamic field mapping.
4
  Uses template fields to extract values from pre-standardized document fields.
@@ -23,15 +25,15 @@ def process_with_llm(fields_to_process, prompt_template, inf_model, params, batc
23
  return []
24
 
25
  # Handle case where prompt_template is a dictionary (from UI components)
26
- if isinstance(prompt_template, dict) and 'value' in prompt_template:
27
- prompt_template = prompt_template['value']
28
  elif not isinstance(prompt_template, str):
29
  print(f"Invalid prompt template type: {type(prompt_template)}, expected string")
30
  return []
31
 
32
  # Extract field names from the prompt template using regex
33
  # This finds all strings between curly braces
34
- field_pattern = r'\{([^{}]+)\}'
35
  template_fields = re.findall(field_pattern, prompt_template)
36
 
37
  if not template_fields:
@@ -50,10 +52,10 @@ def process_with_llm(fields_to_process, prompt_template, inf_model, params, batc
50
  if field in doc:
51
  field_values[field] = doc[field] if doc[field] is not None else ""
52
  # If field contains periods (e.g., "data.title"), evaluate it
53
- elif '.' in field:
54
  try:
55
  # Build a safe evaluation string
56
- parts = field.split('.')
57
  value = doc
58
  for part in parts:
59
  if isinstance(value, dict) and part in value:
@@ -92,7 +94,10 @@ def process_with_llm(fields_to_process, prompt_template, inf_model, params, batc
92
  print(f"Sample formatted prompt: {formatted_prompts[0][:200]}...")
93
 
94
  # Split into batches
95
- batches = [formatted_prompts[i:i + batch_size] for i in range(0, len(formatted_prompts), batch_size)]
 
 
 
96
 
97
  results = []
98
 
@@ -105,7 +110,7 @@ def process_with_llm(fields_to_process, prompt_template, inf_model, params, batc
105
  completion_subtitle=f"Processed {len(formatted_prompts)} prompts successfully",
106
  show_rate=True,
107
  show_eta=True,
108
- remove_on_exit=True
109
  ) as progress:
110
  for i, batch in enumerate(batches):
111
  start_time = time.time()
@@ -126,17 +131,16 @@ def process_with_llm(fields_to_process, prompt_template, inf_model, params, batc
126
  end_time = time.time()
127
  inference_time = end_time - start_time
128
  print(f"Inference time for Batch {i+1}: {inference_time:.2f} seconds")
129
-
130
  # Update progress bar
131
  progress.update(increment=1)
132
-
133
  # Add 1 second delay on completion before removing
134
  time.sleep(1)
135
 
136
  return results
137
 
138
 
139
-
140
  # def process_with_llm_no_progress_bar(fields_to_process, prompt_template, inf_model, params, batch_size=10):
141
  # """
142
  # Process documents with LLM using a prompt template with dynamic field mapping.
@@ -257,36 +261,142 @@ def process_with_llm(fields_to_process, prompt_template, inf_model, params, batc
257
 
258
  # return results
259
 
260
- def append_llm_results_to_dataframe(target_dataframe, fields_to_process, llm_results, selection_table, column_name=None):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
261
  """
262
  Add LLM processing results directly to the target DataFrame using selection indices
263
-
264
  Args:
265
  target_dataframe (pandas.DataFrame): DataFrame to modify in-place
266
- fields_to_process (list): List of document dictionaries that were processed
267
  llm_results (list): Results from the process_with_llm function
268
  selection_table: Table selection containing indices of rows to update
269
  column_name (str, optional): Custom name for the new column
270
  """
271
  column_name = column_name or f"Added Column {len(list(target_dataframe))}"
272
-
273
  # Initialize the new column with empty strings if it doesn't exist
274
  if column_name not in target_dataframe.columns:
275
  target_dataframe[column_name] = ""
276
-
277
  # Safety checks
278
  if not isinstance(llm_results, list) or not llm_results:
279
  print("No LLM results to add")
280
  return
281
-
282
  # Get indices from selection table
283
  if selection_table is not None and not selection_table.empty:
284
  selected_indices = selection_table.index.tolist()
285
-
286
  # Make sure we have the right number of results for the selected rows
287
  if len(selected_indices) != len(llm_results):
288
- print(f"Warning: Number of results ({len(llm_results)}) doesn't match selected rows ({len(selected_indices)})")
289
-
 
 
290
  # Add results to the DataFrame at the selected indices
291
  for idx, result in zip(selected_indices, llm_results):
292
  try:
@@ -299,7 +409,10 @@ def append_llm_results_to_dataframe(target_dataframe, fields_to_process, llm_res
299
  else:
300
  print("No selection table provided or empty selection")
301
 
302
- def add_llm_results_to_dataframe(original_df, fields_to_process, llm_results, column_name=None):
 
 
 
303
  """
304
  Add LLM processing results to a copy of the original DataFrame
305
 
@@ -366,6 +479,7 @@ def display_answers_as_markdown(answers, mo):
366
 
367
  return markdown_elements
368
 
 
369
  def display_answers_stacked(answers, mo):
370
  """
371
  Takes a list of answers and displays them stacked vertically using mo.vstack()
@@ -390,4 +504,4 @@ def display_answers_stacked(answers, mo):
390
  elements_with_separators.append(separator)
391
 
392
  # Return a vertically stacked collection
393
- return mo.vstack(elements_with_separators, align="start", gap="2")
 
1
+ def process_with_llm(
2
+ fields_to_process, prompt_template, inf_model, params, batch_size=10
3
+ ):
4
  """
5
  Process documents with LLM using a prompt template with dynamic field mapping.
6
  Uses template fields to extract values from pre-standardized document fields.
 
25
  return []
26
 
27
  # Handle case where prompt_template is a dictionary (from UI components)
28
+ if isinstance(prompt_template, dict) and "value" in prompt_template:
29
+ prompt_template = prompt_template["value"]
30
  elif not isinstance(prompt_template, str):
31
  print(f"Invalid prompt template type: {type(prompt_template)}, expected string")
32
  return []
33
 
34
  # Extract field names from the prompt template using regex
35
  # This finds all strings between curly braces
36
+ field_pattern = r"\{([^{}]+)\}"
37
  template_fields = re.findall(field_pattern, prompt_template)
38
 
39
  if not template_fields:
 
52
  if field in doc:
53
  field_values[field] = doc[field] if doc[field] is not None else ""
54
  # If field contains periods (e.g., "data.title"), evaluate it
55
+ elif "." in field:
56
  try:
57
  # Build a safe evaluation string
58
+ parts = field.split(".")
59
  value = doc
60
  for part in parts:
61
  if isinstance(value, dict) and part in value:
 
94
  print(f"Sample formatted prompt: {formatted_prompts[0][:200]}...")
95
 
96
  # Split into batches
97
+ batches = [
98
+ formatted_prompts[i : i + batch_size]
99
+ for i in range(0, len(formatted_prompts), batch_size)
100
+ ]
101
 
102
  results = []
103
 
 
110
  completion_subtitle=f"Processed {len(formatted_prompts)} prompts successfully",
111
  show_rate=True,
112
  show_eta=True,
113
+ remove_on_exit=True,
114
  ) as progress:
115
  for i, batch in enumerate(batches):
116
  start_time = time.time()
 
131
  end_time = time.time()
132
  inference_time = end_time - start_time
133
  print(f"Inference time for Batch {i+1}: {inference_time:.2f} seconds")
134
+
135
  # Update progress bar
136
  progress.update(increment=1)
137
+
138
  # Add 1 second delay on completion before removing
139
  time.sleep(1)
140
 
141
  return results
142
 
143
 
 
144
  # def process_with_llm_no_progress_bar(fields_to_process, prompt_template, inf_model, params, batch_size=10):
145
  # """
146
  # Process documents with LLM using a prompt template with dynamic field mapping.
 
261
 
262
  # return results
263
 
264
+
265
def process_prompt_lineage(
    lineage_options=None,
    selected_model=None,
    params=None,
    fields_to_process=None,
    prompt_template=None,
):
    """
    Process prompt lineage based on configuration options.

    Args:
        lineage_options (dict, optional): Flags controlling lineage capture:
            "activate_prompt_lineage", "include_llm_parameters",
            "include_input_column_names", "switch_between_lineage_modes".
        selected_model: Model identifier recorded when LLM parameters are included.
        params: Inference parameters recorded when LLM parameters are included.
        fields_to_process (list[dict], optional): Documents whose values fill
            the template placeholders.
        prompt_template (str, optional): Template containing "{field}" placeholders.

    Returns:
        list | None: One entry per document — a string (the formatted or raw
        prompt) when only switch mode is active, or a dict containing the
        selected lineage components (params, model, variable names, template)
        when include flags are set. None when lineage is disabled.
    """
    import re

    # Guard clause: lineage disabled (or no options at all) -> nothing to do.
    if not (lineage_options and lineage_options.get("activate_prompt_lineage")):
        return None

    include_llm = lineage_options.get("include_llm_parameters")
    include_columns = lineage_options.get("include_input_column_names")
    include_switch = lineage_options.get("switch_between_lineage_modes")

    # Placeholder names found between single curly braces in the template.
    input_variable_names = []
    if prompt_template:
        input_variable_names = re.findall(r"\{([^{}]+)\}", prompt_template)

    def _resolve_field(doc, field):
        """Look up `field` in `doc`, following dotted paths; missing -> ""."""
        if field in doc:
            return doc[field] if doc[field] is not None else ""
        if "." in field:
            # Was a bare `except:` — narrowed so BaseExceptions propagate.
            try:
                value = doc
                for part in field.split("."):
                    if isinstance(value, dict) and part in value:
                        value = value[part]
                    else:
                        value = None
                        break
                return value if value is not None else ""
            except Exception:  # defensive: malformed nested structures
                return ""
        return ""

    results = []
    if fields_to_process:
        for doc in fields_to_process:
            if include_switch and prompt_template and input_variable_names:
                field_values = {
                    field: _resolve_field(doc, field)
                    for field in input_variable_names
                }
                try:
                    template_to_use = prompt_template.format(**field_values)
                except Exception as e:
                    # NOTE(review): str.format cannot address dotted keys such
                    # as "data.title", so those templates fall back unformatted.
                    print(f"Error formatting prompt template: {str(e)}")
                    template_to_use = prompt_template
            else:
                template_to_use = prompt_template

            if not include_llm and not include_columns:
                # Switch-only mode: record the (possibly formatted) prompt string.
                results.append(template_to_use if include_switch else prompt_template)
            else:
                lineage = {}
                if include_switch:
                    lineage["switch_between_lineage_modes"] = True
                    lineage["prompt_template"] = template_to_use
                elif prompt_template:
                    lineage["prompt_template"] = prompt_template
                if include_llm:
                    lineage["params"] = params
                    lineage["selected_model"] = selected_model
                if include_columns:
                    lineage["input_variable_names"] = input_variable_names
                results.append(lineage)

    return results
365
+
366
+
367
+ def append_llm_results_to_dataframe(
368
+ target_dataframe, llm_results, selection_table, column_name=None
369
+ ):
370
  """
371
  Add LLM processing results directly to the target DataFrame using selection indices
372
+
373
  Args:
374
  target_dataframe (pandas.DataFrame): DataFrame to modify in-place
 
375
  llm_results (list): Results from the process_with_llm function
376
  selection_table: Table selection containing indices of rows to update
377
  column_name (str, optional): Custom name for the new column
378
  """
379
  column_name = column_name or f"Added Column {len(list(target_dataframe))}"
380
+
381
  # Initialize the new column with empty strings if it doesn't exist
382
  if column_name not in target_dataframe.columns:
383
  target_dataframe[column_name] = ""
384
+
385
  # Safety checks
386
  if not isinstance(llm_results, list) or not llm_results:
387
  print("No LLM results to add")
388
  return
389
+
390
  # Get indices from selection table
391
  if selection_table is not None and not selection_table.empty:
392
  selected_indices = selection_table.index.tolist()
393
+
394
  # Make sure we have the right number of results for the selected rows
395
  if len(selected_indices) != len(llm_results):
396
+ print(
397
+ f"Warning: Number of results ({len(llm_results)}) doesn't match selected rows ({len(selected_indices)})"
398
+ )
399
+
400
  # Add results to the DataFrame at the selected indices
401
  for idx, result in zip(selected_indices, llm_results):
402
  try:
 
409
  else:
410
  print("No selection table provided or empty selection")
411
 
412
+
413
+ def add_llm_results_to_dataframe(
414
+ original_df, fields_to_process, llm_results, column_name=None
415
+ ):
416
  """
417
  Add LLM processing results to a copy of the original DataFrame
418
 
 
479
 
480
  return markdown_elements
481
 
482
+
483
  def display_answers_stacked(answers, mo):
484
  """
485
  Takes a list of answers and displays them stacked vertically using mo.vstack()
 
504
  elements_with_separators.append(separator)
505
 
506
  # Return a vertically stacked collection
507
+ return mo.vstack(elements_with_separators, align="start", gap="2")