MilanM commited on
Commit
e5d0cdb
·
verified ·
1 Parent(s): ed4a452

Upload 3 files

Browse files
helper_functions/helper_functions.py CHANGED
@@ -7,14 +7,15 @@ import glob
7
  import io
8
  import os
9
 
 
10
  def get_cred_value(key, creds_var_name="baked_in_creds", default=""):
11
  """
12
  Helper function to safely get a value from a credentials dictionary.
13
-
14
  Searches for credentials in:
15
  1. Global variables with the specified variable name
16
  2. Imported modules containing the specified variable name
17
-
18
  Args:
19
  key: The key to look up in the credentials dictionary.
20
  creds_var_name: The variable name of the credentials dictionary.
@@ -28,38 +29,44 @@ def get_cred_value(key, creds_var_name="baked_in_creds", default=""):
28
  creds_dict = globals()[creds_var_name]
29
  if isinstance(creds_dict, dict) and key in creds_dict:
30
  return creds_dict[key]
31
-
32
  # Check if credentials are in an imported module
33
  import sys
 
34
  for module_name, module_obj in sys.modules.items():
35
  if hasattr(module_obj, creds_var_name):
36
  creds_dict = getattr(module_obj, creds_var_name)
37
  if isinstance(creds_dict, dict) and key in creds_dict:
38
  return creds_dict[key]
39
-
40
  return default
41
 
 
42
  def get_key_by_value(dictionary, value):
43
  for key, val in dictionary.items():
44
  if val == value:
45
  return key
46
  return None
47
 
 
48
  def markdown_spacing(number):
49
  """Convert a number to that many ' ' characters."""
50
- return ' ' * number
 
51
 
52
  def wrap_with_spaces(text_to_wrap, prefix_spaces=2, suffix_spaces=2):
53
  """Wrap text with non-breaking spaces on either side."""
54
  prefix = markdown_spacing(prefix_spaces) if prefix_spaces > 0 else ""
55
  suffix = markdown_spacing(suffix_spaces) if suffix_spaces > 0 else ""
56
  return f"{prefix}{text_to_wrap}{suffix}"
57
-
58
 
59
- def load_file_dataframe(file, file_extension, sheet_selector=None, excel_data=None, header_row=0):
 
 
 
60
  """
61
  Load a dataframe from an uploaded file with customizable header and row skipping.
62
-
63
  Parameters:
64
  -----------
65
  file : marimo.ui.file object
@@ -73,37 +80,41 @@ def load_file_dataframe(file, file_extension, sheet_selector=None, excel_data=No
73
  header_row : int, optional
74
  Row index to use as column headers (0-based). Default is 0 (first row).
75
  Use None to have pandas generate default column names.
76
-
77
  Returns:
78
  --------
79
  tuple
80
  (pandas.DataFrame, list) - The loaded dataframe and list of column names
81
  """
82
-
83
  dataframe = pd.DataFrame([])
84
  column_names = []
85
-
86
  if file.contents():
87
  # Handle different file types
88
- if file_extension in ['.xlsx', '.xls'] and sheet_selector is not None and sheet_selector.value:
 
 
 
 
89
  # For Excel files - now we can safely access sheet_selector.value
90
  excel_data.seek(0) # Reset buffer position
91
  dataframe = pd.read_excel(
92
- excel_data,
93
  sheet_name=sheet_selector.value,
94
  header=header_row,
95
- engine="openpyxl" if file_extension == '.xlsx' else "xlrd"
96
  )
97
  column_names = list(dataframe.columns)
98
- elif file_extension == '.csv':
99
  # For CSV files
100
- csv_data = io.StringIO(file.contents().decode('utf-8'))
101
  dataframe = pd.read_csv(csv_data, header=header_row)
102
  column_names = list(dataframe.columns)
103
- elif file_extension == '.json':
104
  # For JSON files
105
  try:
106
- json_data = json.loads(file.contents().decode('utf-8'))
107
  # Handle different JSON structures
108
  if isinstance(json_data, list):
109
  dataframe = pd.DataFrame(json_data)
@@ -118,12 +129,17 @@ def load_file_dataframe(file, file_extension, sheet_selector=None, excel_data=No
118
  column_names = list(dataframe.columns)
119
  except Exception as e:
120
  print(f"Error parsing JSON: {e}")
121
-
122
  return dataframe, column_names
123
 
124
 
125
- def create_parameter_table(input_list, column_name="Active Options", label="Select the Parameters to set to Active",
126
- selection_type="multi-cell", text_justify="center"):
 
 
 
 
 
127
  """
128
  Creates a marimo table for parameter selection.
129
 
@@ -150,13 +166,14 @@ def create_parameter_table(input_list, column_name="Active Options", label="Sele
150
  # Create the table
151
  parameter_table = mo.ui.table(
152
  label=f"**{label}**",
153
- data={column_name: input_list},
154
  selection=selection_type,
155
- text_justify_columns={column_name: text_justify}
156
  )
157
 
158
  return parameter_table
159
 
 
160
  def get_cell_values(parameter_options):
161
  """
162
  Extract active parameter values from a mo.ui.table.
@@ -171,11 +188,11 @@ def get_cell_values(parameter_options):
171
  all_params = set()
172
 
173
  # Use the data property to get all options from the table
174
- if hasattr(parameter_options, 'data'):
175
  table_data = parameter_options.data
176
 
177
  # Handle DataFrame-like structure
178
- if hasattr(table_data, 'shape') and hasattr(table_data, 'iloc'):
179
  for i in range(table_data.shape[0]):
180
  # Get value from first column
181
  if table_data.shape[1] > 0:
@@ -196,20 +213,21 @@ def get_cell_values(parameter_options):
196
  result = {param: False for param in all_params}
197
 
198
  # Get the selected cells
199
- if hasattr(parameter_options, 'value') and parameter_options.value is not None:
200
  selected_cells = parameter_options.value
201
 
202
  # Process selected cells
203
  for cell in selected_cells:
204
- if hasattr(cell, 'value') and cell.value in result:
205
  result[cell.value] = True
206
- elif isinstance(cell, dict) and 'value' in cell and cell['value'] in result:
207
- result[cell['value']] = True
208
  elif isinstance(cell, str) and cell in result:
209
  result[cell] = True
210
 
211
  return result
212
 
 
213
  def convert_table_to_json_docs(df, selected_columns=None):
214
  """
215
  Convert a pandas DataFrame or dictionary to a list of JSON documents.
@@ -232,9 +250,9 @@ def convert_table_to_json_docs(df, selected_columns=None):
232
  if not isinstance(key, str):
233
  return str(key).lower()
234
  # Replace spaces with underscores and convert to lowercase
235
- key = key.lower().replace(' ', '_')
236
  # Remove special characters (keeping alphanumeric and underscores)
237
- return re.sub(r'[^\w]', '', key)
238
 
239
  # Handle case when input is a dictionary
240
  if isinstance(df, dict):
@@ -266,7 +284,11 @@ def convert_table_to_json_docs(df, selected_columns=None):
266
  selected_columns = [col for col, include in selected_columns.items() if include]
267
 
268
  # If no columns are specifically selected, use all available columns
269
- if not selected_columns or not isinstance(selected_columns, list) or len(selected_columns) == 0:
 
 
 
 
270
  selected_columns = list(df.columns)
271
 
272
  # Determine which columns exist in the DataFrame
@@ -291,11 +313,114 @@ def convert_table_to_json_docs(df, selected_columns=None):
291
  value = row[col]
292
  # Standardize the column name when adding to document
293
  std_col = standardize_key(col)
294
- doc[std_col] = None if pd.isna(value) else value
 
 
 
 
 
 
 
 
 
 
 
295
  json_docs.append(doc)
296
 
297
  return json_docs
298
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
299
  def filter_models_by_function(resources, function_type="prompt_chat"):
300
  """
301
  Filter model IDs from resources list that have a specific function type
@@ -317,7 +442,7 @@ def filter_models_by_function(resources, function_type="prompt_chat"):
317
  if "functions" in model and isinstance(model["functions"], list):
318
  # Check if any function has the matching id
319
  has_function = any(
320
- func.get("id") == function_type
321
  for func in model["functions"]
322
  if isinstance(func, dict)
323
  )
@@ -328,25 +453,30 @@ def filter_models_by_function(resources, function_type="prompt_chat"):
328
  return filtered_model_ids
329
 
330
 
331
- def get_model_selection_table(client=None, model_type="all", filter_functionality=None, selection_mode="single-cell"):
 
 
 
 
 
332
  """
333
  Creates and displays a table for model selection based on specified parameters.
334
-
335
  Args:
336
  client: The client object for API calls. If None, returns default models.
337
  model_type (str): Type of models to display. Options: "all", "chat", "embedding".
338
  filter_functionality (str, optional): Filter models by functionality type.
339
- Options include: "image_chat", "text_chat", "autoai_rag",
340
  "text_generation", "multilingual", etc.
341
  selection_mode (str): Mode for selecting table entries. Options: "single", "single-cell".
342
  Defaults to "single-cell".
343
-
344
  Returns:
345
  The selected model ID from the displayed table.
346
  """
347
  # Default model list if client is None
348
- default_models = ['mistralai/mistral-large']
349
-
350
  if client is None:
351
  # If no client, use default models
352
  available_models = default_models
@@ -357,7 +487,7 @@ def get_model_selection_table(client=None, model_type="all", filter_functionalit
357
  page_size=30,
358
  )
359
  return selection
360
-
361
  # Get appropriate model specs based on model_type
362
  if model_type == "chat":
363
  model_specs = client.foundation_models.get_chat_model_specs()
@@ -365,49 +495,51 @@ def get_model_selection_table(client=None, model_type="all", filter_functionalit
365
  model_specs = client.foundation_models.get_embeddings_model_specs()
366
  else:
367
  model_specs = client.foundation_models.get_model_specs()
368
-
369
  # Extract resources from model specs
370
  resources = model_specs.get("resources", [])
371
-
372
  # Filter by functionality if specified
373
  if filter_functionality and resources:
374
  model_id_list = filter_models_by_function(resources, filter_functionality)
375
  else:
376
  # Create list of model IDs if no filtering
377
  model_id_list = [resource["model_id"] for resource in resources]
378
-
379
  # If no models available after filtering, use defaults
380
  if not model_id_list:
381
  model_id_list = default_models
382
-
383
  # Create and display selection table
384
  model_selector = mo.ui.table(
385
  model_id_list,
386
  selection=selection_mode,
387
  label="Select a model to use.",
388
  page_size=30,
389
- initial_selection = [("0", "value")] if selection_mode == "single-cell" else [0]
390
  ### For single-cell it must have [("<row_nr as a string>","column_name string")] to work as initial value
391
  )
392
-
393
  return model_selector, resources, model_id_list
394
 
 
395
  def _enforce_model_selection(model_selection, model_id_list):
396
  # If nothing is selected (empty list) or value is None
397
  if not model_selection.value:
398
  # Reset to first item
399
- model = 0
400
- model_selection._value = model_id_list[model]
401
- print(model_selection.value)
402
- return model_selection.value
403
-
 
404
  def update_max_tokens_limit(model_selection, resources, model_id_list):
405
  # Default value
406
  default_max_tokens = 4096
407
 
408
  try:
409
  # Check if we have a selection and resources
410
- if model_selection.value is None or not hasattr(model_selection, 'value'):
411
  print("No model selection or selection has no value")
412
  return default_max_tokens
413
 
@@ -421,10 +553,14 @@ def update_max_tokens_limit(model_selection, resources, model_id_list):
421
 
422
  # If it's an array with indices
423
  if isinstance(selected_value, list) and len(selected_value) > 0:
424
- if isinstance(selected_value[0], int) and 0 <= selected_value[0] < len(model_id_list):
 
 
425
  selected_model_id = model_id_list[selected_value[0]]
426
  else:
427
- selected_model_id = str(selected_value[0]) # Convert to string if needed
 
 
428
  else:
429
  selected_model_id = str(selected_value) # Direct value
430
 
@@ -434,7 +570,10 @@ def update_max_tokens_limit(model_selection, resources, model_id_list):
434
  for model in resources:
435
  model_id = model.get("model_id")
436
  if model_id == selected_model_id:
437
- if "model_limits" in model and "max_output_tokens" in model["model_limits"]:
 
 
 
438
  return model["model_limits"]["max_output_tokens"]
439
  break
440
 
@@ -447,7 +586,7 @@ def update_max_tokens_limit(model_selection, resources, model_id_list):
447
  def load_templates(
448
  folder_path: str,
449
  file_extensions: Optional[List[str]] = None,
450
- strip_whitespace: bool = True
451
  ) -> Dict[str, str]:
452
  """
453
  Load template files from a specified folder into a dictionary.
@@ -462,15 +601,17 @@ def load_templates(
462
  """
463
  # Default extensions if none provided
464
  if file_extensions is None:
465
- file_extensions = ['.txt', '.md']
466
 
467
  # Ensure extensions start with a dot
468
- file_extensions = [ext if ext.startswith('.') else f'.{ext}' for ext in file_extensions]
 
 
469
 
470
  templates = {"empty": " "} # Default empty template
471
 
472
  # Create glob patterns for each extension
473
- patterns = [os.path.join(folder_path, f'*{ext}') for ext in file_extensions]
474
 
475
  # Find all matching files
476
  for pattern in patterns:
@@ -481,7 +622,7 @@ def load_templates(
481
  template_name = os.path.splitext(filename)[0]
482
 
483
  # Read file content
484
- with open(file_path, 'r', encoding='utf-8') as file:
485
  content = file.read()
486
 
487
  # Strip whitespace if specified
 
7
  import io
8
  import os
9
 
10
+
11
  def get_cred_value(key, creds_var_name="baked_in_creds", default=""):
12
  """
13
  Helper function to safely get a value from a credentials dictionary.
14
+
15
  Searches for credentials in:
16
  1. Global variables with the specified variable name
17
  2. Imported modules containing the specified variable name
18
+
19
  Args:
20
  key: The key to look up in the credentials dictionary.
21
  creds_var_name: The variable name of the credentials dictionary.
 
29
  creds_dict = globals()[creds_var_name]
30
  if isinstance(creds_dict, dict) and key in creds_dict:
31
  return creds_dict[key]
32
+
33
  # Check if credentials are in an imported module
34
  import sys
35
+
36
  for module_name, module_obj in sys.modules.items():
37
  if hasattr(module_obj, creds_var_name):
38
  creds_dict = getattr(module_obj, creds_var_name)
39
  if isinstance(creds_dict, dict) and key in creds_dict:
40
  return creds_dict[key]
41
+
42
  return default
43
 
44
+
45
def get_key_by_value(dictionary, value):
    """Return the first key in *dictionary* mapped to *value*, or None.

    Performs a linear scan in insertion order, so with duplicate values the
    earliest-inserted matching key wins.
    """
    return next((k for k, v in dictionary.items() if v == value), None)
50
 
51
+
52
def markdown_spacing(number):
    """Convert a number to that many '&nbsp;' characters."""
    # Accumulate one HTML non-breaking-space entity per count, then join.
    # A non-positive count naturally yields the empty string.
    chunks = []
    for _ in range(number):
        chunks.append("&nbsp;")
    return "".join(chunks)
55
+
56
 
57
def wrap_with_spaces(text_to_wrap, prefix_spaces=2, suffix_spaces=2):
    """Wrap text with non-breaking spaces on either side."""

    def _pad(count):
        # Non-positive counts contribute nothing, mirroring the guard the
        # caller would otherwise need.
        if count > 0:
            return markdown_spacing(count)
        return ""

    return f"{_pad(prefix_spaces)}{text_to_wrap}{_pad(suffix_spaces)}"
 
62
 
63
+
64
+ def load_file_dataframe(
65
+ file, file_extension, sheet_selector=None, excel_data=None, header_row=0
66
+ ):
67
  """
68
  Load a dataframe from an uploaded file with customizable header and row skipping.
69
+
70
  Parameters:
71
  -----------
72
  file : marimo.ui.file object
 
80
  header_row : int, optional
81
  Row index to use as column headers (0-based). Default is 0 (first row).
82
  Use None to have pandas generate default column names.
83
+
84
  Returns:
85
  --------
86
  tuple
87
  (pandas.DataFrame, list) - The loaded dataframe and list of column names
88
  """
89
+
90
  dataframe = pd.DataFrame([])
91
  column_names = []
92
+
93
  if file.contents():
94
  # Handle different file types
95
+ if (
96
+ file_extension in [".xlsx", ".xls"]
97
+ and sheet_selector is not None
98
+ and sheet_selector.value
99
+ ):
100
  # For Excel files - now we can safely access sheet_selector.value
101
  excel_data.seek(0) # Reset buffer position
102
  dataframe = pd.read_excel(
103
+ excel_data,
104
  sheet_name=sheet_selector.value,
105
  header=header_row,
106
+ engine="openpyxl" if file_extension == ".xlsx" else "xlrd",
107
  )
108
  column_names = list(dataframe.columns)
109
+ elif file_extension == ".csv":
110
  # For CSV files
111
+ csv_data = io.StringIO(file.contents().decode("utf-8"))
112
  dataframe = pd.read_csv(csv_data, header=header_row)
113
  column_names = list(dataframe.columns)
114
+ elif file_extension == ".json":
115
  # For JSON files
116
  try:
117
+ json_data = json.loads(file.contents().decode("utf-8"))
118
  # Handle different JSON structures
119
  if isinstance(json_data, list):
120
  dataframe = pd.DataFrame(json_data)
 
129
  column_names = list(dataframe.columns)
130
  except Exception as e:
131
  print(f"Error parsing JSON: {e}")
132
+
133
  return dataframe, column_names
134
 
135
 
136
+ def create_parameter_table(
137
+ input_list,
138
+ column_name="Active Options",
139
+ label="Select the Parameters to set to Active",
140
+ selection_type="multi-cell",
141
+ text_justify="center",
142
+ ):
143
  """
144
  Creates a marimo table for parameter selection.
145
 
 
166
  # Create the table
167
  parameter_table = mo.ui.table(
168
  label=f"**{label}**",
169
+ data={column_name: input_list},
170
  selection=selection_type,
171
+ text_justify_columns={column_name: text_justify},
172
  )
173
 
174
  return parameter_table
175
 
176
+
177
  def get_cell_values(parameter_options):
178
  """
179
  Extract active parameter values from a mo.ui.table.
 
188
  all_params = set()
189
 
190
  # Use the data property to get all options from the table
191
+ if hasattr(parameter_options, "data"):
192
  table_data = parameter_options.data
193
 
194
  # Handle DataFrame-like structure
195
+ if hasattr(table_data, "shape") and hasattr(table_data, "iloc"):
196
  for i in range(table_data.shape[0]):
197
  # Get value from first column
198
  if table_data.shape[1] > 0:
 
213
  result = {param: False for param in all_params}
214
 
215
  # Get the selected cells
216
+ if hasattr(parameter_options, "value") and parameter_options.value is not None:
217
  selected_cells = parameter_options.value
218
 
219
  # Process selected cells
220
  for cell in selected_cells:
221
+ if hasattr(cell, "value") and cell.value in result:
222
  result[cell.value] = True
223
+ elif isinstance(cell, dict) and "value" in cell and cell["value"] in result:
224
+ result[cell["value"]] = True
225
  elif isinstance(cell, str) and cell in result:
226
  result[cell] = True
227
 
228
  return result
229
 
230
+
231
  def convert_table_to_json_docs(df, selected_columns=None):
232
  """
233
  Convert a pandas DataFrame or dictionary to a list of JSON documents.
 
250
  if not isinstance(key, str):
251
  return str(key).lower()
252
  # Replace spaces with underscores and convert to lowercase
253
+ key = key.lower().replace(" ", "_")
254
  # Remove special characters (keeping alphanumeric and underscores)
255
+ return re.sub(r"[^\w]", "", key)
256
 
257
  # Handle case when input is a dictionary
258
  if isinstance(df, dict):
 
284
  selected_columns = [col for col, include in selected_columns.items() if include]
285
 
286
  # If no columns are specifically selected, use all available columns
287
+ if (
288
+ not selected_columns
289
+ or not isinstance(selected_columns, list)
290
+ or len(selected_columns) == 0
291
+ ):
292
  selected_columns = list(df.columns)
293
 
294
  # Determine which columns exist in the DataFrame
 
313
  value = row[col]
314
  # Standardize the column name when adding to document
315
  std_col = standardize_key(col)
316
+ try:
317
+ is_na = pd.isna(value)
318
+ if hasattr(is_na, "__len__") and not isinstance(is_na, str):
319
+ # Handle case where pd.isna returns an array
320
+ doc[std_col] = None if is_na.all() else value
321
+ else:
322
+ # Handle scalar case
323
+ doc[std_col] = None if is_na else value
324
+ except (ValueError, TypeError) as e:
325
+ # Output the error and re-raise
326
+ print(f"Error processing column '{col}' with value {value}: {e}")
327
+ raise
328
  json_docs.append(doc)
329
 
330
  return json_docs
331
 
332
+
333
+ # def convert_table_to_json_docs(df, selected_columns=None):
334
+ # """
335
+ # Convert a pandas DataFrame or dictionary to a list of JSON documents.
336
+ # Dynamically includes columns based on user selection.
337
+ # Column names are standardized to lowercase with underscores instead of spaces
338
+ # and special characters removed.
339
+
340
+ # Args:
341
+ # df: The DataFrame or dictionary to process
342
+ # selected_columns: List of column names to include in the output documents
343
+
344
+ # Returns:
345
+ # list: A list of dictionaries, each representing a row as a JSON document
346
+ # """
347
+ # import pandas as pd
348
+ # import re
349
+
350
+ # def standardize_key(key):
351
+ # """Convert a column name to lowercase with underscores instead of spaces and no special characters"""
352
+ # if not isinstance(key, str):
353
+ # return str(key).lower()
354
+ # # Replace spaces with underscores and convert to lowercase
355
+ # key = key.lower().replace(" ", "_")
356
+ # # Remove special characters (keeping alphanumeric and underscores)
357
+ # return re.sub(r"[^\w]", "", key)
358
+
359
+ # # Handle case when input is a dictionary
360
+ # if isinstance(df, dict):
361
+ # # Filter the dictionary to include only selected columns
362
+ # if selected_columns:
363
+ # return [{standardize_key(k): df.get(k, None) for k in selected_columns}]
364
+ # else:
365
+ # # If no columns selected, return all key-value pairs with standardized keys
366
+ # return [{standardize_key(k): v for k, v in df.items()}]
367
+
368
+ # # Handle case when df is None
369
+ # if df is None:
370
+ # return []
371
+
372
+ # # Ensure df is a DataFrame
373
+ # if not isinstance(df, pd.DataFrame):
374
+ # try:
375
+ # df = pd.DataFrame(df)
376
+ # except:
377
+ # return [] # Return empty list if conversion fails
378
+
379
+ # # Now check if DataFrame is empty
380
+ # if df.empty:
381
+ # return []
382
+
383
+ # # Process selected_columns if it's a dictionary of true/false values
384
+ # if isinstance(selected_columns, dict):
385
+ # # Extract keys where value is True
386
+ # selected_columns = [col for col, include in selected_columns.items() if include]
387
+
388
+ # # If no columns are specifically selected, use all available columns
389
+ # if (
390
+ # not selected_columns
391
+ # or not isinstance(selected_columns, list)
392
+ # or len(selected_columns) == 0
393
+ # ):
394
+ # selected_columns = list(df.columns)
395
+
396
+ # # Determine which columns exist in the DataFrame
397
+ # available_columns = []
398
+ # columns_lower = {col.lower(): col for col in df.columns if isinstance(col, str)}
399
+
400
+ # for col in selected_columns:
401
+ # if col in df.columns:
402
+ # available_columns.append(col)
403
+ # elif isinstance(col, str) and col.lower() in columns_lower:
404
+ # available_columns.append(columns_lower[col.lower()])
405
+
406
+ # # If no valid columns found, return empty list
407
+ # if not available_columns:
408
+ # return []
409
+
410
+ # # Process rows
411
+ # json_docs = []
412
+ # for _, row in df.iterrows():
413
+ # doc = {}
414
+ # for col in available_columns:
415
+ # value = row[col]
416
+ # # Standardize the column name when adding to document
417
+ # std_col = standardize_key(col)
418
+ # doc[std_col] = None if pd.isna(value) else value
419
+ # json_docs.append(doc)
420
+
421
+ # return json_docs
422
+
423
+
424
  def filter_models_by_function(resources, function_type="prompt_chat"):
425
  """
426
  Filter model IDs from resources list that have a specific function type
 
442
  if "functions" in model and isinstance(model["functions"], list):
443
  # Check if any function has the matching id
444
  has_function = any(
445
+ func.get("id") == function_type
446
  for func in model["functions"]
447
  if isinstance(func, dict)
448
  )
 
453
  return filtered_model_ids
454
 
455
 
456
+ def get_model_selection_table(
457
+ client=None,
458
+ model_type="all",
459
+ filter_functionality=None,
460
+ selection_mode="single-cell",
461
+ ):
462
  """
463
  Creates and displays a table for model selection based on specified parameters.
464
+
465
  Args:
466
  client: The client object for API calls. If None, returns default models.
467
  model_type (str): Type of models to display. Options: "all", "chat", "embedding".
468
  filter_functionality (str, optional): Filter models by functionality type.
469
+ Options include: "image_chat", "text_chat", "autoai_rag",
470
  "text_generation", "multilingual", etc.
471
  selection_mode (str): Mode for selecting table entries. Options: "single", "single-cell".
472
  Defaults to "single-cell".
473
+
474
  Returns:
475
  The selected model ID from the displayed table.
476
  """
477
  # Default model list if client is None
478
+ default_models = ["mistralai/mistral-large"]
479
+
480
  if client is None:
481
  # If no client, use default models
482
  available_models = default_models
 
487
  page_size=30,
488
  )
489
  return selection
490
+
491
  # Get appropriate model specs based on model_type
492
  if model_type == "chat":
493
  model_specs = client.foundation_models.get_chat_model_specs()
 
495
  model_specs = client.foundation_models.get_embeddings_model_specs()
496
  else:
497
  model_specs = client.foundation_models.get_model_specs()
498
+
499
  # Extract resources from model specs
500
  resources = model_specs.get("resources", [])
501
+
502
  # Filter by functionality if specified
503
  if filter_functionality and resources:
504
  model_id_list = filter_models_by_function(resources, filter_functionality)
505
  else:
506
  # Create list of model IDs if no filtering
507
  model_id_list = [resource["model_id"] for resource in resources]
508
+
509
  # If no models available after filtering, use defaults
510
  if not model_id_list:
511
  model_id_list = default_models
512
+
513
  # Create and display selection table
514
  model_selector = mo.ui.table(
515
  model_id_list,
516
  selection=selection_mode,
517
  label="Select a model to use.",
518
  page_size=30,
519
+ initial_selection=[("0", "value")] if selection_mode == "single-cell" else [0],
520
  ### For single-cell it must have [("<row_nr as a string>","column_name string")] to work as initial value
521
  )
522
+
523
  return model_selector, resources, model_id_list
524
 
525
+
526
def _enforce_model_selection(model_selection, model_id_list):
    """Ensure the model-selection table always has a selected value.

    If nothing is selected (empty list or None), resets the selection to the
    first entry of *model_id_list*; otherwise leaves it untouched.

    Args:
        model_selection: marimo table UI element; its ``value`` reflects the
            current selection (assumed to be backed by the private ``_value``
            attribute — TODO confirm against the marimo version in use).
        model_id_list: Non-empty list of available model IDs; index 0 is the
            fallback selection.

    Returns:
        The current (possibly just-reset) selection value.
    """
    # If nothing is selected (empty list) or value is None
    if not model_selection.value:
        # Reset to first item. Writing the private ``_value`` attribute is a
        # workaround for the table not exposing a public setter.
        model_selection._value = model_id_list[0]
        print(model_selection.value)
    # Bug fix: the original only returned inside the empty-selection branch,
    # so callers got ``None`` whenever a selection already existed. Always
    # return the enforced selection.
    return model_selection.value
534
+
535
+
536
  def update_max_tokens_limit(model_selection, resources, model_id_list):
537
  # Default value
538
  default_max_tokens = 4096
539
 
540
  try:
541
  # Check if we have a selection and resources
542
+ if model_selection.value is None or not hasattr(model_selection, "value"):
543
  print("No model selection or selection has no value")
544
  return default_max_tokens
545
 
 
553
 
554
  # If it's an array with indices
555
  if isinstance(selected_value, list) and len(selected_value) > 0:
556
+ if isinstance(selected_value[0], int) and 0 <= selected_value[0] < len(
557
+ model_id_list
558
+ ):
559
  selected_model_id = model_id_list[selected_value[0]]
560
  else:
561
+ selected_model_id = str(
562
+ selected_value[0]
563
+ ) # Convert to string if needed
564
  else:
565
  selected_model_id = str(selected_value) # Direct value
566
 
 
570
  for model in resources:
571
  model_id = model.get("model_id")
572
  if model_id == selected_model_id:
573
+ if (
574
+ "model_limits" in model
575
+ and "max_output_tokens" in model["model_limits"]
576
+ ):
577
  return model["model_limits"]["max_output_tokens"]
578
  break
579
 
 
586
  def load_templates(
587
  folder_path: str,
588
  file_extensions: Optional[List[str]] = None,
589
+ strip_whitespace: bool = True,
590
  ) -> Dict[str, str]:
591
  """
592
  Load template files from a specified folder into a dictionary.
 
601
  """
602
  # Default extensions if none provided
603
  if file_extensions is None:
604
+ file_extensions = [".txt", ".md"]
605
 
606
  # Ensure extensions start with a dot
607
+ file_extensions = [
608
+ ext if ext.startswith(".") else f".{ext}" for ext in file_extensions
609
+ ]
610
 
611
  templates = {"empty": " "} # Default empty template
612
 
613
  # Create glob patterns for each extension
614
+ patterns = [os.path.join(folder_path, f"*{ext}") for ext in file_extensions]
615
 
616
  # Find all matching files
617
  for pattern in patterns:
 
622
  template_name = os.path.splitext(filename)[0]
623
 
624
  # Read file content
625
+ with open(file_path, "r", encoding="utf-8") as file:
626
  content = file.read()
627
 
628
  # Strip whitespace if specified
helper_functions/table_helper_functions.py CHANGED
@@ -1,4 +1,6 @@
1
- def process_with_llm(fields_to_process, prompt_template, inf_model, params, batch_size=10):
 
 
2
  """
3
  Process documents with LLM using a prompt template with dynamic field mapping.
4
  Uses template fields to extract values from pre-standardized document fields.
@@ -23,15 +25,15 @@ def process_with_llm(fields_to_process, prompt_template, inf_model, params, batc
23
  return []
24
 
25
  # Handle case where prompt_template is a dictionary (from UI components)
26
- if isinstance(prompt_template, dict) and 'value' in prompt_template:
27
- prompt_template = prompt_template['value']
28
  elif not isinstance(prompt_template, str):
29
  print(f"Invalid prompt template type: {type(prompt_template)}, expected string")
30
  return []
31
 
32
  # Extract field names from the prompt template using regex
33
  # This finds all strings between curly braces
34
- field_pattern = r'\{([^{}]+)\}'
35
  template_fields = re.findall(field_pattern, prompt_template)
36
 
37
  if not template_fields:
@@ -50,10 +52,10 @@ def process_with_llm(fields_to_process, prompt_template, inf_model, params, batc
50
  if field in doc:
51
  field_values[field] = doc[field] if doc[field] is not None else ""
52
  # If field contains periods (e.g., "data.title"), evaluate it
53
- elif '.' in field:
54
  try:
55
  # Build a safe evaluation string
56
- parts = field.split('.')
57
  value = doc
58
  for part in parts:
59
  if isinstance(value, dict) and part in value:
@@ -92,7 +94,10 @@ def process_with_llm(fields_to_process, prompt_template, inf_model, params, batc
92
  print(f"Sample formatted prompt: {formatted_prompts[0][:200]}...")
93
 
94
  # Split into batches
95
- batches = [formatted_prompts[i:i + batch_size] for i in range(0, len(formatted_prompts), batch_size)]
 
 
 
96
 
97
  results = []
98
 
@@ -105,7 +110,7 @@ def process_with_llm(fields_to_process, prompt_template, inf_model, params, batc
105
  completion_subtitle=f"Processed {len(formatted_prompts)} prompts successfully",
106
  show_rate=True,
107
  show_eta=True,
108
- remove_on_exit=True
109
  ) as progress:
110
  for i, batch in enumerate(batches):
111
  start_time = time.time()
@@ -126,17 +131,16 @@ def process_with_llm(fields_to_process, prompt_template, inf_model, params, batc
126
  end_time = time.time()
127
  inference_time = end_time - start_time
128
  print(f"Inference time for Batch {i+1}: {inference_time:.2f} seconds")
129
-
130
  # Update progress bar
131
  progress.update(increment=1)
132
-
133
  # Add 1 second delay on completion before removing
134
  time.sleep(1)
135
 
136
  return results
137
 
138
 
139
-
140
  # def process_with_llm_no_progress_bar(fields_to_process, prompt_template, inf_model, params, batch_size=10):
141
  # """
142
  # Process documents with LLM using a prompt template with dynamic field mapping.
@@ -257,36 +261,142 @@ def process_with_llm(fields_to_process, prompt_template, inf_model, params, batc
257
 
258
  # return results
259
 
260
- def append_llm_results_to_dataframe(target_dataframe, fields_to_process, llm_results, selection_table, column_name=None):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
261
  """
262
  Add LLM processing results directly to the target DataFrame using selection indices
263
-
264
  Args:
265
  target_dataframe (pandas.DataFrame): DataFrame to modify in-place
266
- fields_to_process (list): List of document dictionaries that were processed
267
  llm_results (list): Results from the process_with_llm function
268
  selection_table: Table selection containing indices of rows to update
269
  column_name (str, optional): Custom name for the new column
270
  """
271
  column_name = column_name or f"Added Column {len(list(target_dataframe))}"
272
-
273
  # Initialize the new column with empty strings if it doesn't exist
274
  if column_name not in target_dataframe.columns:
275
  target_dataframe[column_name] = ""
276
-
277
  # Safety checks
278
  if not isinstance(llm_results, list) or not llm_results:
279
  print("No LLM results to add")
280
  return
281
-
282
  # Get indices from selection table
283
  if selection_table is not None and not selection_table.empty:
284
  selected_indices = selection_table.index.tolist()
285
-
286
  # Make sure we have the right number of results for the selected rows
287
  if len(selected_indices) != len(llm_results):
288
- print(f"Warning: Number of results ({len(llm_results)}) doesn't match selected rows ({len(selected_indices)})")
289
-
 
 
290
  # Add results to the DataFrame at the selected indices
291
  for idx, result in zip(selected_indices, llm_results):
292
  try:
@@ -299,7 +409,10 @@ def append_llm_results_to_dataframe(target_dataframe, fields_to_process, llm_res
299
  else:
300
  print("No selection table provided or empty selection")
301
 
302
- def add_llm_results_to_dataframe(original_df, fields_to_process, llm_results, column_name=None):
 
 
 
303
  """
304
  Add LLM processing results to a copy of the original DataFrame
305
 
@@ -366,6 +479,7 @@ def display_answers_as_markdown(answers, mo):
366
 
367
  return markdown_elements
368
 
 
369
  def display_answers_stacked(answers, mo):
370
  """
371
  Takes a list of answers and displays them stacked vertically using mo.vstack()
@@ -390,4 +504,4 @@ def display_answers_stacked(answers, mo):
390
  elements_with_separators.append(separator)
391
 
392
  # Return a vertically stacked collection
393
- return mo.vstack(elements_with_separators, align="start", gap="2")
 
1
+ def process_with_llm(
2
+ fields_to_process, prompt_template, inf_model, params, batch_size=10
3
+ ):
4
  """
5
  Process documents with LLM using a prompt template with dynamic field mapping.
6
  Uses template fields to extract values from pre-standardized document fields.
 
25
  return []
26
 
27
  # Handle case where prompt_template is a dictionary (from UI components)
28
+ if isinstance(prompt_template, dict) and "value" in prompt_template:
29
+ prompt_template = prompt_template["value"]
30
  elif not isinstance(prompt_template, str):
31
  print(f"Invalid prompt template type: {type(prompt_template)}, expected string")
32
  return []
33
 
34
  # Extract field names from the prompt template using regex
35
  # This finds all strings between curly braces
36
+ field_pattern = r"\{([^{}]+)\}"
37
  template_fields = re.findall(field_pattern, prompt_template)
38
 
39
  if not template_fields:
 
52
  if field in doc:
53
  field_values[field] = doc[field] if doc[field] is not None else ""
54
  # If field contains periods (e.g., "data.title"), evaluate it
55
+ elif "." in field:
56
  try:
57
  # Build a safe evaluation string
58
+ parts = field.split(".")
59
  value = doc
60
  for part in parts:
61
  if isinstance(value, dict) and part in value:
 
94
  print(f"Sample formatted prompt: {formatted_prompts[0][:200]}...")
95
 
96
  # Split into batches
97
+ batches = [
98
+ formatted_prompts[i : i + batch_size]
99
+ for i in range(0, len(formatted_prompts), batch_size)
100
+ ]
101
 
102
  results = []
103
 
 
110
  completion_subtitle=f"Processed {len(formatted_prompts)} prompts successfully",
111
  show_rate=True,
112
  show_eta=True,
113
+ remove_on_exit=True,
114
  ) as progress:
115
  for i, batch in enumerate(batches):
116
  start_time = time.time()
 
131
  end_time = time.time()
132
  inference_time = end_time - start_time
133
  print(f"Inference time for Batch {i+1}: {inference_time:.2f} seconds")
134
+
135
  # Update progress bar
136
  progress.update(increment=1)
137
+
138
  # Add 1 second delay on completion before removing
139
  time.sleep(1)
140
 
141
  return results
142
 
143
 
 
144
  # def process_with_llm_no_progress_bar(fields_to_process, prompt_template, inf_model, params, batch_size=10):
145
  # """
146
  # Process documents with LLM using a prompt template with dynamic field mapping.
 
261
 
262
  # return results
263
 
264
+
265
def process_prompt_lineage(
    lineage_options=None,
    selected_model=None,
    params=None,
    fields_to_process=None,
    prompt_template=None,
):
    """
    Process prompt lineage based on configuration options.

    Args:
        lineage_options (dict, optional): Flags controlling lineage capture:
            "activate_prompt_lineage", "include_llm_parameters",
            "include_input_column_names", "switch_between_lineage_modes".
        selected_model: Model identifier recorded when LLM parameters are included.
        params: Inference parameters recorded when LLM parameters are included.
        fields_to_process (list[dict], optional): Documents whose values fill
            the template placeholders.
        prompt_template (str, optional): Template containing "{field}" placeholders.

    Returns:
        list | None: One entry per document — a string (the formatted or raw
        prompt) when only switch mode is active, or a dict containing the
        selected lineage components (params, model, variable names, template)
        when include flags are set. None when lineage is disabled.
    """
    import re

    # Guard clause: lineage disabled (or no options at all) -> nothing to do.
    if not (lineage_options and lineage_options.get("activate_prompt_lineage")):
        return None

    include_llm = lineage_options.get("include_llm_parameters")
    include_columns = lineage_options.get("include_input_column_names")
    include_switch = lineage_options.get("switch_between_lineage_modes")

    # Placeholder names found between single curly braces in the template.
    input_variable_names = []
    if prompt_template:
        input_variable_names = re.findall(r"\{([^{}]+)\}", prompt_template)

    def _resolve_field(doc, field):
        """Look up `field` in `doc`, following dotted paths; missing -> ""."""
        if field in doc:
            return doc[field] if doc[field] is not None else ""
        if "." in field:
            # Was a bare `except:` — narrowed so BaseExceptions propagate.
            try:
                value = doc
                for part in field.split("."):
                    if isinstance(value, dict) and part in value:
                        value = value[part]
                    else:
                        value = None
                        break
                return value if value is not None else ""
            except Exception:  # defensive: malformed nested structures
                return ""
        return ""

    results = []
    if fields_to_process:
        for doc in fields_to_process:
            if include_switch and prompt_template and input_variable_names:
                field_values = {
                    field: _resolve_field(doc, field)
                    for field in input_variable_names
                }
                try:
                    template_to_use = prompt_template.format(**field_values)
                except Exception as e:
                    # NOTE(review): str.format cannot address dotted keys such
                    # as "data.title", so those templates fall back unformatted.
                    print(f"Error formatting prompt template: {str(e)}")
                    template_to_use = prompt_template
            else:
                template_to_use = prompt_template

            if not include_llm and not include_columns:
                # Switch-only mode: record the (possibly formatted) prompt string.
                results.append(template_to_use if include_switch else prompt_template)
            else:
                lineage = {}
                if include_switch:
                    lineage["switch_between_lineage_modes"] = True
                    lineage["prompt_template"] = template_to_use
                elif prompt_template:
                    lineage["prompt_template"] = prompt_template
                if include_llm:
                    lineage["params"] = params
                    lineage["selected_model"] = selected_model
                if include_columns:
                    lineage["input_variable_names"] = input_variable_names
                results.append(lineage)

    return results
365
+
366
+
367
+ def append_llm_results_to_dataframe(
368
+ target_dataframe, llm_results, selection_table, column_name=None
369
+ ):
370
  """
371
  Add LLM processing results directly to the target DataFrame using selection indices
372
+
373
  Args:
374
  target_dataframe (pandas.DataFrame): DataFrame to modify in-place
 
375
  llm_results (list): Results from the process_with_llm function
376
  selection_table: Table selection containing indices of rows to update
377
  column_name (str, optional): Custom name for the new column
378
  """
379
  column_name = column_name or f"Added Column {len(list(target_dataframe))}"
380
+
381
  # Initialize the new column with empty strings if it doesn't exist
382
  if column_name not in target_dataframe.columns:
383
  target_dataframe[column_name] = ""
384
+
385
  # Safety checks
386
  if not isinstance(llm_results, list) or not llm_results:
387
  print("No LLM results to add")
388
  return
389
+
390
  # Get indices from selection table
391
  if selection_table is not None and not selection_table.empty:
392
  selected_indices = selection_table.index.tolist()
393
+
394
  # Make sure we have the right number of results for the selected rows
395
  if len(selected_indices) != len(llm_results):
396
+ print(
397
+ f"Warning: Number of results ({len(llm_results)}) doesn't match selected rows ({len(selected_indices)})"
398
+ )
399
+
400
  # Add results to the DataFrame at the selected indices
401
  for idx, result in zip(selected_indices, llm_results):
402
  try:
 
409
  else:
410
  print("No selection table provided or empty selection")
411
 
412
+
413
+ def add_llm_results_to_dataframe(
414
+ original_df, fields_to_process, llm_results, column_name=None
415
+ ):
416
  """
417
  Add LLM processing results to a copy of the original DataFrame
418
 
 
479
 
480
  return markdown_elements
481
 
482
+
483
  def display_answers_stacked(answers, mo):
484
  """
485
  Takes a list of answers and displays them stacked vertically using mo.vstack()
 
504
  elements_with_separators.append(separator)
505
 
506
  # Return a vertically stacked collection
507
+ return mo.vstack(elements_with_separators, align="start", gap="2")