Update helper_functions/table_helper_functions.py
helper_functions/table_helper_functions.py
CHANGED
@@ -1,4 +1,3 @@
-
 def process_with_llm(fields_to_process, prompt_template, inf_model, params, batch_size=10):
     """
     Process documents with LLM using a prompt template with dynamic field mapping.
@@ -14,6 +13,7 @@ def process_with_llm(fields_to_process, prompt_template, inf_model, params, batc
     Returns:
         list: Processed results from the LLM
     """
+    import marimo as mo
     import time
     import re
 
@@ -97,28 +97,166 @@
     results = []
 
     # Process each batch
-    for i, batch in enumerate(batches):
-        start_time = time.time()
-
-        try:
-            # Use the provided inference model to generate responses
-            print(f"Sending batch {i+1} of {len(batches)} to model")
-
-            # Call the inference model with the batch of prompts and params
-            batch_results = inf_model.generate_text(prompt=batch, params=params)
-
-            results.extend(batch_results)
-
-        except Exception as e:
-            print(f"Error in batch {i+1}: {str(e)}")
-            continue
-
-        end_time = time.time()
-        inference_time = end_time - start_time
-        print(f"Inference time for Batch {i+1}: {inference_time:.2f} seconds")
+    with mo.status.progress_bar(
+        total=len(batches),
+        title="Processing Batches",
+        subtitle=f"Processing {len(formatted_prompts)} prompts in {len(batches)} batches",
+        completion_title="Processing Complete",
+        completion_subtitle=f"Processed {len(formatted_prompts)} prompts successfully",
+        show_rate=True,
+        show_eta=True,
+        remove_on_exit=True
+    ) as progress:
+        for i, batch in enumerate(batches):
+            start_time = time.time()
+
+            try:
+                # Use the provided inference model to generate responses
+                print(f"Sending batch {i+1} of {len(batches)} to model")
+
+                # Call the inference model with the batch of prompts and params
+                batch_results = inf_model.generate_text(prompt=batch, params=params)
+
+                results.extend(batch_results)
+
+            except Exception as e:
+                print(f"Error in batch {i+1}: {str(e)}")
+                continue
+
+            end_time = time.time()
+            inference_time = end_time - start_time
+            print(f"Inference time for Batch {i+1}: {inference_time:.2f} seconds")
+
+            # Update progress bar
+            progress.update(increment=1)
+
+        # Add 1 second delay on completion before removing
+        time.sleep(1)
 
     return results
 
+
+
+# def process_with_llm_no_progress_bar(fields_to_process, prompt_template, inf_model, params, batch_size=10):
+#     """
+#     Process documents with LLM using a prompt template with dynamic field mapping.
+#     Uses template fields to extract values from pre-standardized document fields.
+
+#     Args:
+#         fields_to_process (list): List of document dictionaries to process
+#         prompt_template (str): Template with {field_name} placeholders matching keys in documents
+#         inf_model: The inference model instance to use for generation
+#         params: Parameters to pass to the inference model
+#         batch_size (int): Number of documents to process per batch
+
+#     Returns:
+#         list: Processed results from the LLM
+#     """
+#     import time
+#     import re
+
+#     # Safety check for inputs
+#     if not fields_to_process or not inf_model:
+#         print("Missing required inputs")
+#         return []
+
+#     # Handle case where prompt_template is a dictionary (from UI components)
+#     if isinstance(prompt_template, dict) and 'value' in prompt_template:
+#         prompt_template = prompt_template['value']
+#     elif not isinstance(prompt_template, str):
+#         print(f"Invalid prompt template type: {type(prompt_template)}, expected string")
+#         return []
+
+#     # Extract field names from the prompt template using regex
+#     # This finds all strings between curly braces
+#     field_pattern = r'\{([^{}]+)\}'
+#     template_fields = re.findall(field_pattern, prompt_template)
+
+#     if not template_fields:
+#         print("No field placeholders found in template")
+#         return []
+
+#     # Create formatted prompts from the documents
+#     formatted_prompts = []
+#     for doc in fields_to_process:
+#         try:
+#             # Create a dictionary of field values to substitute
+#             field_values = {}
+
+#             for field in template_fields:
+#                 # Try direct match first
+#                 if field in doc:
+#                     field_values[field] = doc[field] if doc[field] is not None else ""
+#                 # If field contains periods (e.g., "data.title"), evaluate it
+#                 elif '.' in field:
+#                     try:
+#                         # Build a safe evaluation string
+#                         parts = field.split('.')
+#                         value = doc
+#                         for part in parts:
+#                             if isinstance(value, dict) and part in value:
+#                                 value = value[part]
+#                             else:
+#                                 value = None
+#                                 break
+#                         field_values[field] = value if value is not None else ""
+#                     except:
+#                         field_values[field] = ""
+#                 else:
+#                     # Default to empty string if field not found
+#                     field_values[field] = ""
+
+#             # Handle None values at the top level to ensure formatting works
+#             for key in field_values:
+#                 if field_values[key] is None:
+#                     field_values[key] = ""
+
+#             # Format the prompt with all available fields
+#             prompt = prompt_template.format(**field_values)
+#             formatted_prompts.append(prompt)
+
+#         except Exception as e:
+#             print(f"Error formatting prompt: {str(e)}")
+#             print(f"Field values: {field_values}")
+#             continue
+
+#     # Return empty list if no valid prompts
+#     if not formatted_prompts:
+#         print("No valid prompts generated")
+#         return []
+
+#     # Print a sample of the formatted prompts for debugging
+#     if formatted_prompts:
+#         print(f"Sample formatted prompt: {formatted_prompts[0][:200]}...")
+
+#     # Split into batches
+#     batches = [formatted_prompts[i:i + batch_size] for i in range(0, len(formatted_prompts), batch_size)]
+
+#     results = []
+
+#     # Process each batch
+#     for i, batch in enumerate(batches):
+#         start_time = time.time()
+
+#         try:
+#             # Use the provided inference model to generate responses
+#             print(f"Sending batch {i+1} of {len(batches)} to model")
+
+#             # Call the inference model with the batch of prompts and params
+#             batch_results = inf_model.generate_text(prompt=batch, params=params)
+
+#             results.extend(batch_results)
+
+#         except Exception as e:
+#             print(f"Error in batch {i+1}: {str(e)}")
+#             continue
+
+#         end_time = time.time()
+#         inference_time = end_time - start_time
+#         print(f"Inference time for Batch {i+1}: {inference_time:.2f} seconds")
+
+#     return results
+
 def append_llm_results_to_dataframe(target_dataframe, fields_to_process, llm_results, selection_table, column_name=None):
     """
     Add LLM processing results directly to the target DataFrame using selection indices
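
The progress reporting added here is marimo's mo.status.progress_bar context manager, advanced manually with update(increment=1). A minimal sketch of the same pattern in isolation, assuming it runs inside a marimo notebook cell (the sleep stands in for real work):

import time
import marimo as mo

# Same API the diff uses: a context manager that yields a progress handle,
# advanced explicitly once per completed unit of work.
with mo.status.progress_bar(
    total=5,
    title="Working",
    show_rate=True,
    show_eta=True,
    remove_on_exit=True,
) as bar:
    for _ in range(5):
        time.sleep(0.2)          # stand-in for one batch of work
        bar.update(increment=1)  # advance the bar by one step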
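
process_with_llm itself only requires an inference client whose generate_text(prompt=..., params=...) accepts a list of prompts and returns a list of completions, which is the exact call the batch loop makes. A hedged usage sketch with a stand-in model; EchoModel and the title/body fields are illustrative, not part of the module:

from helper_functions.table_helper_functions import process_with_llm

class EchoModel:
    # Stand-in for the real inference client; only the call shape matters here.
    def generate_text(self, prompt, params=None):
        return [f"[summary] {p[:40]}" for p in prompt]

docs = [
    {"title": "Doc A", "body": "First document text."},
    {"title": "Doc B", "body": "Second document text."},
]

# {title} and {body} placeholders are filled from matching keys in each doc.
template = "Summarize {title}: {body}"

# Two documents with batch_size=10 form a single batch; 25 documents would
# split into batches of 10, 10 and 5.
results = process_with_llm(docs, template, EchoModel(), params={}, batch_size=10)

Since the function now opens mo.status.progress_bar, it expects to run inside a marimo notebook.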
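
Downstream, append_llm_results_to_dataframe writes those results back onto the source DataFrame. Its body sits outside this diff, so the sketch below only shows the call shape implied by the signature; df, table, and the column name are hypothetical:

import pandas as pd
import marimo as mo

df = pd.DataFrame(docs)  # docs/results from the sketch above
table = mo.ui.table(df)  # hypothetical: the widget whose row selection was processed

append_llm_results_to_dataframe(
    target_dataframe=df,        # DataFrame that receives the new column
    fields_to_process=docs,     # documents that were sent to the LLM
    llm_results=results,        # list returned by process_with_llm
    selection_table=table,      # selection used to map results to row indices
    column_name="llm_summary",  # hypothetical destination column
)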