Stephen Zweibel committed on
Commit
ec0096e
·
1 Parent(s): 4435587

Update app for Hugging Face

Browse files
Files changed (1) hide show
  1. rule_extractor.py +25 -10
rule_extractor.py CHANGED
@@ -202,21 +202,36 @@ def get_rules_from_url(url: str) -> str:
202
  return f"Failed to extract rules from {url}. Both browser and HTTP extraction failed. Error: {str(e)}"
203
 
204
  if result.success and result.extracted_content:
205
- # Format the extracted data into a readable string
206
- if isinstance(result.extracted_content, list) and len(result.extracted_content) > 0:
207
- rules_data = result.extracted_content[0]
208
- elif isinstance(result.extracted_content, dict):
209
- rules_data = result.extracted_content
 
 
 
 
 
 
 
 
 
 
 
 
210
  else:
211
- # If it's a string or other type, use markdown as fallback
212
- return str(result.extracted_content) if result.extracted_content else result.markdown if result.markdown else "Could not extract formatting rules from the provided URL."
213
-
214
  # Store the raw data for debugging
215
- logger.info(f"Extracted rules data: {json.dumps(rules_data, indent=2)}")
216
 
217
  # Format the rules for display
218
  formatted_rules = format_rules_for_display(rules_data)
219
- logger.info(f"Formatted rules: {formatted_rules[:100]}...") # Log for debugging
 
 
 
220
  return formatted_rules
221
  elif result.success and result.markdown:
222
  # Fallback to markdown if structured extraction fails
 
202
  return f"Failed to extract rules from {url}. Both browser and HTTP extraction failed. Error: {str(e)}"
203
 
204
  if result.success and result.extracted_content:
205
+ # The extracted content is often a list containing a JSON string.
206
+ raw_data = result.extracted_content
207
+ if isinstance(raw_data, list) and len(raw_data) > 0:
208
+ raw_data = raw_data[0]
209
+
210
+ # Ensure we have a dictionary to work with
211
+ if isinstance(raw_data, str):
212
+ try:
213
+ rules_data = json.loads(raw_data)
214
+ # If the parsed data is a list, take the first element
215
+ if isinstance(rules_data, list) and len(rules_data) > 0:
216
+ rules_data = rules_data[0]
217
+ except json.JSONDecodeError:
218
+ logger.error(f"Failed to parse JSON from extracted content: {raw_data}")
219
+ return "Failed to parse the extracted formatting rules."
220
+ elif isinstance(raw_data, dict):
221
+ rules_data = raw_data
222
  else:
223
+ logger.warning(f"Unexpected type for extracted content: {type(raw_data)}")
224
+ return "Could not process the extracted formatting rules."
225
+
226
  # Store the raw data for debugging
227
+ logger.info(f"Parsed rules data: {json.dumps(rules_data, indent=2)}")
228
 
229
  # Format the rules for display
230
  formatted_rules = format_rules_for_display(rules_data)
231
+ if not formatted_rules:
232
+ return "Failed to format the extracted rules."
233
+
234
+ logger.info(f"Formatted rules: {formatted_rules[:100]}...")
235
  return formatted_rules
236
  elif result.success and result.markdown:
237
  # Fallback to markdown if structured extraction fails