Zwounds committed on
Commit
d6fb454
·
verified ·
1 Parent(s): 72542d2

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. rule_extractor.py +39 -39
rule_extractor.py CHANGED
@@ -200,46 +200,46 @@ def get_rules_from_url(url: str) -> str:
200
  except Exception as fallback_e:
201
  logger.error(f"Fallback HTTP request also failed: {fallback_e}")
202
  return f"Failed to extract rules from {url}. Both browser and HTTP extraction failed. Error: {str(e)}"
203
-
204
- if result.success and result.extracted_content:
205
- # The extracted content is often a list containing a JSON string.
206
- raw_data = result.extracted_content
207
- if isinstance(raw_data, list) and len(raw_data) > 0:
208
- raw_data = raw_data[0]
209
-
210
- # Ensure we have a dictionary to work with
211
- if isinstance(raw_data, str):
212
- try:
213
- rules_data = json.loads(raw_data)
214
- # If the parsed data is a list, take the first element
215
- if isinstance(rules_data, list) and len(rules_data) > 0:
216
- rules_data = rules_data[0]
217
- except json.JSONDecodeError:
218
- logger.error(f"Failed to parse JSON from extracted content: {raw_data}")
219
- return "Failed to parse the extracted formatting rules."
220
- elif isinstance(raw_data, dict):
221
- rules_data = raw_data
222
- else:
223
- logger.warning(f"Unexpected type for extracted content: {type(raw_data)}")
224
- return "Could not process the extracted formatting rules."
225
-
226
- # Store the raw data for debugging
227
- logger.info(f"Parsed rules data: {json.dumps(rules_data, indent=2)}")
228
-
229
- # Format the rules for display
230
- formatted_rules = format_rules_for_display(rules_data)
231
- if not formatted_rules:
232
- return "Failed to format the extracted rules."
233
-
234
- logger.info(f"Formatted rules: {formatted_rules[:100]}...")
235
- return formatted_rules
236
- elif result.success and result.markdown:
237
- # Fallback to markdown if structured extraction fails
238
- logger.info(f"Extraction failed, falling back to markdown for {url}")
239
- return result.markdown
240
  else:
241
- logger.warning(f"Failed to extract rules or markdown for {url}. Crawler success: {result.success}")
242
- return "Could not extract formatting rules from the provided URL. The crawler did not return any content."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
243
 
244
  # Run the async function using the patched event loop
245
  return asyncio.run(_extract_rules_async(url))
 
200
  except Exception as fallback_e:
201
  logger.error(f"Fallback HTTP request also failed: {fallback_e}")
202
  return f"Failed to extract rules from {url}. Both browser and HTTP extraction failed. Error: {str(e)}"
203
+
204
+ if result.success and result.extracted_content:
205
+ # The extracted content is often a list containing a JSON string.
206
+ raw_data = result.extracted_content
207
+ if isinstance(raw_data, list) and len(raw_data) > 0:
208
+ raw_data = raw_data[0]
209
+
210
+ # Ensure we have a dictionary to work with
211
+ if isinstance(raw_data, str):
212
+ try:
213
+ rules_data = json.loads(raw_data)
214
+ # If the parsed data is a list, take the first element
215
+ if isinstance(rules_data, list) and len(rules_data) > 0:
216
+ rules_data = rules_data[0]
217
+ except json.JSONDecodeError:
218
+ logger.error(f"Failed to parse JSON from extracted content: {raw_data}")
219
+ return "Failed to parse the extracted formatting rules."
220
+ elif isinstance(raw_data, dict):
221
+ rules_data = raw_data
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
222
  else:
223
+ logger.warning(f"Unexpected type for extracted content: {type(raw_data)}")
224
+ return "Could not process the extracted formatting rules."
225
+
226
+ # Store the raw data for debugging
227
+ logger.info(f"Parsed rules data: {json.dumps(rules_data, indent=2)}")
228
+
229
+ # Format the rules for display
230
+ formatted_rules = format_rules_for_display(rules_data)
231
+ if not formatted_rules:
232
+ return "Failed to format the extracted rules."
233
+
234
+ logger.info(f"Formatted rules: {formatted_rules[:100]}...")
235
+ return formatted_rules
236
+ elif result.success and result.markdown:
237
+ # Fallback to markdown if structured extraction fails
238
+ logger.info(f"Extraction failed, falling back to markdown for {url}")
239
+ return result.markdown
240
+ else:
241
+ logger.warning(f"Failed to extract rules or markdown for {url}. Crawler success: {result.success}")
242
+ return "Could not extract formatting rules from the provided URL. The crawler did not return any content."
243
 
244
  # Run the async function using the patched event loop
245
  return asyncio.run(_extract_rules_async(url))