Upload folder using huggingface_hub
rule_extractor.py  CHANGED  (+39 -39)
@@ -200,46 +200,46 @@ def get_rules_from_url(url: str) -> str:
            except Exception as fallback_e:
                logger.error(f"Fallback HTTP request also failed: {fallback_e}")
                return f"Failed to extract rules from {url}. Both browser and HTTP extraction failed. Error: {str(e)}"
-            else:
-                logger.warning(f"Unexpected type for extracted content: {type(raw_data)}")
-                return "Could not process the extracted formatting rules."
-
-            # Store the raw data for debugging
-            logger.info(f"Parsed rules data: {json.dumps(rules_data, indent=2)}")
-
-            # Format the rules for display
-            formatted_rules = format_rules_for_display(rules_data)
-            if not formatted_rules:
-                return "Failed to format the extracted rules."
-
-            logger.info(f"Formatted rules: {formatted_rules[:100]}...")
-            return formatted_rules
-        elif result.success and result.markdown:
-            # Fallback to markdown if structured extraction fails
-            logger.info(f"Extraction failed, falling back to markdown for {url}")
-            return result.markdown
-        else:
-            logger.warning(f"
-            return "Could not
+
+        if result.success and result.extracted_content:
+            # The extracted content is often a list containing a JSON string.
+            raw_data = result.extracted_content
+            if isinstance(raw_data, list) and len(raw_data) > 0:
+                raw_data = raw_data[0]
+
+            # Ensure we have a dictionary to work with
+            if isinstance(raw_data, str):
+                try:
+                    rules_data = json.loads(raw_data)
+                    # If the parsed data is a list, take the first element
+                    if isinstance(rules_data, list) and len(rules_data) > 0:
+                        rules_data = rules_data[0]
+                except json.JSONDecodeError:
+                    logger.error(f"Failed to parse JSON from extracted content: {raw_data}")
+                    return "Failed to parse the extracted formatting rules."
+            elif isinstance(raw_data, dict):
+                rules_data = raw_data
+            else:
+                logger.warning(f"Unexpected type for extracted content: {type(raw_data)}")
+                return "Could not process the extracted formatting rules."
+
+            # Store the raw data for debugging
+            logger.info(f"Parsed rules data: {json.dumps(rules_data, indent=2)}")
+
+            # Format the rules for display
+            formatted_rules = format_rules_for_display(rules_data)
+            if not formatted_rules:
+                return "Failed to format the extracted rules."
+
+            logger.info(f"Formatted rules: {formatted_rules[:100]}...")
+            return formatted_rules
+        elif result.success and result.markdown:
+            # Fallback to markdown if structured extraction fails
+            logger.info(f"Extraction failed, falling back to markdown for {url}")
+            return result.markdown
+        else:
+            logger.warning(f"Failed to extract rules or markdown for {url}. Crawler success: {result.success}")
+            return "Could not extract formatting rules from the provided URL. The crawler did not return any content."

    # Run the async function using the patched event loop
    return asyncio.run(_extract_rules_async(url))
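
The added block normalizes three shapes of result.extracted_content (a list wrapping a JSON string, a bare JSON string, or an already-parsed dict) before formatting. A minimal, crawler-free sketch of the same shape-handling follows; normalize_extracted and the sample payloads are illustrative, not part of the Space's code:

import json

def normalize_extracted(raw):
    # Unwrap a single-element list such as ['{"rules": [...]}'].
    if isinstance(raw, list) and len(raw) > 0:
        raw = raw[0]
    # A JSON string is parsed; a top-level JSON array is unwrapped again.
    if isinstance(raw, str):
        try:
            data = json.loads(raw)
        except json.JSONDecodeError:
            return None  # the Space's code logs and returns an error string here
        if isinstance(data, list) and len(data) > 0:
            data = data[0]
        return data
    if isinstance(raw, dict):
        return raw
    return None  # unexpected type; the Space's code logs a warning here

# Hypothetical payloads covering the three accepted shapes:
assert normalize_extracted(['{"rules": ["use bullets"]}']) == {"rules": ["use bullets"]}
assert normalize_extracted('[{"rules": []}]') == {"rules": []}
assert normalize_extracted({"rules": []}) == {"rules": []}
assert normalize_extracted("not json") is None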
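Lines 244-245 keep get_rules_from_url synchronous by driving the async extraction with asyncio.run. The "patched event loop" the comment mentions is outside this hunk; a common choice in Gradio- or notebook-hosted Spaces is nest_asyncio, sketched below under that assumption (_demo_async and get_rules are placeholder names):

import asyncio

import nest_asyncio  # assumption: the patch the comment refers to; not shown in this hunk

nest_asyncio.apply()  # lets asyncio.run() work even when a loop is already running (e.g. inside Gradio)

async def _demo_async(url: str) -> str:
    await asyncio.sleep(0)  # stand-in for the real browser crawl
    return f"rules for {url}"

def get_rules(url: str) -> str:
    # Same sync-to-async bridge as lines 244-245 above.
    return asyncio.run(_demo_async(url))

print(get_rules("https://example.com/style-guide"))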