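"""Exploratory test of crawl4ai's LLM-based extraction against a live page.

Hedged usage note: requires crawl4ai, nest_asyncio, and pydantic, plus a local
`config.py` exposing a `settings` object with `llm_provider`, `llm_model_name`,
and `openrouter_api_key` (names taken from the imports and usages below).
"""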
import asyncio
import nest_asyncio
import logging
import json
from pprint import pprint
from config import settings
from pydantic import BaseModel, Field

# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
logger = logging.getLogger("crawl4ai_test")


class FormattingRules(BaseModel):
    """Schema for formatting rules extraction"""
    margins: str = Field(description="Margin requirements for the manuscript")
    font: str = Field(description="Font requirements including size, type, etc.")
    line_spacing: str = Field(description="Line spacing requirements")
    citations: str = Field(description="Citation style and formatting requirements")
    sections: str = Field(description="Required sections and their structure")
    other_rules: str = Field(description="Any other formatting requirements")
    summary: str = Field(description="A brief summary of the key formatting requirements")
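
# Hedged sanity check (assumes Pydantic v2; on v1 use FormattingRules.schema()):
# uncomment to inspect the JSON schema that will be handed to the LLM extractor.
# pprint(FormattingRules.model_json_schema())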


async def test_crawl():
    """Test crawl4ai functionality"""
    from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig
    from crawl4ai.extraction_strategy import LLMExtractionStrategy

    url = "https://journal.code4lib.org/article-guidelines"

    # Configure the browser
    browser_config = BrowserConfig(verbose=True)

    # Configure the LLM extraction
    extraction_strategy = LLMExtractionStrategy(
        llm_config=LLMConfig(
            provider=f"{settings.llm_provider}/{settings.llm_model_name}",
            api_token=settings.openrouter_api_key,
        ),
        # Pydantic v2 spelling; on v1 this would be FormattingRules.schema()
        schema=FormattingRules.model_json_schema(),
        extraction_type="schema",
        instruction="""
        From the crawled content, extract all formatting rules for manuscript submissions.
        Focus on requirements for margins, font, line spacing, citations, section structure,
        and any other formatting guidelines. Provide a comprehensive extraction of all
        formatting-related information.
        """,
    )

    # Configure the crawler run
    run_config = CrawlerRunConfig(
        word_count_threshold=10,
        exclude_external_links=True,
        process_iframes=True,
        remove_overlay_elements=True,
        exclude_social_media_links=True,
        check_robots_txt=True,
        semaphore_count=3,
        extraction_strategy=extraction_strategy,
    )

    # Initialize the crawler with the browser config and run the crawl
    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
            url=url,
            config=run_config,
        )
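
    # Hedged: recent crawl4ai releases expose show_usage() on the LLM
    # extraction strategy for printing token consumption; guarded here in
    # case the installed version predates it.
    if hasattr(extraction_strategy, "show_usage"):
        extraction_strategy.show_usage()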

    # Print all attributes of the result object
    logger.info(f"Result object type: {type(result)}")
    logger.info(f"Result object dir: {dir(result)}")

    # Check for success
    logger.info(f"Success: {result.success}")

    # Check for markdown
    if hasattr(result, 'markdown'):
        logger.info(f"Has markdown: {bool(result.markdown)}")
        logger.info(f"Markdown type: {type(result.markdown)}")
        logger.info(f"Markdown preview: {str(result.markdown)[:200]}...")
    else:
        logger.info("No markdown attribute")

    # Check for extracted_data
    if hasattr(result, 'extracted_data'):
        logger.info(f"Has extracted_data: {bool(result.extracted_data)}")
        logger.info(f"Extracted data: {result.extracted_data}")
    else:
        logger.info("No extracted_data attribute")

    # Check for other potential attributes
    for attr in ['data', 'extraction', 'llm_extraction', 'content', 'text', 'extracted_content']:
        if hasattr(result, attr):
            logger.info(f"Has {attr}: {bool(getattr(result, attr))}")
            logger.info(f"{attr} preview: {str(getattr(result, attr))[:200]}...")

    # Try to access _results directly
    if hasattr(result, '_results'):
        logger.info(f"Has _results: {bool(result._results)}")
        if result._results:
            first_result = result._results[0]
            logger.info(f"First result type: {type(first_result)}")
            logger.info(f"First result dir: {dir(first_result)}")

            # Check if the first result has extracted_data
            if hasattr(first_result, 'extracted_data'):
                logger.info(f"First result has extracted_data: {bool(first_result.extracted_data)}")
                logger.info(f"First result extracted_data: {first_result.extracted_data}")

            # Check for other attributes on the first result
            for attr in ['data', 'extraction', 'llm_extraction', 'content', 'text', 'extracted_content']:
                if hasattr(first_result, attr):
                    logger.info(f"First result has {attr}: {bool(getattr(first_result, attr))}")
                    logger.info(f"First result {attr} preview: {str(getattr(first_result, attr))[:200]}...")

    return result


def main():
    """Main function"""
    # Apply nest_asyncio
    nest_asyncio.apply()

    # Create a new event loop and run the async function
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        result = loop.run_until_complete(test_crawl())
        logger.info("Test completed successfully")
    finally:
        loop.close()
if __name__ == "__main__":
main()