import asyncio
import nest_asyncio
import logging
import json
from pprint import pprint

from config import settings
from pydantic import BaseModel, Field

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger("crawl4ai_test")


class FormattingRules(BaseModel):
    """Schema for formatting rules extraction"""

    margins: str = Field(description="Margin requirements for the manuscript")
    font: str = Field(description="Font requirements including size, type, etc.")
    line_spacing: str = Field(description="Line spacing requirements")
    citations: str = Field(description="Citation style and formatting requirements")
    sections: str = Field(description="Required sections and their structure")
    other_rules: str = Field(description="Any other formatting requirements")
    summary: str = Field(description="A brief summary of the key formatting requirements")


async def test_crawl():
    """Test crawl4ai functionality"""
    from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig
    from crawl4ai.extraction_strategy import LLMExtractionStrategy

    url = "https://journal.code4lib.org/article-guidelines"

    # Configure the browser
    browser_config = BrowserConfig(verbose=True)

    # Configure the LLM extraction
    extraction_strategy = LLMExtractionStrategy(
        llm_config=LLMConfig(
            provider=f"{settings.llm_provider}/{settings.llm_model_name}",
            api_token=settings.openrouter_api_key,
        ),
        schema=FormattingRules.schema(),  # Pydantic v1 API; use .model_json_schema() on Pydantic v2
        extraction_type="schema",
        instruction="""
        From the crawled content, extract all formatting rules for manuscript submissions.
        Focus on requirements for margins, font, line spacing, citations, section structure,
        and any other formatting guidelines.
        Provide a comprehensive extraction of all formatting-related information.
        """,
    )

    # Configure the crawler
    run_config = CrawlerRunConfig(
        word_count_threshold=10,
        exclude_external_links=True,
        process_iframes=True,
        remove_overlay_elements=True,
        exclude_social_media_links=True,
        check_robots_txt=True,
        semaphore_count=3,
        extraction_strategy=extraction_strategy,
    )

    # Initialize the crawler (pass the browser config so verbose=True takes effect) and run
    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(url=url, config=run_config)

        # Print all attributes of the result object
        logger.info(f"Result object type: {type(result)}")
        logger.info(f"Result object dir: {dir(result)}")

        # Check for success
        logger.info(f"Success: {result.success}")

        # Check for markdown
        if hasattr(result, 'markdown'):
            logger.info(f"Has markdown: {bool(result.markdown)}")
            logger.info(f"Markdown type: {type(result.markdown)}")
            logger.info(f"Markdown preview: {str(result.markdown)[:200]}...")
        else:
            logger.info("No markdown attribute")

        # Check for extracted_data
        if hasattr(result, 'extracted_data'):
            logger.info(f"Has extracted_data: {bool(result.extracted_data)}")
            logger.info(f"Extracted data: {result.extracted_data}")
        else:
            logger.info("No extracted_data attribute")

        # Check for other potential attributes
        for attr in ['data', 'extraction', 'llm_extraction', 'content', 'text', 'extracted_content']:
            if hasattr(result, attr):
                logger.info(f"Has {attr}: {bool(getattr(result, attr))}")
                logger.info(f"{attr} preview: {str(getattr(result, attr))[:200]}...")

        # Try to access _results directly
        if hasattr(result, '_results'):
            logger.info(f"Has _results: {bool(result._results)}")
            if result._results:
                first_result = result._results[0]
                logger.info(f"First result type: {type(first_result)}")
                logger.info(f"First result dir: {dir(first_result)}")

                # Check if first result has extracted_data
                if hasattr(first_result, 'extracted_data'):
                    logger.info(f"First result has extracted_data: {bool(first_result.extracted_data)}")
                    logger.info(f"First result extracted_data: {first_result.extracted_data}")

                # Check for other attributes in first result
                for attr in ['data', 'extraction', 'llm_extraction', 'content', 'text', 'extracted_content']:
                    if hasattr(first_result, attr):
                        logger.info(f"First result has {attr}: {bool(getattr(first_result, attr))}")
                        logger.info(f"First result {attr} preview: {str(getattr(first_result, attr))[:200]}...")

    return result


def main():
    """Main function"""
    # Apply nest_asyncio so the event loop can be re-entered if one is already running
    nest_asyncio.apply()

    # Create a new event loop and run the async function
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        result = loop.run_until_complete(test_crawl())
        logger.info("Test completed successfully")
    finally:
        loop.close()


if __name__ == "__main__":
    main()