"""Exploratory test script: crawl a page with crawl4ai, run an LLM-based
schema extraction, and inspect which attributes the result object exposes."""
import asyncio
import json
import logging

import nest_asyncio
from pydantic import BaseModel, Field

from config import settings

# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
logger = logging.getLogger("crawl4ai_test")

class FormattingRules(BaseModel):
    """Schema for formatting rules extraction"""
    margins: str = Field(description="Margin requirements for the manuscript")
    font: str = Field(description="Font requirements including size, type, etc.")
    line_spacing: str = Field(description="Line spacing requirements")
    citations: str = Field(description="Citation style and formatting requirements")
    sections: str = Field(description="Required sections and their structure")
    other_rules: str = Field(description="Any other formatting requirements")
    summary: str = Field(description="A brief summary of the key formatting requirements")
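
# Hedged sketch (assumption): with crawl4ai's LLM strategy, the extracted
# payload typically lands on `result.extracted_content` as a JSON string,
# sometimes wrapped in a one-element list. Adjust to your installed version.
def parse_rules(raw: str):
    """Best-effort parse of an extractor JSON payload into FormattingRules."""
    try:
        data = json.loads(raw)
        if isinstance(data, list) and data:  # some versions wrap the dict in a list
            data = data[0]
        return FormattingRules(**data)
    except (json.JSONDecodeError, TypeError, ValueError):
        logger.warning("Could not parse extracted payload into FormattingRules")
        return None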

async def test_crawl():
    """Test crawl4ai functionality"""
    from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig
    from crawl4ai.extraction_strategy import LLMExtractionStrategy
    
    url = "https://journal.code4lib.org/article-guidelines"
    
    # Configure the browser
    browser_config = BrowserConfig(verbose=True)
    
    # Configure the LLM extraction
    extraction_strategy = LLMExtractionStrategy(
        llm_config=LLMConfig(
            provider=f"{settings.llm_provider}/{settings.llm_model_name}",
            api_token=settings.openrouter_api_key
        ),
        schema=FormattingRules.model_json_schema(),  # Pydantic v2; use .schema() on v1
        extraction_type="schema",
        instruction="""
        From the crawled content, extract all formatting rules for manuscript submissions.
        Focus on requirements for margins, font, line spacing, citations, section structure,
        and any other formatting guidelines. Provide a comprehensive extraction of all
        formatting-related information.
        """
    )
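
    # Hedged alternative (sketch): if no LLM key is available, crawl4ai's
    # selector-based JsonCssExtractionStrategy can stand in. The selectors
    # below are illustrative placeholders, not the real page structure.
    # from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
    # extraction_strategy = JsonCssExtractionStrategy(schema={
    #     "name": "guidelines",
    #     "baseSelector": "article",
    #     "fields": [{"name": "body_text", "selector": "p", "type": "text"}],
    # })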
    
    # Configure the crawler run
    run_config = CrawlerRunConfig(
        word_count_threshold=10,          # drop content blocks under 10 words
        exclude_external_links=True,      # keep only same-site links
        process_iframes=True,             # pull iframe content into the page
        remove_overlay_elements=True,     # strip modals and cookie banners
        exclude_social_media_links=True,
        check_robots_txt=True,            # honor the site's robots.txt
        semaphore_count=3,                # cap on concurrent requests
        extraction_strategy=extraction_strategy
    )

    # Initialize the crawler with the browser config and run
    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
            url=url,
            config=run_config
        )
        
        # Print all attributes of the result object
        logger.info(f"Result object type: {type(result)}")
        logger.info(f"Result object dir: {dir(result)}")
        
        # Check for success
        logger.info(f"Success: {result.success}")
        
        # Check for markdown
        if hasattr(result, 'markdown'):
            logger.info(f"Has markdown: {bool(result.markdown)}")
            logger.info(f"Markdown type: {type(result.markdown)}")
            logger.info(f"Markdown preview: {str(result.markdown)[:200]}...")
        else:
            logger.info("No markdown attribute")
        
        # Check for extracted_data
        if hasattr(result, 'extracted_data'):
            logger.info(f"Has extracted_data: {bool(result.extracted_data)}")
            logger.info(f"Extracted data: {result.extracted_data}")
        else:
            logger.info("No extracted_data attribute")
        
        # Check for other potential attributes
        for attr in ['data', 'extraction', 'llm_extraction', 'content', 'text', 'extracted_content']:
            if hasattr(result, attr):
                logger.info(f"Has {attr}: {bool(getattr(result, attr))}")
                logger.info(f"{attr} preview: {str(getattr(result, attr))[:200]}...")
        
        # Try to access _results directly
        if hasattr(result, '_results'):
            logger.info(f"Has _results: {bool(result._results)}")
            if result._results:
                first_result = result._results[0]
                logger.info(f"First result type: {type(first_result)}")
                logger.info(f"First result dir: {dir(first_result)}")
                
                # Check if first result has extracted_data
                if hasattr(first_result, 'extracted_data'):
                    logger.info(f"First result has extracted_data: {bool(first_result.extracted_data)}")
                    logger.info(f"First result extracted_data: {first_result.extracted_data}")
                
                # Check for other attributes in first result
                for attr in ['data', 'extraction', 'llm_extraction', 'content', 'text', 'extracted_content']:
                    if hasattr(first_result, attr):
                        logger.info(f"First result has {attr}: {bool(getattr(first_result, attr))}")
                        logger.info(f"First result {attr} preview: {str(getattr(first_result, attr))[:200]}...")
        
        return result

def main():
    """Main function"""
    # Apply nest_asyncio
    nest_asyncio.apply()
    
    # Create a new event loop and run the async function
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        loop.run_until_complete(test_crawl())
        logger.info("Test completed successfully")
    finally:
        loop.close()

if __name__ == "__main__":
    main()
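
# Assumed setup (not confirmed by this repo): `pip install crawl4ai nest-asyncio
# pydantic`, run `crawl4ai-setup` once to install the Playwright browsers, and
# provide a config.py whose `settings` exposes llm_provider, llm_model_name,
# and openrouter_api_key. Then run this script directly with python.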