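"""Exploratory test of crawl4ai's LLM-based extraction against a live page.

Hedged usage note: requires crawl4ai, nest_asyncio, and pydantic, plus a local
`config.py` exposing a `settings` object with `llm_provider`, `llm_model_name`,
and `openrouter_api_key` (names taken from the imports and usages below).
"""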
import asyncio
import nest_asyncio
import logging
import json
from pprint import pprint
from config import settings
from pydantic import BaseModel, Field

# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
logger = logging.getLogger("crawl4ai_test")


class FormattingRules(BaseModel):
    """Schema for formatting rules extraction"""
    margins: str = Field(description="Margin requirements for the manuscript")
    font: str = Field(description="Font requirements including size, type, etc.")
    line_spacing: str = Field(description="Line spacing requirements")
    citations: str = Field(description="Citation style and formatting requirements")
    sections: str = Field(description="Required sections and their structure")
    other_rules: str = Field(description="Any other formatting requirements")
    summary: str = Field(description="A brief summary of the key formatting requirements")
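
# Hedged sanity check (assumes Pydantic v2; on v1 use FormattingRules.schema()):
# uncomment to inspect the JSON schema that will be handed to the LLM extractor.
# pprint(FormattingRules.model_json_schema())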


async def test_crawl():
    """Test crawl4ai functionality"""
    from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig
    from crawl4ai.extraction_strategy import LLMExtractionStrategy

    url = "https://journal.code4lib.org/article-guidelines"

    # Configure the browser
    browser_config = BrowserConfig(verbose=True)

    # Configure the LLM extraction
    extraction_strategy = LLMExtractionStrategy(
        llm_config=LLMConfig(
            provider=f"{settings.llm_provider}/{settings.llm_model_name}",
            api_token=settings.openrouter_api_key,
        ),
        # Pydantic v2 spelling; on v1 this would be FormattingRules.schema()
        schema=FormattingRules.model_json_schema(),
        extraction_type="schema",
        instruction="""
        From the crawled content, extract all formatting rules for manuscript submissions.
        Focus on requirements for margins, font, line spacing, citations, section structure,
        and any other formatting guidelines. Provide a comprehensive extraction of all
        formatting-related information.
        """,
    )

    # Configure the crawler run
    run_config = CrawlerRunConfig(
        word_count_threshold=10,
        exclude_external_links=True,
        process_iframes=True,
        remove_overlay_elements=True,
        exclude_social_media_links=True,
        check_robots_txt=True,
        semaphore_count=3,
        extraction_strategy=extraction_strategy,
    )

    # Initialize the crawler with the browser config and run the crawl
    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
            url=url,
            config=run_config,
        )
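
    # Hedged: recent crawl4ai releases expose show_usage() on the LLM
    # extraction strategy for printing token consumption; guarded here in
    # case the installed version predates it.
    if hasattr(extraction_strategy, "show_usage"):
        extraction_strategy.show_usage()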

    # Print all attributes of the result object
    logger.info(f"Result object type: {type(result)}")
    logger.info(f"Result object dir: {dir(result)}")

    # Check for success
    logger.info(f"Success: {result.success}")

    # Check for markdown
    if hasattr(result, 'markdown'):
        logger.info(f"Has markdown: {bool(result.markdown)}")
        logger.info(f"Markdown type: {type(result.markdown)}")
        logger.info(f"Markdown preview: {str(result.markdown)[:200]}...")
    else:
        logger.info("No markdown attribute")

    # Check for extracted_data
    if hasattr(result, 'extracted_data'):
        logger.info(f"Has extracted_data: {bool(result.extracted_data)}")
        logger.info(f"Extracted data: {result.extracted_data}")
    else:
        logger.info("No extracted_data attribute")

    # Check for other potential attributes
    for attr in ['data', 'extraction', 'llm_extraction', 'content', 'text', 'extracted_content']:
        if hasattr(result, attr):
            logger.info(f"Has {attr}: {bool(getattr(result, attr))}")
            logger.info(f"{attr} preview: {str(getattr(result, attr))[:200]}...")

    # Try to access _results directly
    if hasattr(result, '_results'):
        logger.info(f"Has _results: {bool(result._results)}")
        if result._results:
            first_result = result._results[0]
            logger.info(f"First result type: {type(first_result)}")
            logger.info(f"First result dir: {dir(first_result)}")

            # Check if the first result has extracted_data
            if hasattr(first_result, 'extracted_data'):
                logger.info(f"First result has extracted_data: {bool(first_result.extracted_data)}")
                logger.info(f"First result extracted_data: {first_result.extracted_data}")

            # Check for other attributes on the first result
            for attr in ['data', 'extraction', 'llm_extraction', 'content', 'text', 'extracted_content']:
                if hasattr(first_result, attr):
                    logger.info(f"First result has {attr}: {bool(getattr(first_result, attr))}")
                    logger.info(f"First result {attr} preview: {str(getattr(first_result, attr))[:200]}...")

    return result


def main():
    """Main function"""
    # Apply nest_asyncio
    nest_asyncio.apply()

    # Create a new event loop and run the async function
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        result = loop.run_until_complete(test_crawl())
        logger.info("Test completed successfully")
    finally:
        loop.close()
if __name__ == "__main__":
main()