# FormatReview / test_crawl.py
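"""Exploratory test of crawl4ai: crawl the code4lib article-guidelines page and
probe the resulting object to find where the LLM-extracted formatting rules live."""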
import asyncio
import json
import logging

import nest_asyncio
from pydantic import BaseModel, Field

from config import settings

# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
logger = logging.getLogger("crawl4ai_test")


class FormattingRules(BaseModel):
    """Schema for formatting rules extraction"""

    margins: str = Field(description="Margin requirements for the manuscript")
    font: str = Field(description="Font requirements including size, type, etc.")
    line_spacing: str = Field(description="Line spacing requirements")
    citations: str = Field(description="Citation style and formatting requirements")
    sections: str = Field(description="Required sections and their structure")
    other_rules: str = Field(description="Any other formatting requirements")
    summary: str = Field(description="A brief summary of the key formatting requirements")


async def test_crawl():
    """Test crawl4ai functionality"""
    from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig
    from crawl4ai.extraction_strategy import LLMExtractionStrategy

    url = "https://journal.code4lib.org/article-guidelines"
    # Configure the browser
    browser_config = BrowserConfig(verbose=True)
    # Configure the LLM extraction
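    # Note: `FormattingRules.schema()` below is the Pydantic v1 JSON Schema API;
    # it still works on Pydantic v2 but is deprecated there, where the
    # equivalent call is `FormattingRules.model_json_schema()`.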
    extraction_strategy = LLMExtractionStrategy(
        llm_config=LLMConfig(
            provider=f"{settings.llm_provider}/{settings.llm_model_name}",
            api_token=settings.openrouter_api_key,
        ),
        schema=FormattingRules.schema(),
        extraction_type="schema",
        instruction="""
        From the crawled content, extract all formatting rules for manuscript submissions.
        Focus on requirements for margins, font, line spacing, citations, section structure,
        and any other formatting guidelines. Provide a comprehensive extraction of all
        formatting-related information.
        """,
    )
    # Configure the crawler
    run_config = CrawlerRunConfig(
        word_count_threshold=10,
        exclude_external_links=True,
        process_iframes=True,
        remove_overlay_elements=True,
        exclude_social_media_links=True,
        check_robots_txt=True,
        semaphore_count=3,
        extraction_strategy=extraction_strategy,
    )
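    # With `check_robots_txt=True`, crawl4ai may refuse the fetch if robots.txt
    # disallows the URL, so a failed run here can be a robots block rather than
    # a code error (our reading of the option, worth verifying against the docs).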
    # Initialize the crawler and run; pass the browser config so the
    # verbose setting above actually takes effect.
    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
            url=url,
            config=run_config,
        )
        # Print all attributes of the result object
        logger.info(f"Result object type: {type(result)}")
        logger.info(f"Result object dir: {dir(result)}")
        # Check for success
        logger.info(f"Success: {result.success}")
        # Check for markdown
        if hasattr(result, 'markdown'):
            logger.info(f"Has markdown: {bool(result.markdown)}")
            logger.info(f"Markdown type: {type(result.markdown)}")
            logger.info(f"Markdown preview: {str(result.markdown)[:200]}...")
        else:
            logger.info("No markdown attribute")
        # Check for extracted_data
        if hasattr(result, 'extracted_data'):
            logger.info(f"Has extracted_data: {bool(result.extracted_data)}")
            logger.info(f"Extracted data: {result.extracted_data}")
        else:
            logger.info("No extracted_data attribute")
        # Check for other potential attributes
        for attr in ['data', 'extraction', 'llm_extraction', 'content', 'text', 'extracted_content']:
            if hasattr(result, attr):
                logger.info(f"Has {attr}: {bool(getattr(result, attr))}")
                logger.info(f"{attr} preview: {str(getattr(result, attr))[:200]}...")
        # Try to access _results directly
        if hasattr(result, '_results'):
            logger.info(f"Has _results: {bool(result._results)}")
            if result._results:
                first_result = result._results[0]
                logger.info(f"First result type: {type(first_result)}")
                logger.info(f"First result dir: {dir(first_result)}")
                # Check if first result has extracted_data
                if hasattr(first_result, 'extracted_data'):
                    logger.info(f"First result has extracted_data: {bool(first_result.extracted_data)}")
                    logger.info(f"First result extracted_data: {first_result.extracted_data}")
                # Check for other attributes in first result
                for attr in ['data', 'extraction', 'llm_extraction', 'content', 'text', 'extracted_content']:
                    if hasattr(first_result, attr):
                        logger.info(f"First result has {attr}: {bool(getattr(first_result, attr))}")
                        logger.info(f"First result {attr} preview: {str(getattr(first_result, attr))[:200]}...")
        return result


def main():
    """Main function"""
    # Apply nest_asyncio so run_until_complete works even in environments
    # that already have a running event loop (e.g. notebooks).
    nest_asyncio.apply()
    # Create a new event loop and run the async function
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        loop.run_until_complete(test_crawl())
        logger.info("Test completed successfully")
    finally:
        loop.close()


if __name__ == "__main__":
    main()