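"""Exploratory test script for crawl4ai: crawls a journal's article
guidelines page, runs LLM-based schema extraction to pull out manuscript
formatting rules, then probes the result object to find where the
extracted data actually lives."""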

import asyncio
import nest_asyncio
import logging
import json

from config import settings
from pydantic import BaseModel, Field

# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
logger = logging.getLogger("crawl4ai_test")


class FormattingRules(BaseModel):
    """Schema for formatting rules extraction"""
    margins: str = Field(description="Margin requirements for the manuscript")
    font: str = Field(description="Font requirements including size, type, etc.")
    line_spacing: str = Field(description="Line spacing requirements")
    citations: str = Field(description="Citation style and formatting requirements")
    sections: str = Field(description="Required sections and their structure")
    other_rules: str = Field(description="Any other formatting requirements")
    summary: str = Field(description="A brief summary of the key formatting requirements")
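
# NOTE: with schema-based extraction the Field descriptions above are part of
# the JSON schema handed to the LLM, so they double as per-field extraction
# hints (assumption based on how crawl4ai's LLMExtractionStrategy uses the
# schema in its prompt).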


async def test_crawl():
    """Test crawl4ai functionality"""
    from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig
    from crawl4ai.extraction_strategy import LLMExtractionStrategy

    url = "https://journal.code4lib.org/article-guidelines"

    # Configure the browser
    browser_config = BrowserConfig(verbose=True)

    # Configure the LLM extraction
    extraction_strategy = LLMExtractionStrategy(
        llm_config=LLMConfig(
            provider=f"{settings.llm_provider}/{settings.llm_model_name}",
            api_token=settings.openrouter_api_key
        ),
        # model_json_schema() is the pydantic v2 spelling; use .schema() on v1
        schema=FormattingRules.model_json_schema(),
        extraction_type="schema",
        instruction="""
        From the crawled content, extract all formatting rules for manuscript submissions.
        Focus on requirements for margins, font, line spacing, citations, section structure,
        and any other formatting guidelines. Provide a comprehensive extraction of all
        formatting-related information.
        """
    )
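
    # The provider string follows the LiteLLM "provider/model" convention that
    # crawl4ai's LLMConfig accepts (e.g. "openrouter/<model>"); the concrete
    # values come from config.settings and are assumptions of this test.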

    # Configure the crawler
    run_config = CrawlerRunConfig(
        word_count_threshold=10,
        exclude_external_links=True,
        process_iframes=True,
        remove_overlay_elements=True,
        exclude_social_media_links=True,
        check_robots_txt=True,
        semaphore_count=3,
        extraction_strategy=extraction_strategy
    )
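
    # With check_robots_txt=True, the crawl is skipped and result.success is
    # False when robots.txt disallows the URL, so a failed run here is not
    # necessarily a bug (assumption based on crawl4ai's robots.txt handling).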

    # Initialize the crawler (passing the browser config, which was previously
    # created but never used) and run
    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
            url=url,
            config=run_config
        )
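
        # Recent crawl4ai versions wrap results in a container object, which is
        # why the probing below also checks a private _results list (assumption:
        # attribute names vary across versions, hence the exploratory checks).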

        # Print all attributes of the result object
        logger.info(f"Result object type: {type(result)}")
        logger.info(f"Result object dir: {dir(result)}")

        # Check for success
        logger.info(f"Success: {result.success}")

        # Check for markdown
        if hasattr(result, 'markdown'):
            logger.info(f"Has markdown: {bool(result.markdown)}")
            logger.info(f"Markdown type: {type(result.markdown)}")
            logger.info(f"Markdown preview: {str(result.markdown)[:200]}...")
        else:
            logger.info("No markdown attribute")

        # Check for extracted_data
        if hasattr(result, 'extracted_data'):
            logger.info(f"Has extracted_data: {bool(result.extracted_data)}")
            logger.info(f"Extracted data: {result.extracted_data}")
        else:
            logger.info("No extracted_data attribute")

        # Check for other potential attributes
        for attr in ['data', 'extraction', 'llm_extraction', 'content', 'text', 'extracted_content']:
            if hasattr(result, attr):
                logger.info(f"Has {attr}: {bool(getattr(result, attr))}")
                logger.info(f"{attr} preview: {str(getattr(result, attr))[:200]}...")

        # Try to access _results directly
        if hasattr(result, '_results'):
            logger.info(f"Has _results: {bool(result._results)}")
            if result._results:
                first_result = result._results[0]
                logger.info(f"First result type: {type(first_result)}")
                logger.info(f"First result dir: {dir(first_result)}")

                # Check if first result has extracted_data
                if hasattr(first_result, 'extracted_data'):
                    logger.info(f"First result has extracted_data: {bool(first_result.extracted_data)}")
                    logger.info(f"First result extracted_data: {first_result.extracted_data}")

                # Check for other attributes in first result
                for attr in ['data', 'extraction', 'llm_extraction', 'content', 'text', 'extracted_content']:
                    if hasattr(first_result, attr):
                        logger.info(f"First result has {attr}: {bool(getattr(first_result, attr))}")
                        logger.info(f"First result {attr} preview: {str(getattr(first_result, attr))[:200]}...")

        return result


def main():
    """Main function"""
    # nest_asyncio lets this also run where an event loop already exists
    # (e.g., inside Jupyter or a Gradio app)
    nest_asyncio.apply()

    # Create a new event loop and run the async function
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        loop.run_until_complete(test_crawl())
        logger.info("Test completed successfully")
    finally:
        loop.close()


if __name__ == "__main__":
    main()