import pytest
import requests_mock
from bs4 import BeautifulSoup

from ankigen_core.crawler import WebCrawler

BASE_URL = "http://example.com"
SUB_PAGE_URL = f"{BASE_URL}/subpage"
EXTERNAL_URL = "http://anotherdomain.com"


@pytest.fixture
def crawler_fixture():
    return WebCrawler(start_url=BASE_URL, max_depth=1)


@pytest.fixture
def crawler_with_patterns_fixture():
    return WebCrawler(
        start_url=BASE_URL,
        max_depth=1,
        include_patterns=[r"http://example\.com/docs/.*"],
        exclude_patterns=[r"http://example\.com/docs/v1/.*"],
    )
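

# These tests exercise roughly the following interface (a summary inferred from
# the assertions below; the actual definitions live in ankigen_core.crawler):
#   WebCrawler(start_url, max_depth, include_patterns=None, exclude_patterns=None)
#       ._is_valid_url(url) -> bool
#       ._extract_links(soup, base_url) -> list of absolute URLs
#       ._extract_text(soup) -> str
#       .crawl(progress_callback=None) -> list of crawled pages
#       .visited_urls -- URLs that were attempted (including failed requests)
#   Each crawled page exposes url, title, text_content, meta_description,
#   meta_keywords, crawl_depth, and parent_url.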


# --- Tests for _is_valid_url ---


def test_is_valid_url_valid(crawler_fixture):
    assert crawler_fixture._is_valid_url(f"{BASE_URL}/page1")
    assert crawler_fixture._is_valid_url(f"{BASE_URL}/another/page")


def test_is_valid_url_different_domain(crawler_fixture):
    assert not crawler_fixture._is_valid_url("http://otherdomain.com/page")


def test_is_valid_url_different_scheme(crawler_fixture):
    assert not crawler_fixture._is_valid_url("ftp://example.com/page")
    assert not crawler_fixture._is_valid_url(
        "mailto:[email protected]"
    )  # Schemes like mailto will be filtered by _extract_links first


def test_is_valid_url_malformed(crawler_fixture):
    assert not crawler_fixture._is_valid_url(
        "htp://example.com/page"
    )  # urlparse might handle this, but scheme check will fail
    assert not crawler_fixture._is_valid_url(
        "http:///page"
    )  # Malformed, netloc might be empty


def test_is_valid_url_include_patterns_match(crawler_with_patterns_fixture):
    assert crawler_with_patterns_fixture._is_valid_url(f"{BASE_URL}/docs/page1")
    assert crawler_with_patterns_fixture._is_valid_url(
        f"{BASE_URL}/docs/topic/subtopic"
    )


def test_is_valid_url_include_patterns_no_match(crawler_with_patterns_fixture):
    assert not crawler_with_patterns_fixture._is_valid_url(f"{BASE_URL}/blog/page1")


def test_is_valid_url_exclude_patterns_match(crawler_with_patterns_fixture):
    # This URL matches include, but also exclude, so it should be invalid
    assert not crawler_with_patterns_fixture._is_valid_url(f"{BASE_URL}/docs/v1/page1")


def test_is_valid_url_exclude_patterns_no_match(crawler_with_patterns_fixture):
    # This URL matches include and does not match exclude
    assert crawler_with_patterns_fixture._is_valid_url(f"{BASE_URL}/docs/v2/page1")


def test_is_valid_url_no_patterns_defined(crawler_fixture):
    # Default crawler has no patterns, should allow any same-domain http/https URL
    assert crawler_fixture._is_valid_url(f"{BASE_URL}/any/path")


# --- Tests for _extract_links ---


@pytest.mark.parametrize(
    "html_content, base_url, expected_links",
    [
        # Basic relative and absolute links
        (
            """<a href="/page1">1</a> <a href="http://example.com/page2">2</a>""",
            BASE_URL,
            [f"{BASE_URL}/page1", f"{BASE_URL}/page2"],
        ),
        # Fragment and JS links
        (
            """<a href="#section">S</a> <a href="javascript:void(0)">JS</a> <a href="/page3">3</a>""",
            BASE_URL,
            [f"{BASE_URL}/page3"],
        ),
        # External link
        (
            """<a href="http://anotherdomain.com">Ext</a> <a href="/page4">4</a>""",
            BASE_URL,
            [f"{BASE_URL}/page4"],
        ),  # External link will be filtered by _is_valid_url
        # No href
        ("""<a>No Href</a> <a href="/page5">5</a>""", BASE_URL, [f"{BASE_URL}/page5"]),
        # Empty href
        (
            """<a href="">Empty Href</a> <a href="/page6">6</a>""",
            BASE_URL,
            [f"{BASE_URL}/page6"],
        ),
        # Relative link resolved against a base URL with a path
        # (a <base> tag itself is not tested; urljoin handles the resolution)
        (
            """<a href="sub/page7">7</a>""",
            f"{BASE_URL}/path/",
            [f"{BASE_URL}/path/sub/page7"],
        ),
    ],
)
def test_extract_links(crawler_fixture, html_content, base_url, expected_links):
    soup = BeautifulSoup(html_content, "html.parser")
    # For this test, we assume _is_valid_url allows same-domain http/https
    # We can mock _is_valid_url if we need finer control for specific link tests
    actual_links = crawler_fixture._extract_links(soup, base_url)
    assert sorted(actual_links) == sorted(expected_links)
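

# The comment above notes that _is_valid_url can be mocked for finer control over
# specific link tests. A sketch of what that could look like (it assumes
# _extract_links consults self._is_valid_url, as the filtering tests below rely on):
def test_extract_links_with_mocked_validator(crawler_fixture):
    from unittest.mock import patch

    html = """<a href="/keep">K</a> <a href="/drop">D</a>"""
    soup = BeautifulSoup(html, "html.parser")
    # Treat only URLs ending in "/keep" as valid, regardless of domain or patterns.
    with patch.object(
        crawler_fixture,
        "_is_valid_url",
        side_effect=lambda url: url.endswith("/keep"),
    ):
        links = crawler_fixture._extract_links(soup, BASE_URL)
    assert sorted(links) == [f"{BASE_URL}/keep"]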


def test_extract_links_with_filtering(crawler_with_patterns_fixture):
    html = """
        <a href="http://example.com/docs/pageA">Allowed Doc</a>
        <a href="http://example.com/docs/v1/pageB">Excluded Doc v1</a>
        <a href="http://example.com/blog/pageC">Non-Doc Page</a>
        <a href="http://example.com/docs/v2/pageD">Allowed Doc v2</a>
    """
    soup = BeautifulSoup(html, "html.parser")
    # _is_valid_url from crawler_with_patterns_fixture will be used
    expected = [f"{BASE_URL}/docs/pageA", f"{BASE_URL}/docs/v2/pageD"]
    actual_links = crawler_with_patterns_fixture._extract_links(soup, BASE_URL)
    assert sorted(actual_links) == sorted(expected)


# --- Tests for _extract_text ---
@pytest.mark.parametrize(
    "html_content, expected_text",
    [
        (
            "<html><head><title>T</title><script>alert('x');</script><style>.c{}</style></head><body><p>Hello</p><div>World</div></body></html>",
            "T Hello World",
        ),
        ("<body>Just text</body>", "Just text"),
        (
            "<body><nav>Menu</nav><main><p>Main content</p></main><footer>Foot</footer></body>",
            "Menu Main content Foot",
        ),  # Assuming no removal of nav/footer for now
    ],
)
def test_extract_text(crawler_fixture, html_content, expected_text):
    soup = BeautifulSoup(html_content, "html.parser")
    assert crawler_fixture._extract_text(soup) == expected_text
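

# The expectations above imply text extraction roughly along these lines (a sketch
# of the assumed behaviour, not the actual ankigen_core implementation): drop
# <script> and <style> tags, then join the remaining strings with single spaces.
def _reference_extract_text(soup):
    for tag in soup(["script", "style"]):
        tag.decompose()
    return " ".join(soup.stripped_strings)


def test_reference_extract_text_sketch():
    html = "<body><script>x()</script><p>Hello</p><div>World</div></body>"
    soup = BeautifulSoup(html, "html.parser")
    assert _reference_extract_text(soup) == "Hello World"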


# --- Integration Tests for crawl ---


def test_crawl_single_page_no_links(crawler_fixture):
    with requests_mock.Mocker() as m:
        m.get(
            BASE_URL,
            text="<html><head><title>Test Title</title></head><body>No links here.</body></html>",
        )

        pages = crawler_fixture.crawl()

        assert len(pages) == 1
        page = pages[0]
        assert page.url == BASE_URL
        assert page.title == "Test Title"
        assert "No links here" in page.text_content
        assert page.meta_description is None
        assert page.meta_keywords == []


def test_crawl_with_links_and_depth(crawler_fixture):
    # crawler_fixture has max_depth=1
    with requests_mock.Mocker() as m:
        m.get(
            BASE_URL,
            text=f"""<html><head><title>Main</title><meta name="description" content="Main page desc"><meta name="keywords" content="main, test"></head>
                                 <body><a href="{SUB_PAGE_URL}">Subpage</a> <a href="{EXTERNAL_URL}">External</a></body></html>""",
        )
        m.get(
            SUB_PAGE_URL,
            text="""<html><head><title>Sub</title></head><body>Subpage content. <a href="http://example.com/another_sub">Deeper</a></body></html>""",
        )  # Deeper link should not be followed
        m.get(EXTERNAL_URL, text="External content")  # Should not be crawled

        pages = crawler_fixture.crawl()

        assert len(pages) == 2  # Main page and one subpage

        main_page = next(p for p in pages if p.url == BASE_URL)
        sub_page = next(p for p in pages if p.url == SUB_PAGE_URL)

        assert main_page.title == "Main"
        assert main_page.meta_description == "Main page desc"
        assert sorted(main_page.meta_keywords) == sorted(["main", "test"])
        assert "Subpage" in main_page.text_content  # Link text

        assert sub_page.title == "Sub"
        assert "Subpage content" in sub_page.text_content
        assert sub_page.crawl_depth == 1
        assert sub_page.parent_url == BASE_URL

        # Verify deeper link from sub_page was not added to queue or crawled
        assert len(crawler_fixture.visited_urls) == 2
        # The queue itself is not directly accessible; len(pages) == 2 above
        # implies the deeper link was never followed.


def test_crawl_respects_max_depth_zero(crawler_fixture):
    crawler_fixture.max_depth = 0
    with requests_mock.Mocker() as m:
        m.get(
            BASE_URL,
            text=f"""<html><head><title>Depth Zero</title></head>
                                 <body><a href="{SUB_PAGE_URL}">Link</a></body></html>""",
        )

        pages = crawler_fixture.crawl()
        assert len(pages) == 1
        assert pages[0].url == BASE_URL
        assert pages[0].title == "Depth Zero"
        assert len(crawler_fixture.visited_urls) == 1


def test_crawl_handles_http_error(crawler_fixture):
    with requests_mock.Mocker() as m:
        m.get(
            BASE_URL,
            text=f"""<html><head><title>Main</title></head><body><a href="{SUB_PAGE_URL}">Subpage</a></body></html>""",
        )
        m.get(SUB_PAGE_URL, status_code=404, text="Not Found")

        pages = crawler_fixture.crawl()

        assert len(pages) == 1  # Only main page should be crawled successfully
        assert pages[0].url == BASE_URL
        # SUB_PAGE_URL should be in visited_urls because an attempt was made
        assert SUB_PAGE_URL in crawler_fixture.visited_urls


def test_crawl_include_exclude_patterns(crawler_with_patterns_fixture):
    # Patterns: include example.com/docs/*, exclude example.com/docs/v1/*
    # Max_depth is 1

    page_docs_allowed = f"{BASE_URL}/docs/allowed"
    page_docs_v1_excluded = f"{BASE_URL}/docs/v1/excluded"
    page_docs_v2_allowed = (
        f"{BASE_URL}/docs/v2/allowed_link"  # Will be linked from page_docs_allowed
    )
    page_blog_excluded = f"{BASE_URL}/blog/initial_link"  # This should not even be crawled from start_url due to include pattern

    crawler_with_patterns_fixture.start_url = (
        page_docs_allowed  # Change start to test include
    )

    with requests_mock.Mocker() as m:
        # This page matches include and not exclude
        m.get(
            page_docs_allowed,
            text=f"""<html><head><title>Docs Allowed</title></head>
                                        <body>
                                            <a href="{page_docs_v1_excluded}">To Excluded v1</a>
                                            <a href="{page_docs_v2_allowed}">To Allowed v2</a>
                                            <a href="{page_blog_excluded}">To Blog</a>
                                        </body></html>""",
        )
        # These should not be crawled due to patterns or domain
        m.get(page_docs_v1_excluded, text="V1 Excluded Content")
        m.get(
            page_docs_v2_allowed,
            text="<html><head><title>Docs V2 Allowed</title></head><body>V2 Content</body></html>",
        )  # Should be crawled (depth 1)
        m.get(page_blog_excluded, text="Blog Content")

        pages = crawler_with_patterns_fixture.crawl()

        assert len(pages) == 2  # page_docs_allowed and page_docs_v2_allowed

        crawled_urls = [p.url for p in pages]
        assert page_docs_allowed in crawled_urls
        assert page_docs_v2_allowed in crawled_urls

        assert page_docs_v1_excluded not in crawled_urls
        assert page_blog_excluded not in crawled_urls

        page_v2 = next(p for p in pages if p.url == page_docs_v2_allowed)
        assert page_v2.title == "Docs V2 Allowed"


def test_crawl_progress_callback(crawler_fixture):
    # Test that the progress callback is called.
    # Define a simple callback that appends to a list
    progress_log = []

    def callback(processed_count, total_urls, current_url):
        progress_log.append((processed_count, total_urls, current_url))

    with requests_mock.Mocker() as m:
        m.get(
            BASE_URL,
            text=f"""<html><head><title>Main</title></head>
                                 <body>
                                     <a href="{SUB_PAGE_URL}">Subpage</a>
                                     <a href="{BASE_URL}/another">Another</a>
                                 </body></html>""",
        )
        m.get(SUB_PAGE_URL, text="<html><body>Sub</body></html>")
        m.get(f"{BASE_URL}/another", text="<html><body>Another</body></html>")

        crawler_fixture.crawl(progress_callback=callback)

        # Expected call pattern with the current implementation: one initial call
        # from crawl() for the start URL, then two calls per URL processed in
        # _crawl_recursive (before and after processing). With three URLs (start
        # page, subpage, and "another") that is 1 + 2 * 3 = 7 calls. A final
        # "Crawl Complete" call, if one is made, is not counted here.
        assert len(progress_log) == 7

        # The first callback invocation comes from crawl() itself, before any URL
        # has been processed, so processed_count is 0 and the URL is the start URL.
        assert progress_log[0][0] == 0
        assert progress_log[0][2] == BASE_URL

        # Asserting processed_count for later calls would depend on the exact order
        # in which queued URLs are visited, so only the total number of calls and
        # the initial call are checked here.
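

# An equivalent check using unittest.mock.Mock as the callback. This is a sketch:
# the expected count of 3 extrapolates the "1 initial + 2 per URL" pattern asserted
# above to a single page with no links, which is an assumption about the
# implementation rather than documented behaviour.
def test_crawl_progress_callback_with_mock(crawler_fixture):
    from unittest.mock import Mock

    callback = Mock()
    with requests_mock.Mocker() as m:
        m.get(
            BASE_URL,
            text="<html><head><title>Only</title></head><body>No links.</body></html>",
        )
        crawler_fixture.crawl(progress_callback=callback)

    # Assumed: one initial call plus before/after calls for the single URL.
    assert callback.call_count == 3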