Spaces:
Runtime error
Runtime error
| # html_to_markdown/dom_utils.py | |
| from bs4 import BeautifulSoup, Tag | |
| from typing import Optional | |
| import logging | |
| from conversion_options import ConversionOptions | |
| logging.basicConfig(level=logging.INFO) | |
| logger = logging.getLogger(__name__) | |
def find_main_content(soup: BeautifulSoup, options: ConversionOptions) -> Tag:
    """Locate the element that holds the document's main content.

    An explicit ``<main>`` element is returned as-is when one exists.
    Otherwise the ``<body>`` is handed to ``detect_main_content`` for
    heuristic detection. When the document has no ``<body>`` either,
    the whole ``soup`` is returned.

    Args:
        soup: Parsed document to search.
        options: Conversion options, forwarded to ``detect_main_content``.

    Returns:
        The ``<main>`` element, the detected content element, or ``soup``
        itself when there is no ``<body>``.
    """
    logger.debug("Entering find_main_content function")

    existing_main = soup.find('main')
    if not existing_main:
        logger.debug("No <main> element found. Detecting main content.")
        body = soup.body
        if body:
            return detect_main_content(body, options)
        logger.debug("No body element found, returning the entire document")
        return soup

    logger.debug("Existing <main> element found")
    return existing_main
def wrap_main_content(main_content: Tag, soup: BeautifulSoup):
    """Ensure the main content lives inside a ``<main>`` element.

    If ``main_content`` is already a ``<main>`` tag nothing is changed.
    Otherwise a new ``<main id="detected-main-content">`` is created and
    ``main_content`` is placed inside it. The tree is modified in place.

    Args:
        main_content: Element identified as the main content. This may be
            the ``BeautifulSoup`` object itself, which ``find_main_content``
            returns for documents without a ``<body>``.
        soup: Document used to create the new ``<main>`` tag.
    """
    if main_content.name.lower() == 'main':
        logger.debug("Main content already wrapped")
        return

    logger.debug("Wrapping main content in <main> element")
    main_element = soup.new_tag('main')

    if main_content.parent is None:
        # A parentless node (e.g. the document root) cannot be wrapped:
        # Tag.wrap() works by replacing the node inside its parent and
        # raises ValueError when there is none. Move the node's children
        # into the new <main> and attach <main> to the node instead.
        for child in list(main_content.contents):
            main_element.append(child.extract())
        main_content.append(main_element)
    else:
        main_content.wrap(main_element)

    main_element['id'] = 'detected-main-content'
    logger.debug("Main content wrapped successfully")
def _is_ancestor_of(ancestor: Tag, node: Tag) -> bool:
    """Return True when *ancestor* is a strict ancestor of *node* in the tree."""
    # Identity comparison on purpose: two distinct tags with the same markup
    # compare equal with ``==`` but are different nodes.
    return any(parent is ancestor for parent in node.parents)


def detect_main_content(element: Tag, options: ConversionOptions) -> Tag:
    """Pick the most likely main-content element underneath *element*.

    Every element in the subtree whose ``calculate_score`` is at least 20
    becomes a candidate. Candidates are ranked by score (highest first)
    and the best one is returned. If nothing reaches the threshold,
    *element* itself is returned.

    Args:
        element: Root of the subtree to inspect.
        options: Conversion options, forwarded to the scoring helpers.

    Returns:
        The selected candidate, or *element* when there are no candidates.
    """
    candidates = []
    min_score = 20
    logger.debug(f"Collecting candidates with minimum score: {min_score}")
    collect_candidates(element, candidates, min_score, options)
    logger.debug(f"Total candidates found: {len(candidates)}")

    if not candidates:
        logger.debug("No suitable candidates found, returning root element")
        return element

    # Score each candidate once, then sort by score descending. The sort is
    # stable, so equally scored candidates keep their collection order.
    scored = [(calculate_score(candidate, options), candidate) for candidate in candidates]
    scored.sort(key=lambda pair: pair[0], reverse=True)
    logger.debug("Candidates sorted by score")

    best_score, best_candidate = scored[0]
    for score, candidate in scored[1:]:
        # bs4 tags have no DOM-style ``contains`` method; attribute access on
        # a Tag looks up a child tag of that name instead, so the containment
        # test is done by walking the candidate's ancestors.
        is_nested = any(
            other is not candidate and _is_ancestor_of(other, candidate)
            for _, other in scored
        )
        # Candidates are visited in descending score order and the comparison
        # is strict, so a tie never displaces the current best candidate.
        if not is_nested and score > best_score:
            best_score, best_candidate = score, candidate
            logger.debug(f"New best independent candidate found: {element_to_string(best_candidate)}")

    logger.debug(f"Final main content candidate: {element_to_string(best_candidate)}")
    return best_candidate
def element_to_string(element: Optional[Tag]) -> str:
    """Build a short ``name#id.classes`` label for *element*, for log output.

    Args:
        element: Element to describe; may be ``None``.

    Returns:
        ``'No element'`` for a falsy argument. Otherwise the tag name,
        followed by ``#`` and its id (``no-id`` when absent), followed by
        ``.`` and its classes joined with dots.
    """
    if element:
        tag_id = element.get('id', 'no-id')
        class_part = '.'.join(element.get('class', []))
        return f"{element.name}#{tag_id}.{class_part}"
    return 'No element'
def collect_candidates(element: Tag, candidates: list, min_score: int, options: ConversionOptions):
    """Append every element in the subtree scoring at least *min_score*.

    The subtree rooted at *element* (including *element* itself) is walked
    in document order, parents before their children. Matching elements
    are appended to *candidates* in that order.

    Args:
        element: Root of the subtree to walk.
        candidates: Output list, extended in place.
        min_score: Minimum ``calculate_score`` value for an element to qualify.
        options: Conversion options, forwarded to ``calculate_score``.
    """
    # Explicit stack instead of recursion: one Python frame per nesting level
    # can exceed the interpreter's recursion limit on deeply nested markup.
    pending = [element]
    while pending:
        node = pending.pop()
        node_score = calculate_score(node, options)
        if node_score >= min_score:
            candidates.append(node)
            logger.debug(f"Candidate found: {element_to_string(node)}, score: {node_score}")
        # Push direct child tags in reverse so the first child is popped
        # next, which keeps the traversal in pre-order.
        pending.extend(reversed(node.find_all(recursive=False)))
def calculate_score(element: Tag, options: ConversionOptions) -> int:
    """Compute a heuristic score for how likely *element* is the main content.

    Points awarded:
        * +10 for each keyword (``article``, ``content``, ``main-container``,
          ``main``, ``main-content``) found in the ``class`` attribute, and
          +10 for each keyword found in the ``id`` attribute.
        * +5 when the tag is ``article``, ``main`` or ``section``.
        * +1 per descendant ``<p>``, capped at 5.
        * +1 per 200 characters of stripped text, capped at 5 (only when
          the text is longer than 200 characters).
        * +5 when the link density is below 0.3.
        * +10 when a ``data-main`` or ``data-content`` attribute is present.
        * +10 when the ``role`` attribute contains ``main``.

    Args:
        element: Element to score.
        options: Conversion options; when ``options.debug`` is truthy the
            scoring breakdown is written to the debug log.

    Returns:
        The accumulated score.
    """
    total = 0
    notes = []

    def award(points, message):
        # Add points and remember why, for the optional debug breakdown.
        nonlocal total
        total += points
        notes.append(message)

    # Keyword hints in class / id
    attributes = element.attrs
    for keyword in ('article', 'content', 'main-container', 'main', 'main-content'):
        if keyword in attributes.get('class', ()):
            award(10, f"High impact attribute found: {keyword}, score increased by 10")
        if keyword in attributes.get('id', ''):
            award(10, f"High impact ID found: {keyword}, score increased by 10")

    # Semantic container tags
    if element.name.lower() in ('article', 'main', 'section'):
        award(5, f"High impact tag found: {element.name}, score increased by 5")

    # Number of paragraphs, capped
    num_paragraphs = len(element.find_all('p'))
    paragraph_points = min(num_paragraphs, 5)
    if paragraph_points > 0:
        award(paragraph_points, f"Paragraph count: {num_paragraphs}, score increased by {paragraph_points}")

    # Amount of text, capped
    num_chars = len(element.get_text(strip=True))
    if num_chars > 200:
        text_points = min(num_chars // 200, 5)
        award(text_points, f"Text content length: {num_chars}, score increased by {text_points}")

    # Low proportion of link text
    density = calculate_link_density(element)
    if density < 0.3:
        award(5, f"Link density: {density:.2f}, score increased by 5")

    # Explicit data-* markers
    if any(element.has_attr(marker) for marker in ('data-main', 'data-content')):
        award(10, "Data attribute for main content found, score increased by 10")

    # ARIA role
    role = element.get('role')
    if role and 'main' in role:
        award(10, "Role attribute indicating main content found, score increased by 10")

    if options.debug and notes:
        logger.debug(f"Scoring for {element_to_string(element)}:")
        for note in notes:
            logger.debug(f" {note}")
        logger.debug(f" Final score: {total}")

    return total
def calculate_link_density(element: Tag) -> float:
    """Return the share of *element*'s text that sits inside ``<a>`` tags.

    Args:
        element: Element whose text is measured.

    Returns:
        Total stripped text length of all descendant links divided by the
        stripped text length of *element*. An element with no text is
        treated as having length 1, so the result is never a division by
        zero.
    """
    anchor_chars = 0
    for anchor in element.find_all('a'):
        anchor_chars += len(anchor.get_text(strip=True))

    total_chars = len(element.get_text(strip=True))
    if total_chars == 0:
        # Avoid division by zero for empty elements.
        total_chars = 1

    return anchor_chars / total_chars