Spaces:

oceansweep
/

tldw

Runtime error

App Files Files Community

tldw / App_Function_Libraries /html_to_markdown /dom_utils.py

oceansweep

Upload 155 files

43cd37c verified about 1 year ago

raw

history blame

5.64 kB

	# html_to_markdown/dom_utils.py

	from bs4 import BeautifulSoup, Tag
	from typing import Optional
	import logging

	from conversion_options import ConversionOptions

	# Configure the root logger at import time. With the root level at INFO, the
	# logger.debug(...) calls in this module are not emitted unless the level is
	# lowered elsewhere. NOTE(review): basicConfig() does nothing if the root
	# logger already has handlers, so the host application may override this.
	logging.basicConfig(level=logging.INFO)
	# Module-level logger shared by every function in this file.
	logger = logging.getLogger(__name__)

	def find_main_content(soup: BeautifulSoup, options: ConversionOptions) -> Tag:
	    """Locate the element that holds the document's primary content.

	    An existing ``<main>`` element is returned as-is. Otherwise the
	    ``<body>`` is handed to ``detect_main_content`` for scoring, and a
	    document without a ``<body>`` is returned unchanged.

	    Args:
	        soup: Parsed document to search.
	        options: Conversion options, forwarded to ``detect_main_content``.

	    Returns:
	        The ``<main>`` element, the detected candidate, or ``soup`` itself
	        when the document has no ``<body>``.
	    """
	    logger.debug("Entering find_main_content function")

	    existing_main = soup.find('main')
	    if not existing_main:
	        logger.debug("No <main> element found. Detecting main content.")
	        body = soup.body
	        if body:
	            return detect_main_content(body, options)
	        logger.debug("No body element found, returning the entire document")
	        return soup

	    logger.debug("Existing <main> element found")
	    return existing_main

	def wrap_main_content(main_content: Tag, soup: BeautifulSoup):
	    """Wrap ``main_content`` in a new ``<main>`` element unless it already is one.

	    The wrapper is created from ``soup`` and tagged with
	    ``id="detected-main-content"``. The tree is modified in place and
	    nothing is returned.

	    Args:
	        main_content: Element to wrap.
	        soup: Document used to create the new ``<main>`` tag.
	    """
	    already_main = main_content.name.lower() == 'main'
	    if already_main:
	        logger.debug("Main content already wrapped")
	        return

	    logger.debug("Wrapping main content in <main> element")
	    wrapper = soup.new_tag('main')
	    main_content.wrap(wrapper)
	    wrapper['id'] = 'detected-main-content'
	    logger.debug("Main content wrapped successfully")

	def detect_main_content(element: Tag, options: ConversionOptions) -> Tag:
	    """Pick the element under ``element`` that scores highest as main content.

	    Candidates are gathered by ``collect_candidates`` with a minimum score of
	    20 and ranked by ``calculate_score`` (highest first, ties keep collection
	    order).

	    Args:
	        element: Root of the subtree to inspect.
	        options: Conversion options, forwarded to the scoring helpers.

	    Returns:
	        The best-scoring candidate, or ``element`` itself when no candidate
	        reaches the minimum score.
	    """
	    candidates = []
	    min_score = 20
	    logger.debug(f"Collecting candidates with minimum score: {min_score}")
	    collect_candidates(element, candidates, min_score, options)

	    logger.debug(f"Total candidates found: {len(candidates)}")

	    if not candidates:
	        logger.debug("No suitable candidates found, returning root element")
	        return element

	    # Score every candidate exactly once; scoring walks the subtree, so
	    # recomputing it inside the sort key and the loop below is wasteful.
	    ranked = [(calculate_score(candidate, options), candidate) for candidate in candidates]
	    ranked.sort(key=lambda pair: pair[0], reverse=True)
	    logger.debug("Candidates sorted by score")

	    best_score, best_candidate = ranked[0]
	    for score, candidate in ranked[1:]:
	        # A bs4 Tag has no ``contains()`` method: unknown attribute access is
	        # treated as a child-tag lookup, so calling the result fails. Test
	        # containment by identity against the candidate's ancestors instead
	        # (Tag equality compares markup, not identity). A candidate is never
	        # its own ancestor, so it is implicitly excluded from the check.
	        ancestor_ids = {id(parent) for parent in candidate.parents}
	        nested_in_other = any(id(other) in ancestor_ids for _, other in ranked)
	        # NOTE: ranked[0] already holds the maximum score, so this strict
	        # comparison only acts as a safety net.
	        if not nested_in_other and score > best_score:
	            best_score, best_candidate = score, candidate
	            logger.debug(f"New best independent candidate found: {element_to_string(best_candidate)}")

	    logger.debug(f"Final main content candidate: {element_to_string(best_candidate)}")
	    return best_candidate

	def element_to_string(element: Optional[Tag]) -> str:
	    """Return a short ``name#id.classes`` label for ``element``.

	    Args:
	        element: Element to describe, or ``None``.

	    Returns:
	        ``'No element'`` for a falsy argument; otherwise the tag name, its
	        ``id`` (``'no-id'`` when absent) and its classes joined with dots.
	    """
	    if element:
	        class_part = '.'.join(element.get('class', []))
	        id_part = element.get('id', 'no-id')
	        return f"{element.name}#{id_part}.{class_part}"
	    return 'No element'

	def collect_candidates(element: Tag, candidates: list, min_score: int, options: ConversionOptions):
	    """Append every element in the subtree scoring at least ``min_score``.

	    ``element`` and all of its descendant tags are visited in document
	    (pre-order) sequence. The walk uses an explicit stack instead of
	    recursion so that very deeply nested markup cannot exhaust Python's
	    recursion limit.

	    Args:
	        element: Root of the subtree to scan (it is scored as well).
	        candidates: Output list, extended in place.
	        min_score: Inclusive score threshold.
	        options: Conversion options, forwarded to ``calculate_score``.
	    """
	    pending = [element]
	    while pending:
	        node = pending.pop()
	        score = calculate_score(node, options)
	        if score >= min_score:
	            candidates.append(node)
	            logger.debug(f"Candidate found: {element_to_string(node)}, score: {score}")

	        # Push direct children in reverse so they are popped in document
	        # order, matching the order a recursive walk would visit them.
	        pending.extend(reversed(node.find_all(recursive=False)))

	def calculate_score(element: Tag, options: ConversionOptions) -> int:
	    """Score how likely ``element`` is to hold the page's main content.

	    Points awarded:
	      * +10 for each hint keyword present in the ``class`` value, and +10
	        for each one contained in the ``id`` value.
	      * +5 when the tag is ``article``, ``main`` or ``section``.
	      * +1 per ``<p>`` descendant, capped at 5.
	      * +1 per 200 characters of text once the text exceeds 200, capped at 5.
	      * +5 when the link density is below 0.3.
	      * +10 for a ``data-main`` or ``data-content`` attribute.
	      * +10 when the ``role`` attribute contains ``main``.

	    Args:
	        element: Element to evaluate.
	        options: Conversion options; a truthy ``options.debug`` enables a
	            per-rule breakdown in the debug log.

	    Returns:
	        The accumulated score.
	    """
	    total = 0
	    notes = []

	    # Class / id keyword hints
	    attributes = element.attrs
	    for keyword in ('article', 'content', 'main-container', 'main', 'main-content'):
	        if 'class' in attributes and keyword in element['class']:
	            total += 10
	            notes.append(f"High impact attribute found: {keyword}, score increased by 10")
	        if 'id' in attributes and keyword in element['id']:
	            total += 10
	            notes.append(f"High impact ID found: {keyword}, score increased by 10")

	    # Semantic container tags
	    if element.name.lower() in ('article', 'main', 'section'):
	        total += 5
	        notes.append(f"High impact tag found: {element.name}, score increased by 5")

	    # Paragraph count, capped at 5 points
	    paragraph_count = len(element.find_all('p'))
	    paragraph_points = paragraph_count if paragraph_count < 5 else 5
	    if paragraph_points > 0:
	        total += paragraph_points
	        notes.append(f"Paragraph count: {paragraph_count}, score increased by {paragraph_points}")

	    # Text volume, one point per 200 characters, capped at 5 points
	    text_size = len(element.get_text(strip=True))
	    if text_size > 200:
	        text_points = min(text_size // 200, 5)
	        total += text_points
	        notes.append(f"Text content length: {text_size}, score increased by {text_points}")

	    # A low share of link text earns points
	    density = calculate_link_density(element)
	    if density < 0.3:
	        total += 5
	        notes.append(f"Link density: {density:.2f}, score increased by 5")

	    # Explicit data-* markers
	    if any(element.has_attr(marker) for marker in ('data-main', 'data-content')):
	        total += 10
	        notes.append("Data attribute for main content found, score increased by 10")

	    # ARIA-style role hint
	    role = element.get('role')
	    if role and 'main' in role:
	        total += 10
	        notes.append("Role attribute indicating main content found, score increased by 10")

	    if not (options.debug and notes):
	        return total

	    logger.debug(f"Scoring for {element_to_string(element)}:")
	    for note in notes:
	        logger.debug(f" {note}")
	    logger.debug(f" Final score: {total}")
	    return total

	def calculate_link_density(element: Tag) -> float:
	    """Return the share of ``element``'s text that sits inside ``<a>`` tags.

	    Both lengths are measured on whitespace-stripped text. An element with
	    no text uses a denominator of 1, so the result is 0.0 and not a
	    division error.

	    Args:
	        element: Element to measure.

	    Returns:
	        Link text length divided by total text length.
	    """
	    anchor_chars = 0
	    for anchor in element.find_all('a'):
	        anchor_chars += len(anchor.get_text(strip=True))

	    total_chars = len(element.get_text(strip=True))
	    if total_chars == 0:
	        total_chars = 1  # Avoid division by zero
	    return anchor_chars / total_chars