dataset-tldr

Paused

dataset-tldr / card_processing.py

Add card_processing.py for markdown parsing and text loading

1f7ca14 over 1 year ago

1.95 kB

	import re

	from huggingface_hub import DatasetCard


	def parse_markdown(markdown_text: str) -> str:
	    """Strip boilerplate from a dataset card's markdown body.

	    The text is processed line by line and the following are removed:

	    * the "Table of Contents" block: the line containing that phrase and
	      every line up to (not including) the next line starting with ``#``;
	    * the remainder of any section once a line starting with
	      ``[More Information Needed](<url>)`` is seen (the placeholder line
	      itself is dropped too; content that precedes it is kept);
	    * lines that begin with a single-line HTML comment (``<!-- ... -->``),
	      including anything that follows the comment on the same line;
	    * blank lines;
	    * headings whose section ends up with no content. A heading is kept
	      when a deeper-level heading beneath it is kept.

	    Every line starting with ``#`` is treated as a heading. Fenced code
	    blocks are not recognised, so ``#`` lines inside them count as headings.

	    Args:
	        markdown_text: Raw markdown text of the card.

	    Returns:
	        The surviving lines joined with newline characters.
	    """
	    more_info_pattern = re.compile(r"\[More Information Needed\]\(https?://\S+\)")
	    html_comment_pattern = re.compile(r"<!--.*?-->")

	    parsed_lines = []
	    skip_section = False  # inside a section flagged by the placeholder
	    empty_section = True  # no content kept yet for the current section
	    table_of_contents = False  # inside the table-of-contents block

	    for line in markdown_text.split("\n"):
	        if "Table of Contents" in line:
	            table_of_contents = True
	            continue
	        if table_of_contents:
	            if not line.startswith("#"):
	                continue
	            # The next heading ends the table of contents and is then
	            # handled like any other heading below.
	            table_of_contents = False

	        if line.startswith("#"):
	            # A heading always opens a fresh section. If the section that
	            # just ended produced nothing, remove its heading first; only
	            # headings at the same or a deeper level are removed so that
	            # parent headings survive for the new section.
	            if skip_section or empty_section:
	                _drop_trailing_headings(parsed_lines, _heading_level(line))
	            skip_section = False
	            empty_section = True
	            parsed_lines.append(line)
	            continue

	        if skip_section:
	            continue

	        stripped = line.strip()
	        if more_info_pattern.match(stripped):
	            # Placeholder: ignore everything up to the next heading.
	            skip_section = True
	            continue
	        if html_comment_pattern.match(stripped):
	            continue
	        if stripped:
	            empty_section = False
	            parsed_lines.append(line)

	    # Nothing follows the final section, so every dangling heading goes.
	    if skip_section or empty_section:
	        _drop_trailing_headings(parsed_lines, 1)

	    return "\n".join(parsed_lines)


	def _heading_level(line):
	    """Return the number of leading ``#`` characters in *line*."""
	    return len(line) - len(line.lstrip("#"))


	def _drop_trailing_headings(parsed_lines, min_level):
	    """Pop trailing heading lines of level *min_level* or deeper, in place."""
	    while parsed_lines and parsed_lines[-1].startswith("#"):
	        if _heading_level(parsed_lines[-1]) < min_level:
	            break
	        parsed_lines.pop()


	def is_empty_template(text: str) -> bool:
	    """Report whether *text* holds nothing but template placeholders.

	    Every occurrence of the literal markers ``[More Information Needed]``
	    and ``[optional]`` is removed (case-sensitive), then surrounding
	    whitespace is stripped.

	    Args:
	        text: Card text to inspect.

	    Returns:
	        ``True`` if nothing is left after removing the markers and
	        stripping whitespace, ``False`` otherwise.
	    """
	    # The markers are fixed strings, so plain replacement is sufficient;
	    # they are removed in this order, one after the other.
	    for placeholder in ("[More Information Needed]", "[optional]"):
	        text = text.replace(placeholder, "")
	    return not text.strip()


	def try_load_text(row):
	    """Return the ``text`` of the dataset card stored in ``row["card"]``.

	    The value under the ``"card"`` key is passed to
	    ``huggingface_hub.DatasetCard`` and the resulting object's ``text``
	    attribute is returned. This is a best-effort helper: any exception
	    raised along the way (missing key, unparsable card, ...) is printed
	    and ``None`` is returned instead of propagating the error.

	    Args:
	        row: Subscriptable object with a ``"card"`` entry. Presumably the
	            raw card content as a string -- confirm against the caller.

	    Returns:
	        The card text, or ``None`` if loading failed.
	    """
	    try:
	        return DatasetCard(row["card"]).text
	    except Exception as err:
	        # Deliberately broad: a single malformed card must not abort the
	        # caller; the error is reported and the row yields None.
	        print(err)
	        return None