Spaces:
Paused
Paused
| import re | |
| from huggingface_hub import DatasetCard | |
def parse_markdown(markdown_text: str) -> str:
    """Strip template boilerplate from a markdown card and return what is left.

    The following is removed from ``markdown_text``:

    * a "Table of Contents" block: the line containing that phrase and every
      line after it up to (not including) the next heading;
    * lines that start with a single-line HTML comment (``<!-- ... -->``);
    * a line that starts with a linked ``[More Information Needed](http...)``
      placeholder, together with the remainder of that section;
    * headings whose section ends up with no content at all. A heading is kept
      when it, or any of its sub-sections, contains at least one non-blank
      line that survived the rules above.

    A heading is any line starting with ``#``; its level is the number of
    leading ``#`` characters.

    Args:
        markdown_text: The raw markdown body of the card.

    Returns:
        The filtered markdown, with the surviving lines joined by newlines.
    """
    more_info_pattern = re.compile(r"\[More Information Needed\]\(https?://\S+\)")
    html_comment_pattern = re.compile(r"<!--.*?-->")

    parsed_lines = []
    # (level, index in parsed_lines) of headings that have no content yet.
    # Levels are strictly increasing, so the entries form an ancestor chain.
    pending_headings = []
    skip_section = False
    table_of_contents = False

    def drop_pending(min_level):
        # Remove content-less headings of level >= min_level. Everything that
        # follows such a heading in parsed_lines is either a blank line or a
        # deeper pending heading, so truncating at its index is safe.
        while pending_headings and pending_headings[-1][0] >= min_level:
            _, start = pending_headings.pop()
            del parsed_lines[start:]

    for line in markdown_text.split("\n"):
        is_heading = line.startswith("#")
        if is_heading:
            level = len(line) - len(line.lstrip("#"))
            # A new heading closes every open section at the same or a deeper
            # level and ends any skip / table-of-contents region.
            drop_pending(level)
            skip_section = False
            table_of_contents = False

        if "Table of Contents" in line:
            table_of_contents = True
            continue

        if table_of_contents or skip_section:
            continue

        if is_heading:
            pending_headings.append((level, len(parsed_lines)))
            parsed_lines.append(line)
            continue

        stripped = line.strip()
        if more_info_pattern.match(stripped):
            # Placeholder: ignore the rest of this section. Content collected
            # before the placeholder (if any) is kept.
            skip_section = True
            continue

        if html_comment_pattern.match(stripped):
            continue

        if stripped:
            # Real content confirms the current heading and all its ancestors.
            pending_headings.clear()

        parsed_lines.append(line)

    # Headings still pending at the end of the document never got content.
    drop_pending(0)

    return "\n".join(parsed_lines)
def is_empty_template(text):
    """Tell whether ``text`` holds nothing besides template placeholders.

    The literal markers ``[More Information Needed]`` and ``[optional]`` are
    removed (in that order); the text counts as an empty template when only
    whitespace remains afterwards.
    """
    # Strip the "needs information" marker first, then the "optional" marker.
    without_info = re.sub(r"\[More Information Needed\]", "", text)
    without_markers = re.sub(r"\[optional\]", "", without_info)

    # Anything other than surrounding whitespace means real content exists.
    return len(without_markers.strip()) == 0
def try_load_text(row):
    """Best-effort extraction of the card text stored under ``row["card"]``.

    Builds a ``DatasetCard`` from ``row["card"]`` and returns its ``text``
    attribute. Any exception raised along the way is printed and ``None`` is
    returned instead, so one bad row does not abort the caller.
    """
    try:
        card = DatasetCard(row["card"])
        text = card.text
    except Exception as err:
        # Deliberately broad: report the failure and fall back to None.
        print(err)
        text = None
    return text