""" Markdown Analyzer Library This library provides a comprehensive Markdown parsing and analysis system. It consists of three main components: 1. MarkdownParser: Converts Markdown text into a stream of tokens representing different structural elements (headers, paragraphs, lists, code blocks, etc.). It handles both block-level and inline elements. 2. InlineParser: Processes inline Markdown elements within block tokens, such as: - Links and images - Inline code - Emphasis (bold, italic) - Footnotes - HTML inline elements 3. MarkdownAnalyzer: The main interface that combines parsing and analysis. It: - Parses the input text into tokens - Processes inline elements within tokens - Provides methods to identify and analyze different Markdown elements - Generates statistics about the document structure Usage: analyzer = MarkdownAnalyzer(markdown_text) analysis = analyzer.analyze() # Get document statistics headers = analyzer.identify_headers() # Get all headers links = analyzer.identify_links() # Get all links # etc. 
The library supports standard Markdown features including: - Headers (ATX and Setext style) - Lists (ordered, unordered, and task lists) - Code blocks (fenced and inline) - Blockquotes - Tables - Links and images - Footnotes - HTML blocks and inline elements """ import re from collections import defaultdict ### MAIN INTERFACE ### class MarkdownAnalyzer: # def __init__(self, file_path, encoding='utf-8'): def __init__(self, text): # with open(file_path, 'r', encoding=encoding) as f: # self.text = f.read() self.text = text parser = MarkdownParser(self.text) self.tokens = parser.parse() self.references = parser.references self.footnotes = parser.footnotes self.inline_parser = InlineParser( references=self.references, footnotes=self.footnotes ) self._parse_inline_tokens() def _parse_inline_tokens(self): inline_types = ("paragraph", "header", "blockquote") for token in self.tokens: if token.type in inline_types and token.content: inline_data = self.inline_parser.parse_inline(token.content) token.meta.update(inline_data) def identify_headers(self): result = defaultdict(list) for token in self.tokens: if token.type == "header": result["Header"].append( {"line": token.line, "level": token.level, "text": token.content} ) return dict(result) def identify_paragraphs(self): result = defaultdict(list) for token in self.tokens: if token.type == "paragraph": result["Paragraph"].append(token.content) return dict(result) def identify_blockquotes(self): result = defaultdict(list) for token in self.tokens: if token.type == "blockquote": result["Blockquote"].append(token.content) return dict(result) def identify_code_blocks(self): result = defaultdict(list) for token in self.tokens: if token.type == "code": result["Code block"].append( { "start_line": token.line, "content": token.content, "language": token.meta.get("language"), } ) return dict(result) def identify_lists(self): result = defaultdict(list) for token in self.tokens: if token.type == "ordered_list": result["Ordered 
list"].append(token.meta["items"]) elif token.type == "unordered_list": result["Unordered list"].append(token.meta["items"]) return dict(result) def identify_tables(self): result = defaultdict(list) for token in self.tokens: if token.type == "table": result["Table"].append( {"header": token.meta["header"], "rows": token.meta["rows"]} ) return dict(result) def identify_links(self): result = defaultdict(list) for token in self.tokens: if "text_links" in token.meta: for l in token.meta["text_links"]: result["Text link"].append( {"line": token.line, "text": l["text"], "url": l["url"]} ) if "image_links" in token.meta: for img in token.meta["image_links"]: result["Image link"].append( { "line": token.line, "alt_text": img["alt_text"], "url": img["url"], } ) return dict(result) def identify_footnotes(self): result = [] seen = set() for token in self.tokens: if "footnotes_used" in token.meta: for fn in token.meta["footnotes_used"]: key = (fn["id"], fn["content"]) if key not in seen: seen.add(key) result.append( { "line": token.line, "id": fn["id"], "content": fn["content"], } ) return result def identify_inline_code(self): codes = [] for token in self.tokens: if "inline_code" in token.meta: for c in token.meta["inline_code"]: codes.append({"line": token.line, "code": c}) return codes def identify_emphasis(self): ems = [] for token in self.tokens: if "emphasis" in token.meta: for e in token.meta["emphasis"]: ems.append({"line": token.line, "text": e}) return ems def identify_task_items(self): tasks = [] for token in self.tokens: if token.type in ("ordered_list", "unordered_list"): for it in token.meta["items"]: if it.get("task_item"): tasks.append( { "line": token.line, "text": it["text"], "checked": it["checked"], } ) return tasks def identify_html_blocks(self): # Gets HTML blocks result = [] for token in self.tokens: if token.type == "html_block": result.append({"line": token.line, "content": token.content}) return result def identify_html_inline(self): # Gets HTML tags 
from inline tokens result = [] inline_types = ("paragraph", "header", "blockquote") for token in self.tokens: if token.type in inline_types and "html_inline" in token.meta: for h in token.meta["html_inline"]: result.append({"line": token.line, "html": h}) return result def count_words(self): words = self.text.split() return len(words) def count_characters(self): characters = [char for char in self.text if not char.isspace()] return len(characters) def analyze(self): headers = self.identify_headers().get("Header", []) paragraphs = self.identify_paragraphs().get("Paragraph", []) blockquotes = self.identify_blockquotes().get("Blockquote", []) code_blocks = self.identify_code_blocks().get("Code block", []) lists = self.identify_lists() ordered_lists = lists.get("Ordered list", []) unordered_lists = lists.get("Unordered list", []) tables = self.identify_tables().get("Table", []) html_blocks = self.identify_html_blocks() html_inline = self.identify_html_inline() analysis = { "headers": len(headers), "paragraphs": len(paragraphs), "blockquotes": len(blockquotes), "code_blocks": len(code_blocks), "ordered_lists": sum(len(l) for l in ordered_lists), "unordered_lists": sum(len(l) for l in unordered_lists), "tables": len(tables), "html_blocks": len(html_blocks), "html_inline_count": len(html_inline), "words": self.count_words(), "characters": self.count_characters(), } return analysis ### PARSING CLASSES ### class BlockToken: """Represents a block-level Markdown element with its type, content, and metadata.""" def __init__(self, type_, content="", level=None, meta=None, line=None): self.type = type_ # Type of block (header, paragraph, code, etc.) self.content = content # The actual content of the block self.level = level # Used for headers (h1-h6) and list indentation self.meta = meta or {} # Additional metadata (language for code blocks, etc.) 
class InlineParser:
    """Extracts inline Markdown elements from a block of text.

    Handles links, images, inline code, emphasis, footnote references, and
    inline HTML fragments.
    """

    # Regular expressions for matching inline Markdown elements
    IMAGE_OR_LINK_RE = re.compile(
        r"(!?\[([^\]]*)\])(\(([^\)]+)\)|\[([^\]]+)\])"
    )  # Matches [text](url), ![alt](url) and reference-style [text][ref]
    CODE_INLINE_RE = re.compile(r"`([^`]+)`")  # Matches `code`
    EMPHASIS_RE = re.compile(
        r"(\*\*|__)(.*?)\1|\*(.*?)\*|_(.*?)_"
    )  # Matches **bold**, __bold__, *italic*, _italic_
    FOOTNOTE_RE = re.compile(r"\[\^([^\]]+)\]")  # Matches [^footnote]
    HTML_INLINE_RE = re.compile(r"<[a-zA-Z/][^>]*>")  # Matches HTML tags
    # NOTE(review): the trailing (.*?) can match the empty string, so this
    # effectively matches only an opening tag; a closing-tag portion may have
    # been lost in transmission — confirm against the original source.
    HTML_INLINE_BLOCK_RE = re.compile(r"<([a-zA-Z]+)([^>]*)>(.*?)", re.DOTALL)

    def __init__(self, references=None, footnotes=None):
        """Store link-reference and footnote definitions for later resolution.

        references: mapping of lower-cased reference ids to URLs ([text][ref]).
        footnotes:  mapping of footnote ids to their content ([^id]).
        """
        self.references = references or {}
        self.footnotes = footnotes or {}

    def parse_inline(self, text):
        """Parse inline Markdown elements within *text*.

        Returns a dict with the keys: text_links, image_links, inline_code,
        emphasis, footnotes_used, html_inline.
        """
        result = {
            "text_links": [],      # [text](url) and [text][ref] links
            "image_links": [],     # ![alt](url) images
            "inline_code": [],     # `code` spans
            "emphasis": [],        # **bold** / *italic* text
            "footnotes_used": [],  # [^id] references defined in the document
            "html_inline": [],     # raw HTML fragments
        }

        # Footnotes: record each defined footnote the first time it is used.
        used_footnotes = set()
        for fn_match in self.FOOTNOTE_RE.finditer(text):
            fid = fn_match.group(1)
            if fid in self.footnotes and fid not in used_footnotes:
                used_footnotes.add(fid)
                result["footnotes_used"].append(
                    {"id": fid, "content": self.footnotes[fid]}
                )

        # Inline code spans.
        for code_match in self.CODE_INLINE_RE.finditer(text):
            result["inline_code"].append(code_match.group(1))

        # Emphasis: the first non-empty capture group holds the emphasized text.
        for em_match in self.EMPHASIS_RE.finditer(text):
            emphasized = em_match.group(2) or em_match.group(3) or em_match.group(4)
            if emphasized:
                result["emphasis"].append(emphasized)

        # HTML fragments are recorded and removed from a working copy so that
        # their content is not also matched by the link/image pattern.
        temp_text = text
        for block_match in self.HTML_INLINE_BLOCK_RE.finditer(text):
            html_fragment = block_match.group(0)
            result["html_inline"].append(html_fragment)
            temp_text = temp_text.replace(html_fragment, "")

        # Links and images (direct and reference-style).
        for link_match in self.IMAGE_OR_LINK_RE.finditer(temp_text):
            prefix = link_match.group(1)      # the [text] or ![alt] part
            inner_text = link_match.group(2)  # text inside the brackets
            url = link_match.group(4)         # (url) form, if present
            ref_id = link_match.group(5)      # [ref] form, if present
            is_image = prefix.startswith("!")
            final_url = url
            if ref_id and ref_id.lower() in self.references:
                final_url = self.references[ref_id.lower()]
            # Entries are only recorded when a URL could be resolved.
            if final_url:
                if is_image:
                    result["image_links"].append(
                        {"alt_text": inner_text, "url": final_url}
                    )
                else:
                    result["text_links"].append(
                        {"text": inner_text, "url": final_url}
                    )
        return result


class MarkdownParser:
    """Tokenizes a Markdown document into a list of BlockToken objects."""

    # Regular expressions for matching block-level Markdown elements
    FRONTMATTER_RE = re.compile(r"^---\s*$")  # YAML frontmatter delimiter
    ATX_HEADER_RE = re.compile(r"^(#{1,6})\s+(.*)$")  # "# Header" style
    SETEXT_H1_RE = re.compile(r"^=+\s*$")  # "====" underline (h1)
    SETEXT_H2_RE = re.compile(r"^-+\s*$")  # "----" underline (h2)
    FENCE_RE = re.compile(r"^```([^`]*)$")  # code fence with optional language
    BLOCKQUOTE_RE = re.compile(r"^(>\s?)(.*)$")  # "> quote" style
    ORDERED_LIST_RE = re.compile(r"^\s*\d+\.\s+(.*)$")  # "1. item" style
    UNORDERED_LIST_RE = re.compile(r"^\s*[-+*]\s+(.*)$")  # "- item" style
    HR_RE = re.compile(r"^(\*{3,}|-{3,}|_{3,})\s*$")  # horizontal rules
    TABLE_SEPARATOR_RE = re.compile(
        r"^\|?(\s*:?-+:?\s*\|)+\s*:?-+:?\s*\|?\s*$"
    )  # table header/body separator rows
    REFERENCE_DEF_RE = re.compile(
        r"^\[([^\]]+)\]:\s+(.*?)\s*$", re.MULTILINE
    )  # "[ref]: url" definitions
    FOOTNOTE_DEF_RE = re.compile(
        r"^\[\^([^\]]+)\]:\s+(.*?)\s*$", re.MULTILINE
    )  # "[^footnote]: content" definitions
    # NOTE(review): the next two patterns were fused/garbled in the source —
    # an HTML-comment-like literal appears to have been stripped, leaving
    # HTML_BLOCK_START with unbalanced parentheses (a compile-time error).
    # Reconstructed as a tag-or-comment opener plus a comment terminator;
    # verify against the original implementation.
    HTML_BLOCK_START = re.compile(
        r"^(<([a-zA-Z]+)([^>]*)>|<!--)"
    )  # Matches an opening HTML tag or an HTML comment start
    HTML_COMMENT_END = re.compile(r"-->\s*$")  # Matches HTML comment end

    def __init__(self, text):
        """Initialize parser state and pre-extract references and footnotes."""
        self.lines = text.split("\n")
        self.length = len(self.lines)
        self.pos = 0          # current line index into self.lines
        self.tokens = []      # accumulated BlockToken list
        self.text = text
        self.references = {}  # reference-style link definitions
        self.footnotes = {}   # footnote definitions
        self.extract_references_and_footnotes()

    def extract_references_and_footnotes(self):
        """Collect all [ref]: url and [^id]: content definitions up front."""
        for m in self.REFERENCE_DEF_RE.finditer(self.text):
            rid, url = m.groups()
            self.references[rid.lower()] = url
        for m in self.FOOTNOTE_DEF_RE.finditer(self.text):
            fid, content = m.groups()
            self.footnotes[fid] = content

    def parse(self):
        """Process the whole document and return the list of block tokens."""
        # Optional YAML frontmatter at the very top of the document.
        if self.pos < self.length and self.FRONTMATTER_RE.match(
            self.lines[self.pos].strip()
        ):
            self.parse_frontmatter()

        while self.pos < self.length:
            line = self.lines[self.pos]
            if not line.strip():
                self.pos += 1
                continue

            if self.is_table_start():
                self.parse_table()
                continue

            if self.is_html_block_start(line):
                self.parse_html_block()
                continue

            # ATX-style header (# Header).
            m = self.ATX_HEADER_RE.match(line)
            if m:
                self.tokens.append(
                    BlockToken(
                        "header",
                        content=m.group(2).strip(),
                        level=len(m.group(1)),
                        line=self.pos + 1,
                    )
                )
                self.pos += 1
                continue

            # Setext-style header: text underlined with === (h1) or --- (h2).
            if self.pos + 1 < self.length:
                next_line = self.lines[self.pos + 1].strip()
                if self.SETEXT_H1_RE.match(next_line):
                    self.tokens.append(
                        BlockToken(
                            "header", content=line.strip(), level=1, line=self.pos + 1
                        )
                    )
                    self.pos += 2
                    continue
                if self.SETEXT_H2_RE.match(next_line):
                    self.tokens.append(
                        BlockToken(
                            "header", content=line.strip(), level=2, line=self.pos + 1
                        )
                    )
                    self.pos += 2
                    continue

            # Horizontal rule.
            if self.HR_RE.match(line.strip()):
                self.tokens.append(BlockToken("hr", line=self.pos + 1))
                self.pos += 1
                continue

            # Fenced code block.
            fence_match = self.FENCE_RE.match(line.strip())
            if fence_match:
                self.parse_fenced_code_block(fence_match.group(1).strip())
                continue

            # Blockquote.
            if self.BLOCKQUOTE_RE.match(line):
                self.parse_blockquote()
                continue

            # Ordered / unordered lists.
            ordered_match = self.ORDERED_LIST_RE.match(line)
            if ordered_match or self.UNORDERED_LIST_RE.match(line):
                self.parse_list(ordered=bool(ordered_match))
                continue

            # Fallback: anything else starts a paragraph.
            self.parse_paragraph()
        return self.tokens

    def is_html_block_start(self, line):
        """Return True when *line* looks like the start of a raw HTML block."""
        return self.HTML_BLOCK_START.match(line.strip()) is not None

    def parse_html_block(self):
        """Consume a raw HTML block (or HTML comment) into an html_block token.

        NOTE(review): the original body was truncated in this view right after
        the `comment_mode` line; the collection loop below is a minimal
        reconstruction (read until the comment closes, or until a blank line)
        — verify against the complete source.
        """
        start = self.pos
        lines = []
        first_line = self.lines[self.pos].strip()
        comment_mode = first_line.startswith("<!--")
        while self.pos < self.length:
            current = self.lines[self.pos]
            lines.append(current)
            self.pos += 1
            if comment_mode:
                if self.HTML_COMMENT_END.search(current):
                    break
            elif not current.strip():
                break
        self.tokens.append(
            BlockToken("html_block", content="\n".join(lines), line=start + 1)
        )