Yago Bolivar committed on
Commit
f4e197f
·
1 Parent(s): aff539c

feat: implement markdown table parser with comprehensive test suite

Browse files
src/markdown_table_parser.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ \
2
+ import re
3
+
4
+ def parse_markdown_table(markdown_text: str) -> dict[str, list[str]] | None:
5
+ """
6
+ Parses the first valid Markdown table found in a string.
7
+ Returns a dictionary (headers as keys, lists of cell content as values)
8
+ or None if no valid table is found.
9
+ """
10
+ lines = [line.rstrip() for line in markdown_text.split('\n') if line.strip()]
11
+ n = len(lines)
12
+ i = 0
13
+ while i < n - 1:
14
+ header_line = lines[i].strip()
15
+ sep_line = lines[i+1].strip()
16
+ # Header and separator must start and end with |
17
+ if not (header_line.startswith('|') and header_line.endswith('|')):
18
+ i += 1
19
+ continue
20
+ if not (sep_line.startswith('|') and sep_line.endswith('|')):
21
+ i += 1
22
+ continue
23
+ # Split header and separator
24
+ headers = [h.strip() for h in header_line.strip('|').split('|')]
25
+ seps = [s.strip() for s in sep_line.strip('|').split('|')]
26
+ if len(headers) != len(seps):
27
+ i += 1
28
+ continue
29
+ # Separator must have at least one '-' in each cell, and only -, :, or spaces
30
+ valid_sep = all('-' in s and all(c in '-: ' for c in s) for s in seps)
31
+ if not valid_sep:
32
+ i += 1
33
+ continue
34
+ # Found a table, now parse data rows
35
+ # Special handling: if the first header is a row label (e.g., '*'), treat first cell of each row as row label, not data
36
+ has_row_labels = headers[0] not in ('',)
37
+ table = {h: [] for h in headers}
38
+ j = i + 2
39
+ while j < n:
40
+ row = lines[j].strip()
41
+ if not (row.startswith('|') and row.endswith('|')):
42
+ break
43
+ cells = [c.strip() for c in row.strip('|').split('|')]
44
+ if len(cells) != len(headers):
45
+ j += 1
46
+ continue
47
+ if has_row_labels and len(headers) > 1:
48
+ # First cell is row label, rest are data
49
+ table[headers[0]].append(cells[0])
50
+ for k, h in enumerate(headers[1:], 1):
51
+ table[h].append(cells[k])
52
+ else:
53
+ for k, h in enumerate(headers):
54
+ table[h].append(cells[k])
55
+ j += 1
56
+ return table
57
+ return None
58
+
59
if __name__ == '__main__':
    # Manual smoke run: parse a few sample tables and print what came back.
    # The "\n" prefixes emit a blank line between sections (previously they
    # were written as "\\n", which printed a literal backslash-n).
    example_table = """
|*|a|b|c|d|e|
|---|---|---|---|---|---|
|a|a|b|c|b|d|
|b|b|c|a|e|c|
|c|c|a|b|b|a|
|d|b|e|b|e|d|
|e|d|b|a|d|c|
"""
    parsed = parse_markdown_table(example_table)
    print("Parsed GAIA example:")
    if parsed:
        for header, column_data in parsed.items():
            print(f"Header: {header}, Data: {column_data}")
    else:
        print("Failed to parse table.")

    example_table_2 = """
Some text before
| Name | Age | City |
|-------|-----|-----------|
| Alice | 30 | New York |
| Bob | 24 | Paris |
| Carol | 45 | London |
Some text after
"""
    parsed_2 = parse_markdown_table(example_table_2)
    print("\nParsed Table 2 (with surrounding text):")
    if parsed_2:
        for header, column_data in parsed_2.items():
            print(f"Header: {header}, Data: {column_data}")
    else:
        print("Failed to parse table 2.")

    # Header and delimiter only: expected to parse into empty columns.
    empty_table_with_header = """
| Header1 | Header2 |
|---------|---------|
"""
    parsed_empty = parse_markdown_table(empty_table_with_header)
    print("\nParsed Empty Table with Header:")
    if parsed_empty:
        for header, column_data in parsed_empty.items():
            print(f"Header: {header}, Data: {column_data}")
    else:
        print("Failed to parse empty table with header.")

    # The delimiter row contains letters, so no table should be detected.
    malformed_separator = """
| Header1 | Header2 |
|---foo---|---------|
| data1 | data2 |
"""
    parsed_mal_sep = parse_markdown_table(malformed_separator)
    print("\nParsed table with malformed separator:")
    if parsed_mal_sep:
        print(parsed_mal_sep)
    else:
        print("Failed to parse (correctly).")

    table_with_alignment = """
| Syntax | Description |
| :-------- | :-----------: |
| Header | Title |
| Paragraph | Text |
"""
    parsed_align = parse_markdown_table(table_with_alignment)
    print("\nParsed table with alignment in separator:")
    if parsed_align:
        for header, column_data in parsed_align.items():
            print(f"Header: {header}, Data: {column_data}")
    else:
        print("Failed to parse table with alignment.")
tests/test_markdown_table_parser.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+ from src.markdown_table_parser import parse_markdown_table
3
+
4
def test_simple_table():
    """A plain three-column table yields one list of cells per header."""
    source = """
| Name | Age | City |
|-------|-----|-----------|
| Alice | 30 | New York |
| Bob | 24 | Paris |
"""
    want = dict(
        Name=['Alice', 'Bob'],
        Age=['30', '24'],
        City=['New York', 'Paris'],
    )
    actual = parse_markdown_table(source)
    assert actual == want
17
+
18
def test_gaia_question_table():
    """The compact GAIA operation table parses column by column, '*' included."""
    source = """
|*|a|b|c|d|e|
|---|---|---|---|---|---|
|a|a|b|c|b|d|
|b|b|c|a|e|c|
|c|c|a|b|b|a|
|d|b|e|b|e|d|
|e|d|b|a|d|c|
"""
    # Each value spells out one column, top to bottom.
    want = {
        '*': list('abcde'),
        'a': list('abcbd'),
        'b': list('bcaeb'),
        'c': list('cabba'),
        'd': list('bebed'),
        'e': list('dcadc'),
    }
    actual = parse_markdown_table(source)
    assert actual == want
37
+
38
def test_table_with_empty_cells():
    """Blank cells come back as empty strings in their column."""
    source = """
| Header1 | Header2 | Header3 |
|---------|---------|---------|
| A | | C |
| D | E | |
"""
    actual = parse_markdown_table(source)
    assert actual == {
        'Header1': ['A', 'D'],
        'Header2': ['', 'E'],
        'Header3': ['C', ''],
    }
51
+
52
def test_no_data_rows():
    """A header and delimiter row with no body gives empty columns."""
    source = """
| Col1 | Col2 |
|------|------|
"""
    want = {column: [] for column in ('Col1', 'Col2')}
    actual = parse_markdown_table(source)
    assert actual == want
62
+
63
def test_malformed_table_missing_separator():
    """Two pipe rows without a delimiter row between them are not a table."""
    source = """
| HeaderA | HeaderB |
| Val1 | Val2 |
"""
    actual = parse_markdown_table(source)
    assert actual is None
69
+
70
def test_malformed_table_column_mismatch():
    """A data row with the wrong number of cells is left out of the result."""
    source = """
| H1 | H2 |
|----|----|
| C1 | C2 | C3 |
"""
    # The parser currently skips mismatched rows instead of rejecting the
    # table, so both columns end up empty.
    want = {'H1': [], 'H2': []}
    actual = parse_markdown_table(source)
    assert actual == want
82
+
83
def test_not_a_table():
    """Plain prose without any pipe rows yields None."""
    actual = parse_markdown_table("This is just some plain text, not a table.")
    assert actual is None
86
+
87
def test_empty_string():
    """Empty input yields None."""
    actual = parse_markdown_table("")
    assert actual is None
89
+
90
def test_table_with_extra_spacing():
    """Padding inside cells and indentation around rows is trimmed away."""
    # Rows are deliberately indented and cells padded with extra spaces so
    # this test actually exercises whitespace trimming.
    md_table = """
      |   Name    |   Age   |    City       |
      |---------|-------|-------------|
      |   Alice   |   30    |   New York    |
      |   Bob     |   24    |   Paris       |
    """
    expected = {
        'Name': ['Alice', 'Bob'],
        'Age': ['30', '24'],
        'City': ['New York', 'Paris'],
    }
    assert parse_markdown_table(md_table) == expected
103
+
104
def test_table_in_larger_text():
    """Prose lines before and after the table are not part of the result."""
    document = """
Some text before the table.
| Key | Value |
|-----|-------|
| K1 | V1 |
| K2 | V2 |
Some text after the table.
"""
    want = dict(Key=['K1', 'K2'], Value=['V1', 'V2'])
    actual = parse_markdown_table(document)
    assert actual == want
118
+
119
def test_table_with_different_separator_styles():
    """Alignment colons in the delimiter row are accepted."""
    source = """
| Syntax | Description |
| :-------- | :-----------: |
| Header | Title |
| Paragraph | Text |
"""
    want = {
        'Syntax': ['Header', 'Paragraph'],
        'Description': ['Title', 'Text'],
    }
    actual = parse_markdown_table(source)
    assert actual == want
131
+