File size: 6,131 Bytes
f4e197f
87aa741
 
f4e197f
87aa741
 
f4e197f
 
 
 
b1939df
f4e197f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87aa741
 
 
f4e197f
 
 
87aa741
 
f4e197f
 
 
 
 
87aa741
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f4e197f
87aa741
f4e197f
 
 
 
 
 
 
 
 
87aa741
f4e197f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87aa741
f4e197f
 
 
 
 
 
 
 
 
 
 
87aa741
f4e197f
 
 
 
 
87aa741
f4e197f
87aa741
 
 
 
f4e197f
87aa741
 
 
 
f4e197f
 
87aa741
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
import re
from smolagents.tools import Tool
from typing import Dict, List, Optional

# Original parsing function
def _parse_markdown_table_string(markdown_text: str) -> Optional[Dict[str, List[str]]]:
    """
    Parses the first valid Markdown table found in a string.
    Returns a dictionary (headers as keys, lists of cell content as values)
    or None if no valid table is found.
    Useful for converting markdown tables into Python data structures for further analysis.
    """
    lines = [line.rstrip() for line in markdown_text.split('\n') if line.strip()]
    n = len(lines)
    i = 0
    while i < n - 1:
        header_line = lines[i].strip()
        sep_line = lines[i+1].strip()
        # Header and separator must start and end with |
        if not (header_line.startswith('|') and header_line.endswith('|')):
            i += 1
            continue
        if not (sep_line.startswith('|') and sep_line.endswith('|')):
            i += 1
            continue
        # Split header and separator
        headers = [h.strip() for h in header_line.strip('|').split('|')]
        seps = [s.strip() for s in sep_line.strip('|').split('|')]
        if len(headers) != len(seps):
            i += 1
            continue
        # Separator must have at least one '-' in each cell, and only -, :, or spaces
        valid_sep = all('-' in s and all(c in '-: ' for c in s) for s in seps)
        if not valid_sep:
            i += 1
            continue
        # Found a table, now parse data rows
        # Special handling: if the first header is a row label (e.g., '*'), treat first cell of each row as row label, not data
        has_row_labels = headers[0] not in ('',)
        table = {h: [] for h in headers}
        j = i + 2
        while j < n:
            row = lines[j].strip()
            if not (row.startswith('|') and row.endswith('|')):
                break
            cells = [c.strip() for c in row.strip('|').split('|')]
            if len(cells) != len(headers):
                j += 1
                continue
            if has_row_labels and len(headers) > 1:
                # First cell is row label, rest are data
                table[headers[0]].append(cells[0])
                for k, h in enumerate(headers[1:], 1):
                    # Ensure the key exists and is a list
                    if h not in table or not isinstance(table[h], list):
                        table[h] = [] # Initialize if not present or not a list
                    table[h].append(cells[k])
            else:
                for k, h in enumerate(headers):
                    if h not in table or not isinstance(table[h], list):
                        table[h] = []
                    table[h].append(cells[k])
            j += 1
        return table
    return None

class MarkdownTableParserTool(Tool):
    """
    Tool wrapper that exposes the module's Markdown table parser.

    Given a text string, it returns the first valid Markdown table as a
    dictionary of columns, ready for further analysis in Python.
    """
    # Identifier and human-readable summary of the tool.
    name = "markdown_table_parser"
    description = (
        "Parses the first valid Markdown table found in a string "
        "and returns it as a dictionary."
    )

    # Single string input: the text to scan for a table.
    inputs = {
        'markdown_text': {
            'type': 'string',
            'description': 'The string containing the Markdown table.',
        },
    }

    # Shape of the result returned by forward().
    outputs = {
        'parsed_table': {
            'type': 'object',
            'description': (
                'A dictionary representing the table (headers as keys, '
                'lists of cell content as values), or null if no table is found.'
            ),
        },
    }
    output_type = "object"  # Or dict/None

    def __init__(self, *args, **kwargs):
        """Forward all arguments to ``Tool`` and flag the instance as initialized."""
        super().__init__(*args, **kwargs)
        # NOTE(review): presumably tells the base class that no further setup
        # step is needed -- confirm against the smolagents Tool contract.
        self.is_initialized = True

    def forward(self, markdown_text: str) -> Optional[Dict[str, List[str]]]:
        """
        Parse ``markdown_text`` by delegating to ``_parse_markdown_table_string``.

        Returns whatever that function returns: a header -> column-cells
        dictionary, or ``None`` when no table is found.
        """
        parsed_table = _parse_markdown_table_string(markdown_text)
        return parsed_table

# Backward-compatible alias: exposes the parser under its original public name
# in case other parts of the system still expect `parse_markdown_table` (optional).
parse_markdown_table = _parse_markdown_table_string

if __name__ == '__main__':
    # Manual smoke test: run the tool against a handful of representative
    # inputs and print what was parsed. Executed only when run as a script.

    def _report(title, parsed, failure_message):
        """Print *title*, then each parsed column, or *failure_message* if nothing was parsed."""
        print(title)
        if parsed is not None:
            for header, column_data in parsed.items():
                print(f"Header: {header}, Data: {column_data}")
        else:
            print(failure_message)

    tool_instance = MarkdownTableParserTool()

    # Table whose first column holds row labels (header cell '*').
    example_table = """
    |*|a|b|c|d|e|
    |---|---|---|---|---|---|
    |a|a|b|c|b|d|
    |b|b|c|a|e|c|
    |c|c|a|b|b|a|
    |d|b|e|b|e|d|
    |e|d|b|a|d|c|
    """
    _report(
        "Parsed GAIA example:",
        tool_instance.forward(example_table),
        "Failed to parse table.",
    )

    # Table embedded between unrelated lines of text.
    example_table_2 = """
    Some text before
    | Name  | Age | City      |
    |-------|-----|-----------|
    | Alice | 30  | New York  |
    | Bob   | 24  | Paris     |
    | Carol | 45  | London    |
    Some text after
    """
    # The headings below use a real newline ("\n"); the previous "\\n" printed
    # a literal backslash-n instead of a blank separator line.
    _report(
        "\nParsed Table 2 (with surrounding text):",
        tool_instance.forward(example_table_2),
        "Failed to parse table 2.",
    )

    # Header and separator only: parses to columns with empty lists.
    empty_table_with_header = """
    | Header1 | Header2 |
    |---------|---------|
    """
    _report(
        "\nParsed Empty Table with Header:",
        tool_instance.forward(empty_table_with_header),
        "Failed to parse table (empty with header).",
    )

    # Header lacks its closing pipe and the separator is invalid: no table.
    malformed_table = """
    | Header1 | Header2
    |--- ---|
    | cell1 | cell2 |
    """
    _report(
        "\nParsed Malformed Table:",
        tool_instance.forward(malformed_table),
        "Failed to parse malformed table.",
    )

    # Plain text: the parser must return None.
    no_table_text = "This is just some text without a table."
    parsed_no_table = tool_instance.forward(no_table_text)
    print("\nParsed Text Without Table:")
    if parsed_no_table is not None:
        print("Error: Should not have parsed a table.")
    else:
        print("Correctly found no table.")