File size: 6,940 Bytes
52b18ab
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
def levenshtein_with_wildcards(str1, str2, wildcard='?', verbose=False):
    """
    Calculate the Levenshtein distance between two strings with support for wildcards.
    
    Args:
        str1 (str): The first string.
        str2 (str): The second string.
        wildcard (str, optional): The wildcard character. Defaults to '?'.
        verbose (bool, optional): If True, prints the DP matrix and explains the process.
        
    Returns:
        int: The Levenshtein distance between the two strings.
        list: If verbose=True, also returns a list of operations performed.
    """
    m, n = len(str1), len(str2)
    
    # Create a matrix of size (m+1) x (n+1)
    dp = [[0] * (n + 1) for _ in range(m + 1)]
    
    # Initialize the first row and column
    for i in range(m + 1):
        dp[i][0] = i
    
    for j in range(n + 1):
        dp[0][j] = j
    
    # Fill the dp matrix
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            # If either character is a wildcard, treat it as a match (cost = 0)
            if str1[i - 1] == wildcard or str2[j - 1] == wildcard:
                dp[i][j] = dp[i - 1][j - 1]  # No cost for wildcard matches
            else:
                cost = 0 if str1[i - 1] == str2[j - 1] else 1
                dp[i][j] = min(
                    dp[i - 1][j] + 1,      # deletion
                    dp[i][j - 1] + 1,      # insertion
                    dp[i - 1][j - 1] + cost  # substitution
                )
    
    if verbose:
        operations = explain_match(str1, str2, dp, wildcard)
        return dp[m][n], operations
    
    return dp[m][n]

def explain_match(str1, str2, dp, wildcard='?'):
    """
    Traces the optimal alignment path and explains each step of the matching process.
    
    Args:
        str1 (str): The first string.
        str2 (str): The second string.
        dp (list): The dynamic programming matrix.
        wildcard (str, optional): The wildcard character. Defaults to '?'.
        
    Returns:
        list: A list of explanation strings for each operation performed.
    """
    m, n = len(str1), len(str2)
    operations = []
    
    # Find the optimal path
    i, j = m, n
    path = []
    
    while i > 0 or j > 0:
        path.append((i, j))
        
        if i == 0:
            j -= 1
        elif j == 0:
            i -= 1
        else:
            substitution_cost = dp[i-1][j-1]
            deletion_cost = dp[i-1][j]
            insertion_cost = dp[i][j-1]
            
            min_cost = min(substitution_cost, deletion_cost, insertion_cost)
            
            if min_cost == substitution_cost:
                i -= 1
                j -= 1
            elif min_cost == deletion_cost:
                i -= 1
            else:
                j -= 1
    
    path.append((0, 0))
    path.reverse()
    
    # Generate explanations for each step
    for idx in range(1, len(path)):
        prev_i, prev_j = path[idx-1]
        curr_i, curr_j = path[idx]
        
        # Diagonal move (match or substitution)
        if curr_i > prev_i and curr_j > prev_j:
            char1 = str1[curr_i-1]
            char2 = str2[curr_j-1]
            
            if char1 == wildcard or char2 == wildcard:
                wildcard_char = char1 if char1 == wildcard else char2
                match_char = char2 if char1 == wildcard else char1
                operations.append(f"Wildcard match: '{wildcard_char}' matches any character, here '{match_char}'")
            elif char1 == char2:
                operations.append(f"Match: '{char1}' matches '{char2}'")
            else:
                operations.append(f"Substitution: Replace '{char1}' with '{char2}'")
        
        # Horizontal move (insertion)
        elif curr_i == prev_i and curr_j > prev_j:
            operations.append(f"Insertion: Insert '{str2[curr_j-1]}'")
        
        # Vertical move (deletion)
        elif curr_i > prev_i and curr_j == prev_j:
            operations.append(f"Deletion: Delete '{str1[curr_i-1]}'")
    
    return operations

def print_match_summary(str1, str2, wildcard='?'):
    """
    Prints a summary of the match between two strings, highlighting wildcards.
    
    Args:
        str1 (str): The first string.
        str2 (str): The second string.
        wildcard (str, optional): The wildcard character. Defaults to '?'.
    """
    distance, operations = levenshtein_with_wildcards(str1, str2, wildcard, verbose=True)
    
    print(f"Comparing '{str1}' and '{str2}' (wildcard: '{wildcard}')")
    print(f"Edit distance: {distance}")
    print("\nMatch process:")
    
    for i, op in enumerate(operations):
        print(f"Step {i+1}: {op}")
    
    # Visual representation
    alignment = []
    i, j = 0, 0
    aligned_str1 = ""
    aligned_str2 = ""
    match_indicators = ""
    
    for op in operations:
        if "match" in op or "Match" in op or "Substitution" in op:
            aligned_str1 += str1[i]
            aligned_str2 += str2[j]
            
            if "Wildcard" in op:
                match_indicators += "*"  # Wildcard match
            elif "Match" in op:
                match_indicators += "|"  # Exact match
            else:
                match_indicators += "X"  # Substitution
            
            i += 1
            j += 1
        elif "Insertion" in op:
            aligned_str1 += "-"
            aligned_str2 += str2[j]
            match_indicators += " "
            j += 1
        elif "Deletion" in op:
            aligned_str1 += str1[i]
            aligned_str2 += "-"
            match_indicators += " "
            i += 1
    
    print("\nAlignment:")
    print(aligned_str1)
    print(match_indicators)
    print(aligned_str2)
    print("\nLegend:")
    print("| = exact match, * = wildcard match, X = substitution, - = gap (insertion/deletion)")
    
    # Summary of wildcard matches
    wildcard_matches = [op for op in operations if "Wildcard" in op]
    if wildcard_matches:
        print("\nWildcard matches:")
        for match in wildcard_matches:
            print(f"- {match}")
    
    return distance, operations

# Example usage
if __name__ == "__main__":
    # Basic examples
    print_match_summary("hello", "hello")       # 0 (identical strings)
    print_match_summary("hello", "hallo")       # 1 (one substitution)
    print_match_summary("he?lo", "hello")       # 0 (wildcard matches 'l')
    print_match_summary("he?lo", "hallo")       # 0 (wildcard matches 'a')
    print_match_summary("h?llo", "hello")       # 0 (wildcard matches 'e')
    print_match_summary("h?llo", "hillo")       # 0 (wildcard matches 'i')
    print_match_summary("c?t", "cat")           # 0 (wildcard matches 'a')
    print_match_summary("c?t", "cut")           # 0 (wildcard matches 'u')
    print_match_summary("w?rd", "word")         # 0 (wildcard matches 'o')
    print_match_summary("d?g", "dog")           # 0 (wildcard matches 'o')