File size: 9,600 Bytes
8427d96
52b18ab
8427d96
52b18ab
 
 
 
8427d96
 
52b18ab
 
 
 
 
 
8427d96
 
 
 
52b18ab
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8427d96
 
 
 
 
 
52b18ab
 
 
 
 
 
 
 
 
 
8427d96
52b18ab
 
 
 
8427d96
52b18ab
 
 
 
 
 
 
8427d96
 
52b18ab
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8427d96
 
 
 
 
 
 
52b18ab
8427d96
 
 
 
 
 
52b18ab
8427d96
52b18ab
8427d96
52b18ab
 
 
8427d96
 
52b18ab
 
 
8427d96
 
52b18ab
 
 
784e69f
 
 
8427d96
52b18ab
8427d96
52b18ab
 
 
 
8427d96
 
52b18ab
8427d96
 
 
 
 
 
 
 
 
15f3753
8427d96
3782fc1
8427d96
3782fc1
8427d96
3782fc1
52b18ab
8427d96
 
 
52b18ab
 
 
 
 
 
8427d96
52b18ab
3782fc1
 
52b18ab
 
 
8427d96
8e74111
8427d96
8e74111
8427d96
 
 
52b18ab
8427d96
52b18ab
 
 
 
 
 
8427d96
784e69f
8427d96
3782fc1
8427d96
52b18ab
 
8427d96
3782fc1
8427d96
784e69f
52b18ab
 
 
 
 
 
 
 
8427d96
52b18ab
 
8427d96
52b18ab
 
 
 
 
 
 
 
 
8427d96
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
def levenshtein_with_wildcard(str1, str2, wildcard_offsets_str1=None, wildcard_offsets_str2=None, verbose=False):
    """
    Calculate the Levenshtein distance between two strings with support for wildcards at specific positions.
    
    Args:
        str1 (str): The first string.
        str2 (str): The second string.
        wildcard_offsets_str1 (iterable, optional): Indices in str1 that are wildcards. Defaults to None.
        wildcard_offsets_str2 (iterable, optional): Indices in str2 that are wildcards. Defaults to None.
        verbose (bool, optional): If True, prints the DP matrix and explains the process.
        
    Returns:
        int: The Levenshtein distance between the two strings.
        list: If verbose=True, also returns a list of operations performed.
    """
    # Initialize empty sets if None
    wildcard_offsets_str1 = set(wildcard_offsets_str1 or [])
    wildcard_offsets_str2 = set(wildcard_offsets_str2 or [])
    
    m, n = len(str1), len(str2)
    
    # Create a matrix of size (m+1) x (n+1)
    dp = [[0] * (n + 1) for _ in range(m + 1)]
    
    # Initialize the first row and column
    for i in range(m + 1):
        dp[i][0] = i
    
    for j in range(n + 1):
        dp[0][j] = j
    
    # Fill the dp matrix
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            # Check if either position is a wildcard
            is_str1_wildcard = (i - 1) in wildcard_offsets_str1
            is_str2_wildcard = (j - 1) in wildcard_offsets_str2
            
            # If either position is a wildcard, treat it as a match (cost = 0)
            if is_str1_wildcard or is_str2_wildcard:
                dp[i][j] = dp[i - 1][j - 1]  # No cost for wildcard matches
            else:
                cost = 0 if str1[i - 1] == str2[j - 1] else 1
                dp[i][j] = min(
                    dp[i - 1][j] + 1,      # deletion
                    dp[i][j - 1] + 1,      # insertion
                    dp[i - 1][j - 1] + cost  # substitution
                )
    
    if verbose:
        operations = explain_match(str1, str2, dp, wildcard_offsets_str1, wildcard_offsets_str2)
        return dp[m][n], operations
    
    return dp[m][n]

def explain_match(str1, str2, dp, wildcard_offsets_str1, wildcard_offsets_str2):
    """
    Traces the optimal alignment path and explains each step of the matching process.
    
    Args:
        str1 (str): The first string.
        str2 (str): The second string.
        dp (list): The dynamic programming matrix.
        wildcard_offsets_str1 (set): Indices in str1 that are wildcards.
        wildcard_offsets_str2 (set): Indices in str2 that are wildcards.
        
    Returns:
        list: A list of explanation strings for each operation performed.
    """
    m, n = len(str1), len(str2)
    operations = []
    
    # Find the optimal path
    i, j = m, n
    path = []
    
    while i > 0 or j > 0:
        path.append((i, j))
        
        if i == 0:
            j -= 1
        elif j == 0:
            i -= 1
        else:
            substitution_cost = dp[i-1][j-1]
            deletion_cost = dp[i-1][j]
            insertion_cost = dp[i][j-1]
            
            min_cost = min(substitution_cost, deletion_cost, insertion_cost)
            
            if min_cost == substitution_cost:
                i -= 1
                j -= 1
            elif min_cost == deletion_cost:
                i -= 1
            else:
                j -= 1
    
    path.append((0, 0))
    path.reverse()
    
    # Generate explanations for each step
    for idx in range(1, len(path)):
        prev_i, prev_j = path[idx-1]
        curr_i, curr_j = path[idx]
        
        # Diagonal move (match or substitution)
        if curr_i > prev_i and curr_j > prev_j:
            char1_idx = curr_i-1
            char2_idx = curr_j-1
            char1 = str1[char1_idx]
            char2 = str2[char2_idx]
            
            is_str1_wildcard = char1_idx in wildcard_offsets_str1
            is_str2_wildcard = char2_idx in wildcard_offsets_str2
            
            if is_str1_wildcard and is_str2_wildcard:
                operations.append(f"Double wildcard: Position {char1_idx} in str1 and position {char2_idx} in str2 are both wildcards")
            elif is_str1_wildcard:
                operations.append(f"Wildcard match: Position {char1_idx} in str1 is a wildcard, matches '{char2}' at position {char2_idx} in str2")
            elif is_str2_wildcard:
                operations.append(f"Wildcard match: Position {char2_idx} in str2 is a wildcard, matches '{char1}' at position {char1_idx} in str1")
            elif char1 == char2:
                operations.append(f"Match: '{char1}' at position {char1_idx} matches '{char2}' at position {char2_idx}")
            else:
                operations.append(f"Substitution: Replace '{char1}' at position {char1_idx} with '{char2}' at position {char2_idx}")
        
        # Horizontal move (insertion)
        elif curr_i == prev_i and curr_j > prev_j:
            char_idx = curr_j-1
            operations.append(f"Insertion: Insert '{str2[char_idx]}' at position {char_idx} in str2")
        
        # Vertical move (deletion)
        elif curr_i > prev_i and curr_j == prev_j:
            char_idx = curr_i-1
            operations.append(f"Deletion: Delete '{str1[char_idx]}' at position {char_idx} in str1")
    
    return operations

def to_bytes(s):
    return s.encode('utf-8') if isinstance(s, str) else s

def print_match_summary(str1, str2, wildcard_offsets_str1=None, wildcard_offsets_str2=None):
    """
    Prints a summary of the match between two strings, highlighting wildcards by their offsets.
    
    Args:
        str1 (str): The first string.
        str2 (str): The second string.
        wildcard_offsets_str1 (iterable, optional): Indices in str1 that are wildcards. Defaults to None.
        wildcard_offsets_str2 (iterable, optional): Indices in str2 that are wildcards. Defaults to None.
    """
    # Initialize empty lists if None
    wildcard_offsets_str1 = set(wildcard_offsets_str1 or [])
    wildcard_offsets_str2 = set(wildcard_offsets_str2 or [])
    
    distance, operations = levenshtein_with_wildcard(
        str1, str2, wildcard_offsets_str1, wildcard_offsets_str2, verbose=True
    )
    
    # Create visual representations of the strings with wildcard markers
    str1_visual = type(str1)()
    for i, char in enumerate(str1):
        str1_visual += bytes([char])
    
    str2_visual = type(str1)()
    for i, char in enumerate(str2):
        str2_visual += bytes([char])
    
    print(f"Comparing '{str1_visual}' and '{str2_visual}'")
    print(f"Wildcards in str1: {sorted(wildcard_offsets_str1)}")
    print(f"Wildcards in str2: {sorted(wildcard_offsets_str2)}")
    print(f"Edit distance: {distance}")
    print("\nMatch process:")
    
    for i, op in enumerate(operations):
        print(f"Step {i+1}: {op}")
    
    # Visual representation of the alignment
    i, j = 0, 0
    aligned_str1 = type(str1)()
    aligned_str2 = type(str1)()
    match_indicators = ""
    
    for op in operations:
        if "Match:" in op or "Substitution:" in op or "Wildcard match:" in op or "Double wildcard:" in op:
            aligned_str1 += str1[i:i+1]
                
            aligned_str2 += str2[j:j+1]
            
            # Determine match indicator
            if "Wildcard match:" in op or "Double wildcard:" in op:
                match_indicators += "*"  # Wildcard match
            elif "Match:" in op:
                match_indicators += "|"  # Exact match
            else:
                match_indicators += "X"  # Substitution
            
            i += 1
            j += 1
        elif "Insertion:" in op:
            aligned_str1 += to_bytes("-")
            
            aligned_str2 += str2[j:j+1]
                
            match_indicators += " "
            j += 1
        elif "Deletion:" in op:
            aligned_str1 += str1[i:i+1]
                
            aligned_str2 += to_bytes("-")
            match_indicators += " "
            i += 1
    
    print("\nAlignment:")
    print(aligned_str1)
    print(match_indicators)
    print(aligned_str2)
    print("\nLegend:")
    print("| = exact match, * = wildcard match, X = substitution, - = gap (insertion/deletion), [c] = wildcard position")
    
    # Summary of wildcard matches
    wildcard_matches = [op for op in operations if "Wildcard match:" in op or "Double wildcard:" in op]
    if wildcard_matches:
        print("\nWildcard matches:")
        for match in wildcard_matches:
            print(f"- {match}")
    
    return distance, operations

# Example usage
if __name__ == "__main__":
    # Example 1: "hello" vs "hello" with no wildcards
    print_match_summary("hello", "hello")
    
    # Example 2: "hello" vs "hallo" with no wildcards - expect distance of 1
    print_match_summary("hello", "hallo")
    
    # Example 3: "hello" with 3rd position (index 2) as wildcard vs "hallo" - expect distance of 0
    print_match_summary("hello", "hallo", wildcard_offsets_str1=[2])
    
    # Example 4: "hello" vs "hillo" with 2nd position (index 1) as wildcard in str2 - expect distance of 0
    print_match_summary("hello", "hillo", wildcard_offsets_str2=[1])
    
    # Example 5: Multiple wildcards in str1
    print_match_summary("hello", "haxyz", wildcard_offsets_str1=[2, 3, 4])
    
    # Example 6: Wildcards in both strings at different positions
    print_match_summary("hello", "howdy", wildcard_offsets_str1=[2], wildcard_offsets_str2=[3, 4])