|
def levenshtein_with_wildcard(str1, str2, wildcard_offsets_str1=None, wildcard_offsets_str2=None, verbose=False): |
|
""" |
|
Calculate the Levenshtein distance between two strings with support for wildcards at specific positions. |
|
|
|
Args: |
|
str1 (str): The first string. |
|
str2 (str): The second string. |
|
wildcard_offsets_str1 (iterable, optional): Indices in str1 that are wildcards. Defaults to None. |
|
wildcard_offsets_str2 (iterable, optional): Indices in str2 that are wildcards. Defaults to None. |
|
verbose (bool, optional): If True, prints the DP matrix and explains the process. |
|
|
|
Returns: |
|
int: The Levenshtein distance between the two strings. |
|
list: If verbose=True, also returns a list of operations performed. |
|
""" |
|
|
|
wildcard_offsets_str1 = set(wildcard_offsets_str1 or []) |
|
wildcard_offsets_str2 = set(wildcard_offsets_str2 or []) |
|
|
|
m, n = len(str1), len(str2) |
|
|
|
|
|
dp = [[0] * (n + 1) for _ in range(m + 1)] |
|
|
|
|
|
for i in range(m + 1): |
|
dp[i][0] = i |
|
|
|
for j in range(n + 1): |
|
dp[0][j] = j |
|
|
|
|
|
for i in range(1, m + 1): |
|
for j in range(1, n + 1): |
|
|
|
is_str1_wildcard = (i - 1) in wildcard_offsets_str1 |
|
is_str2_wildcard = (j - 1) in wildcard_offsets_str2 |
|
|
|
|
|
if is_str1_wildcard or is_str2_wildcard: |
|
dp[i][j] = dp[i - 1][j - 1] |
|
else: |
|
cost = 0 if str1[i - 1] == str2[j - 1] else 1 |
|
dp[i][j] = min( |
|
dp[i - 1][j] + 1, |
|
dp[i][j - 1] + 1, |
|
dp[i - 1][j - 1] + cost |
|
) |
|
|
|
if verbose: |
|
operations = explain_match(str1, str2, dp, wildcard_offsets_str1, wildcard_offsets_str2) |
|
return dp[m][n], operations |
|
|
|
return dp[m][n] |
|
|
|
def explain_match(str1, str2, dp, wildcard_offsets_str1, wildcard_offsets_str2): |
|
""" |
|
Traces the optimal alignment path and explains each step of the matching process. |
|
|
|
Args: |
|
str1 (str): The first string. |
|
str2 (str): The second string. |
|
dp (list): The dynamic programming matrix. |
|
wildcard_offsets_str1 (set): Indices in str1 that are wildcards. |
|
wildcard_offsets_str2 (set): Indices in str2 that are wildcards. |
|
|
|
Returns: |
|
list: A list of explanation strings for each operation performed. |
|
""" |
|
m, n = len(str1), len(str2) |
|
operations = [] |
|
|
|
|
|
i, j = m, n |
|
path = [] |
|
|
|
while i > 0 or j > 0: |
|
path.append((i, j)) |
|
|
|
if i == 0: |
|
j -= 1 |
|
elif j == 0: |
|
i -= 1 |
|
else: |
|
substitution_cost = dp[i-1][j-1] |
|
deletion_cost = dp[i-1][j] |
|
insertion_cost = dp[i][j-1] |
|
|
|
min_cost = min(substitution_cost, deletion_cost, insertion_cost) |
|
|
|
if min_cost == substitution_cost: |
|
i -= 1 |
|
j -= 1 |
|
elif min_cost == deletion_cost: |
|
i -= 1 |
|
else: |
|
j -= 1 |
|
|
|
path.append((0, 0)) |
|
path.reverse() |
|
|
|
|
|
for idx in range(1, len(path)): |
|
prev_i, prev_j = path[idx-1] |
|
curr_i, curr_j = path[idx] |
|
|
|
|
|
if curr_i > prev_i and curr_j > prev_j: |
|
char1_idx = curr_i-1 |
|
char2_idx = curr_j-1 |
|
char1 = str1[char1_idx] |
|
char2 = str2[char2_idx] |
|
|
|
is_str1_wildcard = char1_idx in wildcard_offsets_str1 |
|
is_str2_wildcard = char2_idx in wildcard_offsets_str2 |
|
|
|
if is_str1_wildcard and is_str2_wildcard: |
|
operations.append(f"Double wildcard: Position {char1_idx} in str1 and position {char2_idx} in str2 are both wildcards") |
|
elif is_str1_wildcard: |
|
operations.append(f"Wildcard match: Position {char1_idx} in str1 is a wildcard, matches '{char2}' at position {char2_idx} in str2") |
|
elif is_str2_wildcard: |
|
operations.append(f"Wildcard match: Position {char2_idx} in str2 is a wildcard, matches '{char1}' at position {char1_idx} in str1") |
|
elif char1 == char2: |
|
operations.append(f"Match: '{char1}' at position {char1_idx} matches '{char2}' at position {char2_idx}") |
|
else: |
|
operations.append(f"Substitution: Replace '{char1}' at position {char1_idx} with '{char2}' at position {char2_idx}") |
|
|
|
|
|
elif curr_i == prev_i and curr_j > prev_j: |
|
char_idx = curr_j-1 |
|
operations.append(f"Insertion: Insert '{str2[char_idx]}' at position {char_idx} in str2") |
|
|
|
|
|
elif curr_i > prev_i and curr_j == prev_j: |
|
char_idx = curr_i-1 |
|
operations.append(f"Deletion: Delete '{str1[char_idx]}' at position {char_idx} in str1") |
|
|
|
return operations |
|
|
|
def print_match_summary(str1, str2, wildcard_offsets_str1=None, wildcard_offsets_str2=None): |
|
""" |
|
Prints a summary of the match between two strings, highlighting wildcards by their offsets. |
|
|
|
Args: |
|
str1 (str): The first string. |
|
str2 (str): The second string. |
|
wildcard_offsets_str1 (iterable, optional): Indices in str1 that are wildcards. Defaults to None. |
|
wildcard_offsets_str2 (iterable, optional): Indices in str2 that are wildcards. Defaults to None. |
|
""" |
|
|
|
wildcard_offsets_str1 = set(wildcard_offsets_str1 or []) |
|
wildcard_offsets_str2 = set(wildcard_offsets_str2 or []) |
|
|
|
distance, operations = levenshtein_with_wildcard( |
|
str1, str2, wildcard_offsets_str1, wildcard_offsets_str2, verbose=True |
|
) |
|
|
|
|
|
str1_visual = type(str1)() |
|
for i, char in enumerate(str1): |
|
if i in wildcard_offsets_str1: |
|
str1_visual += f"[{char}]" |
|
else: |
|
str1_visual += char |
|
|
|
str2_visual = type(str2)() |
|
for i, char in enumerate(str2): |
|
if i in wildcard_offsets_str2: |
|
str2_visual += f"[{char}]" |
|
else: |
|
str2_visual += char |
|
|
|
print(f"Comparing '{str1_visual}' and '{str2_visual}'") |
|
print(f"Wildcards in str1: {sorted(wildcard_offsets_str1)}") |
|
print(f"Wildcards in str2: {sorted(wildcard_offsets_str2)}") |
|
print(f"Edit distance: {distance}") |
|
print("\nMatch process:") |
|
|
|
for i, op in enumerate(operations): |
|
print(f"Step {i+1}: {op}") |
|
|
|
|
|
i, j = 0, 0 |
|
aligned_str1 = "" |
|
aligned_str2 = "" |
|
match_indicators = "" |
|
|
|
for op in operations: |
|
if "Match:" in op or "Substitution:" in op or "Wildcard match:" in op or "Double wildcard:" in op: |
|
is_str1_wildcard = i in wildcard_offsets_str1 |
|
is_str2_wildcard = j in wildcard_offsets_str2 |
|
|
|
|
|
if is_str1_wildcard: |
|
aligned_str1 += f"[{str1[i]}]" |
|
else: |
|
aligned_str1 += str1[i] |
|
|
|
if is_str2_wildcard: |
|
aligned_str2 += f"[{str2[j]}]" |
|
else: |
|
aligned_str2 += str2[j] |
|
|
|
|
|
if "Wildcard match:" in op or "Double wildcard:" in op: |
|
match_indicators += "*" |
|
elif "Match:" in op: |
|
match_indicators += "|" |
|
else: |
|
match_indicators += "X" |
|
|
|
i += 1 |
|
j += 1 |
|
elif "Insertion:" in op: |
|
aligned_str1 += "-" |
|
|
|
if j in wildcard_offsets_str2: |
|
aligned_str2 += f"[{str2[j]}]" |
|
else: |
|
aligned_str2 += str2[j] |
|
|
|
match_indicators += " " |
|
j += 1 |
|
elif "Deletion:" in op: |
|
if i in wildcard_offsets_str1: |
|
aligned_str1 += f"[{str1[i]}]" |
|
else: |
|
aligned_str1 += str1[i] |
|
|
|
aligned_str2 += "-" |
|
match_indicators += " " |
|
i += 1 |
|
|
|
print("\nAlignment:") |
|
print(aligned_str1) |
|
print(match_indicators) |
|
print(aligned_str2) |
|
print("\nLegend:") |
|
print("| = exact match, * = wildcard match, X = substitution, - = gap (insertion/deletion), [c] = wildcard position") |
|
|
|
|
|
wildcard_matches = [op for op in operations if "Wildcard match:" in op or "Double wildcard:" in op] |
|
if wildcard_matches: |
|
print("\nWildcard matches:") |
|
for match in wildcard_matches: |
|
print(f"- {match}") |
|
|
|
return distance, operations |
|
|
|
|
|
if __name__ == "__main__": |
|
|
|
print_match_summary("hello", "hello") |
|
|
|
|
|
print_match_summary("hello", "hallo") |
|
|
|
|
|
print_match_summary("hello", "hallo", wildcard_offsets_str1=[2]) |
|
|
|
|
|
print_match_summary("hello", "hillo", wildcard_offsets_str2=[1]) |
|
|
|
|
|
print_match_summary("hello", "haxyz", wildcard_offsets_str1=[2, 3, 4]) |
|
|
|
|
|
print_match_summary("hello", "howdy", wildcard_offsets_str1=[2], wildcard_offsets_str2=[3, 4]) |