Spaces:
Sleeping
Sleeping
File size: 9,600 Bytes
8427d96 52b18ab 8427d96 52b18ab 8427d96 52b18ab 8427d96 52b18ab 8427d96 52b18ab 8427d96 52b18ab 8427d96 52b18ab 8427d96 52b18ab 8427d96 52b18ab 8427d96 52b18ab 8427d96 52b18ab 8427d96 52b18ab 8427d96 52b18ab 8427d96 52b18ab 784e69f 8427d96 52b18ab 8427d96 52b18ab 8427d96 52b18ab 8427d96 15f3753 8427d96 3782fc1 8427d96 3782fc1 8427d96 3782fc1 52b18ab 8427d96 52b18ab 8427d96 52b18ab 3782fc1 52b18ab 8427d96 8e74111 8427d96 8e74111 8427d96 52b18ab 8427d96 52b18ab 8427d96 784e69f 8427d96 3782fc1 8427d96 52b18ab 8427d96 3782fc1 8427d96 784e69f 52b18ab 8427d96 52b18ab 8427d96 52b18ab 8427d96 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 |
def levenshtein_with_wildcard(str1, str2, wildcard_offsets_str1=None, wildcard_offsets_str2=None, verbose=False):
"""
Calculate the Levenshtein distance between two strings with support for wildcards at specific positions.
Args:
str1 (str): The first string.
str2 (str): The second string.
wildcard_offsets_str1 (iterable, optional): Indices in str1 that are wildcards. Defaults to None.
wildcard_offsets_str2 (iterable, optional): Indices in str2 that are wildcards. Defaults to None.
verbose (bool, optional): If True, prints the DP matrix and explains the process.
Returns:
int: The Levenshtein distance between the two strings.
list: If verbose=True, also returns a list of operations performed.
"""
# Initialize empty sets if None
wildcard_offsets_str1 = set(wildcard_offsets_str1 or [])
wildcard_offsets_str2 = set(wildcard_offsets_str2 or [])
m, n = len(str1), len(str2)
# Create a matrix of size (m+1) x (n+1)
dp = [[0] * (n + 1) for _ in range(m + 1)]
# Initialize the first row and column
for i in range(m + 1):
dp[i][0] = i
for j in range(n + 1):
dp[0][j] = j
# Fill the dp matrix
for i in range(1, m + 1):
for j in range(1, n + 1):
# Check if either position is a wildcard
is_str1_wildcard = (i - 1) in wildcard_offsets_str1
is_str2_wildcard = (j - 1) in wildcard_offsets_str2
# If either position is a wildcard, treat it as a match (cost = 0)
if is_str1_wildcard or is_str2_wildcard:
dp[i][j] = dp[i - 1][j - 1] # No cost for wildcard matches
else:
cost = 0 if str1[i - 1] == str2[j - 1] else 1
dp[i][j] = min(
dp[i - 1][j] + 1, # deletion
dp[i][j - 1] + 1, # insertion
dp[i - 1][j - 1] + cost # substitution
)
if verbose:
operations = explain_match(str1, str2, dp, wildcard_offsets_str1, wildcard_offsets_str2)
return dp[m][n], operations
return dp[m][n]
def explain_match(str1, str2, dp, wildcard_offsets_str1, wildcard_offsets_str2):
"""
Traces the optimal alignment path and explains each step of the matching process.
Args:
str1 (str): The first string.
str2 (str): The second string.
dp (list): The dynamic programming matrix.
wildcard_offsets_str1 (set): Indices in str1 that are wildcards.
wildcard_offsets_str2 (set): Indices in str2 that are wildcards.
Returns:
list: A list of explanation strings for each operation performed.
"""
m, n = len(str1), len(str2)
operations = []
# Find the optimal path
i, j = m, n
path = []
while i > 0 or j > 0:
path.append((i, j))
if i == 0:
j -= 1
elif j == 0:
i -= 1
else:
substitution_cost = dp[i-1][j-1]
deletion_cost = dp[i-1][j]
insertion_cost = dp[i][j-1]
min_cost = min(substitution_cost, deletion_cost, insertion_cost)
if min_cost == substitution_cost:
i -= 1
j -= 1
elif min_cost == deletion_cost:
i -= 1
else:
j -= 1
path.append((0, 0))
path.reverse()
# Generate explanations for each step
for idx in range(1, len(path)):
prev_i, prev_j = path[idx-1]
curr_i, curr_j = path[idx]
# Diagonal move (match or substitution)
if curr_i > prev_i and curr_j > prev_j:
char1_idx = curr_i-1
char2_idx = curr_j-1
char1 = str1[char1_idx]
char2 = str2[char2_idx]
is_str1_wildcard = char1_idx in wildcard_offsets_str1
is_str2_wildcard = char2_idx in wildcard_offsets_str2
if is_str1_wildcard and is_str2_wildcard:
operations.append(f"Double wildcard: Position {char1_idx} in str1 and position {char2_idx} in str2 are both wildcards")
elif is_str1_wildcard:
operations.append(f"Wildcard match: Position {char1_idx} in str1 is a wildcard, matches '{char2}' at position {char2_idx} in str2")
elif is_str2_wildcard:
operations.append(f"Wildcard match: Position {char2_idx} in str2 is a wildcard, matches '{char1}' at position {char1_idx} in str1")
elif char1 == char2:
operations.append(f"Match: '{char1}' at position {char1_idx} matches '{char2}' at position {char2_idx}")
else:
operations.append(f"Substitution: Replace '{char1}' at position {char1_idx} with '{char2}' at position {char2_idx}")
# Horizontal move (insertion)
elif curr_i == prev_i and curr_j > prev_j:
char_idx = curr_j-1
operations.append(f"Insertion: Insert '{str2[char_idx]}' at position {char_idx} in str2")
# Vertical move (deletion)
elif curr_i > prev_i and curr_j == prev_j:
char_idx = curr_i-1
operations.append(f"Deletion: Delete '{str1[char_idx]}' at position {char_idx} in str1")
return operations
def to_bytes(s):
return s.encode('utf-8') if isinstance(s, str) else s
def print_match_summary(str1, str2, wildcard_offsets_str1=None, wildcard_offsets_str2=None):
"""
Prints a summary of the match between two strings, highlighting wildcards by their offsets.
Args:
str1 (str): The first string.
str2 (str): The second string.
wildcard_offsets_str1 (iterable, optional): Indices in str1 that are wildcards. Defaults to None.
wildcard_offsets_str2 (iterable, optional): Indices in str2 that are wildcards. Defaults to None.
"""
# Initialize empty lists if None
wildcard_offsets_str1 = set(wildcard_offsets_str1 or [])
wildcard_offsets_str2 = set(wildcard_offsets_str2 or [])
distance, operations = levenshtein_with_wildcard(
str1, str2, wildcard_offsets_str1, wildcard_offsets_str2, verbose=True
)
# Create visual representations of the strings with wildcard markers
str1_visual = type(str1)()
for i, char in enumerate(str1):
str1_visual += bytes([char])
str2_visual = type(str1)()
for i, char in enumerate(str2):
str2_visual += bytes([char])
print(f"Comparing '{str1_visual}' and '{str2_visual}'")
print(f"Wildcards in str1: {sorted(wildcard_offsets_str1)}")
print(f"Wildcards in str2: {sorted(wildcard_offsets_str2)}")
print(f"Edit distance: {distance}")
print("\nMatch process:")
for i, op in enumerate(operations):
print(f"Step {i+1}: {op}")
# Visual representation of the alignment
i, j = 0, 0
aligned_str1 = type(str1)()
aligned_str2 = type(str1)()
match_indicators = ""
for op in operations:
if "Match:" in op or "Substitution:" in op or "Wildcard match:" in op or "Double wildcard:" in op:
aligned_str1 += str1[i:i+1]
aligned_str2 += str2[j:j+1]
# Determine match indicator
if "Wildcard match:" in op or "Double wildcard:" in op:
match_indicators += "*" # Wildcard match
elif "Match:" in op:
match_indicators += "|" # Exact match
else:
match_indicators += "X" # Substitution
i += 1
j += 1
elif "Insertion:" in op:
aligned_str1 += to_bytes("-")
aligned_str2 += str2[j:j+1]
match_indicators += " "
j += 1
elif "Deletion:" in op:
aligned_str1 += str1[i:i+1]
aligned_str2 += to_bytes("-")
match_indicators += " "
i += 1
print("\nAlignment:")
print(aligned_str1)
print(match_indicators)
print(aligned_str2)
print("\nLegend:")
print("| = exact match, * = wildcard match, X = substitution, - = gap (insertion/deletion), [c] = wildcard position")
# Summary of wildcard matches
wildcard_matches = [op for op in operations if "Wildcard match:" in op or "Double wildcard:" in op]
if wildcard_matches:
print("\nWildcard matches:")
for match in wildcard_matches:
print(f"- {match}")
return distance, operations
# Example usage
if __name__ == "__main__":
# Example 1: "hello" vs "hello" with no wildcards
print_match_summary("hello", "hello")
# Example 2: "hello" vs "hallo" with no wildcards - expect distance of 1
print_match_summary("hello", "hallo")
# Example 3: "hello" with 3rd position (index 2) as wildcard vs "hallo" - expect distance of 0
print_match_summary("hello", "hallo", wildcard_offsets_str1=[2])
# Example 4: "hello" vs "hillo" with 2nd position (index 1) as wildcard in str2 - expect distance of 0
print_match_summary("hello", "hillo", wildcard_offsets_str2=[1])
# Example 5: Multiple wildcards in str1
print_match_summary("hello", "haxyz", wildcard_offsets_str1=[2, 3, 4])
# Example 6: Wildcards in both strings at different positions
print_match_summary("hello", "howdy", wildcard_offsets_str1=[2], wildcard_offsets_str2=[3, 4]) |