ejschwartz committed on
Commit
ca8e3d9
·
1 Parent(s): a89d907
Files changed (1) hide show
  1. dist.py +136 -96
dist.py CHANGED
@@ -6,54 +6,58 @@ This module provides functions to calculate Levenshtein (edit) distance between
6
  two sequences (strings or bytes) with support for wildcard positions.
7
  """
8
 
 
9
  def ensure_same_type(seq1, seq2):
10
  """
11
  Ensure both sequences are the same type (both str or both bytes).
12
-
13
  Args:
14
  seq1: First sequence (str or bytes)
15
  seq2: Second sequence (str or bytes)
16
-
17
  Returns:
18
  Tuple of (seq1, seq2) with consistent types
19
  """
20
  if isinstance(seq1, str) and isinstance(seq2, bytes):
21
- seq2 = seq2.decode('utf-8', errors='replace')
22
  elif isinstance(seq1, bytes) and isinstance(seq2, str):
23
- seq2 = seq2.encode('utf-8', errors='replace')
24
  return seq1, seq2
25
 
 
26
  def to_bytes(s):
27
  """
28
  Convert a sequence to bytes if it's a string, otherwise return as is.
29
-
30
  Args:
31
  s: The sequence to convert (str or bytes)
32
-
33
  Returns:
34
  bytes: The input converted to bytes if it was a string
35
  """
36
- return s.encode('utf-8', errors='replace') if isinstance(s, str) else s
 
37
 
38
  def to_str(s):
39
  """
40
  Convert a sequence to string if it's bytes, otherwise return as is.
41
-
42
  Args:
43
  s: The sequence to convert (str or bytes)
44
-
45
  Returns:
46
  str: The input converted to string if it was bytes
47
  """
48
- return s.decode('utf-8', errors='replace') if isinstance(s, bytes) else s
 
49
 
50
  def get_element_repr(element):
51
  """
52
  Get a human-readable representation of a sequence element.
53
-
54
  Args:
55
  element: A single element from a sequence (byte or character)
56
-
57
  Returns:
58
  str: A printable representation of the element
59
  """
@@ -63,18 +67,21 @@ def get_element_repr(element):
63
  return f"0x{element:02x}"
64
  return repr(element) # For str objects
65
 
66
- def levenshtein_with_wildcard(seq1, seq2, wildcard_offsets_seq1=None, wildcard_offsets_seq2=None, verbose=False):
 
 
 
67
  """
68
  Calculate the Levenshtein distance between two sequences with support for wildcards.
69
  Works with both strings and bytes.
70
-
71
  Args:
72
  seq1: First sequence (str or bytes)
73
  seq2: Second sequence (str or bytes)
74
  wildcard_offsets_seq1 (iterable, optional): Indices in seq1 that are wildcards. Defaults to None.
75
  wildcard_offsets_seq2 (iterable, optional): Indices in seq2 that are wildcards. Defaults to None.
76
  verbose (bool, optional): If True, returns additional information about operations. Defaults to False.
77
-
78
  Returns:
79
  int: The Levenshtein distance between the two sequences.
80
  list: If verbose=True, also returns a list of operations performed.
@@ -82,78 +89,81 @@ def levenshtein_with_wildcard(seq1, seq2, wildcard_offsets_seq1=None, wildcard_o
82
  # Initialize empty sets if None
83
  wildcard_offsets_seq1 = set(wildcard_offsets_seq1 or [])
84
  wildcard_offsets_seq2 = set(wildcard_offsets_seq2 or [])
85
-
86
  m, n = len(seq1), len(seq2)
87
-
88
  # Create a matrix of size (m+1) x (n+1)
89
  dp = [[0] * (n + 1) for _ in range(m + 1)]
90
-
91
  # Initialize the first row and column
92
  for i in range(m + 1):
93
  dp[i][0] = i
94
-
95
  for j in range(n + 1):
96
  dp[0][j] = j
97
-
98
  # Fill the dp matrix
99
  for i in range(1, m + 1):
100
  for j in range(1, n + 1):
101
  # Check if either position is a wildcard
102
  is_seq1_wildcard = (i - 1) in wildcard_offsets_seq1
103
  is_seq2_wildcard = (j - 1) in wildcard_offsets_seq2
104
-
105
  # If either position is a wildcard, treat it as a match (cost = 0)
106
  if is_seq1_wildcard or is_seq2_wildcard:
107
  dp[i][j] = dp[i - 1][j - 1] # No cost for wildcard matches
108
  else:
109
  cost = 0 if seq1[i - 1] == seq2[j - 1] else 1
110
  dp[i][j] = min(
111
- dp[i - 1][j] + 1, # deletion
112
- dp[i][j - 1] + 1, # insertion
113
- dp[i - 1][j - 1] + cost # substitution
114
  )
115
-
116
  if verbose:
117
- operations = explain_match(seq1, seq2, dp, wildcard_offsets_seq1, wildcard_offsets_seq2)
 
 
118
  return dp[m][n], operations
119
-
120
  return dp[m][n]
121
 
 
122
  def explain_match(seq1, seq2, dp, wildcard_offsets_seq1, wildcard_offsets_seq2):
123
  """
124
  Traces the optimal alignment path and explains each step of the matching process.
125
-
126
  Args:
127
  seq1: First sequence (str or bytes)
128
  seq2: Second sequence (str or bytes)
129
  dp (list): The dynamic programming matrix.
130
  wildcard_offsets_seq1 (set): Indices in seq1 that are wildcards.
131
  wildcard_offsets_seq2 (set): Indices in seq2 that are wildcards.
132
-
133
  Returns:
134
  list: A list of explanation strings for each operation performed.
135
  """
136
  m, n = len(seq1), len(seq2)
137
  operations = []
138
-
139
  # Find the optimal path
140
  i, j = m, n
141
  path = []
142
-
143
  while i > 0 or j > 0:
144
  path.append((i, j))
145
-
146
  if i == 0:
147
  j -= 1
148
  elif j == 0:
149
  i -= 1
150
  else:
151
- substitution_cost = dp[i-1][j-1]
152
- deletion_cost = dp[i-1][j]
153
- insertion_cost = dp[i][j-1]
154
-
155
  min_cost = min(substitution_cost, deletion_cost, insertion_cost)
156
-
157
  if min_cost == substitution_cost:
158
  i -= 1
159
  j -= 1
@@ -161,130 +171,153 @@ def explain_match(seq1, seq2, dp, wildcard_offsets_seq1, wildcard_offsets_seq2):
161
  i -= 1
162
  else:
163
  j -= 1
164
-
165
  path.append((0, 0))
166
  path.reverse()
167
-
168
  # Generate explanations for each step
169
  for idx in range(1, len(path)):
170
- prev_i, prev_j = path[idx-1]
171
  curr_i, curr_j = path[idx]
172
-
173
  # Diagonal move (match or substitution)
174
  if curr_i > prev_i and curr_j > prev_j:
175
- char1_idx = curr_i-1
176
- char2_idx = curr_j-1
177
  char1 = seq1[char1_idx]
178
  char2 = seq2[char2_idx]
179
-
180
  is_seq1_wildcard = char1_idx in wildcard_offsets_seq1
181
  is_seq2_wildcard = char2_idx in wildcard_offsets_seq2
182
-
183
  char1_repr = get_element_repr(char1)
184
  char2_repr = get_element_repr(char2)
185
-
186
  if is_seq1_wildcard and is_seq2_wildcard:
187
- operations.append(f"Double wildcard: Position {char1_idx} in seq1 and position {char2_idx} in seq2 are both wildcards")
 
 
188
  elif is_seq1_wildcard:
189
- operations.append(f"Wildcard match: Position {char1_idx} in seq1 is a wildcard, matches {char2_repr} at position {char2_idx} in seq2")
 
 
190
  elif is_seq2_wildcard:
191
- operations.append(f"Wildcard match: Position {char2_idx} in seq2 is a wildcard, matches {char1_repr} at position {char1_idx} in seq1")
 
 
192
  elif char1 == char2:
193
- operations.append(f"Match: {char1_repr} at position {char1_idx} matches {char2_repr} at position {char2_idx}")
 
 
194
  else:
195
- operations.append(f"Substitution: Replace {char1_repr} at position {char1_idx} with {char2_repr} at position {char2_idx}")
196
-
 
 
197
  # Horizontal move (insertion)
198
  elif curr_i == prev_i and curr_j > prev_j:
199
- char_idx = curr_j-1
200
  char_repr = get_element_repr(seq2[char_idx])
201
- operations.append(f"Insertion: Insert {char_repr} at position {char_idx} in seq2")
202
-
 
 
203
  # Vertical move (deletion)
204
  elif curr_i > prev_i and curr_j == prev_j:
205
- char_idx = curr_i-1
206
  char_repr = get_element_repr(seq1[char_idx])
207
- operations.append(f"Deletion: Delete {char_repr} at position {char_idx} in seq1")
208
-
 
 
209
  return operations
210
 
 
211
  def create_gap_element(sequence):
212
  """
213
  Create a gap element compatible with the sequence type.
214
-
215
  Args:
216
  sequence: The sequence (str or bytes) to create a gap for
217
-
218
  Returns:
219
  The appropriate gap element for the sequence type
220
  """
221
  if isinstance(sequence, bytes):
222
- return b'-'
223
  else:
224
- return '-'
225
 
226
- def print_match_summary(seq1, seq2, wildcard_offsets_seq1=None, wildcard_offsets_seq2=None):
 
 
 
227
  """
228
  Prints a summary of the match between two sequences, highlighting wildcards by their offsets.
229
  Works with both strings and bytes.
230
-
231
  Args:
232
  seq1: First sequence (str or bytes)
233
  seq2: Second sequence (str or bytes)
234
  wildcard_offsets_seq1 (iterable, optional): Indices in seq1 that are wildcards. Defaults to None.
235
  wildcard_offsets_seq2 (iterable, optional): Indices in seq2 that are wildcards. Defaults to None.
236
-
237
  Returns:
238
  tuple: (distance, operations) The edit distance and list of operations
239
  """
240
  # Ensure sequences are of the same type for comparison
241
  seq1, seq2 = ensure_same_type(seq1, seq2)
242
-
243
  # Initialize empty sets if None
244
  wildcard_offsets_seq1 = set(wildcard_offsets_seq1 or [])
245
  wildcard_offsets_seq2 = set(wildcard_offsets_seq2 or [])
246
-
247
  distance, operations = levenshtein_with_wildcard(
248
  seq1, seq2, wildcard_offsets_seq1, wildcard_offsets_seq2, verbose=True
249
  )
250
-
251
  # For reporting, convert to a human-readable representation if needed
252
  seq1_repr = repr(seq1)
253
  seq2_repr = repr(seq2)
254
-
255
  print(f"Comparing {seq1_repr} and {seq2_repr}")
256
  print(f"Wildcards in seq1: {sorted(wildcard_offsets_seq1)}")
257
  print(f"Wildcards in seq2: {sorted(wildcard_offsets_seq2)}")
258
  print(f"Edit distance: {distance}")
259
  print("\nMatch process:")
260
-
261
  for i, op in enumerate(operations):
262
  print(f"Step {i+1}: {op}")
263
-
264
  # Visual representation of the alignment
265
  i, j = 0, 0
266
  is_bytes = isinstance(seq1, bytes)
267
-
268
  if is_bytes:
269
  aligned_seq1 = bytearray()
270
  aligned_seq2 = bytearray()
271
- gap = ord('-')
272
  else:
273
  aligned_seq1 = ""
274
  aligned_seq2 = ""
275
- gap = '-'
276
-
277
  match_indicators = ""
278
-
279
  for op in operations:
280
- if "Match:" in op or "Substitution:" in op or "Wildcard match:" in op or "Double wildcard:" in op:
 
 
 
 
 
281
  if is_bytes:
282
  aligned_seq1.append(seq1[i])
283
  aligned_seq2.append(seq2[j])
284
  else:
285
  aligned_seq1 += seq1[i]
286
  aligned_seq2 += seq2[j]
287
-
288
  # Determine match indicator
289
  if "Wildcard match:" in op or "Double wildcard:" in op:
290
  match_indicators += "*" # Wildcard match
@@ -292,7 +325,7 @@ def print_match_summary(seq1, seq2, wildcard_offsets_seq1=None, wildcard_offsets
292
  match_indicators += "|" # Exact match
293
  else:
294
  match_indicators += "X" # Substitution
295
-
296
  i += 1
297
  j += 1
298
  elif "Insertion:" in op:
@@ -302,7 +335,7 @@ def print_match_summary(seq1, seq2, wildcard_offsets_seq1=None, wildcard_offsets
302
  else:
303
  aligned_seq1 += gap
304
  aligned_seq2 += seq2[j]
305
-
306
  match_indicators += " "
307
  j += 1
308
  elif "Deletion:" in op:
@@ -312,57 +345,64 @@ def print_match_summary(seq1, seq2, wildcard_offsets_seq1=None, wildcard_offsets
312
  else:
313
  aligned_seq1 += seq1[i]
314
  aligned_seq2 += gap
315
-
316
  match_indicators += " "
317
  i += 1
318
-
319
  print("\nAlignment:")
320
  if is_bytes:
321
  aligned_seq1 = bytes(aligned_seq1)
322
  aligned_seq2 = bytes(aligned_seq2)
323
-
324
  print(repr(aligned_seq1))
325
  print(match_indicators)
326
  print(repr(aligned_seq2))
327
  print("\nLegend:")
328
- print("| = exact match, * = wildcard match, X = substitution, - = gap (insertion/deletion)")
329
-
 
 
330
  # Summary of wildcard matches
331
- wildcard_matches = [op for op in operations if "Wildcard match:" in op or "Double wildcard:" in op]
 
 
332
  if wildcard_matches:
333
  print("\nWildcard matches:")
334
  for match in wildcard_matches:
335
  print(f"- {match}")
336
-
337
  return distance, operations
338
 
 
339
  # Example usage
340
  if __name__ == "__main__":
341
  print("\n--- String Examples ---")
342
  # Example 1: "hello" vs "hello" with no wildcards
343
  print_match_summary("hello", "hello")
344
-
345
  # Example 2: "hello" vs "hallo" with no wildcards - expect distance of 1
346
  print_match_summary("hello", "hallo")
347
-
348
  # Example 3: "hello" with 3rd position (index 2) as wildcard vs "hallo" - expect distance of 0
349
  print_match_summary("hello", "hallo", wildcard_offsets_seq1=[2])
350
-
351
  # Example 4: "hello" vs "hillo" with 2nd position (index 1) as wildcard in seq2 - expect distance of 0
352
  print_match_summary("hello", "hillo", wildcard_offsets_seq2=[1])
353
-
354
  # Example 5: Multiple wildcards in seq1
355
  print_match_summary("hello", "haxyz", wildcard_offsets_seq1=[2, 3, 4])
356
-
357
  print("\n--- Bytes Examples ---")
358
  # Example 6: Working with bytes
359
  print_match_summary(b"hello", b"hallo")
360
-
361
  # Example 7: Working with bytes with wildcard
362
  print_match_summary(b"hello", b"hallo", wildcard_offsets_seq1=[2])
363
-
364
  # Example 8: Mixed types (bytes and string)
365
  print_match_summary(b"hello", "hallo", wildcard_offsets_seq1=[2])
366
-
367
  # Example 9: Non-printable bytes example
368
- print_match_summary(b"\x01\x02\x03\x04", b"\x01\x05\x03\x04", wildcard_offsets_seq1=[1])
 
 
 
6
  two sequences (strings or bytes) with support for wildcard positions.
7
  """
8
 
9
+
10
  def ensure_same_type(seq1, seq2):
11
  """
12
  Ensure both sequences are the same type (both str or both bytes).
13
+
14
  Args:
15
  seq1: First sequence (str or bytes)
16
  seq2: Second sequence (str or bytes)
17
+
18
  Returns:
19
  Tuple of (seq1, seq2) with consistent types
20
  """
21
  if isinstance(seq1, str) and isinstance(seq2, bytes):
22
+ seq2 = seq2.decode("utf-8", errors="replace")
23
  elif isinstance(seq1, bytes) and isinstance(seq2, str):
24
+ seq2 = seq2.encode("utf-8", errors="replace")
25
  return seq1, seq2
26
 
27
+
28
  def to_bytes(s):
29
  """
30
  Convert a sequence to bytes if it's a string, otherwise return as is.
31
+
32
  Args:
33
  s: The sequence to convert (str or bytes)
34
+
35
  Returns:
36
  bytes: The input converted to bytes if it was a string
37
  """
38
+ return s.encode("utf-8", errors="replace") if isinstance(s, str) else s
39
+
40
 
41
  def to_str(s):
42
  """
43
  Convert a sequence to string if it's bytes, otherwise return as is.
44
+
45
  Args:
46
  s: The sequence to convert (str or bytes)
47
+
48
  Returns:
49
  str: The input converted to string if it was bytes
50
  """
51
+ return s.decode("utf-8", errors="replace") if isinstance(s, bytes) else s
52
+
53
 
54
  def get_element_repr(element):
55
  """
56
  Get a human-readable representation of a sequence element.
57
+
58
  Args:
59
  element: A single element from a sequence (byte or character)
60
+
61
  Returns:
62
  str: A printable representation of the element
63
  """
 
67
  return f"0x{element:02x}"
68
  return repr(element) # For str objects
69
 
70
+
71
def levenshtein_with_wildcard(
    seq1, seq2, wildcard_offsets_seq1=None, wildcard_offsets_seq2=None, verbose=False
):
    """
    Calculate the Levenshtein distance between two sequences with support for wildcards.
    Works with both strings and bytes.

    Elements are compared with ``==``, so both sequences should be of the same
    type; no type coercion is performed here.

    Args:
        seq1: First sequence (str or bytes)
        seq2: Second sequence (str or bytes)
        wildcard_offsets_seq1 (iterable, optional): Indices in seq1 that are wildcards. Defaults to None.
        wildcard_offsets_seq2 (iterable, optional): Indices in seq2 that are wildcards. Defaults to None.
        verbose (bool, optional): If True, returns additional information about operations. Defaults to False.

    Returns:
        int: The Levenshtein distance between the two sequences.
        list: If verbose=True, also returns a list of operations performed
        (the result is then a ``(distance, operations)`` tuple).
    """
    # Normalise the offsets to sets (None/empty -> empty set) for O(1) lookups.
    wild1 = set(wildcard_offsets_seq1 or [])
    wild2 = set(wildcard_offsets_seq2 or [])

    m, n = len(seq1), len(seq2)

    # dp[i][j] holds the distance between seq1[:i] and seq2[:j]. The full
    # (m+1) x (n+1) matrix is kept because explain_match walks it backwards.
    dp = [[0] * (n + 1) for _ in range(m + 1)]
    dp[0] = list(range(n + 1))  # empty seq1 prefix: j insertions
    for i in range(1, m + 1):
        dp[i][0] = i  # empty seq2 prefix: i deletions

    for i in range(1, m + 1):
        row, above = dp[i], dp[i - 1]
        # Whether seq1[i - 1] is a wildcard does not depend on j; test it once.
        seq1_is_wildcard = (i - 1) in wild1
        for j in range(1, n + 1):
            if seq1_is_wildcard or (j - 1) in wild2:
                # A wildcard on either side pairs the two elements at no cost.
                row[j] = above[j - 1]
            else:
                cost = 0 if seq1[i - 1] == seq2[j - 1] else 1
                row[j] = min(
                    above[j] + 1,  # deletion
                    row[j - 1] + 1,  # insertion
                    above[j - 1] + cost,  # substitution
                )

    if verbose:
        operations = explain_match(seq1, seq2, dp, wild1, wild2)
        return dp[m][n], operations

    return dp[m][n]
130
 
131
+
132
  def explain_match(seq1, seq2, dp, wildcard_offsets_seq1, wildcard_offsets_seq2):
133
  """
134
  Traces the optimal alignment path and explains each step of the matching process.
135
+
136
  Args:
137
  seq1: First sequence (str or bytes)
138
  seq2: Second sequence (str or bytes)
139
  dp (list): The dynamic programming matrix.
140
  wildcard_offsets_seq1 (set): Indices in seq1 that are wildcards.
141
  wildcard_offsets_seq2 (set): Indices in seq2 that are wildcards.
142
+
143
  Returns:
144
  list: A list of explanation strings for each operation performed.
145
  """
146
  m, n = len(seq1), len(seq2)
147
  operations = []
148
+
149
  # Find the optimal path
150
  i, j = m, n
151
  path = []
152
+
153
  while i > 0 or j > 0:
154
  path.append((i, j))
155
+
156
  if i == 0:
157
  j -= 1
158
  elif j == 0:
159
  i -= 1
160
  else:
161
+ substitution_cost = dp[i - 1][j - 1]
162
+ deletion_cost = dp[i - 1][j]
163
+ insertion_cost = dp[i][j - 1]
164
+
165
  min_cost = min(substitution_cost, deletion_cost, insertion_cost)
166
+
167
  if min_cost == substitution_cost:
168
  i -= 1
169
  j -= 1
 
171
  i -= 1
172
  else:
173
  j -= 1
174
+
175
  path.append((0, 0))
176
  path.reverse()
177
+
178
  # Generate explanations for each step
179
  for idx in range(1, len(path)):
180
+ prev_i, prev_j = path[idx - 1]
181
  curr_i, curr_j = path[idx]
182
+
183
  # Diagonal move (match or substitution)
184
  if curr_i > prev_i and curr_j > prev_j:
185
+ char1_idx = curr_i - 1
186
+ char2_idx = curr_j - 1
187
  char1 = seq1[char1_idx]
188
  char2 = seq2[char2_idx]
189
+
190
  is_seq1_wildcard = char1_idx in wildcard_offsets_seq1
191
  is_seq2_wildcard = char2_idx in wildcard_offsets_seq2
192
+
193
  char1_repr = get_element_repr(char1)
194
  char2_repr = get_element_repr(char2)
195
+
196
  if is_seq1_wildcard and is_seq2_wildcard:
197
+ operations.append(
198
+ f"Double wildcard: Position {char1_idx} in seq1 and position {char2_idx} in seq2 are both wildcards"
199
+ )
200
  elif is_seq1_wildcard:
201
+ operations.append(
202
+ f"Wildcard match: Position {char1_idx} in seq1 is a wildcard, matches {char2_repr} at position {char2_idx} in seq2"
203
+ )
204
  elif is_seq2_wildcard:
205
+ operations.append(
206
+ f"Wildcard match: Position {char2_idx} in seq2 is a wildcard, matches {char1_repr} at position {char1_idx} in seq1"
207
+ )
208
  elif char1 == char2:
209
+ operations.append(
210
+ f"Match: {char1_repr} at position {char1_idx} matches {char2_repr} at position {char2_idx}"
211
+ )
212
  else:
213
+ operations.append(
214
+ f"Substitution: Replace {char1_repr} at position {char1_idx} with {char2_repr} at position {char2_idx}"
215
+ )
216
+
217
  # Horizontal move (insertion)
218
  elif curr_i == prev_i and curr_j > prev_j:
219
+ char_idx = curr_j - 1
220
  char_repr = get_element_repr(seq2[char_idx])
221
+ operations.append(
222
+ f"Insertion: Insert {char_repr} at position {char_idx} in seq2"
223
+ )
224
+
225
  # Vertical move (deletion)
226
  elif curr_i > prev_i and curr_j == prev_j:
227
+ char_idx = curr_i - 1
228
  char_repr = get_element_repr(seq1[char_idx])
229
+ operations.append(
230
+ f"Deletion: Delete {char_repr} at position {char_idx} in seq1"
231
+ )
232
+
233
  return operations
234
 
235
+
236
  def create_gap_element(sequence):
237
  """
238
  Create a gap element compatible with the sequence type.
239
+
240
  Args:
241
  sequence: The sequence (str or bytes) to create a gap for
242
+
243
  Returns:
244
  The appropriate gap element for the sequence type
245
  """
246
  if isinstance(sequence, bytes):
247
+ return b"-"
248
  else:
249
+ return "-"
250
 
251
+
252
+ def print_match_summary(
253
+ seq1, seq2, wildcard_offsets_seq1=None, wildcard_offsets_seq2=None
254
+ ):
255
  """
256
  Prints a summary of the match between two sequences, highlighting wildcards by their offsets.
257
  Works with both strings and bytes.
258
+
259
  Args:
260
  seq1: First sequence (str or bytes)
261
  seq2: Second sequence (str or bytes)
262
  wildcard_offsets_seq1 (iterable, optional): Indices in seq1 that are wildcards. Defaults to None.
263
  wildcard_offsets_seq2 (iterable, optional): Indices in seq2 that are wildcards. Defaults to None.
264
+
265
  Returns:
266
  tuple: (distance, operations) The edit distance and list of operations
267
  """
268
  # Ensure sequences are of the same type for comparison
269
  seq1, seq2 = ensure_same_type(seq1, seq2)
270
+
271
  # Initialize empty sets if None
272
  wildcard_offsets_seq1 = set(wildcard_offsets_seq1 or [])
273
  wildcard_offsets_seq2 = set(wildcard_offsets_seq2 or [])
274
+
275
  distance, operations = levenshtein_with_wildcard(
276
  seq1, seq2, wildcard_offsets_seq1, wildcard_offsets_seq2, verbose=True
277
  )
278
+
279
  # For reporting, convert to a human-readable representation if needed
280
  seq1_repr = repr(seq1)
281
  seq2_repr = repr(seq2)
282
+
283
  print(f"Comparing {seq1_repr} and {seq2_repr}")
284
  print(f"Wildcards in seq1: {sorted(wildcard_offsets_seq1)}")
285
  print(f"Wildcards in seq2: {sorted(wildcard_offsets_seq2)}")
286
  print(f"Edit distance: {distance}")
287
  print("\nMatch process:")
288
+
289
  for i, op in enumerate(operations):
290
  print(f"Step {i+1}: {op}")
291
+
292
  # Visual representation of the alignment
293
  i, j = 0, 0
294
  is_bytes = isinstance(seq1, bytes)
295
+
296
  if is_bytes:
297
  aligned_seq1 = bytearray()
298
  aligned_seq2 = bytearray()
299
+ gap = ord("-")
300
  else:
301
  aligned_seq1 = ""
302
  aligned_seq2 = ""
303
+ gap = "-"
304
+
305
  match_indicators = ""
306
+
307
  for op in operations:
308
+ if (
309
+ "Match:" in op
310
+ or "Substitution:" in op
311
+ or "Wildcard match:" in op
312
+ or "Double wildcard:" in op
313
+ ):
314
  if is_bytes:
315
  aligned_seq1.append(seq1[i])
316
  aligned_seq2.append(seq2[j])
317
  else:
318
  aligned_seq1 += seq1[i]
319
  aligned_seq2 += seq2[j]
320
+
321
  # Determine match indicator
322
  if "Wildcard match:" in op or "Double wildcard:" in op:
323
  match_indicators += "*" # Wildcard match
 
325
  match_indicators += "|" # Exact match
326
  else:
327
  match_indicators += "X" # Substitution
328
+
329
  i += 1
330
  j += 1
331
  elif "Insertion:" in op:
 
335
  else:
336
  aligned_seq1 += gap
337
  aligned_seq2 += seq2[j]
338
+
339
  match_indicators += " "
340
  j += 1
341
  elif "Deletion:" in op:
 
345
  else:
346
  aligned_seq1 += seq1[i]
347
  aligned_seq2 += gap
348
+
349
  match_indicators += " "
350
  i += 1
351
+
352
  print("\nAlignment:")
353
  if is_bytes:
354
  aligned_seq1 = bytes(aligned_seq1)
355
  aligned_seq2 = bytes(aligned_seq2)
356
+
357
  print(repr(aligned_seq1))
358
  print(match_indicators)
359
  print(repr(aligned_seq2))
360
  print("\nLegend:")
361
+ print(
362
+ "| = exact match, * = wildcard match, X = substitution, - = gap (insertion/deletion)"
363
+ )
364
+
365
  # Summary of wildcard matches
366
+ wildcard_matches = [
367
+ op for op in operations if "Wildcard match:" in op or "Double wildcard:" in op
368
+ ]
369
  if wildcard_matches:
370
  print("\nWildcard matches:")
371
  for match in wildcard_matches:
372
  print(f"- {match}")
373
+
374
  return distance, operations
375
 
376
+
377
  # Example usage
378
  if __name__ == "__main__":
379
  print("\n--- String Examples ---")
380
  # Example 1: "hello" vs "hello" with no wildcards
381
  print_match_summary("hello", "hello")
382
+
383
  # Example 2: "hello" vs "hallo" with no wildcards - expect distance of 1
384
  print_match_summary("hello", "hallo")
385
+
386
  # Example 3: "hello" with 3rd position (index 2) as wildcard vs "hallo" - expect distance of 0
387
  print_match_summary("hello", "hallo", wildcard_offsets_seq1=[2])
388
+
389
  # Example 4: "hello" vs "hillo" with 2nd position (index 1) as wildcard in seq2 - expect distance of 0
390
  print_match_summary("hello", "hillo", wildcard_offsets_seq2=[1])
391
+
392
  # Example 5: Multiple wildcards in seq1
393
  print_match_summary("hello", "haxyz", wildcard_offsets_seq1=[2, 3, 4])
394
+
395
  print("\n--- Bytes Examples ---")
396
  # Example 6: Working with bytes
397
  print_match_summary(b"hello", b"hallo")
398
+
399
  # Example 7: Working with bytes with wildcard
400
  print_match_summary(b"hello", b"hallo", wildcard_offsets_seq1=[2])
401
+
402
  # Example 8: Mixed types (bytes and string)
403
  print_match_summary(b"hello", "hallo", wildcard_offsets_seq1=[2])
404
+
405
  # Example 9: Non-printable bytes example
406
+ print_match_summary(
407
+ b"\x01\x02\x03\x04", b"\x01\x05\x03\x04", wildcard_offsets_seq1=[1]
408
+ )