yinuozhang committed on
Commit
2c223b3
·
1 Parent(s): 5cfae69
Files changed (1) hide show
  1. app.py +157 -108
app.py CHANGED
@@ -1,17 +1,6 @@
1
  import os
2
  import gradio as gr
3
  import gradio.blocks
4
- from gradio.blocks import Blocks
5
-
6
- original_get_api_info = Blocks.get_api_info
7
-
8
- def safe_get_api_info(self):
9
- try:
10
- return original_get_api_info(self)
11
- except Exception as e:
12
- print("⚠️ Failed to generate API schema:", e)
13
- return {}
14
-
15
  import re
16
  import pandas as pd
17
  from io import StringIO
@@ -37,18 +26,19 @@ class PeptideAnalyzer:
37
  (r'C\(=O\)N[12]?', 'peptide_reverse') # Reverse peptide bond
38
  ]
39
  self.complex_residue_patterns = [
40
- # Kpg - Lys(palmitoyl-Glu-OtBu)
41
  (r'\[C[@]H\]\(CCCNC\(=O\)CCC\[C@@H\]\(NC\(=O\)CCCCCCCCCCCCCCCC\)C\(=O\)OC\(C\)\(C\)C\)', 'Kpg'),
42
  (r'CCCCCCCCCCCCCCCCC\(=O\)N\[C@H\]\(CCCC\(=O\)NCCC\[C@@H\]', 'Kpg'),
43
- (r'\[C[@]?H\]\(CSC\(c\d+ccccc\d+\)\(c\d+ccccc\d+\)c\d+ccc\(OC\)cc\d+\)', 'Cmt'),
44
  (r'CSC\(c.*?c.*?OC\)', 'Cmt'), # Core structure of Cys-Mmt group
45
  (r'COc.*?ccc\(C\(SC', 'Cmt'), # Start of Cmt in cyclic peptides
46
  (r'c2ccccc2\)c2ccccc2\)cc', 'Cmt'), # End of Cmt in cyclic peptides
47
- # Glu(OAll)
48
  (r'C=CCOC\(=O\)CC\[C@@H\]', 'Eal'),
 
49
  #(r'COc\d+ccc\(C\(SC\[C@@H\]\d+.*?\)\(c\d+ccccc\d+\)c\d+ccccc\d+\)cc\d+', 'Cmt-cyclic'),
50
 
51
- # Dtg - Asp(OtBu)-(Dmb)Gly
52
  (r'CN\(Cc\d+ccc\(OC\)cc\d+OC\)C\(=O\)\[C@H\]\(CC\(=O\)OC\(C\)\(C\)C\)', 'Dtg'),
53
  (r'C\(=O\)N\(CC\d+=C\(C=C\(C=C\d+\)OC\)OC\)CC\(=O\)', 'Dtg'),
54
  (r'N\[C@@H\]\(CC\(=O\)OC\(C\)\(C\)C\)C\(=O\)N\(CC\d+=C\(C=C\(C=C\d+\)OC\)OC\)CC\(=O\)', 'Dtg'),
@@ -68,10 +58,12 @@ class PeptideAnalyzer:
68
  'Aib': 'Ŷ', 'Dtg': 'Ĝ', 'Cmt': 'Ĉ', 'Eal': 'Ė', 'Nml': "Ŀ", 'Nma': 'Ṃ',
69
  'Kpg': 'Ƙ', 'Tpb': 'Ṯ', 'Cyl': 'Ċ', 'Nle': 'Ł', 'Hph': 'Ĥ', 'Cys-Cys': 'CC', 'cys-cys': 'cc',
70
  }
71
-
72
  def preprocess_complex_residues(self, smiles):
 
 
73
  complex_positions = []
74
 
 
75
  for pattern, residue_type in self.complex_residue_patterns:
76
  for match in re.finditer(pattern, smiles):
77
  # Only add if this position doesn't overlap with existing matches
@@ -87,6 +79,7 @@ class PeptideAnalyzer:
87
  # Sort by position (to handle potential overlapping matches)
88
  complex_positions.sort(key=lambda x: x['start'])
89
 
 
90
  if not complex_positions:
91
  return smiles, []
92
 
@@ -97,70 +90,37 @@ class PeptideAnalyzer:
97
  protected_residues = []
98
 
99
  for pos in complex_positions:
 
100
  start = pos['start'] + offset
101
  end = pos['end'] + offset
102
 
 
103
  complex_part = preprocessed_smiles[start:end]
104
 
 
105
  if not ('[C@H]' in complex_part or '[C@@H]' in complex_part):
106
- continue
107
 
 
108
  placeholder = f"COMPLEX_RESIDUE_{len(protected_residues)}"
109
 
 
110
  preprocessed_smiles = preprocessed_smiles[:start] + placeholder + preprocessed_smiles[end:]
111
 
 
112
  offset += len(placeholder) - (end - start)
113
 
 
114
  protected_residues.append({
115
  'placeholder': placeholder,
116
  'type': pos['type'],
117
  'content': complex_part
118
  })
119
-
120
- #print(f"Protected {pos['type']}: {complex_part[:20]}... as {placeholder}")
121
-
122
- return preprocessed_smiles, protected_residues
123
-
124
- def is_peptide(self, smiles):
125
- """Check if the SMILES represents a peptide structure"""
126
- mol = Chem.MolFromSmiles(smiles)
127
- if mol is None:
128
- return False
129
-
130
- # Look for peptide bonds: NC(=O) pattern
131
- peptide_bond_pattern = Chem.MolFromSmarts('[NH][C](=O)')
132
- if mol.HasSubstructMatch(peptide_bond_pattern):
133
- return True
134
-
135
- # Look for N-methylated peptide bonds: N(C)C(=O) pattern
136
- n_methyl_pattern = Chem.MolFromSmarts('[N;H0;$(NC)](C)[C](=O)')
137
- if mol.HasSubstructMatch(n_methyl_pattern):
138
- return True
139
-
140
- return False
141
-
142
- def is_cyclic(self, smiles):
143
- """Improved cyclic peptide detection"""
144
- # Check for C-terminal carboxyl
145
- if smiles.endswith('C(=O)O'):
146
- return False, [], []
147
 
148
- # Find all numbers used in ring closures
149
- ring_numbers = re.findall(r'(?:^|[^c])[0-9](?=[A-Z@\(\)])', smiles)
150
-
151
- # Find aromatic ring numbers
152
- aromatic_matches = re.findall(r'c[0-9](?:ccccc|c\[nH\]c)[0-9]', smiles)
153
- aromatic_cycles = []
154
- for match in aromatic_matches:
155
- numbers = re.findall(r'[0-9]', match)
156
- aromatic_cycles.extend(numbers)
157
-
158
- # Numbers that aren't part of aromatic rings are peptide cycles
159
- peptide_cycles = [n for n in ring_numbers if n not in aromatic_cycles]
160
 
161
- is_cyclic = len(peptide_cycles) > 0 and not smiles.endswith('C(=O)O')
162
- return is_cyclic, peptide_cycles, aromatic_cycles
163
-
164
  def split_on_bonds(self, smiles, protected_residues=None):
165
  """Split SMILES into segments based on peptide bonds, with improved handling of protected residues"""
166
  positions = []
@@ -196,6 +156,7 @@ class PeptideAnalyzer:
196
  })
197
  used.update(range(match.start(), match.end()))
198
 
 
199
  for pattern, bond_type in self.bond_patterns:
200
  for match in re.finditer(pattern, smiles):
201
  if not any(p in range(match.start(), match.end()) for p in used):
@@ -207,6 +168,7 @@ class PeptideAnalyzer:
207
  })
208
  used.update(range(match.start(), match.end()))
209
 
 
210
  bond_positions.sort(key=lambda x: x['start'])
211
 
212
  # Combine complex residue positions and bond positions
@@ -216,6 +178,7 @@ class PeptideAnalyzer:
216
  # Create segments
217
  segments = []
218
 
 
219
  if all_positions and all_positions[0]['start'] > 0:
220
  segments.append({
221
  'content': smiles[0:all_positions[0]['start']],
@@ -223,10 +186,12 @@ class PeptideAnalyzer:
223
  'complex_after': all_positions[0]['pattern'] if all_positions[0]['type'] == 'complex' else None
224
  })
225
 
 
226
  for i in range(len(all_positions)-1):
227
  current = all_positions[i]
228
  next_pos = all_positions[i+1]
229
 
 
230
  if current['type'] == 'complex':
231
  segments.append({
232
  'content': current['content'],
@@ -234,6 +199,7 @@ class PeptideAnalyzer:
234
  'bond_after': next_pos['pattern'] if next_pos['type'] != 'complex' else None,
235
  'complex_type': current['residue_type']
236
  })
 
237
  elif current['type'] == 'gly':
238
  segments.append({
239
  'content': 'NCC(=O)',
@@ -250,6 +216,7 @@ class PeptideAnalyzer:
250
  'bond_after': next_pos['pattern'] if next_pos['type'] != 'complex' else None
251
  })
252
 
 
253
  if all_positions and all_positions[-1]['end'] < len(smiles):
254
  if all_positions[-1]['type'] == 'complex':
255
  segments.append({
@@ -264,6 +231,46 @@ class PeptideAnalyzer:
264
  })
265
 
266
  return segments
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
267
 
268
  def clean_terminal_carboxyl(self, segment):
269
  """Remove C-terminal carboxyl only if it's the true terminus"""
@@ -272,17 +279,14 @@ class PeptideAnalyzer:
272
  # Only clean if:
273
  # 1. Contains C(=O)O
274
  # 2. No bond_after exists (meaning it's the last segment)
275
- # 3. C(=O)O is at the end of the content
276
  if 'C(=O)O' in content and not segment.get('bond_after'):
277
- print('recognized?')
278
  # Remove C(=O)O pattern regardless of position
279
  cleaned = re.sub(r'\(C\(=O\)O\)', '', content)
280
  # Remove any leftover empty parentheses
281
  cleaned = re.sub(r'\(\)', '', cleaned)
282
- print(cleaned)
283
  return cleaned
284
  return content
285
-
286
  def identify_residue(self, segment):
287
  """Identify residue with Pro reconstruction"""
288
  # Only clean terminal carboxyl if this is the last segment
@@ -295,14 +299,14 @@ class PeptideAnalyzer:
295
  print("DIRECT MATCH: Found Cmt at beginning")
296
  return 'Cmt', mods
297
 
 
298
  if '[C@@H]3CCCN3C2=O)(c2ccccc2)c2ccccc2)cc' in content:
299
  print("DIRECT MATCH: Found Pro at end")
300
  return 'Pro', mods
301
-
302
- # Eal - Glu(OAll)
303
  if 'CCC(=O)OCC=C' in content or 'CC(=O)OCC=C' in content or 'C=CCOC(=O)CC' in content:
304
  return 'Eal', mods
305
-
306
  # Proline (P) - flexible ring numbers
307
  if any([
308
  # Check for any ring number in bond patterns
@@ -332,33 +336,46 @@ class PeptideAnalyzer:
332
  if ('N1[C@H](CCC1)' in content):
333
  return 'pro', mods
334
 
335
- # Tryptophan (W)
336
  if re.search(r'c[0-9]c\[nH\]c[0-9]ccccc[0-9][0-9]', content) and \
337
  'c[nH]c' in content.replace(' ', ''):
 
338
  if '[C@H](CC' in content: # D-form
339
  return 'trp', mods
340
  return 'Trp', mods
341
 
342
  # Lysine (K) - both patterns
343
  if '[C@@H](CCCCN)' in content or '[C@H](CCCCN)' in content:
 
344
  if '[C@H](CCCCN)' in content: # D-form
345
  return 'lys', mods
346
  return 'Lys', mods
347
 
348
  # Arginine (R) - both patterns
349
  if '[C@@H](CCCNC(=N)N)' in content or '[C@H](CCCNC(=N)N)' in content:
 
350
  if '[C@H](CCCNC(=N)N)' in content: # D-form
351
  return 'arg', mods
352
  return 'Arg', mods
353
 
 
354
  if content == 'C' and segment.get('bond_before') and segment.get('bond_after'):
355
  # If it's surrounded by peptide bonds, it's almost certainly Gly
356
  if ('C(=O)N' in segment['bond_before'] or 'NC(=O)' in segment['bond_before'] or 'N(C)C(=O)' in segment['bond_before']) and \
357
  ('NC(=O)' in segment['bond_after'] or 'C(=O)N' in segment['bond_after'] or 'N(C)C(=O)' in segment['bond_after']):
358
  return 'Gly', mods
 
 
 
 
 
 
 
 
359
 
360
  # Leucine patterns (L/l)
361
  if 'CC(C)C[C@H]' in content or 'CC(C)C[C@@H]' in content or '[C@@H](CC(C)C)' in content or '[C@H](CC(C)C)' in content or (('N[C@H](CCC(C)C)' in content or 'N[C@@H](CCC(C)C)' in content) and segment.get('bond_before') is None):
 
362
  if '[C@H](CC(C)C)' in content or 'CC(C)C[C@H]' in content: # D-form
363
  return 'leu', mods
364
  return 'Leu', mods
@@ -375,6 +392,7 @@ class PeptideAnalyzer:
375
 
376
  # Phenylalanine patterns (F/f)
377
  if re.search(r'\[C@H\]\(Cc\d+ccccc\d+\)', content) or re.search(r'\[C@@H\]\(Cc\d+ccccc\d+\)', content):
 
378
  if re.search(r'\[C@H\]\(Cc\d+ccccc\d+\)', content): # D-form
379
  return 'phe', mods
380
  return 'Phe', mods
@@ -385,33 +403,46 @@ class PeptideAnalyzer:
385
 
386
  # Make sure it's not leucine
387
  if not any(p in content for p in ['CC(C)C[C@H]', 'CC(C)C[C@@H]', 'CCC(=O)']):
 
388
  if '[C@H]' in content and not '[C@@H]' in content: # D-form
389
  return 'val', mods
390
  return 'Val', mods
391
 
392
  # Isoleucine patterns (I/i)
393
- if any([
394
- 'CC[C@@H](C)' in content, '[C@@H](C)CC' in content,
395
- '[C@@H](CC)C' in content,
396
- 'C(C)C[C@@H]' in content and 'CC(C)C' not in content
397
- ]):
398
- if '[C@H]([C@@H](CC)C)' in content or '[C@H](CC)C' in content: # D-form
399
- return 'ile', mods
400
- elif '[C@H](C)CC' in content or '[C@H](CC)C' in content or 'CC[C@H](C)' in content:
401
- return 'ile', mods
402
- elif 'C(C)C[C@H]' in content and 'CC(C)C' not in content:
 
 
 
 
 
 
403
  return 'ile', mods
 
404
  return 'Ile', mods
 
 
 
405
 
406
  # Alanine patterns (A/a)
407
  if ('[C@H](C)' in content or '[C@@H](C)' in content):
408
  if not any(p in content for p in ['C(C)C', 'COC', 'CN(', 'C(C)O', 'CC[C@H]', 'CC[C@@H]']):
 
409
  if '[C@H](C)' in content: # D-form
410
  return 'ala', mods
411
  return 'Ala', mods
412
 
413
  # Tyrosine patterns (Y/y)
414
  if re.search(r'Cc[0-9]ccc\(O\)cc[0-9]', content):
 
415
  if '[C@H](Cc1ccc(O)cc1)' in content: # D-form
416
  return 'tyr', mods
417
  return 'Tyr', mods
@@ -419,21 +450,25 @@ class PeptideAnalyzer:
419
  # Serine patterns (S/s)
420
  if '[C@H](CO)' in content or '[C@@H](CO)' in content:
421
  if not ('C(C)O' in content or 'COC' in content):
 
422
  if '[C@H](CO)' in content: # D-form
423
  return 'ser', mods
424
  return 'Ser', mods
425
 
426
  if 'CSSC' in content:
 
427
  if re.search(r'\[C@@H\].*CSSC.*\[C@@H\]', content) or re.search(r'\[C@H\].*CSSC.*\[C@H\]', content):
428
  if '[C@H]' in content and not '[C@@H]' in content: # D-form
429
  return 'cys-cys', mods
430
  return 'Cys-Cys', mods
431
 
 
432
  if '[C@@H](N)CSSC' in content or '[C@H](N)CSSC' in content:
433
  if '[C@H](N)CSSC' in content: # D-form
434
  return 'cys-cys', mods
435
  return 'Cys-Cys', mods
436
 
 
437
  if 'CSSC[C@@H](C(=O)O)' in content or 'CSSC[C@H](C(=O)O)' in content:
438
  if 'CSSC[C@H](C(=O)O)' in content: # D-form
439
  return 'cys-cys', mods
@@ -441,12 +476,14 @@ class PeptideAnalyzer:
441
 
442
  # Cysteine patterns (C/c)
443
  if '[C@H](CS)' in content or '[C@@H](CS)' in content:
 
444
  if '[C@H](CS)' in content: # D-form
445
  return 'cys', mods
446
  return 'Cys', mods
447
 
448
  # Methionine patterns (M/m)
449
  if ('CCSC' in content) or ("CSCC" in content):
 
450
  if '[C@H](CCSC)' in content: # D-form
451
  return 'met', mods
452
  elif '[C@H]' in content:
@@ -455,29 +492,34 @@ class PeptideAnalyzer:
455
 
456
  # Glutamine patterns (Q/q)
457
  if (content == '[C@@H](CC' or content == '[C@H](CC' and segment.get('bond_before')=='C(=O)N' and segment.get('bond_after')=='C(=O)N') or ('CCC(=O)N' in content) or ('CCC(N)=O' in content):
 
458
  if '[C@H](CCC(=O)N)' in content: # D-form
459
  return 'gln', mods
460
  return 'Gln', mods
461
 
462
  # Asparagine patterns (N/n)
463
  if (content == '[C@@H](C' or content == '[C@H](C' and segment.get('bond_before')=='C(=O)N' and segment.get('bond_after')=='C(=O)N') or ('CC(=O)N' in content) or ('CCN(=O)' in content) or ('CC(N)=O' in content):
 
464
  if '[C@H](CC(=O)N)' in content: # D-form
465
  return 'asn', mods
466
  return 'Asn', mods
467
 
468
  # Glutamic acid patterns (E/e)
469
  if ('CCC(=O)O' in content):
 
470
  if '[C@H](CCC(=O)O)' in content: # D-form
471
  return 'glu', mods
472
  return 'Glu', mods
473
 
474
  # Aspartic acid patterns (D/d)
475
  if ('CC(=O)O' in content):
 
476
  if '[C@H](CC(=O)O)' in content: # D-form
477
  return 'asp', mods
478
  return 'Asp', mods
479
 
480
  if re.search(r'Cc\d+c\[nH\]cn\d+', content) or re.search(r'Cc\d+cnc\[nH\]\d+', content):
 
481
  if '[C@H]' in content: # D-form
482
  return 'his', mods
483
  return 'His', mods
@@ -488,22 +530,27 @@ class PeptideAnalyzer:
488
  'N[C@H](CCCC)' in content or '[C@H](CCCC)' in content) and 'CC(C)' not in content:
489
  return 'Nle', mods
490
  # Aib - alpha-aminoisobutyric acid (2-aminoisobutyric acid)
491
- if 'C(C)(C)(N)' in content or 'C(C)(C)' in content or 'C(C)(C)' in content and ('C(=O)N' in segment['bond_before'] or 'NC(=O)' in segment['bond_before'] or 'N(C)C(=O)' in segment['bond_before']) and \
492
- ('NC(=O)' in segment['bond_after'] or 'C(=O)N' in segment['bond_after'] or 'N(C)C(=O)' in segment['bond_after']):
493
  return 'Aib', mods
494
 
495
- # Dtg - Asp(OtBu)-(Dmb)Gly
 
 
 
 
 
 
 
496
  if 'CC(=O)OC(C)(C)C' in content and 'CC1=C(C=C(C=C1)OC)OC' in content:
497
  return 'Dtg', mods
498
 
499
 
500
- # Kpg - Lys(palmitoyl-Glu-OtBu)
501
  if 'CCCNC(=O)' in content and 'CCCCCCCCCCCC' in content:
502
  return 'Kpg', mods
503
 
504
- # Tpb - Thr(PO(OBzl)OH)
505
- if re.search(r'\[C[@]?H\]\(C\)OP\(=O\)\(O\)', content) or 'OP(=O)(O)OCC' in content:
506
- return 'Tpb', mods
507
 
508
  return None, mods
509
 
@@ -524,7 +571,7 @@ class PeptideAnalyzer:
524
  #mods.append('O-linked')
525
 
526
  return mods
527
-
528
  def analyze_structure(self, smiles):
529
  """Main analysis function with preprocessing for complex residues"""
530
  print("\nAnalyzing structure:", smiles)
@@ -541,6 +588,7 @@ class PeptideAnalyzer:
541
  # Check if it's cyclic
542
  is_cyclic, peptide_cycles, aromatic_cycles = self.is_cyclic(smiles)
543
 
 
544
  segments = self.split_on_bonds(preprocessed_smiles, protected_residues)
545
 
546
  print("\nSegment Analysis:")
@@ -562,8 +610,10 @@ class PeptideAnalyzer:
562
  else:
563
  print(f"Warning: Could not identify residue in segment: {segment.get('content', 'None')}")
564
 
 
565
  three_letter = '-'.join(sequence)
566
 
 
567
  one_letter = ''.join(self.three_to_one.get(aa.split('(')[0], 'X') for aa in sequence)
568
 
569
  if is_cyclic:
@@ -849,6 +899,13 @@ def process_input(
849
  return "Error: Input SMILES does not appear to be a peptide structure.", None, None, []
850
 
851
  try:
 
 
 
 
 
 
 
852
  mol = Chem.MolFromSmiles(smiles)
853
  if mol is None:
854
  return "Error: Invalid SMILES notation.", None, None, []
@@ -876,14 +933,18 @@ def process_input(
876
 
877
  except Exception as e:
878
  return f"Error generating 3D structures: {str(e)}", None, None, []
879
-
880
- segments = analyzer.split_on_bonds(smiles)
881
-
882
- sequence_parts = []
883
- output_text = ""
884
 
 
 
 
 
 
885
  # Only include segment analysis in output if requested
886
  if show_segment_details:
 
 
 
 
887
  output_text += "Segment Analysis:\n"
888
  for i, segment in enumerate(segments):
889
  output_text += f"\nSegment {i}:\n"
@@ -902,22 +963,11 @@ def process_input(
902
  else:
903
  output_text += f"Warning: Could not identify residue in segment: {segment['content']}\n"
904
  output_text += "\n"
 
 
 
905
  else:
906
- for segment in segments:
907
- residue, mods = analyzer.identify_residue(segment)
908
- if residue:
909
- if mods:
910
- sequence_parts.append(f"{residue}({','.join(mods)})")
911
- else:
912
- sequence_parts.append(residue)
913
-
914
- is_cyclic, peptide_cycles, aromatic_cycles = analyzer.is_cyclic(smiles)
915
- three_letter = '-'.join(sequence_parts)
916
- one_letter = ''.join(analyzer.three_to_one.get(aa.split('(')[0], 'X') for aa in sequence_parts)
917
-
918
- if is_cyclic:
919
- three_letter = f"cyclo({three_letter})"
920
- one_letter = f"cyclo({one_letter})"
921
 
922
  img_cyclic = annotate_cyclic_structure(mol, three_letter)
923
 
@@ -944,7 +994,7 @@ def process_input(
944
  for filepath in structure_files:
945
  summary += f"- {os.path.basename(filepath)}\n"
946
 
947
- return summary + output_text, img_cyclic, img_linear, structure_files if structure_files else []
948
 
949
  except Exception as e:
950
  return f"Error processing SMILES: {str(e)}", None, None, []
@@ -1067,5 +1117,4 @@ iface = gr.Interface(
1067
  )
1068
 
1069
  if __name__ == "__main__":
1070
- Blocks.get_api_info = safe_get_api_info
1071
  iface.launch(share=True)
 
1
  import os
2
  import gradio as gr
3
  import gradio.blocks
 
 
 
 
 
 
 
 
 
 
 
4
  import re
5
  import pandas as pd
6
  from io import StringIO
 
26
  (r'C\(=O\)N[12]?', 'peptide_reverse') # Reverse peptide bond
27
  ]
28
  self.complex_residue_patterns = [
29
+ # Kpg - Lys(palmitoyl-Glu-OtBu) - Exact pattern for the specific structure
30
  (r'\[C[@]H\]\(CCCNC\(=O\)CCC\[C@@H\]\(NC\(=O\)CCCCCCCCCCCCCCCC\)C\(=O\)OC\(C\)\(C\)C\)', 'Kpg'),
31
  (r'CCCCCCCCCCCCCCCCC\(=O\)N\[C@H\]\(CCCC\(=O\)NCCC\[C@@H\]', 'Kpg'),
32
+ (r'\[C@*H\]\(CSC\(c\d+ccccc\d+\)\(c\d+ccccc\d+\)c\d+ccc\(OC\)cc\d+\)', 'Cmt'),
33
  (r'CSC\(c.*?c.*?OC\)', 'Cmt'), # Core structure of Cys-Mmt group
34
  (r'COc.*?ccc\(C\(SC', 'Cmt'), # Start of Cmt in cyclic peptides
35
  (r'c2ccccc2\)c2ccccc2\)cc', 'Cmt'), # End of Cmt in cyclic peptides
36
+ # Glu(OAll) - Only match the complete pattern to avoid partial matches
37
  (r'C=CCOC\(=O\)CC\[C@@H\]', 'Eal'),
38
+ (r'\(C\)OP\(=O\)\(O\)OCc\d+ccccc\d+', 'Tpb'),
39
  #(r'COc\d+ccc\(C\(SC\[C@@H\]\d+.*?\)\(c\d+ccccc\d+\)c\d+ccccc\d+\)cc\d+', 'Cmt-cyclic'),
40
 
41
+ # Dtg - Asp(OtBu)-(Dmb)Gly - Full pattern
42
  (r'CN\(Cc\d+ccc\(OC\)cc\d+OC\)C\(=O\)\[C@H\]\(CC\(=O\)OC\(C\)\(C\)C\)', 'Dtg'),
43
  (r'C\(=O\)N\(CC\d+=C\(C=C\(C=C\d+\)OC\)OC\)CC\(=O\)', 'Dtg'),
44
  (r'N\[C@@H\]\(CC\(=O\)OC\(C\)\(C\)C\)C\(=O\)N\(CC\d+=C\(C=C\(C=C\d+\)OC\)OC\)CC\(=O\)', 'Dtg'),
 
58
  'Aib': 'Ŷ', 'Dtg': 'Ĝ', 'Cmt': 'Ĉ', 'Eal': 'Ė', 'Nml': "Ŀ", 'Nma': 'Ṃ',
59
  'Kpg': 'Ƙ', 'Tpb': 'Ṯ', 'Cyl': 'Ċ', 'Nle': 'Ł', 'Hph': 'Ĥ', 'Cys-Cys': 'CC', 'cys-cys': 'cc',
60
  }
 
61
  def preprocess_complex_residues(self, smiles):
62
+ """Identify and protect complex residues with internal peptide bonds - improved to prevent overlaps"""
63
+ # Create a mapping of positions to complex residue types
64
  complex_positions = []
65
 
66
+ # Search for all complex residue patterns
67
  for pattern, residue_type in self.complex_residue_patterns:
68
  for match in re.finditer(pattern, smiles):
69
  # Only add if this position doesn't overlap with existing matches
 
79
  # Sort by position (to handle potential overlapping matches)
80
  complex_positions.sort(key=lambda x: x['start'])
81
 
82
+ # If no complex residues found, return original SMILES
83
  if not complex_positions:
84
  return smiles, []
85
 
 
90
  protected_residues = []
91
 
92
  for pos in complex_positions:
93
+ # Adjust positions based on previous replacements
94
  start = pos['start'] + offset
95
  end = pos['end'] + offset
96
 
97
+ # Extract the complex residue part
98
  complex_part = preprocessed_smiles[start:end]
99
 
100
+ # Verify this is a complete residue (should have proper amino acid structure)
101
  if not ('[C@H]' in complex_part or '[C@@H]' in complex_part):
102
+ continue # Skip if not a proper amino acid structure
103
 
104
+ # Create a placeholder for this complex residue
105
  placeholder = f"COMPLEX_RESIDUE_{len(protected_residues)}"
106
 
107
+ # Replace the complex part with the placeholder
108
  preprocessed_smiles = preprocessed_smiles[:start] + placeholder + preprocessed_smiles[end:]
109
 
110
+ # Track the offset change
111
  offset += len(placeholder) - (end - start)
112
 
113
+ # Store the residue information
114
  protected_residues.append({
115
  'placeholder': placeholder,
116
  'type': pos['type'],
117
  'content': complex_part
118
  })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
 
120
+ # Debug
121
+ print(f"Protected {pos['type']}: {complex_part[:20]}... as {placeholder}")
 
 
 
 
 
 
 
 
 
 
122
 
123
+ return preprocessed_smiles, protected_residues
 
 
124
  def split_on_bonds(self, smiles, protected_residues=None):
125
  """Split SMILES into segments based on peptide bonds, with improved handling of protected residues"""
126
  positions = []
 
156
  })
157
  used.update(range(match.start(), match.end()))
158
 
159
+ # Then find all other bonds
160
  for pattern, bond_type in self.bond_patterns:
161
  for match in re.finditer(pattern, smiles):
162
  if not any(p in range(match.start(), match.end()) for p in used):
 
168
  })
169
  used.update(range(match.start(), match.end()))
170
 
171
+ # Sort all positions
172
  bond_positions.sort(key=lambda x: x['start'])
173
 
174
  # Combine complex residue positions and bond positions
 
178
  # Create segments
179
  segments = []
180
 
181
+ # First segment (if not starting with a bond or complex residue)
182
  if all_positions and all_positions[0]['start'] > 0:
183
  segments.append({
184
  'content': smiles[0:all_positions[0]['start']],
 
186
  'complex_after': all_positions[0]['pattern'] if all_positions[0]['type'] == 'complex' else None
187
  })
188
 
189
+ # Process segments between positions
190
  for i in range(len(all_positions)-1):
191
  current = all_positions[i]
192
  next_pos = all_positions[i+1]
193
 
194
+ # Handle complex residues
195
  if current['type'] == 'complex':
196
  segments.append({
197
  'content': current['content'],
 
199
  'bond_after': next_pos['pattern'] if next_pos['type'] != 'complex' else None,
200
  'complex_type': current['residue_type']
201
  })
202
+ # Handle regular bonds
203
  elif current['type'] == 'gly':
204
  segments.append({
205
  'content': 'NCC(=O)',
 
216
  'bond_after': next_pos['pattern'] if next_pos['type'] != 'complex' else None
217
  })
218
 
219
+ # Last segment
220
  if all_positions and all_positions[-1]['end'] < len(smiles):
221
  if all_positions[-1]['type'] == 'complex':
222
  segments.append({
 
231
  })
232
 
233
  return segments
234
def is_peptide(self, smiles):
    """Report whether a SMILES string parses and contains an amide-type bond.

    Args:
        smiles: SMILES string to inspect.

    Returns:
        False when the string cannot be parsed into a molecule or when
        neither amide SMARTS query matches; True as soon as one matches.
    """
    molecule = Chem.MolFromSmiles(smiles)
    if molecule is None:
        return False

    # Queries are tried in order: a secondary amide N-H next to a carbonyl,
    # then an N-substituted (e.g. N-methylated) amide nitrogen without H.
    amide_queries = (
        '[NH][C](=O)',
        '[N;H0;$(NC)](C)[C](=O)',
    )
    for smarts in amide_queries:
        if molecule.HasSubstructMatch(Chem.MolFromSmarts(smarts)):
            return True

    return False
251
+
252
def is_cyclic(self, smiles):
    """Heuristically decide whether a peptide SMILES string is cyclic.

    The check is purely textual (regex based); the SMILES is not parsed.

    Args:
        smiles: SMILES string to inspect.

    Returns:
        A tuple ``(is_cyclic, peptide_cycles, aromatic_cycles)`` where
        ``peptide_cycles`` lists ring-closure digits that are not also used
        by a recognised aromatic ring, ``aromatic_cycles`` lists the digits
        found inside recognised aromatic ring patterns, and ``is_cyclic`` is
        True when ``peptide_cycles`` is non-empty.
    """
    # A string ending in a free carboxyl is treated as linear outright.
    if smiles.endswith('C(=O)O'):
        return False, [], []

    # Ring-closure digits that are not directly preceded by an aromatic 'c'
    # and are followed by an uppercase letter, '@' or a parenthesis.
    # The digit sits in a capturing group so findall returns just the digit;
    # without it findall returns the preceding character too (e.g. "N1"),
    # which can never equal the bare digits collected in aromatic_cycles.
    ring_numbers = re.findall(r'(?:^|[^c])([0-9])(?=[A-Z@\(\)])', smiles)

    # Digits used by benzene-like (ccccc) or c[nH]c-containing aromatic rings.
    aromatic_cycles = [
        digit
        for ring in re.findall(r'c[0-9](?:ccccc|c\[nH\]c)[0-9]', smiles)
        for digit in re.findall(r'[0-9]', ring)
    ]

    # Digits that are not part of aromatic rings are taken as peptide cycles.
    peptide_cycles = [n for n in ring_numbers if n not in aromatic_cycles]

    # The carboxyl-terminus case already returned above, so no re-check here.
    return bool(peptide_cycles), peptide_cycles, aromatic_cycles
273
+
274
 
275
  def clean_terminal_carboxyl(self, segment):
276
  """Remove C-terminal carboxyl only if it's the true terminus"""
 
279
  # Only clean if:
280
  # 1. Contains C(=O)O
281
  # 2. No bond_after exists (meaning it's the last segment)
 
282
  if 'C(=O)O' in content and not segment.get('bond_after'):
 
283
  # Remove C(=O)O pattern regardless of position
284
  cleaned = re.sub(r'\(C\(=O\)O\)', '', content)
285
  # Remove any leftover empty parentheses
286
  cleaned = re.sub(r'\(\)', '', cleaned)
 
287
  return cleaned
288
  return content
289
+
290
  def identify_residue(self, segment):
291
  """Identify residue with Pro reconstruction"""
292
  # Only clean terminal carboxyl if this is the last segment
 
299
  print("DIRECT MATCH: Found Cmt at beginning")
300
  return 'Cmt', mods
301
 
302
+ # VERY EXPLICIT check for the last segment in your example
303
  if '[C@@H]3CCCN3C2=O)(c2ccccc2)c2ccccc2)cc' in content:
304
  print("DIRECT MATCH: Found Pro at end")
305
  return 'Pro', mods
306
+ # === Original amino acid patterns ===
307
+ # Eal - Glu(OAll) - Multiple patterns
308
  if 'CCC(=O)OCC=C' in content or 'CC(=O)OCC=C' in content or 'C=CCOC(=O)CC' in content:
309
  return 'Eal', mods
 
310
  # Proline (P) - flexible ring numbers
311
  if any([
312
  # Check for any ring number in bond patterns
 
336
  if ('N1[C@H](CCC1)' in content):
337
  return 'pro', mods
338
 
339
+ # Tryptophan (W) - more specific indole pattern
340
  if re.search(r'c[0-9]c\[nH\]c[0-9]ccccc[0-9][0-9]', content) and \
341
  'c[nH]c' in content.replace(' ', ''):
342
+ # Check stereochemistry for D/L
343
  if '[C@H](CC' in content: # D-form
344
  return 'trp', mods
345
  return 'Trp', mods
346
 
347
  # Lysine (K) - both patterns
348
  if '[C@@H](CCCCN)' in content or '[C@H](CCCCN)' in content:
349
+ # Check stereochemistry for D/L
350
  if '[C@H](CCCCN)' in content: # D-form
351
  return 'lys', mods
352
  return 'Lys', mods
353
 
354
  # Arginine (R) - both patterns
355
  if '[C@@H](CCCNC(=N)N)' in content or '[C@H](CCCNC(=N)N)' in content:
356
+ # Check stereochemistry for D/L
357
  if '[C@H](CCCNC(=N)N)' in content: # D-form
358
  return 'arg', mods
359
  return 'Arg', mods
360
 
361
+ # Regular residue identification
362
  if content == 'C' and segment.get('bond_before') and segment.get('bond_after'):
363
  # If it's surrounded by peptide bonds, it's almost certainly Gly
364
  if ('C(=O)N' in segment['bond_before'] or 'NC(=O)' in segment['bond_before'] or 'N(C)C(=O)' in segment['bond_before']) and \
365
  ('NC(=O)' in segment['bond_after'] or 'C(=O)N' in segment['bond_after'] or 'N(C)C(=O)' in segment['bond_after']):
366
  return 'Gly', mods
367
+
368
+ # Case 2: Cyclic terminal glycine - typically contains 'CNC' with ring closure
369
+ if 'CNC' in content and any(f'C{i}=' in content for i in range(1, 10)):
370
+ return 'Gly', mods # This will catch patterns like 'CNC1=O'
371
+ if not segment.get('bond_before') and segment.get('bond_after'):
372
+ if content == 'C' or content == 'NC':
373
+ if ('NC(=O)' in segment['bond_after'] or 'C(=O)N' in segment['bond_after'] or 'N(C)C(=O)' in segment['bond_after']):
374
+ return 'Gly', mods
375
 
376
  # Leucine patterns (L/l)
377
  if 'CC(C)C[C@H]' in content or 'CC(C)C[C@@H]' in content or '[C@@H](CC(C)C)' in content or '[C@H](CC(C)C)' in content or (('N[C@H](CCC(C)C)' in content or 'N[C@@H](CCC(C)C)' in content) and segment.get('bond_before') is None):
378
+ # Check stereochemistry for D/L
379
  if '[C@H](CC(C)C)' in content or 'CC(C)C[C@H]' in content: # D-form
380
  return 'leu', mods
381
  return 'Leu', mods
 
392
 
393
  # Phenylalanine patterns (F/f)
394
  if re.search(r'\[C@H\]\(Cc\d+ccccc\d+\)', content) or re.search(r'\[C@@H\]\(Cc\d+ccccc\d+\)', content):
395
+ # Check stereochemistry for D/L
396
  if re.search(r'\[C@H\]\(Cc\d+ccccc\d+\)', content): # D-form
397
  return 'phe', mods
398
  return 'Phe', mods
 
403
 
404
  # Make sure it's not leucine
405
  if not any(p in content for p in ['CC(C)C[C@H]', 'CC(C)C[C@@H]', 'CCC(=O)']):
406
+ # Check stereochemistry
407
  if '[C@H]' in content and not '[C@@H]' in content: # D-form
408
  return 'val', mods
409
  return 'Val', mods
410
 
411
  # Isoleucine patterns (I/i)
412
+ # First check for various isoleucine patterns while excluding valine
413
+ if (any(['CC[C@@H](C)' in content, '[C@@H](C)CC' in content, '[C@@H](CC)C' in content,
414
+ 'C(C)C[C@@H]' in content, '[C@@H]([C@H](C)CC)' in content, '[C@H]([C@@H](C)CC)' in content,
415
+ '[C@@H]([C@@H](C)CC)' in content, '[C@H]([C@H](C)CC)' in content,
416
+ 'C[C@H](CC)[C@@H]' in content, 'C[C@@H](CC)[C@H]' in content,
417
+ 'C[C@H](CC)[C@H]' in content, 'C[C@@H](CC)[C@@H]' in content,
418
+ 'CC[C@H](C)[C@@H]' in content, 'CC[C@@H](C)[C@H]' in content,
419
+ 'CC[C@H](C)[C@H]' in content, 'CC[C@@H](C)[C@@H]' in content])
420
+ and 'CC(C)C' not in content): # Exclude valine pattern
421
+
422
+ # Check stereochemistry for D/L forms
423
+ if any(['[C@H]([C@@H](CC)C)' in content, '[C@H](CC)C' in content,
424
+ '[C@H]([C@@H](C)CC)' in content, '[C@H]([C@H](C)CC)' in content,
425
+ 'C[C@@H](CC)[C@H]' in content, 'C[C@H](CC)[C@H]' in content,
426
+ 'CC[C@@H](C)[C@H]' in content, 'CC[C@H](C)[C@H]' in content]):
427
+ # D-form
428
  return 'ile', mods
429
+ # All other stereochemistries are treated as L-form
430
  return 'Ile', mods
431
+ # Tpb - Thr(PO(OBzl)OH) - Multiple patterns
432
+ if re.search(r'\(C\)OP\(=O\)\(O\)OCc[0-9]ccccc[0-9]', content) or 'OP(=O)(O)OCC' in content:
433
+ return 'Tpb', mods
434
 
435
  # Alanine patterns (A/a)
436
  if ('[C@H](C)' in content or '[C@@H](C)' in content):
437
  if not any(p in content for p in ['C(C)C', 'COC', 'CN(', 'C(C)O', 'CC[C@H]', 'CC[C@@H]']):
438
+ # Check stereochemistry for D/L
439
  if '[C@H](C)' in content: # D-form
440
  return 'ala', mods
441
  return 'Ala', mods
442
 
443
  # Tyrosine patterns (Y/y)
444
  if re.search(r'Cc[0-9]ccc\(O\)cc[0-9]', content):
445
+ # Check stereochemistry for D/L
446
  if '[C@H](Cc1ccc(O)cc1)' in content: # D-form
447
  return 'tyr', mods
448
  return 'Tyr', mods
 
450
  # Serine patterns (S/s)
451
  if '[C@H](CO)' in content or '[C@@H](CO)' in content:
452
  if not ('C(C)O' in content or 'COC' in content):
453
+ # Check stereochemistry for D/L
454
  if '[C@H](CO)' in content: # D-form
455
  return 'ser', mods
456
  return 'Ser', mods
457
 
458
  if 'CSSC' in content:
459
+ # Check for various cysteine-cysteine bridge patterns
460
  if re.search(r'\[C@@H\].*CSSC.*\[C@@H\]', content) or re.search(r'\[C@H\].*CSSC.*\[C@H\]', content):
461
  if '[C@H]' in content and not '[C@@H]' in content: # D-form
462
  return 'cys-cys', mods
463
  return 'Cys-Cys', mods
464
 
465
+ # Pattern for cysteine with N-terminal amine group
466
  if '[C@@H](N)CSSC' in content or '[C@H](N)CSSC' in content:
467
  if '[C@H](N)CSSC' in content: # D-form
468
  return 'cys-cys', mods
469
  return 'Cys-Cys', mods
470
 
471
+ # Pattern for cysteine with C-terminal carboxyl
472
  if 'CSSC[C@@H](C(=O)O)' in content or 'CSSC[C@H](C(=O)O)' in content:
473
  if 'CSSC[C@H](C(=O)O)' in content: # D-form
474
  return 'cys-cys', mods
 
476
 
477
  # Cysteine patterns (C/c)
478
  if '[C@H](CS)' in content or '[C@@H](CS)' in content:
479
+ # Check stereochemistry for D/L
480
  if '[C@H](CS)' in content: # D-form
481
  return 'cys', mods
482
  return 'Cys', mods
483
 
484
  # Methionine patterns (M/m)
485
  if ('CCSC' in content) or ("CSCC" in content):
486
+ # Check stereochemistry for D/L
487
  if '[C@H](CCSC)' in content: # D-form
488
  return 'met', mods
489
  elif '[C@H]' in content:
 
492
 
493
  # Glutamine patterns (Q/q)
494
  if (content == '[C@@H](CC' or content == '[C@H](CC' and segment.get('bond_before')=='C(=O)N' and segment.get('bond_after')=='C(=O)N') or ('CCC(=O)N' in content) or ('CCC(N)=O' in content):
495
+ # Check stereochemistry for D/L
496
  if '[C@H](CCC(=O)N)' in content: # D-form
497
  return 'gln', mods
498
  return 'Gln', mods
499
 
500
  # Asparagine patterns (N/n)
501
  if (content == '[C@@H](C' or content == '[C@H](C' and segment.get('bond_before')=='C(=O)N' and segment.get('bond_after')=='C(=O)N') or ('CC(=O)N' in content) or ('CCN(=O)' in content) or ('CC(N)=O' in content):
502
+ # Check stereochemistry for D/L
503
  if '[C@H](CC(=O)N)' in content: # D-form
504
  return 'asn', mods
505
  return 'Asn', mods
506
 
507
  # Glutamic acid patterns (E/e)
508
  if ('CCC(=O)O' in content):
509
+ # Check stereochemistry for D/L
510
  if '[C@H](CCC(=O)O)' in content: # D-form
511
  return 'glu', mods
512
  return 'Glu', mods
513
 
514
  # Aspartic acid patterns (D/d)
515
  if ('CC(=O)O' in content):
516
+ # Check stereochemistry for D/L
517
  if '[C@H](CC(=O)O)' in content: # D-form
518
  return 'asp', mods
519
  return 'Asp', mods
520
 
521
  if re.search(r'Cc\d+c\[nH\]cn\d+', content) or re.search(r'Cc\d+cnc\[nH\]\d+', content):
522
+ # Check stereochemistry for D/L
523
  if '[C@H]' in content: # D-form
524
  return 'his', mods
525
  return 'His', mods
 
530
  'N[C@H](CCCC)' in content or '[C@H](CCCC)' in content) and 'CC(C)' not in content:
531
  return 'Nle', mods
532
  # Aib - alpha-aminoisobutyric acid (2-aminoisobutyric acid)
533
+ # More flexible pattern detection
534
+ if 'C(C)(C)(N)' in content:
535
  return 'Aib', mods
536
 
537
+ # Partial Aib pattern but NOT part of t-butyl ester
538
+ if 'C(C)(C)' in content and 'OC(C)(C)C' not in content:
539
+ if (segment.get('bond_before') and segment.get('bond_after') and
540
+ any(bond in segment['bond_before'] for bond in ['C(=O)N', 'NC(=O)', 'N(C)C(=O)']) and
541
+ any(bond in segment['bond_after'] for bond in ['NC(=O)', 'C(=O)N', 'N(C)C(=O)'])):
542
+ return 'Aib', mods
543
+
544
+ # Dtg - Asp(OtBu)-(Dmb)Gly - Simplified pattern for better detection
545
  if 'CC(=O)OC(C)(C)C' in content and 'CC1=C(C=C(C=C1)OC)OC' in content:
546
  return 'Dtg', mods
547
 
548
 
549
+ # Kpg - Lys(palmitoyl-Glu-OtBu) - Simplified pattern
550
  if 'CCCNC(=O)' in content and 'CCCCCCCCCCCC' in content:
551
  return 'Kpg', mods
552
 
553
+
 
 
554
 
555
  return None, mods
556
 
 
571
  #mods.append('O-linked')
572
 
573
  return mods
574
+
575
  def analyze_structure(self, smiles):
576
  """Main analysis function with preprocessing for complex residues"""
577
  print("\nAnalyzing structure:", smiles)
 
588
  # Check if it's cyclic
589
  is_cyclic, peptide_cycles, aromatic_cycles = self.is_cyclic(smiles)
590
 
591
+ # Split into segments, respecting protected residues
592
  segments = self.split_on_bonds(preprocessed_smiles, protected_residues)
593
 
594
  print("\nSegment Analysis:")
 
610
  else:
611
  print(f"Warning: Could not identify residue in segment: {segment.get('content', 'None')}")
612
 
613
+ # Format the sequence
614
  three_letter = '-'.join(sequence)
615
 
616
+ # Use the mapping to create one-letter code
617
  one_letter = ''.join(self.three_to_one.get(aa.split('(')[0], 'X') for aa in sequence)
618
 
619
  if is_cyclic:
 
899
  return "Error: Input SMILES does not appear to be a peptide structure.", None, None, []
900
 
901
  try:
902
+ # Preprocess to protect complex residues
903
+ pre_smiles, protected_residues = analyzer.preprocess_complex_residues(smiles)
904
+ # Report protected residues in summary if any
905
+ protected_info = None
906
+ if protected_residues:
907
+ protected_info = [res['type'] for res in protected_residues]
908
+
909
  mol = Chem.MolFromSmiles(smiles)
910
  if mol is None:
911
  return "Error: Invalid SMILES notation.", None, None, []
 
933
 
934
  except Exception as e:
935
  return f"Error generating 3D structures: {str(e)}", None, None, []
 
 
 
 
 
936
 
937
+ analysis = analyzer.analyze_structure(smiles)
938
+ three_letter = analysis['three_letter']
939
+ one_letter = analysis['one_letter']
940
+ is_cyclic = analysis['is_cyclic']
941
+
942
  # Only include segment analysis in output if requested
943
  if show_segment_details:
944
+ segments = analyzer.split_on_bonds(smiles)
945
+
946
+ sequence_parts = []
947
+ output_text = ""
948
  output_text += "Segment Analysis:\n"
949
  for i, segment in enumerate(segments):
950
  output_text += f"\nSegment {i}:\n"
 
963
  else:
964
  output_text += f"Warning: Could not identify residue in segment: {segment['content']}\n"
965
  output_text += "\n"
966
+ is_cyclic, peptide_cycles, aromatic_cycles = analyzer.is_cyclic(smiles)
967
+ three_letter = '-'.join(sequence_parts)
968
+ one_letter = ''.join(analyzer.three_to_one.get(aa.split('(')[0], 'X') for aa in sequence_parts)
969
  else:
970
+ pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
971
 
972
  img_cyclic = annotate_cyclic_structure(mol, three_letter)
973
 
 
994
  for filepath in structure_files:
995
  summary += f"- {os.path.basename(filepath)}\n"
996
 
997
+ return summary, img_cyclic, img_linear, structure_files if structure_files else []
998
 
999
  except Exception as e:
1000
  return f"Error processing SMILES: {str(e)}", None, None, []
 
1117
  )
1118
 
1119
  # Script entry point: the launch call below runs only when this file is
  # executed directly, not when it is imported as a module.
  # NOTE(review): `iface` is defined outside this view (presumably the Gradio
  # interface built above); share=True is forwarded to its launch() call, which
  # per Gradio's docs requests a public share link. Confirm that is intended.
  if __name__ == "__main__":
 
1120
  iface.launch(share=True)