SreekarB committed on
Commit
503556f
·
verified ·
1 Parent(s): ec1ed75

Update annotated_casl_app.py

Browse files
Files changed (1) hide show
  1. annotated_casl_app.py +57 -5
annotated_casl_app.py CHANGED
@@ -18,6 +18,48 @@ if ANTHROPIC_API_KEY:
18
  else:
19
  logger.warning("Claude API key not found - using demo mode")
20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  def segment_response_by_sections(response_text):
22
  """Segment response by section titles and return a dictionary of sections"""
23
  required_sections = [
@@ -714,9 +756,9 @@ def analyze_annotated_transcript(annotated_transcript, age, gender, slp_notes):
714
  - Academic vs. conversational vocabulary ratio
715
  - Age-appropriate vocabulary development
716
 
717
- 7. NLP-DERIVED LINGUISTIC FEATURES
718
 
719
- A. Lexical Diversity Measures (provide exact calculations):
720
  - Type-Token Ratio (TTR): Unique words divided by total words
721
  * Calculate: [unique words] / [total words] = [ratio]
722
  * Interpretation: Higher ratios indicate greater lexical diversity
@@ -729,13 +771,13 @@ def analyze_annotated_transcript(annotated_transcript, age, gender, slp_notes):
729
  * Controls for text length effects
730
  * Provide HDD score and clinical significance
731
 
732
- B. Word Frequency Analysis:
733
- - Most frequent words used (top 10 with counts)
734
  - High-frequency vs. low-frequency word distribution
735
  - Function words vs. content words ratio
736
  - Repetitive word patterns and their clinical significance
737
 
738
- C. Linguistic Complexity Indicators:
739
  - Average word length in syllables
740
  - Syllable complexity patterns
741
  - Morphological complexity index
@@ -1492,6 +1534,9 @@ def call_claude_api_with_continuation(prompt):
1492
  # Combine all sections in the correct order
1493
  final_response = combine_sections_smartly(all_sections)
1494
 
 
 
 
1495
  # Log final results
1496
  print(f"\n=== FINAL SMART VALIDATION ===")
1497
  print(f"Total sections found: {len(all_sections)}")
@@ -1501,6 +1546,13 @@ def call_claude_api_with_continuation(prompt):
1501
  print(f"Total API calls: {continuation_count + 1}")
1502
  print("=" * 50)
1503
 
 
 
 
 
 
 
 
1504
  # Add completion indicator with safety info
1505
  if continuation_count > 0:
1506
  final_response += f"\n\n[Analysis completed in {continuation_count + 1} parts over {(time.time() - start_time) / 60:.1f} minutes]"
 
18
  else:
19
  logger.warning("Claude API key not found - using demo mode")
20
 
21
def _is_table_separator(line):
    """Return True for a Markdown table delimiter row such as ``|---|:--:|``."""
    stripped = line.strip()
    # Delimiter rows hold only pipes, dashes, alignment colons and whitespace;
    # requiring a dash keeps rows of empty cells ("| | |") out of this check.
    return '-' in stripped and set(stripped) <= set('|:- \t')


def _tables_to_bullets(text):
    """Rewrite pipe-delimited table rows in *text* as ``- first: rest`` bullets."""
    lines = text.split('\n')
    converted = []
    in_table = False

    for index, line in enumerate(lines):
        # Fewer than two pipes: ordinary text, which also ends any open table.
        if line.count('|') < 2:
            in_table = False
            converted.append(line)
            continue

        # Delimiter rows carry no content of their own.
        if _is_table_separator(line):
            in_table = True
            continue

        if not in_table:
            in_table = True
            # The first row is column labels only when a delimiter row follows
            # it; otherwise it is real data and must not be discarded.
            following = lines[index + 1] if index + 1 < len(lines) else ''
            if following.count('|') >= 2 and _is_table_separator(following):
                continue

        cells = [cell.strip() for cell in line.split('|') if cell.strip()]
        if len(cells) >= 2:
            converted.append(f"- {cells[0]}: {' '.join(cells[1:])}")
        elif cells:
            # Keep single-cell rows instead of silently dropping their text.
            converted.append(f"- {cells[0]}")

    return '\n'.join(converted)


def clean_output_formatting(text):
    """Strip Markdown emphasis and header markers and turn tables into lists.

    Processing steps, in order:
      1. ``* item`` bullet markers become ``- item`` so list structure
         survives the asterisk removal below.
      2. Every remaining asterisk (``**bold**``, ``*italic*``, strays) is
         removed.
      3. Leading ``#`` header markers (one to six) are removed from each line.
      4. Lines containing two or more ``|`` are treated as table rows and
         rewritten as ``- first cell: remaining cells``. Delimiter rows are
         dropped, and a header row is dropped only when a delimiter row
         directly follows it.

    Args:
        text: The response text to clean. Falsy values (``""``, ``None``)
            are returned unchanged.

    Returns:
        The cleaned text, containing no ``*`` characters.
    """
    # Local import: keeps this block self-contained regardless of what the
    # module imports at the top.
    import re

    if not text:
        return text

    # Preserve bullet lists before asterisks are stripped.
    text = re.sub(r'^([ \t]*)\*[ \t]+', r'\1- ', text, flags=re.MULTILINE)

    # Removing every asterisk covers **bold**, *italic* and stray markers.
    text = text.replace('*', '')

    # Use [ \t]* rather than \s* so a bare "###" line cannot swallow the
    # blank lines that follow it.
    text = re.sub(r'^#{1,6}[ \t]*', '', text, flags=re.MULTILINE)

    return _tables_to_bullets(text)
62
+
63
  def segment_response_by_sections(response_text):
64
  """Segment response by section titles and return a dictionary of sections"""
65
  required_sections = [
 
756
  - Academic vs. conversational vocabulary ratio
757
  - Age-appropriate vocabulary development
758
 
759
+ 7. NLP-DERIVED LINGUISTIC FEATURES (use bullet lists, NO tables)
760
 
761
+ A. Lexical Diversity Measures (provide exact calculations as bullet points):
762
  - Type-Token Ratio (TTR): Unique words divided by total words
763
  * Calculate: [unique words] / [total words] = [ratio]
764
  * Interpretation: Higher ratios indicate greater lexical diversity
 
771
  * Controls for text length effects
772
  * Provide HDD score and clinical significance
773
 
774
+ B. Word Frequency Analysis (as bullet list, not table):
775
+ - Most frequent words used: List top 10 as "word (count)" format
776
  - High-frequency vs. low-frequency word distribution
777
  - Function words vs. content words ratio
778
  - Repetitive word patterns and their clinical significance
779
 
780
+ C. Linguistic Complexity Indicators (bullet format):
781
  - Average word length in syllables
782
  - Syllable complexity patterns
783
  - Morphological complexity index
 
1534
  # Combine all sections in the correct order
1535
  final_response = combine_sections_smartly(all_sections)
1536
 
1537
+ # Clean formatting: remove asterisks, hashtags, and fix table formatting
1538
+ final_response = clean_output_formatting(final_response)
1539
+
1540
  # Log final results
1541
  print(f"\n=== FINAL SMART VALIDATION ===")
1542
  print(f"Total sections found: {len(all_sections)}")
 
1546
  print(f"Total API calls: {continuation_count + 1}")
1547
  print("=" * 50)
1548
 
1549
+ # Add completion message
1550
+ if len(all_sections) == 13:
1551
+ print("✅ ANALYSIS COMPLETE - All 13 sections generated successfully!")
1552
+ print("📄 Output has been cleaned (removed asterisks, hashtags, converted tables to lists)")
1553
+ else:
1554
+ print(f"⚠️ ANALYSIS INCOMPLETE - {13 - len(all_sections)} sections missing")
1555
+
1556
  # Add completion indicator with safety info
1557
  if continuation_count > 0:
1558
  final_response += f"\n\n[Analysis completed in {continuation_count + 1} parts over {(time.time() - start_time) / 60:.1f} minutes]"