Update annotated_casl_app.py
annotated_casl_app.py
CHANGED (+57 -5)
@@ -18,6 +18,48 @@ if ANTHROPIC_API_KEY:
 else:
     logger.warning("Claude API key not found - using demo mode")
 
+def clean_output_formatting(text):
+    """Remove asterisks, hashtags, and convert tables to lists in NLP section"""
+    import re
+
+    # Remove all asterisks (bolding)
+    text = re.sub(r'\*\*([^*]+)\*\*', r'\1', text)  # **text** -> text
+    text = re.sub(r'\*([^*]+)\*', r'\1', text)      # *text* -> text
+    text = text.replace('**', '')  # Remove any remaining **
+    text = text.replace('*', '')   # Remove any remaining *
+
+    # Remove hashtags from headers
+    text = re.sub(r'^#{1,6}\s*', '', text, flags=re.MULTILINE)  # Remove ### headers
+
+    # Convert tables to lists - more comprehensive approach
+    lines = text.split('\n')
+    cleaned_lines = []
+    in_table = False
+
+    for line in lines:
+        # Detect table start (line with multiple |)
+        if line.count('|') >= 2 and not in_table:
+            in_table = True
+            # Skip header line, will process data rows
+            continue
+        elif line.count('|') >= 2 and in_table:
+            # This is a table row - convert to bullet point
+            if not re.match(r'^\s*\|[\s\-\|]+\|\s*$', line):  # Skip separator lines
+                cells = [cell.strip() for cell in line.split('|') if cell.strip()]
+                if len(cells) >= 2:
+                    cleaned_lines.append(f"- {cells[0]}: {' '.join(cells[1:])}")
+        elif in_table and line.count('|') < 2:
+            # End of table
+            in_table = False
+            cleaned_lines.append(line)
+        else:
+            # Regular line
+            cleaned_lines.append(line)
+
+    text = '\n'.join(cleaned_lines)
+
+    return text
+
 def segment_response_by_sections(response_text):
     """Segment response by section titles and return a dictionary of sections"""
     required_sections = [
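Note: a quick sanity check of the new helper, as a standalone sketch. The sample Markdown and expected output are invented for illustration; clean_output_formatting is the function added in the hunk above:

    sample = (
        "### Lexical Diversity\n"
        "**TTR** was *within normal limits*.\n"
        "| Metric | Value |\n"
        "|--------|-------|\n"
        "| TTR | 0.48 |\n"
        "| HDD | 0.72 |\n"
        "Summary follows."
    )
    print(clean_output_formatting(sample))
    # Lexical Diversity
    # TTR was within normal limits.
    # - TTR: 0.48
    # - HDD: 0.72
    # Summary follows.

One caveat worth knowing: the first pipe-bearing line of a table (the header row) is skipped outright, so header text is dropped rather than converted.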
@@ -714,9 +756,9 @@ def analyze_annotated_transcript(annotated_transcript, age, gender, slp_notes):
     - Academic vs. conversational vocabulary ratio
     - Age-appropriate vocabulary development
 
-    7. NLP-DERIVED LINGUISTIC FEATURES
+    7. NLP-DERIVED LINGUISTIC FEATURES (use bullet lists, NO tables)
 
-    A. Lexical Diversity Measures (provide exact calculations):
+    A. Lexical Diversity Measures (provide exact calculations as bullet points):
     - Type-Token Ratio (TTR): Unique words divided by total words
       * Calculate: [unique words] / [total words] = [ratio]
       * Interpretation: Higher ratios indicate greater lexical diversity
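Note: the TTR the prompt asks the model to calculate is simple enough to express directly. A minimal sketch with a hypothetical helper (not part of the app, which delegates the arithmetic to the model; tokenization here is naive lowercasing and whitespace splitting):

    def type_token_ratio(text: str) -> float:
        tokens = text.lower().split()
        return len(set(tokens)) / len(tokens) if tokens else 0.0

    print(type_token_ratio("the dog saw the big dog"))  # 4 unique / 6 total = 0.67 (rounded)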
@@ -729,13 +771,13 @@ def analyze_annotated_transcript(annotated_transcript, age, gender, slp_notes):
       * Controls for text length effects
       * Provide HDD score and clinical significance
 
-    B. Word Frequency Analysis:
-    - Most frequent words used
+    B. Word Frequency Analysis (as bullet list, not table):
+    - Most frequent words used: List top 10 as "word (count)" format
     - High-frequency vs. low-frequency word distribution
     - Function words vs. content words ratio
     - Repetitive word patterns and their clinical significance
 
-    C. Linguistic Complexity Indicators:
+    C. Linguistic Complexity Indicators (bullet format):
     - Average word length in syllables
     - Syllable complexity patterns
     - Morphological complexity index
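Note: the "word (count)" format the revised part B asks for can be mirrored with collections.Counter. A hypothetical illustration, not code from the app:

    from collections import Counter

    tokens = "the dog saw the dog".lower().split()
    top_ten = Counter(tokens).most_common(10)
    print(", ".join(f"{word} ({count})" for word, count in top_ten))
    # the (2), dog (2), saw (1)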
@@ -1492,6 +1534,9 @@ def call_claude_api_with_continuation(prompt):
     # Combine all sections in the correct order
     final_response = combine_sections_smartly(all_sections)
 
+    # Clean formatting: remove asterisks, hashtags, and fix table formatting
+    final_response = clean_output_formatting(final_response)
+
     # Log final results
     print(f"\n=== FINAL SMART VALIDATION ===")
     print(f"Total sections found: {len(all_sections)}")
@@ -1501,6 +1546,13 @@ def call_claude_api_with_continuation(prompt):
     print(f"Total API calls: {continuation_count + 1}")
     print("=" * 50)
 
+    # Add completion message
+    if len(all_sections) == 13:
+        print("✅ ANALYSIS COMPLETE - All 13 sections generated successfully!")
+        print("📄 Output has been cleaned (removed asterisks, hashtags, converted tables to lists)")
+    else:
+        print(f"⚠️ ANALYSIS INCOMPLETE - {13 - len(all_sections)} sections missing")
+
     # Add completion indicator with safety info
     if continuation_count > 0:
         final_response += f"\n\n[Analysis completed in {continuation_count + 1} parts over {(time.time() - start_time) / 60:.1f} minutes]"