Update annotated_casl_app.py
annotated_casl_app.py
CHANGED (+57 -5)
@@ -18,6 +18,48 @@ if ANTHROPIC_API_KEY:
 else:
     logger.warning("Claude API key not found - using demo mode")
 
+def clean_output_formatting(text):
+    """Remove asterisks, hashtags, and convert tables to lists in NLP section"""
+    import re
+
+    # Remove all asterisks (bolding)
+    text = re.sub(r'\*\*([^*]+)\*\*', r'\1', text)  # **text** -> text
+    text = re.sub(r'\*([^*]+)\*', r'\1', text)      # *text* -> text
+    text = text.replace('**', '')  # Remove any remaining **
+    text = text.replace('*', '')   # Remove any remaining *
+
+    # Remove hashtags from headers
+    text = re.sub(r'^#{1,6}\s*', '', text, flags=re.MULTILINE)  # Remove ### headers
+
+    # Convert tables to lists - more comprehensive approach
+    lines = text.split('\n')
+    cleaned_lines = []
+    in_table = False
+
+    for line in lines:
+        # Detect table start (line with multiple |)
+        if line.count('|') >= 2 and not in_table:
+            in_table = True
+            # Skip header line, will process data rows
+            continue
+        elif line.count('|') >= 2 and in_table:
+            # This is a table row - convert to bullet point
+            if not re.match(r'^\s*\|[\s\-\|]+\|\s*$', line):  # Skip separator lines
+                cells = [cell.strip() for cell in line.split('|') if cell.strip()]
+                if len(cells) >= 2:
+                    cleaned_lines.append(f"- {cells[0]}: {' '.join(cells[1:])}")
+        elif in_table and line.count('|') < 2:
+            # End of table
+            in_table = False
+            cleaned_lines.append(line)
+        else:
+            # Regular line
+            cleaned_lines.append(line)
+
+    text = '\n'.join(cleaned_lines)
+
+    return text
+
 def segment_response_by_sections(response_text):
     """Segment response by section titles and return a dictionary of sections"""
     required_sections = [
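Note: a quick sanity check of the new helper, as a standalone sketch. The sample Markdown and expected output are invented for illustration; clean_output_formatting is the function added in the hunk above:

    sample = (
        "### Lexical Diversity\n"
        "**TTR** was *within normal limits*.\n"
        "| Metric | Value |\n"
        "|--------|-------|\n"
        "| TTR | 0.48 |\n"
        "| HDD | 0.72 |\n"
        "Summary follows."
    )
    print(clean_output_formatting(sample))
    # Lexical Diversity
    # TTR was within normal limits.
    # - TTR: 0.48
    # - HDD: 0.72
    # Summary follows.

One caveat worth knowing: the first pipe-bearing line of a table (the header row) is skipped outright, so header text is dropped rather than converted.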
@@ -714,9 +756,9 @@ def analyze_annotated_transcript(annotated_transcript, age, gender, slp_notes):
     - Academic vs. conversational vocabulary ratio
     - Age-appropriate vocabulary development
 
-    7. NLP-DERIVED LINGUISTIC FEATURES
+    7. NLP-DERIVED LINGUISTIC FEATURES (use bullet lists, NO tables)
 
-    A. Lexical Diversity Measures (provide exact calculations):
+    A. Lexical Diversity Measures (provide exact calculations as bullet points):
     - Type-Token Ratio (TTR): Unique words divided by total words
       * Calculate: [unique words] / [total words] = [ratio]
       * Interpretation: Higher ratios indicate greater lexical diversity
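Note: the TTR the prompt asks the model to calculate is simple enough to express directly. A minimal sketch with a hypothetical helper (not part of the app, which delegates the arithmetic to the model; tokenization here is naive lowercasing and whitespace splitting):

    def type_token_ratio(text: str) -> float:
        tokens = text.lower().split()
        return len(set(tokens)) / len(tokens) if tokens else 0.0

    print(type_token_ratio("the dog saw the big dog"))  # 4 unique / 6 total = 0.67 (rounded)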
@@ -729,13 +771,13 @@ def analyze_annotated_transcript(annotated_transcript, age, gender, slp_notes):
       * Controls for text length effects
       * Provide HDD score and clinical significance
 
-    B. Word Frequency Analysis:
-    - Most frequent words used
+    B. Word Frequency Analysis (as bullet list, not table):
+    - Most frequent words used: List top 10 as "word (count)" format
     - High-frequency vs. low-frequency word distribution
     - Function words vs. content words ratio
     - Repetitive word patterns and their clinical significance
 
-    C. Linguistic Complexity Indicators:
+    C. Linguistic Complexity Indicators (bullet format):
     - Average word length in syllables
     - Syllable complexity patterns
     - Morphological complexity index
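Note: the "word (count)" format the revised part B asks for can be mirrored with collections.Counter. A hypothetical illustration, not code from the app:

    from collections import Counter

    tokens = "the dog saw the dog".lower().split()
    top_ten = Counter(tokens).most_common(10)
    print(", ".join(f"{word} ({count})" for word, count in top_ten))
    # the (2), dog (2), saw (1)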
@@ -1492,6 +1534,9 @@ def call_claude_api_with_continuation(prompt):
     # Combine all sections in the correct order
     final_response = combine_sections_smartly(all_sections)
 
+    # Clean formatting: remove asterisks, hashtags, and fix table formatting
+    final_response = clean_output_formatting(final_response)
+
     # Log final results
     print(f"\n=== FINAL SMART VALIDATION ===")
     print(f"Total sections found: {len(all_sections)}")
@@ -1501,6 +1546,13 @@ def call_claude_api_with_continuation(prompt):
     print(f"Total API calls: {continuation_count + 1}")
     print("=" * 50)
 
+    # Add completion message
+    if len(all_sections) == 13:
+        print("✅ ANALYSIS COMPLETE - All 13 sections generated successfully!")
+        print("📄 Output has been cleaned (removed asterisks, hashtags, converted tables to lists)")
+    else:
+        print(f"⚠️ ANALYSIS INCOMPLETE - {13 - len(all_sections)} sections missing")
+
     # Add completion indicator with safety info
     if continuation_count > 0:
         final_response += f"\n\n[Analysis completed in {continuation_count + 1} parts over {(time.time() - start_time) / 60:.1f} minutes]"