Meet Patel committed on
Commit 14940e1 · 1 parent: 5e80e3b

Step 3: Added multi-modal interaction capabilities with text, voice, and handwriting processing

Files changed (3)
  1. main.py +109 -0
  2. utils/__init__.py +3 -0
  3. utils/multimodal.py +140 -0
main.py CHANGED
@@ -4,6 +4,14 @@ import json
 from typing import List, Dict, Any, Optional
 from datetime import datetime
 
+# Import utility functions for multi-modal interactions
+from utils.multimodal import (
+    process_text_query,
+    process_voice_input,
+    process_handwriting,
+    generate_speech_response
+)
+
 # Create the TutorX MCP server
 mcp = FastMCP("TutorX")
@@ -346,5 +354,106 @@ def update_accessibility_settings(student_id: str, settings: Dict[str, Any]) ->
         "updated_at": datetime.now().isoformat()
     }
 
+# ------------------ Multi-Modal Interaction ------------------
+
+@mcp.tool()
+def text_interaction(query: str, student_id: str, session_context: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
+    """
+    Process a text query from the student
+
+    Args:
+        query: The text query from the student
+        student_id: The student's unique identifier
+        session_context: Optional context about the current session
+
+    Returns:
+        Processed response
+    """
+    # Add student information to context
+    context = session_context or {}
+    context["student_id"] = student_id
+
+    return process_text_query(query, context)
+
+@mcp.tool()
+def voice_interaction(audio_data_base64: str, student_id: str) -> Dict[str, Any]:
+    """
+    Process voice input from the student
+
+    Args:
+        audio_data_base64: Base64 encoded audio data
+        student_id: The student's unique identifier
+
+    Returns:
+        Transcription and response
+    """
+    # Process voice input
+    result = process_voice_input(audio_data_base64)
+
+    # Process the transcription as a text query
+    text_response = process_text_query(result["transcription"], {"student_id": student_id})
+
+    # Generate speech response
+    speech_response = generate_speech_response(
+        text_response["response"],
+        {"voice_id": "educational_tutor"}
+    )
+
+    # Combine results
+    return {
+        "input_transcription": result["transcription"],
+        "input_confidence": result["confidence"],
+        "detected_emotions": result.get("detected_emotions", {}),
+        "text_response": text_response["response"],
+        "speech_response": speech_response,
+        "timestamp": datetime.now().isoformat()
+    }
+
+@mcp.tool()
+def handwriting_recognition(image_data_base64: str, student_id: str) -> Dict[str, Any]:
+    """
+    Process handwritten input from the student
+
+    Args:
+        image_data_base64: Base64 encoded image data of handwriting
+        student_id: The student's unique identifier
+
+    Returns:
+        Transcription and analysis
+    """
+    # Process handwriting input
+    result = process_handwriting(image_data_base64)
+
+    # If it's a math equation, solve it
+    if result["detected_content_type"] == "math_equation":
+        # In a real implementation, this would use a math engine to solve the equation
+        # For demonstration, we'll provide a simulated solution
+        if result["equation_type"] == "quadratic":
+            solution = {
+                "equation": result["transcription"],
+                "solution_steps": [
+                    "x^2 + 5x + 6 = 0",
+                    "Factor: (x + 2)(x + 3) = 0",
+                    "x + 2 = 0 or x + 3 = 0",
+                    "x = -2 or x = -3"
+                ],
+                "solutions": [-2, -3]
+            }
+        else:
+            solution = {
+                "equation": result["transcription"],
+                "note": "Solution not implemented for this equation type"
+            }
+    else:
+        solution = None
+
+    return {
+        "transcription": result["transcription"],
+        "confidence": result["confidence"],
+        "detected_content_type": result["detected_content_type"],
+        "solution": solution,
+        "timestamp": datetime.now().isoformat()
+    }
+
 if __name__ == "__main__":
     mcp.run()
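
For a quick sanity check outside an MCP client, the same pipeline the voice_interaction tool composes can be exercised through the underlying utility functions. A minimal sketch, assuming the utils package from this commit is importable (the audio payload and student ID are placeholder values; the simulated recognizer never decodes the bytes):

import base64
from utils.multimodal import (
    process_voice_input,
    process_text_query,
    generate_speech_response,
)

# Placeholder payload: process_voice_input returns a canned transcription
fake_audio = base64.b64encode(b"raw-audio-bytes").decode()

# Mirror the voice_interaction pipeline: ASR -> text query -> TTS
asr = process_voice_input(fake_audio)
answer = process_text_query(asr["transcription"], {"student_id": "student-001"})
speech = generate_speech_response(answer["response"], {"voice_id": "educational_tutor"})

print(asr["transcription"])     # "What is the quadratic formula?"
print(answer["response_type"])  # "definition" (the query contains "what is")
print(speech["voice_id"])       # "educational_tutor"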
utils/__init__.py ADDED
@@ -0,0 +1,3 @@
+"""
+TutorX MCP Server utilities.
+"""
utils/multimodal.py ADDED
@@ -0,0 +1,140 @@
+"""
+Utility functions for multi-modal interactions including text processing,
+voice recognition and handwriting recognition for the TutorX MCP server.
+"""
+
+from typing import Dict, Any, List, Optional
+import base64
+import json
+from datetime import datetime
+
+
+def process_text_query(query: str, context: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
+    """
+    Process a text query from the student
+
+    Args:
+        query: The text query from the student
+        context: Optional context about the student and current session
+
+    Returns:
+        Processed response
+    """
+    # In a real implementation, this would use NLP to understand the query
+    # and generate an appropriate response
+
+    # Simple keyword-based responses for demonstration
+    keywords = {
+        "solve": {
+            "type": "math_solution",
+            "response": "To solve this equation, first isolate the variable by..."
+        },
+        "what is": {
+            "type": "definition",
+            "response": "This concept refers to..."
+        },
+        "how do i": {
+            "type": "procedure",
+            "response": "Follow these steps: 1)..."
+        },
+        "help": {
+            "type": "assistance",
+            "response": "I'm here to help! You can ask me questions about..."
+        }
+    }
+
+    for key, value in keywords.items():
+        if key in query.lower():
+            return {
+                "query": query,
+                "response_type": value["type"],
+                "response": value["response"],
+                "confidence": 0.85,
+                "timestamp": datetime.now().isoformat()
+            }
+
+    # Default response if no keywords match
+    return {
+        "query": query,
+        "response_type": "general",
+        "response": "That's an interesting question. Let me think about how to help you with that.",
+        "confidence": 0.6,
+        "timestamp": datetime.now().isoformat()
+    }
+
+
+def process_voice_input(audio_data_base64: str) -> Dict[str, Any]:
+    """
+    Process voice input from the student
+
+    Args:
+        audio_data_base64: Base64 encoded audio data
+
+    Returns:
+        Transcription and analysis
+    """
+    # In a real implementation, this would use ASR to transcribe the audio
+    # and then process the transcribed text
+
+    # For demonstration purposes, we'll simulate a transcription
+    return {
+        "transcription": "What is the quadratic formula?",
+        "confidence": 0.92,
+        "detected_emotions": {
+            "confusion": 0.7,
+            "interest": 0.9,
+            "frustration": 0.2
+        },
+        "audio_quality": "good",
+        "background_noise": "low",
+        "timestamp": datetime.now().isoformat()
+    }
+
+
+def process_handwriting(image_data_base64: str) -> Dict[str, Any]:
+    """
+    Process handwritten input from the student
+
+    Args:
+        image_data_base64: Base64 encoded image data of handwriting
+
+    Returns:
+        Transcription and analysis
+    """
+    # In a real implementation, this would use OCR/handwriting recognition
+    # to transcribe the handwritten text or equations
+
+    # For demonstration purposes, we'll simulate a transcription
+    return {
+        "transcription": "x^2 + 5x + 6 = 0",
+        "confidence": 0.85,
+        "detected_content_type": "math_equation",
+        "equation_type": "quadratic",
+        "parsed_latex": "x^2 + 5x + 6 = 0",
+        "timestamp": datetime.now().isoformat()
+    }
+
+
+def generate_speech_response(text: str, voice_params: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
+    """
+    Generate a speech response from text
+
+    Args:
+        text: The text to convert to speech
+        voice_params: Parameters for the voice (gender, age, accent, etc.)
+
+    Returns:
+        Speech data and metadata
+    """
+    # In a real implementation, this would use TTS to generate audio
+
+    # Guard against a missing voice_params dict before calling .get()
+    voice_params = voice_params or {}
+
+    # For demonstration, we'll simulate audio generation metadata
+    return {
+        "text": text,
+        "audio_format": "mp3",
+        "audio_data_base64": "SIMULATED_BASE64_AUDIO_DATA",
+        "voice_id": voice_params.get("voice_id", "default"),
+        "duration_seconds": len(text) / 15,  # Rough estimate of speech duration
+        "sample_rate": 24000,
+        "timestamp": datetime.now().isoformat()
+    }
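
Similarly, the handwriting path can be smoke-tested by calling process_handwriting directly. A minimal sketch (the image bytes are a placeholder; the simulated recognizer always returns the quadratic example above):

import base64
from utils.multimodal import process_handwriting

# Placeholder payload: the simulated recognizer never decodes the bytes
fake_image = base64.b64encode(b"page-scan-bytes").decode()
result = process_handwriting(fake_image)

print(result["transcription"])          # "x^2 + 5x + 6 = 0"
print(result["detected_content_type"])  # "math_equation"
print(result["equation_type"])          # "quadratic"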