Spaces:

grasant
/

fhirflame

Sleeping

fhirflame / src /monitoring.py

leksval

initial commit

a963d65 19 days ago

30.6 kB

	"""
	FhirFlame Unified Monitoring and Observability
	Comprehensive Langfuse integration for medical AI workflows with centralized monitoring
	"""

	import time
	import json
	from typing import Dict, Any, Optional, List, Union
	from functools import wraps
	from contextlib import contextmanager

	# Langfuse monitoring with environment configuration
	try:
	import os
	import sys
	from dotenv import load_dotenv
	load_dotenv() # Load environment variables

	# Comprehensive test environment detection
	is_testing = (
	os.getenv("DISABLE_LANGFUSE") == "true" or
	os.getenv("PYTEST_RUNNING") == "true" or
	os.getenv("PYTEST_CURRENT_TEST") is not None or
	"pytest" in str(sys.argv) or
	"pytest" in os.getenv("_", "") or
	"test" in os.path.basename(os.getenv("_", "")) or
	any("pytest" in arg for arg in sys.argv) or
	any("test" in arg for arg in sys.argv)
	)

	if is_testing:
	print("🧪 Test environment detected - disabling Langfuse")
	langfuse = None
	LANGFUSE_AVAILABLE = False
	else:
	try:
	from langfuse import Langfuse

	# Check if Langfuse is properly configured
	secret_key = os.getenv("LANGFUSE_SECRET_KEY")
	public_key = os.getenv("LANGFUSE_PUBLIC_KEY")
	host = os.getenv("LANGFUSE_HOST", "https://cloud.langfuse.com")

	if not secret_key or not public_key:
	print("⚠️ Langfuse keys not configured - using local monitoring only")
	langfuse = None
	LANGFUSE_AVAILABLE = False
	else:
	# Initialize with environment variables and timeout settings
	try:
	langfuse = Langfuse(
	secret_key=secret_key,
	public_key=public_key,
	host=host,
	timeout=2 # Very short timeout for faster failure detection
	)

	# Test connection with a simple call
	try:
	# Quick health check - if this fails, disable Langfuse
	# Use the newer Langfuse API for health check
	if hasattr(langfuse, 'trace'):
	test_trace = langfuse.trace(name="connection_test")
	if test_trace:
	test_trace.update(output={"status": "connection_ok"})
	else:
	# Fallback: just test if the client exists
	_ = str(langfuse)
	LANGFUSE_AVAILABLE = True
	print(f"🔍 Langfuse initialized: {host}")
	except Exception as connection_error:
	print(f"⚠️ Langfuse connection test failed: {connection_error}")
	print("🔄 Continuing with local-only monitoring...")
	langfuse = None
	LANGFUSE_AVAILABLE = False

	except Exception as init_error:
	print(f"⚠️ Langfuse client initialization failed: {init_error}")
	print("🔄 Continuing with local-only monitoring...")
	langfuse = None
	LANGFUSE_AVAILABLE = False
	except Exception as langfuse_error:
	print(f"⚠️ Langfuse initialization failed: {langfuse_error}")
	langfuse = None
	LANGFUSE_AVAILABLE = False

	except ImportError:
	langfuse = None
	LANGFUSE_AVAILABLE = False
	print("⚠️ Langfuse package not available - using local monitoring only")
	except Exception as e:
	langfuse = None
	LANGFUSE_AVAILABLE = False
	print(f"⚠️ Langfuse initialization failed: {e}")
	print(f"🔄 Continuing with local-only monitoring...")

	# LangChain monitoring
	try:
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	LANGCHAIN_AVAILABLE = True
	except ImportError:
	LANGCHAIN_AVAILABLE = False

	class FhirFlameMonitor:
	"""Comprehensive monitoring for FhirFlame medical AI workflows"""

	def __init__(self):
	self.langfuse = langfuse if LANGFUSE_AVAILABLE else None
	self.session_id = f"fhirflame_{int(time.time())}" if self.langfuse else None

	def track_operation(self, operation_name: str):
	"""Universal decorator to track any operation"""
	def decorator(func):
	@wraps(func)
	async def wrapper(args, *kwargs):
	start_time = time.time()
	trace = None

	if self.langfuse:
	try:
	# Use newer Langfuse API if available
	if hasattr(self.langfuse, 'trace'):
	trace = self.langfuse.trace(
	name=operation_name,
	session_id=self.session_id
	)
	else:
	trace = None
	except Exception:
	trace = None

	try:
	result = await func(args, *kwargs)
	processing_time = time.time() - start_time

	if trace:
	trace.update(
	output={"status": "success", "processing_time": processing_time},
	metadata={"operation": operation_name}
	)

	return result

	except Exception as e:
	if trace:
	trace.update(
	output={"status": "error", "error": str(e)},
	metadata={"processing_time": time.time() - start_time}
	)
	raise

	return wrapper
	return decorator

	def log_event(self, event_name: str, properties: Dict[str, Any]):
	"""Log any event with properties"""

	# LOCAL DEBUG: write log to local file
	try:
	import os
	os.makedirs('/app/logs', exist_ok=True)
	with open('/app/logs/debug_events.log', 'a') as f:
	f.write(f"{time.time()} {event_name} {json.dumps(properties)}\n")
	except Exception:
	pass
	if self.langfuse:
	try:
	# Use newer Langfuse API if available
	if hasattr(self.langfuse, 'event'):
	self.langfuse.event(
	name=event_name,
	properties=properties,
	session_id=self.session_id
	)
	elif hasattr(self.langfuse, 'log'):
	# Fallback to older API
	self.langfuse.log(
	level="INFO",
	message=event_name,
	extra=properties
	)
	except Exception:
	# Silently fail for logging to avoid disrupting workflow
	# Disable Langfuse for this session if it keeps failing
	self.langfuse = None

	# === AI MODEL PROCESSING MONITORING ===

	def log_ollama_api_call(self, model: str, url: str, prompt_length: int, success: bool = True, response_time: float = 0.0, status_code: int = 200, error: str = None):
	"""Log Ollama API call details"""
	self.log_event("ollama_api_call", {
	"model": model,
	"url": url,
	"prompt_length": prompt_length,
	"success": success,
	"response_time": response_time,
	"status_code": status_code,
	"error": error,
	"api_type": "ollama_generate"
	})

	def log_ai_generation(self, model: str, response_length: int, processing_time: float, entities_found: int, confidence: float, processing_mode: str):
	"""Log AI text generation results"""
	self.log_event("ai_generation_complete", {
	"model": model,
	"response_length": response_length,
	"processing_time": processing_time,
	"entities_found": entities_found,
	"confidence_score": confidence,
	"processing_mode": processing_mode,
	"generation_type": "medical_entity_extraction"
	})

	def log_ai_parsing(self, success: bool, response_format: str, entities_extracted: int, parsing_time: float, error: str = None):
	"""Log AI response parsing results"""
	self.log_event("ai_response_parsing", {
	"parsing_success": success,
	"response_format": response_format,
	"entities_extracted": entities_extracted,
	"parsing_time": parsing_time,
	"error": error,
	"parser_type": "json_medical_extractor"
	})

	def log_data_transformation(self, input_format: str, output_format: str, entities_transformed: int, transformation_time: float, complex_nested: bool = False):
	"""Log data transformation operations"""
	self.log_event("data_transformation", {
	"input_format": input_format,
	"output_format": output_format,
	"entities_transformed": entities_transformed,
	"transformation_time": transformation_time,
	"complex_nested_input": complex_nested,
	"transformer_type": "ai_to_pydantic"
	})

	# === MEDICAL PROCESSING MONITORING ===

	def log_medical_processing(self, entities_found: int, confidence: float, processing_time: float, processing_mode: str = "unknown", model_used: str = "codellama:13b-instruct"):
	"""Log medical processing results"""
	self.log_event("medical_processing_complete", {
	"entities_found": entities_found,
	"confidence_score": confidence,
	"processing_time": processing_time,
	"processing_mode": processing_mode,
	"model_used": model_used,
	"extraction_type": "clinical_entities"
	})

	def log_medical_entity_extraction(self, conditions: int, medications: int, vitals: int, procedures: int, patient_info_found: bool, confidence: float):
	"""Log detailed medical entity extraction"""
	self.log_event("medical_entity_extraction", {
	"conditions_found": conditions,
	"medications_found": medications,
	"vitals_found": vitals,
	"procedures_found": procedures,
	"patient_info_extracted": patient_info_found,
	"total_entities": conditions + medications + vitals + procedures,
	"confidence_score": confidence,
	"extraction_category": "clinical_data"
	})

	def log_rule_based_processing(self, entities_found: int, conditions: int, medications: int, vitals: int, confidence: float, processing_time: float):
	"""Log rule-based processing fallback"""
	self.log_event("rule_based_processing_complete", {
	"total_entities": entities_found,
	"conditions_found": conditions,
	"medications_found": medications,
	"vitals_found": vitals,
	"confidence_score": confidence,
	"processing_time": processing_time,
	"processing_mode": "rule_based_fallback",
	"fallback_triggered": True
	})

	# === FHIR VALIDATION MONITORING ===

	def log_fhir_validation(self, is_valid: bool, compliance_score: float, validation_level: str, fhir_version: str = "R4", resource_types: List[str] = None):
	"""Log FHIR validation results"""
	self.log_event("fhir_validation_complete", {
	"is_valid": is_valid,
	"compliance_score": compliance_score,
	"validation_level": validation_level,
	"fhir_version": fhir_version,
	"resource_types": resource_types or [],
	"validation_type": "bundle_validation"
	})

	def log_fhir_structure_validation(self, structure_valid: bool, resource_types: List[str], validation_time: float, errors: List[str] = None):
	"""Log FHIR structure validation"""
	self.log_event("fhir_structure_validation", {
	"structure_valid": structure_valid,
	"resource_types_detected": resource_types,
	"validation_time": validation_time,
	"error_count": len(errors) if errors else 0,
	"validation_errors": errors or [],
	"validator_type": "pydantic_fhir"
	})

	def log_fhir_terminology_validation(self, terminology_valid: bool, codes_validated: int, loinc_found: bool, snomed_found: bool, validation_time: float):
	"""Log FHIR terminology validation"""
	self.log_event("fhir_terminology_validation", {
	"terminology_valid": terminology_valid,
	"codes_validated": codes_validated,
	"loinc_codes_found": loinc_found,
	"snomed_codes_found": snomed_found,
	"validation_time": validation_time,
	"coding_systems": ["LOINC" if loinc_found else "", "SNOMED" if snomed_found else ""],
	"validator_type": "medical_terminology"
	})

	def log_hipaa_compliance_check(self, is_compliant: bool, phi_protected: bool, security_met: bool, validation_time: float, errors: List[str] = None):
	"""Log HIPAA compliance validation"""
	self.log_event("hipaa_compliance_check", {
	"hipaa_compliant": is_compliant,
	"phi_properly_protected": phi_protected,
	"security_requirements_met": security_met,
	"validation_time": validation_time,
	"compliance_errors": errors or [],
	"compliance_level": "healthcare_grade",
	"validator_type": "hipaa_checker"
	})

	def log_fhir_bundle_generation(self, patient_resources: int, condition_resources: int, observation_resources: int, generation_time: float, success: bool):
	"""Log FHIR bundle generation"""
	self.log_event("fhir_bundle_generation", {
	"patient_resources": patient_resources,
	"condition_resources": condition_resources,
	"observation_resources": observation_resources,
	"total_resources": patient_resources + condition_resources + observation_resources,
	"generation_time": generation_time,
	"generation_success": success,
	"bundle_type": "document",
	"generator_type": "pydantic_fhir"
	})

	# === WORKFLOW MONITORING ===

	def log_document_processing_start(self, document_type: str, text_length: int, extract_entities: bool, generate_fhir: bool):
	"""Log start of document processing"""
	self.log_event("document_processing_start", {
	"document_type": document_type,
	"text_length": text_length,
	"extract_entities": extract_entities,
	"generate_fhir": generate_fhir,
	"workflow_stage": "initialization"
	})

	def log_document_processing_complete(self, success: bool, processing_time: float, entities_found: int, fhir_generated: bool, quality_score: float):
	"""Log completion of document processing"""
	self.log_event("document_processing_complete", {
	"processing_success": success,
	"total_processing_time": processing_time,
	"entities_extracted": entities_found,
	"fhir_bundle_generated": fhir_generated,
	"quality_score": quality_score,
	"workflow_stage": "completion"
	})

	def log_workflow_summary(self, documents_processed: int, successful_documents: int, total_time: float, average_time: float, monitoring_active: bool):
	"""Log overall workflow summary"""
	self.log_event("workflow_summary", {
	"documents_processed": documents_processed,
	"successful_documents": successful_documents,
	"failed_documents": documents_processed - successful_documents,
	"success_rate": successful_documents / documents_processed if documents_processed > 0 else 0,
	"total_processing_time": total_time,
	"average_time_per_document": average_time,
	"monitoring_active": monitoring_active,
	"workflow_type": "real_medical_processing"
	})

	def log_mcp_tool(self, tool_name: str, success: bool, processing_time: float, input_size: int = 0, entities_found: int = 0):
	"""Log MCP tool execution"""
	self.log_event("mcp_tool_execution", {
	"tool_name": tool_name,
	"success": success,
	"processing_time": processing_time,
	"input_size": input_size,
	"entities_found": entities_found,
	"mcp_protocol_version": "2024-11-05"
	})

	def log_mcp_server_start(self, server_name: str, tools_count: int, port: int):
	"""Log MCP server startup"""
	self.log_event("mcp_server_startup", {
	"server_name": server_name,
	"tools_available": tools_count,
	"port": port,
	"protocol": "mcp_2024"
	})

	def log_mcp_authentication(self, auth_method: str, success: bool, user_id: str = None):
	"""Log MCP authentication events"""
	self.log_event("mcp_authentication", {
	"auth_method": auth_method,
	"success": success,
	"user_id": user_id or "anonymous",
	"security_level": "a2a_api"
	})

	# === MISTRAL OCR MONITORING ===

	def log_mistral_ocr_processing(self, document_size: int, extraction_time: float, success: bool, text_length: int = 0, error: str = None):
	"""Log Mistral OCR API processing"""
	self.log_event("mistral_ocr_processing", {
	"document_size_bytes": document_size,
	"extraction_time": extraction_time,
	"success": success,
	"extracted_text_length": text_length,
	"error": error,
	"ocr_provider": "mistral_api"
	})

	def log_ocr_workflow_integration(self, ocr_method: str, agent_processing_time: float, total_workflow_time: float, entities_found: int):
	"""Log complete OCR → Agent workflow integration"""
	self.log_event("ocr_workflow_integration", {
	"ocr_method": ocr_method,
	"agent_processing_time": agent_processing_time,
	"total_workflow_time": total_workflow_time,
	"entities_extracted": entities_found,
	"workflow_type": "ocr_to_agent_pipeline"
	})

	# === A2A API MONITORING ===

	def log_a2a_api_request(self, endpoint: str, method: str, auth_method: str, request_size: int, user_id: str = None):
	"""Log A2A API request"""
	self.log_event("a2a_api_request", {
	"endpoint": endpoint,
	"method": method,
	"auth_method": auth_method,
	"request_size_bytes": request_size,
	"user_id": user_id or "anonymous",
	"api_version": "v1.0"
	})

	def log_a2a_api_response(self, endpoint: str, status_code: int, response_time: float, success: bool, entities_processed: int = 0):
	"""Log A2A API response"""
	self.log_event("a2a_api_response", {
	"endpoint": endpoint,
	"status_code": status_code,
	"response_time": response_time,
	"success": success,
	"entities_processed": entities_processed,
	"api_type": "rest_a2a"
	})

	def log_a2a_authentication(self, auth_provider: str, success: bool, auth_time: float, user_claims: Dict[str, Any] = None):
	"""Log A2A authentication events"""
	self.log_event("a2a_authentication", {
	"auth_provider": auth_provider,
	"success": success,
	"auth_time": auth_time,
	"user_claims": user_claims or {},
	"security_level": "production" if auth_provider == "auth0" else "development"
	})

	# === MODAL SCALING MONITORING ===

	def log_modal_function_call(self, function_name: str, gpu_type: str, processing_time: float, cost_estimate: float, container_id: str):
	"""Log Modal function execution"""
	self.log_event("modal_function_call", {
	"function_name": function_name,
	"gpu_type": gpu_type,
	"processing_time": processing_time,
	"cost_estimate": cost_estimate,
	"container_id": container_id,
	"cloud_provider": "modal_labs"
	})

	def log_modal_scaling_event(self, event_type: str, container_count: int, gpu_utilization: str, auto_scaling: bool):
	"""Log Modal auto-scaling events"""
	self.log_event("modal_scaling_event", {
	"event_type": event_type, # scale_up, scale_down, container_start, container_stop
	"container_count": container_count,
	"gpu_utilization": gpu_utilization,
	"auto_scaling_active": auto_scaling,
	"scaling_provider": "modal_l4"
	})

	def log_modal_deployment(self, app_name: str, functions_deployed: int, success: bool, deployment_time: float):
	"""Log Modal deployment events"""
	self.log_event("modal_deployment", {
	"app_name": app_name,
	"functions_deployed": functions_deployed,
	"deployment_success": success,
	"deployment_time": deployment_time,
	"deployment_target": "modal_serverless"
	})

	def log_modal_cost_tracking(self, daily_cost: float, requests_processed: int, cost_per_request: float, gpu_hours: float):
	"""Log Modal cost analytics"""
	self.log_event("modal_cost_tracking", {
	"daily_cost": daily_cost,
	"requests_processed": requests_processed,
	"cost_per_request": cost_per_request,
	"gpu_hours_used": gpu_hours,
	"cost_optimization": "l4_gpu_auto_scaling"
	})

	# === DOCKER DEPLOYMENT MONITORING ===

	def log_docker_deployment(self, compose_file: str, services_started: int, success: bool, startup_time: float):
	"""Log Docker Compose deployment"""
	self.log_event("docker_deployment", {
	"compose_file": compose_file,
	"services_started": services_started,
	"deployment_success": success,
	"startup_time": startup_time,
	"deployment_type": "docker_compose"
	})

	def log_docker_service_health(self, service_name: str, status: str, response_time: float, healthy: bool):
	"""Log Docker service health checks"""
	self.log_event("docker_service_health", {
	"service_name": service_name,
	"status": status,
	"response_time": response_time,
	"healthy": healthy,
	"monitoring_type": "health_check"
	})

	# === ERROR AND PERFORMANCE MONITORING ===

	def log_error_event(self, error_type: str, error_message: str, stack_trace: str, component: str, severity: str = "error"):
	"""Log error events with context"""
	self.log_event("error_event", {
	"error_type": error_type,
	"error_message": error_message,
	"stack_trace": stack_trace,
	"component": component,
	"severity": severity,
	"timestamp": time.time()
	})

	def log_performance_metrics(self, component: str, cpu_usage: float, memory_usage: float, response_time: float, throughput: float):
	"""Log performance metrics"""
	self.log_event("performance_metrics", {
	"component": component,
	"cpu_usage_percent": cpu_usage,
	"memory_usage_mb": memory_usage,
	"response_time": response_time,
	"throughput_requests_per_second": throughput,
	"metrics_type": "system_performance"
	})

	# === LANGFUSE TRACE UTILITIES ===

	def create_langfuse_trace(self, name: str, input_data: Dict[str, Any] = None, session_id: str = None) -> Any:
	"""Create a Langfuse trace if available"""
	if self.langfuse:
	try:
	return self.langfuse.trace(
	name=name,
	input=input_data or {},
	session_id=session_id or self.session_id
	)
	except Exception:
	return None
	return None

	def update_langfuse_trace(self, trace: Any, output: Dict[str, Any] = None, metadata: Dict[str, Any] = None):
	"""Update a Langfuse trace if available"""
	if trace and self.langfuse:
	try:
	trace.update(
	output=output or {},
	metadata=metadata or {}
	)
	except Exception:
	pass

	def get_monitoring_status(self) -> Dict[str, Any]:
	"""Get comprehensive monitoring status"""
	return {
	"langfuse_enabled": self.langfuse is not None,
	"session_id": self.session_id,
	"langfuse_host": os.getenv("LANGFUSE_HOST", "https://cloud.langfuse.com") if self.langfuse else None,
	"monitoring_active": True,
	"events_logged": True,
	"trace_collection": "enabled" if self.langfuse else "disabled"
	}

	@contextmanager
	def trace_operation(self, operation_name: str, input_data: Dict[str, Any] = None):
	"""Context manager for tracing operations"""
	trace = None
	if self.langfuse:
	try:
	trace = self.langfuse.trace(
	name=operation_name,
	input=input_data or {},
	session_id=self.session_id
	)
	except Exception:
	# Silently fail trace creation to avoid disrupting workflow
	trace = None

	start_time = time.time()
	try:
	yield trace
	except Exception as e:
	if trace:
	try:
	trace.update(
	output={"error": str(e), "status": "failed"},
	metadata={"processing_time": time.time() - start_time}
	)
	except Exception:
	# Silently fail trace update
	pass
	raise
	else:
	if trace:
	try:
	trace.update(
	metadata={"processing_time": time.time() - start_time, "status": "completed"}
	)
	except Exception:
	# Silently fail trace update
	pass

	@contextmanager
	def trace_ai_processing(self, model: str, text_length: int, temperature: float, max_tokens: int):
	"""Context manager specifically for AI processing operations"""
	with self.trace_operation("ai_model_processing", {
	"model": model,
	"input_length": text_length,
	"temperature": temperature,
	"max_tokens": max_tokens,
	"processing_type": "medical_extraction"
	}) as trace:
	yield trace

	@contextmanager
	def trace_fhir_validation(self, validation_level: str, resource_count: int):
	"""Context manager specifically for FHIR validation operations"""
	with self.trace_operation("fhir_validation_process", {
	"validation_level": validation_level,
	"resource_count": resource_count,
	"fhir_version": "R4",
	"validation_type": "comprehensive"
	}) as trace:
	yield trace

	@contextmanager
	def trace_document_workflow(self, document_type: str, text_length: int):
	"""Context manager for complete document processing workflow"""
	with self.trace_operation("document_processing_workflow", {
	"document_type": document_type,
	"text_length": text_length,
	"workflow_type": "end_to_end_medical"
	}) as trace:
	yield trace

	def get_langchain_callback(self):
	"""Get LangChain callback handler for monitoring"""
	if LANGCHAIN_AVAILABLE and self.langfuse:
	try:
	return self.langfuse.get_langchain_callback(session_id=self.session_id)
	except Exception:
	return None
	return None

	def process_with_langchain(self, text: str, operation: str = "document_processing"):
	"""Process text using LangChain with monitoring"""
	if not LANGCHAIN_AVAILABLE:
	return {"processed_text": text, "chunks": [text]}

	try:
	splitter = RecursiveCharacterTextSplitter(
	chunk_size=1000,
	chunk_overlap=100,
	separators=["\n\n", "\n", ".", " "]
	)

	chunks = splitter.split_text(text)

	self.log_event("langchain_processing", {
	"operation": operation,
	"chunk_count": len(chunks),
	"total_length": len(text)
	})

	return {"processed_text": text, "chunks": chunks}

	except Exception as e:
	self.log_event("langchain_error", {"error": str(e), "operation": operation})
	return {"processed_text": text, "chunks": [text], "error": str(e)}

	# Global monitor instance
	monitor = FhirFlameMonitor()

	# Convenience decorators
	def track_medical_processing(operation: str):
	"""Convenience decorator for medical processing tracking"""
	return monitor.track_operation(f"medical_{operation}")

	def track_performance(func):
	"""Decorator to track function performance"""
	@wraps(func)
	async def wrapper(args, *kwargs):
	start_time = time.time()
	result = await func(args, *kwargs)
	processing_time = time.time() - start_time

	monitor.log_event("performance", {
	"function": func.__name__,
	"processing_time": processing_time
	})

	return result
	return wrapper

	# Make available for import
	__all__ = ["FhirFlameMonitor", "monitor", "track_medical_processing", "track_performance"]