Spaces:

mknolan
/

gpu-diagnostic-test

Configuration error

App Files Files Community

gpu-diagnostic-test / app.py

mknolan

Add GPU diagnostic script

b1d4df2 verified 6 months ago

raw

history blame contribute delete

4.91 kB

	#!/usr/bin/env python3
	"""
	GPU Diagnostics Tool for Hugging Face Spaces
	This script performs a comprehensive check of GPU availability and functionality.
	"""

	import os
	import sys
	import subprocess
	import time
	import json

	print("=" * 80)
	print("GPU DIAGNOSTICS TOOL")
	print("=" * 80)

	# Check Python version
	print(f"Python version: {sys.version}")
	print("-" * 80)

	# Check environment variables
	print("ENVIRONMENT VARIABLES:")
	gpu_related_vars = [
	"CUDA_VISIBLE_DEVICES",
	"NVIDIA_VISIBLE_DEVICES",
	"PYTORCH_CUDA_ALLOC_CONF",
	"HF_HOME"
	]

	for var in gpu_related_vars:
	print(f"{var}: {os.environ.get(var, 'Not set')}")
	print("-" * 80)

	# Check for nvidia-smi
	print("CHECKING FOR NVIDIA-SMI:")
	try:
	result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
	if result.returncode == 0:
	print("nvidia-smi is available and working!")
	print(result.stdout)
	else:
	print("nvidia-smi error:")
	print(result.stderr)
	except Exception as e:
	print(f"Error running nvidia-smi: {str(e)}")
	print("-" * 80)

	# Check PyTorch and CUDA
	print("CHECKING PYTORCH AND CUDA:")
	try:
	import torch

	print(f"PyTorch version: {torch.__version__}")
	print(f"CUDA available: {torch.cuda.is_available()}")
	print(f"CUDA version: {torch.version.cuda if torch.cuda.is_available() else 'Not available'}")

	if torch.cuda.is_available():
	print(f"CUDA device count: {torch.cuda.device_count()}")
	for i in range(torch.cuda.device_count()):
	print(f"CUDA Device {i}: {torch.cuda.get_device_name(i)}")
	print(f"Current CUDA device: {torch.cuda.current_device()}")

	# Try to create and operate on a CUDA tensor
	print("\nTesting CUDA tensor creation:")
	try:
	start_time = time.time()
	x = torch.rand(1000, 1000, device="cuda" if torch.cuda.is_available() else "cpu")
	y = x @ x # Matrix multiplication to test computation
	torch.cuda.synchronize() # Wait for the operation to complete
	end_time = time.time()

	if torch.cuda.is_available():
	print(f"Successfully created and operated on a CUDA tensor in {end_time - start_time:.4f} seconds")
	else:
	print(f"Created and operated on a CPU tensor in {end_time - start_time:.4f} seconds (CUDA not available)")
	except Exception as e:
	print(f"Error in tensor creation/operation: {str(e)}")

	# Try to get more detailed CUDA info
	if torch.cuda.is_available():
	print("\nDetailed CUDA information:")
	print(f"CUDA capability: {torch.cuda.get_device_capability(0)}")
	print(f"Total GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
	print(f"CUDA arch list: {torch.cuda.get_arch_list() if hasattr(torch.cuda, 'get_arch_list') else 'Not available'}")
	except ImportError:
	print("PyTorch is not installed")
	print("-" * 80)

	# Create a simple GPU test with a web interface
	print("CREATING SIMPLE GPU TEST WEB INTERFACE...")
	try:
	import gradio as gr

	def check_gpu():
	results = {
	"python_version": sys.version,
	"environment_vars": {var: os.environ.get(var, "Not set") for var in gpu_related_vars},
	"torch_available": False,
	"cuda_available": False
	}

	try:
	import torch
	results["torch_available"] = True
	results["torch_version"] = torch.__version__
	results["cuda_available"] = torch.cuda.is_available()

	if torch.cuda.is_available():
	results["cuda_version"] = torch.version.cuda
	results["cuda_device_count"] = torch.cuda.device_count()
	results["cuda_device_name"] = torch.cuda.get_device_name(0)

	# Test tensor creation
	start_time = time.time()
	x = torch.rand(1000, 1000, device="cuda")
	y = x @ x
	torch.cuda.synchronize()
	end_time = time.time()
	results["tensor_test_time"] = f"{end_time - start_time:.4f} seconds"
	results["gpu_test_passed"] = True
	else:
	results["gpu_test_passed"] = False
	except Exception as e:
	results["error"] = str(e)
	results["gpu_test_passed"] = False

	return json.dumps(results, indent=2)

	demo = gr.Interface(
	fn=check_gpu,
	inputs=[],
	outputs="text",
	title="GPU Diagnostics",
	description="Click the button to run GPU diagnostics"
	)

	print("Starting Gradio web interface on port 7860...")
	demo.launch(server_name="0.0.0.0")
	except ImportError:
	print("Gradio not installed, skipping web interface")
	print("Raw GPU diagnostics complete.")
	print("-" * 80)