Spaces: Running on Zero

Commit: video to text added
Files changed:
- README.md (+2 -1)
- app.py (+162 -0)
- data/asl_classifier.pth (+3 -0)
- data/asl_landmarks_final.csv (+0 -0)
- data/letters_seq.mp4 (+0 -0)
- requirements.txt (+8 -0)
README.md
CHANGED

@@ -10,4 +10,5 @@ pinned: false
 short_description: A mini project of sign language conversation
 ---
 
-
+# Bi-Directional Sign Language Conversation
+This is a web app (roughly 50% complete) for real-time communication between a deaf or mute person and a person who does not know sign language.
app.py
ADDED

import torch
import torch.nn as nn
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import gradio as gr
import cv2
import mediapipe as mp
import numpy as np
import spaces  # ZeroGPU helper package (preinstalled on Spaces); provides the @spaces.GPU decorator

# Define the ASLClassifier model
class ASLClassifier(nn.Module):
    def __init__(self, input_size=63, hidden_size=256, num_classes=28):
        super(ASLClassifier, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.bn1 = nn.BatchNorm1d(hidden_size)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(0.3)
        self.fc2 = nn.Linear(hidden_size, hidden_size * 2)
        self.bn2 = nn.BatchNorm1d(hidden_size * 2)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(0.3)
        self.fc3 = nn.Linear(hidden_size * 2, hidden_size)
        self.bn3 = nn.BatchNorm1d(hidden_size)
        self.relu3 = nn.ReLU()
        self.dropout3 = nn.Dropout(0.3)
        self.fc4 = nn.Linear(hidden_size, hidden_size // 2)
        self.bn4 = nn.BatchNorm1d(hidden_size // 2)
        self.relu4 = nn.ReLU()
        self.dropout4 = nn.Dropout(0.3)
        self.fc5 = nn.Linear(hidden_size // 2, num_classes)

    def forward(self, x):
        x = self.fc1(x)
        x = self.bn1(x)
        x = self.relu1(x)
        x = self.dropout1(x)
        x = self.fc2(x)
        x = self.bn2(x)
        x = self.relu2(x)
        x = self.dropout2(x)
        x = self.fc3(x)
        x = self.bn3(x)
        x = self.relu3(x)
        x = self.dropout3(x)
        x = self.fc4(x)
        x = self.bn4(x)
        x = self.relu4(x)
        x = self.dropout4(x)
        x = self.fc5(x)
        return x

# Load the model and label encoder (CPU initially; GPU inference is handled by the @spaces.GPU decorator)
device = torch.device('cpu')
model = ASLClassifier().to(device)
model.load_state_dict(torch.load('data/asl_classifier.pth', map_location=device))
model.eval()

# Fit the label encoder on the training labels so class indices map back to letters
df = pd.read_csv('data/asl_landmarks_final.csv')
label_encoder = LabelEncoder()
label_encoder.fit(df['label'].values)

# Initialize MediaPipe hand tracking (runs on CPU)
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(static_image_mode=False, max_num_hands=1, min_detection_confidence=0.5)
mp_drawing = mp.solutions.drawing_utils

# Prediction function with GPU offloading
@spaces.GPU
def predict_letter(landmarks, model, label_encoder):
    with torch.no_grad():
        # Move input and model to the GPU attached by the decorator
        landmarks = torch.tensor(landmarks, dtype=torch.float32).unsqueeze(0).to('cuda')
        model = model.to('cuda')
        output = model(landmarks)
        _, predicted_idx = torch.max(output, 1)
        letter = label_encoder.inverse_transform([predicted_idx.item()])[0]
        # Move the model back to CPU to free GPU memory
        model = model.to('cpu')
        return letter

# Video processing function (CPU for frame handling, GPU for prediction)
def process_video(video_path):
    # Open video file
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        return None, "Error: Could not open video."

    # Variables to store output
    text_output = ""
    out_frames = []

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # Process frame with MediaPipe (CPU)
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = hands.process(frame_rgb)

        if results.multi_hand_landmarks:
            for hand_landmarks in results.multi_hand_landmarks:
                # Draw landmarks
                mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)

                # Extract the 21 (x, y, z) landmarks and predict (GPU via decorator)
                landmarks = []
                for lm in hand_landmarks.landmark:
                    landmarks.extend([lm.x, lm.y, lm.z])
                landmarks = np.array(landmarks, dtype=np.float32)
                predicted_letter = predict_letter(landmarks, model, label_encoder)

                # Append the letter to the text (skip consecutive duplicates)
                if not text_output or predicted_letter != text_output[-1]:
                    text_output += predicted_letter

                # Overlay the predicted letter on the frame
                cv2.putText(frame, f"Letter: {predicted_letter}", (10, 30),
                            cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2, cv2.LINE_AA)

        # Store processed frame
        out_frames.append(frame)

    cap.release()

    # Guard against videos from which no frames could be read
    if not out_frames:
        return None, "Error: No frames could be read from the video."

    # Write processed video to a temporary file (output FPS is fixed at 20)
    out_path = "processed_video.mp4"
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(out_path, fourcc, 20.0, (out_frames[0].shape[1], out_frames[0].shape[0]))
    for frame in out_frames:
        out.write(frame)
    out.release()

    return out_path, text_output

# Create Gradio interface with a sample input
with gr.Blocks(title="Sign Language Translation") as demo:
    gr.Markdown("## Sign Language Translation")
    video_input = gr.Video(label="Input Video", sources=["upload", "webcam"])
    video_output = gr.Video(label="Processed Video with Landmarks")
    text_output = gr.Textbox(label="Predicted Text", interactive=False)

    # Button to process the video
    btn = gr.Button("Translate")
    btn.click(
        fn=process_video,
        inputs=video_input,
        outputs=[video_output, text_output]
    )

    # Add the bundled sample input video
    gr.Examples(
        examples=[["data/letters_seq.mp4"]],
        inputs=[video_input],
        outputs=[video_output, text_output],
        fn=process_video,
        cache_examples=True  # Cache the output for faster loading
    )

# Launch the app
demo.launch()
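For a quick local check before pushing, a sketch like the one below can exercise process_video on the bundled sample clip. It is hypothetical and not part of this commit: it assumes demo.launch() in app.py is commented out or guarded so the module can be imported, that the spaces package is installed locally (outside a ZeroGPU Space its @spaces.GPU decorator should act as a plain pass-through), and that a CUDA GPU is available, since predict_letter moves the model and input to 'cuda'.

# Hypothetical local smoke test (not part of this commit).
# Assumes app.py can be imported without launching the UI, and that a CUDA
# GPU is present, because predict_letter sends tensors to 'cuda'.
from app import process_video

out_video, text = process_video("data/letters_seq.mp4")  # sample clip added in this commit
print("Annotated video written to:", out_video)
print("Predicted text:", text)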
data/asl_classifier.pth
ADDED (Git LFS pointer)

version https://git-lfs.github.com/spec/v1
oid sha256:7e80bfae266cf8a67e10b4918814dab2701e0e5637f7cf8a7798b82917b143f6
size 1291050
data/asl_landmarks_final.csv
ADDED
The diff for this file is too large to render.
data/letters_seq.mp4
ADDED
Binary file (584 kB).
requirements.txt
ADDED

torch
pandas
scikit-learn
gradio
opencv-python
mediapipe
numpy
huggingface_hub