Adieee5 committed on
Commit 3d8b532 · verified · Parent(s): 6a49af6

Upload 8 files
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+res10_300x300_ssd_iter_140000.caffemodel filter=lfs diff=lfs merge=lfs -text
+sample_videos/Sample.mp4 filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,672 @@
+import altair as alt
+import numpy as np
+import pandas as pd
+import streamlit as st
+
+import cv2
+import torch
+import os
+import tempfile
+import time
+from transformers import AutoImageProcessor, AutoModelForImageClassification
+from collections import deque
+import tensorflow as tf
+from tensorflow.keras.preprocessing import image
+from tensorflow.keras.models import load_model
+import urllib.request
+import shutil
+
+class CNNDeepfakeDetector:
+    def __init__(self):
+        st.info("Initializing CNN Deepfake Detector... This may take a moment.")
+
+        # Initialize CNN model for deepfake detection
+        with st.spinner("Loading CNN deepfake detection model..."):
+            try:
+                self.model = load_model('cnn_model.h5')
+                st.success("CNN model loaded successfully!")
+            except Exception as e:
+                st.error(f"Error loading CNN model: {e}")
+                st.warning("Please make sure 'cnn_model.h5' is in the current directory.")
+                self.model = None
+
+    def classify_image(self, face_img):
+        """Classify a face image as real or fake using the CNN model"""
+        try:
+            if self.model is None:
+                return "Model Not Loaded", 0.0
+
+            # Resize to the model's input size
+            img_resized = cv2.resize(face_img, (128, 128))
+
+            # Preprocess the image
+            img_array = img_resized / 255.0
+            img_array = np.expand_dims(img_array, axis=0)
+
+            # Make prediction
+            prediction = self.model.predict(img_array)
+            confidence = float(prediction[0][0])
+
+            # In this model, <0.5 means Real, >=0.5 means Fake
+            label = 'Real' if confidence < 0.5 else 'Fake'
+
+            # Report confidence relative to the predicted class; a 'Fake'
+            # score is already in 0.5-1.0
+            if label == 'Real':
+                confidence = 1.0 - confidence  # Convert 0.0-0.5 to 0.5-1.0
+
+            return label, confidence
+
+        except Exception as e:
+            st.error(f"Error in CNN classification: {e}")
+            return "Error", 0.0
+
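+# Worked example of the confidence remapping above: a raw sigmoid output of
+# 0.2 falls below the 0.5 threshold, so the label is 'Real' and the reported
+# confidence becomes 1.0 - 0.2 = 0.8; a raw output of 0.9 yields 'Fake' with
+# confidence 0.9. Either way the reported value lies in [0.5, 1.0] and
+# measures support for the predicted class.
+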
+class DeepfakeDetector:
+    def __init__(self):
+        st.info("Initializing Deepfake Detector... This may take a moment.")
+
+        # Initialize ViT model for deepfake detection
+        with st.spinner("Loading deepfake detection model..."):
+            self.image_processor = AutoImageProcessor.from_pretrained(
+                'Adieee5/deepfake-detection-f3net-cross')
+            self.model = AutoModelForImageClassification.from_pretrained(
+                'Adieee5/deepfake-detection-f3net-cross')
+
+        # Face detection model setup
+        with st.spinner("Loading face detection model..."):
+            model_file = "deploy.prototxt"
+            weights_file = "res10_300x300_ssd_iter_140000.caffemodel"
+
+            self.use_dnn = False
+            if os.path.exists(model_file) and os.path.exists(weights_file):
+                try:
+                    self.face_net = cv2.dnn.readNetFromCaffe(model_file, weights_file)
+                    self.use_dnn = True
+                    st.success("Using DNN face detector (better for close-up faces)")
+                except Exception as e:
+                    st.warning(f"Could not load DNN model: {e}")
+                    self.use_dnn = False
+
+            if not self.use_dnn:
+                # Fall back to the Haar cascade
+                cascade_path = cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'
+                if os.path.exists(cascade_path):
+                    self.face_cascade = cv2.CascadeClassifier(cascade_path)
+                    st.warning("Using Haar cascade face detector as fallback")
+                else:
+                    st.error(f"Cascade file not found: {cascade_path}")
+
+        # Initialize CNN model
+        self.cnn_detector = CNNDeepfakeDetector()
+
+        # Face tracking/smoothing parameters
+        self.face_history = {}  # Store face tracking data
+        self.face_history_max_size = 10  # Store history for the last 10 frames
+        self.face_ttl = 5  # Number of frames a face can be missing before removal
+        self.next_face_id = 0  # For assigning unique IDs to tracked faces
+
+        # Result smoothing
+        self.result_buffer_size = 5  # Number of classifications to average
+
+        # Performance metrics
+        self.processing_times = deque(maxlen=30)
+
+        st.success("Models loaded successfully!")
+
+    def detect_faces_haar(self, frame):
+        """Detect faces using the Haar cascade"""
+        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
+        faces = self.face_cascade.detectMultiScale(
+            gray, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30))
+
+        # Convert to (x, y, w, h, confidence) tuples for consistency
+        return [(x, y, w, h, 0.8) for (x, y, w, h) in faces]
+
+    def classify_frame(self, face_img, model_type="vit"):
+        """Classify a face image as real or fake"""
+        try:
+            if model_type == "cnn":
+                return self.cnn_detector.classify_image(face_img)
+
+            # Default to the ViT model
+            # Resize the image if it is too small
+            h, w = face_img.shape[:2]
+            if h < 224 or w < 224:
+                scale = max(224/h, 224/w)
+                face_img = cv2.resize(face_img, (int(w*scale), int(h*scale)))
+
+            # Make sure we have valid image data
+            if face_img.size == 0:
+                return "Unknown", 0.0
+
+            # Process with the ViT model
+            inputs = self.image_processor(images=face_img, return_tensors="pt")
+            outputs = self.model(**inputs)
+            logits = outputs.logits
+
+            # Get prediction and confidence
+            probs = torch.nn.functional.softmax(logits, dim=1)
+            pred = torch.argmax(logits, dim=1).item()
+
+            # The model has two classes: 0=Fake, 1=Real
+            label = 'Real' if pred == 1 else 'Fake'
+            confidence = probs[0][pred].item()
+
+            return label, confidence
+
+        except Exception as e:
+            st.error(f"Error in classification: {e}")
+            return "Error", 0.0
+
+    def detect_faces_dnn(self, frame):
+        """Detect faces using the DNN method"""
+        height, width = frame.shape[:2]
+        blob = cv2.dnn.blobFromImage(cv2.resize(frame, (300, 300)), 1.0,
+                                     (300, 300), (104.0, 177.0, 123.0))
+        self.face_net.setInput(blob)
+        detections = self.face_net.forward()
+
+        faces = []
+        for i in range(detections.shape[2]):
+            confidence = detections[0, 0, i, 2]
+            if confidence > 0.5:  # Filter out weak detections
+                box = detections[0, 0, i, 3:7] * np.array([width, height, width, height])
+                (x1, y1, x2, y2) = box.astype("int")
+                # Ensure the box is within frame boundaries
+                x1, y1 = max(0, x1), max(0, y1)
+                x2, y2 = min(width, x2), min(height, y2)
+                w, h = x2 - x1, y2 - y1
+                if w > 0 and h > 0:  # Valid face area
+                    faces.append((x1, y1, w, h, confidence))
+
+        return faces
+
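+# Worked example of the detection decoding above: for a 640x480 frame, a
+# detection row of [0.25, 0.20, 0.75, 0.80] scales to the pixel box
+# (x1, y1, x2, y2) = (160, 96, 480, 384), giving w = 320 and h = 288.
+# Rows with confidence <= 0.5 are discarded before this conversion.
+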
+    def calculate_iou(self, box1, box2):
+        """Calculate Intersection over Union for two boxes"""
+        # Convert boxes from (x, y, w, h) to (x1, y1, x2, y2)
+        box1_x1, box1_y1, box1_w, box1_h = box1
+        box2_x1, box2_y1, box2_w, box2_h = box2
+
+        box1_x2, box1_y2 = box1_x1 + box1_w, box1_y1 + box1_h
+        box2_x2, box2_y2 = box2_x1 + box2_w, box2_y1 + box2_h
+
+        # Calculate the area of the intersection rectangle
+        x_left = max(box1_x1, box2_x1)
+        y_top = max(box1_y1, box2_y1)
+        x_right = min(box1_x2, box2_x2)
+        y_bottom = min(box1_y2, box2_y2)
+
+        if x_right < x_left or y_bottom < y_top:
+            return 0.0
+
+        intersection_area = (x_right - x_left) * (y_bottom - y_top)
+
+        # Calculate the area of both boxes
+        box1_area = box1_w * box1_h
+        box2_area = box2_w * box2_h
+
+        # Calculate IoU
+        iou = intersection_area / float(box1_area + box2_area - intersection_area)
+        return iou
+
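+# Worked IoU example: boxes (0, 0, 100, 100) and (50, 0, 100, 100) overlap
+# in a 50x100 region, so IoU = 5000 / (10000 + 10000 - 5000) = 1/3. The
+# tracker below treats IoU above 0.3 as the same face across frames.
+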
+    def track_faces(self, faces):
+        """Match current detections to tracked faces by IoU, keep missed
+        faces alive for a few frames, and assign IDs to new detections."""
+        matched_faces = []
+        unmatched_detections = list(range(len(faces)))
+
+        if not self.face_history:
+            for face in faces:
+                face_id = self.next_face_id
+                self.next_face_id += 1
+                self.face_history[face_id] = {
+                    'positions': deque([face[:4]], maxlen=self.face_history_max_size),
+                    'ttl': self.face_ttl,
+                    'label': None,
+                    'confidence': 0.0,
+                    'result_history': deque(maxlen=self.result_buffer_size)
+                }
+                matched_faces.append((face_id, face))
+            return matched_faces
+
+        for face_id in list(self.face_history.keys()):
+            last_pos = self.face_history[face_id]['positions'][-1]
+            best_match = -1
+            best_iou = 0.3  # Minimum overlap required for a match
+            for i in unmatched_detections:
+                iou = self.calculate_iou(last_pos, faces[i][:4])
+                if iou > best_iou:
+                    best_iou = iou
+                    best_match = i
+            if best_match != -1:
+                matched_face = faces[best_match]
+                self.face_history[face_id]['positions'].append(matched_face[:4])
+                self.face_history[face_id]['ttl'] = self.face_ttl
+                matched_faces.append((face_id, matched_face))
+                unmatched_detections.remove(best_match)
+            else:
+                self.face_history[face_id]['ttl'] -= 1
+                if self.face_history[face_id]['ttl'] <= 0:
+                    del self.face_history[face_id]
+                else:
+                    predicted_face = (*last_pos, 0.5)
+                    matched_faces.append((face_id, predicted_face))
+
+        for i in unmatched_detections:
+            face_id = self.next_face_id
+            self.next_face_id += 1
+            self.face_history[face_id] = {
+                'positions': deque([faces[i][:4]], maxlen=self.face_history_max_size),
+                'ttl': self.face_ttl,
+                'label': None,
+                'confidence': 0.0,
+                'result_history': deque(maxlen=self.result_buffer_size)
+            }
+            matched_faces.append((face_id, faces[i]))
+
+        return matched_faces
+
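+# Note on the matching strategy above: tracking is greedy - each stored face
+# claims the highest-IoU unmatched detection; a missed face survives for
+# face_ttl frames at its last recorded position (with a placeholder
+# confidence of 0.5), and leftover detections become new track IDs.
+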
+    def smooth_face_position(self, face_id):
+        """Calculate a smoothed position for a tracked face"""
+        positions = self.face_history[face_id]['positions']
+
+        if len(positions) == 1:
+            return positions[0]
+
+        # Weight recent positions more heavily
+        total_weight = 0
+        x, y, w, h = 0, 0, 0, 0
+
+        for i, pos in enumerate(positions):
+            # Exponential weighting - positions are stored oldest to newest,
+            # so newer positions get exponentially more influence
+            weight = 2 ** i
+            total_weight += weight
+
+            x += pos[0] * weight
+            y += pos[1] * weight
+            w += pos[2] * weight
+            h += pos[3] * weight
+
+        # Calculate the weighted average
+        x = int(x / total_weight)
+        y = int(y / total_weight)
+        w = int(w / total_weight)
+        h = int(h / total_weight)
+
+        return (x, y, w, h)
+
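+# Worked example of the exponential weighting above: with three stored
+# x-positions [100, 110, 130] (oldest first), the weights are 2**0, 2**1,
+# 2**2, so x = (100*1 + 110*2 + 130*4) / 7 = 840 / 7 = 120 - much closer
+# to the newest measurement than a plain average (113) would be.
+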
+    def update_face_classification(self, face_id, label, confidence):
+        """Update the classification history for a face"""
+        self.face_history[face_id]['result_history'].append((label, confidence))
+
+        # Calculate the smoothed result
+        if not self.face_history[face_id]['result_history']:
+            return label, confidence
+
+        real_votes = 0
+        fake_votes = 0
+        total_confidence = 0.0
+
+        for result_label, result_conf in self.face_history[face_id]['result_history']:
+            if result_label == "Real":
+                real_votes += 1
+                total_confidence += result_conf
+            elif result_label == "Fake":
+                fake_votes += 1
+                total_confidence += result_conf
+
+        # Determine the majority vote
+        if real_votes >= fake_votes:
+            smoothed_label = "Real"
+            label_confidence = real_votes / len(self.face_history[face_id]['result_history'])
+        else:
+            smoothed_label = "Fake"
+            label_confidence = fake_votes / len(self.face_history[face_id]['result_history'])
+
+        # Average confidence weighted by vote consistency
+        avg_confidence = (total_confidence / len(self.face_history[face_id]['result_history'])) * label_confidence
+
+        # Store the smoothed result
+        self.face_history[face_id]['label'] = smoothed_label
+        self.face_history[face_id]['confidence'] = avg_confidence
+
+        return smoothed_label, avg_confidence
+
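+# Worked example of the vote smoothing above: a buffer of
+# [("Fake", 0.9), ("Fake", 0.8), ("Real", 0.7)] gives 2 Fake votes out of 3,
+# so the label is "Fake" with label_confidence = 2/3, and the reported
+# confidence is ((0.9 + 0.8 + 0.7) / 3) * (2/3) = 0.8 * 2/3 ~= 0.53.
+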
+    def process_video(self, video_path, stframe, status_text, progress_bar, detector_type="dnn", model_type="vit"):
+        """Process a video with Streamlit output"""
+        use_dnn_current = detector_type == "dnn" and self.use_dnn
+
+        cap = cv2.VideoCapture(video_path)
+        if not cap.isOpened():
+            st.error("Error: Cannot open video source")
+            return
+
+        # Get video properties
+        frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+        frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+        fps = cap.get(cv2.CAP_PROP_FPS)
+        total_frames = 250 if video_path != 0 else 0  # Processing is capped at 250 frames
+
+        # Display video info
+        if video_path != 0:  # If not webcam
+            status_text.text(f"Video Info: {frame_width}x{frame_height}, {fps:.1f} FPS, {total_frames} frames")
+        else:
+            status_text.text(f"Webcam: {frame_width}x{frame_height}")
+
+        # Reset tracking data for the new video
+        self.face_history = {}
+        self.next_face_id = 0
+        self.processing_times = deque(maxlen=30)
+
+        frame_count = 0
+        process_every_n_frames = 2  # Process every 2nd frame for better performance
+
+        # For face detection stats
+        face_stats = {"Real": 0, "Fake": 0, "Unknown": 0}
+
+        # Main processing loop
+        while True:
+            start_time = time.time()
+
+            ret, frame = cap.read()
+            if not ret:
+                status_text.text("End of video reached")
+                break
+
+            frame_count += 1
+
+            if frame_count == 250:
+                st.success("Video Processed Successfully!")
+                break
+
+            if video_path != 0:  # If not webcam, update progress
+                progress = min(float(frame_count) / float(max(total_frames, 1)), 1.0)
+                progress_bar.progress(progress)
+
+            process_frame = (frame_count % process_every_n_frames == 0)
+
+            # Keep an RGB copy of the frame for face extraction
+            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+
+            if process_frame:
+                # Detect faces using the appropriate method
+                if use_dnn_current:
+                    faces = self.detect_faces_dnn(frame)
+                else:
+                    faces = self.detect_faces_haar(frame)
+
+                # Track faces across frames
+                tracked_faces = self.track_faces(faces)
+
+                # Process each tracked face
+                for face_id, (x, y, w, h, face_confidence) in tracked_faces:
+                    if face_id not in self.face_history:
+                        continue
+
+                    sx, sy, sw, sh = self.smooth_face_position(face_id)
+                    # Draw a rectangle around the face using smoothed coordinates
+                    cv2.rectangle(frame, (sx, sy), (sx+sw, sy+sh), (0, 255, 255), 2)
+
+                    # Only classify real detections (not predicted positions)
+                    if w > 20 and h > 20 and face_id in self.face_history:
+                        try:
+                            # Extract the face using smoothed coordinates for consistency
+                            face = frame_rgb[sy:sy+sh, sx:sx+sw]
+
+                            # Skip processing if the face is too small after smoothing
+                            if face.size == 0 or face.shape[0] < 20 or face.shape[1] < 20:
+                                continue
+
+                            # Classify only every N frames, or if this face is new
+                            if frame_count % process_every_n_frames == 0 or \
+                                    len(self.face_history[face_id]['result_history']) == 0:
+                                # Classify the face using the selected model
+                                label, confidence = self.classify_frame(face, model_type)
+
+                                # Update and smooth the results
+                                label, confidence = self.update_face_classification(face_id, label, confidence)
+                            else:
+                                # Use the last stored result
+                                label = self.face_history[face_id]['label'] or "Unknown"
+                                confidence = self.face_history[face_id]['confidence']
+
+                            # Update stats
+                            if label in face_stats:
+                                face_stats[label] += 1
+
+                            # Display results
+                            result_text = f"{label}: {confidence:.2f}"
+                            text_color = (0, 255, 0) if label == "Real" else (0, 0, 255)
+
+                            # Add a text background for better visibility
+                            cv2.rectangle(frame, (sx, sy+sh), (sx+len(result_text)*11, sy+sh+25), (0, 0, 0), -1)
+                            cv2.putText(frame, result_text, (sx, sy+sh+20),
+                                        cv2.FONT_HERSHEY_SIMPLEX, 0.7, text_color, 2)
+
+                            # Draw the face ID
+                            cv2.putText(frame, f"ID:{face_id}", (sx, sy-5),
+                                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 0), 1)
+                        except Exception as e:
+                            st.error(f"Error processing face: {e}")
+
+            # Measure processing time
+            process_time = time.time() - start_time
+            self.processing_times.append(process_time)
+            avg_time = sum(self.processing_times) / len(self.processing_times)
+            effective_fps = 1.0 / avg_time if avg_time > 0 else 0
+
+            # Add a frame counter and progress overlay
+            if video_path != 0:  # If not webcam
+                progress_percent = (frame_count / total_frames) * 100 if total_frames > 0 else 0
+                cv2.putText(frame, f"Frame: {frame_count}/{total_frames} ({progress_percent:.1f}%)",
+                            (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 2)
+            else:
+                cv2.putText(frame, f"Frame: {frame_count}",
+                            (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 2)
+
+            # Show detector info and performance
+            detector_name = "DNN" if use_dnn_current else "Haar Cascade"
+            model_name = "ViT" if model_type == "vit" else "CNN"
+            cv2.putText(frame, f"Detector: {detector_name} | Model: {model_name} | FPS: {effective_fps:.1f}",
+                        (10, 60), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 2)
+
+            # Show tracking info
+            cv2.putText(frame, f"Tracked faces: {len(self.face_history)}",
+                        (10, 90), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 2)
+
+            # Display the frame in Streamlit
+            stframe.image(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB), channels="RGB")
+
+            # Update stats
+            status_text.text(f"Real: {face_stats['Real']} | Fake: {face_stats['Fake']} | FPS: {effective_fps:.1f}")
+
+            # Check if the stop button was pressed
+            if st.session_state.get('stop_button', False):
+                break
+
+        # Clean up
+        cap.release()
+        return face_stats
+
+# Function to ensure the sample video exists
+def ensure_sample_video():
+    sample_dir = "sample_videos"
+    sample_path = os.path.join(sample_dir, "Sample.mp4")
+
+    # Create the directory if it doesn't exist
+    if not os.path.exists(sample_dir):
+        os.makedirs(sample_dir)
+
+    # If the sample video doesn't exist, download it
+    if not os.path.exists(sample_path):
+        try:
+            with st.spinner("Downloading sample video..."):
+                # URL to a public domain sample video that contains faces
+                sample_url = "https://storage.googleapis.com/deepfake-demo/sample_deepfake.mp4"
+
+                # Download the file
+                with urllib.request.urlopen(sample_url) as response, open(sample_path, 'wb') as out_file:
+                    shutil.copyfileobj(response, out_file)
+
+            st.success("Sample video downloaded successfully!")
+        except Exception as e:
+            st.error(f"Failed to download sample video: {e}")
+            return None
+
+    return sample_path
+
+def main():
+    st.set_page_config(page_title="Deepfake Detector", layout="wide")
+
+    # App title and description
+    st.title("Deepfake Detection App")
+    st.markdown("""
+    This app uses computer vision and deep learning to detect deepfake videos.
+    Upload a video or use your webcam to detect if faces are real or manipulated.
+    """)
+
+    # Initialize session state for the detector and variables
+    if 'detector' not in st.session_state:
+        st.session_state.detector = None
+
+    if 'stop_button' not in st.session_state:
+        st.session_state.stop_button = False
+
+    if 'use_sample' not in st.session_state:
+        st.session_state.use_sample = False
+
+    if 'sample_path' not in st.session_state:
+        st.session_state.sample_path = None
+
+    # Initialize the detector
+    if st.session_state.detector is None:
+        st.session_state.detector = DeepfakeDetector()
+
+    # Create the sidebar for options
+    st.sidebar.title("Options")
+
+    input_option = st.sidebar.radio(
+        "Select Input Source",
+        ["Upload Video", "Use Webcam", "Try Sample Video"]
+    )
+
+    detector_type = st.sidebar.selectbox(
+        "Face Detector",
+        ["DNN (better for close-ups)", "Haar Cascade (faster)"],
+        index=0 if st.session_state.detector.use_dnn else 1
+    )
+    detector_option = "dnn" if "DNN" in detector_type else "haar"
+
+    # Model selection option
+    model_type = st.sidebar.selectbox(
+        "Deepfake Detection Model",
+        ["Vision Transformer (ViT)", "F3 Net Model"],
+        index=0
+    )
+    model_option = "vit" if "Vision" in model_type else "cnn"
+
+    # Main content area
+    col1, col2 = st.columns([3, 1])
+
+    with col1:
+        # Video display area
+        video_placeholder = st.empty()
+
+    with col2:
+        # Status and controls
+        status_text = st.empty()
+        progress_bar = st.empty()
+
+        # Results section
+        st.subheader("Results")
+        results_area = st.empty()
+
+        # Stop button
+        if st.button("Stop Processing"):
+            st.session_state.stop_button = True
+
+    # Process based on the selected option
+    if input_option == "Upload Video":
+        uploaded_file = st.sidebar.file_uploader("Choose a video file", type=["mp4", "avi", "mov", "mkv"])
+
+        if uploaded_file is not None:
+            st.session_state.stop_button = False
+
+            # Save the uploaded file to a temp file
+            tfile = tempfile.NamedTemporaryFile(delete=False)
+            tfile.write(uploaded_file.read())
+            video_path = tfile.name
+
+            # Process the video
+            face_stats = st.session_state.detector.process_video(
+                video_path,
+                video_placeholder,
+                status_text,
+                progress_bar,
+                detector_option,
+                model_option
+            )
+
+            # Display results
+            results_df = {
+                "Category": ["Real Faces", "Fake Faces"],
+                "Count": [face_stats["Real"], face_stats["Fake"]]
+            }
+            results_area.dataframe(results_df)
+
+            # Clean up the temp file
+            os.unlink(video_path)
+
+    elif input_option == "Use Webcam":
+        # Reset the stop button
+        st.session_state.stop_button = False
+
+        if st.sidebar.button("Start Webcam"):
+            # Process the webcam feed
+            face_stats = st.session_state.detector.process_video(
+                0,  # 0 is the default camera
+                video_placeholder,
+                status_text,
+                progress_bar,
+                detector_option,
+                model_option
+            )
+
+            # Display results after stopping
+            results_df = {
+                "Category": ["Real Faces", "Fake Faces"],
+                "Count": [face_stats["Real"], face_stats["Fake"]]
+            }
+            results_area.dataframe(results_df)
+
+    elif input_option == "Try Sample Video":
+        # Reset the stop button
+        st.session_state.stop_button = False
+
+        # Get or download the sample video
+        sample_path = ensure_sample_video()
+
+        if sample_path:
+            if st.sidebar.button("Process Sample Video"):
+                # Process the sample video
+                face_stats = st.session_state.detector.process_video(
+                    sample_path,
+                    video_placeholder,
+                    status_text,
+                    progress_bar,
+                    detector_option,
+                    model_option
+                )
+
+                # Display results
+                results_df = {
+                    "Category": ["Real Faces", "Fake Faces"],
+                    "Count": [face_stats["Real"], face_stats["Fake"]]
+                }
+                results_area.dataframe(results_df)
+        else:
+            st.sidebar.error("Failed to load sample video. Please try uploading your own video instead.")
+
+if __name__ == "__main__":
+    main()
cnn_model.h5 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2f54d9db020da33f99f861d41dc1334ec33adc14991ada4033a4ece790d0904e
+size 312843624
deploy.prototxt ADDED
@@ -0,0 +1,1790 @@
+input: "data"
+input_shape {
+  dim: 1
+  dim: 3
+  dim: 300
+  dim: 300
+}
+
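+# The 1x3x300x300 input above matches the cv2.dnn.blobFromImage call in
+# app.py, which resizes each frame to 300x300 and subtracts the per-channel
+# BGR means (104, 177, 123) before the forward pass.
+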
+layer {
+  name: "data_bn"
+  type: "BatchNorm"
+  bottom: "data"
+  top: "data_bn"
+  param {
+    lr_mult: 0.0
+  }
+  param {
+    lr_mult: 0.0
+  }
+  param {
+    lr_mult: 0.0
+  }
+}
+layer {
+  name: "data_scale"
+  type: "Scale"
+  bottom: "data_bn"
+  top: "data_bn"
+  param {
+    lr_mult: 1.0
+    decay_mult: 1.0
+  }
+  param {
+    lr_mult: 2.0
+    decay_mult: 1.0
+  }
+  scale_param {
+    bias_term: true
+  }
+}
+layer {
+  name: "conv1_h"
+  type: "Convolution"
+  bottom: "data_bn"
+  top: "conv1_h"
+  param {
+    lr_mult: 1.0
+    decay_mult: 1.0
+  }
+  param {
+    lr_mult: 2.0
+    decay_mult: 1.0
+  }
+  convolution_param {
+    num_output: 32
+    pad: 3
+    kernel_size: 7
+    stride: 2
+    weight_filler {
+      type: "msra"
+      variance_norm: FAN_OUT
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.0
+    }
+  }
+}
+layer {
+  name: "conv1_bn_h"
+  type: "BatchNorm"
+  bottom: "conv1_h"
+  top: "conv1_h"
+  param {
+    lr_mult: 0.0
+  }
+  param {
+    lr_mult: 0.0
+  }
+  param {
+    lr_mult: 0.0
+  }
+}
+layer {
+  name: "conv1_scale_h"
+  type: "Scale"
+  bottom: "conv1_h"
+  top: "conv1_h"
+  param {
+    lr_mult: 1.0
+    decay_mult: 1.0
+  }
+  param {
+    lr_mult: 2.0
+    decay_mult: 1.0
+  }
+  scale_param {
+    bias_term: true
+  }
+}
+layer {
+  name: "conv1_relu"
+  type: "ReLU"
+  bottom: "conv1_h"
+  top: "conv1_h"
+}
+layer {
+  name: "conv1_pool"
+  type: "Pooling"
+  bottom: "conv1_h"
+  top: "conv1_pool"
+  pooling_param {
+    kernel_size: 3
+    stride: 2
+  }
+}
+layer {
+  name: "layer_64_1_conv1_h"
+  type: "Convolution"
+  bottom: "conv1_pool"
+  top: "layer_64_1_conv1_h"
+  param {
+    lr_mult: 1.0
+    decay_mult: 1.0
+  }
+  convolution_param {
+    num_output: 32
+    bias_term: false
+    pad: 1
+    kernel_size: 3
+    stride: 1
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.0
+    }
+  }
+}
+layer {
+  name: "layer_64_1_bn2_h"
+  type: "BatchNorm"
+  bottom: "layer_64_1_conv1_h"
+  top: "layer_64_1_conv1_h"
+  param {
+    lr_mult: 0.0
+  }
+  param {
+    lr_mult: 0.0
+  }
+  param {
+    lr_mult: 0.0
+  }
+}
+layer {
+  name: "layer_64_1_scale2_h"
+  type: "Scale"
+  bottom: "layer_64_1_conv1_h"
+  top: "layer_64_1_conv1_h"
+  param {
+    lr_mult: 1.0
+    decay_mult: 1.0
+  }
+  param {
+    lr_mult: 2.0
+    decay_mult: 1.0
+  }
+  scale_param {
+    bias_term: true
+  }
+}
+layer {
+  name: "layer_64_1_relu2"
+  type: "ReLU"
+  bottom: "layer_64_1_conv1_h"
+  top: "layer_64_1_conv1_h"
+}
+layer {
+  name: "layer_64_1_conv2_h"
+  type: "Convolution"
+  bottom: "layer_64_1_conv1_h"
+  top: "layer_64_1_conv2_h"
+  param {
+    lr_mult: 1.0
+    decay_mult: 1.0
+  }
+  convolution_param {
+    num_output: 32
+    bias_term: false
+    pad: 1
+    kernel_size: 3
+    stride: 1
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.0
+    }
+  }
+}
+layer {
+  name: "layer_64_1_sum"
+  type: "Eltwise"
+  bottom: "layer_64_1_conv2_h"
+  bottom: "conv1_pool"
+  top: "layer_64_1_sum"
+}
+layer {
+  name: "layer_128_1_bn1_h"
+  type: "BatchNorm"
+  bottom: "layer_64_1_sum"
+  top: "layer_128_1_bn1_h"
+  param {
+    lr_mult: 0.0
+  }
+  param {
+    lr_mult: 0.0
+  }
+  param {
+    lr_mult: 0.0
+  }
+}
+layer {
+  name: "layer_128_1_scale1_h"
+  type: "Scale"
+  bottom: "layer_128_1_bn1_h"
+  top: "layer_128_1_bn1_h"
+  param {
+    lr_mult: 1.0
+    decay_mult: 1.0
+  }
+  param {
+    lr_mult: 2.0
+    decay_mult: 1.0
+  }
+  scale_param {
+    bias_term: true
+  }
+}
+layer {
+  name: "layer_128_1_relu1"
+  type: "ReLU"
+  bottom: "layer_128_1_bn1_h"
+  top: "layer_128_1_bn1_h"
+}
+layer {
+  name: "layer_128_1_conv1_h"
+  type: "Convolution"
+  bottom: "layer_128_1_bn1_h"
+  top: "layer_128_1_conv1_h"
+  param {
+    lr_mult: 1.0
+    decay_mult: 1.0
+  }
+  convolution_param {
+    num_output: 128
+    bias_term: false
+    pad: 1
+    kernel_size: 3
+    stride: 2
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.0
+    }
+  }
+}
+layer {
+  name: "layer_128_1_bn2"
+  type: "BatchNorm"
+  bottom: "layer_128_1_conv1_h"
+  top: "layer_128_1_conv1_h"
+  param {
+    lr_mult: 0.0
+  }
+  param {
+    lr_mult: 0.0
+  }
+  param {
+    lr_mult: 0.0
+  }
+}
+layer {
+  name: "layer_128_1_scale2"
+  type: "Scale"
+  bottom: "layer_128_1_conv1_h"
+  top: "layer_128_1_conv1_h"
+  param {
+    lr_mult: 1.0
+    decay_mult: 1.0
+  }
+  param {
+    lr_mult: 2.0
+    decay_mult: 1.0
+  }
+  scale_param {
+    bias_term: true
+  }
+}
+layer {
+  name: "layer_128_1_relu2"
+  type: "ReLU"
+  bottom: "layer_128_1_conv1_h"
+  top: "layer_128_1_conv1_h"
+}
+layer {
+  name: "layer_128_1_conv2"
+  type: "Convolution"
+  bottom: "layer_128_1_conv1_h"
+  top: "layer_128_1_conv2"
+  param {
+    lr_mult: 1.0
+    decay_mult: 1.0
+  }
+  convolution_param {
+    num_output: 128
+    bias_term: false
+    pad: 1
+    kernel_size: 3
+    stride: 1
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.0
+    }
+  }
+}
+layer {
+  name: "layer_128_1_conv_expand_h"
+  type: "Convolution"
+  bottom: "layer_128_1_bn1_h"
+  top: "layer_128_1_conv_expand_h"
+  param {
+    lr_mult: 1.0
+    decay_mult: 1.0
+  }
+  convolution_param {
+    num_output: 128
+    bias_term: false
+    pad: 0
+    kernel_size: 1
+    stride: 2
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.0
+    }
+  }
+}
+layer {
+  name: "layer_128_1_sum"
+  type: "Eltwise"
+  bottom: "layer_128_1_conv2"
+  bottom: "layer_128_1_conv_expand_h"
+  top: "layer_128_1_sum"
+}
+layer {
+  name: "layer_256_1_bn1"
+  type: "BatchNorm"
+  bottom: "layer_128_1_sum"
+  top: "layer_256_1_bn1"
+  param {
+    lr_mult: 0.0
+  }
+  param {
+    lr_mult: 0.0
+  }
+  param {
+    lr_mult: 0.0
+  }
+}
+layer {
+  name: "layer_256_1_scale1"
+  type: "Scale"
+  bottom: "layer_256_1_bn1"
+  top: "layer_256_1_bn1"
+  param {
+    lr_mult: 1.0
+    decay_mult: 1.0
+  }
+  param {
+    lr_mult: 2.0
+    decay_mult: 1.0
+  }
+  scale_param {
+    bias_term: true
+  }
+}
+layer {
+  name: "layer_256_1_relu1"
+  type: "ReLU"
+  bottom: "layer_256_1_bn1"
+  top: "layer_256_1_bn1"
+}
+layer {
+  name: "layer_256_1_conv1"
+  type: "Convolution"
+  bottom: "layer_256_1_bn1"
+  top: "layer_256_1_conv1"
+  param {
+    lr_mult: 1.0
+    decay_mult: 1.0
+  }
+  convolution_param {
+    num_output: 256
+    bias_term: false
+    pad: 1
+    kernel_size: 3
+    stride: 2
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.0
+    }
+  }
+}
+layer {
+  name: "layer_256_1_bn2"
+  type: "BatchNorm"
+  bottom: "layer_256_1_conv1"
+  top: "layer_256_1_conv1"
+  param {
+    lr_mult: 0.0
+  }
+  param {
+    lr_mult: 0.0
+  }
+  param {
+    lr_mult: 0.0
+  }
+}
+layer {
+  name: "layer_256_1_scale2"
+  type: "Scale"
+  bottom: "layer_256_1_conv1"
+  top: "layer_256_1_conv1"
+  param {
+    lr_mult: 1.0
+    decay_mult: 1.0
+  }
+  param {
+    lr_mult: 2.0
+    decay_mult: 1.0
+  }
+  scale_param {
+    bias_term: true
+  }
+}
+layer {
+  name: "layer_256_1_relu2"
+  type: "ReLU"
+  bottom: "layer_256_1_conv1"
+  top: "layer_256_1_conv1"
+}
+layer {
+  name: "layer_256_1_conv2"
+  type: "Convolution"
+  bottom: "layer_256_1_conv1"
+  top: "layer_256_1_conv2"
+  param {
+    lr_mult: 1.0
+    decay_mult: 1.0
+  }
+  convolution_param {
+    num_output: 256
+    bias_term: false
+    pad: 1
+    kernel_size: 3
+    stride: 1
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.0
+    }
+  }
+}
+layer {
+  name: "layer_256_1_conv_expand"
+  type: "Convolution"
+  bottom: "layer_256_1_bn1"
+  top: "layer_256_1_conv_expand"
+  param {
+    lr_mult: 1.0
+    decay_mult: 1.0
+  }
+  convolution_param {
+    num_output: 256
+    bias_term: false
+    pad: 0
+    kernel_size: 1
+    stride: 2
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.0
+    }
+  }
+}
+layer {
+  name: "layer_256_1_sum"
+  type: "Eltwise"
+  bottom: "layer_256_1_conv2"
+  bottom: "layer_256_1_conv_expand"
+  top: "layer_256_1_sum"
+}
+layer {
+  name: "layer_512_1_bn1"
+  type: "BatchNorm"
+  bottom: "layer_256_1_sum"
+  top: "layer_512_1_bn1"
+  param {
+    lr_mult: 0.0
+  }
+  param {
+    lr_mult: 0.0
+  }
+  param {
+    lr_mult: 0.0
+  }
+}
+layer {
+  name: "layer_512_1_scale1"
+  type: "Scale"
+  bottom: "layer_512_1_bn1"
+  top: "layer_512_1_bn1"
+  param {
+    lr_mult: 1.0
+    decay_mult: 1.0
+  }
+  param {
+    lr_mult: 2.0
+    decay_mult: 1.0
+  }
+  scale_param {
+    bias_term: true
+  }
+}
+layer {
+  name: "layer_512_1_relu1"
+  type: "ReLU"
+  bottom: "layer_512_1_bn1"
+  top: "layer_512_1_bn1"
+}
+layer {
+  name: "layer_512_1_conv1_h"
+  type: "Convolution"
+  bottom: "layer_512_1_bn1"
+  top: "layer_512_1_conv1_h"
+  param {
+    lr_mult: 1.0
+    decay_mult: 1.0
+  }
+  convolution_param {
+    num_output: 128
+    bias_term: false
+    pad: 1
+    kernel_size: 3
+    stride: 1  # 2
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.0
+    }
+  }
+}
+layer {
+  name: "layer_512_1_bn2_h"
+  type: "BatchNorm"
+  bottom: "layer_512_1_conv1_h"
+  top: "layer_512_1_conv1_h"
+  param {
+    lr_mult: 0.0
+  }
+  param {
+    lr_mult: 0.0
+  }
+  param {
+    lr_mult: 0.0
+  }
+}
+layer {
+  name: "layer_512_1_scale2_h"
+  type: "Scale"
+  bottom: "layer_512_1_conv1_h"
+  top: "layer_512_1_conv1_h"
+  param {
+    lr_mult: 1.0
+    decay_mult: 1.0
+  }
+  param {
+    lr_mult: 2.0
+    decay_mult: 1.0
+  }
+  scale_param {
+    bias_term: true
+  }
+}
+layer {
+  name: "layer_512_1_relu2"
+  type: "ReLU"
+  bottom: "layer_512_1_conv1_h"
+  top: "layer_512_1_conv1_h"
+}
+layer {
+  name: "layer_512_1_conv2_h"
+  type: "Convolution"
+  bottom: "layer_512_1_conv1_h"
+  top: "layer_512_1_conv2_h"
+  param {
+    lr_mult: 1.0
+    decay_mult: 1.0
+  }
+  convolution_param {
+    num_output: 256
+    bias_term: false
+    pad: 2  # 1
+    kernel_size: 3
+    stride: 1
+    dilation: 2
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.0
+    }
+  }
+}
+layer {
+  name: "layer_512_1_conv_expand_h"
+  type: "Convolution"
+  bottom: "layer_512_1_bn1"
+  top: "layer_512_1_conv_expand_h"
+  param {
+    lr_mult: 1.0
+    decay_mult: 1.0
+  }
+  convolution_param {
+    num_output: 256
+    bias_term: false
+    pad: 0
+    kernel_size: 1
+    stride: 1  # 2
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.0
+    }
+  }
+}
+layer {
+  name: "layer_512_1_sum"
+  type: "Eltwise"
+  bottom: "layer_512_1_conv2_h"
+  bottom: "layer_512_1_conv_expand_h"
+  top: "layer_512_1_sum"
+}
+layer {
+  name: "last_bn_h"
+  type: "BatchNorm"
+  bottom: "layer_512_1_sum"
+  top: "layer_512_1_sum"
+  param {
+    lr_mult: 0.0
+  }
+  param {
+    lr_mult: 0.0
+  }
+  param {
+    lr_mult: 0.0
+  }
+}
+layer {
+  name: "last_scale_h"
+  type: "Scale"
+  bottom: "layer_512_1_sum"
+  top: "layer_512_1_sum"
+  param {
+    lr_mult: 1.0
+    decay_mult: 1.0
+  }
+  param {
+    lr_mult: 2.0
+    decay_mult: 1.0
+  }
+  scale_param {
+    bias_term: true
+  }
+}
+layer {
+  name: "last_relu"
+  type: "ReLU"
+  bottom: "layer_512_1_sum"
+  top: "fc7"
+}
+
+layer {
+  name: "conv6_1_h"
+  type: "Convolution"
+  bottom: "fc7"
+  top: "conv6_1_h"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 128
+    pad: 0
+    kernel_size: 1
+    stride: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "conv6_1_relu"
+  type: "ReLU"
+  bottom: "conv6_1_h"
+  top: "conv6_1_h"
+}
+layer {
+  name: "conv6_2_h"
+  type: "Convolution"
+  bottom: "conv6_1_h"
+  top: "conv6_2_h"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 256
+    pad: 1
+    kernel_size: 3
+    stride: 2
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "conv6_2_relu"
+  type: "ReLU"
+  bottom: "conv6_2_h"
+  top: "conv6_2_h"
+}
+layer {
+  name: "conv7_1_h"
+  type: "Convolution"
+  bottom: "conv6_2_h"
+  top: "conv7_1_h"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 64
+    pad: 0
+    kernel_size: 1
+    stride: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "conv7_1_relu"
+  type: "ReLU"
+  bottom: "conv7_1_h"
+  top: "conv7_1_h"
+}
+layer {
+  name: "conv7_2_h"
+  type: "Convolution"
+  bottom: "conv7_1_h"
+  top: "conv7_2_h"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 128
+    pad: 1
+    kernel_size: 3
+    stride: 2
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "conv7_2_relu"
+  type: "ReLU"
+  bottom: "conv7_2_h"
+  top: "conv7_2_h"
+}
+layer {
+  name: "conv8_1_h"
+  type: "Convolution"
+  bottom: "conv7_2_h"
+  top: "conv8_1_h"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 64
+    pad: 0
+    kernel_size: 1
+    stride: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "conv8_1_relu"
+  type: "ReLU"
+  bottom: "conv8_1_h"
+  top: "conv8_1_h"
+}
+layer {
+  name: "conv8_2_h"
+  type: "Convolution"
+  bottom: "conv8_1_h"
+  top: "conv8_2_h"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 128
+    pad: 0
+    kernel_size: 3
+    stride: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "conv8_2_relu"
+  type: "ReLU"
+  bottom: "conv8_2_h"
+  top: "conv8_2_h"
+}
+layer {
+  name: "conv9_1_h"
+  type: "Convolution"
+  bottom: "conv8_2_h"
+  top: "conv9_1_h"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 64
+    pad: 0
+    kernel_size: 1
+    stride: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "conv9_1_relu"
+  type: "ReLU"
+  bottom: "conv9_1_h"
+  top: "conv9_1_h"
+}
+layer {
+  name: "conv9_2_h"
+  type: "Convolution"
+  bottom: "conv9_1_h"
+  top: "conv9_2_h"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 128
+    pad: 0
+    kernel_size: 3
+    stride: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "conv9_2_relu"
+  type: "ReLU"
+  bottom: "conv9_2_h"
+  top: "conv9_2_h"
+}
+layer {
+  name: "conv4_3_norm"
+  type: "Normalize"
+  bottom: "layer_256_1_bn1"
+  top: "conv4_3_norm"
+  norm_param {
+    across_spatial: false
+    scale_filler {
+      type: "constant"
+      value: 20
+    }
+    channel_shared: false
+  }
+}
+layer {
+  name: "conv4_3_norm_mbox_loc"
+  type: "Convolution"
+  bottom: "conv4_3_norm"
+  top: "conv4_3_norm_mbox_loc"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 16
+    pad: 1
+    kernel_size: 3
+    stride: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "conv4_3_norm_mbox_loc_perm"
+  type: "Permute"
+  bottom: "conv4_3_norm_mbox_loc"
+  top: "conv4_3_norm_mbox_loc_perm"
+  permute_param {
+    order: 0
+    order: 2
+    order: 3
+    order: 1
+  }
+}
+layer {
+  name: "conv4_3_norm_mbox_loc_flat"
+  type: "Flatten"
+  bottom: "conv4_3_norm_mbox_loc_perm"
+  top: "conv4_3_norm_mbox_loc_flat"
+  flatten_param {
+    axis: 1
+  }
+}
+layer {
+  name: "conv4_3_norm_mbox_conf"
+  type: "Convolution"
+  bottom: "conv4_3_norm"
+  top: "conv4_3_norm_mbox_conf"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 8  # 84
+    pad: 1
+    kernel_size: 3
+    stride: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "conv4_3_norm_mbox_conf_perm"
+  type: "Permute"
+  bottom: "conv4_3_norm_mbox_conf"
+  top: "conv4_3_norm_mbox_conf_perm"
+  permute_param {
+    order: 0
+    order: 2
+    order: 3
+    order: 1
+  }
+}
+layer {
+  name: "conv4_3_norm_mbox_conf_flat"
+  type: "Flatten"
+  bottom: "conv4_3_norm_mbox_conf_perm"
+  top: "conv4_3_norm_mbox_conf_flat"
+  flatten_param {
+    axis: 1
+  }
+}
+layer {
+  name: "conv4_3_norm_mbox_priorbox"
+  type: "PriorBox"
+  bottom: "conv4_3_norm"
+  bottom: "data"
+  top: "conv4_3_norm_mbox_priorbox"
+  prior_box_param {
+    min_size: 30.0
+    max_size: 60.0
+    aspect_ratio: 2
+    flip: true
+    clip: false
+    variance: 0.1
+    variance: 0.1
+    variance: 0.2
+    variance: 0.2
+    step: 8
+    offset: 0.5
+  }
+}
+layer {
+  name: "fc7_mbox_loc"
+  type: "Convolution"
+  bottom: "fc7"
+  top: "fc7_mbox_loc"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 24
+    pad: 1
+    kernel_size: 3
+    stride: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "fc7_mbox_loc_perm"
+  type: "Permute"
+  bottom: "fc7_mbox_loc"
+  top: "fc7_mbox_loc_perm"
+  permute_param {
+    order: 0
+    order: 2
+    order: 3
+    order: 1
+  }
+}
+layer {
+  name: "fc7_mbox_loc_flat"
+  type: "Flatten"
+  bottom: "fc7_mbox_loc_perm"
+  top: "fc7_mbox_loc_flat"
+  flatten_param {
+    axis: 1
+  }
+}
+layer {
+  name: "fc7_mbox_conf"
+  type: "Convolution"
+  bottom: "fc7"
+  top: "fc7_mbox_conf"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 12  # 126
+    pad: 1
+    kernel_size: 3
+    stride: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "fc7_mbox_conf_perm"
+  type: "Permute"
+  bottom: "fc7_mbox_conf"
+  top: "fc7_mbox_conf_perm"
+  permute_param {
+    order: 0
+    order: 2
+    order: 3
+    order: 1
+  }
+}
+layer {
+  name: "fc7_mbox_conf_flat"
+  type: "Flatten"
+  bottom: "fc7_mbox_conf_perm"
+  top: "fc7_mbox_conf_flat"
+  flatten_param {
+    axis: 1
+  }
+}
+layer {
+  name: "fc7_mbox_priorbox"
+  type: "PriorBox"
+  bottom: "fc7"
+  bottom: "data"
+  top: "fc7_mbox_priorbox"
+  prior_box_param {
+    min_size: 60.0
+    max_size: 111.0
+    aspect_ratio: 2
+    aspect_ratio: 3
+    flip: true
+    clip: false
+    variance: 0.1
+    variance: 0.1
+    variance: 0.2
+    variance: 0.2
+    step: 16
+    offset: 0.5
+  }
+}
+layer {
+  name: "conv6_2_mbox_loc"
+  type: "Convolution"
+  bottom: "conv6_2_h"
+  top: "conv6_2_mbox_loc"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 24
+    pad: 1
+    kernel_size: 3
+    stride: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "conv6_2_mbox_loc_perm"
+  type: "Permute"
+  bottom: "conv6_2_mbox_loc"
+  top: "conv6_2_mbox_loc_perm"
+  permute_param {
+    order: 0
+    order: 2
+    order: 3
+    order: 1
+  }
+}
+layer {
+  name: "conv6_2_mbox_loc_flat"
+  type: "Flatten"
+  bottom: "conv6_2_mbox_loc_perm"
+  top: "conv6_2_mbox_loc_flat"
+  flatten_param {
+    axis: 1
+  }
+}
+layer {
+  name: "conv6_2_mbox_conf"
+  type: "Convolution"
+  bottom: "conv6_2_h"
+  top: "conv6_2_mbox_conf"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 12  # 126
+    pad: 1
+    kernel_size: 3
+    stride: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "conv6_2_mbox_conf_perm"
+  type: "Permute"
+  bottom: "conv6_2_mbox_conf"
+  top: "conv6_2_mbox_conf_perm"
+  permute_param {
+    order: 0
+    order: 2
+    order: 3
+    order: 1
+  }
+}
+layer {
+  name: "conv6_2_mbox_conf_flat"
+  type: "Flatten"
+  bottom: "conv6_2_mbox_conf_perm"
+  top: "conv6_2_mbox_conf_flat"
+  flatten_param {
+    axis: 1
+  }
+}
+layer {
+  name: "conv6_2_mbox_priorbox"
+  type: "PriorBox"
+  bottom: "conv6_2_h"
+  bottom: "data"
+  top: "conv6_2_mbox_priorbox"
+  prior_box_param {
+    min_size: 111.0
+    max_size: 162.0
+    aspect_ratio: 2
+    aspect_ratio: 3
+    flip: true
+    clip: false
+    variance: 0.1
+    variance: 0.1
+    variance: 0.2
+    variance: 0.2
+    step: 32
+    offset: 0.5
+  }
+}
+layer {
+  name: "conv7_2_mbox_loc"
+  type: "Convolution"
+  bottom: "conv7_2_h"
+  top: "conv7_2_mbox_loc"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 24
+    pad: 1
+    kernel_size: 3
+    stride: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "conv7_2_mbox_loc_perm"
+  type: "Permute"
+  bottom: "conv7_2_mbox_loc"
+  top: "conv7_2_mbox_loc_perm"
+  permute_param {
+    order: 0
+    order: 2
+    order: 3
+    order: 1
+  }
+}
+layer {
+  name: "conv7_2_mbox_loc_flat"
+  type: "Flatten"
+  bottom: "conv7_2_mbox_loc_perm"
+  top: "conv7_2_mbox_loc_flat"
+  flatten_param {
+    axis: 1
+  }
+}
+layer {
+  name: "conv7_2_mbox_conf"
+  type: "Convolution"
+  bottom: "conv7_2_h"
+  top: "conv7_2_mbox_conf"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 12  # 126
+    pad: 1
+    kernel_size: 3
+    stride: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "conv7_2_mbox_conf_perm"
+  type: "Permute"
+  bottom: "conv7_2_mbox_conf"
+  top: "conv7_2_mbox_conf_perm"
+  permute_param {
+    order: 0
+    order: 2
+    order: 3
+    order: 1
+  }
+}
+layer {
+  name: "conv7_2_mbox_conf_flat"
+  type: "Flatten"
+  bottom: "conv7_2_mbox_conf_perm"
+  top: "conv7_2_mbox_conf_flat"
+  flatten_param {
+    axis: 1
+  }
+}
+layer {
+  name: "conv7_2_mbox_priorbox"
+  type: "PriorBox"
+  bottom: "conv7_2_h"
+  bottom: "data"
+  top: "conv7_2_mbox_priorbox"
+  prior_box_param {
+    min_size: 162.0
+    max_size: 213.0
+    aspect_ratio: 2
+    aspect_ratio: 3
+    flip: true
+    clip: false
+    variance: 0.1
+    variance: 0.1
+    variance: 0.2
+    variance: 0.2
+    step: 64
+    offset: 0.5
+  }
+}
+layer {
+  name: "conv8_2_mbox_loc"
+  type: "Convolution"
+  bottom: "conv8_2_h"
+  top: "conv8_2_mbox_loc"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 16
+    pad: 1
+    kernel_size: 3
+    stride: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "conv8_2_mbox_loc_perm"
+  type: "Permute"
+  bottom: "conv8_2_mbox_loc"
+  top: "conv8_2_mbox_loc_perm"
+  permute_param {
+    order: 0
+    order: 2
+    order: 3
+    order: 1
+  }
+}
+layer {
+  name: "conv8_2_mbox_loc_flat"
+  type: "Flatten"
+  bottom: "conv8_2_mbox_loc_perm"
+  top: "conv8_2_mbox_loc_flat"
+  flatten_param {
+    axis: 1
+  }
+}
+layer {
+  name: "conv8_2_mbox_conf"
+  type: "Convolution"
+  bottom: "conv8_2_h"
+  top: "conv8_2_mbox_conf"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 8  # 84
+    pad: 1
+    kernel_size: 3
+    stride: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "conv8_2_mbox_conf_perm"
+  type: "Permute"
+  bottom: "conv8_2_mbox_conf"
+  top: "conv8_2_mbox_conf_perm"
+  permute_param {
+    order: 0
+    order: 2
+    order: 3
+    order: 1
+  }
+}
+layer {
+  name: "conv8_2_mbox_conf_flat"
+  type: "Flatten"
+  bottom: "conv8_2_mbox_conf_perm"
+  top: "conv8_2_mbox_conf_flat"
+  flatten_param {
+    axis: 1
+  }
+}
+layer {
+  name: "conv8_2_mbox_priorbox"
+  type: "PriorBox"
+  bottom: "conv8_2_h"
+  bottom: "data"
+  top: "conv8_2_mbox_priorbox"
+  prior_box_param {
+    min_size: 213.0
+    max_size: 264.0
+    aspect_ratio: 2
+    flip: true
+    clip: false
+    variance: 0.1
+    variance: 0.1
+    variance: 0.2
+    variance: 0.2
+    step: 100
+    offset: 0.5
+  }
+}
+layer {
+  name: "conv9_2_mbox_loc"
+  type: "Convolution"
+  bottom: "conv9_2_h"
+  top: "conv9_2_mbox_loc"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 16
+    pad: 1
+    kernel_size: 3
+    stride: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "conv9_2_mbox_loc_perm"
+  type: "Permute"
+  bottom: "conv9_2_mbox_loc"
+  top: "conv9_2_mbox_loc_perm"
+  permute_param {
+    order: 0
+    order: 2
+    order: 3
+    order: 1
+  }
+}
+layer {
+  name: "conv9_2_mbox_loc_flat"
+  type: "Flatten"
+  bottom: "conv9_2_mbox_loc_perm"
+  top: "conv9_2_mbox_loc_flat"
+  flatten_param {
+    axis: 1
+  }
+}
+layer {
+  name: "conv9_2_mbox_conf"
+  type: "Convolution"
+  bottom: "conv9_2_h"
+  top: "conv9_2_mbox_conf"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 8  # 84
+ pad: 1
1640
+ kernel_size: 3
1641
+ stride: 1
1642
+ weight_filler {
1643
+ type: "xavier"
1644
+ }
1645
+ bias_filler {
1646
+ type: "constant"
1647
+ value: 0
1648
+ }
1649
+ }
1650
+ }
1651
+ layer {
1652
+ name: "conv9_2_mbox_conf_perm"
1653
+ type: "Permute"
1654
+ bottom: "conv9_2_mbox_conf"
1655
+ top: "conv9_2_mbox_conf_perm"
1656
+ permute_param {
1657
+ order: 0
1658
+ order: 2
1659
+ order: 3
1660
+ order: 1
1661
+ }
1662
+ }
1663
+ layer {
1664
+ name: "conv9_2_mbox_conf_flat"
1665
+ type: "Flatten"
1666
+ bottom: "conv9_2_mbox_conf_perm"
1667
+ top: "conv9_2_mbox_conf_flat"
1668
+ flatten_param {
1669
+ axis: 1
1670
+ }
1671
+ }
1672
+ layer {
1673
+ name: "conv9_2_mbox_priorbox"
1674
+ type: "PriorBox"
1675
+ bottom: "conv9_2_h"
1676
+ bottom: "data"
1677
+ top: "conv9_2_mbox_priorbox"
1678
+ prior_box_param {
1679
+ min_size: 264.0
1680
+ max_size: 315.0
1681
+ aspect_ratio: 2
1682
+ flip: true
1683
+ clip: false
1684
+ variance: 0.1
1685
+ variance: 0.1
1686
+ variance: 0.2
1687
+ variance: 0.2
1688
+ step: 300
1689
+ offset: 0.5
1690
+ }
1691
+ }
1692
+ layer {
1693
+ name: "mbox_loc"
1694
+ type: "Concat"
1695
+ bottom: "conv4_3_norm_mbox_loc_flat"
1696
+ bottom: "fc7_mbox_loc_flat"
1697
+ bottom: "conv6_2_mbox_loc_flat"
1698
+ bottom: "conv7_2_mbox_loc_flat"
1699
+ bottom: "conv8_2_mbox_loc_flat"
1700
+ bottom: "conv9_2_mbox_loc_flat"
1701
+ top: "mbox_loc"
1702
+ concat_param {
1703
+ axis: 1
1704
+ }
1705
+ }
1706
+ layer {
1707
+ name: "mbox_conf"
1708
+ type: "Concat"
1709
+ bottom: "conv4_3_norm_mbox_conf_flat"
1710
+ bottom: "fc7_mbox_conf_flat"
1711
+ bottom: "conv6_2_mbox_conf_flat"
1712
+ bottom: "conv7_2_mbox_conf_flat"
1713
+ bottom: "conv8_2_mbox_conf_flat"
1714
+ bottom: "conv9_2_mbox_conf_flat"
1715
+ top: "mbox_conf"
1716
+ concat_param {
1717
+ axis: 1
1718
+ }
1719
+ }
1720
+ layer {
1721
+ name: "mbox_priorbox"
1722
+ type: "Concat"
1723
+ bottom: "conv4_3_norm_mbox_priorbox"
1724
+ bottom: "fc7_mbox_priorbox"
1725
+ bottom: "conv6_2_mbox_priorbox"
1726
+ bottom: "conv7_2_mbox_priorbox"
1727
+ bottom: "conv8_2_mbox_priorbox"
1728
+ bottom: "conv9_2_mbox_priorbox"
1729
+ top: "mbox_priorbox"
1730
+ concat_param {
1731
+ axis: 2
1732
+ }
1733
+ }
1734
+
1735
+ layer {
1736
+ name: "mbox_conf_reshape"
1737
+ type: "Reshape"
1738
+ bottom: "mbox_conf"
1739
+ top: "mbox_conf_reshape"
1740
+ reshape_param {
1741
+ shape {
1742
+ dim: 0
1743
+ dim: -1
1744
+ dim: 2
1745
+ }
1746
+ }
1747
+ }
1748
+ layer {
1749
+ name: "mbox_conf_softmax"
1750
+ type: "Softmax"
1751
+ bottom: "mbox_conf_reshape"
1752
+ top: "mbox_conf_softmax"
1753
+ softmax_param {
1754
+ axis: 2
1755
+ }
1756
+ }
1757
+ layer {
1758
+ name: "mbox_conf_flatten"
1759
+ type: "Flatten"
1760
+ bottom: "mbox_conf_softmax"
1761
+ top: "mbox_conf_flatten"
1762
+ flatten_param {
1763
+ axis: 1
1764
+ }
1765
+ }
1766
+
1767
+ layer {
1768
+ name: "detection_out"
1769
+ type: "DetectionOutput"
1770
+ bottom: "mbox_loc"
1771
+ bottom: "mbox_conf_flatten"
1772
+ bottom: "mbox_priorbox"
1773
+ top: "detection_out"
1774
+ include {
1775
+ phase: TEST
1776
+ }
1777
+ detection_output_param {
1778
+ num_classes: 2
1779
+ share_location: true
1780
+ background_label_id: 0
1781
+ nms_param {
1782
+ nms_threshold: 0.45
1783
+ top_k: 400
1784
+ }
1785
+ code_type: CENTER_SIZE
1786
+ keep_top_k: 200
1787
+ confidence_threshold: 0.01
1788
+ clip: 1
1789
+ }
1790
+ }
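For context, a minimal sketch of how this SSD deploy definition and the res10_300x300_ssd_iter_140000.caffemodel added below are typically consumed with OpenCV's DNN module. The prototxt filename, the (104, 177, 123) mean values, and the 0.5 confidence cut-off are common conventions for this model, assumed here rather than taken from this commit:

import cv2
import numpy as np

# Load the face detector from the prototxt above (commonly named
# deploy.prototxt; the exact filename in this repo may differ) plus
# the Caffe weights added later in this commit.
net = cv2.dnn.readNetFromCaffe("deploy.prototxt",
                               "res10_300x300_ssd_iter_140000.caffemodel")

img = cv2.imread("frame.jpg")  # hypothetical input frame
h, w = img.shape[:2]

# The network expects a 300x300 BGR blob (see the 300-pixel prior-box
# geometry above); (104, 177, 123) is the usual per-channel mean.
blob = cv2.dnn.blobFromImage(cv2.resize(img, (300, 300)), 1.0,
                             (300, 300), (104.0, 177.0, 123.0))
net.setInput(blob)
detections = net.forward()  # DetectionOutput rows: id, label, conf, x1, y1, x2, y2

for i in range(detections.shape[2]):
    confidence = detections[0, 0, i, 2]
    if confidence > 0.5:  # stricter than the prototxt's 0.01 floor
        # Box corners are normalised to [0, 1] (clip: 1 above); scale to pixels.
        box = detections[0, 0, i, 3:7] * np.array([w, h, w, h])
        x1, y1, x2, y2 = box.astype(int)
        cv2.rectangle(img, (x1, y1), (x2, y2), (0, 255, 0), 2)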
face_detection_yunet_2023mar.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8f2383e4dd3cfbb4553ea8718107fc0423210dc964f9f4280604804ed2552fa4
+ size 232589
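A minimal sketch of consuming the YuNet weights above through OpenCV's FaceDetectorYN API (available in OpenCV 4.5.4+); the input size and score threshold are illustrative assumptions:

import cv2

# Create the detector from the ONNX file added in this commit.
detector = cv2.FaceDetectorYN.create(
    "face_detection_yunet_2023mar.onnx",
    "",           # no separate config file is needed for ONNX models
    (320, 320),   # placeholder input size; reset to the real frame size below
    0.6,          # score threshold (assumed value)
)

img = cv2.imread("frame.jpg")  # hypothetical input frame
detector.setInputSize((img.shape[1], img.shape[0]))
_, faces = detector.detect(img)  # faces: Nx15 array, or None if nothing found
if faces is not None:
    for f in faces:
        # Each row: x, y, w, h, five landmark (x, y) pairs, then the score.
        x, y, w, h = f[:4].astype(int)
        cv2.rectangle(img, (x, y), (x + w, y + h), (0, 255, 0), 2)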
haarcascade_frontalface_default.xml ADDED
The diff for this file is too large to render. See raw diff
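And the classical route with the Haar cascade above; scaleFactor and minNeighbors are typical values, assumed here for illustration:

import cv2

# Load the pretrained frontal-face cascade added in this commit.
cascade = cv2.CascadeClassifier("haarcascade_frontalface_default.xml")

img = cv2.imread("frame.jpg")  # hypothetical input frame
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)  # Haar cascades run on grayscale
faces = cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5)
for (x, y, w, h) in faces:
    cv2.rectangle(img, (x, y), (x + w, y + h), (255, 0, 0), 2)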
requirements.txt ADDED
@@ -0,0 +1,70 @@
+ absl-py==2.2.2
+ altair==5.5.0
+ astunparse==1.6.3
+ attrs==25.3.0
+ blinker==1.9.0
+ cachetools==5.5.2
+ certifi==2025.4.26
+ charset-normalizer==3.4.2
+ click==8.1.8
+ filelock==3.18.0
+ flatbuffers==25.2.10
+ fsspec==2025.3.2
+ gast==0.6.0
+ gitdb==4.0.12
+ GitPython==3.1.44
+ google-pasta==0.2.0
+ grpcio==1.71.0
+ h5py==3.13.0
+ idna==3.10
+ importlib_metadata==8.7.0
+ Jinja2==3.1.6
+ jsonschema==4.23.0
+ jsonschema-specifications==2025.4.1
+ keras==3.9.2
+ libclang==18.1.1
+ Markdown==3.8
+ markdown-it-py==3.0.0
+ MarkupSafe==3.0.2
+ mdurl==0.1.2
+ ml_dtypes==0.5.1
+ mpmath==1.3.0
+ namex==0.0.9
+ narwhals==1.39.0
+ networkx==3.2.1
+ numpy==2.0.2
+ opencv-python==4.11.0.86
+ opt_einsum==3.4.0
+ optree==0.15.0
+ packaging==24.2
+ pandas==2.2.3
+ pillow==11.2.1
+ protobuf==5.29.4
+ pyarrow==20.0.0
+ pydeck==0.9.1
+ Pygments==2.19.1
+ python-dateutil==2.9.0.post0
+ pytz==2025.2
+ referencing==0.36.2
+ requests==2.32.3
+ rich==14.0.0
+ rpds-py==0.24.0
+ six==1.17.0
+ smmap==5.0.2
+ streamlit==1.45.1
+ sympy==1.14.0
+ tenacity==9.1.2
+ tensorboard==2.19.0
+ tensorboard-data-server==0.7.2
+ tensorflow==2.19.0
+ tensorflow-io-gcs-filesystem==0.37.1
+ termcolor==3.1.0
+ toml==0.10.2
+ torch==2.7.0
+ tornado==6.4.2
+ typing_extensions==4.13.2
+ tzdata==2025.2
+ urllib3==2.4.0
+ Werkzeug==3.1.3
+ wrapt==1.17.2
+ zipp==3.21.0
res10_300x300_ssd_iter_140000.caffemodel ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2a56a11a57a4a295956b0660b4a3d76bbdca2206c4961cea8efe7d95c7cb2f2d
+ size 10666211
sample_videos/Sample.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:eb48fbbfe295461889585a2c3ffe592ba208d2501018b9517f158108f11acd10
+ size 11293922