# Facial Recognition with Emotion / Sentiment Detector
#
# This is a custom, hard-coded version of darknet with a
# YOLOv3 implementation for the Open Images database. It
# was written to test the viability of using YOLO for
# face detection followed by emotion / sentiment analysis.
#
# Configuration, weights and data are hard-coded.
# This version takes any image, detects faces, and then
# runs emotion / sentiment analysis on each face.
#
# Author    : Saikiran Tharimena
# Co-Authors: Kjetil Marinius Sjulsen, Juan Carlos Calvet Lopez
# Project   : Emotion / Sentiment Detection from news images
# Date      : 12 September 2022
# Version   : v0.1
#
# (C) Schibsted ASA
# Libraries
import torch
from utils import *
import gradio as gr
from numpy import array
from darknet import Darknet
from torch.autograd import Variable
from torch.cuda import is_available as check_cuda
from PIL.ImageOps import grayscale
from fastai.vision.all import PILImage, load_learner
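# NOTE: `darknet` and `utils` are the local modules bundled with this Space
# (from the PyTorch YOLOv3 implementation); the star import above is assumed
# to provide load_classes, prep_image and write_results used below.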
################## DARKNET ##################

# Parameters
batch_size = 1
confidence = 0.25
nms_thresh = 0.30
run_cuda = False

# CFG files
cfg      = 'cfg/yolov3-openimages.cfg'
clsnames = 'cfg/openimages.names'
weights  = 'cfg/yolov3-openimages.weights'

# Load classes
classes = load_classes(clsnames)
num_classes = len(classes)

# Set up the neural network
print('Load Network')
model = Darknet(cfg)

print('Load Weights')
model.load_weights(weights)

print('Successfully loaded Network')

# Check CUDA
if run_cuda:
    CUDA = check_cuda()
else:
    CUDA = False

# Input dimension
inp_dim = int(model.net_info["height"])

# Put the model on the GPU
if CUDA:
    model.cuda()

# Set the model in evaluation mode
model.eval()
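# NOTE (assumption): get_detections below relies on the row layout produced by
# this darknet implementation's write_results(): [index of image in batch,
# x1, y1, x2, y2, objectness, class score, class index], so x[1:3] and x[3:5]
# are the box corners and x[-1] is the class index.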
def get_detections(x):
    c1 = [int(y) for y in x[1:3]]
    c2 = [int(y) for y in x[3:5]]
    det_class = int(x[-1])
    label = "{0}".format(classes[det_class])
    return (label, tuple(c1 + c2))
# Face detector
def detector(image):
    # Kept as a single-image "batch" to reuse the original darknet pipeline
    imlist = [image]
    loaded_ims = [image]

    im_batches = list(map(prep_image, loaded_ims, [inp_dim for x in range(len(imlist))]))
    im_dim_list = [(x.shape[1], x.shape[0]) for x in loaded_ims]
    im_dim_list = torch.FloatTensor(im_dim_list).repeat(1, 2)

    leftover = 0
    if len(im_dim_list) % batch_size:
        leftover = 1

    if batch_size != 1:
        num_batches = len(imlist) // batch_size + leftover
        im_batches = [torch.cat((im_batches[i * batch_size: min((i + 1) * batch_size, len(im_batches))]))
                      for i in range(num_batches)]

    write = 0
    if CUDA:
        im_dim_list = im_dim_list.cuda()

    for i, batch in enumerate(im_batches):
        # Load the batch
        if CUDA:
            batch = batch.cuda()

        with torch.no_grad():
            prediction = model(Variable(batch), CUDA)

        prediction = write_results(prediction, confidence, num_classes, nms_conf=nms_thresh)

        if type(prediction) == int:
            # No detections in this batch
            for im_num, image in enumerate(imlist[i * batch_size: min((i + 1) * batch_size, len(imlist))]):
                im_id = i * batch_size + im_num
            continue

        prediction[:, 0] += i * batch_size  # transform the attribute from index in batch to index in imlist

        if not write:  # If we haven't initialised output
            output = prediction
            write = 1
        else:
            output = torch.cat((output, prediction))

        for im_num, image in enumerate(imlist[i * batch_size: min((i + 1) * batch_size, len(imlist))]):
            im_id = i * batch_size + im_num
            objs = [classes[int(x[-1])] for x in output if int(x[0]) == im_id]

        if CUDA:
            torch.cuda.synchronize()

    try:
        output
    except NameError:
        return loaded_ims[0], []

    # Rescale detections from the network input resolution back to the
    # original image coordinates
    im_dim_list = torch.index_select(im_dim_list, 0, output[:, 0].long())
    scaling_factor = torch.min(inp_dim / im_dim_list, 1)[0].view(-1, 1)

    output[:, [1, 3]] -= (inp_dim - scaling_factor * im_dim_list[:, 0].view(-1, 1)) / 2
    output[:, [2, 4]] -= (inp_dim - scaling_factor * im_dim_list[:, 1].view(-1, 1)) / 2
    output[:, 1:5] /= scaling_factor

    for i in range(output.shape[0]):
        output[i, [1, 3]] = torch.clamp(output[i, [1, 3]], 0.0, im_dim_list[i, 0])
        output[i, [2, 4]] = torch.clamp(output[i, [2, 4]], 0.0, im_dim_list[i, 1])

    detections = list(map(get_detections, output))

    if CUDA:
        torch.cuda.empty_cache()

    return loaded_ims[0], detections
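# Illustrative usage (assuming `frame` is an RGB numpy array):
#   _, dets = detector(frame)
#   # dets -> [('Human face', (x1, y1, x2, y2)), ...]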
#############################################

# Emotion
learn_emotion = load_learner('models/emotions_vgg19.pkl')
learn_emotion_labels = learn_emotion.dls.vocab

# Sentiment
learn_sentiment = load_learner('models/sentiment_vgg19.pkl')
learn_sentiment_labels = learn_sentiment.dls.vocab
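# NOTE: fastai's Learner.predict returns (decoded label, label index, probability
# tensor); predict() below maps the probability tensor onto the vocab order to
# build the {label: probability} dicts that gr.Label displays.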
def crop_images(img, bbox):
    "Crop a face from `img`; `img` should be an image object from PILImage.create."
    # Coordinates of the face in cv2 format
    xmin, ymin, xmax, ymax = bbox[1]

    # Crop the face
    return img.crop((xmin, ymin, xmax, ymax))
def detect_person_face(img, detections):
    '''Called from within detect_face. If only a person (rather than a face)
    was detected, crop the image to that person and try to detect a face again.'''
    faces = []

    # Loop through detected people
    for detection in detections:
        # Get the cropped image of the person
        temp = crop_images(img, detection)

        # Run the detector again
        _, detect = detector(array(temp)[..., :3])

        # Check for human faces
        human_face = [idx for idx, val in enumerate(detect) if val[0] == 'Human face']
        if len(human_face) == 0:
            continue

        # Force it to take only one face per person:
        # crop the face and append it to the list
        faces.append(crop_images(temp, detect[human_face[0]]))

    return faces
def detect_face(img):
    _, detections = detector(array(img)[..., :3])

    # Check for human faces
    human_face = [idx for idx, val in enumerate(detections) if val[0] == 'Human face']

    if len(human_face) == 0:
        # Fall back to person detections
        human_face = [idx for idx, val in enumerate(detections) if val[0] == 'Person']

        if len(human_face) == 0:
            return []
        else:
            # Re-run face detection on each detected person
            faces = detect_person_face(img, [detections[idx] for idx in human_face])
    else:
        # Only keep the human face detections
        faces = []
        for idx in human_face:
            faces.append(crop_images(img, detections[idx]))

    return faces
# Predict
def predict(img):
    img = PILImage.create(img)

    # Detect faces
    faces = detect_face(img)

    output = []
    if len(faces) == 0:
        # No face found: evaluate the whole image instead
        img = img.resize((48, 48))

        pred_emotion, pred_emotion_idx, probs_emotion = learn_emotion.predict(array(grayscale(img)))
        pred_sentiment, pred_sentiment_idx, probs_sentiment = learn_sentiment.predict(array(grayscale(img)))

        emotions = {learn_emotion_labels[i]: float(probs_emotion[i]) for i in range(len(learn_emotion_labels))}
        sentiments = {learn_sentiment_labels[i]: float(probs_sentiment[i]) for i in range(len(learn_sentiment_labels))}

        output = [img, emotions, sentiments, img, emotions, sentiments, img, emotions, sentiments]
    else:  # Max 3 faces for now
        for face in faces[:3]:
            img = face.resize((48, 48))

            pred_emotion, pred_emotion_idx, probs_emotion = learn_emotion.predict(array(grayscale(img)))
            pred_sentiment, pred_sentiment_idx, probs_sentiment = learn_sentiment.predict(array(grayscale(img)))

            emotions = {learn_emotion_labels[i]: float(probs_emotion[i]) for i in range(len(learn_emotion_labels))}
            sentiments = {learn_sentiment_labels[i]: float(probs_sentiment[i]) for i in range(len(learn_sentiment_labels))}

            output.append(img)
            output.append(emotions)
            output.append(sentiments)

        # Pad to 9 outputs by repeating the last detected face
        temp = output[-3:]
        while len(output) < 9:
            output = output + temp

    return output
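# The Gradio interface below has exactly 9 output components (image, emotion and
# sentiment for up to 3 people), which is why predict() always returns a list of
# 9 items, repeating entries when fewer than 3 faces are found.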
# Gradio
title = 'Face Recognition with Emotion and Sentiment Detector'

description = gr.Markdown(
    """Ever wondered what a person might be feeling just from looking at their picture?
Well, now you can! Try this fun app: upload a facial image in JPG or PNG format.
Voilà! You can now see what they might have felt when the picture was taken.

This is an updated version of the Facial Expression Classifier:
https://huggingface.co/spaces/schibsted/facial_expression_classifier
""").value
article = gr.Markdown(
    """**DISCLAIMER:** This model does not reveal the actual emotional state of a person. Use and
interpret the results at your own risk! It was built as a demo for an AI course. Sample images
were downloaded from the VG & Aftenposten news webpages. Copyrights belong to the respective
brands. All rights reserved.

**PREMISE:** The idea is to determine the overall sentiment of a news site on a daily basis
based on its pictures. We restrict the pictures to close-up facial images.

**DATA:** The FER2013 dataset consists of 48x48 pixel grayscale images of faces. There are 28,709
images in the training set and 3,589 images in the test set. However, for this demo all
pictures were combined into a single dataset and an 80:20 split was used for training. Each image
is assigned one of 7 emotions: Angry, Disgust, Fear, Happy, Sad, Surprise, and Neutral.
In addition to these 7 classes, images were re-classified into 3 sentiment categories based
on the emotions:

- Positive (Happy, Surprise)
- Negative (Angry, Disgust, Fear, Sad)
- Neutral (Neutral)

The FER2013 (preliminary version) dataset can be downloaded at:
https://www.kaggle.com/c/challenges-in-representation-learning-facial-expression-recognition-challenge/data

**EMOTION / SENTIMENT MODEL:** VGG19 was used as the base model and trained on the FER2013 dataset.
Training was done with PyTorch and fastai. Two models were trained, one for detecting emotion and the other
for detecting sentiment. Although this could have been done with a single model, two separate models
were trained for the demo.

**FACE DETECTOR:** Darknet with the YOLOv3 architecture was used for face detection. Reach out to me for full details.
In short, every image is first sent through darknet. If a face is detected, each face in the picture is passed through
the emotion/sentiment models. If a person is detected rather than a face, the image is cropped to that person and run
through the face detector again; any face found then is passed through the emotion/sentiment models. If no face is
detected at all, the entire image is evaluated to generate some score. This is done because I couldn't figure out how
to pipe a None/blank output to Gradio.Interface(). There may be an option through Gradio.Blocks(), but I was too lazy
to go through that at this stage. In addition, the output is restricted to at most 3 faces per picture.
""").value
enable_queue = True

examples = ['happy1.jpg', 'happy2.jpg', 'angry1.png', 'angry2.jpg', 'neutral1.jpg', 'neutral2.jpg']

gr.Interface(fn=predict,
             inputs=gr.Image(),
             outputs=[gr.Image(shape=(48, 48), label='Person 1'),
                      gr.Label(label='Emotion - Person 1'),
                      gr.Label(label='Sentiment - Person 1'),
                      gr.Image(shape=(48, 48), label='Person 2'),
                      gr.Label(label='Emotion - Person 2'),
                      gr.Label(label='Sentiment - Person 2'),
                      gr.Image(shape=(48, 48), label='Person 3'),
                      gr.Label(label='Emotion - Person 3'),
                      gr.Label(label='Sentiment - Person 3')],
             title=title,
             examples=examples,
             description=description,
             article=article,
             allow_flagging='never').launch(enable_queue=enable_queue)