import streamlit as st
import torch
import cv2
import mediapipe as mp
from transformers import SwinForImageClassification, AutoFeatureExtractor
from PIL import Image
import numpy as np

# Module-level MediaPipe face detector; preprocess_image() uses it to locate
# the face to crop before classification.
# NOTE(review): per the MediaPipe docs, model_selection=1 picks the full-range
# model (0 is the short-range one) -- confirm against the installed version.
mp_face_detection = mp.solutions.face_detection.FaceDetection(
    model_selection=1,
    # Detections scoring below this confidence are discarded by MediaPipe.
    min_detection_confidence=0.5,
)

# Initialize model and labels
@st.cache_resource
def load_model():
    """Build the face-shape classifier and its matching feature extractor.

    The function is wrapped in ``st.cache_resource`` so the loaded objects are
    reused instead of being rebuilt on every call.

    Returns:
        A ``(model, feature_extractor, id2label)`` tuple: the Swin classifier
        in eval mode with the weights from ``swin.pth`` loaded on CPU, the
        feature extractor of the same base checkpoint, and the dict mapping
        class index to face-shape name.
    """
    checkpoint = "microsoft/swin-tiny-patch4-window7-224"

    # Class indices follow the order of this tuple.
    shape_names = ('Heart', 'Oblong', 'Oval', 'Round', 'Square')
    id2label = dict(enumerate(shape_names))
    label2id = {name: index for index, name in id2label.items()}

    # ignore_mismatched_sizes lets the 5-class head be created even though the
    # checkpoint's head presumably has a different size; the fine-tuned
    # weights loaded below then fill it in.
    classifier = SwinForImageClassification.from_pretrained(
        checkpoint,
        label2id=label2id,
        id2label=id2label,
        ignore_mismatched_sizes=True,
    )

    # NOTE(review): torch.load unpickles the file, so 'swin.pth' must come
    # from a trusted source (consider weights_only=True if the installed
    # torch version supports it).
    fine_tuned_weights = torch.load('swin.pth', map_location='cpu')
    classifier.load_state_dict(fine_tuned_weights)
    classifier.eval()

    extractor = AutoFeatureExtractor.from_pretrained(checkpoint)
    return classifier, extractor, id2label

# Load model components and labels.
# load_model() is decorated with st.cache_resource, so this call reuses the
# already-loaded objects rather than rebuilding them each time the script runs.
# The three names below are module globals read by preprocess_image() and
# predict_face_shape().
model, feature_extractor, id2label = load_model()  # Receive id2label here

# Frame style shown to the user for each face-shape label the classifier can
# return. The values are user-facing Indonesian text; English glosses inline.
glasses_recommendations = {
    "Heart": "Rimless (tanpa bingkai bawah)",  # rimless (no lower rim)
    "Oblong": "Kotak",                         # square / boxy
    "Oval": "Berbagai bentuk bingkai",         # various frame shapes
    "Round": "Kotak",                          # square / boxy
    "Square": "Oval atau bundar",              # oval or round
}

def preprocess_image(image):
    """Crop the first detected face and turn it into model input.

    Args:
        image: Image array indexed as (row, column, channel). It is converted
            with ``cv2.COLOR_BGR2RGB`` before detection, so BGR channel order
            is expected.

    Returns:
        The ``'pixel_values'`` entry produced by the feature extractor when
        called with ``return_tensors="pt"``.

    Raises:
        ValueError: If no face is detected, or if the face box (after clipping
            to the image bounds) is less than 10 pixels wide or tall.
    """
    detections = mp_face_detection.process(
        cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    ).detections
    if not detections:
        raise ValueError("No face detected")

    # Only the first reported detection is used.
    box = detections[0].location_data.relative_bounding_box
    img_h, img_w, _ = image.shape

    # The box is expressed as fractions of the image size; scale to pixels
    # and clip to the image so the slice below stays in range.
    left = max(0, int(box.xmin * img_w))
    top = max(0, int(box.ymin * img_h))
    right = min(img_w, int((box.xmin + box.width) * img_w))
    bottom = min(img_h, int((box.ymin + box.height) * img_h))

    # A side shorter than 10 px also covers the empty / inverted box case.
    if right - left < 10 or bottom - top < 10:
        raise ValueError("Invalid face crop dimensions")

    face_bgr = cv2.resize(image[top:bottom, left:right], (224, 224))
    face_rgb = cv2.cvtColor(face_bgr, cv2.COLOR_BGR2RGB)

    # The feature extractor is fed a PIL image built from the RGB crop.
    encoded = feature_extractor(images=Image.fromarray(face_rgb), return_tensors="pt")
    return encoded['pixel_values']


def predict_face_shape(image):
    """Classify the face shape in an image and return its label.

    Args:
        image: Image array passed straight to ``preprocess_image``.

    Returns:
        The ``id2label`` entry for the class with the highest logit.

    Raises:
        ValueError: Propagated from ``preprocess_image`` when no usable face
            crop can be produced.
    """
    # Inference is pinned to the CPU (the deployment target is Hugging Face
    # Spaces, per the original author's note).
    cpu = torch.device("cpu")
    pixel_values = preprocess_image(image).to(cpu)

    # No gradients are needed for inference.
    with torch.no_grad():
        logits = model(pixel_values).logits

    best_index = torch.argmax(logits, dim=1).item()
    return id2label[best_index]

# Streamlit UI: upload a photo, show it, then display the predicted face shape
# and the matching glasses recommendation.
st.title("Face Shape & Glasses Recommender")
st.write("Upload a face photo for shape analysis and glasses recommendations")

uploaded_file = st.file_uploader("Choose an image...", type=["jpg", "jpeg", "png"])

if uploaded_file is not None:
    try:
        # Decode inside the error boundary: the uploader only filters by file
        # extension, so a corrupt or mislabelled file fails here and must be
        # reported with st.error rather than as an unhandled traceback.
        image = Image.open(uploaded_file).convert('RGB')
    except Exception as e:  # UI boundary: report any decode failure to the user
        st.error(f"Error: {str(e)}")
    else:
        img_array = np.array(image)

        # NOTE(review): newer Streamlit releases deprecate use_column_width in
        # favour of use_container_width -- confirm against the pinned version.
        st.image(image, caption='Uploaded Image', use_column_width=True)

        try:
            with st.spinner('Analyzing...'):
                # PIL yields RGB; predict_face_shape's pipeline expects BGR.
                prediction = predict_face_shape(cv2.cvtColor(img_array, cv2.COLOR_RGB2BGR))
                recommendation = glasses_recommendations[prediction]
        except Exception as e:  # UI boundary: e.g. "No face detected"
            st.error(f"Error: {str(e)}")
        else:
            st.success(f"Predicted Face Shape: **{prediction}**")
            st.info(f"Recommended Glasses Frame: **{recommendation}**")