# Hugging Face Space: cpg716/staffmanager-llama4-scout
# Status: Running
# File size: 11,090 bytes

import gradio as gr
import torch
from PIL import Image
import io
import json
import time
import os
import hashlib
import base64
from huggingface_hub import login
import traceback
import sys
import requests

# Print Python and library versions for debugging.
# These lines run at import time and write to stdout.
print(f"Python version: {sys.version}")
print(f"PyTorch version: {torch.__version__}")
# transformers is imported at this point rather than in the import block at the
# top of the file, so the two version lines above are already printed if this
# import fails.
import transformers
print(f"Transformers version: {transformers.__version__}")

# Read the Hugging Face access token from the environment. The value is kept in
# the module-level ``token`` name (empty string when the variable is unset)
# because the model loader passes it on to ``from_pretrained``.
token = os.environ.get("HUGGINGFACE_TOKEN", "")

if token:
    # Only report that a token exists; no part of the secret is written to the
    # logs.
    print("Token found (value hidden)")

    # Explicitly log in with the token. Failure is reported but not fatal, so
    # the app still starts and any later download error surfaces on its own.
    try:
        login(token=token)
        print("Successfully logged in to Hugging Face Hub")
    except Exception as e:
        print(f"Error logging in: {e}")
else:
    # Without a token there is nothing to log in with, so the login call is
    # skipped instead of producing a second, misleading error line.
    print("No token found in environment variables!")

# Shared, lazily-populated model state: both names start out as None and are
# assigned by load_llama4_model() the first time a request needs them.
model = processor = None

# Lazy loader for the vision-language model (Llama 4 Scout, LLaVA as fallback)
def load_llama4_model():
    """Return the shared ``(model, processor)`` pair, loading it on first use.

    The first attempt loads Llama 4 Scout with 4-bit NF4 quantization. If
    anything in that attempt raises, the error and traceback are printed and
    LLaVA 1.5 7B is loaded instead. The results are stored in the module-level
    ``model`` and ``processor`` names so later calls return immediately.

    Returns:
        tuple: ``(model, processor)``.

    Raises:
        Exception: whatever the LLaVA fallback raised, when both attempts fail.
    """
    global model, processor

    # Fast path: an earlier call already populated both globals.
    if model is not None and processor is not None:
        return model, processor

    try:
        print("Loading Llama 4 Scout model...")

        # Imported lazily so that a transformers build without Llama 4
        # support still reaches the fallback below.
        from transformers import AutoProcessor, Llama4ForConditionalGeneration

        llama4_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct"

        processor = AutoProcessor.from_pretrained(llama4_id, token=token)

        # 4-bit quantization keeps the memory footprint down.
        from transformers import BitsAndBytesConfig

        four_bit_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_quant_type="nf4",
        )

        model = Llama4ForConditionalGeneration.from_pretrained(
            llama4_id,
            token=token,
            device_map="auto",
            torch_dtype=torch.bfloat16,
            quantization_config=four_bit_config,
        )

        print("Llama 4 Scout model loaded successfully!")

    except Exception as load_error:
        print(f"Error loading Llama 4 Scout model: {load_error}")
        print(traceback.format_exc())

        # Second attempt: LLaVA, loaded without a token or quantization.
        try:
            print("Falling back to LLaVA...")
            from transformers import AutoProcessor, AutoModelForVision2Seq

            llava_id = "llava-hf/llava-1.5-7b-hf"

            processor = AutoProcessor.from_pretrained(llava_id)
            model = AutoModelForVision2Seq.from_pretrained(
                llava_id,
                device_map="auto",
            )
            print("LLaVA model loaded as fallback")
        except Exception as fallback_error:
            print(f"Even fallback failed: {fallback_error}")
            raise

    return model, processor

# Encode a PIL image as an inline PNG data URL
def image_to_base64(img):
    """Return *img* serialised as a ``data:image/png;base64,...`` URL.

    Args:
        img: Object with a PIL-style ``save(fp, format=...)`` method.

    Returns:
        str: The PNG bytes, base64-encoded and prefixed with the data-URL
        header.
    """
    with io.BytesIO() as png_buffer:
        img.save(png_buffer, format="PNG")
        encoded = base64.b64encode(png_buffer.getvalue())
    return "data:image/png;base64," + encoded.decode()

# Simple caching mechanism: an in-memory dict mapping a request-derived key
# string to the decoded model answer. Nothing in this file removes entries.
cache: dict[str, str] = {}

def compute_image_hash(image):
    """Return an MD5 hex digest of *image* for use as a cache key.

    The image is scaled to a 100x100 thumbnail and serialised as PNG; the
    digest is taken over those PNG bytes, so it reflects the downscaled
    content rather than the original resolution.

    Args:
        image: PIL image to fingerprint (not modified; ``resize`` returns a
            new image).

    Returns:
        str: 32-character hexadecimal MD5 digest.
    """
    thumbnail = image.resize((100, 100), Image.LANCZOS)

    with io.BytesIO() as png_buffer:
        thumbnail.save(png_buffer, format='PNG')
        png_bytes = png_buffer.getvalue()

    return hashlib.md5(png_bytes).hexdigest()

def verify_document(img, doc_type, verification_info):
    """Verify a document image and extract the requested information.

    Args:
        img: PIL image of the document, or ``None`` when nothing was uploaded.
        doc_type: Document category; inserted into the prompt.
        verification_info: Text describing the information to extract (the UI
            supplies a JSON string); inserted into the prompt verbatim.

    Returns:
        str: The model's answer. It is prefixed with ``[CACHED] `` when served
        from the cache. A short message is returned instead when no image was
        given or when processing failed; model errors are reported this way
        rather than raised.
    """
    if img is None:
        return "Please upload an image"

    # The answer depends on the image, the document type AND the requested
    # fields, so all three must be part of the cache key. Leaving
    # verification_info out would hand back the answer to a previous, different
    # question about the same image. The text is hashed to keep the key short.
    image_hash = compute_image_hash(img)
    info_digest = hashlib.md5(str(verification_info).encode("utf-8")).hexdigest()
    cache_key = f"verify_{image_hash}_{doc_type}_{info_digest}"

    # Check cache
    if cache_key in cache:
        return f"[CACHED] {cache[cache_key]}"

    try:
        # Load (or reuse) the shared model and processor
        model, processor = load_llama4_model()

        # Create prompt
        prompt = f"""This is a {doc_type} document. 
Verify if it's authentic and extract the following information: {verification_info}
Provide your analysis in a structured format."""

        # Process with model
        start_time = time.time()
        print(f"Starting document verification at {start_time}")

        # The chat template receives the image as a base64 data URL
        img_url = image_to_base64(img)

        # Single user turn: the image followed by the instruction text
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "url": img_url},
                    {"type": "text", "text": prompt},
                ]
            },
        ]

        # Tokenize through the chat template and move tensors to the model
        inputs = processor.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            return_tensors="pt",
        ).to(model.device)

        # Generate output
        outputs = model.generate(
            **inputs,
            max_new_tokens=300,
        )

        # Drop the prompt positions so only newly generated tokens are decoded,
        # and strip special tokens (such as end-of-turn markers) so they do not
        # show up in the text shown to the user or stored in the cache.
        new_tokens = outputs[:, inputs["input_ids"].shape[-1]:]
        result = processor.batch_decode(new_tokens, skip_special_tokens=True)[0]

        end_time = time.time()
        print(f"Completed document verification in {end_time - start_time:.2f} seconds")

        # Only successful answers are cached; failures are retried next time
        cache[cache_key] = result

        return result
    except Exception as e:
        # Boundary handler for the UI callback: log the traceback and return a
        # readable message instead of raising.
        error_details = traceback.format_exc()
        print(f"Error in verify_document: {e}")
        print(error_details)
        return f"Error processing document: {str(e)}\n\nPlease try again with a different image or try later."

def check_workplace(img, industry):
    """Assess a workplace photo for safety and compliance issues.

    Args:
        img: PIL image of the workplace, or ``None`` when nothing was uploaded.
        industry: Industry name; inserted into the prompt and the cache key.

    Returns:
        str: The model's assessment. It is prefixed with ``[CACHED] `` when
        served from the cache. A short message is returned instead when no
        image was given or when processing failed; model errors are reported
        this way rather than raised.
    """
    if img is None:
        return "Please upload an image"

    # Compute image hash for caching; the industry is the only other input
    image_hash = compute_image_hash(img)
    cache_key = f"workplace_{image_hash}_{industry}"

    # Check cache
    if cache_key in cache:
        return f"[CACHED] {cache[cache_key]}"

    try:
        # Load (or reuse) the shared model and processor
        model, processor = load_llama4_model()

        # Create prompt
        prompt = f"""This is a workplace in the {industry} industry.
Identify any safety or compliance issues visible in this image.
Focus on:
1. Safety hazards
2. Required signage
3. Proper equipment usage
4. Workspace organization
5. Compliance with regulations

Format your response as a detailed assessment with:
- Issues found (if any)
- Severity level for each issue
- Recommendations for correction"""

        # Process with model
        start_time = time.time()
        print(f"Starting workplace compliance check at {start_time}")

        # The chat template receives the image as a base64 data URL
        img_url = image_to_base64(img)

        # Single user turn: the image followed by the instruction text
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "url": img_url},
                    {"type": "text", "text": prompt},
                ]
            },
        ]

        # Tokenize through the chat template and move tensors to the model
        inputs = processor.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            return_tensors="pt",
        ).to(model.device)

        # Generate output
        outputs = model.generate(
            **inputs,
            max_new_tokens=300,
        )

        # Drop the prompt positions so only newly generated tokens are decoded,
        # and strip special tokens (such as end-of-turn markers) so they do not
        # show up in the text shown to the user or stored in the cache.
        new_tokens = outputs[:, inputs["input_ids"].shape[-1]:]
        result = processor.batch_decode(new_tokens, skip_special_tokens=True)[0]

        end_time = time.time()
        print(f"Completed workplace compliance check in {end_time - start_time:.2f} seconds")

        # Only successful answers are cached; failures are retried next time
        cache[cache_key] = result

        return result
    except Exception as e:
        # Boundary handler for the UI callback: log the traceback and return a
        # readable message instead of raising.
        error_details = traceback.format_exc()
        print(f"Error in check_workplace: {e}")
        print(error_details)
        return f"Error processing workplace image: {str(e)}\n\nPlease try again with a different image or try later."

# Create Gradio interface
# The UI has three tabs: two that each send an uploaded image to one of the
# handler functions above, and a static "About" tab.
with gr.Blocks(title="StaffManager AI Assistant") as demo:
    gr.Markdown("# StaffManager AI Assistant")
    gr.Markdown("This Space provides AI capabilities for StaffManager using Llama 4 Scout.")
    
    # Tab 1: inputs on the left column, model answer on the right column.
    with gr.Tab("Document Verification"):
        with gr.Row():
            with gr.Column():
                # type="pil" makes Gradio hand the handler a PIL image.
                doc_image = gr.Image(type="pil", label="Upload Document")
                doc_type = gr.Dropdown(
                    ["identification", "tax", "employment", "policy"], 
                    label="Document Type",
                    value="identification"
                )
                # Free text; passed to verify_document as a plain string.
                verification_info = gr.Textbox(
                    label="Verification Data (JSON)",
                    value='{"name": "John Doe", "id_number": "ABC123456"}'
                )
                verify_button = gr.Button("Verify Document")
            with gr.Column():
                doc_result = gr.Textbox(label="Verification Result", lines=10)
        
        # The inputs list order matches verify_document's parameter order.
        verify_button.click(
            fn=verify_document,
            inputs=[doc_image, doc_type, verification_info],
            outputs=[doc_result]
        )
    
    # Tab 2: same two-column layout for the workplace compliance check.
    with gr.Tab("Workplace Compliance"):
        with gr.Row():
            with gr.Column():
                workplace_image = gr.Image(type="pil", label="Upload Workplace Image")
                industry_type = gr.Dropdown(
                    ["retail", "restaurant", "healthcare", "manufacturing"], 
                    label="Industry",
                    value="retail"
                )
                check_button = gr.Button("Check Compliance")
            with gr.Column():
                compliance_result = gr.Textbox(label="Compliance Assessment", lines=10)
        
        # The inputs list order matches check_workplace's parameter order.
        check_button.click(
            fn=check_workplace,
            inputs=[workplace_image, industry_type],
            outputs=[compliance_result]
        )
    
    # Tab 3: static description of the Space; no callbacks are attached.
    with gr.Tab("About"):
        gr.Markdown("""
        ## About StaffManager AI Assistant
        
        This Space uses the Llama 4 Scout model to provide AI capabilities for StaffManager:
        
        - **Document Verification**: Verify and extract information from documents
        - **Workplace Compliance**: Identify safety and compliance issues in workplace images
        
        The model is loaded on demand and results are cached for better performance.
        
        ### Model Information
        
        - Model: meta-llama/Llama-4-Scout-17B-16E-Instruct
        - Type: Multimodal (image + text)
        - Size: 17B parameters
        """)

# Launch the app
# Runs unconditionally at module level (there is no __main__ guard).
demo.launch()