Spaces:

assentian1970
/

DigitalSiteDiaryV2

Runtime error

App Files Files Community

assentian1970 committed on Mar 17

Commit

efd5a3f

verified ·

1 Parent(s): 6d1a54e

Update app.py

Browse files

Files changed (1) hide show

app.py +426 -436

app.py CHANGED Viewed

@@ -1,418 +1,49 @@
-import spaces
-import torch
-import argparse
 import os
-import sys
-import pickle  # For serializing frames
 import gc
-import tempfile
-import subprocess
-import time
-from datetime import datetime
-from transformers import AutoModel, AutoTokenizer
-from modelscope.hub.snapshot_download import snapshot_download
 from PIL import Image
 from decord import VideoReader, cpu
-import cv2
-import gradio as gr
-from ultralytics import YOLO
-import numpy as np
-import io
-# Install flash-attn (using prebuilt wheel mode if needed)
-subprocess.run(
-    'pip install flash-attn --no-build-isolation',
-    env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': 'TRUE'},
-    shell=True
 )
-# --------------------------------------------------------------------
-# Command-line arguments
-# --------------------------------------------------------------------
-parser = argparse.ArgumentParser(description='demo')
-parser.add_argument('--device', type=str, default='cuda', help='cuda or mps')
-parser.add_argument("--host", type=str, default="0.0.0.0")
-parser.add_argument("--port", type=int)
-# Arguments for subprocess inference mode
-parser.add_argument("--chunk_inference", action="store_true", help="Run inference on a chunk (subprocess mode).")
-parser.add_argument("--input_file", type=str, help="Path to serialized input chunk frames.")
-parser.add_argument("--output_file", type=str, help="Path to file where inference result is written.")
-parser.add_argument("--inference_prompt", type=str, help="Inference prompt for the chunk.")
-parser.add_argument("--model_path_arg", type=str, help="Model path for the subprocess.")
-args = parser.parse_args()
-device = args.device
-assert device in ['cuda', 'mps']
-# Global model configuration
-MODEL_NAME = 'iic/mPLUG-Owl3-7B-240728'
-MODEL_CACHE_DIR = os.getenv('TRANSFORMERS_CACHE', './models')
-os.makedirs(MODEL_CACHE_DIR, exist_ok=True)
-# Download and cache the model (only in the main process)
-if not args.chunk_inference:
-    try:
-        model_path = snapshot_download(MODEL_NAME, cache_dir=MODEL_CACHE_DIR)
-    except Exception as e:
-        print(f"Error downloading model: {str(e)}")
-        model_path = os.path.join(MODEL_CACHE_DIR, MODEL_NAME)
-else:
-    model_path = args.model_path_arg
-MAX_NUM_FRAMES = 64
-# Initialize YOLO model (assumed to be lightweight)
-YOLO_MODEL = YOLO('./best_yolov11.pt')  # Load YOLOv11 model
-# File type validation
-IMAGE_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp'}
-VIDEO_EXTENSIONS = {'.mp4', '.mkv', '.mov', '.avi', '.flv', '.wmv', '.webm', '.m4v'}
-def get_file_extension(filename):
-    return os.path.splitext(filename)[1].lower()
-def is_image(filename):
-    return get_file_extension(filename) in IMAGE_EXTENSIONS
-def is_video(filename):
-    return get_file_extension(filename) in VIDEO_EXTENSIONS
-# --------------------------------------------------------------------
-# Model Loading and Inference Functions
-# --------------------------------------------------------------------
-def load_model_and_tokenizer():
-    """Load a fresh instance of the model and tokenizer."""
-    try:
-        # Clear GPU memory if using CUDA (only at initial load)
-        if device == "cuda":
-            torch.cuda.empty_cache()
-            gc.collect()
-        model = AutoModel.from_pretrained(
-            model_path,
-            attn_implementation='sdpa',
-            trust_remote_code=True,
-            torch_dtype=torch.half,
-            device_map='auto'
-        )
-        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
-        model.eval()
-        processor = model.init_processor(tokenizer)
-        return model, tokenizer, processor
-    except Exception as e:
-        print(f"Error loading model: {str(e)}")
-        raise
-def process_video_chunk(video_frames, model, tokenizer, processor, prompt):
-    """Process a chunk of video frames with mPLUG model."""
-    messages = [{
-        "role": "user",
-        "content": prompt,
-        "video_frames": video_frames
-    }]
-    model_messages = []
-    videos = []
-    for msg in messages:
-        content_str = msg["content"]
-        if "video_frames" in msg and msg["video_frames"]:
-            content_str += "<|video|>"
-            videos.append(msg["video_frames"])
-        model_messages.append({"role": msg["role"], "content": content_str})
-    model_messages.append({"role": "assistant", "content": ""})
-    inputs = processor(
-        model_messages,
-        images=None,
-        videos=videos if videos else None
-    )
-    inputs.to('cuda')
-    inputs.update({
-        'tokenizer': tokenizer,
-        'max_new_tokens': 100,
-        'decode_text': True,
-        'use_cache': False  # disable caching to reduce memory buildup
-    })
-    with torch.no_grad():
-        response = model.generate(**inputs)
-    del inputs  # Free temporary memory
-    return response[0]
-# --------------------------------------------------------------------
-# Video and YOLO Functions (Unchanged)
-# --------------------------------------------------------------------
-def encode_video_in_chunks(video_path):
-    """Extract frames from a video in chunks."""
-    vr = VideoReader(video_path, ctx=cpu(0))
-    sample_fps = round(vr.get_avg_fps() / 1)  # 1 FPS
-    frame_idx = [i for i in range(0, len(vr), sample_fps)]
-    chunks = [frame_idx[i:i + MAX_NUM_FRAMES] for i in range(0, len(frame_idx), MAX_NUM_FRAMES)]
-    for chunk_idx, chunk in enumerate(chunks):
-        frames = vr.get_batch(chunk).asnumpy()
-        frames = [Image.fromarray(v.astype('uint8')) for v in frames]
-        yield chunk_idx, frames
-def process_yolo_results(results):
-    """Process YOLO detection results and count people and machinery."""
-    people_count = 0
-    machine_types = {
-        "Tower Crane": 0, "Mobile Crane": 0, "Compactor/Roller": 0, "Bulldozer": 0,
-        "Excavator": 0, "Dump Truck": 0, "Concrete Mixer": 0, "Loader": 0,
-        "Pump Truck": 0, "Pile Driver": 0, "Grader": 0, "Other Vehicle": 0
-    }
-    for r in results:
-        boxes = r.boxes
-        for box in boxes:
-            cls = int(box.cls[0])
-            conf = float(box.conf[0])
-            class_name = YOLO_MODEL.names[cls]
-            if class_name.lower() == 'worker' and conf > 0.5:
-                people_count += 1
-            machinery_mapping = {
-                'tower_crane': "Tower Crane",
-                'mobile_crane': "Mobile Crane",
-                'compactor': "Compactor/Roller",
-                'roller': "Compactor/Roller",
-                'bulldozer': "Bulldozer",
-                'dozer': "Bulldozer",
-                'excavator': "Excavator",
-                'dump_truck': "Dump Truck",
-                'truck': "Dump Truck",
-                'concrete_mixer_truck': "Concrete Mixer",
-                'loader': "Loader",
-                'pump_truck': "Pump Truck",
-                'pile_driver': "Pile Driver",
-                'grader': "Grader",
-                'other_vehicle': "Other Vehicle"
-            }
-            if conf > 0.5:
-                class_lower = class_name.lower()
-                for key, value in machinery_mapping.items():
-                    if key in class_lower:
-                        machine_types[value] += 1
-                        break
-    total_machinery = sum(machine_types.values())
-    return people_count, total_machinery, machine_types
-def detect_people_and_machinery(media_path):
-    """Detect people and machinery using YOLOv11 for both images and videos."""
-    try:
-        max_people_count = 0
-        max_machine_types = {
-            "Tower Crane": 0, "Mobile Crane": 0, "Compactor/Roller": 0, "Bulldozer": 0,
-            "Excavator": 0, "Dump Truck": 0, "Concrete Mixer": 0, "Loader": 0,
-            "Pump Truck": 0, "Pile Driver": 0, "Grader": 0, "Other Vehicle": 0
-        }
-        if isinstance(media_path, str) and is_video(media_path):
-            cap = cv2.VideoCapture(media_path)
-            fps = cap.get(cv2.CAP_PROP_FPS)
-            sample_rate = max(1, int(fps))
-            frame_count = 0
-            while cap.isOpened():
-                ret, frame = cap.read()
-                if not ret:
-                    break
-                if frame_count % sample_rate == 0:
-                    results = YOLO_MODEL(frame)
-                    people, _, machine_types = process_yolo_results(results)
-                    max_people_count = max(max_people_count, people)
-                    for k, v in machine_types.items():
-                        max_machine_types[k] = max(max_machine_types[k], v)
-                frame_count += 1
-            cap.release()
-        else:
-            if isinstance(media_path, str):
-                img = cv2.imread(media_path)
-            else:
-                img = cv2.cvtColor(np.array(media_path), cv2.COLOR_RGB2BGR)
-            results = YOLO_MODEL(img)
-            max_people_count, _, max_machine_types = process_yolo_results(results)
-        max_machine_types = {k: v for k, v in max_machine_types.items() if v > 0}
-        total_machinery_count = sum(max_machine_types.values())
-        return max_people_count, total_machinery_count, max_machine_types
-    except Exception as e:
-        print(f"Error in YOLO detection: {str(e)}")
-        return 0, 0, {}
-def process_image(image_path, model, tokenizer, processor, prompt):
-    """Process single image with mPLUG model."""
-    try:
-        image = Image.open(image_path)
-        messages = [{
-            "role": "user",
-            "content": prompt,
-            "images": [image]
-        }]
-        model_messages = []
-        images = []
-        for msg in messages:
-            content_str = msg["content"]
-            if "images" in msg and msg["images"]:
-                content_str += "<|image|>"
-                images.extend(msg["images"])
-            model_messages.append({"role": msg["role"], "content": content_str})
-        model_messages.append({"role": "assistant", "content": ""})
-        inputs = processor(model_messages, images=images, videos=None)
-        inputs.to('cuda')
-        inputs.update({
-            'tokenizer': tokenizer,
-            'max_new_tokens': 100,
-            'decode_text': True,
-            'use_cache': False
-        })
-        with torch.no_grad():
-            response = model.generate(**inputs)
-        del inputs
-        return response[0]
-    except Exception as e:
-        print(f"Error processing image: {str(e)}")
-        return "Error processing image"
-def analyze_image_activities(image_path):
-    """Analyze image using mPLUG model."""
-    try:
-        model, tokenizer, processor = load_model_and_tokenizer()
-        prompt = ("Analyze this construction site image and describe the activities happening. "
-                  "Focus on construction activities, machinery usage, and worker actions.")
-        response = process_image(image_path, model, tokenizer, processor, prompt)
-        del model, tokenizer, processor
-        torch.cuda.empty_cache()  # Final cleanup after image processing
-        gc.collect()
-        return response
-    except Exception as e:
-        print(f"Error analyzing image: {str(e)}")
-        return "Error analyzing image activities"
-def annotate_video_with_bboxes(video_path):
-    """
-    Reads the video frame-by-frame, runs YOLO, draws bounding boxes,
-    writes a per-frame summary of detected classes on the frame, and saves
-    the annotated video. Returns the annotated video path.
-    """
-    cap = cv2.VideoCapture(video_path)
-    fps = cap.get(cv2.CAP_PROP_FPS)
-    w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
-    h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
-    out_file = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
-    annotated_video_path = out_file.name
-    out_file.close()
-    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
-    writer = cv2.VideoWriter(annotated_video_path, fourcc, fps, (w, h))
-    while True:
-        ret, frame = cap.read()
-        if not ret:
-            break
-        results = YOLO_MODEL(frame)
-        frame_counts = {}
-        for r in results:
-            boxes = r.boxes
-            for box in boxes:
-                cls_id = int(box.cls[0])
-                conf = float(box.conf[0])
-                if conf < 0.5:
-                    continue
-                x1, y1, x2, y2 = box.xyxy[0]
-                class_name = YOLO_MODEL.names[cls_id]
-                x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2)
-                color = (0, 255, 0)
-                cv2.rectangle(frame, (x1, y1), (x2, y2), color, 2)
-                label_text = f"{class_name} {conf:.2f}"
-                cv2.putText(frame, label_text, (x1, y1 - 6),
-                            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255,255,255), 1)
-                frame_counts[class_name] = frame_counts.get(class_name, 0) + 1
-        summary_str = ", ".join(f"{cls_name}: {count}" for cls_name, count in frame_counts.items())
-        cv2.putText(frame, summary_str, (15, 30),
-                    cv2.FONT_HERSHEY_SIMPLEX, 1.0, (255, 255, 0), 2)
-        writer.write(frame)
-    cap.release()
-    writer.release()
-    return annotated_video_path
-# --------------------------------------------------------------------
-# Subprocess Worker: Executed when --chunk_inference flag is provided
-# --------------------------------------------------------------------
-if args.chunk_inference:
-    # In worker mode, load the serialized frames from the input file
-    try:
-        with open(args.input_file, "rb") as f:
-            frames_serialized = pickle.load(f)
-        video_frames = []
-        for img_bytes in frames_serialized:
-            video_frames.append(Image.open(io.BytesIO(img_bytes)))
-    except Exception as e:
-        print(f"Error reading input frames: {str(e)}")
-        sys.exit(1)
-    try:
-        model, tokenizer, processor = load_model_and_tokenizer()
-        response = process_video_chunk(video_frames, model, tokenizer, processor, args.inference_prompt)
-        with open(args.output_file, "w") as f:
-            f.write(response)
-        del model, tokenizer, processor
-        torch.cuda.empty_cache()
-        gc.collect()
-    except Exception as e:
-        with open(args.output_file, "w") as f:
-            f.write(f"Error in chunk inference: {str(e)}")
-    sys.exit(0)
-# --------------------------------------------------------------------
-# Main Video Analysis Function Using Subprocess Isolation
-# --------------------------------------------------------------------
-@spaces.GPU
-def analyze_video_activities_subprocess(video_path):
-    """Analyze video by processing each chunk in a separate subprocess.
-       Each subprocess loads a fresh model instance to avoid GPU memory buildup."""
-    try:
-        all_responses = []
-        chunk_generator = encode_video_in_chunks(video_path)
-        for chunk_idx, video_frames in chunk_generator:
-            # Serialize each frame in the chunk to bytes
-            temp_input = tempfile.NamedTemporaryFile(suffix=".pkl", delete=False)
-            frames_serializable = []
-            for img in video_frames:
-                with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tf:
-                    img.save(tf, format="PNG")
-                    tf.seek(0)
-                    frames_serializable.append(tf.read())
-                os.remove(tf.name)
-            with open(temp_input.name, "wb") as f:
-                pickle.dump(frames_serializable, f)
-            # Create a temporary file for subprocess output
-            temp_output = tempfile.NamedTemporaryFile(suffix=".txt", delete=False)
-            temp_output.close()
-            prompt = ("Analyze this construction site video chunk and describe the activities happening. "
-                      "Focus on construction activities, machinery usage, and worker actions.")
-            # Launch subprocess for this chunk
-            subprocess.run([
-                sys.executable, __file__,
-                "--chunk_inference",
-                "--input_file", temp_input.name,
-                "--output_file", temp_output.name,
-                "--inference_prompt", prompt,
-                "--model_path_arg", model_path,
-                "--device", device
-            ], check=True)
-            with open(temp_output.name, "r") as f:
-                response = f.read().strip()
-            all_responses.append(f"Time period {chunk_idx + 1}:\n{response}")
-            os.remove(temp_input.name)
-            os.remove(temp_output.name)
-            time.sleep(2)  # Allow time for GPU memory to fully clear before next chunk
-        return "\n\n".join(all_responses)
-    except Exception as e:
-        print(f"Error in subprocess chunk inference: {str(e)}")
-        return "Error analyzing video activities"
-# --------------------------------------------------------------------
-# Gradio Interface and Main Launch (only executed in main process)
-# --------------------------------------------------------------------
-@spaces.GPU
 def process_diary(day, date, total_people, total_machinery, machinery_types, activities, media):
-    """Process the site diary entry."""
     if media is None:
-        return [day, date, "No media uploaded", "No media uploaded", "No media uploaded", "No media uploaded", None]
     try:
         if not hasattr(media, 'name'):
             raise ValueError("Invalid file upload")
         file_ext = get_file_extension(media.name)
         if not (is_image(media.name) or is_video(media.name)):
             raise ValueError(f"Unsupported file type: {file_ext}")
         with tempfile.NamedTemporaryFile(suffix=file_ext, delete=False) as temp_file:
             temp_path = temp_file.name
             if hasattr(media, 'name') and os.path.exists(media.name):
@@ -421,53 +52,412 @@ def process_diary(day, date, total_people, total_machinery, machinery_types, act
             else:
                 file_content = media.read() if hasattr(media, 'read') else media
                 temp_file.write(file_content if isinstance(file_content, bytes) else file_content.read())
         detected_people, detected_machinery, detected_machinery_types = detect_people_and_machinery(temp_path)
         annotated_video_path = None
-        if is_image(media.name):
-            detected_activities = analyze_image_activities(temp_path)
-        else:
-            # Use the subprocess-based video analysis for each chunk
-            detected_activities = analyze_video_activities_subprocess(temp_path)
-            annotated_video_path = annotate_video_with_bboxes(temp_path)
-        if os.path.exists(temp_path):
-            os.remove(temp_path)
         detected_types_str = ", ".join([f"{k}: {v}" for k, v in detected_machinery_types.items()])
-        return [day, date, str(detected_people), str(detected_machinery), detected_types_str, detected_activities, annotated_video_path]
     except Exception as e:
         print(f"Error processing media: {str(e)}")
-        return [day, date, "Error processing media", "Error processing media", "Error processing media", "Error processing media", None]
-with gr.Blocks(title="Digital Site Diary") as demo:
     gr.Markdown("# 📝 Digital Site Diary")
-    with gr.Row():
-        with gr.Column():
-            gr.Markdown("### User Input")
-            day = gr.Textbox(label="Day", value='9')
-            date = gr.Textbox(label="Date", placeholder="YYYY-MM-DD", value=datetime.now().strftime("%Y-%m-%d"))
-            total_people = gr.Number(label="Total Number of People", precision=0, value=10)
-            total_machinery = gr.Number(label="Total Number of Machinery", precision=0, value=3)
-            machinery_types = gr.Textbox(label="Number of Machinery Per Type",
-                                         placeholder="e.g., Excavator: 2, Roller: 1",
-                                         value="Excavator: 2, Roller: 1")
-            activities = gr.Textbox(label="Activity",
-                                    placeholder="e.g., 9 AM: Excavation, 10 AM: Concreting",
-                                    value="9 AM: Excavation, 10 AM: Concreting", lines=3)
-            media = gr.File(label="Upload Image/Video", file_types=["image", "video"])
-            submit_btn = gr.Button("Submit", variant="primary")
-        with gr.Column():
-            gr.Markdown("### Model Detection")
-            model_day = gr.Textbox(label="Day")
-            model_date = gr.Textbox(label="Date")
-            model_people = gr.Textbox(label="Total Number of People")
-            model_machinery = gr.Textbox(label="Total Number of Machinery")
-            model_machinery_types = gr.Textbox(label="Number of Machinery Per Type")
-            model_activities = gr.Textbox(label="Activity", lines=5)
-            model_annotated_video = gr.Video(label="Annotated Video")
     submit_btn.click(
         fn=process_diary,
         inputs=[day, date, total_people, total_machinery, machinery_types, activities, media],
-        outputs=[model_day, model_date, model_people, model_machinery, model_machinery_types, model_activities, model_annotated_video]
     )
 if __name__ == "__main__":
-    demo.launch(share=False, debug=True, show_api=False, server_port=args.port, server_name=args.host)

+import gradio as gr
+from datetime import datetime
+import tempfile
 import os
+import json
+import torch
 import gc
 from PIL import Image
 from decord import VideoReader, cpu
+from yolo_detection import (
+    detect_people_and_machinery,
+    annotate_video_with_bboxes,
+    is_image,
+    is_video
+)
+from image_captioning import (
+    analyze_image_activities,
+    analyze_video_activities,
+    process_video_chunk,
+    load_model_and_tokenizer,
+    MAX_NUM_FRAMES
 )
+# Global storage for activities and media paths
+global_activities = []
+global_media_path = None
+# Create tmp directory for storing frames
+tmp_dir = os.path.join('.', 'tmp')
+os.makedirs(tmp_dir, exist_ok=True)
 def process_diary(day, date, total_people, total_machinery, machinery_types, activities, media):
+    """Process the site diary entry"""
+    global global_activities, global_media_path
     if media is None:
+        return [day, date, "No media uploaded", "No media uploaded", "No media uploaded", None, None, [], None]
     try:
         if not hasattr(media, 'name'):
             raise ValueError("Invalid file upload")
         file_ext = get_file_extension(media.name)
         if not (is_image(media.name) or is_video(media.name)):
             raise ValueError(f"Unsupported file type: {file_ext}")
         with tempfile.NamedTemporaryFile(suffix=file_ext, delete=False) as temp_file:
             temp_path = temp_file.name
             if hasattr(media, 'name') and os.path.exists(media.name):
             else:
                 file_content = media.read() if hasattr(media, 'read') else media
                 temp_file.write(file_content if isinstance(file_content, bytes) else file_content.read())
         detected_people, detected_machinery, detected_machinery_types = detect_people_and_machinery(temp_path)
+        print(f"Detected people: {detected_people}, machinery: {detected_machinery}, types: {detected_machinery_types}")
         annotated_video_path = None
+        detected_activities = analyze_image_activities(temp_path) if is_image(media.name) else analyze_video_activities(temp_path)
+        print(f"Detected activities: {detected_activities}")
+        # Store activities and media path globally for chat mode
+        global_activities = detected_activities
+        global_media_path = temp_path
+        if is_video(media.name):
+            annotated_video_path = temp_path  # Or use annotate_video_with_bboxes(temp_path) if implemented
         detected_types_str = ", ".join([f"{k}: {v}" for k, v in detected_machinery_types.items()])
+        # We'll return the activities as a list for the card display
+        # Clear the chat history when loading new media
+        chat_history = []
+        # Extract data for the activity table
+        activity_rows = []
+        for activity in detected_activities:
+            time = activity.get('time', 'Unknown')
+            summary = activity.get('summary', 'No description available')
+            activity_rows.append([time, summary])
+        return [day, date, str(detected_people), str(detected_machinery),
+                detected_types_str, gr.update(visible=True), annotated_video_path,
+                detected_activities, chat_history, activity_rows]
     except Exception as e:
         print(f"Error processing media: {str(e)}")
+        return [day, date, "Error processing media", "Error processing media",
+                "Error processing media", None, None, [], None, []]
+def get_file_extension(filename):
+    return os.path.splitext(filename)[1].lower()
+def on_card_click(activity_indices, history, evt: gr.SelectData):
+    """Handle clicking on an activity card in the gallery"""
+    global global_activities, global_media_path
+    # Get the index of the selected activity from the SelectData event
+    selected_idx = evt.index
+    # Map the gallery index to the actual activity index
+    if selected_idx < 0 or selected_idx >= len(activity_indices):
+        return [gr.update(visible=True), gr.update(visible=False), [], None]
+    card_idx = activity_indices[selected_idx]
+    print(f"Gallery item {selected_idx} clicked, corresponds to activity index: {card_idx}")
+    if card_idx < 0 or card_idx >= len(global_activities):
+        return [gr.update(visible=True), gr.update(visible=False), [], None]
+    selected_activity = global_activities[card_idx]
+    chunk_video_path = None
+    # Use the pre-saved chunk video if available
+    if 'chunk_path' in selected_activity and os.path.exists(selected_activity['chunk_path']):
+        chunk_video_path = selected_activity['chunk_path']
+        print(f"Using pre-saved chunk video: {chunk_video_path}")
+    else:
+        # Fallback to full video if chunk not available
+        chunk_video_path = global_media_path
+        print(f"Chunk video not available, using full video: {chunk_video_path}")
+    # Add the selected activity to chat history
+    history = []
+    history.append((None, f"🎬 Selected video at timestamp {selected_activity['time']}"))
+    # Add the thumbnail to the chat as a visual element
+    if 'thumbnail' in selected_activity and os.path.exists(selected_activity['thumbnail']):
+        # Use the tuple format for images in chatbot
+        thumbnail_path = selected_activity['thumbnail']
+        history.append((None, f"📷 Video frame at {selected_activity['time']}"))
+        history.append((None, thumbnail_path))
+    # Format message about the detected activity
+    activity_info = f"I detected the following activity:\n\n{selected_activity['summary']}"
+    if selected_activity['objects']:
+        activity_info += f"\n\nIdentified objects: {', '.join(selected_activity['objects'])}"
+    history.append(("Tell me about this video segment", activity_info))
+    return [gr.update(visible=False), gr.update(visible=True), history, chunk_video_path]
+def chat_with_video(message, history):
+    """Chat with the mPLUG model about the selected video segment"""
+    global global_activities, global_media_path
+    try:
+        # Get the selected activity from the history to identify which chunk we're discussing
+        selected_chunk_idx = None
+        selected_time = None
+        selected_activity = None
+        for entry in history:
+            if entry[0] is None and "Selected video at timestamp" in entry[1]:
+                time_str = entry[1].split("Selected video at timestamp ")[1]
+                selected_time = time_str.strip()
+                break
+        # Find the corresponding chunk
+        if selected_time:
+            for i, activity in enumerate(global_activities):
+                if activity.get('time') == selected_time:
+                    selected_chunk_idx = activity.get('chunk_id')
+                    selected_activity = activity
+                    break
+        # If we found the chunk, use the model to analyze it
+        if selected_chunk_idx is not None and global_media_path and selected_activity:
+            # Load model
+            model, tokenizer, processor = load_model_and_tokenizer()
+            # Generate prompt based on user question and add context about what's in the video
+            context = f"This video shows construction site activities at timestamp {selected_time}."
+            if selected_activity.get('objects'):
+                context += f" The scene contains {', '.join(selected_activity.get('objects'))}."
+            prompt = f"{context} Analyze this segment of construction site video and answer this question: {message}"
+            # This would ideally use the specific chunk, but for simplicity we'll use the global path
+            # In a production system, you'd extract just that chunk of the video
+            vr = VideoReader(global_media_path, ctx=cpu(0))
+            # Get the frames for this chunk
+            sample_fps = round(vr.get_avg_fps() / 1)
+            frame_idx = [i for i in range(0, len(vr), sample_fps)]
+            # Extract frames for the specific chunk
+            chunk_size = MAX_NUM_FRAMES  # From the constants in image_captioning.py
+            start_idx = selected_chunk_idx * chunk_size
+            end_idx = min(start_idx + chunk_size, len(frame_idx))
+            chunk_frames = frame_idx[start_idx:end_idx]
+            if chunk_frames:
+                frames = vr.get_batch(chunk_frames).asnumpy()
+                frames_pil = [Image.fromarray(v.astype('uint8')) for v in frames]
+                # Process frames with model
+                response = process_video_chunk(frames_pil, model, tokenizer, processor, prompt)
+                # If we couldn't save a frame, just return the text response
+                # Clean up
+                del model, tokenizer, processor
+                torch.cuda.empty_cache()
+                gc.collect()
+                return history + [(message, response)]
+            else:
+                return history + [(message, "Could not extract frames for this segment.")]
+        else:
+            # Fallback response if we can't identify the chunk
+            thumbnail = None
+            response_text = f"I'm analyzing your question about the video segment: {message}\n\nBased on what I can see in this segment, it appears to show construction activity with various machinery and workers on site. The specific details would depend on the exact timestamp you're referring to."
+            # Try to get a thumbnail from the selected activity if available
+            if selected_activity and 'thumbnail' in selected_activity and os.path.exists(selected_activity['thumbnail']):
+                thumbnail = selected_activity['thumbnail']
+                new_history = history + [(message, response_text)]
+                new_history.append((None, f"📷 Video frame at {selected_time}"))
+                new_history.append((None, thumbnail))
+                return new_history
+            return history + [(message, response_text)]
+    except Exception as e:
+        print(f"Error in chat_with_video: {str(e)}")
+        return history + [(message, f"I encountered an error while processing your question. Let me try to answer based on what I can see: {message}\n\nThe video appears to show construction site activities, but I'm having trouble with the detailed analysis at the moment.")]
+# Native Gradio activity cards
+def create_activity_cards_ui(activities):
+    """Create activity cards using native Gradio components"""
+    if not activities:
+        return gr.HTML("<div class='activity-timeline'><h3>No activities detected</h3></div>"), []
+    # Prepare data for gallery
+    thumbnails = []
+    captions = []
+    activity_indices = []
+    for i, activity in enumerate(activities):
+        thumbnail = activity.get('thumbnail', '')
+        time = activity.get('time', 'Unknown')
+        summary = activity.get('summary', 'No description available')
+        objects_list = activity.get('objects', [])
+        objects_text = f"Objects: {', '.join(objects_list)}" if objects_list else ""
+        # Truncate summary if too long
+        if len(summary) > 150:
+            summary = summary[:147] + "..."
+        thumbnails.append(thumbnail)
+        captions.append(f"Timestamp: {time} | {summary}")
+        activity_indices.append(i)
+    # Create a gallery for the thumbnails
+    gallery = gr.Gallery(
+        value=[(path, caption) for path, caption in zip(thumbnails, captions)],
+        columns=5,
+        rows=None,
+        height="auto",
+        object_fit="contain",
+        label="Activity Timeline"
+    )
+    return gallery, activity_indices
+# Create the Gradio interface: a diary form next to model results, an activity timeline gallery, and a per-segment chat view
+with gr.Blocks(title="Digital Site Diary", css="") as demo:  # css is left empty; styles are injected by the gr.HTML <style> block at the end of this layout
     gr.Markdown("# 📝 Digital Site Diary")
+    # State holders: activity_data is filled from process_diary's outputs; activity_indices is filled by create_activity_cards_ui
+    activity_data = gr.State([])
+    activity_indices = gr.State([])
+    # Create tabs for different views (only the "Site Diary" tab is defined in this layout)
+    with gr.Tabs() as tabs:
+        with gr.Tab("Site Diary"):
+            with gr.Row():
+                # User Input Column: manually entered diary values, pre-filled with sample defaults
+                with gr.Column():
+                    gr.Markdown("### User Input")
+                    day = gr.Textbox(label="Day",value='9')
+                    date = gr.Textbox(label="Date", placeholder="YYYY-MM-DD", value=datetime.now().strftime("%Y-%m-%d"))  # defaults to today's date
+                    total_people = gr.Number(label="Total Number of People", precision=0, value=10)
+                    total_machinery = gr.Number(label="Total Number of Machinery", precision=0, value=3)
+                    machinery_types = gr.Textbox(
+                        label="Number of Machinery Per Type",
+                        placeholder="e.g., Excavator: 2, Roller: 1",
+                        value="Excavator: 2, Roller: 1"
+                    )
+                    activities = gr.Textbox(
+                        label="Activity",
+                        placeholder="e.g., 9 AM: Excavation, 10 AM: Concreting",
+                        value="9 AM: Excavation, 10 AM: Concreting",
+                        lines=3
+                    )
+                    media = gr.File(label="Upload Image/Video", file_types=["image", "video"])
+                    submit_btn = gr.Button("Submit", variant="primary")
+                # Model Detection Column: these fields are outputs of the submit handler below
+                with gr.Column():
+                    gr.Markdown("### Model Detection")
+                    model_day = gr.Textbox(label="Day")
+                    model_date = gr.Textbox(label="Date")
+                    model_people = gr.Textbox(label="Total Number of People")
+                    model_machinery = gr.Textbox(label="Total Number of Machinery")
+                    model_machinery_types = gr.Textbox(label="Number of Machinery Per Type")
+                    # Heading row for the read-only table of detected activities
+                    with gr.Row():
+                        gr.Markdown("#### Activities with Timestamps")
+                    model_activities = gr.Dataframe(
+                        headers=["Time", "Activity Description"],
+                        datatype=["str", "str"],
+                        label="Detected Activities",
+                        interactive=False,
+                        wrap=True
+                    )
+            # Activity timeline section: holds the timeline column and the chat column, only one of which starts visible
+            with gr.Row():
+                # Timeline View (default visible): thumbnail gallery plus the full video
+                with gr.Column(visible=True) as timeline_view:
+                    activity_gallery = gr.Gallery(label="Activity Timeline")
+                    model_annotated_video = gr.Video(label="Full Video")
+                # Chat View (initially hidden): video for one chunk plus a chatbot about it
+                with gr.Column(visible=False) as chat_view:
+                    chunk_video = gr.Video(label="Chunk video")
+                    chatbot = gr.Chatbot(height=400)
+                    chat_input = gr.Textbox(
+                        placeholder="Ask about this video segment...",
+                        show_label=False
+                    )
+                    back_btn = gr.Button("← Back to Timeline")
+    # Connect the submit button to the processing function; the ten outputs below are filled positionally from process_diary's return values
     submit_btn.click(
         fn=process_diary,
         inputs=[day, date, total_people, total_machinery, machinery_types, activities, media],
+        outputs=[
+            model_day,
+            model_date,
+            model_people,
+            model_machinery,
+            model_machinery_types,
+            timeline_view,
+            model_annotated_video,
+            activity_data,
+            chatbot,
+            model_activities
+        ]
+    )
+    # Rebuild the gallery and the index list whenever activity_data changes
+    activity_data.change(
+        fn=create_activity_cards_ui,
+        inputs=[activity_data],
+        outputs=[activity_gallery, activity_indices]
+    )
+    # Handle gallery selection: on_card_click gets the index list and chat history and updates both views, the chatbot and the chunk video
+    activity_gallery.select(
+        fn=on_card_click,
+        inputs=[activity_indices, chatbot],
+        outputs=[timeline_view, chat_view, chatbot, chunk_video]
+    )
+    # Chat submission: chat_with_video gets the message and history and returns the new history (the input box is not cleared here)
+    chat_input.submit(
+        fn=chat_with_video,
+        inputs=[chat_input, chatbot],
+        outputs=[chatbot]
+    )
+    # Back button: show the timeline column again and hide the chat column
+    back_btn.click(
+        fn=lambda: [gr.update(visible=True), gr.update(visible=False)],
+        inputs=None,
+        outputs=[timeline_view, chat_view]
     )
+    # Add enhanced CSS styling, injected as a raw <style> tag through an HTML component
+    gr.HTML("""
+    <style>
+        /* Gallery customizations */
+        .gradio-container .gallery-item {
+            border: 1px solid #444444 !important;
+            border-radius: 8px !important;
+            padding: 8px !important;
+            margin: 10px !important;
+            cursor: pointer !important;
+            transition: all 0.3s !important;
+            background: #18181b !important;
+            box-shadow: 0 2px 5px rgba(0,0,0,0.2) !important;
+        }
+        .gradio-container .gallery-item:hover {
+            transform: translateY(-2px) !important;
+            box-shadow: 0 4px 12px rgba(0,0,0,0.25) !important;
+            border-color: #007bff !important;
+            background: #202025 !important;
+        }
+        .gradio-container .gallery-item.selected {
+            border: 2px solid #007bff !important;
+            background: #202030 !important;
+        }
+        /* Improved image display */
+        .gradio-container .gallery-item img {
+            height: 180px !important;
+            object-fit: cover !important;
+            border-radius: 4px !important;
+            border: 1px solid #444444 !important;
+            margin-bottom: 8px !important;
+        }
+        /* Caption styling */
+        .gradio-container .caption {
+            color: #e0e0e0 !important;
+            font-size: 0.9em !important;
+            margin-top: 8px !important;
+            line-height: 1.4 !important;
+            padding: 0 4px !important;
+        }
+        /* Gallery container */
+        .gradio-container [id*='gallery'] > div:first-child {
+            background-color: #27272a !important;
+            padding: 15px !important;
+            border-radius: 10px !important;
+        }
+        /* Chatbot styling */
+        .gradio-container .chatbot {
+            background-color: #27272a !important;
+            border-radius: 10px !important;
+            border: 1px solid #444444 !important;
+        }
+        .gradio-container .chatbot .message.user {
+            background-color: #18181b !important;
+            border-radius: 8px !important;
+        }
+        .gradio-container .chatbot .message.bot {
+            background-color: #202030 !important;
+            border-radius: 8px !important;
+        }
+        /* Button styling */
+        .gradio-container button.secondary {
+            background-color: #3d4452 !important;
+            color: white !important;
+        }
+    </style>
+    """)
 if __name__ == "__main__":
+    demo.launch(share=True, allowed_paths=["./tmp"])  # share=True requests a public share link; allowed_paths lets Gradio serve files from ./tmp