assentian1970 committed
Commit 83b0d3a · verified · 1 Parent(s): ecd2075

Update app.py

Files changed (1): app.py +170 -197
app.py CHANGED
@@ -1,227 +1,200 @@
- #!/usr/bin/env python
- # encoding: utf-8
  import spaces
  import torch
- @spaces.GPU
- def debug():
-     torch.randn(10).cuda()
- debug()
  import argparse
- from transformers import AutoModel, AutoTokenizer
  import gradio as gr
  from PIL import Image
  from decord import VideoReader, cpu
- import io
- import os
- os.system("nvidia-smi")
- import copy
- import requests
- import base64
- import json
- import traceback
- import re
- import modelscope_studio as mgr
  from modelscope.hub.snapshot_download import snapshot_download

- # Configuration
- model_path = snapshot_download('iic/mPLUG-Owl3-7B-240728', cache_dir='./')
- device_map = "auto"
- os.environ["TOKENIZERS_PARALLELISM"] = "false"
-
- # Argparser
- parser = argparse.ArgumentParser(description='demo')
- parser.add_argument('--device', type=str, default='cuda', help='cuda, mps or cpu')
- parser.add_argument("--host", type=str, default="0.0.0.0")
- parser.add_argument("--port", type=int, default=7860)
- args = parser.parse_args()
- device = args.device
-
- # Limit CPU threads and disable the TensorExpr fuser before loading the model
- torch.set_num_threads(4)
- torch._C._jit_set_texpr_fuser_enabled(False)

- # Load the model with SDPA attention (avoids the flash-attn dependency)
- model = AutoModel.from_pretrained(
-     model_path,
-     trust_remote_code=True,
-     torch_dtype=torch.bfloat16 if 'int4' not in model_path else torch.float32,
-     attn_implementation="sdpa"  # scaled dot-product attention instead of flash-attn
- ).to(device)

- tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
- model.eval()

- # Constants
- ERROR_MSG = "Error occurred, please check inputs and try again"
- MAX_NUM_FRAMES = 64
  IMAGE_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp'}
- VIDEO_EXTENSIONS = {'.mp4', '.mkv', '.mov', '.avi', '.flv', '.wmv', '.webm', '.m4v'}

  def get_file_extension(filename):
-     return os.path.splitext(filename)[1].lower()

  def is_image(filename):
      return get_file_extension(filename) in IMAGE_EXTENSIONS

- def is_video(filename):
-     return get_file_extension(filename) in VIDEO_EXTENSIONS

- def create_multimodal_input(upload_image_disabled=False, upload_video_disabled=False):
-     return mgr.MultimodalInput(
-         upload_image_button_props={'label': 'Upload Image', 'disabled': upload_image_disabled, 'file_count': 'multiple'},
-         upload_video_button_props={'label': 'Upload Video', 'disabled': upload_video_disabled, 'file_count': 'single'},
-         submit_button_props={'label': 'Submit'}
-     )

- @spaces.GPU
- def chat(images, messages, params):
-     try:
-         response = model.chat(
-             images=images,
-             messages=messages,
-             tokenizer=tokenizer,
-             **params
-         )
-         return 0, response, None
-     except Exception as e:
-         print(f"Error in chat: {str(e)}")
-         traceback.print_exc()
-         return -1, ERROR_MSG, None

- def encode_image(image):
-     try:
-         if not isinstance(image, Image.Image):
-             image = Image.open(image.file.path).convert("RGB")
-
-         max_size = 448 * 16
-         if max(image.size) > max_size:
-             ratio = max_size / max(image.size)
-             new_size = (int(image.size[0] * ratio), int(image.size[1] * ratio))
-             image = image.resize(new_size, Image.BICUBIC)
-
-         return image
-     except Exception as e:
-         raise gr.Error(f"Image processing error: {str(e)}")

- def encode_video(video):
      try:
-         vr = VideoReader(video.file.path, ctx=cpu(0))
-         sample_fps = round(vr.get_avg_fps() / 1)
-         frame_idx = [i for i in range(0, len(vr), sample_fps)]
-
-         if len(frame_idx) > MAX_NUM_FRAMES:
-             frame_idx = frame_idx[:MAX_NUM_FRAMES]
-
-         frames = vr.get_batch(frame_idx).asnumpy()
-         return [Image.fromarray(frame.astype('uint8')) for frame in frames]
-     except Exception as e:
-         raise gr.Error(f"Video processing error: {str(e)}")

- def process_inputs(_question, _app_cfg):
-     try:
-         files = _question.files
-         text = _question.text
-         pattern = r"\[mm_media\]\d+\[/mm_media\]"
-         matches = re.split(pattern, text)
-
-         if len(matches) != len(files) + 1:
-             raise gr.Error("Media placeholders don't match uploaded files count")
-
-         message = []
-         media_count = 0
-
-         for i, match in enumerate(matches):
-             if match.strip():
-                 message.append({"type": "text", "content": match.strip()})
-
-             if i < len(files):
-                 file = files[i]
-                 if is_image(file.file.path):
-                     message.append({"type": "image", "content": encode_image(file)})
-                 elif is_video(file.file.path):
-                     message.append({"type": "video", "content": encode_video(file)})
-                 media_count += 1
-
-         return message, media_count
      except Exception as e:
-         traceback.print_exc()
-         raise gr.Error(f"Input processing failed: {str(e)}")

- def generate_response(_question, _chat_history, _app_cfg, params_form):
      try:
-         params = {
-             'max_new_tokens': 2048,
-             'temperature': 0.7 if params_form == 'Sampling' else 1.0,
-             'top_p': 0.8 if params_form == 'Sampling' else None,
-             'num_beams': 3 if params_form == 'Beam Search' else 1,
-             'repetition_penalty': 1.1
-         }
-
-         processed_input, media_count = process_inputs(_question, _app_cfg)
-         _app_cfg['media_count'] += media_count
-
-         code, response, _ = chat(
-             images=[item['content'] for item in processed_input if item['type'] == 'image'],
-             messages=[{"role": "user", "content": processed_input}],
-             params=params
-         )
-
-         if code != 0:
-             raise gr.Error("Model response generation failed")
-
-         _chat_history.append((_question, response))
-         return _chat_history, _app_cfg
-
      except Exception as e:
-         traceback.print_exc()
-         raise gr.Error(f"Generation failed: {str(e)}")
-
- def reset_chat():
-     return [], {'media_count': 0, 'ctx': []}
-
- with gr.Blocks(css="video {height: auto !important;}") as demo:
-     with gr.Tab("mPLUG-Owl3"):
-         gr.Markdown("## mPLUG-Owl3 Multi-Modal Chat Interface")
-
-         # State management
-         app_state = gr.State({'media_count': 0, 'ctx': []})
-
-         # Chat interface
-         chatbot = mgr.Chatbot(height=600)
-         input_interface = create_multimodal_input()
-
-         # Controls
-         with gr.Row():
-             decode_type = gr.Radio(
-                 choices=['Beam Search', 'Sampling'],
-                 value='Sampling',
-                 label="Decoding Strategy"
-             )
-             clear_btn = gr.Button("Clear History")
-             regenerate_btn = gr.Button("Regenerate")
-
-         # Event handlers
-         input_interface.submit(
-             generate_response,
-             [input_interface, chatbot, app_state, decode_type],
-             [chatbot, app_state]
-         )
-
-         clear_btn.click(
-             reset_chat,
-             outputs=[chatbot, app_state]
-         )
-
-         regenerate_btn.click(
-             lambda history: history[:-1] if history else [],
-             inputs=[chatbot],
-             outputs=[chatbot]
-         )

  if __name__ == "__main__":
-     demo.launch(
-         server_name=args.host,
-         server_port=args.port,
-         share=False,
-         debug=True
-     )
 
 
 
  import spaces
  import torch
  import argparse
+ import os
+ import gc
+ import tempfile
+ import cv2
+ import numpy as np
  import gradio as gr
+ from datetime import datetime
  from PIL import Image
  from decord import VideoReader, cpu
+ from transformers import AutoModel, AutoTokenizer
  from modelscope.hub.snapshot_download import snapshot_download
+ from ultralytics import YOLO

+ os.system("nvidia-smi")

+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

+ if DEVICE == "cuda":
+     def debug():
+         torch.randn(10).cuda()
+     debug()

  IMAGE_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp'}
+ VIDEO_EXTENSIONS = {'.mp4', '.avi', '.mov', '.mkv'}  # extend as needed

  def get_file_extension(filename):
+     return os.path.splitext(filename)[-1].lower()
+
+ def is_video(filename):
+     return get_file_extension(filename) in VIDEO_EXTENSIONS

  def is_image(filename):
      return get_file_extension(filename) in IMAGE_EXTENSIONS

+ parser = argparse.ArgumentParser(description='demo')
+ parser.add_argument('--device', type=str, default='cuda', help='cuda or mps')
+ parser.add_argument("--host", type=str, default="0.0.0.0")
+ parser.add_argument("--port", type=int)
+ args = parser.parse_args()
+ device = args.device
+ assert device in ['cuda', 'mps']

+ MODEL_NAME = 'iic/mPLUG-Owl3-7B-240728'
+ MODEL_CACHE_DIR = os.getenv('TRANSFORMERS_CACHE', './models')
+ os.makedirs(MODEL_CACHE_DIR, exist_ok=True)

+ try:
+     model_path = snapshot_download(MODEL_NAME, cache_dir=MODEL_CACHE_DIR)
+ except Exception as e:
+     print(f"Error downloading model: {str(e)}")
+     model_path = os.path.join(MODEL_CACHE_DIR, MODEL_NAME)

+ YOLO_MODEL = YOLO('./best_yolov11.pt')
+ MAX_NUM_FRAMES = 64

+ # Machinery categories reported in the diary; detector class names map onto these.
+ MACHINERY_CLASSES = [
+     "Tower Crane", "Mobile Crane", "Compactor/Roller", "Bulldozer",
+     "Excavator", "Dump Truck", "Concrete Mixer", "Loader",
+     "Pump Truck", "Pile Driver", "Grader", "Other Vehicle"
+ ]

+ def load_model_and_tokenizer():
+     if DEVICE == "cuda":
+         torch.cuda.empty_cache()
+         gc.collect()
+     model = AutoModel.from_pretrained(
+         model_path,
+         attn_implementation='flash_attention_2',
+         trust_remote_code=True,
+         torch_dtype=torch.half,
+         device_map='auto'
+     )
+     tokenizer = AutoTokenizer.from_pretrained(
+         model_path,
+         trust_remote_code=True
+     )
+     return model, tokenizer, None  # no separate processor is loaded here

+ def encode_video_in_chunks(video_path):
+     # Sample ~1 frame per second and yield the frames in chunks of MAX_NUM_FRAMES.
+     vr = VideoReader(video_path, ctx=cpu(0))
+     sample_fps = max(1, round(vr.get_avg_fps()))
+     frame_idx = list(range(0, len(vr), sample_fps))
+     chunks = [frame_idx[i:i + MAX_NUM_FRAMES] for i in range(0, len(frame_idx), MAX_NUM_FRAMES)]
+     for chunk_idx, chunk in enumerate(chunks):
+         frames = vr.get_batch(chunk).asnumpy()
+         frames = [Image.fromarray(v.astype('uint8')) for v in frames]
+         yield chunk_idx, frames

+ def detect_people_and_machinery(media_path):
+     # Run YOLO on an image, or on ~1 frame per second of a video,
+     # keeping the per-frame maximum of each count.
      try:
+         max_people_count = 0
+         max_machine_types = dict.fromkeys(MACHINERY_CLASSES, 0)
+
+         if is_video(media_path):
+             cap = cv2.VideoCapture(media_path)
+             fps = cap.get(cv2.CAP_PROP_FPS)
+             sample_rate = max(1, int(fps))
+             frame_count = 0
+             while cap.isOpened():
+                 ret, frame = cap.read()
+                 if not ret:
+                     break
+                 if frame_count % sample_rate == 0:
+                     results = YOLO_MODEL(frame)
+                     people, _, machine_types = process_yolo_results(results)
+                     max_people_count = max(max_people_count, people)
+                     for k, v in machine_types.items():
+                         max_machine_types[k] = max(max_machine_types[k], v)
+                 frame_count += 1
+             cap.release()
+         else:
+             img = cv2.imread(media_path)
+             results = YOLO_MODEL(img)
+             max_people_count, _, max_machine_types = process_yolo_results(results)
+
+         max_machine_types = {k: v for k, v in max_machine_types.items() if v > 0}
+         total_machinery_count = sum(max_machine_types.values())
+         return max_people_count, total_machinery_count, max_machine_types
      except Exception as e:
+         print(f"Error in YOLO detection: {str(e)}")
+         return 0, 0, {}
+
+ # Maps detector class-name substrings to diary categories; this covers a subset of
+ # MACHINERY_CLASSES and should be extended to match the classes of best_yolov11.pt.
+ MACHINERY_MAPPING = {
+     'tower_crane': "Tower Crane",
+     'mobile_crane': "Mobile Crane",
+     'grader': "Grader",
+     'other_vehicle': "Other Vehicle"
+ }
+
+ def process_yolo_results(results):
+     people_count = 0
+     machine_types = dict.fromkeys(MACHINERY_CLASSES, 0)
+     for r in results:
+         for box in r.boxes:
+             cls = int(box.cls[0])
+             conf = float(box.conf[0])
+             class_name = YOLO_MODEL.names[cls]
+             if class_name.lower() == 'worker' and conf > 0.5:
+                 people_count += 1
+             if conf > 0.5:
+                 for key, value in MACHINERY_MAPPING.items():
+                     if key in class_name.lower():
+                         machine_types[value] += 1
+                         break
+     return people_count, sum(machine_types.values()), machine_types

+ @spaces.GPU
+ def process_diary(day, date, total_people, total_machinery, media):
+     if media is None:
+         return [day, date, "No media uploaded", "No media uploaded", "No media uploaded", "No media uploaded"]
      try:
+         src_path = media if isinstance(media, str) else media.name  # gr.File may yield a path or a file wrapper
+         file_ext = get_file_extension(src_path)
+         if not (is_image(src_path) or is_video(src_path)):
+             raise ValueError(f"Unsupported file type: {file_ext}")
+
+         # Copy the upload to a fresh temp file so OpenCV/decord can reopen it by path.
+         with tempfile.NamedTemporaryFile(suffix=file_ext, delete=False) as temp_file:
+             temp_path = temp_file.name
+             with open(src_path, 'rb') as src:
+                 temp_file.write(src.read())
+
+         detected_people, detected_machinery, detected_machinery_types = detect_people_and_machinery(temp_path)
+         detected_types_str = ", ".join(f"{k}: {v}" for k, v in detected_machinery_types.items())
+         detected_activities = "Sample activity analysis."  # placeholder
+
+         os.remove(temp_path)
+         return [day, date, str(detected_people), str(detected_machinery), detected_types_str, detected_activities]
      except Exception as e:
+         print(f"Error processing media: {str(e)}")
+         return [day, date, "Error", "Error", "Error", "Error"]

+ with gr.Blocks(title="Digital Site Diary") as demo:
+     gr.Markdown("# 📝 Digital Site Diary")
+     with gr.Row():
+         with gr.Column():
+             day = gr.Textbox(label="Day", value='9')
+             date = gr.Textbox(label="Date", value=datetime.now().strftime("%Y-%m-%d"))
+             total_people = gr.Number(label="Total Number of People", value=10)
+             total_machinery = gr.Number(label="Total Number of Machinery", value=3)
+             media = gr.File(label="Upload Image/Video", file_types=["image", "video"])
+             submit_btn = gr.Button("Submit")
+         with gr.Column():
+             model_day = gr.Textbox(label="Day")
+             model_date = gr.Textbox(label="Date")
+             model_people = gr.Textbox(label="Total Number of People")
+             model_machinery = gr.Textbox(label="Total Machinery")
+             model_machinery_types = gr.Textbox(label="Machinery Types")
+             model_activities = gr.Textbox(label="Activities")
+     submit_btn.click(
+         fn=process_diary,
+         inputs=[day, date, total_people, total_machinery, media],
+         outputs=[model_day, model_date, model_people, model_machinery, model_machinery_types, model_activities]
+     )

  if __name__ == "__main__":
+     demo.launch(share=False, debug=True, show_api=False, server_port=args.port, server_name=args.host)
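
For reference, the frame-sampling arithmetic behind the new encode_video_in_chunks can be checked in isolation. A minimal sketch, assuming a hypothetical 30 fps clip with 3000 frames (numbers not from this commit):

    # ~1 sampled frame per second, split into model-sized chunks
    avg_fps, n_frames, MAX_NUM_FRAMES = 30, 3000, 64
    sample_fps = max(1, round(avg_fps))
    frame_idx = list(range(0, n_frames, sample_fps))   # 100 indices: 0, 30, 60, ...
    chunks = [frame_idx[i:i + MAX_NUM_FRAMES]
              for i in range(0, len(frame_idx), MAX_NUM_FRAMES)]
    print([len(c) for c in chunks])                    # [64, 36] -> two chunks per video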
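
Similarly, the YOLO path can be smoke-tested without launching Gradio. A minimal sketch, assuming ./best_yolov11.pt is present and site_photo.jpg is a hypothetical local image:

    # Peak people count, total machinery, and per-type breakdown for one file.
    people, machinery_total, breakdown = detect_people_and_machinery("site_photo.jpg")
    print(f"people={people}, machinery={machinery_total}")
    for name, count in breakdown.items():
        print(f"  {name}: {count}")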