assentian1970 committed
Commit 7e990b0 · verified · 1 Parent(s): 722e2d7

Update app.py

Files changed (1)
  1. app.py +173 -224
app.py CHANGED
@@ -1,53 +1,59 @@
 #!/usr/bin/env python
 # encoding: utf-8
-
-
-
 import spaces
 import torch
-import os
-import gc
-import tempfile
-import numpy as np
-import cv2
-from datetime import datetime
-from PIL import Image
-from decord import VideoReader, cpu
+@spaces.GPU
+def debug():
+    torch.randn(10).cuda()
+debug()
+import argparse
 from transformers import AutoModel, AutoTokenizer
 import gradio as gr
-from ultralytics import YOLO
+from PIL import Image
+from decord import VideoReader, cpu
+import io
+import os
+os.system("nvidia-smi")
+import copy
+import requests
+import base64
+import json
+import traceback
+import re
+import modelscope_studio as mgr
 from modelscope.hub.snapshot_download import snapshot_download

-# Initialize GPU first
-@spaces.GPU
-def initialize_gpu():
-    return torch.randn(10).cuda()
+# Configuration
+model_dir = snapshot_download('iic/mPLUG-Owl3-7B-240728', cache_dir='./')
+device_map = "auto"
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+# Argparser
+parser = argparse.ArgumentParser(description='demo')
+parser.add_argument('--device', type=str, default='cuda', help='cuda, mps or cpu')
+parser.add_argument("--host", type=str, default="0.0.0.0")
+parser.add_argument("--port", type=int, default=7860)
+args = parser.parse_args()
+device = args.device
+
+# Load model and tokenizer
+model_path = './iic/mPLUG-Owl3-7B-240728'
+model = AutoModel.from_pretrained(
+    model_path,
+    trust_remote_code=True,
+    torch_dtype=torch.bfloat16 if 'int4' not in model_path else torch.float32,
+    attn_implementation="flash_attention_2" if device == 'cuda' else None
+).to(device)

-initialize_gpu()
+tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+model.eval()

-# Configuration
-MODEL_NAME = 'mPLUG-Owl3'
-YOLO_MODEL = YOLO('best_yolov11.pt')
-DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+# Constants
+ERROR_MSG = "Error occurred, please check inputs and try again"
 MAX_NUM_FRAMES = 64
 IMAGE_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp'}
 VIDEO_EXTENSIONS = {'.mp4', '.mkv', '.mov', '.avi', '.flv', '.wmv', '.webm', '.m4v'}

-# Download models
-model_dir = snapshot_download('iic/mPLUG-Owl3-7B-240728', cache_dir='./')
-
-
-
-# Replace the model loading section with:
-# Load models with ZeroGPU optimization
-model = AutoModel.from_pretrained(
-    model_dir,
-    attn_implementation='sdpa',
-    trust_remote_code=True,
-    torch_dtype=torch.float16, # Use float16 instead of bfloat16
-    device_map="auto"
-)
-
 def get_file_extension(filename):
     return os.path.splitext(filename)[1].lower()

@@ -57,219 +63,162 @@ def is_image(filename):
 def is_video(filename):
     return get_file_extension(filename) in VIDEO_EXTENSIONS

-def process_yolo_results(results):
-    counts = {
-        "people": 0,
-        "machinery": {
-            "Tower Crane": 0, "Mobile Crane": 0, "Compactor/Roller": 0,
-            "Bulldozer": 0, "Excavator": 0, "Dump Truck": 0,
-            "Concrete Mixer": 0, "Loader": 0, "Pump Truck": 0,
-            "Pile Driver": 0, "Grader": 0, "Other Vehicle": 0
-        }
-    }
-
-    for r in results:
-        for box in r.boxes:
-            cls_id = int(box.cls[0])
-            conf = float(box.conf[0])
-            if conf < 0.5:
-                continue
-
-            class_name = YOLO_MODEL.names[cls_id].lower()
-
-            if 'worker' in class_name:
-                counts["people"] += 1
-            else:
-                machinery_mapping = {
-                    'tower_crane': "Tower Crane",
-                    'mobile_crane': "Mobile Crane",
-                    'compactor': "Compactor/Roller",
-                    'roller': "Compactor/Roller",
-                    'bulldozer': "Bulldozer",
-                    'excavator': "Excavator",
-                    'dump_truck': "Dump Truck",
-                    'concrete_mixer': "Concrete Mixer",
-                    'loader': "Loader",
-                    'pump_truck': "Pump Truck",
-                    'pile_driver': "Pile Driver",
-                    'grader': "Grader"
-                }
-                counts["machinery"][machinery_mapping.get(class_name, "Other Vehicle")] += 1
-
-    return counts
+def create_multimodal_input(upload_image_disabled=False, upload_video_disabled=False):
+    return mgr.MultimodalInput(
+        upload_image_button_props={'label': 'Upload Image', 'disabled': upload_image_disabled, 'file_count': 'multiple'},
+        upload_video_button_props={'label': 'Upload Video', 'disabled': upload_video_disabled, 'file_count': 'single'},
+        submit_button_props={'label': 'Submit'}
+    )
+
+@spaces.GPU
+def chat(images, messages, params):
+    try:
+        response = model.chat(
+            images=images,
+            messages=messages,
+            tokenizer=tokenizer,
+            **params
+        )
+        return 0, response, None
+    except Exception as e:
+        print(f"Error in chat: {str(e)}")
+        traceback.print_exc()
+        return -1, ERROR_MSG, None

-def detect_objects(media_path):
-    if is_video(media_path):
-        cap = cv2.VideoCapture(media_path)
-        max_counts = {"people": 0, "machinery": {}}
-        frame_count = 0
+def encode_image(image):
+    try:
+        if not isinstance(image, Image.Image):
+            image = Image.open(image.file.path).convert("RGB")

-        while cap.isOpened():
-            ret, frame = cap.read()
-            if not ret:
-                break
-
-            if frame_count % 30 == 0: # Process every 30th frame
-                results = YOLO_MODEL(frame)
-                counts = process_yolo_results(results)
-
-                max_counts["people"] = max(max_counts["people"], counts["people"])
-                for key, value in counts["machinery"].items():
-                    max_counts["machinery"][key] = max(max_counts["machinery"].get(key, 0), value)
-
-            frame_count += 1
+        max_size = 448 * 16
+        if max(image.size) > max_size:
+            ratio = max_size / max(image.size)
+            new_size = (int(image.size[0] * ratio), int(image.size[1] * ratio))
+            image = image.resize(new_size, Image.BICUBIC)

-        cap.release()
-        return max_counts
-
-    else:
-        img = cv2.imread(media_path)
-        results = YOLO_MODEL(img)
-        return process_yolo_results(results)
+        return image
+    except Exception as e:
+        raise gr.Error(f"Image processing error: {str(e)}")

-def analyze_media(media_path):
+def encode_video(video):
     try:
-        if is_image(media_path):
-            return analyze_image(media_path)
-        return analyze_video(media_path)
+        vr = VideoReader(video.file.path, ctx=cpu(0))
+        sample_fps = round(vr.get_avg_fps() / 1)
+        frame_idx = [i for i in range(0, len(vr), sample_fps)]
+
+        if len(frame_idx) > MAX_NUM_FRAMES:
+            frame_idx = frame_idx[:MAX_NUM_FRAMES]
+
+        frames = vr.get_batch(frame_idx).asnumpy()
+        return [Image.fromarray(frame.astype('uint8')) for frame in frames]
     except Exception as e:
-        print(f"Analysis error: {str(e)}")
-        return "Analysis unavailable"
+        raise gr.Error(f"Video processing error: {str(e)}")

-def analyze_image(image_path):
+def process_inputs(_question, _app_cfg):
     try:
-        image = Image.open(image_path).convert("RGB")
-        messages = [{
-            "role": "user",
-            "content": "Analyze this construction site image. Describe visible activities, equipment, and safety observations.",
-            "images": [image]
-        }]
-
-        inputs = model.build_inputs(
-            messages=messages,
-            tokenizer=tokenizer,
-            max_new_tokens=1000,
-            padding=True
-        )
-        inputs = inputs.to(DEVICE)
+        files = _question.files
+        text = _question.text
+        pattern = r"\[mm_media\]\d+\[/mm_media\]"
+        matches = re.split(pattern, text)

-        with torch.no_grad():
-            outputs = model.generate(**inputs)
+        if len(matches) != len(files) + 1:
+            raise gr.Error("Media placeholders don't match uploaded files count")

-        return tokenizer.decode(outputs[0], skip_special_tokens=True)
+        message = []
+        media_count = 0
+
+        for i, match in enumerate(matches):
+            if match.strip():
+                message.append({"type": "text", "content": match.strip()})
+
+            if i < len(files):
+                file = files[i]
+                if is_image(file.file.path):
+                    message.append({"type": "image", "content": encode_image(file)})
+                elif is_video(file.file.path):
+                    message.append({"type": "video", "content": encode_video(file)})
+                media_count += 1
+
+        return message, media_count
     except Exception as e:
-        print(f"Image analysis error: {str(e)}")
-        return "Image analysis failed"
+        traceback.print_exc()
+        raise gr.Error(f"Input processing failed: {str(e)}")

-def analyze_video(video_path):
+def generate_response(_question, _chat_history, _app_cfg, params_form):
     try:
-        vr = VideoReader(video_path, ctx=cpu(0))
-        frame_step = max(1, len(vr) // MAX_NUM_FRAMES)
-        frames = [Image.fromarray(vr[i].asnumpy()) for i in range(0, len(vr), frame_step)]
+        params = {
+            'max_new_tokens': 2048,
+            'temperature': 0.7 if params_form == 'Sampling' else 1.0,
+            'top_p': 0.8 if params_form == 'Sampling' else None,
+            'num_beams': 3 if params_form == 'Beam Search' else 1,
+            'repetition_penalty': 1.1
+        }

-        messages = [{
-            "role": "user",
-            "content": "Analyze this construction site video. Describe ongoing activities, equipment usage, and safety observations.",
-            "videos": frames[:MAX_NUM_FRAMES]
-        }]
+        processed_input, media_count = process_inputs(_question, _app_cfg)
+        _app_cfg['media_count'] += media_count

-        inputs = model.build_inputs(
-            messages=messages,
-            tokenizer=tokenizer,
-            max_new_tokens=1000,
-            padding=True
+        code, response, _ = chat(
+            images=[item['content'] for item in processed_input if item['type'] == 'image'],
+            messages=[{"role": "user", "content": processed_input}],
+            params=params
         )
-        inputs = inputs.to(DEVICE)

-        with torch.no_grad():
-            outputs = model.generate(**inputs)
+        if code != 0:
+            raise gr.Error("Model response generation failed")

-        return tokenizer.decode(outputs[0], skip_special_tokens=True)
+        _chat_history.append((_question, response))
+        return _chat_history, _app_cfg
+
     except Exception as e:
-        print(f"Video analysis error: {str(e)}")
-        return "Video analysis failed"
+        traceback.print_exc()
+        raise gr.Error(f"Generation failed: {str(e)}")

-def annotate_video(input_path):
-    cap = cv2.VideoCapture(input_path)
-    fps = cap.get(cv2.CAP_PROP_FPS)
-    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
-    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
-
-    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as temp_file:
-        output_path = temp_file.name
+def reset_chat():
+    return [], {'media_count': 0, 'ctx': []}
+
+with gr.Blocks(css="video {height: auto !important;}") as demo:
+    with gr.Tab("mPLUG-Owl3"):
+        gr.Markdown("## mPLUG-Owl3 Multi-Modal Chat Interface")

-    writer = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*'mp4v'), fps, (width, height))
-
-    while cap.isOpened():
-        ret, frame = cap.read()
-        if not ret:
-            break
-
-        results = YOLO_MODEL(frame)
-        annotated_frame = results[0].plot()
-        writer.write(annotated_frame)
+        # State management
+        app_state = gr.State({'media_count': 0, 'ctx': []})

-    cap.release()
-    writer.release()
-    return output_path
-
-@spaces.GPU
-def process_entry(day, date, media):
-    try:
-        if not media:
-            return [day, date, "No media", "No media", "No media", None]
-
-        with tempfile.NamedTemporaryFile(delete=False) as tmp:
-            tmp.write(media.read())
-            tmp_path = tmp.name
-
-        detection = detect_objects(tmp_path)
-        analysis = analyze_media(tmp_path)
-        annotated_video = annotate_video(tmp_path) if is_video(tmp_path) else None
+        # Chat interface
+        chatbot = mgr.Chatbot(height=600)
+        input_interface = create_multimodal_input()

-        machinery_str = ", ".join(
-            f"{k}: {v}" for k, v in detection['machinery'].items() if v > 0
-        ) if isinstance(detection, dict) else "Detection failed"
+        # Controls
+        with gr.Row():
+            decode_type = gr.Radio(
+                choices=['Beam Search', 'Sampling'],
+                value='Sampling',
+                label="Decoding Strategy"
+            )
+            clear_btn = gr.Button("Clear History")
+            regenerate_btn = gr.Button("Regenerate")

-        return [
-            day,
-            date,
-            str(detection.get('people', 0)),
-            machinery_str,
-            analysis,
-            annotated_video
-        ]
-    except Exception as e:
-        print(f"Processing error: {str(e)}")
-        return [day, date, "Error", "Error", "Error", None]
-
-with gr.Blocks(title="Construction Site Diary", css="footer {visibility: hidden}") as demo:
-    gr.Markdown("# 🏗️ Digital Construction Site Diary")
-
-    with gr.Row():
-        with gr.Column(scale=1):
-            day_input = gr.Number(label="Day Number", value=1)
-            date_input = gr.Textbox(label="Date", value=datetime.now().strftime("%Y-%m-%d"))
-            media_input = gr.File(label="Upload Site Photo/Video", file_types=["image", "video"])
-            submit_btn = gr.Button("Analyze Site", variant="primary")
-
-        with gr.Column(scale=2):
-            day_output = gr.Textbox(label="Day")
-            date_output = gr.Textbox(label="Date")
-            people_output = gr.Textbox(label="People Detected")
-            machinery_output = gr.Textbox(label="Equipment Detected")
-            analysis_output = gr.Textbox(label="Activity Analysis", lines=4)
-            video_output = gr.Video(label="Annotated Video Preview")
-
-    submit_btn.click(
-        fn=process_entry,
-        inputs=[day_input, date_input, media_input],
-        outputs=[day_output, date_output, people_output, machinery_output, analysis_output, video_output]
-    )
+        # Event handlers
+        input_interface.submit(
+            generate_response,
+            [input_interface, chatbot, app_state, decode_type],
+            [chatbot, app_state]
+        )
+
+        clear_btn.click(
+            reset_chat,
+            outputs=[chatbot, app_state]
+        )
+
+        regenerate_btn.click(
+            lambda history: history[:-1] if history else [],
+            inputs=[chatbot],
+            outputs=[chatbot]
+        )

 if __name__ == "__main__":
     demo.launch(
-        server_name="0.0.0.0",
-        server_port=7860,
-        share=False
+        server_name=args.host,
+        server_port=args.port,
+        share=False,
+        debug=True
     )