assentian1970 committed on
Commit
56084f9
·
verified ·
1 Parent(s): a35db3d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +17 -21
app.py CHANGED
@@ -21,7 +21,8 @@ import io
21
  # Install flash-attn (using prebuilt wheel mode if needed)
22
  subprocess.run(
23
  'pip install flash-attn --no-build-isolation',
24
- env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': 'TRUE'}, shell=True
 
25
  )
26
 
27
  # --------------------------------------------------------------------
@@ -54,7 +55,6 @@ if not args.chunk_inference:
54
  print(f"Error downloading model: {str(e)}")
55
  model_path = os.path.join(MODEL_CACHE_DIR, MODEL_NAME)
56
  else:
57
- # In worker mode, use the passed model path (unused here)
58
  model_path = args.model_path_arg
59
 
60
  MAX_NUM_FRAMES = 64
@@ -81,7 +81,7 @@ def is_video(filename):
81
  def load_model_and_tokenizer():
82
  """Load a fresh instance of the model and tokenizer."""
83
  try:
84
- # Clear GPU memory if using CUDA
85
  if device == "cuda":
86
  torch.cuda.empty_cache()
87
  gc.collect()
@@ -122,16 +122,15 @@ def process_video_chunk(video_frames, model, tokenizer, processor, prompt):
122
  videos=videos if videos else None
123
  )
124
  inputs.to('cuda')
125
- # Update inputs: disable caching to prevent memory buildup.
126
  inputs.update({
127
  'tokenizer': tokenizer,
128
  'max_new_tokens': 100,
129
  'decode_text': True,
130
- 'use_cache': False
131
  })
132
- response = model.generate(**inputs)
133
- # Explicitly delete inputs to free memory
134
- del inputs
135
  return response[0]
136
 
137
  # --------------------------------------------------------------------
@@ -254,9 +253,10 @@ def process_image(image_path, model, tokenizer, processor, prompt):
254
  'tokenizer': tokenizer,
255
  'max_new_tokens': 100,
256
  'decode_text': True,
257
- 'use_cache': False # disable cache for memory efficiency
258
  })
259
- response = model.generate(**inputs)
 
260
  del inputs
261
  return response[0]
262
  except Exception as e:
@@ -271,9 +271,8 @@ def analyze_image_activities(image_path):
271
  "Focus on construction activities, machinery usage, and worker actions.")
272
  response = process_image(image_path, model, tokenizer, processor, prompt)
273
  del model, tokenizer, processor
274
- if device == "cuda":
275
- torch.cuda.empty_cache()
276
- gc.collect()
277
  return response
278
  except Exception as e:
279
  print(f"Error analyzing image: {str(e)}")
@@ -330,12 +329,12 @@ def annotate_video_with_bboxes(video_path):
330
  @spaces.GPU
331
  def analyze_video_activities_single_instance(video_path):
332
  """Analyze video using mPLUG model with chunking.
333
- Reuse a single mPLUG model instance for all chunks (no reloading)."""
334
  try:
335
  all_responses = []
336
  chunk_generator = encode_video_in_chunks(video_path)
337
 
338
- # Load model instance once and keep it for all chunks
339
  model, tokenizer, processor = load_model_and_tokenizer()
340
 
341
  for chunk_idx, video_frames in chunk_generator:
@@ -343,15 +342,12 @@ def analyze_video_activities_single_instance(video_path):
343
  "Analyze this construction site video chunk and describe the activities happening. "
344
  "Focus on construction activities, machinery usage, and worker actions."
345
  )
346
- # Wrap inference in torch.no_grad() to prevent gradient accumulation
347
  with torch.no_grad():
348
  response = process_video_chunk(video_frames, model, tokenizer, processor, prompt)
349
  all_responses.append(f"Time period {chunk_idx + 1}:\n{response}")
350
- # Attempt to free temporary memory after each chunk
351
- torch.cuda.empty_cache()
352
- gc.collect()
353
-
354
- # Final cleanup: free the model instance
355
  del model, tokenizer, processor
356
  torch.cuda.empty_cache()
357
  gc.collect()
 
21
  # Install flash-attn (using prebuilt wheel mode if needed)
22
  subprocess.run(
23
  'pip install flash-attn --no-build-isolation',
24
+ env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': 'TRUE'},
25
+ shell=True
26
  )
27
 
28
  # --------------------------------------------------------------------
 
55
  print(f"Error downloading model: {str(e)}")
56
  model_path = os.path.join(MODEL_CACHE_DIR, MODEL_NAME)
57
  else:
 
58
  model_path = args.model_path_arg
59
 
60
  MAX_NUM_FRAMES = 64
 
81
  def load_model_and_tokenizer():
82
  """Load a fresh instance of the model and tokenizer."""
83
  try:
84
+ # Clear GPU memory if using CUDA (only at initial load)
85
  if device == "cuda":
86
  torch.cuda.empty_cache()
87
  gc.collect()
 
122
  videos=videos if videos else None
123
  )
124
  inputs.to('cuda')
 
125
  inputs.update({
126
  'tokenizer': tokenizer,
127
  'max_new_tokens': 100,
128
  'decode_text': True,
129
+ 'use_cache': False # disable caching to reduce memory buildup
130
  })
131
+ with torch.no_grad():
132
+ response = model.generate(**inputs)
133
+ del inputs # delete inputs to free temporary memory
134
  return response[0]
135
 
136
  # --------------------------------------------------------------------
 
253
  'tokenizer': tokenizer,
254
  'max_new_tokens': 100,
255
  'decode_text': True,
256
+ 'use_cache': False
257
  })
258
+ with torch.no_grad():
259
+ response = model.generate(**inputs)
260
  del inputs
261
  return response[0]
262
  except Exception as e:
 
271
  "Focus on construction activities, machinery usage, and worker actions.")
272
  response = process_image(image_path, model, tokenizer, processor, prompt)
273
  del model, tokenizer, processor
274
+ torch.cuda.empty_cache() # Final cleanup after image processing
275
+ gc.collect()
 
276
  return response
277
  except Exception as e:
278
  print(f"Error analyzing image: {str(e)}")
 
329
  @spaces.GPU
330
  def analyze_video_activities_single_instance(video_path):
331
  """Analyze video using mPLUG model with chunking.
332
+ Use a single mPLUG model instance for all chunks without any per-chunk cleanup."""
333
  try:
334
  all_responses = []
335
  chunk_generator = encode_video_in_chunks(video_path)
336
 
337
+ # Load model instance once
338
  model, tokenizer, processor = load_model_and_tokenizer()
339
 
340
  for chunk_idx, video_frames in chunk_generator:
 
342
  "Analyze this construction site video chunk and describe the activities happening. "
343
  "Focus on construction activities, machinery usage, and worker actions."
344
  )
 
345
  with torch.no_grad():
346
  response = process_video_chunk(video_frames, model, tokenizer, processor, prompt)
347
  all_responses.append(f"Time period {chunk_idx + 1}:\n{response}")
348
+ # No per-chunk cache clearing is performed here
349
+
350
+ # Final cleanup after processing all chunks
 
 
351
  del model, tokenizer, processor
352
  torch.cuda.empty_cache()
353
  gc.collect()