Update app.py
app.py (CHANGED)
@@ -3,7 +3,7 @@
 Multimodal chat front-end for Amazon Bedrock Nova Premier v1
 (text and/or image → assistant text) with per-session JSONL logging.
 Logs FULL conversations to HF Dataset and images to HF repo via Git LFS.
-
+Uses append-only logging to prevent data loss from concurrent users.
 
 Prereqs:
     pip install gradio==5.38 boto3 pillow datasets huggingface_hub
@@ -17,9 +17,10 @@ Update REPO_ID to your HF dataset repository.
 """
 
 from __future__ import annotations
-import base64, datetime, io, json, pathlib, uuid, os, threading
+import base64, datetime, io, json, pathlib, uuid, os, threading, time
 from typing import Dict, List, Optional, Tuple
-from concurrent.futures import ThreadPoolExecutor
+from concurrent.futures import ThreadPoolExecutor
+import queue
 
 import boto3
 from botocore.config import Config
@@ -45,7 +46,7 @@ Never mention Amazon or Nova.
 
 # HuggingFace Configuration
 HF_TOKEN = os.getenv("HF_TOKEN")  # Set this in your Space's secrets
-REPO_ID = "collinear-ai/nova-premier-redteaming-external"
+REPO_ID = "collinear-ai/nova-premier-redteaming-external"
 HF_API = HfApi()
 
 # Local directories (for temporary storage)
@@ -56,7 +57,11 @@ IMG_DIR.mkdir(exist_ok=True)
 
 # Thread pool for background operations
 executor = ThreadPoolExecutor(max_workers=4)
-
+
+# Queue-based logging system to prevent race conditions
+log_queue = queue.Queue()
+dataset_worker_running = False
+dataset_worker_lock = threading.Lock()
 
 # ====== Bedrock client ====== #
 bedrock = boto3.client(
@@ -115,10 +120,12 @@ def upload_image_to_hf_repo_sync(session_id: str, pil_img: Image.Image, message_
     try:
         # Create unique filename with message index and timestamp
         ts = datetime.datetime.utcnow().strftime("%Y%m%dT%H%M%S")
-
+        # Add a random component to ensure uniqueness even with concurrent uploads
+        random_suffix = str(uuid.uuid4())[:8]
+        filename = f"images/{session_id}_{message_index:03d}_{ts}_{random_suffix}.png"
 
         # Save locally first
-        local_path = IMG_DIR / f"{session_id}_{message_index:03d}_{ts}.png"
+        local_path = IMG_DIR / f"{session_id}_{message_index:03d}_{ts}_{random_suffix}.png"
         pil_img.save(local_path, format="PNG")
 
         # Upload to HF repo and wait for completion
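The hunk above makes filename collisions practically impossible by combining a per-second timestamp with a short UUID slice. A standalone sketch of the same scheme, using only the standard library (the function name `unique_image_name` is illustrative, not part of app.py):

    # Sketch of the naming scheme above; a collision would need the same
    # session, message index, second, and 8-char random UUID prefix.
    import datetime
    import uuid

    def unique_image_name(session_id: str, message_index: int) -> str:
        ts = datetime.datetime.utcnow().strftime("%Y%m%dT%H%M%S")
        random_suffix = str(uuid.uuid4())[:8]
        return f"images/{session_id}_{message_index:03d}_{ts}_{random_suffix}.png"

    print(unique_image_name("d8771c05", 1))
    # e.g. images/d8771c05_001_20240123T143022_3fa85f64.png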
@@ -140,66 +147,74 @@ def upload_image_to_hf_repo_sync(session_id: str, pil_img: Image.Image, message_
         print(f"❌ Failed to upload image to HF repo: {e}")
         return None
 
-    def background_save():
+def dataset_worker():
+    """Background worker that processes dataset updates sequentially to prevent race conditions."""
+    print("🔧 Dataset worker started")
+
+    while True:
         try:
+            # Get next log entry from queue (blocks until available)
+            log_entry = log_queue.get(timeout=30)  # 30 second timeout
+
+            if log_entry is None:  # Shutdown signal
+                print("🛑 Dataset worker shutting down")
+                break
+
+            # Process the log entry
+            session_id = log_entry["session_id"]
+            chat_history = log_entry["chat_history"]
+            br_history = log_entry["br_history"]
+            image_urls = log_entry["image_urls"]
+
+            try:
+                # Create individual conversation record (append-only approach)
                 conversation_record = {
                     "session_id": session_id,
                     "timestamp": datetime.datetime.utcnow().isoformat() + "Z",
                     "message_count": len(chat_history),
+                    "conversation_messages": [],
+                    "bedrock_history": br_history,
+                    "images_count": len(image_urls),
+                    "record_type": "full_conversation"  # For filtering
                 }
 
+                # Convert chat history to structured format
+                for i, (user_msg, assistant_msg) in enumerate(chat_history):
+                    conversation_record["conversation_messages"].extend([
+                        {
+                            "message_index": i,
+                            "role": "user",
+                            "content": user_msg,
+                            "image_url": image_urls.get(i, ""),
+                            "has_image": i in image_urls
+                        },
+                        {
+                            "message_index": i,
+                            "role": "assistant",
+                            "content": assistant_msg,
+                            "image_url": "",
+                            "has_image": False
+                        }
+                    ])
+
+                # Append to dataset (safer than overwrite)
                 try:
+                    # Load existing dataset
                    existing_dataset = Dataset.load_dataset(REPO_ID, token=HF_TOKEN, split="train")
                    records = existing_dataset.to_list()
 
-                    for idx, record in enumerate(records):
-                        if record.get("session_id") == session_id:
-                            records[idx] = conversation_record
-                            session_exists = True
-                            break
+                    # Remove any existing record with same session_id to avoid duplicates
+                    records = [r for r in records if r.get("session_id") != session_id]
+
+                    # Add new record
+                    records.append(conversation_record)
                    updated_dataset = Dataset.from_list(records)
 
                 except Exception as load_error:
+                    print(f"Creating new dataset (load failed): {load_error}")
                    updated_dataset = Dataset.from_list([conversation_record])
 
-                # Push
+                # Push to hub
                 updated_dataset.push_to_hub(
                     REPO_ID,
                     token=HF_TOKEN,
@@ -207,13 +222,47 @@ def save_full_conversation_to_hf_dataset_async(session_id: str, chat_history: Li
                     commit_message=f"Update conversation {session_id[:8]} ({len(chat_history)} exchanges)"
                 )
 
+                print(f"✅ Conversation {session_id[:8]} saved to dataset (queue size: {log_queue.qsize()})")
 
+            except Exception as e:
+                print(f"❌ Failed to save conversation {session_id[:8]} to dataset: {e}")
+
+            # Mark task as done
+            log_queue.task_done()
+
+        except queue.Empty:
+            # Timeout occurred, continue loop
+            continue
         except Exception as e:
+            print(f"❌ Dataset worker error: {e}")
+            time.sleep(5)  # Wait before retrying
+
+def start_dataset_worker():
+    """Start the dataset worker thread if not already running."""
+    global dataset_worker_running
 
+    with dataset_worker_lock:
+        if not dataset_worker_running:
+            dataset_worker_running = True
+            worker_thread = threading.Thread(target=dataset_worker, daemon=True)
+            worker_thread.start()
+            print("🚀 Dataset worker thread started")
+
+def queue_conversation_save(session_id: str, chat_history: List[Tuple], br_history: List[Dict], image_urls: Dict[int, str]):
+    """Queue a conversation for saving to prevent race conditions."""
+    if not HF_TOKEN:
+        return
+
+    log_entry = {
+        "session_id": session_id,
+        "chat_history": chat_history.copy(),  # Copy to avoid reference issues
+        "br_history": br_history.copy(),
+        "image_urls": image_urls.copy(),
+        "queued_at": datetime.datetime.utcnow().isoformat()
+    }
+
+    log_queue.put(log_entry)
+    print(f"📝 Queued conversation {session_id[:8]} for saving (queue size: {log_queue.qsize()})")
 
 def save_local_conversation_log(session_id: str, chat_history: List[Tuple], image_urls: Dict[int, str]):
     """Save full conversation to local JSONL file."""
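The two hunks above are a single-consumer pattern: every request thread enqueues, and exactly one worker drains the queue, so the load/modify/push cycle on the dataset can never interleave. A minimal self-contained sketch of why that serializes writes (all names below are illustrative, not part of app.py):

    # One worker thread is the only writer to the shared store, so
    # concurrent producers cannot overwrite each other's updates.
    import queue
    import threading

    jobs: "queue.Queue[dict | None]" = queue.Queue()
    store: dict[str, dict] = {}  # stand-in for the HF dataset

    def worker() -> None:
        while True:
            job = jobs.get()
            if job is None:  # sentinel: shut down
                break
            store[job["session_id"]] = job  # sole writer, no races
            jobs.task_done()

    t = threading.Thread(target=worker, daemon=True)
    t.start()
    for i in range(3):
        jobs.put({"session_id": f"s{i}", "n": i})
    jobs.join()     # wait until everything queued so far is processed
    jobs.put(None)  # ask the worker to exit
    t.join()
    print(store)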
@@ -237,6 +286,10 @@ def save_local_conversation_log(session_id: str, chat_history: List[Tuple], imag
     with path.open("w", encoding="utf-8") as f:
         f.write(json.dumps(conversation_record, ensure_ascii=False) + "\n")
 
+# Start the dataset worker when module loads
+if HF_TOKEN:
+    start_dataset_worker()
+
 # ====== Gradio UI ====== #
 with gr.Blocks(title="Nova Premier Red Team Chat") as demo:
     gr.Markdown(
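One caveat: the worker runs as a daemon thread, so anything still sitting in `log_queue` when the process exits is dropped. A possible flush hook, reusing the `None` sentinel the worker already understands (a sketch only, not part of this commit):

    # Drain pending saves before interpreter shutdown; atexit handlers run
    # while daemon threads are still alive, so the worker can finish.
    import atexit

    def flush_and_stop_worker() -> None:
        log_queue.join()     # block until queued conversations are processed
        log_queue.put(None)  # the worker's shutdown signal

    atexit.register(flush_and_stop_worker)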
@@ -247,16 +300,17 @@ with gr.Blocks(title="Nova Premier Red Team Chat") as demo:
         **Multi-User Support:**
         - 👥 Each browser tab/session gets a unique conversation ID
         - 🔒 Conversations are isolated between users
+        - 🚫 **Race condition safe** - sequential dataset updates prevent data loss
         - ⚡ Real-time image upload with direct URLs
 
         **Logging Features:**
+        - 💾 **Queue-based logging** prevents concurrent write conflicts
         - 🔄 **Context preservation** - entire chat context maintained
         - 📸 **Image tracking** - direct links to uploaded images
         - 🏷️ **Session management** - unique session ID per conversation
 
         **Storage:**
-        - 🤗 HF Dataset: {"✅ Enabled" if HF_TOKEN else "❌ Disabled (set HF_TOKEN)"} - Repo: `{REPO_ID}`
+        - 🤗 HF Dataset: {"✅ Enabled with queue-based safety" if HF_TOKEN else "❌ Disabled (set HF_TOKEN)"} - Repo: `{REPO_ID}`
         - 🖼️ Images: {"✅ Uploaded with direct URLs" if HF_TOKEN else "❌ Local only"}
         """
     )
@@ -299,6 +353,15 @@ with gr.Blocks(title="Nova Premier Red Team Chat") as demo:
             max_lines=1
         )
 
+    # Queue status display
+    with gr.Row():
+        queue_status = gr.Textbox(
+            label="Logging Queue Status",
+            value=f"Queue-based logging {'enabled' if HF_TOKEN else 'disabled'} - prevents data loss",
+            interactive=False,
+            max_lines=1
+        )
+
     # ---- main handler ---- #
     def chat(chat_log, br_history, sess_id, img_urls_dict,
              image, text,
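The read-only `queue_status` box is refreshed the standard Gradio way: a handler lists the component in its `outputs` and returns its new value. A tiny self-contained illustration (component and variable names here are made up, not from app.py):

    # A non-interactive Textbox updated by an event handler's return value.
    import gradio as gr

    with gr.Blocks() as demo2:
        status = gr.Textbox(label="Status", interactive=False)
        btn = gr.Button("Ping")
        btn.click(lambda: "pong", inputs=None, outputs=[status])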
@@ -343,12 +406,14 @@ with gr.Blocks(title="Nova Premier Red Team Chat") as demo:
         display_user = text.strip() if text.strip() else "[image uploaded]"
         chat_log.append((display_user, reply))
 
-        # Save
+        # Save locally immediately
         save_local_conversation_log(sess_id, chat_log, img_urls_dict)
-
+
+        # Queue for HF dataset save (prevents race conditions)
+        queue_conversation_save(sess_id, chat_log, new_br_history, img_urls_dict)
 
         # Update status message with real image URL
+        status_msg = f"✅ Conversation logged for session {sess_id[:8]}\n"
         status_msg += f"📊 Total exchanges: {len(chat_log)} | Messages in context: {len(new_br_history)}\n"
 
         if image and hf_img_url:
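Note that `queue_conversation_save` snapshots the handler's state with shallow `.copy()` calls; that is enough here because each chat entry is an immutable (user, assistant) tuple, so the queued snapshot stays stable even while the UI keeps appending. A two-line demonstration of that property:

    # A shallow copy freezes the list of immutable tuples as of this moment.
    history = [("hi", "hello")]
    snapshot = history.copy()
    history.append(("more", "text"))
    print(len(snapshot), len(history))  # 1 2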
@@ -365,9 +430,13 @@ with gr.Blocks(title="Nova Premier Red Team Chat") as demo:
         # Update session info
         image_count = len([k for k in img_urls_dict.keys() if isinstance(k, int)])
         session_display = f"Session: {sess_id[:8]}... | Messages: {len(chat_log)} | Images: {image_count}"
+
+        # Update queue status
+        queue_size = log_queue.qsize() if HF_TOKEN else 0
+        queue_display = f"Dataset queue: {queue_size} pending | Queue-based logging prevents data loss"
 
         return (chat_log, chat_log, new_br_history, sess_id, img_urls_dict,
-                None, "", status_msg, session_display)
+                None, "", status_msg, session_display, queue_display)
 
     send_btn.click(
         chat,
@@ -375,60 +444,67 @@ with gr.Blocks(title="Nova Premier Red Team Chat") as demo:
                 img_in, txt_in,
                 max_tk, temp, top_p, top_k],
        outputs=[chatbot, chat_state, br_state, sess_state, img_urls_state,
-                 img_in, txt_in, log_status, session_info],
+                 img_in, txt_in, log_status, session_info, queue_status],
    )
 
     # ---- clear chat ---- #
     def reset():
         new_session_id = str(uuid.uuid4())
+        queue_size = log_queue.qsize() if HF_TOKEN else 0
         return ([], [], new_session_id, {}, None, "",
                 "Ready to start logging full conversations...",
-                f"New session: {new_session_id[:8]}..."
+                f"New session: {new_session_id[:8]}...",
+                f"Dataset queue: {queue_size} pending | Queue-based logging prevents data loss")
 
     clear_btn.click(
         reset,
         inputs=None,
         outputs=[chatbot, chat_state, sess_state, img_urls_state,
-                 img_in, txt_in, log_status, session_info],
+                 img_in, txt_in, log_status, session_info, queue_status],
         queue=False,
     )
 
-    # Add info about the logging
+    # Add info about the race-condition-safe logging
     gr.Markdown(
         f"""
+        ### 🛡️ Race Condition Protection
+
+        **Problem Solved**: Multiple users uploading simultaneously could overwrite each other's data.
+
+        **Solution**: Queue-based sequential processing:
+        - 📋 Each conversation update goes into a queue
+        - 🔧 Background worker processes updates one at a time
+        - 🚫 No concurrent dataset writes = no data loss
+        - ⚡ UI stays responsive while background worker handles persistence
 
+        ### 📝 Logging Architecture
+
+        1. **Immediate**: Local JSONL files for instant access
+        2. **Queued**: HF dataset updates processed sequentially
+        3. **Direct**: Image uploads happen immediately for real URLs
+        4. **Safe**: Each session overwrites only its own record
 
         ### 🖼️ Image Storage Format
 
-        - Example: `images/d8771c05_001_20240123T143022.png`
+        Images: `images/{{session_id}}_{{msg_idx:03d}}_{{timestamp}}_{{random}}.png`
         - **session_id**: First 8 chars of session UUID
-        - **timestamp**: UTC timestamp when uploaded
+        - **msg_idx**: 3-digit message number (000, 001, 002...)
+        - **timestamp**: UTC timestamp when uploaded
+        - **random**: 8-char random suffix for uniqueness
 
+        ### 📊 Dataset Structure
 
         **HF Dataset**: [https://huggingface.co/datasets/{REPO_ID}](https://huggingface.co/datasets/{REPO_ID})
+        - One record per session (updated as conversation progresses)
         - **Images folder**: [https://huggingface.co/datasets/{REPO_ID}/tree/main/images](https://huggingface.co/datasets/{REPO_ID}/tree/main/images)
+        - Filter by `session_id` to find specific conversations
+        - Image URLs in conversation data link directly to files
 
-        **Local logs**: Saved as `[session_id]_full.jsonl` (temporary until Space restarts)
-
-        ### 🔧 Performance Notes
-
+        **Benefits**:
+        - ✅ No data loss from concurrent users
+        - ✅ Real image URLs immediately available
+        - ✅ Complete conversation context preserved
+        - ✅ Scalable to many simultaneous users
         """
     )
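For consuming the logs afterwards, a hedged sketch of reading one session back. It uses the module-level `datasets.load_dataset`, the library's documented entry point (the `Dataset.load_dataset` spelling in the worker above is not a documented datasets API), and the `d8771c05` prefix is the example session from the text:

    # Load the conversation dataset and pull out a single session's messages.
    from datasets import load_dataset

    ds = load_dataset("collinear-ai/nova-premier-redteaming-external", split="train")
    one_session = ds.filter(lambda r: r["session_id"].startswith("d8771c05"))
    for msg in one_session[0]["conversation_messages"]:
        print(msg["role"], msg["content"][:60])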
@@ -441,10 +517,11 @@ if __name__ == "__main__":
         print(" 1. Go to your Space settings → Repository secrets")
         print(" 2. Add HF_TOKEN with your HuggingFace token (write permissions)")
     else:
+        print(f"✅ HF logging enabled with queue-based safety. Conversations will be saved to: {REPO_ID}")
         print(f"📸 Images will be stored at: https://huggingface.co/datasets/{REPO_ID}/tree/main/images")
+        print("🛡️ Race condition protection: Dataset updates processed sequentially")
 
+    print("🔴 Nova Premier Red Team Chat with safe multi-user logging ready!")
 
     demo.queue(max_size=100)
     demo.launch(share=True)