jwkirchenbauer committed
Commit 34e65b0 · 1 Parent: 5ac7b67

limit to one small model that fits in 24 GB VRAM

Files changed (2):
  1. app.py +9 -9
  2. demo_watermark.py +25 -17
app.py CHANGED
@@ -24,17 +24,17 @@ arg_dict = {
     # 'model_name_or_path': 'facebook/opt-2.7b', # historical
     # 'model_name_or_path': 'facebook/opt-6.7b', # historical
     # 'model_name_or_path': 'meta-llama/Llama-2-7b-hf', # historical
-    'model_name_or_path': 'meta-llama/Llama-3.1-8B',
+    'model_name_or_path': 'meta-llama/Llama-3.2-3B',
     'all_models':[
-        "meta-llama/Llama-3.1-8B",
+        # "meta-llama/Llama-3.1-8B", # too big for the A10G 24GB
         "meta-llama/Llama-3.2-3B",
-        "meta-llama/Llama-3.2-1B",
-        "Qwen/Qwen3-8B",
-        "Qwen/Qwen3-4B",
-        "Qwen/Qwen3-1.7B",
-        "Qwen/Qwen3-0.6B",
-        "Qwen/Qwen3-4B-Instruct-2507",
-        "Qwen/Qwen3-4B-Thinking-2507",
+        # "meta-llama/Llama-3.2-1B",
+        # "Qwen/Qwen3-8B", # too big for the A10G 24GB
+        # "Qwen/Qwen3-4B",
+        # "Qwen/Qwen3-1.7B",
+        # "Qwen/Qwen3-0.6B",
+        # "Qwen/Qwen3-4B-Instruct-2507",
+        # "Qwen/Qwen3-4B-Thinking-2507",
     ],
     # 'load_fp16' : True,
     'load_fp16' : False,
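
A note on the "# too big for the A10G 24GB" markers above: weights alone for an 8B-parameter checkpoint come to roughly 30 GB in fp32 and about 15 GB in fp16/bf16, while a 3B-class model stays near 12 GB / 6 GB, before any activations or KV cache. A quick back-of-the-envelope sketch of that arithmetic (Python; the parameter counts are approximations read off the model names, not measured values):

# Rough VRAM footprint of the weights only; real usage adds activations and KV cache.
BYTES_PER_PARAM = {"fp32": 4, "fp16/bf16": 2}
APPROX_PARAMS_B = {
    "meta-llama/Llama-3.1-8B": 8.0,
    "Qwen/Qwen3-8B": 8.2,
    "meta-llama/Llama-3.2-3B": 3.2,
}

for name, billions in APPROX_PARAMS_B.items():
    for dtype, nbytes in BYTES_PER_PARAM.items():
        weight_gb = billions * 1e9 * nbytes / 1024**3
        print(f"{name:<26} {dtype:<9} ~{weight_gb:4.1f} GB of weights")

On a 24 GB A10G this leaves comfortable headroom only for the 3B-class checkpoint, which matches the single entry kept in all_models.
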
demo_watermark.py CHANGED
@@ -19,6 +19,8 @@ import argparse
 from pprint import pprint
 from functools import partial
 
+import gc
+
 import numpy # for gradio hot reload
 import gradio as gr
 
@@ -206,9 +208,11 @@ def load_model(args):
         model = AutoModelForSeq2SeqLM.from_pretrained(args.model_name_or_path)
     elif args.is_decoder_only_model:
         if args.load_fp16:
-            model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path,torch_dtype=torch.float16, device_map='auto')
+            # model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path,torch_dtype=torch.float16, device_map='auto')
+            model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path,torch_dtype=torch.float16)
         elif args.load_bf16:
-            model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path,torch_dtype=torch.bfloat16, device_map='auto')
+            # model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path,torch_dtype=torch.bfloat16, device_map='auto')
+            model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path,torch_dtype=torch.bfloat16)
         else:
             model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path)
     else:
@@ -216,12 +220,18 @@ def load_model(args):
 
     if args.use_gpu:
         device = "cuda" if torch.cuda.is_available() else "cpu"
-        if args.load_fp16 or args.load_bf16:
-            pass
-        else:
-            model = model.to(device)
+        # if args.load_fp16 or args.load_bf16:
+        #     pass
+        # else:
+        model = model.to(device)
     else:
         device = "cpu"
+
+    if args.load_bf16:
+        model = model.to(torch.bfloat16)
+    if args.load_fp16:
+        model = model.to(torch.float16)
+
     model.eval()
 
     tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
@@ -268,7 +278,7 @@ def generate_with_api(prompt, args):
         yield all_without_words, all_with_words
 
 
-def check_prompt(prompt, args, tokenizer, model, device=None):
+def check_prompt(prompt, args, tokenizer, model=None, device=None):
 
     # This applies to both the local and API model scenarios
     if args.model_name_or_path in API_MODEL_MAP:
@@ -288,7 +298,7 @@ def check_prompt(prompt, args, tokenizer, model, device=None):
 
 
 
-def generate(prompt, args, tokenizer, model, device=None):
+def generate(prompt, args, tokenizer, model=None, device=None):
     """Instatiate the WatermarkLogitsProcessor according to the watermark parameters
     and generate watermarked text by passing it to the generate method of the model
     as a logits processor. """
@@ -486,11 +496,10 @@ def run_gradio(args, model=None, device=None, tokenizer=None):
     default_prompt = args.__dict__.pop("default_prompt")
     session_args = gr.State(value=args)
     # note that state obj automatically calls value if it's a callable, want to avoid calling tokenizer at startup
-    session_tokenizer = gr.State(value=lambda : tokenizer)
-    session_model = gr.State(value=lambda : model)
+    session_tokenizer = gr.State(value=lambda : tokenizer)
 
-    check_prompt_partial = partial(check_prompt, device=device)
-    generate_partial = partial(generate, device=device)
+    check_prompt_partial = partial(check_prompt, model=model, device=device)
+    generate_partial = partial(generate, model=model, device=device)
     detect_partial = partial(detect, device=device)
 
     with gr.Tab("Welcome"):
@@ -704,8 +713,8 @@ def run_gradio(args, model=None, device=None, tokenizer=None):
         """)
 
     # Register main generation tab click, outputing generations as well as a the encoded+redecoded+potentially truncated prompt and flag, then call detection
-    generate_btn.click(fn=check_prompt_partial, inputs=[prompt,session_args,session_tokenizer, session_model], outputs=[redecoded_input, truncation_warning, session_args]).success(
-        fn=generate_partial, inputs=[redecoded_input,session_args,session_tokenizer,session_model], outputs=[output_without_watermark, output_with_watermark]).success(
+    generate_btn.click(fn=check_prompt_partial, inputs=[prompt,session_args,session_tokenizer], outputs=[redecoded_input, truncation_warning, session_args]).success(
+        fn=generate_partial, inputs=[redecoded_input,session_args,session_tokenizer], outputs=[output_without_watermark, output_with_watermark]).success(
         fn=detect_partial, inputs=[output_without_watermark,session_args,session_tokenizer], outputs=[without_watermark_detection_result,session_args,session_tokenizer,html_without_watermark]).success(
         fn=detect_partial, inputs=[output_with_watermark,session_args,session_tokenizer], outputs=[with_watermark_detection_result,session_args,session_tokenizer,html_with_watermark])
     # Show truncated version of prompt if truncation occurred
@@ -781,6 +790,7 @@ def run_gradio(args, model=None, device=None, tokenizer=None):
     def update_model(state, old_model):
         del old_model
         torch.cuda.empty_cache()
+        gc.collect()
         model, _, _ = load_model(state)
         return model
 
@@ -803,8 +813,6 @@ def run_gradio(args, model=None, device=None, tokenizer=None):
         update_model_state,inputs=[session_args, model_selector], outputs=[session_args]
     ).then(
         update_tokenizer,inputs=[model_selector], outputs=[session_tokenizer]
-    ).then(
-        update_model,inputs=[session_args, session_model], outputs=[session_model]
     ).then(
         lambda value: str(value), inputs=[session_args], outputs=[current_parameters]
     )
@@ -852,7 +860,7 @@ def run_gradio(args, model=None, device=None, tokenizer=None):
     select_green_tokens.change(fn=detect_partial, inputs=[output_with_watermark,session_args,session_tokenizer], outputs=[with_watermark_detection_result,session_args,session_tokenizer,html_with_watermark])
     select_green_tokens.change(fn=detect_partial, inputs=[detection_input,session_args,session_tokenizer], outputs=[detection_result,session_args,session_tokenizer,html_detection_input])
 
-    # demo.queue(concurrency_count=3)
+
    demo.queue()
 
    if args.demo_public:
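
Condensing the load_model hunks above: the commit drops the accelerate-style device_map='auto' dispatch, loads the checkpoint with an explicit torch_dtype, moves it onto the single device unconditionally, and then re-applies the dtype as a final cast. A minimal sketch of that path, assuming args equivalent to the ones set in app.py (the helper name is illustrative, not from the repo):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

def load_single_gpu_model(model_name_or_path, load_bf16=True, use_gpu=True):
    # explicit dtype at load time instead of device_map='auto' dispatch
    dtype = torch.bfloat16 if load_bf16 else torch.float32
    model = AutoModelForCausalLM.from_pretrained(model_name_or_path, torch_dtype=dtype)

    # unconditional move onto the one available device
    device = "cuda" if (use_gpu and torch.cuda.is_available()) else "cpu"
    model = model.to(device)

    # the diff re-casts after the move as a belt-and-braces step
    if load_bf16:
        model = model.to(torch.bfloat16)

    model.eval()
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
    return model, tokenizer, device

Without device_map='auto' there is no cross-device or CPU offloading, so the whole model has to fit on the one card, consistent with trimming all_models down to Llama-3.2-3B. The same commit also stops threading the model through a per-session gr.State and instead binds it into check_prompt and generate via functools.partial, keeping the heavyweight object out of Gradio's session state.
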
 
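
On the update_model change: torch.cuda.empty_cache() only releases cached allocator blocks whose tensors have already been freed, so dropping the last Python reference to the old checkpoint, together with the added gc.collect() to sweep up reference cycles, is what actually makes that memory reclaimable before the next load_model call. A minimal sketch of the swap-time teardown (the helper name is illustrative, not from the repo; the diff happens to call empty_cache() before gc.collect(), which also works when plain reference counting frees the tensors at del time):

import gc
import torch

def release_old_model(old_model):
    # drop the last reference so the parameter tensors become collectable
    del old_model
    # break any reference cycles that would keep them alive
    gc.collect()
    # return the now-unused cached blocks so the next checkpoint has room
    torch.cuda.empty_cache()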