BryanBradfo committed on
Commit 9b002fb · 1 Parent(s): 995f0f7

generating for too long

Files changed (1)
  1. app.py +27 -47
app.py CHANGED
@@ -1,7 +1,6 @@
 import streamlit as st
 import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM
-import time
 import os
 from dotenv import load_dotenv
 
@@ -20,7 +19,7 @@ st.title("✨ GemmaTextAppeal")
 st.markdown("""
 ### Interactive Demo of Google's Gemma 2-2B-IT Model
 This app demonstrates the text generation capabilities of Google's Gemma 2-2B-IT model.
-Enter a prompt below and see the model generate text in real-time!
+Enter a prompt below and see the model generate text!
 """)
 
 # Function to load model
@@ -32,31 +31,24 @@ def load_model():
         if not huggingface_token:
             return None, None, "No Hugging Face API token found. Please add your token as a secret named 'HF_TOKEN'."
 
-        # Attempt to download model with explicit token
+        # Load tokenizer
         tokenizer = AutoTokenizer.from_pretrained(
             "google/gemma-2-2b-it",
             token=huggingface_token
         )
 
-        # Load model - use CPU configuration if no GPU available
+        # Load model with appropriate configuration
        model_kwargs = {
             "token": huggingface_token,
-            "torch_dtype": torch.float16
+            "device_map": "auto" if torch.cuda.is_available() else None,
+            "torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32
         }
 
-        # Only add device_map if GPU is available
-        if torch.cuda.is_available():
-            model_kwargs["device_map"] = "auto"
-
         model = AutoModelForCausalLM.from_pretrained(
             "google/gemma-2-2b-it",
             **model_kwargs
         )
 
-        # Move model to CPU if no GPU
-        if not torch.cuda.is_available():
-            model = model.to("cpu")
-
         return tokenizer, model, None
     except Exception as e:
         return None, None, str(e)
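Taken together, the new model_kwargs pick device placement and precision in one expression instead of the removed if/else blocks: float16 with device_map="auto" when a GPU is present, float32 on CPU with no explicit move. A minimal standalone sketch of that loading path, mirroring the diff above (HF_TOKEN is the secret name the app already expects):

import os

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Read the token from the environment, as the app does via its HF_TOKEN secret.
token = os.getenv("HF_TOKEN")

# Same selection logic as the updated load_model(): half precision and automatic
# device placement on GPU, full precision on CPU (no manual .to("cpu") needed).
model_kwargs = {
    "token": token,
    "device_map": "auto" if torch.cuda.is_available() else None,
    "torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32,
}

tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-2b-it", token=token)
model = AutoModelForCausalLM.from_pretrained("google/gemma-2-2b-it", **model_kwargs)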
@@ -158,47 +150,35 @@ def generate_text(prompt, max_new_tokens=300, temperature=0.7):
     # Format the prompt according to Gemma's expected format
     formatted_prompt = f"<bos><start_of_turn>user\n{prompt}<end_of_turn>\n<start_of_turn>model\n"
 
-    # Create the progress bar and status indicators
-    progress_bar = st.progress(0)
+    # Create the status indicator and output area
     status_text = st.empty()
     output_area = st.empty()
     status_text.text("Generating response...")
 
     # Tokenize the input
-    encoding = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)
-    input_ids = encoding["input_ids"]
-
-    # Ensure we have a proper attention mask
-    attention_mask = torch.ones_like(input_ids)
-
-    # Simple approach - generate all at once
-    output_ids = model.generate(
-        input_ids=input_ids,
-        attention_mask=attention_mask,
-        max_new_tokens=max_new_tokens,
-        do_sample=True,
-        temperature=temperature,
-        pad_token_id=tokenizer.eos_token_id
-    )
-
-    # Get only the generated part (exclude the prompt)
-    new_tokens = output_ids[0][input_ids.shape[1]:]
-    generated_text = tokenizer.decode(new_tokens, skip_special_tokens=True)
-
-    # Display incrementally for visual effect
-    display_text = ""
-    words = generated_text.split()
-    total_words = len(words)
-
-    for i, word in enumerate(words):
-        display_text += word + " "
-        progress = min(1.0, (i + 1) / total_words)
-        progress_bar.progress(progress)
-        output_area.markdown(f"**Generated Response:**\n\n{display_text}")
-        time.sleep(0.05)  # Brief delay for visual effect
+    with torch.no_grad():
+        encoding = tokenizer(formatted_prompt, return_tensors="pt")
+
+        # Move to the appropriate device
+        if torch.cuda.is_available():
+            encoding = {k: v.to("cuda") for k, v in encoding.items()}
+
+        # Generate the text - streamlined version
+        output_ids = model.generate(
+            **encoding,
+            max_new_tokens=max_new_tokens,
+            do_sample=True,
+            temperature=temperature,
+            pad_token_id=tokenizer.eos_token_id
+        )
+
+        # Get only the generated part (exclude the prompt)
+        new_tokens = output_ids[0][encoding["input_ids"].shape[1]:]
+        generated_text = tokenizer.decode(new_tokens, skip_special_tokens=True)
 
+    # Display the result
+    output_area.markdown(f"**Generated Response:**\n\n{generated_text}")
     status_text.text("Generation complete!")
-    progress_bar.progress(1.0)
 
     return generated_text
 
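With the word-by-word display loop and its time.sleep() calls removed, the whole response now comes from a single generate() call and is rendered once. A rough sketch of the same generation flow outside Streamlit, assuming tokenizer and model were loaded as in the sketch above (the prompt text is only an example):

# Build the Gemma chat-format prompt, exactly as generate_text() does.
prompt = "Explain what a context window is in two sentences."
formatted = f"<bos><start_of_turn>user\n{prompt}<end_of_turn>\n<start_of_turn>model\n"

with torch.no_grad():
    encoding = tokenizer(formatted, return_tensors="pt")
    if torch.cuda.is_available():
        encoding = {k: v.to("cuda") for k, v in encoding.items()}

    output_ids = model.generate(
        **encoding,
        max_new_tokens=300,
        do_sample=True,
        temperature=0.7,
        pad_token_id=tokenizer.eos_token_id,
    )

# Slice off the echoed prompt so only the newly generated tokens are decoded.
new_tokens = output_ids[0][encoding["input_ids"].shape[1]:]
print(tokenizer.decode(new_tokens, skip_special_tokens=True))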
 
 