	Update app.py
app.py CHANGED

@@ -1,12 +1,10 @@
 import streamlit as st
 import torch
-from transformers import 
+from transformers import AutoTokenizer, AutoModelForCausalLM
 from transformers.cache_utils import DynamicCache
 import os
 from time import time
 import pandas as pd
-
-import os
 from huggingface_hub import login
 
 HF_TOKEN = os.getenv("NEX_MODEL")  # Updated key name for clarity
@@ -14,7 +12,8 @@ HF_TOKEN = os.getenv("NEX_MODEL")  # Updated key name for clarity
 if not HF_TOKEN:
     raise ValueError("Hugging Face token not found. Please set the 'NEX_MODEL' environment variable.")
 
-
+
+
 # ==============================
 # Helper: Human-readable bytes
 def sizeof_fmt(num, suffix="B"):
@@ -82,27 +81,37 @@ def calculate_cache_size(cache):
     return total_memory /(1024*1024)
 
 @st.cache_resource
-def load_model_and_tokenizer(
-    model_name = "
-
-
-
+def load_model_and_tokenizer():
+    model_name = "GeneZC/MiniChat-1.5-3B"
+
+
+    tokenizer = AutoTokenizer.from_pretrained(
         model_name,
-        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-        device_map="auto",
         trust_remote_code=True
         ,token=HF_TOKEN
     )
-
-    # Load the tokenizer
-    tokenizer = AutoTokenizer.from_pretrained(
+    model = AutoModelForCausalLM.from_pretrained(
         model_name,
-
-
+        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+        device_map="auto",
+        trust_remote_code=True
         ,token=HF_TOKEN
     )
-    return 
-
+    return model, tokenizer
+def calculate_cache_size(cache):
+    """
+    Calculate the total memory used by the key-value cache (past_key_values) in megabytes.
+    Args:
+        cache: The past_key_values object (usually a tuple of (key, value) pairs per layer).
+    Returns:
+        Total memory in megabytes.
+    """
+    total_memory = 0
+    for layer_cache in cache:
+        key_tensor, value_tensor = layer_cache
+        total_memory += key_tensor.element_size() * key_tensor.nelement()
+        total_memory += value_tensor.element_size() * value_tensor.nelement()
+    return total_memory / (1024 * 1024)  # Convert to MB
 def clone_cache(cache):
     new_cache = DynamicCache()
     for key, value in zip(cache.key_cache, cache.value_cache):
@@ -117,7 +126,16 @@ def load_document_and_cache(file_path):
         with open(file_path, 'r') as file:
             doc_text = file.read()
         doc_text_count = len(doc_text)
-
+        max_length = int(1.3 * (doc_text_count * 0.3 + 1))
+
+        # Cap the value at 16824
+        if max_length > 16824:
+            max_length = 16824
+        print(f" model_max_length set to: {max_length}")
+
+        model, tokenizer = load_model_and_tokenizer()
+        tokenizer.model_max_length=max_length
+
         system_prompt = f"""
         <|system|>
         You are a helpful assistant. Provide concise, factual answers based only on the provided context.
@@ -194,11 +212,10 @@ if uploaded_file:
     print(f"π Document Preview Display Time: {t_end3 - t_start3:.2f} s")
     t_start4 = time()
     # PART 4: Show Basic Info
-
-    #cache_size = os.path.getsize("temp_cache.pth") / 1024 if os.path.exists("temp_cache.pth") else "N/A"
+    s_cache=calculate_cache_size(cache)
     t_end4 = time()
     log.append(f"π doc_size_kb Preview Display Time: {t_end4 - t_start4:.2f} s")
-    print(f"π doc_size_kb Preview Display Time: {t_end4 - t_start4:.2f} s")
+    print(f"π doc_size_kb Preview Display Time: {t_end4 - t_start4:.2f} s||||||| size of the cache : {s_cache} MB")
     #st.info(
      #  f"Document Chars: {len(doc_text)} | Size: {doc_size_kb:.2f} KB | "
       # f"Cache Size: {cache_size if cache_size == 'N/A' else f'{cache_size:.2f} KB'}"
@@ -222,10 +239,10 @@ if uploaded_file:
 
             # PART 4.2: Tokenize Prompt
             t_start6 = time()
-
+
             full_prompt = f"""
             <|user|>
-            Question: {query}
+            Question: Please provide a clear and concise answer to the question .{query}
            <|assistant|>
             """.strip()
             input_ids = tokenizer(full_prompt, return_tensors="pt").input_ids
@@ -249,7 +266,7 @@ if uploaded_file:
 
             st.success("Answer:")
            st.write(response)
-
+            print(f"***************************************************************************************")
             # Final Info Display
             st.info(
             #   f"Document Chars: {len(doc_text)} | Size: {doc_size_kb:.2f} KB | "
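
For reference, the new calculate_cache_size() helper simply sums element_size() * nelement() over every key and value tensor in the cache and converts the total to megabytes. A minimal sketch of that same accounting on a toy past_key_values-style structure (the cache_size_mb name and the tensor shapes are hypothetical, not from this commit; it assumes the cache iterates as (key, value) pairs per layer, as the helper does):

import torch

def cache_size_mb(cache):
    # Same accounting as the commit's calculate_cache_size(): bytes per element
    # times element count, summed over every key and value tensor, then -> MB.
    total = 0
    for key_tensor, value_tensor in cache:
        total += key_tensor.element_size() * key_tensor.nelement()
        total += value_tensor.element_size() * value_tensor.nelement()
    return total / (1024 * 1024)

# Hypothetical shapes: 2 layers, batch 1, 8 heads, 128 cached tokens, head_dim 64, fp16.
toy_cache = [
    (torch.zeros(1, 8, 128, 64, dtype=torch.float16),
     torch.zeros(1, 8, 128, 64, dtype=torch.float16))
    for _ in range(2)
]
print(f"{cache_size_mb(toy_cache):.2f} MB")  # 4 tensors * 65536 elements * 2 bytes = 0.50 MB

The max_length = int(1.3 * (doc_text_count * 0.3 + 1)) line added in load_document_and_cache() appears to rough-estimate a token budget from the character count (about 0.3 tokens per character plus ~30% headroom) before capping it at 16824 and assigning it to tokenizer.model_max_length.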