Sentinel-AI-Web-Search-Test

Sleeping

App Files Files Community

Shreyas094 committed on Jun 20, 2024

Commit

753d9d8

verified ·

1 Parent(s): a65ba38

Update app.py

Browse files

Files changed (1) hide show

app.py +13 -52

app.py CHANGED Viewed

@@ -1,69 +1,30 @@
-import os
-import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM
-from huggingface_hub import login
-import time
-import torch.quantization
-# Directly assign your Hugging Face token here
-hf_token = "your_hugging_face_api_token"
-# Log in to Hugging Face
-login(token=hf_token)
-# Load the Mistral-7B-Instruct-v0.3 model and tokenizer with authorization header
-model_name = 'mistralai/Mistral-7B-Instruct-v0.3'
-headers = {"Authorization": f"Bearer {hf_token}"}
-# Ensure sentencepiece is installed
-try:
-    import sentencepiece
-except ImportError:
-    raise ImportError("The sentencepiece library is required for this tokenizer. Please install it with `pip install sentencepiece`.")
-# Start time to measure execution time
-start_time = time.time()
-# Load tokenizer and model
-tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=hf_token)
-model = AutoModelForCausalLM.from_pretrained(model_name, use_auth_token=hf_token)
-# Quantize the model
-quantized_model = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)
-# Check if a GPU is available and if not, fall back to CPU
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 quantized_model.to(device)
-# Measure time for loading tokenizer, model, and quantization
-loading_time = time.time() - start_time
-print(f"Time taken to load tokenizer, model, and quantize: {loading_time:.2f} seconds")
 # Example text input
 text_input = "How did Tesla perform in Q1 2024?"
-# Start time for inference
-inference_start_time = time.time()
-# Tokenize the input text
 inputs = tokenizer(text_input, return_tensors="pt").to(device)
-# Measure time for tokenization
-tokenization_time = time.time() - inference_start_time
-# Generate a response
 outputs = quantized_model.generate(**inputs, max_length=150, do_sample=False)
-# Measure time for inference
-inference_time = time.time() - inference_start_time
-print(f"Time taken for inference with quantized model: {inference_time:.2f} seconds")
-# Decode the generated tokens to a readable string
 response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-# Print the response
-print(f"Generated response: {response}")
-# Total execution time
-total_time = time.time() - start_time
-print(f"Total execution time with quantized model: {total_time:.2f} seconds")

 from transformers import AutoTokenizer, AutoModelForCausalLM
+import torch
+# Path to the locally saved quantized model directory
+model_path = '/path/to/your/quantized_model_directory'
+# Load tokenizer from model_path (model_name is no longer defined in this file); assumes tokenizer files were saved alongside the model - TODO confirm
+tokenizer = AutoTokenizer.from_pretrained(model_path)
+# Load quantized model
+quantized_model = AutoModelForCausalLM.from_pretrained(model_path)
+# Check if a GPU is available and move model to GPU if available
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 quantized_model.to(device)
 # Example text input
 text_input = "How did Tesla perform in Q1 2024?"
+# Tokenize input
 inputs = tokenizer(text_input, return_tensors="pt").to(device)
+# Generate response
 outputs = quantized_model.generate(**inputs, max_length=150, do_sample=False)
+# Decode generated tokens to readable string
 response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+# Print generated response
+print(f"Generated response: {response}")