"""Gradio demo: GPTQ-quantized Mistral text generation plus FAISS vector search.

Importing this module logs in to the Hugging Face Hub, loads the tokenizer,
the quantized model and the sentence-embedding model, builds a small FAISS
index over sample texts, and constructs the Gradio UI (``demo``).
"""

import os
import time

import gradio as gr
import torch
from auto_gptq import AutoGPTQForCausalLM
from huggingface_hub import login
from langchain_community.vectorstores import FAISS
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer

# Load HF token and login
hf_token = os.environ.get("HUGGINGFACE_TOKEN")
if not hf_token:
    raise ValueError("Please set the HUGGINGFACE_TOKEN environment variable")
login(token=hf_token)

# Load tokenizer and quantized model
# NOTE(review): confirm this repo id exists on the Hub before deploying.
model_id = "TheBloke/mistral-7B-GPTQ"
# An explicit device index is used because AutoGPTQ's examples pass "cuda:0";
# presumably a bare "cuda" is less reliable there — TODO confirm.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)

print("Loading quantized model...")
start = time.time()
model = AutoGPTQForCausalLM.from_quantized(
    model_id,
    use_safetensors=True,
    device=device,
    # Triton kernels need a CUDA device, so only enable them when one is used.
    use_triton=device.startswith("cuda"),
    quantize_config=None,
)
print(f"Model loaded in {time.time() - start:.2f} seconds on {device}")

# Load embedding model for FAISS vector store
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Sample documents to build vector index (can replace with your own)
texts = [
    "Hello world",
    "Mistral 7B is a powerful language model",
    "Langchain and FAISS make vector search easy",
    "This is a test document for vector search",
]
embeddings = embedder.encode(texts)


def _embed_query(text):
    """Embed a single string with ``embedder`` and return a list of floats."""
    return embedder.encode(text).tolist()


# FAISS.from_embeddings expects (text, vector) pairs followed by the embedding
# function used for text queries; the vectors are precomputed above so the
# sample texts are only encoded once.
faiss_index = FAISS.from_embeddings(
    text_embeddings=list(zip(texts, embeddings.tolist())),
    embedding=_embed_query,
)


# Generate text from prompt
def generate_text(prompt, max_length=128):
    """Generate a continuation of ``prompt`` with the quantized model.

    Args:
        prompt: Text to continue.
        max_length: Passed to ``model.generate`` as ``max_length``; per the
            transformers documentation this bounds prompt tokens plus
            generated tokens together.

    Returns:
        The decoded first output sequence with special tokens removed.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=max_length)
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return decoded


# Search docs with vector similarity
def search_docs(query):
    """Return the text of the 3 indexed documents closest to ``query``.

    Args:
        query: Search text; it is embedded with ``embedder``.

    Returns:
        The matching documents' text joined by blank lines.
    """
    query_emb = embedder.encode([query])
    results = faiss_index.similarity_search_by_vector(query_emb[0].tolist(), k=3)
    # The store returns Document objects, so join their text content.
    return "\n\n".join(doc.page_content for doc in results)


# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("# Mistral GPTQ + FAISS Vector Search Demo")

    with gr.Tab("Text Generation"):
        prompt_input = gr.Textbox(label="Enter prompt", lines=3)
        generate_btn = gr.Button("Generate")
        output_text = gr.Textbox(label="Output", lines=6)
        generate_btn.click(fn=generate_text, inputs=prompt_input, outputs=output_text)

    with gr.Tab("Vector Search"):
        query_input = gr.Textbox(label="Enter search query", lines=2)
        search_btn = gr.Button("Search")
        search_output = gr.Textbox(label="Search Results", lines=6)
        search_btn.click(fn=search_docs, inputs=query_input, outputs=search_output)

if __name__ == "__main__":
    demo.launch()