Tim Luka Horstmann
committed
Commit · a29c4ff
1 Parent(s): 83ec808

Switched to Llama-3.2-1B Q4_K, added impersonation, optimized performance

app.py CHANGED
@@ -19,31 +19,33 @@ app = FastAPI()
 # Authenticate with Hugging Face
 hf_token = os.getenv("HF_TOKEN")
 if not hf_token:
-    logger.error("HF_TOKEN environment variable not set.")
+    logger.error("HF_TOKEN environment variable not set.")
     raise ValueError("HF_TOKEN not set")
 login(token=hf_token)
 
+# Models
+sentence_transformer_model = "all-MiniLM-L6-v2"
+repo_id = "bartowski/Llama-3.2-1B-Instruct-GGUF"
+filename = "Llama-3.2-1B-Instruct-Q4_K_M.gguf"  # Q4_K for speed
+
 try:
-    # Load …
+    # Load CV embeddings and build FAISS index
     logger.info("Loading CV embeddings from cv_embeddings.json")
     with open("cv_embeddings.json", "r", encoding="utf-8") as f:
         cv_data = json.load(f)
     cv_chunks = [item["chunk"] for item in cv_data]
     cv_embeddings = np.array([item["embedding"] for item in cv_data]).astype('float32')
-    faiss.normalize_L2(cv_embeddings)
-    faiss_index = faiss.IndexFlatIP(cv_embeddings.shape[1])
+    faiss.normalize_L2(cv_embeddings)
+    faiss_index = faiss.IndexFlatIP(cv_embeddings.shape[1])
     faiss_index.add(cv_embeddings)
     logger.info("FAISS index built successfully")
-    logger.info("CV embeddings loaded successfully")
 
     # Load embedding model
     logger.info("Loading SentenceTransformer model")
-    embedder = SentenceTransformer(…)
+    embedder = SentenceTransformer(sentence_transformer_model, device="cpu")
     logger.info("SentenceTransformer model loaded")
 
-    # Load …
-    repo_id = "bartowski/Llama-3.2-3B-Instruct-GGUF"  # "bartowski/Qwen2.5-7B-Instruct-GGUF"
-    filename = "Llama-3.2-3B-Instruct-Q6_K_L.gguf"  # "Qwen2.5-7B-Instruct-Q4_K_M.gguf"
+    # Load Llama model
     logger.info(f"Loading {filename} model")
     model_path = hf_hub_download(
         repo_id=repo_id,
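Aside: the normalize-then-inner-product pattern in this hunk is what turns FAISS's IndexFlatIP into cosine-similarity search. A minimal, self-contained sketch of the same retrieval pattern, with toy vectors standing in for the real CV embeddings (dimension and data are illustrative only; all-MiniLM-L6-v2 actually produces 384-d vectors):

import faiss
import numpy as np

dim = 4                                    # toy size; all-MiniLM-L6-v2 emits 384-d vectors
chunks = ["studied at X", "worked at Y", "speaks German"]
embeddings = np.random.rand(len(chunks), dim).astype("float32")

faiss.normalize_L2(embeddings)             # in-place: each row becomes unit length
index = faiss.IndexFlatIP(dim)             # inner product == cosine on unit vectors
index.add(embeddings)

query = np.random.rand(1, dim).astype("float32")
faiss.normalize_L2(query)
scores, ids = index.search(query, 2)       # top-2 most similar chunks
print([chunks[i] for i in ids[0]], scores[0])

IndexFlatIP does exact brute-force search, a reasonable choice at CV scale (a handful of chunks); approximate indexes only pay off on much larger corpora.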
@@ -54,11 +56,11 @@ try:
 
     generator = Llama(
         model_path=model_path,
-        n_ctx=…,
-        n_threads=…,
-        n_batch=512,
-        n_gpu_layers=0,
-        verbose=True,
+        n_ctx=1024,   # Reduced for speed
+        n_threads=2,  # Match HF Spaces vCPUs
+        n_batch=512,
+        n_gpu_layers=0,
+        verbose=True,
     )
     logger.info(f"{filename} model loaded")
 

@@ -68,7 +70,6 @@ except Exception as e:
 
 def retrieve_context(query, top_k=3):
     try:
-        # Encode query and normalize for FAISS
         query_embedding = embedder.encode(query, convert_to_numpy=True).astype("float32")
         query_embedding = query_embedding.reshape(1, -1)
         faiss.normalize_L2(query_embedding)

@@ -83,8 +84,9 @@ def stream_response(query):
     logger.info(f"Processing query: {query}")
     context = retrieve_context(query)
     prompt = (
-        f"<|im_start|>system\…
-        f"<|…
+        f"<|im_start|>system\nI am Tim Luka Horstmann, a German Computer Scientist. This is my CV:\n{context}\n"
+        f"I will answer your questions about my CV as myself. Please ask me anything!\n<|im_end|>\n"
+        f"<|im_start|>user\n{query}\n<|im_end|>\n"
         f"<|im_start|>assistant\n"
     )
 

@@ -93,8 +95,8 @@ def stream_response(query):
         max_tokens=512,
         stream=True,
         stop=["<|im_end|>", "[DONE]"],
-        temperature=0.7,
-        top_p=0.9,
+        temperature=0.7,
+        top_p=0.9,
     ):
         yield f"data: {chunk['choices'][0]['text']}\n\n"
     yield "data: [DONE]\n\n"

@@ -117,10 +119,20 @@ async def predict(request: QueryRequest):
 async def health_check():
     return {"status": "healthy"}
 
+@app.get("/model_info")
+async def model_info():
+    return {
+        "model_name": "Llama-3.2-1B-Instruct-GGUF",
+        "model_size": "1B",
+        "embedding_model": sentence_transformer_model,
+        "faiss_index_size": len(cv_chunks),
+        "faiss_index_dim": cv_embeddings.shape[1],
+    }
+
 @app.on_event("startup")
 async def warm_up_model():
     logger.info("Warming up the model...")
-    dummy_query = "…
+    dummy_query = "Hi"  # Shorter prompt
     for _ in stream_response(dummy_query):
         pass
     logger.info("Model warm-up complete.")
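For completeness, a hypothetical client for the streaming endpoint changed above. Only the data: SSE framing, the [DONE] sentinel, and the handler signature predict(request: QueryRequest) appear in this diff; the /predict route path, the {"query": ...} body shape, and the host/port are assumptions:

import requests

with requests.post(
    "http://localhost:7860/predict",            # assumed route and port
    json={"query": "Where did you study?"},     # assumed request schema
    stream=True,
) as resp:
    for line in resp.iter_lines(decode_unicode=True):
        if not line or not line.startswith("data: "):
            continue                            # skip blank SSE separator lines
        payload = line[len("data: "):]
        if payload == "[DONE]":                 # sentinel emitted by stream_response
            break
        print(payload, end="", flush=True)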
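The /model_info route added in this commit can be smoke-tested as follows; only the base URL is an assumption (7860 is the default Hugging Face Spaces port):

import requests

info = requests.get("http://localhost:7860/model_info").json()
print(info["model_name"], info["model_size"])   # Llama-3.2-1B-Instruct-GGUF 1B
print(info["embedding_model"])                  # all-MiniLM-L6-v2
print(info["faiss_index_size"], info["faiss_index_dim"])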