Tim Luka Horstmann committed
Commit a2d5223 · 1 Parent(s): bd95004

Files changed (3):
  1. Dockerfile +1 -0
  2. app.py +34 -17
  3. requirements.txt +2 -1
Dockerfile CHANGED

@@ -11,6 +11,7 @@ RUN apt-get update && apt-get install -y \
     libffi-dev \
     libgcc-s1 \
     libstdc++6 \
+    libopenblas-dev \
     && rm -rf /var/lib/apt/lists/*
 
 # Set environment variables for cache
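Adding libopenblas-dev only pays off if llama-cpp-python is actually compiled against BLAS; the default PyPI wheel usually is not, so a source build with BLAS enabled may also be needed (the CMake flag is version-dependent, e.g. CMAKE_ARGS="-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS" on recent releases, -DLLAMA_BLAS=ON on older ones). A minimal runtime check, assuming the low-level llama_cpp binding exposes llama_print_system_info as in the 0.3.x line:

import llama_cpp

# Hedged check: llama_print_system_info comes from llama.cpp's C API and
# reports the compiled backends; look for "BLAS = 1" in the output.
info = llama_cpp.llama_print_system_info()
print(info.decode("utf-8") if isinstance(info, bytes) else info)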
app.py CHANGED

@@ -1,8 +1,6 @@
 import json
 import numpy as np
 from sentence_transformers import SentenceTransformer
-import torch
-import torch.nn.functional as F
 from fastapi import FastAPI, HTTPException
 from fastapi.responses import StreamingResponse
 from pydantic import BaseModel
@@ -10,6 +8,7 @@ from llama_cpp import Llama
 from huggingface_hub import login, hf_hub_download
 import logging
 import os
+import faiss
 
 # Set up logging
 logging.basicConfig(level=logging.INFO)
@@ -25,13 +24,16 @@ if not hf_token:
 login(token=hf_token)
 
 try:
-    # Load precomputed CV embeddings
+    # Load precomputed CV embeddings and build FAISS index
     logger.info("Loading CV embeddings from cv_embeddings.json")
     with open("cv_embeddings.json", "r", encoding="utf-8") as f:
         cv_data = json.load(f)
     cv_chunks = [item["chunk"] for item in cv_data]
-    cv_embeddings = np.array([item["embedding"] for item in cv_data])
-    cv_embeddings_tensor = torch.tensor(cv_embeddings)
+    cv_embeddings = np.array([item["embedding"] for item in cv_data]).astype('float32')
+    faiss.normalize_L2(cv_embeddings)  # Normalize for cosine similarity
+    faiss_index = faiss.IndexFlatIP(cv_embeddings.shape[1])  # Inner Product for cosine similarity
+    faiss_index.add(cv_embeddings)
+    logger.info("FAISS index built successfully")
     logger.info("CV embeddings loaded successfully")
 
     # Load embedding model
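The normalize-then-inner-product trick works because on unit vectors the inner product equals the cosine: x·y / (|x||y|) reduces to x·y when |x| = |y| = 1. A quick sanity check in plain numpy, with toy 2-d vectors and no FAISS required:

import numpy as np

a = np.array([3.0, 4.0], dtype=np.float32)
b = np.array([1.0, 0.0], dtype=np.float32)

cosine = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))  # 0.6

a_unit = a / np.linalg.norm(a)
b_unit = b / np.linalg.norm(b)
inner = np.dot(a_unit, b_unit)  # also 0.6: inner product on unit vectors == cosine

assert np.isclose(cosine, inner)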
@@ -39,11 +41,13 @@ try:
     embedder = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")
     logger.info("SentenceTransformer model loaded")
 
-    # Load Qwen 2.5 1.5B model with llama_cpp
-    logger.info("Loading Qwen 2.5 1.5B model")
+    # Load Qwen 2.5 7B model
+    repo_id = "bartowski/Qwen2.5-7B-Instruct-GGUF"
+    filename = "Qwen2.5-7B-Instruct-Q4_K_M.gguf"
+    logger.info(f"Loading {filename} model")
     model_path = hf_hub_download(
-        repo_id="bartowski/Qwen2.5-7B-Instruct-GGUF", #"Qwen/Qwen2.5-1.5B-Instruct-GGUF",
-        filename="Qwen2.5-7B-Instruct-Q6_K.gguf", #"qwen2.5-1.5b-instruct-q4_0.gguf",
+        repo_id=repo_id,
+        filename=filename,
         local_dir="/app/cache" if os.getenv("HF_HOME") else None,
         token=hf_token,
     )
@@ -52,8 +56,11 @@ try:
         model_path=model_path,
         n_ctx=2048,
         n_threads=4,
+        n_batch=512,  # Increase batch size for faster eval
+        n_gpu_layers=0,  # Explicitly set to 0 (no GPU in HF Spaces)
+        verbose=True,  # Keep for perf logging
     )
-    logger.info("Qwen 2.5 1.5B model loaded")
+    logger.info(f"{filename} model loaded")
 
 except Exception as e:
     logger.error(f"Startup error: {str(e)}", exc_info=True)
@@ -61,11 +68,12 @@ except Exception as e:
 
 def retrieve_context(query, top_k=3):
     try:
-        query_embedding = embedder.encode(query, convert_to_tensor=True).unsqueeze(0)
-        similarities = F.cosine_similarity(query_embedding, cv_embeddings_tensor, dim=1)
-        top_k = min(top_k, len(similarities))
-        top_indices = torch.topk(similarities, k=top_k).indices.cpu().numpy()
-        return "\n".join([cv_chunks[i] for i in top_indices])
+        # Encode query and normalize for FAISS
+        query_embedding = embedder.encode(query, convert_to_numpy=True).astype("float32")
+        query_embedding = query_embedding.reshape(1, -1)
+        faiss.normalize_L2(query_embedding)
+        distances, indices = faiss_index.search(query_embedding, top_k)
+        return "\n".join([cv_chunks[i] for i in indices[0]])
     except Exception as e:
         logger.error(f"Error in retrieve_context: {str(e)}")
         raise
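The same retrieval path end-to-end, as a self-contained sketch with toy data (384 dimensions, matching all-MiniLM-L6-v2's output). Note that IndexFlatIP is exact brute-force search, which is the right call at CV scale: the win over the old torch code is dropping a heavy dependency, not search speed:

import faiss
import numpy as np

dim = 384  # all-MiniLM-L6-v2 output dimension
rng = np.random.default_rng(0)

# Toy corpus embeddings; in the app these come from cv_embeddings.json.
corpus = rng.standard_normal((100, dim)).astype("float32")
faiss.normalize_L2(corpus)      # unit vectors, so inner product == cosine
index = faiss.IndexFlatIP(dim)  # exact (brute-force) inner-product search
index.add(corpus)

query = rng.standard_normal((1, dim)).astype("float32")
faiss.normalize_L2(query)
scores, ids = index.search(query, 3)  # top-3 cosine scores and row indices
print(ids[0], scores[0])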
@@ -80,12 +88,13 @@ def stream_response(query):
         f"<|im_start|>assistant\n"
     )
 
-    # Stream response with llama_cpp
     for chunk in generator(
         prompt,
         max_tokens=512,
         stream=True,
         stop=["<|im_end|>", "[DONE]"],
+        temperature=0.7,  # Slightly lower for consistency
+        top_p=0.9,  # Narrow sampling for faster generation
     ):
         yield f"data: {chunk['choices'][0]['text']}\n\n"
     yield "data: [DONE]\n\n"
@@ -106,4 +115,12 @@ async def predict(request: QueryRequest):
 
 @app.get("/health")
 async def health_check():
     return {"status": "healthy"}
+
+@app.on_event("startup")
+async def warm_up_model():
+    logger.info("Warming up the model...")
+    dummy_query = "Hello, please warm up your model."
+    for _ in stream_response(dummy_query):
+        pass
+    logger.info("Model warm-up complete.")
requirements.txt CHANGED

@@ -4,4 +4,5 @@ sentence-transformers==3.1.1
 torch==2.4.1
 numpy==1.26.4
 llama-cpp-python==0.3.1
-huggingface_hub==0.30.1
+huggingface_hub==0.30.1
+faiss-cpu==1.8.0
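Finally, a client-side sketch for consuming the stream. The request body is an assumption (a single "query" field, which this diff never shows), and 7860 is just the usual Hugging Face Spaces port; adjust both to the real deployment:

import requests

# Assumption: /predict accepts a JSON body with a "query" field and
# streams Server-Sent Events; adjust to the actual QueryRequest schema.
resp = requests.post(
    "http://localhost:7860/predict",
    json={"query": "What did Tim work on most recently?"},
    stream=True,
)
for line in resp.iter_lines(decode_unicode=True):
    if not line:
        continue  # SSE frames are separated by blank lines
    data = line.removeprefix("data: ")
    if data == "[DONE]":
        break
    print(data, end="", flush=True)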