Tim Luka Horstmann committed
Commit a2d5223 · Parent(s): bd95004

Speedup

Files changed:
- Dockerfile +1 -0
- app.py +34 -17
- requirements.txt +2 -1
Dockerfile CHANGED
@@ -11,6 +11,7 @@ RUN apt-get update && apt-get install -y \
     libffi-dev \
     libgcc-s1 \
     libstdc++6 \
+    libopenblas-dev \
     && rm -rf /var/lib/apt/lists/*
 
 # Set environment variables for cache
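The added libopenblas-dev provides a system BLAS library, which the newly introduced faiss-cpu dependency and any packages compiled inside the image can link against. Note that PyPI wheels often bundle their own BLAS; a quick way to see what NumPy itself uses (a minimal check, nothing app-specific):

    import numpy as np

    # Prints the BLAS/LAPACK configuration NumPy was built with; wheels
    # from PyPI typically report a bundled OpenBLAS regardless of the
    # system libopenblas-dev package.
    np.show_config()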
app.py CHANGED
@@ -1,8 +1,6 @@
 import json
 import numpy as np
 from sentence_transformers import SentenceTransformer
-import torch
-import torch.nn.functional as F
 from fastapi import FastAPI, HTTPException
 from fastapi.responses import StreamingResponse
 from pydantic import BaseModel
@@ -10,6 +8,7 @@ from llama_cpp import Llama
 from huggingface_hub import login, hf_hub_download
 import logging
 import os
+import faiss
 
 # Set up logging
 logging.basicConfig(level=logging.INFO)
@@ -25,13 +24,16 @@ if not hf_token:
 login(token=hf_token)
 
 try:
-    # Load precomputed CV embeddings
+    # Load precomputed CV embeddings and build FAISS index
     logger.info("Loading CV embeddings from cv_embeddings.json")
     with open("cv_embeddings.json", "r", encoding="utf-8") as f:
         cv_data = json.load(f)
     cv_chunks = [item["chunk"] for item in cv_data]
-    cv_embeddings = np.array([item["embedding"] for item in cv_data])
-    ...
+    cv_embeddings = np.array([item["embedding"] for item in cv_data]).astype('float32')
+    faiss.normalize_L2(cv_embeddings)  # Normalize for cosine similarity
+    faiss_index = faiss.IndexFlatIP(cv_embeddings.shape[1])  # Inner Product for cosine similarity
+    faiss_index.add(cv_embeddings)
+    logger.info("FAISS index built successfully")
     logger.info("CV embeddings loaded successfully")
 
     # Load embedding model
@@ -39,11 +41,13 @@ try:
     embedder = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")
     logger.info("SentenceTransformer model loaded")
 
-    # Load Qwen 2.5
-    ...
+    # Load Qwen 2.5 7B model
+    repo_id = "bartowski/Qwen2.5-7B-Instruct-GGUF"
+    filename = "Qwen2.5-7B-Instruct-Q4_K_M.gguf"
+    logger.info(f"Loading {filename} model")
     model_path = hf_hub_download(
-        repo_id=
-        filename=
+        repo_id=repo_id,
+        filename=filename,
         local_dir="/app/cache" if os.getenv("HF_HOME") else None,
         token=hf_token,
     )
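L2-normalizing the embeddings and then using an inner-product index is the standard FAISS recipe for cosine similarity: on unit-length vectors, inner product and cosine coincide. A self-contained sketch of the same pattern on toy data (all names and sizes illustrative):

    import numpy as np
    import faiss

    # Toy corpus: 4 embeddings of dimension 8.
    rng = np.random.default_rng(0)
    corpus = rng.random((4, 8)).astype("float32")

    faiss.normalize_L2(corpus)                  # scale rows to unit length
    index = faiss.IndexFlatIP(corpus.shape[1])  # inner product == cosine on unit vectors
    index.add(corpus)

    query = rng.random((1, 8)).astype("float32")
    faiss.normalize_L2(query)
    scores, ids = index.search(query, 2)        # top-2 nearest chunks
    print(ids[0], scores[0])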
@@ -52,8 +56,11 @@ try:
         model_path=model_path,
         n_ctx=2048,
         n_threads=4,
+        n_batch=512,  # Increase batch size for faster eval
+        n_gpu_layers=0,  # Explicitly set to 0 (no GPU in HF Spaces)
+        verbose=True,  # Keep for perf logging
     )
-    logger.info("
+    logger.info(f"{filename} model loaded")
 
 except Exception as e:
     logger.error(f"Startup error: {str(e)}", exc_info=True)
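The new Llama arguments are the main speed levers in llama-cpp-python on CPU: n_batch controls how many prompt tokens are evaluated per forward pass, n_threads should roughly match the physical core count, and n_gpu_layers=0 keeps all layers on the CPU. A minimal init sketch with these knobs (the model path is illustrative):

    from llama_cpp import Llama

    # Illustrative path; any local GGUF file works here.
    llm = Llama(
        model_path="model.gguf",
        n_ctx=2048,      # context window
        n_threads=4,     # match physical CPU cores
        n_batch=512,     # prompt tokens evaluated per batch
        n_gpu_layers=0,  # CPU-only
    )
    out = llm("Q: What is 2+2? A:", max_tokens=8)
    print(out["choices"][0]["text"])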
@@ -61,11 +68,12 @@ except Exception as e:
 
 def retrieve_context(query, top_k=3):
     try:
-        ...
-        ...
-        ...
-        ...
-        ...
+        # Encode query and normalize for FAISS
+        query_embedding = embedder.encode(query, convert_to_numpy=True).astype("float32")
+        query_embedding = query_embedding.reshape(1, -1)
+        faiss.normalize_L2(query_embedding)
+        distances, indices = faiss_index.search(query_embedding, top_k)
+        return "\n".join([cv_chunks[i] for i in indices[0]])
     except Exception as e:
         logger.error(f"Error in retrieve_context: {str(e)}")
         raise
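A small detail behind the reshape in retrieve_context: SentenceTransformer.encode on a single string returns a 1-D vector, while FAISS search expects a 2-D batch of queries. A quick check (same model as the app):

    from sentence_transformers import SentenceTransformer

    embedder = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")

    # encode() on one string yields a 1-D array; FAISS wants (n_queries, dim).
    vec = embedder.encode("example query", convert_to_numpy=True).astype("float32")
    print(vec.shape)                 # (384,) for all-MiniLM-L6-v2
    print(vec.reshape(1, -1).shape)  # (1, 384)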
@@ -80,12 +88,13 @@ def stream_response(query):
         f"<|im_start|>assistant\n"
     )
 
-    # Stream response with llama_cpp
     for chunk in generator(
         prompt,
         max_tokens=512,
         stream=True,
         stop=["<|im_end|>", "[DONE]"],
+        temperature=0.7,  # Slightly lower for consistency
+        top_p=0.9,  # Narrow sampling for faster generation
     ):
         yield f"data: {chunk['choices'][0]['text']}\n\n"
     yield "data: [DONE]\n\n"
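stream_response yields Server-Sent-Events-style "data:" lines and finishes with a "[DONE]" marker. A hypothetical client sketch; the /predict path, the port, and the payload shape are assumptions, since the route decorator and QueryRequest fields are not visible in this diff:

    import requests

    # Assumed endpoint and payload shape (QueryRequest with a `query` field).
    with requests.post(
        "http://localhost:7860/predict",
        json={"query": "Tell me about the CV."},
        stream=True,
    ) as resp:
        for line in resp.iter_lines(decode_unicode=True):
            if not line:
                continue  # SSE events are separated by blank lines
            payload = line.removeprefix("data: ")
            if payload == "[DONE]":
                break
            print(payload, end="", flush=True)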
@@ -106,4 +115,12 @@ async def predict(request: QueryRequest):
 
 @app.get("/health")
 async def health_check():
-    return {"status": "healthy"}
+    return {"status": "healthy"}
+
+@app.on_event("startup")
+async def warm_up_model():
+    logger.info("Warming up the model...")
+    dummy_query = "Hello, please warm up your model."
+    for _ in stream_response(dummy_query):
+        pass
+    logger.info("Model warm-up complete.")
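The warm-up handler streams a complete answer (up to 512 tokens) before the Space starts serving, which can take a while on CPU. If startup latency matters, a one-token completion is usually enough to pull the weights into memory; an alternative sketch, reusing the generator and logger defined above:

    @app.on_event("startup")
    async def warm_up_model():
        # A single-token completion forces one forward pass through the
        # weights without generating a full-length answer.
        logger.info("Warming up the model...")
        generator("Hello", max_tokens=1)
        logger.info("Model warm-up complete.")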
requirements.txt CHANGED
@@ -4,4 +4,5 @@ sentence-transformers==3.1.1
 torch==2.4.1
 numpy==1.26.4
 llama-cpp-python==0.3.1
-huggingface_hub==0.30.1
+huggingface_hub==0.30.1
+faiss-cpu==1.8.0