# Spaces: Sleeping
# Base image: official Arch Linux.
# NOTE(review): `latest` is a moving tag, so rebuilds can pick up a different
# base; consider pinning a dated tag or digest for reproducible builds.
FROM archlinux:latest
# DEBIAN_FRONTEND is deliberately not set: it is an apt/debconf setting, and
# this image installs its packages with pacman, so it had no effect here and
# only leaked into the runtime environment.
# Build arguments, passed from the Space environment. Each one is copied into
# a LLAMA_ARG_* environment variable further down.

# Which model to fetch and under what name to serve it.
ARG MODEL_ID="unsloth/Qwen3-1.7B-GGUF"
ARG QUANT="Q4_K_M"
ARG SERVED_NAME="qwen-nano"

# Server capacity: context size and number of parallel slots.
ARG CTX_SIZE=8192
ARG PARALLEL=4

# Mode switches (0 = off); forwarded to the embeddings / reranking settings.
ARG EMBEDDING_ONLY=0
ARG RERANK_ONLY=0
# llama.cpp env configs, grouped by concern. Values written as ${...} come
# from the build arguments declared earlier in this file.

# Model source ("<repo>:<quant>") and the alias it is served under.
ENV LLAMA_ARG_HF_REPO="${MODEL_ID}:${QUANT}" \
    LLAMA_ARG_ALIAS="${SERVED_NAME}"

# Context window, batch size and parallelism.
ENV LLAMA_ARG_CTX_SIZE=${CTX_SIZE} \
    LLAMA_ARG_BATCH=512 \
    LLAMA_ARG_N_PARALLEL=${PARALLEL}

# Attention, KV-cache types, memory locking and GPU offload
# (N_GPU_LAYERS=0 — presumably CPU-only inference; confirm against llama.cpp docs).
ENV LLAMA_ARG_FLASH_ATTN=1 \
    LLAMA_ARG_CACHE_TYPE_K="q8_0" \
    LLAMA_ARG_CACHE_TYPE_V="q4_1" \
    LLAMA_ARG_MLOCK=1 \
    LLAMA_ARG_N_GPU_LAYERS=0

# Listen address and port; the port matches the EXPOSE instruction below.
ENV LLAMA_ARG_HOST="0.0.0.0" \
    LLAMA_ARG_PORT=7860

# Optional embeddings / reranking modes, driven by the build arguments.
ENV LLAMA_ARG_EMBEDDINGS=${EMBEDDING_ONLY} \
    LLAMA_ARG_RERANKING=${RERANK_ONLY}
# Upgrade the system and install the build toolchain and libraries in ONE
# layer: syncing the package database in a separate, cached layer can leave a
# later install running against a stale database (partial upgrade).
# --needed skips packages that are already installed and up to date.
# The downloaded package cache is deleted in the same layer so it never ends
# up in the image (`pacman -Scc --noconfirm` is not used because its prompt
# defaults to "no").
RUN pacman -Syu --noconfirm --needed --overwrite '*' \
        base-devel \
        blas64-openblas \
        cmake \
        curl \
        gcc-libs \
        git \
        git-lfs \
        glibc \
        openblas \
        openblas64 \
        python \
    && rm -rf /var/cache/pacman/pkg/*
# Create the application directory and /.cache, the cache dir llama.cpp uses
# to download models. /.cache is made world-writable (recursively).
RUN mkdir -p /app /.cache \
    && chmod -R 777 /.cache
WORKDIR /app
# Branch or tag of llama.cpp to build. Defaults to `master` (the previous
# hard-coded value); override with --build-arg LLAMA_CPP_REF=<tag> to pin a
# release. Declared here, right before its use, so changing it only
# invalidates the clone and build layers.
ARG LLAMA_CPP_REF=master
# Shallow, single-branch clone into an explicit absolute path so the step does
# not depend on the current working directory.
RUN git clone --depth 1 --single-branch --branch "${LLAMA_CPP_REF}" \
        https://github.com/ggml-org/llama.cpp.git /app/llama.cpp
# Alternative fork, kept for reference:
# RUN git clone https://github.com/ikawrakow/ik_llama.cpp.git llama.cpp
WORKDIR /app/llama.cpp
# Configure a Release build of llama.cpp in ./build (source dir is the current
# WORKDIR). BLAS is enabled with the OpenBLAS vendor; compiler warnings and
# CMake developer warnings are turned off.
# NOTE(review): GGML_NATIVE=ON presumably tunes the binary for the CPU of the
# build machine — confirm the runtime host supports the same instruction set.
RUN cmake -S . -B build -Wno-dev \
        -DCMAKE_BUILD_TYPE=Release \
        -DGGML_ALL_WARNINGS=OFF \
        -DGGML_ALL_WARNINGS_3RD_PARTY=OFF \
        -DGGML_BLAS=ON \
        -DGGML_BLAS_VENDOR=OpenBLAS \
        -DGGML_LLAMAFILE=ON \
        -DGGML_LTO=ON \
        -DGGML_NATIVE=ON \
        -DLLAMA_BUILD_EXAMPLES=ON \
        -DLLAMA_BUILD_SERVER=ON \
        -DLLAMA_CURL=ON

# Compile only the llama-server target, using one job per available CPU.
RUN cmake --build build --config Release --target llama-server --parallel "$(nproc)"
# Port the server listens on (same value as LLAMA_ARG_PORT).
EXPOSE 7860
# Run from the application root.
WORKDIR /app
# Start the freshly built server in exec form; the rest of its configuration
# comes from the LLAMA_ARG_* environment variables set earlier.
CMD ["/app/llama.cpp/build/bin/llama-server", "--verbose-prompt", "--swa-full"]