#!/bin/bash
cd text-generation-inference

# Set up a dedicated conda env for TGI (the shell hook is needed for `conda activate`).
eval "$(/home/user/miniconda3/bin/conda shell.bash hook)"
conda create -n tgi python=3.11
conda activate tgi
# pkg-config and OpenSSL are build dependencies of the Rust launcher/router;
# install them after activating so they land in the tgi env, not base.
conda install -c conda-forge pkg-config openssl

# Point the Rust openssl-sys build at the conda-provided OpenSSL.
export OPENSSL_DIR=$CONDA_PREFIX
export OPENSSL_INCLUDE_DIR=$CONDA_PREFIX/include
export OPENSSL_LIB_DIR=$CONDA_PREFIX/lib
export PKG_CONFIG_PATH=$CONDA_PREFIX/lib/pkgconfig
export PYTHONPATH=/home/user/miniconda3/envs/tgi/lib/python3.11/site-packages
export LD_LIBRARY_PATH=/home/user/:$LD_LIBRARY_PATH

# The launcher looks for libnvidia-ml.so (no .1 suffix); expose it under
# a directory that is on LD_LIBRARY_PATH.
ln -s /usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1 /home/user/libnvidia-ml.so
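
# Build and install the launcher, router, and Python server. A sketch:
# BUILD_EXTENSIONS=True with `make install` follows the TGI repo's documented
# local install; adjust if the Makefile in your checkout differs.
BUILD_EXTENSIONS=True make install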

# Run zephyr in the background, logging to zephyr.log.
nohup text-generation-launcher --model-id HuggingFaceH4/zephyr-7b-beta -p 7860 &> zephyr.log &

# Foreground run; --disable-custom-kernels falls back to plain PyTorch ops,
# which helps on GPUs where the custom CUDA kernels are untested.
PYTHONPATH=/home/user/:$PYTHONPATH \
LD_LIBRARY_PATH=/home/user/:$LD_LIBRARY_PATH \
text-generation-launcher \
  --model-id HuggingFaceH4/zephyr-7b-beta \
  --disable-custom-kernels \
  -p 7860

# Same model, custom kernels enabled.
LD_LIBRARY_PATH=/home/user/:$LD_LIBRARY_PATH \
text-generation-launcher \
  --model-id HuggingFaceH4/zephyr-7b-beta \
  -p 7860

# One-liners for other models, same setup:
LD_LIBRARY_PATH=/home/user/:$LD_LIBRARY_PATH text-generation-launcher --model-id Qwen/Qwen2.5-VL-7B-Instruct -p 7860
LD_LIBRARY_PATH=/home/user/:$LD_LIBRARY_PATH text-generation-launcher --model-id Qwen/Qwen3-8B -p 7860
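
# Block until the server reports ready before sending requests (a sketch:
# assumes TGI's /health endpoint and the port 7860 used above):
until curl -sf localhost:7860/health > /dev/null; do
  echo "waiting for TGI to come up..."
  sleep 2
done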


# Minimal invocation (no env overrides, default port):
text-generation-launcher \
  --model-id HuggingFaceH4/zephyr-7b-beta \
  --disable-custom-kernels


# To run the server in the background, use:
# (batch limits must cover a single max-size request, i.e.
#  max-batch-prefill-tokens >= max-input-length and
#  max-batch-total-tokens >= max-total-tokens, or the launcher refuses to start)
nohup text-generation-launcher \
    --model-id mistralai/Mistral-7B-v0.1 \
    --port 8080 \
    --max-batch-prefill-tokens 4096 \
    --max-batch-total-tokens 8192 \
    --max-input-length 4096 \
    --max-total-tokens 8192 \
    --max-batch-size 32 \
    --max-waiting-tokens 20 \
    --hostname 0.0.0.0 \
    --cuda-memory-fraction 0.95 \
    --max-concurrent-requests 128 \
    --trust-remote-code \
    --json-output > tgi.log 2>&1 &
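
# Confirm the server is up before sending traffic (assumes TGI's /health
# route; port 8080 comes from the command above):
curl -s localhost:8080/health && echo "server is ready"
tail -f tgi.log   # follow the JSON-formatted logs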


# To stop the server, use:
ps aux | grep text-generation-launcher   # find the launcher PID
pkill -f text-generation-launcher        # or kill it by name
# Last resort: kill whatever is still holding the GPU (column 5 of the
# nvidia-smi process table is the PID on recent driver versions).
kill -9 $(nvidia-smi | grep python | awk '{ print $5 }')
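
# A less fragile variant that doesn't depend on nvidia-smi's table layout:
kill -9 $(nvidia-smi --query-compute-apps=pid --format=csv,noheader)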



# Smoke-test a deployed Space:
curl https://jdelavande-dev-tgi.hf.space/generate \
  -X POST \
  -H "Content-Type: application/json" \
  -d '{"inputs":"Bonjour !", "parameters":{"max_new_tokens":20}}'

# The root route accepts the same payload (compat endpoint):
curl https://jdelavande-dev-tgi2.hf.space/ \
  -X POST \
  -H "Content-Type: application/json" \
  -d '{"inputs":"Bonjour !", "parameters":{"max_new_tokens":20}}'

# Or test a local server:
curl localhost:7860/generate \
  -X POST \
  -H "Content-Type: application/json" \
  -d '{"inputs":"Bonjour !", "parameters":{"max_new_tokens":20}}'