MVPilgrim committed on
Commit
c0ba0c7
·
1 Parent(s): 7b821bd
Files changed (3) hide show
  1. Dockerfile +2 -2
  2. README.md +1 -0
  3. app.py +3 -3
Dockerfile CHANGED
@@ -48,8 +48,8 @@ RUN FORCE_CMAKE=1 CMAKE_SYSTEM_PROCESSOR=AMD64 pip3 install --break-system-packa
48
  RUN pip3 install --break-system-packages cffi
49
  # Install text2vec-transformers
50
  WORKDIR /app/text2vec-transformers
51
- COPY --from=semitechnologies/transformers-inference:sentence-transformers-multi-qa-MiniLM-L6-cos-v1 /app /app/text2vec-transformers
52
- COPY --from=semitechnologies/transformers-inference:sentence-transformers-multi-qa-MiniLM-L6-cos-v1 /usr/local/bin /app/text2vec-transformers/bin
53
  COPY ./multi-qa-MiniLM-L6-cos-v1 /app/text2vec-transformers
54
  RUN ./custom_prerequisites.py
55
 
 
48
  RUN pip3 install --break-system-packages cffi
49
  # Install text2vec-transformers
50
  WORKDIR /app/text2vec-transformers
51
+ #COPY --from=semitechnologies/transformers-inference:sentence-transformers-multi-qa-MiniLM-L6-cos-v1 /app /app/text2vec-transformers
52
+ #COPY --from=semitechnologies/transformers-inference:sentence-transformers-multi-qa-MiniLM-L6-cos-v1 /usr/local/bin /app/text2vec-transformers/bin
53
  COPY ./multi-qa-MiniLM-L6-cos-v1 /app/text2vec-transformers
54
  RUN ./custom_prerequisites.py
55
 
README.md CHANGED
@@ -10,6 +10,7 @@ app_port: 8501
10
  #app_file: app.py
11
  pinned: true
12
  startup_duration_timeout: 3 hours
 
13
  ---
14
 
15
  # POC for Retrieval Augmented Generation with Large Language Models
 
10
  #app_file: app.py
11
  pinned: true
12
  startup_duration_timeout: 3 hours
13
+ hardware: gpu
14
  ---
15
 
16
  # POC for Retrieval Augmented Generation with Large Language Models
app.py CHANGED
@@ -346,7 +346,7 @@ try:
346
  logger.info("### Initializing LLM.")
347
  llm = Llama(model_path,
348
  #*,
349
- n_gpu_layers=0,
350
  split_mode=llama_cpp.LLAMA_SPLIT_MODE_LAYER,
351
  main_gpu=0,
352
  tensor_split=None,
@@ -398,7 +398,7 @@ try:
398
  ###############################################################################
399
  # Initialize the sentence transformer and encode the query prompt.
400
  logger.debug(f"#### Encode text query prompt to create vectors. {promptText}")
401
- model = SentenceTransformer('/app/multi-qa-MiniLM-L6-cos-v1')
402
  vector = model.encode(promptText)
403
 
404
  logLevel = logger.getEffectiveLevel()
@@ -539,7 +539,7 @@ try:
539
  placeHolder = st.empty()
540
  else:
541
  st.session_state.spinGenMsg = False;
542
- with st.spinner('Generating Completion (but slowly. 40+ seconds.)...'):
543
  st.session_state.sysTAtext = st.session_state.sysTA
544
  logger.debug(f"sysTAtext: {st.session_state.sysTAtext}")
545
  wrklist = setPrompt(st.session_state.userpTA,st.selectRag)
 
346
  logger.info("### Initializing LLM.")
347
  llm = Llama(model_path,
348
  #*,
349
+ n_gpu_layers=-1,
350
  split_mode=llama_cpp.LLAMA_SPLIT_MODE_LAYER,
351
  main_gpu=0,
352
  tensor_split=None,
 
398
  ###############################################################################
399
  # Initialize the sentence transformer and encode the query prompt.
400
  logger.debug(f"#### Encode text query prompt to create vectors. {promptText}")
401
+ model = SentenceTransformer('/app/text2vec-transformers/multi-qa-MiniLM-L6-cos-v1')
402
  vector = model.encode(promptText)
403
 
404
  logLevel = logger.getEffectiveLevel()
 
539
  placeHolder = st.empty()
540
  else:
541
  st.session_state.spinGenMsg = False;
542
+ with st.spinner('Generating Completion...'):
543
  st.session_state.sysTAtext = st.session_state.sysTA
544
  logger.debug(f"sysTAtext: {st.session_state.sysTAtext}")
545
  wrklist = setPrompt(st.session_state.userpTA,st.selectRag)