Update main.py
main.py
CHANGED
@@ -8,6 +8,7 @@ from requests import JSONDecodeError
 from langchain_community.embeddings import HuggingFaceBgeEmbeddings
 from langchain_community.vectorstores import SupabaseVectorStore
 from langchain_community.llms import HuggingFaceEndpoint
+from langchain_openai import ChatOpenAI
 
 from langchain.chains import ConversationalRetrievalChain
 from langchain.memory import ConversationBufferMemory
@@ -60,17 +61,27 @@ def response_generator(query: str) -> str:
     logger.info("Using HF model %s", model)
 
     # prepare HF text-generation LLM
-    hf = HuggingFaceEndpoint(
-        # endpoint_url=f"https://api-inference.huggingface.co/models/{model}",
-        endpoint_url=f"https://router.huggingface.co/hf-inference/models/{model}",
-        task="text-generation",
-        huggingfacehub_api_token=hf_api_key,
-        model_kwargs={
-            "temperature": temperature,
-            "max_new_tokens": max_tokens,
-            "return_full_text": False,
-        },
-    )
+    # hf = HuggingFaceEndpoint(
+    #     # endpoint_url=f"https://api-inference.huggingface.co/models/{model}",
+    #     endpoint_url=f"https://router.huggingface.co/hf-inference/models/{model}",
+    #     task="text-generation",
+    #     huggingfacehub_api_token=hf_api_key,
+    #     model_kwargs={
+    #         "temperature": temperature,
+    #         "max_new_tokens": max_tokens,
+    #         "return_full_text": False,
+    #     },
+    # )
+
+    hf = ChatOpenAI(
+        base_url=f"https://router.huggingface.co/hf-inference/models/{model}/v1",
+        api_key=hf_api_key,
+        model=model,
+        temperature=temperature,
+        max_tokens=max_tokens,
+        timeout=30,  # Add timeout
+        max_retries=3,  # Built-in retry logic
+    )
 
     # conversational RAG chain
     qa = ConversationalRetrievalChain.from_llm(
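The commit swaps HuggingFaceEndpoint for ChatOpenAI pointed at the Hugging Face router's per-model OpenAI-compatible /v1 endpoint, which brings client-side timeout and retry handling along for free. Below is a minimal, self-contained sketch of that client path in isolation. The HF_API_KEY environment variable name and the model name are illustrative assumptions, not taken from this repo (the app reads its own hf_api_key and model values elsewhere in main.py).

# Minimal sketch of the new client path, assuming HF_API_KEY holds a valid
# Hugging Face token and the (hypothetical) model below is served by the
# hf-inference router.
import os

from langchain_openai import ChatOpenAI

model = "mistralai/Mistral-7B-Instruct-v0.3"  # illustrative example model

hf = ChatOpenAI(
    # The router exposes an OpenAI-compatible chat completions endpoint per
    # model, which is what lets ChatOpenAI stand in for HuggingFaceEndpoint.
    base_url=f"https://router.huggingface.co/hf-inference/models/{model}/v1",
    api_key=os.environ["HF_API_KEY"],
    model=model,
    temperature=0.7,
    max_tokens=256,
    timeout=30,      # same timeout/retry settings as the commit above
    max_retries=3,
)

# invoke() returns an AIMessage; the generated text is on .content.
print(hf.invoke("Say hello in five words.").content)

The resulting hf object drops into ConversationalRetrievalChain.from_llm(llm=hf, ...) unchanged, since the chain only requires a LangChain chat model; that wiring continues past the end of this hunk.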