disLodge committed
Commit 41ec366 · verified · 1 Parent(s): a87b35b
Files changed (1)
  1. app.py +51 -23
app.py CHANGED
@@ -17,12 +17,56 @@ import logging
 import os
 
 
-# lo = "hf_JyAJApaXhIrONPFSIo"
-# ve = "wbnJbrXViYurrsvP"
-
-last_call_time = 0
-OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "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")
+lo = "hf_JyAJApaXhIrONPFSIo"
+ve = "wbnJbrXViYurrsvP"
+half = lo+ve
+HF_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN", half)
+client = InferenceClient(
+    model="mistralai/Mixtral-8x7B-Instruct-v0.1",
+    token=HF_TOKEN,
+    provider="hf-inference"
+)
 
+class HuggingFaceInferenceClientRunnable(Runnable):
+    def __init__(self, client, max_tokens=512, temperature=0.7, top_p=0.95):
+        self.client = client
+        self.max_tokens = max_tokens
+        self.temperature = temperature
+        self.top_p = top_p
+
+    @retry(
+        stop=stop_after_attempt(3),
+        wait=wait_exponential(multiplier=1, min=4, max=10),
+        retry=retry_if_exception_type((requests.exceptions.ConnectionError, requests.exceptions.Timeout))
+    )
+    def invoke(self, input, config=None):
+        # Extract the prompt from the input (ChatPromptTemplate output)
+        prompt = input.to_messages()[0].content
+        messages = [{"role": "user", "content": prompt}]
+
+        # Call the InferenceClient with streaming
+        response = ""
+        for parts in self.client.chat_completion(
+            messages,
+            max_tokens=self.max_tokens,
+            stream=True,
+            temperature=self.temperature,
+            top_p=self.top_p
+        ):
+            # Handle streaming response parts
+            for part in parts.choices:
+                token = part.delta.content
+                if token:
+                    response += token
+
+        return response
+
+    def update_params(self, max_tokens, temperature, top_p):
+        self.max_tokens = max_tokens
+        self.temperature = temperature
+        self.top_p = top_p
+
+
 def extract_pdf_text(url: str) -> str:
     response = requests.get(url)
     pdf_file = BytesIO(response.content)
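
The added code references InferenceClient, Runnable, requests, and the tenacity helpers; the corresponding imports sit above this hunk and are outside the diff context. A minimal sketch of what they would look like, assuming current huggingface_hub, langchain-core, and tenacity module layouts:

# Imports assumed by the added code above; they are not part of this diff.
# Module paths follow current huggingface_hub / langchain-core / tenacity releases.
import requests
from huggingface_hub import InferenceClient
from langchain_core.runnables import Runnable
from tenacity import (
    retry,
    stop_after_attempt,
    wait_exponential,
    retry_if_exception_type,
)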
@@ -45,13 +89,7 @@ vectorstore = Chroma.from_documents(
 )
 retriever = vectorstore.as_retriever()
 
-llm = ChatOpenAI(
-    model="gpt-3.5-turbo",
-    api_key=OPENAI_API_KEY,
-    max_tokens=512,
-    temperature=0.7,
-    top_p=0.95
-)
+llm = HuggingFaceInferenceClientRunnable(client)
 
 # After RAG chain
 after_rag_template = """You are a {role}. Summarize the following content for yourself and speak in terms of first person.
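
The one-line swap works because the wrapper's invoke() accepts the ChatPromptValue that a ChatPromptTemplate emits (it calls to_messages() on its input) and returns a plain string, so it can take ChatOpenAI's place in the chain. A hypothetical standalone probe of the wrapper, not part of the commit, assuming the app's llm instance and a valid HF token at runtime:

# Hypothetical smoke test for the wrapper; not in the commit.
# ChatPromptTemplate.invoke() yields a ChatPromptValue, whose
# to_messages()[0].content is what HuggingFaceInferenceClientRunnable reads.
from langchain_core.prompts import ChatPromptTemplate

probe = ChatPromptTemplate.from_template("You are a {role}. Say hello in one line.")
value = probe.invoke({"role": "pirate"})  # ChatPromptValue
print(llm.invoke(value))                  # returns the generated text as a str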
@@ -79,18 +117,8 @@ after_rag_chain = (
 
 def process_query(role, system_message, max_tokens, temperature, top_p):
 
-    global last_call_time
-    current_time = time.time()
+    llm.update_params(max_tokens, temperature, top_p)
 
-    if current_time - last_call_time < 60:
-        wait_time = int(60 - (current_time - last_call_time))
-        return f"Rate limit exceeded. Please wait {wait_time} seconds before trying again."
-    # llm.update_params(max_tokens, temperature, top_p)
-    last_call_time = current_time
-    llm.max_tokens = max_tokens
-    llm.temperature = temperature
-    llm.top_p = top_p
-
     # After RAG
     after_rag_result = after_rag_chain.invoke({"role": role})
 
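The commit trades the old manual 60-second rate limiter for the tenacity policy on invoke(): transient connection errors and timeouts are retried up to three times with exponential backoff (waits clamped between 4 and 10 seconds) rather than the call being rejected outright. The same policy in isolation, applied to a toy function for illustration:

# Illustration only: the retry policy from the diff on a standalone function.
import requests
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type

@retry(
    stop=stop_after_attempt(3),                          # give up after 3 attempts
    wait=wait_exponential(multiplier=1, min=4, max=10),  # backoff clamped to 4-10 s
    retry=retry_if_exception_type(
        (requests.exceptions.ConnectionError, requests.exceptions.Timeout)
    ),
)
def fetch_text(url: str) -> str:
    # Any other exception type propagates immediately and is not retried.
    return requests.get(url, timeout=10).text

One trade-off worth noting: update_params() mutates the shared llm instance per request, so if the surrounding app serves concurrent requests, parameter settings from different callers can interleave.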