Update app.py
app.py (CHANGED)
@@ -12,11 +12,18 @@ from langchain_community.document_loaders import PyPDFLoader
 from langchain_community.embeddings import HuggingFaceEmbeddings
 from llama_parse import LlamaParse
 from langchain_core.documents import Document
+from huggingface_hub import InferenceClient
 
 # Environment variables and configurations
 huggingface_token = os.environ.get("HUGGINGFACE_TOKEN")
 llama_cloud_api_key = os.environ.get("LLAMA_CLOUD_API_KEY")
 
+# Initialize the InferenceClient
+client = InferenceClient(
+    "meta-llama/Meta-Llama-3.1-8B-Instruct",
+    token=huggingface_token,  # Use your environment variable for the token
+)
+
 # Initialize LlamaParse
 llama_parser = LlamaParse(
     api_key=llama_cloud_api_key,
@@ -70,43 +77,32 @@ def update_vectors(files, parser):
     return f"Vector store updated successfully. Processed {total_chunks} chunks from {len(files)} files using {parser}."
 
 def generate_chunked_response(prompt, max_tokens=1000, max_chunks=5, temperature=0.3, repetition_penalty=1.1):
-    API_URL = "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3.1-8B-Instruct"
-    headers = {"Authorization": f"Bearer {huggingface_token}"}
-    payload = {
-        "inputs": prompt,
-        "parameters": {
-            "max_new_tokens": max_tokens,
-            "temperature": temperature,
-            "top_p": 0.4,
-            "top_k": 40,
-            "repetition_penalty": repetition_penalty,
-            "stop": ["</s>", "[/INST]"]
-        }
-    }
-
     full_response = ""
     for _ in range(max_chunks):
-        response =
+        response = client.chat_completion(
+            messages=[{"role": "user", "content": prompt}],
+            max_tokens=max_tokens,
+            temperature=temperature,
+            repetition_penalty=repetition_penalty,
+            stream=False,
+        )
+
+        if response and "choices" in response and len(response["choices"]) > 0:
+            chunk = response["choices"][0]["message"]["content"]
+
+            # Remove any part of the chunk that's already in full_response
+            new_content = chunk[len(full_response):].strip()
+
+            if not new_content:
+                break  # No new content, so we're done
+
+            full_response += new_content
+
+            if chunk.endswith((".", "!", "?", "</s>", "[/INST]")):
                 break
+
+            # Update the prompt for the next iteration
+            prompt = full_response
         else:
             break
 