Update app.py
app.py (CHANGED)
@@ -12,11 +12,18 @@ from langchain_community.document_loaders import PyPDFLoader
 from langchain_community.embeddings import HuggingFaceEmbeddings
 from llama_parse import LlamaParse
 from langchain_core.documents import Document
+from huggingface_hub import InferenceClient
 
 # Environment variables and configurations
 huggingface_token = os.environ.get("HUGGINGFACE_TOKEN")
 llama_cloud_api_key = os.environ.get("LLAMA_CLOUD_API_KEY")
 
+# Initialize the InferenceClient
+client = InferenceClient(
+    "meta-llama/Meta-Llama-3.1-8B-Instruct",
+    token=huggingface_token,  # Use your environment variable for the token
+)
+
 # Initialize LlamaParse
 llama_parser = LlamaParse(
     api_key=llama_cloud_api_key,
@@ -70,43 +77,32 @@ def update_vectors(files, parser):
     return f"Vector store updated successfully. Processed {total_chunks} chunks from {len(files)} files using {parser}."
 
 def generate_chunked_response(prompt, max_tokens=1000, max_chunks=5, temperature=0.3, repetition_penalty=1.1):
-    API_URL = "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3.1-8B-Instruct"
-    headers = {"Authorization": f"Bearer {huggingface_token}"}
-    payload = {
-        "inputs": prompt,
-        "parameters": {
-            "max_new_tokens": max_tokens,
-            "temperature": temperature,
-            "top_p": 0.4,
-            "top_k": 40,
-            "repetition_penalty": repetition_penalty,
-            "stop": ["</s>", "[/INST]"]
-        }
-    }
-
     full_response = ""
     for _ in range(max_chunks):
-        response =
+        response = client.chat_completion(
+            messages=[{"role": "user", "content": prompt}],
+            max_tokens=max_tokens,
+            temperature=temperature,
+            repetition_penalty=repetition_penalty,
+            stream=False,
+        )
+
+        if response and "choices" in response and len(response["choices"]) > 0:
+            chunk = response["choices"][0]["message"]["content"]
+
+            # Remove any part of the chunk that's already in full_response
+            new_content = chunk[len(full_response):].strip()
+
+            if not new_content:
+                break  # No new content, so we're done
+
+            full_response += new_content
+
+            if chunk.endswith((".", "!", "?", "</s>", "[/INST]")):
                 break
+
+            # Update the prompt for the next iteration
+            prompt = full_response
         else:
             break
 