Shreyas094 committed
Commit ee9e2d5 · verified · 1 Parent(s): 4afe293

Update app.py

Files changed (1):
  1. app.py +30 -34
app.py CHANGED
@@ -12,11 +12,18 @@ from langchain_community.document_loaders import PyPDFLoader
 from langchain_community.embeddings import HuggingFaceEmbeddings
 from llama_parse import LlamaParse
 from langchain_core.documents import Document
+from huggingface_hub import InferenceClient
 
 # Environment variables and configurations
 huggingface_token = os.environ.get("HUGGINGFACE_TOKEN")
 llama_cloud_api_key = os.environ.get("LLAMA_CLOUD_API_KEY")
 
+# Initialize the InferenceClient
+client = InferenceClient(
+    "meta-llama/Meta-Llama-3.1-8B-Instruct",
+    token=huggingface_token,  # Use your environment variable for the token
+)
+
 # Initialize LlamaParse
 llama_parser = LlamaParse(
     api_key=llama_cloud_api_key,
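
For reference, a minimal self-contained sketch of the new client call path, assuming huggingface_hub is installed and HUGGINGFACE_TOKEN is set (the prompt string is illustrative, not from the commit):

    import os
    from huggingface_hub import InferenceClient

    # Same model and token setup as the hunk above.
    client = InferenceClient(
        "meta-llama/Meta-Llama-3.1-8B-Instruct",
        token=os.environ.get("HUGGINGFACE_TOKEN"),
    )

    # chat_completion returns a ChatCompletionOutput; the generated text
    # is available at choices[0].message.content.
    response = client.chat_completion(
        messages=[{"role": "user", "content": "Say hello in one sentence."}],
        max_tokens=32,
        temperature=0.3,
    )
    print(response.choices[0].message.content)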
@@ -70,43 +77,32 @@ def update_vectors(files, parser):
     return f"Vector store updated successfully. Processed {total_chunks} chunks from {len(files)} files using {parser}."
 
 def generate_chunked_response(prompt, max_tokens=1000, max_chunks=5, temperature=0.3, repetition_penalty=1.1):
-    API_URL = "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3.1-8B-Instruct"
-    headers = {"Authorization": f"Bearer {huggingface_token}"}
-    payload = {
-        "inputs": prompt,
-        "parameters": {
-            "max_new_tokens": max_tokens,
-            "temperature": temperature,
-            "top_p": 0.4,
-            "top_k": 40,
-            "repetition_penalty": repetition_penalty,
-            "stop": ["</s>", "[/INST]"]
-        }
-    }
-
     full_response = ""
     for _ in range(max_chunks):
-        response = requests.post(API_URL, headers=headers, json=payload)
-        if response.status_code == 200:
-            result = response.json()
-            if isinstance(result, list) and len(result) > 0:
-                chunk = result[0].get('generated_text', '')
-
-                # Remove any part of the chunk that's already in full_response
-                new_content = chunk[len(full_response):].strip()
-
-                if not new_content:
-                    break  # No new content, so we're done
-
-                full_response += new_content
-
-                if chunk.endswith((".", "!", "?", "</s>", "[/INST]")):
-                    break
-
-                # Update the prompt for the next iteration
-                payload["inputs"] = full_response
-            else:
-                break
+        response = client.chat_completion(
+            messages=[{"role": "user", "content": prompt}],
+            max_tokens=max_tokens,
+            temperature=temperature,
+            repetition_penalty=repetition_penalty,
+            stream=False,
+        )
+
+        if response and "choices" in response and len(response["choices"]) > 0:
+            chunk = response["choices"][0]["message"]["content"]
+
+            # Remove any part of the chunk that's already in full_response
+            new_content = chunk[len(full_response):].strip()
+
+            if not new_content:
+                break  # No new content, so we're done
+
+            full_response += new_content
+
+            if chunk.endswith((".", "!", "?", "</s>", "[/INST]")):
+                break
+
+            # Update the prompt for the next iteration
+            prompt = full_response
         else:
             break
 
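
The rewritten loop continues a truncated generation by feeding the accumulated full_response back in as the next prompt, and stops once a chunk ends with sentence-final punctuation or a stop marker. A rough usage sketch of the updated function (the prompt is illustrative and relies on the module-level client in app.py being able to authenticate):

    # Hypothetical call; assumes HUGGINGFACE_TOKEN is set in the environment.
    answer = generate_chunked_response(
        "Explain what a vector store is.",
        max_tokens=500,
        max_chunks=3,
    )
    print(answer)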