Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,6 +1,5 @@
|
|
1 |
# app.py
|
2 |
|
3 |
-
import os
|
4 |
import logging
|
5 |
import re
|
6 |
import requests
|
@@ -17,44 +16,37 @@ from langchain.chains.summarize import load_summarize_chain
|
|
17 |
from langchain.docstore.document import Document
|
18 |
from langchain.chains import RetrievalQA
|
19 |
|
20 |
-
# Load your Together API key securely (recommended on HF Spaces)
|
21 |
-
TOGETHER_API_KEY = os.environ.get("TOGETHER_API_KEY")
|
22 |
-
|
23 |
# Logging setup
|
24 |
logging.basicConfig(level=logging.INFO)
|
25 |
logger = logging.getLogger(__name__)
|
26 |
|
27 |
-
# Load
|
28 |
-
logger.info("π Loading sentence transformer
|
29 |
embed_model = SentenceTransformer("all-MiniLM-L6-v2")
|
|
|
|
|
30 |
llm = Together(
|
31 |
model="togethercomputer/llama-3-70b-chat",
|
32 |
temperature=0.7,
|
33 |
max_tokens=512,
|
34 |
-
together_api_key=
|
35 |
)
|
36 |
|
37 |
-
# Global cache
|
38 |
-
vector_index = None
|
39 |
-
doc_chunks = []
|
40 |
-
doc_texts = []
|
41 |
-
doc_embeddings = []
|
42 |
-
|
43 |
-
# Helper Functions
|
44 |
def fetch_webpage_text(url):
|
45 |
try:
|
46 |
response = requests.get(url)
|
47 |
response.raise_for_status()
|
48 |
soup = BeautifulSoup(response.text, "html.parser")
|
49 |
-
|
50 |
-
return
|
51 |
except Exception as e:
|
52 |
-
logger.error(f"
|
53 |
return ""
|
54 |
|
55 |
def clean_text(text):
|
56 |
text = re.sub(r'\[\s*\d+\s*\]', '', text)
|
57 |
text = re.sub(r'\[\s*[a-zA-Z]+\s*\]', '', text)
|
|
|
58 |
text = re.sub(r'\n{2,}', '\n', text)
|
59 |
text = re.sub(r'[ \t]+', ' ', text)
|
60 |
return text.strip()
|
@@ -79,63 +71,49 @@ def get_summary(chunks):
|
|
79 |
summarize_chain = load_summarize_chain(llm, chain_type="map_reduce")
|
80 |
return summarize_chain.run([full_doc])
|
81 |
|
82 |
-
def
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
query_vector = embed_model.encode(question).astype(np.float32)
|
87 |
-
index = faiss.IndexFlatL2(doc_embeddings[0].shape[0])
|
88 |
-
index.add(np.array(doc_embeddings).astype(np.float32))
|
89 |
D, I = index.search(np.array([query_vector]), k=5)
|
90 |
-
top_chunks = [
|
91 |
-
rag_doc = "\n\n".join(top_chunks)
|
92 |
|
|
|
93 |
qa_chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=None)
|
94 |
-
return qa_chain.run(input_documents=[Document(page_content=rag_doc)], question=
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
if not
|
106 |
-
return "β
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
summarize_btn = gr.Button("Summarize Content")
|
129 |
-
|
130 |
-
# Button logic
|
131 |
-
def user_chat_handler(q, history):
|
132 |
-
response = chat_with_bot(q)
|
133 |
-
history.append((q, response))
|
134 |
-
return history, ""
|
135 |
-
|
136 |
-
ask_btn.click(fn=user_chat_handler, inputs=[question, chatbot], outputs=[chatbot, question])
|
137 |
-
clear_btn.click(lambda: [], None, chatbot)
|
138 |
-
summarize_btn.click(fn=summarize_content, inputs=[], outputs=summary_output)
|
139 |
-
|
140 |
-
demo.launch()
|
141 |
|
|
|
|
|
|
1 |
# app.py
|
2 |
|
|
|
3 |
import logging
import os
import re

import requests
|
|
|
16 |
from langchain.docstore.document import Document
|
17 |
from langchain.chains import RetrievalQA
|
18 |
|
|
|
|
|
|
|
19 |
# Logging setup
|
20 |
logging.basicConfig(level=logging.INFO)
|
21 |
logger = logging.getLogger(__name__)
|
22 |
|
23 |
+
# Load Embedding Model
|
24 |
+
logger.info("π Loading sentence transformer...")
|
25 |
embed_model = SentenceTransformer("all-MiniLM-L6-v2")
|
26 |
+
|
27 |
+
# Load LLM (Replace with your API Key)
|
28 |
llm = Together(
|
29 |
model="togethercomputer/llama-3-70b-chat",
|
30 |
temperature=0.7,
|
31 |
max_tokens=512,
|
32 |
+
together_api_key="your_together_api_key"
|
33 |
)
|
34 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
35 |
def fetch_webpage_text(url):
|
36 |
try:
|
37 |
response = requests.get(url)
|
38 |
response.raise_for_status()
|
39 |
soup = BeautifulSoup(response.text, "html.parser")
|
40 |
+
content_div = soup.find("div", {"id": "mw-content-text"}) or soup.body
|
41 |
+
return content_div.get_text(separator="\n", strip=True)
|
42 |
except Exception as e:
|
43 |
+
logger.error(f"Error fetching content: {e}")
|
44 |
return ""
|
45 |
|
46 |
def clean_text(text):
    """Strip citation markers and collapse whitespace in scraped page text."""
    substitutions = (
        (r'\[\s*\d+\s*\]', '', 0),               # numeric citations, e.g. [12]
        (r'\[\s*[a-zA-Z]+\s*\]', '', 0),         # alphabetic markers, e.g. [note]
        (r'^\[\s*\d+\s*\]$', '', re.MULTILINE),  # lines that are only a citation
        (r'\n{2,}', '\n', 0),                    # runs of blank lines -> one newline
        (r'[ \t]+', ' ', 0),                     # runs of spaces/tabs -> one space
    )
    # Order matters: markers are removed before whitespace is normalized.
    for pattern, replacement, flags in substitutions:
        text = re.sub(pattern, replacement, text, flags=flags)
    return text.strip()
|
|
|
71 |
summarize_chain = load_summarize_chain(llm, chain_type="map_reduce")
|
72 |
return summarize_chain.run([full_doc])
|
73 |
|
74 |
+
def retrieve_answer(query, chunks, embeddings, texts):
    """Answer `query` by retrieving the most similar chunks and running a QA chain.

    Args:
        query: user question string.
        chunks: unused here; kept for call-site compatibility.
        embeddings: non-empty list of per-chunk vectors (np arrays).
        texts: chunk texts aligned index-for-index with `embeddings`.

    Returns:
        The LLM's answer string.
    """
    query_vector = embed_model.encode(query).astype(np.float32)

    # Brute-force L2 index over the chunk vectors.
    index = faiss.IndexFlatL2(embeddings[0].shape[0])
    index.add(np.array(embeddings, dtype=np.float32))

    # Never request more neighbours than there are vectors: FAISS pads the
    # result with -1 ids, and texts[-1] would silently grab the wrong chunk.
    k = min(5, len(texts))
    D, I = index.search(np.array([query_vector]), k)
    top_chunks = [texts[i] for i in I[0] if i >= 0]

    rag_doc = "\n\n".join(top_chunks)
    # NOTE(review): RetrievalQA normally requires a real retriever; passing
    # retriever=None and feeding input_documents directly looks like it was
    # meant to be load_qa_chain(llm, chain_type="stuff") — confirm.
    qa_chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=None)
    return qa_chain.run(input_documents=[Document(page_content=rag_doc)], question=query)
|
84 |
+
|
85 |
+
# Gradio Interface
|
86 |
+
def run_chatbot(url, query):
    """Gradio handler: fetch a page, summarize it, and answer `query` about it.

    Returns a (summary, answer) pair of strings; on failure the first element
    carries the error message and the second is empty.
    """
    raw_text = fetch_webpage_text(url)
    if not raw_text:
        return "β Failed to fetch content.", ""

    chunks = chunk_text(clean_text(raw_text))
    if not chunks:
        return "β No valid content to process.", ""

    summary = get_summary(chunks)
    # NOTE(review): the index built here is discarded — retrieve_answer
    # rebuilds its own; consider threading it through instead.
    index, texts, embeddings = create_vectorstore(chunks)
    answer = retrieve_answer(query, chunks, embeddings, texts)
    return summary, answer
|
102 |
+
|
103 |
+
# Gradio UI: URL + question in, summary + answer out.
_inputs = [
    gr.Textbox(label="Webpage URL", placeholder="Enter a Wikipedia link"),
    gr.Textbox(label="Your Question", placeholder="Ask a question about the webpage"),
]
_outputs = [
    gr.Textbox(label="Webpage Summary"),
    gr.Textbox(label="Answer"),
]
demo = gr.Interface(
    fn=run_chatbot,
    inputs=_inputs,
    outputs=_outputs,
    title="π¦ LLaMA RAG Chatbot",
    description="Enter a Wikipedia article URL and ask a question. Powered by Together AI and LangChain.",
    allow_flagging="never",
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
117 |
|
118 |
+
# Launch the Gradio app only when executed as a script (not on import).
if __name__ == "__main__":
    demo.launch()
|