IProject-10 committed on
Commit be79cad · verified · 1 Parent(s): c0de0f6

Update app.py

Files changed (1)
  1. app.py +51 -73
app.py CHANGED
@@ -1,6 +1,5 @@
  # app.py

- import os
  import logging
  import re
  import requests
@@ -17,44 +16,37 @@ from langchain.chains.summarize import load_summarize_chain
  from langchain.docstore.document import Document
  from langchain.chains import RetrievalQA

- # Load your Together API key securely (recommended on HF Spaces)
- TOGETHER_API_KEY = os.environ.get("TOGETHER_API_KEY")
-
  # Logging setup
  logging.basicConfig(level=logging.INFO)
  logger = logging.getLogger(__name__)

- # Load models
- logger.info("🔍 Loading sentence transformer and LLM...")
+ # Load Embedding Model
+ logger.info("🔍 Loading sentence transformer...")
  embed_model = SentenceTransformer("all-MiniLM-L6-v2")
+
+ # Load LLM (Replace with your API Key)
  llm = Together(
      model="togethercomputer/llama-3-70b-chat",
      temperature=0.7,
      max_tokens=512,
-     together_api_key=TOGETHER_API_KEY,
+     together_api_key="your_together_api_key"
  )

- # Global cache
- vector_index = None
- doc_chunks = []
- doc_texts = []
- doc_embeddings = []
-
- # Helper Functions
  def fetch_webpage_text(url):
      try:
          response = requests.get(url)
          response.raise_for_status()
          soup = BeautifulSoup(response.text, "html.parser")
-         content = soup.find("div", {"id": "mw-content-text"}) or soup.body
-         return content.get_text(separator="\n", strip=True)
+         content_div = soup.find("div", {"id": "mw-content-text"}) or soup.body
+         return content_div.get_text(separator="\n", strip=True)
      except Exception as e:
-         logger.error(f"❌ Error fetching content: {e}")
+         logger.error(f"Error fetching content: {e}")
          return ""

  def clean_text(text):
      text = re.sub(r'\[\s*\d+\s*\]', '', text)
      text = re.sub(r'\[\s*[a-zA-Z]+\s*\]', '', text)
+     text = re.sub(r'^\[\s*\d+\s*\]$', '', text, flags=re.MULTILINE)
      text = re.sub(r'\n{2,}', '\n', text)
      text = re.sub(r'[ \t]+', ' ', text)
      return text.strip()
@@ -79,63 +71,49 @@ def get_summary(chunks):
      summarize_chain = load_summarize_chain(llm, chain_type="map_reduce")
      return summarize_chain.run([full_doc])

- def chat_with_bot(question):
-     if not doc_chunks or not doc_embeddings:
-         return "⚠️ Please load a webpage and summarize it first."
-
-     query_vector = embed_model.encode(question).astype(np.float32)
-     index = faiss.IndexFlatL2(doc_embeddings[0].shape[0])
-     index.add(np.array(doc_embeddings).astype(np.float32))
+ def retrieve_answer(query, chunks, embeddings, texts):
+     query_vector = embed_model.encode(query).astype(np.float32)
+     index = faiss.IndexFlatL2(embeddings[0].shape[0])
+     index.add(np.array(embeddings).astype(np.float32))
      D, I = index.search(np.array([query_vector]), k=5)
-     top_chunks = [doc_texts[i] for i in I[0]]
-     rag_doc = "\n\n".join(top_chunks)
+     top_chunks = [texts[i] for i in I[0]]

+     rag_doc = "\n\n".join(top_chunks)
      qa_chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=None)
-     return qa_chain.run(input_documents=[Document(page_content=rag_doc)], question=question)
-
- def summarize_content():
-     if not doc_chunks:
-         return "⚠️ No content loaded yet. Please load a valid webpage."
-     return get_summary(doc_chunks)
-
- def process_webpage_and_load(url):
-     global doc_chunks, vector_index, doc_texts, doc_embeddings
-     logger.info(f"🌐 Loading URL: {url}")
-     text = fetch_webpage_text(url)
-     if not text:
-         return "❌ Failed to load or parse webpage."
-     cleaned = clean_text(text)
-     doc_chunks = chunk_text(cleaned)
-     vector_index, doc_texts, doc_embeddings = create_vectorstore(doc_chunks)
-     return "✅ Webpage content processed and ready!"
-
- # Gradio UI
- with gr.Blocks() as demo:
-     gr.Markdown("## 🤖 Chat with LLaMA Webpage Content")
-
-     with gr.Row():
-         chatbot = gr.Chatbot(label="Chat History")
-
-     with gr.Row():
-         question = gr.Textbox(
-             label="Ask your question about LLaMA",
-             placeholder="e.g., Who developed LLaMA?"
-         )
-         ask_btn = gr.Button("Submit")
-         clear_btn = gr.Button("Clear Chat")
-
-     summary_output = gr.Textbox(label="📋 Summary of the Webpage", lines=8)
-     summarize_btn = gr.Button("Summarize Content")
-
-     # Button logic
-     def user_chat_handler(q, history):
-         response = chat_with_bot(q)
-         history.append((q, response))
-         return history, ""
-
-     ask_btn.click(fn=user_chat_handler, inputs=[question, chatbot], outputs=[chatbot, question])
-     clear_btn.click(lambda: [], None, chatbot)
-     summarize_btn.click(fn=summarize_content, inputs=[], outputs=summary_output)
-
- demo.launch()
+     return qa_chain.run(input_documents=[Document(page_content=rag_doc)], question=query)
+
+ # Gradio Interface
+ def run_chatbot(url, query):
+     raw_text = fetch_webpage_text(url)
+     if not raw_text:
+         return "❌ Failed to fetch content.", ""
+
+     cleaned = clean_text(raw_text)
+     chunks = chunk_text(cleaned)
+
+     if not chunks:
+         return "❌ No valid content to process.", ""
+
+     summary = get_summary(chunks)
+     index, texts, embeddings = create_vectorstore(chunks)
+     answer = retrieve_answer(query, chunks, embeddings, texts)
+
+     return summary, answer
+
+ demo = gr.Interface(
+     fn=run_chatbot,
+     inputs=[
+         gr.Textbox(label="Webpage URL", placeholder="Enter a Wikipedia link"),
+         gr.Textbox(label="Your Question", placeholder="Ask a question about the webpage")
+     ],
+     outputs=[
+         gr.Textbox(label="Webpage Summary"),
+         gr.Textbox(label="Answer")
+     ],
+     title="🦙 LLaMA RAG Chatbot",
+     description="Enter a Wikipedia article URL and ask a question. Powered by Together AI and LangChain.",
+     allow_flagging="never"
+ )

+ if __name__ == "__main__":
+     demo.launch()
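Note: the hunks above call chunk_text(), create_vectorstore() and get_summary(), whose definitions sit in the unchanged part of app.py and are not shown in this diff. The sketch below is only an inference from the call sites (run_chatbot() unpacks create_vectorstore(chunks) into index, texts, embeddings, and retrieve_answer() treats the embeddings as float32 vectors); it is not the committed code.

from sentence_transformers import SentenceTransformer
import numpy as np
import faiss

embed_model = SentenceTransformer("all-MiniLM-L6-v2")  # same model as near the top of app.py

def chunk_text(text, chunk_size=1000, overlap=100):
    # Assumed helper: naive fixed-size character chunking with overlap.
    chunks, start = [], 0
    while start < len(text):
        chunks.append(text[start:start + chunk_size])
        start += chunk_size - overlap
    return [c for c in chunks if c.strip()]

def create_vectorstore(chunks):
    # Assumed helper: returns (index, texts, embeddings) to match the unpacking
    # in run_chatbot(); embeddings are float32 so FAISS can index them.
    embeddings = [embed_model.encode(c).astype(np.float32) for c in chunks]
    index = faiss.IndexFlatL2(embeddings[0].shape[0])
    index.add(np.array(embeddings, dtype=np.float32))
    return index, chunks, embeddings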
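A further caveat on the new retrieve_answer(): it constructs RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=None) and then calls run(input_documents=..., question=...). In the classic LangChain API, RetrievalQA expects an actual retriever and takes a single query string, so this call will likely error at runtime. If the intent is to answer over the FAISS-retrieved chunks held in memory, a plain question-answering chain is the usual fit; the helper below is an illustrative sketch (answer_from_chunks is not a name from app.py), not the committed code.

from langchain.chains.question_answering import load_qa_chain
from langchain.docstore.document import Document

def answer_from_chunks(llm, top_chunks, query):
    # Wrap the retrieved chunks as Documents and "stuff" them into the prompt
    # together with the question.
    docs = [Document(page_content=chunk) for chunk in top_chunks]
    qa_chain = load_qa_chain(llm, chain_type="stuff")
    return qa_chain.run(input_documents=docs, question=query)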