IProject-10 committed on
Commit 4fdeec5 · verified · 1 Parent(s): 3eef18c

Update app.py

Files changed (1)
  1. app.py +82 -61
app.py CHANGED
@@ -1,35 +1,55 @@
+# app.py
+
 import os
-import re
 import logging
+import re
 import requests
 import numpy as np
 import faiss
+import gradio as gr
 from bs4 import BeautifulSoup
 from sentence_transformers import SentenceTransformer
 from langchain.embeddings import HuggingFaceEmbeddings
-from langchain_community.vectorstores import FAISS as LangchainFAISS
-from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.vectorstores.faiss import FAISS
 from langchain.llms import Together
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.chains.summarize import load_summarize_chain
+from langchain.docstore.document import Document
 from langchain.chains import RetrievalQA
-import gradio as gr
 
-# Set Together.ai API key
-os.environ["TOGETHER_API_KEY"] = os.getenv("TOGETHER_API_KEY", "a36246d65d8290f43667350b364c5b6bb8562eb50a4b947eec5bd7e79f2dffc6")
+# Load your Together API key securely (recommended on HF Spaces)
+TOGETHER_API_KEY = os.environ.get("TOGETHER_API_KEY")
 
 # Logging setup
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
-# Step 1: Load and chunk webpage
+# Load models
+logger.info("🔍 Loading sentence transformer and LLM...")
+embed_model = SentenceTransformer("all-MiniLM-L6-v2")
+llm = Together(
+    model="togethercomputer/llama-3-70b-chat",
+    temperature=0.7,
+    max_tokens=512,
+    together_api_key=TOGETHER_API_KEY,
+)
+
+# Global cache
+vector_index = None
+doc_chunks = []
+doc_texts = []
+doc_embeddings = []
+
+# Helper Functions
 def fetch_webpage_text(url):
     try:
         response = requests.get(url)
         response.raise_for_status()
         soup = BeautifulSoup(response.text, "html.parser")
-        content_div = soup.find("div", {"id": "mw-content-text"}) or soup.body
-        return content_div.get_text(separator="\n", strip=True)
+        content = soup.find("div", {"id": "mw-content-text"}) or soup.body
+        return content.get_text(separator="\n", strip=True)
     except Exception as e:
-        logger.error(f"Error fetching content from {url}: {e}")
+        logger.error(f"❌ Error fetching content: {e}")
        return ""
 
 def clean_text(text):
@@ -40,75 +60,75 @@ def clean_text(text):
     return text.strip()
 
 def chunk_text(text, chunk_size=500, overlap=50):
-    cleaned = clean_text(text)
-    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap)
-    return splitter.split_text(cleaned)
-
-def load_and_chunk_webpage(url):
-    text = fetch_webpage_text(url)
-    return chunk_text(text)
-
-# Step 2: Embed chunks using SentenceTransformer
-def embed_chunks(chunks):
-    model = SentenceTransformer('all-MiniLM-L6-v2')
-    embeddings = model.encode(chunks, normalize_embeddings=True)
-    return embeddings, model
-
-# Step 3: Build FAISS index using LangChain wrapper
-def build_retriever(chunks):
-    embedding_func = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
-    db = LangchainFAISS.from_texts(chunks, embedding_func)
-    return db.as_retriever(search_type="similarity", search_kwargs={"k": 3}), db
-
-# Step 4: Initialize LLM and RAG Chain
-def initialize_llm():
-    return Together(
-        model="meta-llama/Llama-3-8b-chat-hf",
-        temperature=0.7,
-        max_tokens=512
+    splitter = RecursiveCharacterTextSplitter(
+        chunk_size=chunk_size,
+        chunk_overlap=overlap
     )
+    return splitter.split_text(text)
+
+def create_vectorstore(chunks):
+    texts = [chunk for chunk in chunks]
+    embeddings = [embed_model.encode(text) for text in texts]
+    dim = embeddings[0].shape[0]
+    index = faiss.IndexFlatL2(dim)
+    index.add(np.array(embeddings).astype(np.float32))
+    return index, texts, embeddings
+
+def get_summary(chunks):
+    full_doc = Document(page_content="\n\n".join(chunks))
+    summarize_chain = load_summarize_chain(llm, chain_type="map_reduce")
+    return summarize_chain.run([full_doc])
+
+def chat_with_bot(question):
+    if not doc_chunks or not doc_embeddings:
+        return "⚠️ Please load a webpage and summarize it first."
+
+    query_vector = embed_model.encode(question).astype(np.float32)
+    index = faiss.IndexFlatL2(doc_embeddings[0].shape[0])
+    index.add(np.array(doc_embeddings).astype(np.float32))
+    D, I = index.search(np.array([query_vector]), k=5)
+    top_chunks = [doc_texts[i] for i in I[0]]
+    rag_doc = "\n\n".join(top_chunks)
+
+    qa_chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=None)
+    return qa_chain.run(input_documents=[Document(page_content=rag_doc)], question=question)
 
-# Initialize all components
-wiki_url = "https://en.wikipedia.org/wiki/LLaMA"
-chunks = load_and_chunk_webpage(wiki_url)
-embeddings, embed_model = embed_chunks(chunks)
-retriever, db = build_retriever(chunks)
-llm = initialize_llm()
-
-qa_chain = RetrievalQA.from_chain_type(
-    llm=llm,
-    retriever=retriever,
-    chain_type="stuff"
-)
-
-# Chat logic
-def chat_with_bot(query):
-    if not query.strip():
-        return "❗ Please enter a question."
-    return qa_chain.run(query)
-
-# Summary logic
 def summarize_content():
-    sample_text = " ".join(chunks[:20])
-    prompt = f"Summarize this text in 5 bullet points:\n\n{sample_text[:3000]}"
-    summary = llm.invoke(prompt)
-    return summary.content if hasattr(summary, "content") else summary
+    if not doc_chunks:
+        return "⚠️ No content loaded yet. Please load a valid webpage."
+    return get_summary(doc_chunks)
+
+def process_webpage_and_load(url):
+    global doc_chunks, vector_index, doc_texts, doc_embeddings
+    logger.info(f"🌐 Loading URL: {url}")
+    text = fetch_webpage_text(url)
+    if not text:
+        return "❌ Failed to load or parse webpage."
+    cleaned = clean_text(text)
+    doc_chunks = chunk_text(cleaned)
+    vector_index, doc_texts, doc_embeddings = create_vectorstore(doc_chunks)
+    return "✅ Webpage content processed and ready!"
 
 # Gradio UI
 with gr.Blocks() as demo:
     gr.Markdown("## 🤖 Chat with LLaMA Webpage Content")
 
+    with gr.Row():
+        url_input = gr.Textbox(label="🌐 Webpage URL", placeholder="Enter a Wikipedia or article URL")
+        load_btn = gr.Button("Load Webpage")
+
     with gr.Row():
         chatbot = gr.Chatbot(label="Chat History")
 
     with gr.Row():
-        question = gr.Textbox(label="Ask your question about LLaMA", placeholder="e.g., Who developed LLaMA?")
+        question = gr.Textbox(label="Ask your question about the webpage", placeholder="e.g., Who developed LLaMA?")
         ask_btn = gr.Button("Submit")
         clear_btn = gr.Button("Clear Chat")
 
     summary_output = gr.Textbox(label="📋 Summary of the Webpage", lines=8)
     summarize_btn = gr.Button("Summarize Content")
 
+    # Function bindings
     def user_chat_handler(q, history):
         response = chat_with_bot(q)
         history.append((q, response))
@@ -117,5 +137,6 @@ with gr.Blocks() as demo:
     ask_btn.click(fn=user_chat_handler, inputs=[question, chatbot], outputs=[chatbot, question])
     clear_btn.click(lambda: [], None, chatbot)
     summarize_btn.click(fn=summarize_content, inputs=[], outputs=summary_output)
+    load_btn.click(fn=process_webpage_and_load, inputs=[url_input], outputs=[summary_output])
 
 demo.launch()
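
Note on the new chat_with_bot: it retrieves the top chunks with a manual FAISS search and then calls RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=None). In released LangChain versions, RetrievalQA requires a retriever and its run() accepts only the query string, so passing input_documents this way is likely to raise an error. A minimal sketch of that final step using load_qa_chain over the already-retrieved chunks (an assumption for illustration, not part of the committed code; it reuses the llm object and the top_chunks list from this commit):

from langchain.chains.question_answering import load_qa_chain
from langchain.docstore.document import Document

def answer_from_chunks(question, top_chunks):
    # "stuff" packs all retrieved chunks into a single prompt for the LLM
    docs = [Document(page_content=chunk) for chunk in top_chunks]
    qa_chain = load_qa_chain(llm, chain_type="stuff")
    return qa_chain.run(input_documents=docs, question=question)

On Hugging Face Spaces, the TOGETHER_API_KEY read by os.environ.get would typically come from a repository secret, which Spaces exposes to the app as an environment variable.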