IProject-10 committed on
Commit
3eef18c
·
verified ·
1 Parent(s): bed8d9b

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +121 -0
  2. requirements.txt +9 -0
app.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import logging
4
+ import requests
5
+ import numpy as np
6
+ import faiss
7
+ from bs4 import BeautifulSoup
8
+ from sentence_transformers import SentenceTransformer
9
+ from langchain.embeddings import HuggingFaceEmbeddings
10
+ from langchain_community.vectorstores import FAISS as LangchainFAISS
11
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
12
+ from langchain.llms import Together
13
+ from langchain.chains import RetrievalQA
14
+ import gradio as gr
15
+
16
# Together.ai API key
# The key is read only from the TOGETHER_API_KEY environment variable (for
# example a deployment secret). A real credential must never be committed to
# the source as a fallback value.
if not os.getenv("TOGETHER_API_KEY"):
    logging.getLogger(__name__).warning(
        "TOGETHER_API_KEY is not set; calls to Together.ai will not be authenticated."
    )
18
+
19
+ # Logging setup
20
+ logging.basicConfig(level=logging.INFO)
21
+ logger = logging.getLogger(__name__)
22
+
23
+ # Step 1: Load and chunk webpage
24
def fetch_webpage_text(url, timeout=10):
    """Download *url* and return the text of its main content element.

    The text is taken from the ``div`` whose id is ``mw-content-text`` when
    the page has one, otherwise from the ``<body>`` element.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout the request could block indefinitely.

    Returns:
        The extracted text with fragments separated by newlines, or ``""``
        when the request fails or no content element is found.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        logger.error(f"Error fetching content from {url}: {e}")
        return ""

    soup = BeautifulSoup(response.text, "html.parser")
    content_div = soup.find("div", {"id": "mw-content-text"}) or soup.body
    if content_div is None:
        # Neither the content div nor <body> exists; keep the best-effort
        # contract of returning an empty string instead of raising.
        logger.error(f"Error fetching content from {url}: no content element found")
        return ""
    return content_div.get_text(separator="\n", strip=True)
34
+
35
def clean_text(text):
    """Strip bracketed markers and collapse redundant whitespace in *text*.

    Applied in order: remove bracketed numbers such as ``[12]``, remove
    bracketed alphabetic tags such as ``[edit]``, squeeze runs of newlines
    into one newline, squeeze runs of spaces/tabs into one space, and finally
    trim leading and trailing whitespace.
    """
    substitutions = (
        (r'\[\s*\d+\s*\]', ''),
        (r'\[\s*[a-zA-Z]+\s*\]', ''),
        (r'\n{2,}', '\n'),
        (r'[ \t]+', ' '),
    )
    result = text
    for pattern, replacement in substitutions:
        result = re.sub(pattern, replacement, result)
    return result.strip()
41
+
42
def chunk_text(text, chunk_size=500, overlap=50):
    """Clean *text* and split it into overlapping chunks.

    Args:
        text: Raw text to split; it is passed through ``clean_text`` first.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        Whatever ``RecursiveCharacterTextSplitter.split_text`` returns for the
        cleaned text.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=overlap,
    )
    return text_splitter.split_text(clean_text(text))
46
+
47
def load_and_chunk_webpage(url):
    """Fetch the page at *url* and return its text split into chunks.

    Combines ``fetch_webpage_text`` and ``chunk_text`` with their default
    settings.
    """
    return chunk_text(fetch_webpage_text(url))
50
+
51
+ # Step 2: Embed chunks using SentenceTransformer
52
def embed_chunks(chunks):
    """Encode *chunks* with the ``all-MiniLM-L6-v2`` SentenceTransformer.

    Returns:
        A ``(embeddings, model)`` pair: the output of ``model.encode`` called
        with ``normalize_embeddings=True``, and the model instance itself.
    """
    encoder = SentenceTransformer('all-MiniLM-L6-v2')
    vectors = encoder.encode(chunks, normalize_embeddings=True)
    return vectors, encoder
56
+
57
+ # Step 3: Build FAISS index using LangChain wrapper
58
def build_retriever(chunks):
    """Index *chunks* in a LangChain FAISS store and expose it as a retriever.

    Returns:
        A ``(retriever, store)`` pair. The retriever is created with
        ``search_type="similarity"`` and ``k=3``; the store is the FAISS
        vector store it was built from.
    """
    store = LangchainFAISS.from_texts(
        chunks,
        HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2"),
    )
    top_k_retriever = store.as_retriever(
        search_type="similarity",
        search_kwargs={"k": 3},
    )
    return top_k_retriever, store
62
+
63
+ # Step 4: Initialize LLM and RAG Chain
64
def initialize_llm():
    """Create the Together LLM client used for answering and summarizing."""
    llm_settings = {
        "model": "meta-llama/Llama-3-8b-chat-hf",
        "temperature": 0.7,
        "max_tokens": 512,
    }
    return Together(**llm_settings)
70
+
71
# Initialize all components
wiki_url = "https://en.wikipedia.org/wiki/LLaMA"
chunks = load_and_chunk_webpage(wiki_url)
if not chunks:
    # fetch_webpage_text returns "" when the download fails, which leaves
    # nothing to index. Stop here with a clear message instead of failing
    # later inside index construction.
    raise RuntimeError(
        f"No text could be loaded from {wiki_url}; cannot build the retriever."
    )
embeddings, embed_model = embed_chunks(chunks)
retriever, db = build_retriever(chunks)
llm = initialize_llm()

# Retrieval-augmented QA chain: retrieved chunks are passed to the LLM using
# the "stuff" chain type.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
)
82
+ )
83
+
84
+ # Chat logic
85
def chat_with_bot(query):
    """Answer *query* with the retrieval QA chain.

    Args:
        query: The user's question. ``None``, an empty string, or a string
            containing only whitespace is treated as "no question".

    Returns:
        The chain's answer, or a prompt asking the user to enter a question
        when *query* is empty.
    """
    # Check for None before calling .strip() so a missing value does not
    # raise AttributeError.
    if not query or not query.strip():
        return "❗ Please enter a question."
    return qa_chain.run(query)
89
+
90
+ # Summary logic
91
def summarize_content():
    """Ask the LLM for a five-bullet summary of the start of the page.

    The first 20 chunks are joined with spaces and truncated to 3000
    characters before being placed in the prompt.

    Returns:
        The ``content`` attribute of the LLM result when it has one,
        otherwise the result itself.
    """
    excerpt = " ".join(chunks[:20])[:3000]
    prompt = f"Summarize this text in 5 bullet points:\n\n{excerpt}"
    result = llm.invoke(prompt)
    if hasattr(result, "content"):
        return result.content
    return result
96
+
97
+ # Gradio UI
98
# NOTE(review): the source's indentation was lost, so which widgets sit inside
# each gr.Row() is inferred here — confirm against the intended layout.
with gr.Blocks() as demo:
    gr.Markdown("## 🤖 Chat with LLaMA Webpage Content")

    # Conversation display.
    with gr.Row():
        chatbot = gr.Chatbot(label="Chat History")

    # Question input and its action buttons.
    with gr.Row():
        question = gr.Textbox(
            label="Ask your question about LLaMA",
            placeholder="e.g., Who developed LLaMA?",
        )
        ask_btn = gr.Button("Submit")
        clear_btn = gr.Button("Clear Chat")

    # Summary area.
    summary_output = gr.Textbox(label="📋 Summary of the Webpage", lines=8)
    summarize_btn = gr.Button("Summarize Content")

    def user_chat_handler(q, history):
        """Append the (question, answer) pair to *history* and clear the input box."""
        answer = chat_with_bot(q)
        history.append((q, answer))
        return history, ""

    # Event wiring.
    ask_btn.click(
        fn=user_chat_handler,
        inputs=[question, chatbot],
        outputs=[chatbot, question],
    )
    clear_btn.click(lambda: [], None, chatbot)
    summarize_btn.click(fn=summarize_content, inputs=[], outputs=summary_output)

demo.launch()
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ gradio
2
+ beautifulsoup4
3
+ requests
4
+ langchain
5
+ langchain-community
6
+ huggingface-hub
7
+ sentence-transformers
8
+ faiss-cpu
9
+ together