import os
import re
import logging
import requests
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS as LangchainFAISS
from langchain_community.llms import Together
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
import gradio as gr

# Note: the LangChain FAISS wrapper requires the faiss-cpu package to be
# installed, but it is not imported here directly.

# Together.ai API key: read from the environment; never hardcode secrets in source.
# Set it in your shell before launching, e.g.:
#   export TOGETHER_API_KEY="your-api-key-here"
if not os.getenv("TOGETHER_API_KEY"):
    raise RuntimeError("TOGETHER_API_KEY environment variable is not set.")

# Logging setup
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Step 1: Load and chunk the webpage
def fetch_webpage_text(url):
    """Fetch a page and return the visible text of its main content area."""
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        # Wikipedia articles keep their body in #mw-content-text; fall back to <body>
        content_div = soup.find("div", {"id": "mw-content-text"}) or soup.body
        return content_div.get_text(separator="\n", strip=True)
    except Exception as e:
        logger.error(f"Error fetching content from {url}: {e}")
        return ""

def clean_text(text):
    text = re.sub(r'\[\s*\d+\s*\]', '', text)        # numeric citation markers, e.g. [12]
    text = re.sub(r'\[\s*[a-zA-Z]+\s*\]', '', text)  # lettered markers, e.g. [a], [note]
    text = re.sub(r'\n{2,}', '\n', text)             # collapse runs of blank lines
    text = re.sub(r'[ \t]+', ' ', text)              # collapse runs of spaces/tabs
    return text.strip()
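
# Illustrative example (not executed): citation markers and extra whitespace
# are stripped, so
#   clean_text("LLaMA [ 1 ] is a family\n\n\nof models.")
# returns
#   "LLaMA is a family\nof models."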

def chunk_text(text, chunk_size=500, overlap=50):
    cleaned = clean_text(text)
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap)
    return splitter.split_text(cleaned)
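
# A quick sanity check (illustrative): the splitter keeps each chunk at or
# under chunk_size characters (unless a single unbreakable run is longer) and
# carries roughly the last `overlap` characters of one chunk into the next, so
# text cut at a boundary still appears whole in at least one chunk.
#   chunks = chunk_text(long_article)          # long_article: any long string
#   assert all(len(c) <= 500 for c in chunks)  # holds for typical prose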

def load_and_chunk_webpage(url):
    text = fetch_webpage_text(url)
    return chunk_text(text)

# Step 2: Embed chunks using SentenceTransformer (standalone demonstration;
# the retriever in Step 3 builds its own embeddings through LangChain)
def embed_chunks(chunks):
    model = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings = model.encode(chunks, normalize_embeddings=True)
    return embeddings, model
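
# With normalize_embeddings=True each vector is L2-normalized, so cosine
# similarity reduces to a plain dot product. A minimal sketch (illustrative,
# not used by the app itself):
#   embs, _ = embed_chunks(["a cat", "a kitten", "a spreadsheet"])
#   sims = embs @ embs.T   # pairwise cosine similarities; cat/kitten score highest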

# Step 3: Build FAISS index using LangChain wrapper
def build_retriever(chunks):
    embedding_func = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    db = LangchainFAISS.from_texts(chunks, embedding_func)
    return db.as_retriever(search_type="similarity", search_kwargs={"k": 3}), db
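
# The retriever can be exercised on its own before it is wired into a chain.
# A minimal sketch (illustrative):
#   retriever, db = build_retriever(chunks)
#   docs = retriever.invoke("Who developed LLaMA?")   # top-3 Document objects
#   print(docs[0].page_content)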

# Step 4: Initialize LLM and RAG Chain
def initialize_llm():
    return Together(
        model="meta-llama/Llama-3-8b-chat-hf",
        temperature=0.7,
        max_tokens=512
    )
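
# Sanity-check the LLM directly before building the chain (illustrative):
#   llm = initialize_llm()
#   print(llm.invoke("Reply with the single word: pong"))
# temperature=0.7 trades determinism for variety; max_tokens=512 caps reply length.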

# Initialize all components at import time so the app starts ready to serve
wiki_url = "https://en.wikipedia.org/wiki/LLaMA"
chunks = load_and_chunk_webpage(wiki_url)
embeddings, embed_model = embed_chunks(chunks)  # standalone demo; not used by the chain below
retriever, db = build_retriever(chunks)
llm = initialize_llm()

qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff"
)
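
# chain_type="stuff" simply concatenates ("stuffs") the k retrieved chunks into
# a single prompt ahead of the question, roughly:
#   "Use the following context to answer ...\n<chunk 1>\n<chunk 2>\n<chunk 3>\nQuestion: ..."
# This works as long as k * chunk_size stays well inside the model's context window.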

# Chat logic
def chat_with_bot(query):
    if not query.strip():
        return "❗ Please enter a question."
    # RetrievalQA returns a dict; the answer lives under the "result" key
    return qa_chain.invoke({"query": query})["result"]
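
# Example exchange (illustrative; exact wording depends on the model):
#   chat_with_bot("Who developed LLaMA?")
#   -> "LLaMA was developed by Meta AI ..."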

# Summary logic
def summarize_content():
    # Use the first 20 chunks, truncated to ~3000 characters, to stay within the prompt budget
    sample_text = " ".join(chunks[:20])
    prompt = f"Summarize this text in 5 bullet points:\n\n{sample_text[:3000]}"
    summary = llm.invoke(prompt)
    # Together's completion interface returns a plain string; the attribute check
    # also covers chat-model objects that return a message with .content
    return summary.content if hasattr(summary, "content") else summary

# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("## πŸ€– Chat with LLaMA Webpage Content")

    with gr.Row():
        chatbot = gr.Chatbot(label="Chat History")

    with gr.Row():
        question = gr.Textbox(label="Ask your question about LLaMA", placeholder="e.g., Who developed LLaMA?")
        ask_btn = gr.Button("Submit")
        clear_btn = gr.Button("Clear Chat")

    summary_output = gr.Textbox(label="πŸ“‹ Summary of the Webpage", lines=8)
    summarize_btn = gr.Button("Summarize Content")

    def user_chat_handler(q, history):
        # Append the (user, bot) turn to the history and clear the input box
        response = chat_with_bot(q)
        history.append((q, response))
        return history, ""

    ask_btn.click(fn=user_chat_handler, inputs=[question, chatbot], outputs=[chatbot, question])
    clear_btn.click(lambda: [], None, chatbot)
    summarize_btn.click(fn=summarize_content, inputs=[], outputs=summary_output)

if __name__ == "__main__":
    demo.launch()