File size: 4,941 Bytes
1725afa
972a93c
1086067
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1725afa
 
972a93c
1086067
7bf6ead
 
 
 
 
714b045
1725afa
1086067
 
972a93c
1086067
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a8dec22
1086067
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import os
import tempfile

import faiss
import numpy as np
import PyPDF2
import streamlit as st
from groq import Groq
from pdfminer.high_level import extract_text
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer

# --- Helper Functions ---

def extract_text_from_pdf(pdf_path):
    """Extract all text from the PDF at *pdf_path*.

    Tries PyPDF2 first; if it raises anything, warns the user in the UI and
    falls back to pdfminer.six's ``extract_text``.

    Args:
        pdf_path: Filesystem path to the PDF file.

    Returns:
        The concatenated text of every page. May be "" for image-only PDFs
        whose pages yield no extractable text.
    """
    try:
        parts = []
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            # Iterate pages directly instead of indexing by page number.
            for page in pdf_reader.pages:
                page_text = page.extract_text()
                if page_text:
                    parts.append(page_text)
        # join() avoids quadratic string concatenation on large PDFs.
        return "".join(parts)
    except Exception as e:
        # Deliberate broad catch: any PyPDF2 failure triggers the
        # best-effort pdfminer fallback, which handles some PDFs it cannot.
        st.warning(f"PyPDF2 failed with error: {e}. Trying pdfminer.six...")
        return extract_text(pdf_path)

def chunk_text_with_tokenizer(text, tokenizer, chunk_size=150, chunk_overlap=30):
    """Split *text* into overlapping, token-based chunks.

    Tokenizes the whole text, then slides a window of ``chunk_size`` tokens
    forward by ``chunk_size - chunk_overlap`` tokens at a time, so adjacent
    chunks share ``chunk_overlap`` tokens of context.

    Args:
        text: The raw text to split.
        tokenizer: Object exposing ``tokenize(str) -> list`` and
            ``convert_tokens_to_string(list) -> str`` (e.g. a HuggingFace
            tokenizer).
        chunk_size: Maximum tokens per chunk.
        chunk_overlap: Tokens shared between consecutive chunks.

    Returns:
        List of chunk strings; empty list for empty/whitespace-only input.

    Raises:
        ValueError: If ``chunk_overlap >= chunk_size`` — the window would
            never advance, which previously caused an infinite loop.
    """
    if chunk_overlap >= chunk_size:
        raise ValueError("chunk_overlap must be smaller than chunk_size")
    step = chunk_size - chunk_overlap
    tokens = tokenizer.tokenize(text)
    chunks = []
    # range() with a positive step replaces the manual while/start bookkeeping.
    for start in range(0, len(tokens), step):
        window = tokens[start:start + chunk_size]
        chunks.append(tokenizer.convert_tokens_to_string(window))
    return chunks

def retrieve_relevant_chunks(question, index, embeddings_model, text_chunks, k=3):
    """Return up to *k* text chunks most similar to *question*.

    Args:
        question: The user's query string.
        index: FAISS index built over the chunk embeddings.
        embeddings_model: Model exposing ``encode(list[str])``; must be the
            same model used to build the index.
        text_chunks: Chunk strings, in the same order the index was built.
        k: Maximum number of neighbours to retrieve.

    Returns:
        The matching chunk strings, best match first (may be fewer than *k*).
    """
    # Never ask FAISS for more neighbours than exist in the corpus.
    k = min(k, len(text_chunks))
    question_embedding = embeddings_model.encode([question])[0]
    D, I = index.search(np.array([question_embedding]), k)
    # FAISS pads missing neighbours with id -1; skip them instead of
    # accidentally indexing text_chunks from the end.
    return [text_chunks[i] for i in I[0] if i >= 0]

def generate_answer_with_groq(question, context, model_name="llama-3.3-70b-versatile"):
    """Answer *question* from *context* via the Groq chat-completions API.

    Args:
        question: The user's question.
        context: Retrieved document text to ground the answer in.
        model_name: Groq model id; default preserves the previous
            hard-coded choice, but callers may now override it.

    Returns:
        The model's answer string, or a fixed apology string if the API
        call fails (error is surfaced in the Streamlit UI, never raised).
    """
    prompt = f"Based on the following context, answer the question: '{question}'\n\nContext:\n{context}"
    try:
        # Reads GROQ_API_KEY from the environment (set by the app startup).
        groq_client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
        response = groq_client.chat.completions.create(
            model=model_name,
            messages=[{"role": "user", "content": prompt}],
        )
        return response.choices[0].message.content
    except Exception as e:
        # Best-effort: report the failure in the UI and degrade gracefully.
        st.error(f"Error generating answer with Groq API: {e}")
        return "I'm sorry, I couldn't generate an answer at this time."

# --- Streamlit UI & Logic ---

st.set_page_config(page_title="SMEHelpBot πŸ€–", layout="wide")
st.title("πŸ€– SMEHelpBot – Your AI Assistant for Small Businesses")

# GROQ API key check — secrets.toml takes precedence over the environment.
GROQ_API_KEY = st.secrets.get("GROQ_API_KEY") or os.getenv("GROQ_API_KEY")
if not GROQ_API_KEY:
    st.error("❌ Please set your GROQ_API_KEY in environment or .streamlit/secrets.toml")
    st.stop()

# Export so generate_answer_with_groq (which reads os.environ) can find it.
os.environ["GROQ_API_KEY"] = GROQ_API_KEY

# File uploader
uploaded_pdf = st.file_uploader("πŸ“ Upload PDF document(s) for SME knowledge base", type=["pdf"], accept_multiple_files=False)

# Text input for question
user_question = st.text_input("πŸ’¬ Ask your question about SME documents:")

# Button to trigger processing (also auto-runs once both inputs are present)
if st.button("Get Answer") or (user_question and uploaded_pdf):
    if not uploaded_pdf:
        st.warning("Please upload a PDF file first.")
    elif not user_question:
        st.warning("Please enter a question.")
    else:
        with st.spinner("Processing PDF and generating answer..."):
            # PyPDF2/pdfminer need a real filesystem path, so spill the
            # upload to a temp file. tempfile is portable (a hard-coded
            # /tmp path breaks on Windows) and the file is always removed.
            temp_path = None
            try:
                with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
                    tmp.write(uploaded_pdf.getbuffer())
                    temp_path = tmp.name

                # Extract text
                pdf_text = extract_text_from_pdf(temp_path)

                # Tokenizer + Chunk
                tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
                text_chunks = chunk_text_with_tokenizer(pdf_text, tokenizer)

                # Embeddings (encode returns an ndarray; [] when no chunks)
                embedding_model = SentenceTransformer('all-mpnet-base-v2')
                all_embeddings = embedding_model.encode(text_chunks) if text_chunks else []

                # NOTE: `not all_embeddings` would raise "truth value of an
                # array is ambiguous" on a non-empty ndarray — use len().
                if len(all_embeddings) == 0:
                    st.error("No text chunks found to create embeddings.")
                else:
                    # Create FAISS index
                    embedding_dim = all_embeddings[0].shape[0]
                    index = faiss.IndexFlatL2(embedding_dim)
                    index.add(np.array(all_embeddings))

                    # Retrieve relevant chunks
                    relevant_chunks = retrieve_relevant_chunks(user_question, index, embedding_model, text_chunks)
                    context = "\n\n".join(relevant_chunks)

                    # Generate answer with Groq
                    answer = generate_answer_with_groq(user_question, context)

                    # Display outputs
                    st.markdown("### Extracted Text Snippet:")
                    st.write(pdf_text[:500] + "...")

                    st.markdown("### Sample Text Chunks:")
                    for i, chunk in enumerate(text_chunks[:3]):
                        st.write(f"Chunk {i+1}: {chunk[:200]}...")

                    st.markdown("### Answer:")
                    st.success(answer)
            finally:
                # Don't leak one temp file per question asked.
                if temp_path and os.path.exists(temp_path):
                    os.remove(temp_path)