File size: 2,554 Bytes
6a8f952
 
 
da4f565
 
 
3cfed0b
 
6a8f952
da4f565
4c284da
6a8f952
da4f565
6a8f952
 
 
 
 
 
da4f565
3cfed0b
 
da4f565
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3cfed0b
da4f565
 
3cfed0b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
da4f565
 
3cfed0b
da4f565
 
 
 
 
 
 
 
3cfed0b
da4f565
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import os
import streamlit as st
from groq import Groq
from PyPDF2 import PdfReader
from docx import Document
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Build the shared Groq client; the API key is read from the "Groq_Api"
# environment variable (None is passed through when the variable is unset).
groq_api_key = os.getenv("Groq_Api")
client = Groq(api_key=groq_api_key)

# Title with Book Icon
# The icon is written as a Unicode escape (U+1F4D6, OPEN BOOK) so the source
# stays pure ASCII and the emoji cannot be corrupted by a wrong file encoding.
st.title("\U0001F4D6 A&Q From a File")

# File Upload: only PDF and DOCX files are offered to the user.
uploaded_file = st.file_uploader("Upload a PDF or DOCX file", type=["pdf", "docx"])

if uploaded_file:
    st.write(f"**File Name:** {uploaded_file.name}")  # Display file name

    # Extract Text
    def extract_text(file) -> str:
        """Return the plain text of an uploaded PDF or DOCX file.

        Args:
            file: File-like object exposing a ``name`` attribute; it is
                handed to ``PdfReader`` or ``Document`` depending on the
                extension of that name.

        Returns:
            For ``.pdf``: the text of every page that yields text, joined
            by newlines. For ``.docx``: the text of every paragraph, joined
            by newlines. For any other extension: an empty string.
        """
        # Compare case-insensitively so an upper-case extension such as
        # "REPORT.PDF" is recognized instead of falling through to "".
        filename = file.name.lower()

        if filename.endswith(".pdf"):
            # Extract each page exactly once; pages without text are skipped.
            page_texts = (page.extract_text() for page in PdfReader(file).pages)
            return "\n".join(text for text in page_texts if text)

        if filename.endswith(".docx"):
            return "\n".join(para.text for para in Document(file).paragraphs)

        return ""

    @st.cache_resource
    def _load_embedding_model():
        """Load the sentence-embedding model once and reuse it across reruns.

        Streamlit re-executes the script on every interaction; caching the
        model avoids reloading it for each question.
        """
        return SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

    file_text = extract_text(uploaded_file)

    # Whitespace-only text (e.g. a DOCX consisting of empty paragraphs)
    # contains nothing to search, so treat it like a failed extraction.
    if file_text.strip():
        st.success("File uploaded and text extracted successfully!")
        st.write("Ask a question about the file:")
        query = st.text_input("Enter your question")

        if query:
            model = _load_embedding_model()

            # Chunk & Embed Text: fixed-size, non-overlapping character windows.
            chunk_size = 512
            chunks = [file_text[i:i + chunk_size] for i in range(0, len(file_text), chunk_size)]
            embeddings = model.encode(chunks, convert_to_numpy=True)

            # Build FAISS Index for Fast Retrieval (exact L2 search).
            index = faiss.IndexFlatL2(embeddings.shape[1])
            index.add(embeddings)

            # Query Embedding
            query_embedding = model.encode([query], convert_to_numpy=True)

            # Never request more neighbours than there are chunks: FAISS pads
            # missing results with the label -1, and chunks[-1] would then
            # silently repeat the last chunk in the prompt.
            top_k = min(3, len(chunks))
            _, retrieved_idx = index.search(query_embedding, k=top_k)

            # Retrieve the most relevant chunks (up to 3), skipping any
            # -1 padding labels defensively.
            relevant_text = " ".join(chunks[i] for i in retrieved_idx[0] if i >= 0)

            # Query Groq API with relevant chunks only
            # NOTE(review): the prompt places the question right after
            # "this document:" and the excerpts after it; kept unchanged here,
            # but consider labelling question and context explicitly.
            chat_completion = client.chat.completions.create(
                messages=[
                    {"role": "user", "content": f"Answer based on this document: {query}\n\n{relevant_text}"},
                ],
                model="llama-3.3-70b-versatile",
            )

            # Display Answer
            answer = chat_completion.choices[0].message.content
            st.subheader("Answer:")
            st.write(answer)

    else:
        st.error("Failed to extract text from the file. Please check the format.")