File size: 2,960 Bytes
2d8c319
42d3ee2
 
c48b838
91b268b
42d3ee2
 
 
c48b838
42d3ee2
c48b838
42d3ee2
c48b838
 
 
 
 
 
 
 
42d3ee2
 
 
f8c1ecf
c48b838
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f8c1ecf
c48b838
4d6816c
c48b838
 
 
 
 
 
2d88065
c48b838
 
 
 
 
 
 
 
0e5b4a4
c48b838
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import os
import gradio as gr
from langchain_community.document_loaders import PyMuPDFLoader, TextLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain_community.llms import HuggingFacePipeline
from transformers import pipeline, AutoTokenizer

def load_documents(file_path="study_materials"):
    """Load all PDF and TXT study materials from *file_path*.

    Args:
        file_path: Directory scanned (non-recursively) for documents.

    Returns:
        list: LangChain ``Document`` objects from every loadable file;
        empty if the directory contains no matching files.
    """
    documents = []
    # sorted() gives a deterministic load order regardless of filesystem.
    for filename in sorted(os.listdir(file_path)):
        path = os.path.join(file_path, filename)
        # Case-insensitive match so "NOTES.TXT" / "Paper.PDF" are not skipped.
        lower = filename.lower()
        if lower.endswith(".pdf"):
            documents.extend(PyMuPDFLoader(path).load())
        elif lower.endswith(".txt"):
            documents.extend(TextLoader(path).load())
    return documents

def create_qa_system():
    """Build the full RAG question-answering pipeline.

    Loads documents from the default ``study_materials`` folder, chunks
    them, embeds them into a FAISS index, and wires a FLAN-T5 pipeline
    into a LangChain ``RetrievalQA`` chain.

    Returns:
        RetrievalQA: chain returning both the answer and source documents.

    Raises:
        gr.Error: on any failure (no materials, model download error, ...),
        chained to the original exception so the traceback is preserved.
    """
    try:
        # Load documents
        documents = load_documents()
        if not documents:
            raise ValueError("📚 No study materials found")

        # Text splitting: 1100-char chunks with 200-char overlap keeps
        # paragraph context across chunk boundaries.
        text_splitter = CharacterTextSplitter(
            chunk_size=1100,
            chunk_overlap=200,
            separator="\n\n"
        )
        texts = text_splitter.split_documents(documents)

        # Embeddings (small, CPU-friendly sentence-transformer)
        embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2"
        )

        # Vector store
        db = FAISS.from_documents(texts, embeddings)

        # LLM setup with proper LangChain wrapper; device=-1 forces CPU.
        tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")
        pipe = pipeline(
            "text2text-generation",
            model="google/flan-t5-large",
            max_length=600,
            temperature=0.7,
            tokenizer=tokenizer,
            do_sample=True,
            top_k=50,
            device=-1
        )

        # Wrap pipeline in LangChain component
        llm = HuggingFacePipeline(pipeline=pipe)

        # Create QA chain over the top-3 most similar chunks
        return RetrievalQA.from_llm(
            llm=llm,
            retriever=db.as_retriever(search_kwargs={"k": 3}),
            return_source_documents=True
        )
    except Exception as e:
        # Chain with "from e" so the original traceback is not lost.
        raise gr.Error(f"Error: {str(e)}") from e

# Initialize system once at import time; model download / indexing happens here.
try:
    qa = create_qa_system()
except Exception as e:
    # Surface the failure in the console, then re-raise so the process
    # exits instead of serving a UI with no working QA chain.
    print(f"Startup failed: {str(e)}")
    raise

def ask_question(question, history):
    """Answer a user question via the global ``qa`` chain.

    Args:
        question: The user's query string.
        history: Chat history supplied by gr.ChatInterface (unused).

    Returns:
        str: The model's answer plus a de-duplicated source list, or a
        truncated error message if the chain fails.
    """
    try:
        result = qa.invoke({"query": question})
        answer = result["result"]
        # Set comprehension de-duplicates; sorted() makes the displayed
        # source order deterministic (plain list(set) order is arbitrary).
        sources = sorted({doc.metadata['source'] for doc in result['source_documents']})
        return f"{answer}\n\n📚 Sources: {', '.join(sources)}"
    except Exception as e:
        # Truncate so a huge traceback doesn't flood the chat window.
        return f"Error: {str(e)[:150]}"

# Build the chat UI around ask_question, then start the local server.
demo = gr.ChatInterface(
    ask_question,
    title="Study Assistant",
    description="Upload PDF/TXT files in 'study_materials' folder and ask questions!",
    theme="soft"
)
demo.launch()