random2222 committed on
Commit
62390c0
·
verified ·
1 Parent(s): d168db4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +36 -68
app.py CHANGED
@@ -1,4 +1,3 @@
1
-
2
  import os
3
  import gradio as gr
4
  from langchain_community.vectorstores import FAISS
@@ -6,86 +5,55 @@ from langchain_community.embeddings import HuggingFaceEmbeddings
6
  from langchain_community.document_loaders import PyMuPDFLoader
7
  from langchain.text_splitter import CharacterTextSplitter
8
  from langchain.chains import RetrievalQA
9
- from langchain_community.llms import HuggingFaceHub # Updated import path
10
- import zipfile
11
 
12
- # Rest of your existing code remains the same...
 
13
 
14
- # Extract PDFs from zip file
15
- def extract_pdfs_from_zip(zip_path="data.zip", extract_to="data"):
16
- if not os.path.exists(zip_path):
17
- raise FileNotFoundError(f"Zip file '{zip_path}' not found.")
 
18
 
19
- if not os.path.exists(extract_to):
20
- os.makedirs(extract_to)
 
21
 
22
- with zipfile.ZipFile(zip_path, 'r') as zip_ref:
23
- zip_ref.extractall(extract_to)
24
-
25
- def load_pdfs(directory="data"):
26
- if not os.path.exists(directory):
27
- raise FileNotFoundError(f"The directory '{directory}' does not exist.")
28
 
29
- raw_documents = []
30
- for filename in os.listdir(directory):
31
- if filename.endswith(".pdf"):
32
- loader = PyMuPDFLoader(os.path.join(directory, filename))
33
- docs = loader.load()
34
- raw_documents.extend(docs)
35
- return raw_documents
36
-
37
- def split_documents(documents):
38
- text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
39
- return text_splitter.split_documents(documents)
40
-
41
- def initialize_qa_system():
42
- print("📦 Extracting PDFs from zip...")
43
- extract_pdfs_from_zip()
44
-
45
- print("🔄 Loading PDFs...")
46
- raw_docs = load_pdfs()
47
- print(f"✅ Loaded {len(raw_docs)} raw documents.")
48
-
49
- if len(raw_docs) == 0:
50
- raise ValueError("No PDF documents found in the 'data' directory.")
51
-
52
- print("🪓 Splitting documents into chunks...")
53
- docs = split_documents(raw_docs)
54
- print(f"✅ Split into {len(docs)} chunks.")
55
-
56
- print("🧠 Generating embeddings...")
57
  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
58
-
59
- print("📦 Creating FAISS vector store...")
60
- db = FAISS.from_documents(docs, embeddings)
61
- print("✅ Vector store created successfully!")
62
-
63
- print("🤖 Initializing LLM...")
64
  llm = HuggingFaceHub(
65
- repo_id="google/flan-t5-xxl",
66
- model_kwargs={"temperature": 0.5, "max_length": 512}
67
  )
68
 
69
- qa = RetrievalQA.from_chain_type(
70
  llm=llm,
71
  chain_type="stuff",
72
- retriever=db.as_retriever(search_kwargs={"k": 3})
73
  )
74
- return qa
75
 
76
- # Initialize the QA system
77
- qa_system = initialize_qa_system()
78
 
79
- def chat_response(message, history):
80
- response = qa_system({"query": message})
 
81
  return response["result"]
82
 
83
- # Create Gradio interface
84
- demo = gr.ChatInterface(
85
- fn=chat_response,
86
- title="PDF Knowledge Chatbot",
87
- description="Ask questions about the content in your PDF documents"
88
- )
89
-
90
- if __name__ == "__main__":
91
- demo.launch()
 
 
1
  import os
2
  import gradio as gr
3
  from langchain_community.vectorstores import FAISS
 
5
  from langchain_community.document_loaders import PyMuPDFLoader
6
  from langchain.text_splitter import CharacterTextSplitter
7
  from langchain.chains import RetrievalQA
8
+ from langchain_community.llms import HuggingFaceHub
9
+ from huggingface_hub import login
10
 
11
+ # 1. Authentication (MUST HAVE)
12
+ login(token=os.environ.get('HF_TOKEN'))
13
 
14
+ # 2. PDF Processing Function
15
+ def create_qa_system():
16
+ # File check
17
+ if not os.path.exists("data.pdf"):
18
+ raise gr.Error("❌ data.pdf not found! Upload it in Space's Files tab")
19
 
20
+ # Load PDF
21
+ loader = PyMuPDFLoader("data.pdf")
22
+ documents = loader.load()
23
 
24
+ # Split text
25
+ text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
26
+ texts = text_splitter.split_documents(documents)
 
 
 
27
 
28
+ # Create embeddings
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
30
+
31
+ # Build vector store
32
+ db = FAISS.from_documents(texts, embeddings)
33
+
34
+ # Initialize LLM (Free-tier compatible)
 
35
  llm = HuggingFaceHub(
36
+ repo_id="google/flan-t5-base", # Changed to smaller model
37
+ model_kwargs={"temperature": 0.2, "max_length": 256}
38
  )
39
 
40
+ return RetrievalQA.from_chain_type(
41
  llm=llm,
42
  chain_type="stuff",
43
+ retriever=db.as_retriever(search_kwargs={"k": 2})
44
  )
 
45
 
46
+ # 3. Initialize system
47
+ qa = create_qa_system()
48
 
49
+ # 4. Chat interface
50
+ def chat(message, history):
51
+ response = qa({"query": message})
52
  return response["result"]
53
 
54
+ # 5. Launch Gradio
55
+ gr.ChatInterface(
56
+ chat,
57
+ title="PDF Chatbot",
58
+ description="Upload your PDF in Files tab ➡️ Ask questions!",
59
+ ).launch()