random2222 committed on
Commit
0e1a332
·
verified ·
1 Parent(s): 0946a91

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +48 -62
app.py CHANGED
@@ -8,97 +8,83 @@ from langchain.chains import RetrievalQA
8
  from langchain_community.llms import HuggingFacePipeline
9
  from transformers import pipeline, AutoTokenizer
10
 
11
- # Configuration
12
- DOCS_FOLDER = "study_materials"
13
- CHUNK_SIZE = 1000
14
- CHUNK_OVERLAP = 150
15
- MODEL_NAME = "google/flan-t5-base"
16
-
17
- def get_documents():
18
- """Load and process documents without external dependencies"""
19
  documents = []
20
- for file in os.listdir(DOCS_FOLDER):
21
- path = os.path.join(DOCS_FOLDER, file)
22
- if file.endswith(".pdf"):
23
  loader = PyMuPDFLoader(path)
24
  documents.extend(loader.load())
25
- elif file.endswith(".txt"):
26
  loader = TextLoader(path)
27
  documents.extend(loader.load())
28
  return documents
29
 
30
- def initialize_system():
31
  try:
32
- # 1. Document Processing
33
- docs = get_documents()
34
- if not docs:
35
- raise RuntimeError(f"⚠️ No documents found in {DOCS_FOLDER} folder")
36
 
37
- # 2. Text Chunking
38
- splitter = CharacterTextSplitter(
39
- chunk_size=CHUNK_SIZE,
40
- chunk_overlap=CHUNK_OVERLAP,
41
  separator="\n\n"
42
  )
43
- chunks = splitter.split_documents(docs)
44
 
45
- # 3. Local Embeddings
46
  embeddings = HuggingFaceEmbeddings(
47
  model_name="sentence-transformers/all-MiniLM-L6-v2"
48
  )
49
 
50
- # 4. Vector Store
51
- vector_db = FAISS.from_documents(chunks, embeddings)
52
 
53
- # 5. Local LLM Setup
54
- tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
55
- text_gen = pipeline(
56
- task="text2text-generation",
57
- model=MODEL_NAME,
58
  tokenizer=tokenizer,
59
- max_length=500,
60
- temperature=0.4,
61
- device=-1 # Force CPU
62
  )
63
 
64
- # 6. LangChain Integration
65
- llm = HuggingFacePipeline(pipeline=text_gen)
66
 
67
- return RetrievalQA.from_chain_type(
 
68
  llm=llm,
69
- chain_type="stuff",
70
- retriever=vector_db.as_retriever(search_kwargs={"k": 3}),
71
  return_source_documents=True
72
  )
73
- except Exception as error:
74
- raise RuntimeError(f"Initialization failed: {str(error)}")
75
 
76
- # Initialize QA system
77
  try:
78
- qa_system = initialize_system()
79
- except Exception as error:
80
- print(f"Fatal Error: {str(error)}")
81
  raise
82
 
83
- def handle_query(query, history):
84
- """Process user queries with enhanced error handling"""
85
  try:
86
- result = qa_system.invoke({"query": query})
87
- response = result["result"]
88
- sources = {doc.metadata['source'] for doc in result['source_documents']}
89
- return f"{response}\n\nSources: {', '.join(sources)}"
90
- except Exception as error:
91
- print(f"Query Error: {str(error)}")
92
- return "Error processing request. Please check document formatting."
93
 
94
- # Create interface
95
  gr.ChatInterface(
96
- fn=handle_query,
97
- title="Local Document AI",
98
- description="Upload PDF/TXT files to 'study_materials' folder and ask questions",
99
- examples=[
100
- "Summarize the main points from chapter 3",
101
- "Explain the key concepts in section 2.1",
102
- "What are the advantages discussed on page 4?"
103
- ]
104
  ).launch()
 
8
  from langchain_community.llms import HuggingFacePipeline
9
  from transformers import pipeline, AutoTokenizer
10
 
11
+ def load_documents(file_path="study_materials"):
 
 
 
 
 
 
 
12
  documents = []
13
+ for filename in os.listdir(file_path):
14
+ path = os.path.join(file_path, filename)
15
+ if filename.endswith(".pdf"):
16
  loader = PyMuPDFLoader(path)
17
  documents.extend(loader.load())
18
+ elif filename.endswith(".txt"):
19
  loader = TextLoader(path)
20
  documents.extend(loader.load())
21
  return documents
22
 
23
+ def create_qa_system():
24
  try:
25
+ # Load documents
26
+ documents = load_documents()
27
+ if not documents:
28
+ raise ValueError("📚 No study materials found")
29
 
30
+ # Text splitting
31
+ text_splitter = CharacterTextSplitter(
32
+ chunk_size=800,
33
+ chunk_overlap=100,
34
  separator="\n\n"
35
  )
36
+ texts = text_splitter.split_documents(documents)
37
 
38
+ # Embeddings
39
  embeddings = HuggingFaceEmbeddings(
40
  model_name="sentence-transformers/all-MiniLM-L6-v2"
41
  )
42
 
43
+ # Vector store
44
+ db = FAISS.from_documents(texts, embeddings)
45
 
46
+ # LLM setup with proper LangChain wrapper
47
+ tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
48
+ pipe = pipeline(
49
+ "text2text-generation",
50
+ model="google/flan-t5-base",
51
  tokenizer=tokenizer,
52
+ max_length=300,
53
+ temperature=0.3,
54
+ device=-1
55
  )
56
 
57
+ # Wrap pipeline in LangChain component
58
+ llm = HuggingFacePipeline(pipeline=pipe)
59
 
60
+ # Create QA chain
61
+ return RetrievalQA.from_llm(
62
  llm=llm,
63
+ retriever=db.as_retriever(search_kwargs={"k": 2}),
 
64
  return_source_documents=True
65
  )
66
+ except Exception as e:
67
+ raise gr.Error(f"Error: {str(e)}")
68
 
69
+ # Initialize system
70
  try:
71
+ qa = create_qa_system()
72
+ except Exception as e:
73
+ print(f"Startup failed: {str(e)}")
74
  raise
75
 
76
+ def ask_question(question, history):
 
77
  try:
78
+ result = qa.invoke({"query": question})
79
+ answer = result["result"]
80
+ sources = list({doc.metadata['source'] for doc in result['source_documents']})
81
+ return f"{answer}\n\n📚 Sources: {', '.join(sources)}"
82
+ except Exception as e:
83
+ return f"Error: {str(e)[:150]}"
 
84
 
 
85
  gr.ChatInterface(
86
+ ask_question,
87
+ title="Study Assistant",
88
+ description="Upload PDF/TXT files in 'study_materials' folder and ask questions!",
89
+ theme="soft"
 
 
 
 
90
  ).launch()