random2222 committed
Commit 0946a91 · verified · 1 Parent(s): 5326991

Update app.py

Files changed (1)
  1. app.py +56 -54
app.py CHANGED
@@ -8,95 +8,97 @@ from langchain.chains import RetrievalQA
  from langchain_community.llms import HuggingFacePipeline
  from transformers import pipeline, AutoTokenizer
 
- def load_documents(file_path="study_materials"):
+ # Configuration
+ DOCS_FOLDER = "study_materials"
+ CHUNK_SIZE = 1000
+ CHUNK_OVERLAP = 150
+ MODEL_NAME = "google/flan-t5-base"
+
+ def get_documents():
+     """Load and process documents without external dependencies"""
      documents = []
-     for filename in os.listdir(file_path):
-         path = os.path.join(file_path, filename)
-         if filename.endswith(".pdf"):
+     for file in os.listdir(DOCS_FOLDER):
+         path = os.path.join(DOCS_FOLDER, file)
+         if file.endswith(".pdf"):
              loader = PyMuPDFLoader(path)
              documents.extend(loader.load())
-         elif filename.endswith(".txt"):
+         elif file.endswith(".txt"):
              loader = TextLoader(path)
              documents.extend(loader.load())
      return documents
 
- def create_qa_system():
+ def initialize_system():
      try:
-         # Load and process documents
-         documents = load_documents()
-         if not documents:
-             raise ValueError(" No documents found in 'study_materials' folder")
+         # 1. Document Processing
+         docs = get_documents()
+         if not docs:
+             raise RuntimeError(f"⚠️ No documents found in {DOCS_FOLDER} folder")
 
-         # Document processing
-         text_splitter = CharacterTextSplitter(
-             chunk_size=800,
-             chunk_overlap=100,
+         # 2. Text Chunking
+         splitter = CharacterTextSplitter(
+             chunk_size=CHUNK_SIZE,
+             chunk_overlap=CHUNK_OVERLAP,
              separator="\n\n"
          )
-         texts = text_splitter.split_documents(documents)
+         chunks = splitter.split_documents(docs)
 
-         # Local embeddings
+         # 3. Local Embeddings
          embeddings = HuggingFaceEmbeddings(
              model_name="sentence-transformers/all-MiniLM-L6-v2"
          )
 
-         # Create vector store
-         db = FAISS.from_documents(texts, embeddings)
+         # 4. Vector Store
+         vector_db = FAISS.from_documents(chunks, embeddings)
 
-         # Configure local LLM
-         tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
-         local_pipe = pipeline(
-             "text2text-generation",
-             model="google/flan-t5-base",
+         # 5. Local LLM Setup
+         tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+         text_gen = pipeline(
+             task="text2text-generation",
+             model=MODEL_NAME,
              tokenizer=tokenizer,
-             max_length=400,  # Increased response length
+             max_length=500,
              temperature=0.4,
              device=-1  # Force CPU
          )
 
-         # LangChain integration
-         llm = HuggingFacePipeline(pipeline=local_pipe)
+         # 6. LangChain Integration
+         llm = HuggingFacePipeline(pipeline=text_gen)
 
          return RetrievalQA.from_chain_type(
              llm=llm,
              chain_type="stuff",
-             retriever=db.as_retriever(search_kwargs={"k": 3}),
+             retriever=vector_db.as_retriever(search_kwargs={"k": 3}),
              return_source_documents=True
          )
-     except Exception as e:
-         raise gr.Error(f"Setup Error: {str(e)}")
+     except Exception as error:
+         raise RuntimeError(f"Initialization failed: {str(error)}")
 
- # Initialize system
+ # Initialize QA system
  try:
-     qa = create_qa_system()
- except Exception as e:
-     print(f"Startup Failed: {str(e)}")
+     qa_system = initialize_system()
+ except Exception as error:
+     print(f"Fatal Error: {str(error)}")
      raise
 
- def ask_question(question, history):
+ def handle_query(query, history):
+     """Process user queries with enhanced error handling"""
      try:
-         result = qa({"query": question})
-         answer = result["result"]
-
-         # Enforce minimum answer length
-         min_words = 75
-         if len(answer.split()) < min_words:
-             answer += f"\n\n[Note: This answer is shorter than {min_words} words. Consider rephrasing your question for more details.]"
-
-         # Show sources
-         sources = list({doc.metadata['source'] for doc in result['source_documents']})
-         return f"{answer}\n\n📚 Sources: {', '.join(sources)}"
-     except Exception as e:
-         return f"Error: {str(e)[:150]}"
+         result = qa_system.invoke({"query": query})
+         response = result["result"]
+         sources = {doc.metadata['source'] for doc in result['source_documents']}
+         return f"{response}\n\nSources: {', '.join(sources)}"
+     except Exception as error:
+         print(f"Query Error: {str(error)}")
+         return "Error processing request. Please check document formatting."
 
- # Launch interface
+ # Create interface
  gr.ChatInterface(
-     ask_question,
-     title="Local Study Assistant",
-     description="100% local AI - No APIs required! Upload PDF/TXT files in 'study_materials' folder",
+     fn=handle_query,
+     title="Local Document AI",
+     description="Upload PDF/TXT files to 'study_materials' folder and ask questions",
      examples=[
-         "Explain the key concepts from Chapter 4 in detail",
-         "What are the three main points made in section 2.3?",
-         "Compare and contrast the theories presented in pages 50-60"
+         "Summarize the main points from chapter 3",
+         "Explain the key concepts in section 2.1",
+         "What are the advantages discussed on page 4?"
      ]
  ).launch()
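
For reference, the hunk begins at line 8, so the import header at the top of app.py falls outside the diff; only "from langchain.chains import RetrievalQA" is visible as hunk context. Below is a minimal sketch of what that header presumably contains, inferred from the names the file uses; the exact module paths are assumptions, since these loader, embedding, and vector-store classes have moved between langchain and langchain_community across releases.

# Presumed import header, unchanged by this commit.
# Module paths are assumptions inferred from the symbols app.py uses;
# only the RetrievalQA import is confirmed by the hunk context above.
import os

import gradio as gr
from langchain.chains import RetrievalQA
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import PyMuPDFLoader, TextLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

Note also that handle_query replaces the deprecated direct-call style qa({"query": ...}) with qa_system.invoke(...), the Runnable entry point that current LangChain releases expect.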