random2222 committed on
Commit
d640554
·
verified ·
1 Parent(s): 59d35cc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +27 -29
app.py CHANGED
@@ -5,10 +5,10 @@ from langchain_text_splitters import CharacterTextSplitter
5
  from langchain_community.vectorstores import FAISS
6
  from langchain_community.embeddings import HuggingFaceEmbeddings
7
  from langchain.chains import RetrievalQA
 
8
  from transformers import pipeline, AutoTokenizer
9
 
10
  def load_documents(file_path="study_materials"):
11
- # Supports both PDF and TXT files
12
  documents = []
13
  for filename in os.listdir(file_path):
14
  path = os.path.join(file_path, filename)
@@ -22,71 +22,69 @@ def load_documents(file_path="study_materials"):
22
 
23
  def create_qa_system():
24
  try:
25
- # 1. Load study materials
26
  documents = load_documents()
27
  if not documents:
28
- raise ValueError("📚 No PDF/TXT files found in 'study_materials' folder")
29
 
30
- # 2. Smart text splitting for educational content
31
  text_splitter = CharacterTextSplitter(
32
- chunk_size=800, # Optimized for textbook content
33
  chunk_overlap=100,
34
- separator="\n\n" # Preserve paragraph structure
35
  )
36
  texts = text_splitter.split_documents(documents)
37
 
38
- # 3. Educational-focused embeddings
39
  embeddings = HuggingFaceEmbeddings(
40
  model_name="sentence-transformers/all-MiniLM-L6-v2"
41
  )
42
 
43
- # 4. Create knowledge base
44
  db = FAISS.from_documents(texts, embeddings)
45
 
46
- # 5. Configure student-friendly AI
47
- qa_pipeline = pipeline(
 
48
  "text2text-generation",
49
  model="google/flan-t5-base",
50
- tokenizer=AutoTokenizer.from_pretrained("google/flan-t5-base"),
51
- max_length=300, # Longer answers for explanations
52
- temperature=0.3, # Balance creativity/facts
53
- device=-1 # Force CPU usage
54
  )
55
 
56
- return RetrievalQA.from_chain_type(
57
- llm=qa_pipeline,
58
- chain_type="stuff",
 
 
 
59
  retriever=db.as_retriever(search_kwargs={"k": 2}),
60
  return_source_documents=True
61
  )
62
  except Exception as e:
63
- raise gr.Error(f"🚨 Study Assistant Setup Failed: {str(e)}")
64
 
65
  # Initialize system
66
  try:
67
  qa = create_qa_system()
68
  except Exception as e:
69
- print(f"Critical Error: {str(e)}")
70
  raise
71
 
72
  def ask_question(question, history):
73
  try:
74
- result = qa({"query": question})
75
  answer = result["result"]
76
  sources = list({doc.metadata['source'] for doc in result['source_documents']})
77
  return f"{answer}\n\n📚 Sources: {', '.join(sources)}"
78
  except Exception as e:
79
- return f"❌ Error: {str(e)[:150]}"
80
 
81
- # Student-friendly interface
82
  gr.ChatInterface(
83
  ask_question,
84
- title="Study Buddy AI",
85
- description="Ask questions about your course materials!",
86
- examples=[
87
- "Explain the key points from Chapter 3",
88
- "What's the difference between mitosis and meiosis?",
89
- "List the main causes of World War II"
90
- ],
91
  theme="soft"
92
  ).launch()
 
5
  from langchain_community.vectorstores import FAISS
6
  from langchain_community.embeddings import HuggingFaceEmbeddings
7
  from langchain.chains import RetrievalQA
8
+ from langchain_community.llms import HuggingFacePipeline
9
  from transformers import pipeline, AutoTokenizer
10
 
11
  def load_documents(file_path="study_materials"):
 
12
  documents = []
13
  for filename in os.listdir(file_path):
14
  path = os.path.join(file_path, filename)
 
22
 
23
  def create_qa_system():
24
  try:
25
+ # Load documents
26
  documents = load_documents()
27
  if not documents:
28
+ raise ValueError("📚 No study materials found")
29
 
30
+ # Text splitting
31
  text_splitter = CharacterTextSplitter(
32
+ chunk_size=800,
33
  chunk_overlap=100,
34
+ separator="\n\n"
35
  )
36
  texts = text_splitter.split_documents(documents)
37
 
38
+ # Embeddings
39
  embeddings = HuggingFaceEmbeddings(
40
  model_name="sentence-transformers/all-MiniLM-L6-v2"
41
  )
42
 
43
+ # Vector store
44
  db = FAISS.from_documents(texts, embeddings)
45
 
46
+ # LLM setup with proper LangChain wrapper
47
+ tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
48
+ pipe = pipeline(
49
  "text2text-generation",
50
  model="google/flan-t5-base",
51
+ tokenizer=tokenizer,
52
+ max_length=300,
53
+ temperature=0.3,
54
+ device=-1
55
  )
56
 
57
+ # Wrap pipeline in LangChain component
58
+ llm = HuggingFacePipeline(pipeline=pipe)
59
+
60
+ # Create QA chain
61
+ return RetrievalQA.from_llm(
62
+ llm=llm,
63
  retriever=db.as_retriever(search_kwargs={"k": 2}),
64
  return_source_documents=True
65
  )
66
  except Exception as e:
67
+ raise gr.Error(f"Error: {str(e)}")
68
 
69
  # Initialize system
70
  try:
71
  qa = create_qa_system()
72
  except Exception as e:
73
+ print(f"Startup failed: {str(e)}")
74
  raise
75
 
76
  def ask_question(question, history):
77
  try:
78
+ result = qa.invoke({"query": question})
79
  answer = result["result"]
80
  sources = list({doc.metadata['source'] for doc in result['source_documents']})
81
  return f"{answer}\n\n📚 Sources: {', '.join(sources)}"
82
  except Exception as e:
83
+ return f"Error: {str(e)[:150]}"
84
 
 
85
  gr.ChatInterface(
86
  ask_question,
87
+ title="Study Assistant",
88
+ description="Upload PDF/TXT files in 'study_materials' folder and ask questions!",
 
 
 
 
 
89
  theme="soft"
90
  ).launch()