Spaces:

random2222
/

trykro

Sleeping

App Files Community

random2222 committed on Apr 11

Commit

0e1a332

verified ·

1 Parent(s): 0946a91

Update app.py

Browse files

Files changed (1) hide show

app.py +48 -62

app.py CHANGED Viewed

@@ -8,97 +8,83 @@ from langchain.chains import RetrievalQA
 from langchain_community.llms import HuggingFacePipeline
 from transformers import pipeline, AutoTokenizer
-# Configuration
-DOCS_FOLDER = "study_materials"
-CHUNK_SIZE = 1000
-CHUNK_OVERLAP = 150
-MODEL_NAME = "google/flan-t5-base"
-def get_documents():
-    """Load and process documents without external dependencies"""
     documents = []
-    for file in os.listdir(DOCS_FOLDER):
-        path = os.path.join(DOCS_FOLDER, file)
-        if file.endswith(".pdf"):
             loader = PyMuPDFLoader(path)
             documents.extend(loader.load())
-        elif file.endswith(".txt"):
             loader = TextLoader(path)
             documents.extend(loader.load())
     return documents
-def initialize_system():
     try:
-        # 1. Document Processing
-        docs = get_documents()
-        if not docs:
-            raise RuntimeError(f"⚠️ No documents found in {DOCS_FOLDER} folder")
-        # 2. Text Chunking
-        splitter = CharacterTextSplitter(
-            chunk_size=CHUNK_SIZE,
-            chunk_overlap=CHUNK_OVERLAP,
             separator="\n\n"
         )
-        chunks = splitter.split_documents(docs)
-        # 3. Local Embeddings
         embeddings = HuggingFaceEmbeddings(
             model_name="sentence-transformers/all-MiniLM-L6-v2"
         )
-        # 4. Vector Store
-        vector_db = FAISS.from_documents(chunks, embeddings)
-        # 5. Local LLM Setup
-        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-        text_gen = pipeline(
-            task="text2text-generation",
-            model=MODEL_NAME,
             tokenizer=tokenizer,
-            max_length=500,
-            temperature=0.4,
-            device=-1  # Force CPU
         )
-        # 6. LangChain Integration
-        llm = HuggingFacePipeline(pipeline=text_gen)
-        return RetrievalQA.from_chain_type(
             llm=llm,
-            chain_type="stuff",
-            retriever=vector_db.as_retriever(search_kwargs={"k": 3}),
             return_source_documents=True
         )
-    except Exception as error:
-        raise RuntimeError(f"Initialization failed: {str(error)}")
-# Initialize QA system
 try:
-    qa_system = initialize_system()
-except Exception as error:
-    print(f"Fatal Error: {str(error)}")
     raise
-def handle_query(query, history):
-    """Process user queries with enhanced error handling"""
     try:
-        result = qa_system.invoke({"query": query})
-        response = result["result"]
-        sources = {doc.metadata['source'] for doc in result['source_documents']}
-        return f"{response}\n\nSources: {', '.join(sources)}"
-    except Exception as error:
-        print(f"Query Error: {str(error)}")
-        return "Error processing request. Please check document formatting."
-# Create interface
 gr.ChatInterface(
-    fn=handle_query,
-    title="Local Document AI",
-    description="Upload PDF/TXT files to 'study_materials' folder and ask questions",
-    examples=[
-        "Summarize the main points from chapter 3",
-        "Explain the key concepts in section 2.1",
-        "What are the advantages discussed on page 4?"
-    ]
 ).launch()

 from langchain_community.llms import HuggingFacePipeline
 from transformers import pipeline, AutoTokenizer
+def load_documents(file_path="study_materials"):
     documents = []
+    for filename in os.listdir(file_path):
+        path = os.path.join(file_path, filename)
+        if filename.endswith(".pdf"):
             loader = PyMuPDFLoader(path)
             documents.extend(loader.load())
+        elif filename.endswith(".txt"):
             loader = TextLoader(path)
             documents.extend(loader.load())
     return documents
+def create_qa_system():
     try:
+        # Load documents
+        documents = load_documents()
+        if not documents:
+            raise ValueError("📚 No study materials found")
+        # Text splitting
+        text_splitter = CharacterTextSplitter(
+            chunk_size=800,
+            chunk_overlap=100,
             separator="\n\n"
         )
+        texts = text_splitter.split_documents(documents)
+        # Embeddings
         embeddings = HuggingFaceEmbeddings(
             model_name="sentence-transformers/all-MiniLM-L6-v2"
         )
+        # Vector store
+        db = FAISS.from_documents(texts, embeddings)
+        # LLM setup with proper LangChain wrapper
+        tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
+        pipe = pipeline(
+            "text2text-generation",
+            model="google/flan-t5-base",
             tokenizer=tokenizer,
+            max_length=300,
+            temperature=0.3,
+            device=-1
         )
+        # Wrap pipeline in LangChain component
+        llm = HuggingFacePipeline(pipeline=pipe)
+        # Create QA chain
+        return RetrievalQA.from_llm(
             llm=llm,
+            retriever=db.as_retriever(search_kwargs={"k": 2}),
             return_source_documents=True
         )
+    except Exception as e:
+        raise gr.Error(f"Error: {str(e)}")
+# Initialize system
 try:
+    qa = create_qa_system()
+except Exception as e:
+    print(f"Startup failed: {str(e)}")
     raise
+def ask_question(question, history):
     try:
+        result = qa.invoke({"query": question})
+        answer = result["result"]
+        sources = list({doc.metadata['source'] for doc in result['source_documents']})
+        return f"{answer}\n\n📚 Sources: {', '.join(sources)}"
+    except Exception as e:
+        return f"Error: {str(e)[:150]}"
 gr.ChatInterface(
+    ask_question,
+    title="Study Assistant",
+    description="Upload PDF/TXT files in 'study_materials' folder and ask questions!",
+    theme="soft"
 ).launch()