Spaces:

sunbal7
/

AISmartBookAnalysisSystem

Sleeping

App Files Community

sunbal7 committed on Jul 24

Commit

3f00b29

verified ·

1 Parent(s): 66e139c

Update app.py

Browse files

Files changed (1) hide show

app.py +130 -70

app.py CHANGED Viewed

@@ -1,38 +1,51 @@
 import streamlit as st
-st.set_page_config(page_title="RAG Book Analyzer", layout="wide")  # Must be the first Streamlit command
 import torch
 import numpy as np
 import faiss
 from transformers import AutoModelForCausalLM, AutoTokenizer
 from sentence_transformers import SentenceTransformer
-import fitz  # PyMuPDF for PDF extraction
-import docx2txt  # For DOCX extraction
 from langchain_text_splitters import RecursiveCharacterTextSplitter
 # ------------------------
-# Configuration
 # ------------------------
-MODEL_NAME = "microsoft/phi-2"  # Open-source model with good performance
-EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"  # Smaller embedding model
 CHUNK_SIZE = 512
 CHUNK_OVERLAP = 64
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 # ------------------------
-# Model Loading with Caching
 # ------------------------
-@st.cache_resource
 def load_models():
     try:
-        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
         model = AutoModelForCausalLM.from_pretrained(
             MODEL_NAME,
-            device_map="auto" if DEVICE == "cuda" else None,
             torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
-            trust_remote_code=True
         )
         embedder = SentenceTransformer(EMBED_MODEL, device=DEVICE)
         return tokenizer, model, embedder
     except Exception as e:
         st.error(f"Model loading failed: {str(e)}")
@@ -52,24 +65,19 @@ def split_text(text):
     return splitter.split_text(text)
 def extract_text(file):
-    file_type = file.type
-    if file_type == "application/pdf":
-        try:
             doc = fitz.open(stream=file.read(), filetype="pdf")
             return "\n".join([page.get_text() for page in doc])
-        except Exception as e:
-            st.error("Error processing PDF: " + str(e))
-            return ""
-    elif file_type == "text/plain":
-        return file.read().decode("utf-8")
-    elif file_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
-        try:
             return docx2txt.process(file)
-        except Exception as e:
-            st.error("Error processing DOCX: " + str(e))
             return ""
-    else:
-        st.error("Unsupported file type: " + file_type)
         return ""
 def build_index(chunks):
@@ -80,74 +88,126 @@ def build_index(chunks):
     return index
 # ------------------------
-# Summarization and Q&A Functions
 # ------------------------
 def generate_summary(text):
-    # Create prompt for Phi-2 model
-    prompt = f"Instruct: Summarize this book in a concise paragraph\nInput: {text[:3000]}\nOutput:"
-    inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)
     outputs = model.generate(
         **inputs,
-        max_new_tokens=300,
         temperature=0.7,
         top_p=0.9,
-        do_sample=True
     )
-    summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
-    return summary.split("Output:")[-1].strip()
 def generate_answer(query, context):
-    # Create prompt for Phi-2 model
-    prompt = f"Instruct: Answer this question based on the context. If unsure, say 'I don't know'.\nQuestion: {query}\nContext: {context}\nOutput:"
-    inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)
     outputs = model.generate(
         **inputs,
-        max_new_tokens=300,
-        temperature=0.5,
-        top_p=0.9,
-        repetition_penalty=1.2,
-        do_sample=True
     )
-    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
-    return answer.split("Output:")[-1].strip()
 # ------------------------
 # Streamlit UI
 # ------------------------
 st.title("📚 RAG-Based Book Analyzer")
 st.write("Upload a book (PDF, TXT, DOCX) to get a summary and ask questions about its content.")
 uploaded_file = st.file_uploader("Upload File", type=["pdf", "txt", "docx"])
 if uploaded_file:
-    text = extract_text(uploaded_file)
-    if text:
-        st.success("✅ File successfully processed!")
-        with st.spinner("Generating summary..."):
-            summary = generate_summary(text)
-            st.markdown("### Book Summary")
-            st.info(summary)
-        # Process text into chunks and build FAISS index
         chunks = split_text(text)
         index = build_index(chunks)
         st.session_state.chunks = chunks
         st.session_state.index = index
-        st.markdown("### ❓ Ask a Question about the Book")
-        query = st.text_input("Enter your question:")
-        if query:
-            with st.spinner("Searching for answers..."):
-                # Retrieve top 3 relevant chunks as context
-                query_embedding = embedder.encode([query])
-                distances, indices = st.session_state.index.search(query_embedding, k=3)
-                retrieved_chunks = [st.session_state.chunks[i] for i in indices[0] if i < len(st.session_state.chunks)]
-                context = "\n\n".join(retrieved_chunks)
-                answer = generate_answer(query, context)
-                st.markdown("### 💬 Answer")
-                st.success(answer)
-                with st.expander("See context used"):
-                    st.write(context)

 import streamlit as st
+st.set_page_config(page_title="RAG Book Analyzer", layout="wide")
 import torch
 import numpy as np
 import faiss
 from transformers import AutoModelForCausalLM, AutoTokenizer
 from sentence_transformers import SentenceTransformer
+import fitz  # PyMuPDF
+import docx2txt
 from langchain_text_splitters import RecursiveCharacterTextSplitter
 # ------------------------
+# Configuration (optimized for reliability)
 # ------------------------
+MODEL_NAME = "microsoft/phi-2"
+EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"  # Efficient embedding model
 CHUNK_SIZE = 512
 CHUNK_OVERLAP = 64
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+MAX_TEXT_LENGTH = 3000  # To prevent OOM errors
 # ------------------------
+# Model Loading with Robust Error Handling
 # ------------------------
+@st.cache_resource(show_spinner="Loading AI models...")
 def load_models():
     try:
+        # Load tokenizer with special settings for Phi-2
+        tokenizer = AutoTokenizer.from_pretrained(
+            MODEL_NAME,
+            trust_remote_code=True,
+            padding_side="left"
+        )
+        tokenizer.pad_token = tokenizer.eos_token
+        # Load model with safe defaults
         model = AutoModelForCausalLM.from_pretrained(
             MODEL_NAME,
             torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
+            trust_remote_code=True,
+            device_map="auto" if DEVICE == "cuda" else None,
+            low_cpu_mem_usage=True
         )
+        # Load efficient embedding model
         embedder = SentenceTransformer(EMBED_MODEL, device=DEVICE)
         return tokenizer, model, embedder
     except Exception as e:
         st.error(f"Model loading failed: {str(e)}")
     return splitter.split_text(text)
 def extract_text(file):
+    try:
+        if file.type == "application/pdf":
             doc = fitz.open(stream=file.read(), filetype="pdf")
             return "\n".join([page.get_text() for page in doc])
+        elif file.type == "text/plain":
+            return file.read().decode("utf-8")
+        elif file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
             return docx2txt.process(file)
+        else:
+            st.error(f"Unsupported file type: {file.type}")
             return ""
+    except Exception as e:
+        st.error(f"Error processing file: {str(e)}")
         return ""
 def build_index(chunks):
     return index
 # ------------------------
+# AI Generation Functions (with safeguards)
 # ------------------------
 def generate_summary(text):
+    text = text[:MAX_TEXT_LENGTH]  # Prevent long inputs
+    prompt = f"Instruction: Summarize this book in a concise paragraph\nText: {text}\nSummary:"
+    inputs = tokenizer(
+        prompt,
+        return_tensors="pt",
+        max_length=1024,
+        truncation=True
+    ).to(DEVICE)
     outputs = model.generate(
         **inputs,
+        max_new_tokens=200,
         temperature=0.7,
         top_p=0.9,
+        do_sample=True,
+        pad_token_id=tokenizer.eos_token_id
     )
+    summary = tokenizer.decode(
+        outputs[0],
+        skip_special_tokens=True
+    )
+    # Extract just the summary part
+    if "Summary:" in summary:
+        return summary.split("Summary:")[-1].strip()
+    return summary.replace(prompt, "").strip()
 def generate_answer(query, context):
+    context = context[:MAX_TEXT_LENGTH]  # Limit context size
+    prompt = f"Instruction: Answer this question based on the context. If unsure, say 'I don't know'.\nQuestion: {query}\nContext: {context}\nAnswer:"
+    inputs = tokenizer(
+        prompt,
+        return_tensors="pt",
+        max_length=1024,
+        truncation=True
+    ).to(DEVICE)
     outputs = model.generate(
         **inputs,
+        max_new_tokens=150,
+        temperature=0.4,
+        top_p=0.85,
+        repetition_penalty=1.1,
+        do_sample=True,
+        pad_token_id=tokenizer.eos_token_id
+    )
+    answer = tokenizer.decode(
+        outputs[0],
+        skip_special_tokens=True
     )
+    # Extract just the answer part
+    if "Answer:" in answer:
+        return answer.split("Answer:")[-1].strip()
+    return answer.replace(prompt, "").strip()
 # ------------------------
 # Streamlit UI
 # ------------------------
 st.title("📚 RAG-Based Book Analyzer")
 st.write("Upload a book (PDF, TXT, DOCX) to get a summary and ask questions about its content.")
+st.warning("Note: First run will download models (~1.5GB). Please be patient!")
 uploaded_file = st.file_uploader("Upload File", type=["pdf", "txt", "docx"])
 if uploaded_file:
+    with st.spinner("Extracting text from file..."):
+        text = extract_text(uploaded_file)
+    if not text:
+        st.error("Failed to extract text. Please try another file.")
+        st.stop()
+    st.success(f"✅ Extracted {len(text)} characters")
+    with st.spinner("Generating summary (this may take a minute)..."):
+        summary = generate_summary(text)
+        st.markdown("### Book Summary")
+        st.info(summary)
+    with st.spinner("Preparing document for questions..."):
         chunks = split_text(text)
         index = build_index(chunks)
         st.session_state.chunks = chunks
         st.session_state.index = index
+        st.success(f"✅ Document indexed with {len(chunks)} chunks")
+st.divider()
+if 'chunks' in st.session_state:
+    st.markdown("### ❓ Ask a Question about the Book")
+    query = st.text_input("Enter your question:", key="question")
+    if query:
+        with st.spinner("Searching for answers..."):
+            # Retrieve top 3 relevant chunks
+            query_embedding = embedder.encode([query])
+            distances, indices = st.session_state.index.search(query_embedding, k=3)
+            # Safely retrieve chunks
+            retrieved_chunks = []
+            for i in indices[0]:
+                if i < len(st.session_state.chunks):
+                    retrieved_chunks.append(st.session_state.chunks[i])
+            context = "\n\n".join(retrieved_chunks)
+            # Generate answer
+            answer = generate_answer(query, context)
+            # Display results
+            st.markdown("### 💬 Answer")
+            st.success(answer)
+            with st.expander("View context used for answer"):
+                st.text(context)