Spaces:

priyanshu23456
/

pdfassistant

Sleeping

App Files Files Community

priyanshu23456 committed on Apr 11

Commit

f800f49

verified ·

1 Parent(s): c9b3650

Update app.py

Browse files

Files changed (1) hide show

app.py +84 -27

app.py CHANGED Viewed

@@ -10,6 +10,8 @@ from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
 from sentence_transformers import SentenceTransformer
 import faiss
 import numpy as np
 # Fix caching issue on Hugging Face Spaces
 os.environ["TRANSFORMERS_CACHE"] = "/tmp"
@@ -24,22 +26,63 @@ os.makedirs(UPLOAD_FOLDER, exist_ok=True)
 device = "cuda" if torch.cuda.is_available() else "cpu"
-# ✅ OCR for scanned PDFs
 def ocr_pdf(pdf_path):
-    images = convert_from_path(pdf_path)
-    text = ""
-    for img in images:
-        text += pytesseract.image_to_string(img)
-    return text
-# ✅ Extract text
 def extract_text(pdf_path):
     doc = fitz.open(pdf_path)
     text = ""
     for page in doc:
-        text += page.get_text()
-    if len(text.strip()) < 50:
-        text = ocr_pdf(pdf_path)
     return text
 # ✅ Split into chunks
@@ -85,10 +128,17 @@ def answer_with_qa_pipeline(chunks, question):
     except:
         return ""
-# ✅ Generation fallback
 def answer_with_generation(index, embeddings, chunks, question):
     tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
-    model = AutoModelForCausalLM.from_pretrained("distilgpt2").to(device)
     if tokenizer.pad_token is None:
         tokenizer.pad_token = tokenizer.eos_token
         model.config.pad_token_id = model.config.eos_token_id
@@ -100,21 +150,28 @@ def answer_with_generation(index, embeddings, chunks, question):
     context = " ".join(relevant_chunks)
     prompt = f"Answer the following question based on this information:\n\nInformation: {context}\n\nQuestion: {question}\n\nDetailed answer:"
-    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to(device)
-    output = model.generate(
-        **inputs,
-        max_new_tokens=300,
-        temperature=0.7,
-        top_p=0.9,
-        do_sample=True,
-        num_beams=3,
-        no_repeat_ngram_size=2
-    )
-    answer = tokenizer.decode(output[0], skip_special_tokens=True)
-    if "Detailed answer:" in answer:
-        return answer.split("Detailed answer:")[-1].strip()
-    return answer.strip()
 # ✅ API route
 @app.route('/')

 from sentence_transformers import SentenceTransformer
 import faiss
 import numpy as np
+import tempfile
+from PIL import Image
 # Fix caching issue on Hugging Face Spaces
 os.environ["TRANSFORMERS_CACHE"] = "/tmp"
 device = "cuda" if torch.cuda.is_available() else "cpu"
+# OCR fallback: render the PDF to images, run Tesseract on each one, and return the concatenated text ("" on any error)
 def ocr_pdf(pdf_path):
+    try:
+        # Render the PDF at 300 DPI for better OCR quality
+        images = convert_from_path(
+            pdf_path,
+            dpi=300,  # Higher DPI for better quality
+            grayscale=False,  # Rendered in color, though preprocess_image_for_ocr converts each image to grayscale below
+            thread_count=2,  # Use multiple threads
+            use_pdftocairo=True  # pdftocairo often gives better results
+        )
+        text = ""
+        for img in images:
+            # Preprocess the image for better OCR results
+            preprocessed = preprocess_image_for_ocr(img)
+            # Append the text recognized in this image to the running result
+            text += pytesseract.image_to_string(
+                preprocessed,
+                config='--psm 1 --oem 3 -l eng'  # --psm 1: automatic page segmentation with OSD; --oem 3: default engine; -l eng: English only
+            )
+        return text
+    except Exception as e:  # Best-effort: any failure while rendering or recognizing is caught here
+        print(f"OCR error: {str(e)}")
+        return ""  # Callers receive an empty string instead of an exception
+# Prepare an image for OCR; at present the only step applied is a grayscale conversion
+def preprocess_image_for_ocr(img):
+    # Convert to grayscale ('L' mode; assumes img is a PIL image, as ocr_pdf passes in — TODO confirm)
+    gray = img.convert('L')
+    # Not implemented yet; possible further preprocessing steps:
+    # - Thresholding
+    # - Noise removal
+    # - Contrast enhancement
+    return gray
+# Extract the text layer of a PDF with PyMuPDF; fall back to ocr_pdf when the text layer looks too sparse
 def extract_text(pdf_path):
     doc = fitz.open(pdf_path)  # NOTE(review): doc is never closed in the lines shown — consider doc.close() or a with-block
     text = ""
     for page in doc:
+        page_text = page.get_text()
+        text += page_text
+    # Heuristic for "meaningful" text: count distinct case-insensitive words longer than two characters
+    words = text.split()
+    unique_words = set(word.lower() for word in words if len(word) > 2)
+    # Fewer than 20 such words, or fewer than 100 stripped characters, triggers the OCR fallback
+    if len(unique_words) < 20 or len(text.strip()) < 100:
+        ocr_text = ocr_pdf(pdf_path)
+        # Keep the OCR result only when it is longer than the extracted text (ocr_pdf returns "" on failure)
+        if len(ocr_text.strip()) > len(text.strip()):
+            text = ocr_text
     return text
 # ✅ Split into chunks
     except:
         return ""
+# Generation fallback: builds a prompt from relevant_chunks and the question, then samples an answer from distilgpt2
 def answer_with_generation(index, embeddings, chunks, question):
     tokenizer = AutoTokenizer.from_pretrained("distilgpt2")  # NOTE(review): tokenizer and model are loaded on every call — consider loading once
+    # Fix for meta tensor error - load the model with device_map="auto" instead of calling .to(device)
+    model = AutoModelForCausalLM.from_pretrained(
+        "distilgpt2",
+        device_map="auto",  # This handles device placement automatically
+        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32  # Use fp16 if possible
+    )
     if tokenizer.pad_token is None:
         tokenizer.pad_token = tokenizer.eos_token
         model.config.pad_token_id = model.config.eos_token_id
     context = " ".join(relevant_chunks)  # relevant_chunks is not assigned in the lines shown — presumably set in the part of the function this diff view elides
     prompt = f"Answer the following question based on this information:\n\nInformation: {context}\n\nQuestion: {question}\n\nDetailed answer:"
+    # Tokenize the prompt, truncated to 512 tokens; the resulting tensors are not moved to any device here
+    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
+    # NOTE(review): with device_map="auto" the model may be on GPU while inputs stay on CPU — confirm generate() copes, or move inputs to model.device
+    try:
+        output = model.generate(
+            **inputs,
+            max_new_tokens=300,
+            temperature=0.7,
+            top_p=0.9,
+            do_sample=True,
+            num_beams=3,
+            no_repeat_ngram_size=2
+        )
+        answer = tokenizer.decode(output[0], skip_special_tokens=True)
+        if "Detailed answer:" in answer:  # The decoded text presumably still contains the prompt, so keep only what follows the last marker
+            return answer.split("Detailed answer:")[-1].strip()
+        return answer.strip()
+    except Exception as e:  # Best-effort: any generation/decoding failure is printed and replaced by a fixed message
+        print(f"Generation error: {str(e)}")
+        return "I couldn't generate a good answer based on the PDF content."
 # ✅ API route
 @app.route('/')