import fitz  # PyMuPDF for normal PDFs
import pytesseract
from pdf2image import convert_from_path


def extract_text_from_pdf(pdf_path: str) -> str:
    """Extract the embedded text layer from a PDF.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of every page, pages separated by a newline, with leading
        and trailing whitespace stripped. Empty when no page yields text.
    """
    # The context manager closes the document even if a page fails to parse;
    # previously the handle was left open until garbage collection.
    with fitz.open(pdf_path) as doc:
        page_texts = [page.get_text("text") for page in doc]
    return "\n".join(page_texts).strip()


def extract_text_from_scanned_pdf(pdf_path: str, lang: str = "ara") -> str:
    """Extract text from a scanned PDF by running OCR on each page.

    Args:
        pdf_path: Path of the PDF file to read.
        lang: Language code handed to Tesseract. Defaults to ``"ara"``
            (Arabic), which was the previously hard-coded value.

    Returns:
        The recognized text of every page, pages separated by a newline,
        with leading and trailing whitespace stripped.
    """
    # pdf2image renders the PDF into one image per page for Tesseract.
    images = convert_from_path(pdf_path)
    page_texts = [pytesseract.image_to_string(img, lang=lang) for img in images]
    return "\n".join(page_texts).strip()


def get_pdf_text(pdf_path: str, lang: str = "ara") -> str:
    """Return the text of a PDF, falling back to OCR when it has no text.

    The embedded text layer is tried first; OCR only runs when that
    produces nothing.

    Args:
        pdf_path: Path of the PDF file to read.
        lang: Language code used for the OCR fallback. Defaults to
            ``"ara"``.

    Returns:
        The extracted text, stripped of surrounding whitespace. May be
        empty if OCR also finds nothing.
    """
    text = extract_text_from_pdf(pdf_path)
    if not text:  # Already stripped, so empty means no text layer: use OCR
        text = extract_text_from_scanned_pdf(pdf_path, lang=lang)
    return text