# utils/file_processor.py
"""Text extraction helpers for .txt, .pdf, .docx, .json and .csv files.

The legacy PyPDF2-based implementation that used to be kept in this file as
commented-out code has been removed; PDF extraction goes through PyMuPDF
(``fitz``).
"""

import csv
import json
import os

# The third-party parsers are optional at import time so that the plain-text
# formats stay usable when one of them is not installed. The matching reader
# raises ImportError at the point where the library is actually needed.
try:
    import docx  # From python-docx
except ImportError:
    docx = None

try:
    import fitz  # PyMuPDF library
except ImportError:
    fitz = None


class FileProcessor:
    """
    A utility class to process various file types and extract their text content.

    Supports .txt, .pdf, .docx, .json, and .csv files. PDF files are read
    with PyMuPDF and .docx files with python-docx; the other formats only
    need the standard library.
    """

    # Messages that are handed back to the caller *as the extracted text*.
    _LEGACY_DOC_MESSAGE = "Legacy .doc files are not supported. Please convert to .docx."
    _EXTRACTION_ERROR_MESSAGE = (
        "Error extracting content from file. It may be corrupted or protected."
    )

    def extract_text(self, file_path: str) -> str:
        """
        Extracts text content from a given file based on its extension.

        The extension is matched case-insensitively.

        Args:
            file_path (str): The full path to the file.

        Returns:
            str: The extracted text. An empty string is returned when the
            path does not exist or the extension is not supported. For
            ``.doc`` files, and when a reader raises, a human-readable
            message is returned instead of file content.
        """
        if not os.path.exists(file_path):
            print(f"Warning: File not found at {file_path}")
            return ""

        _, extension = os.path.splitext(file_path)
        extension = extension.lower()

        if extension == '.doc':
            return self._LEGACY_DOC_MESSAGE

        readers = {
            '.txt': self._read_txt,
            '.pdf': self._read_pdf_with_pymupdf,
            '.docx': self._read_docx,
            '.json': self._read_json,
            '.csv': self._read_csv,
        }
        reader = readers.get(extension)
        if reader is None:
            print(f"Warning: Unsupported file type: {extension}")
            return ""

        # Deliberately broad: this is the best-effort boundary for all
        # readers, so any parser failure is reported and turned into a
        # message rather than propagated to the caller.
        try:
            return reader(file_path)
        except Exception as e:
            print(f"Error processing file {file_path}: {e}")
            return self._EXTRACTION_ERROR_MESSAGE

    def _read_txt(self, file_path: str) -> str:
        """Reads content from a .txt file.

        Bytes that are not valid UTF-8 are skipped, and a leading UTF-8
        byte-order mark is stripped (``utf-8-sig``).
        """
        with open(file_path, 'r', encoding='utf-8-sig', errors='ignore') as f:
            return f.read()

    def _read_pdf_with_pymupdf(self, file_path: str) -> str:
        """Reads content from a .pdf file using the PyMuPDF (fitz) library.

        Returns the text of every page, joined with newlines.

        Raises:
            ImportError: If PyMuPDF is not installed.
        """
        if fitz is None:
            raise ImportError("PyMuPDF (fitz) is required to read .pdf files")
        with fitz.open(file_path) as doc:
            return "\n".join(page.get_text() for page in doc)

    def _read_docx(self, file_path: str) -> str:
        """Reads content from a .docx file using python-docx.

        Returns the text of each entry in ``doc.paragraphs``, joined with
        newlines.

        Raises:
            ImportError: If python-docx is not installed.
        """
        if docx is None:
            raise ImportError("python-docx is required to read .docx files")
        doc = docx.Document(file_path)
        return "\n".join(p.text for p in doc.paragraphs)

    def _read_json(self, file_path: str) -> str:
        """Reads and pretty-prints content from a .json file.

        The file is decoded as ``utf-8-sig`` so a leading byte-order mark
        does not make ``json.load`` fail, and non-ASCII characters are kept
        as-is in the output rather than being turned into ``\\uXXXX`` escapes.
        """
        with open(file_path, 'r', encoding='utf-8-sig') as f:
            data = json.load(f)
        return json.dumps(data, indent=2, ensure_ascii=False)

    def _read_csv(self, file_path: str) -> str:
        """Reads content from a .csv file and formats it as a string.

        Each row becomes one line with its cells joined by ``", "``. The file
        is decoded as ``utf-8-sig`` so a leading byte-order mark does not end
        up inside the first cell.
        """
        # newline='' lets the csv module handle line endings itself.
        with open(file_path, 'r', encoding='utf-8-sig', newline='') as f:
            return "\n".join(", ".join(row) for row in csv.reader(f))