# Spaces:
# Sleeping
# Sleeping
# # utils/file_processor.py | |
# import os | |
# import json | |
# import csv | |
# import docx # From python-docx | |
# import PyPDF2 | |
# class FileProcessor: | |
# """ | |
# A utility class to process various file types and extract their text content. | |
# Supports .txt, .pdf, .docx, .json, and .csv files. | |
# """ | |
# def __init__(self): | |
# """Initializes the FileProcessor.""" | |
# pass | |
# def extract_text(self, file_path: str) -> str: | |
# """ | |
# Extracts text content from a given file based on its extension. | |
# Args: | |
# file_path (str): The full path to the file. | |
# Returns: | |
# str: The extracted text content, or an empty string if extraction fails. | |
# """ | |
# if not os.path.exists(file_path): | |
# print(f"Warning: File not found at {file_path}") | |
# return "" | |
# # Get the file extension and normalize it | |
# _, extension = os.path.splitext(file_path) | |
# extension = extension.lower() | |
# try: | |
# if extension == '.txt': | |
# return self._read_txt(file_path) | |
# elif extension == '.pdf': | |
# return self._read_pdf(file_path) | |
# elif extension == '.docx': | |
# return self._read_docx(file_path) | |
# elif extension == '.json': | |
# return self._read_json(file_path) | |
# elif extension == '.csv': | |
# return self._read_csv(file_path) | |
# elif extension == '.doc': | |
# return "Legacy .doc files are not supported. Please convert to .docx." | |
# else: | |
# print(f"Warning: Unsupported file type: {extension}") | |
# return "" | |
# except Exception as e: | |
# print(f"Error processing file {file_path}: {e}") | |
# return f"Error extracting content from file. It may be corrupted or protected." | |
# def _read_txt(self, file_path: str) -> str: | |
# """Reads content from a .txt file.""" | |
# with open(file_path, 'r', encoding='utf-8', errors='ignore') as f: | |
# return f.read() | |
# def _read_pdf(self, file_path: str) -> str: | |
# """Reads content from a .pdf file using PyPDF2.""" | |
# text = [] | |
# with open(file_path, 'rb') as f: | |
# reader = PyPDF2.PdfReader(f) | |
# for page in reader.pages: | |
# page_text = page.extract_text() | |
# if page_text: | |
# text.append(page_text) | |
# return "\n".join(text) | |
# def _read_docx(self, file_path: str) -> str: | |
# """Reads content from a .docx file using python-docx.""" | |
# doc = docx.Document(file_path) | |
# text = [p.text for p in doc.paragraphs] | |
# return "\n".join(text) | |
# def _read_json(self, file_path: str) -> str: | |
# """Reads and pretty-prints content from a .json file.""" | |
# with open(file_path, 'r', encoding='utf-8') as f: | |
# data = json.load(f) | |
# # Convert JSON object to a nicely formatted string | |
# return json.dumps(data, indent=2) | |
# def _read_csv(self, file_path: str) -> str: | |
# """Reads content from a .csv file and formats it as a string.""" | |
# text = [] | |
# with open(file_path, 'r', encoding='utf-8', newline='') as f: | |
# reader = csv.reader(f) | |
# for row in reader: | |
# text.append(", ".join(row)) | |
# return "\n".join(text) | |
# utils/file_processor.py
import csv
import json
import os

import docx  # From python-docx
import fitz  # PyMuPDF library
class FileProcessor:
    """
    Extract plain text from files, choosing a reader by file extension.

    Supported extensions are .txt, .pdf, .docx, .json and .csv. PDF files
    are read with PyMuPDF (``fitz``) and .docx files with python-docx.
    """

    # Maps a lower-cased file extension to the name of the method that reads
    # it. Method names (not bound methods) are stored so the lookup goes
    # through ``getattr`` and subclass overrides are honoured.
    _READERS = {
        ".txt": "_read_txt",
        ".pdf": "_read_pdf_with_pymupdf",
        ".docx": "_read_docx",
        ".json": "_read_json",
        ".csv": "_read_csv",
    }

    def extract_text(self, file_path: str) -> str:
        """
        Extract text content from a file based on its extension.

        Args:
            file_path: Path to the file to read.

        Returns:
            The extracted text. An empty string is returned when the path
            does not exist or the extension is not supported. A fixed
            human-readable message is returned for legacy .doc files and
            when a reader raises while processing the file.
        """
        if not os.path.exists(file_path):
            print(f"Warning: File not found at {file_path}")
            return ""

        extension = os.path.splitext(file_path)[1].lower()

        if extension == ".doc":
            return "Legacy .doc files are not supported. Please convert to .docx."

        reader_name = self._READERS.get(extension)
        if reader_name is None:
            print(f"Warning: Unsupported file type: {extension}")
            return ""

        try:
            return getattr(self, reader_name)(file_path)
        except Exception as e:
            # Deliberate best-effort boundary: any reader failure is reported
            # to the caller as a message instead of propagating.
            print(f"Error processing file {file_path}: {e}")
            return "Error extracting content from file. It may be corrupted or protected."

    def _read_txt(self, file_path: str) -> str:
        """Read a .txt file as UTF-8, skipping undecodable bytes."""
        # 'utf-8-sig' drops a leading byte-order mark if one is present, so
        # it does not leak into the extracted text.
        with open(file_path, "r", encoding="utf-8-sig", errors="ignore") as f:
            return f.read()

    def _read_pdf_with_pymupdf(self, file_path: str) -> str:
        """Read a .pdf file with PyMuPDF, joining the text of each page with newlines."""
        with fitz.open(file_path) as doc:
            return "\n".join(page.get_text() for page in doc)

    def _read_docx(self, file_path: str) -> str:
        """Read a .docx file with python-docx, joining ``doc.paragraphs`` text with newlines."""
        doc = docx.Document(file_path)
        return "\n".join(p.text for p in doc.paragraphs)

    def _read_json(self, file_path: str) -> str:
        """Parse a .json file and return it re-serialized with a 2-space indent."""
        # 'utf-8-sig' accepts files with or without a BOM; plain 'utf-8'
        # would hand the BOM to json.load, which rejects it.
        with open(file_path, "r", encoding="utf-8-sig") as f:
            data = json.load(f)
        # ensure_ascii=False keeps non-ASCII characters readable instead of
        # turning them into \uXXXX escapes in the extracted text.
        return json.dumps(data, indent=2, ensure_ascii=False)

    def _read_csv(self, file_path: str) -> str:
        """Read a .csv file; cells are joined with ', ' and rows with newlines."""
        # newline='' lets the csv module handle line endings itself;
        # 'utf-8-sig' keeps a BOM out of the first cell.
        with open(file_path, "r", encoding="utf-8-sig", newline="") as f:
            return "\n".join(", ".join(row) for row in csv.reader(f))