# MyPharmaAI / utils / file_processor.py
# Ajey95
# Restore app source files without FAISS index
# f39ba75
# # utils/file_processor.py
# import os
# import json
# import csv
# import docx # From python-docx
# import PyPDF2
# class FileProcessor:
# """
# A utility class to process various file types and extract their text content.
# Supports .txt, .pdf, .docx, .json, and .csv files.
# """
# def __init__(self):
# """Initializes the FileProcessor."""
# pass
# def extract_text(self, file_path: str) -> str:
# """
# Extracts text content from a given file based on its extension.
# Args:
# file_path (str): The full path to the file.
# Returns:
# str: The extracted text content, or an empty string if extraction fails.
# """
# if not os.path.exists(file_path):
# print(f"Warning: File not found at {file_path}")
# return ""
# # Get the file extension and normalize it
# _, extension = os.path.splitext(file_path)
# extension = extension.lower()
# try:
# if extension == '.txt':
# return self._read_txt(file_path)
# elif extension == '.pdf':
# return self._read_pdf(file_path)
# elif extension == '.docx':
# return self._read_docx(file_path)
# elif extension == '.json':
# return self._read_json(file_path)
# elif extension == '.csv':
# return self._read_csv(file_path)
# elif extension == '.doc':
# return "Legacy .doc files are not supported. Please convert to .docx."
# else:
# print(f"Warning: Unsupported file type: {extension}")
# return ""
# except Exception as e:
# print(f"Error processing file {file_path}: {e}")
# return f"Error extracting content from file. It may be corrupted or protected."
# def _read_txt(self, file_path: str) -> str:
# """Reads content from a .txt file."""
# with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
# return f.read()
# def _read_pdf(self, file_path: str) -> str:
# """Reads content from a .pdf file using PyPDF2."""
# text = []
# with open(file_path, 'rb') as f:
# reader = PyPDF2.PdfReader(f)
# for page in reader.pages:
# page_text = page.extract_text()
# if page_text:
# text.append(page_text)
# return "\n".join(text)
# def _read_docx(self, file_path: str) -> str:
# """Reads content from a .docx file using python-docx."""
# doc = docx.Document(file_path)
# text = [p.text for p in doc.paragraphs]
# return "\n".join(text)
# def _read_json(self, file_path: str) -> str:
# """Reads and pretty-prints content from a .json file."""
# with open(file_path, 'r', encoding='utf-8') as f:
# data = json.load(f)
# # Convert JSON object to a nicely formatted string
# return json.dumps(data, indent=2)
# def _read_csv(self, file_path: str) -> str:
# """Reads content from a .csv file and formats it as a string."""
# text = []
# with open(file_path, 'r', encoding='utf-8', newline='') as f:
# reader = csv.reader(f)
# for row in reader:
# text.append(", ".join(row))
# return "\n".join(text)
# utils/file_processor.py
import os
import json
import csv
import docx
import fitz # PyMuPDF library
class FileProcessor:
    """
    Extract plain text from files, choosing a reader by file extension.

    Supported extensions: .txt, .pdf (via PyMuPDF / ``fitz``), .docx
    (via python-docx), .json and .csv. Legacy .doc files are recognised
    but only produce an explanatory message.
    """

    def extract_text(self, file_path: str) -> str:
        """
        Extract the text content of the file at ``file_path``.

        The reader is selected from the lower-cased file extension.

        Args:
            file_path: Path to the file to read.

        Returns:
            The extracted text. An empty string is returned when the path
            does not exist or the extension is not supported. A fixed,
            human-readable message is returned for .doc files and when
            the selected reader raises any exception.
        """
        if not os.path.exists(file_path):
            print(f"Warning: File not found at {file_path}")
            return ""

        extension = os.path.splitext(file_path)[1].lower()

        if extension == '.doc':
            return "Legacy .doc files are not supported. Please convert to .docx."

        readers = {
            '.txt': self._read_txt,
            '.pdf': self._read_pdf_with_pymupdf,
            '.docx': self._read_docx,
            '.json': self._read_json,
            '.csv': self._read_csv,
        }
        reader = readers.get(extension)
        if reader is None:
            print(f"Warning: Unsupported file type: {extension}")
            return ""

        # Deliberately broad: this is the boundary where any reader failure
        # (corrupt file, wrong encoding, parser error) is converted into a
        # message string instead of propagating to the caller.
        try:
            return reader(file_path)
        except Exception as e:
            print(f"Error processing file {file_path}: {e}")
            return "Error extracting content from file. It may be corrupted or protected."

    def _read_txt(self, file_path: str) -> str:
        """Return the content of a .txt file, skipping undecodable bytes."""
        # 'utf-8-sig' decodes plain UTF-8 and also drops a leading BOM, so
        # the marker does not end up as the first character of the text.
        with open(file_path, 'r', encoding='utf-8-sig', errors='ignore') as f:
            return f.read()

    def _read_pdf_with_pymupdf(self, file_path: str) -> str:
        """Return the text of every page of a PDF, joined by newlines."""
        with fitz.open(file_path) as doc:
            return "\n".join([page.get_text() for page in doc])

    def _read_docx(self, file_path: str) -> str:
        """
        Return the text of a .docx file, one paragraph per line.

        Only ``doc.paragraphs`` is read here.
        """
        doc = docx.Document(file_path)
        return "\n".join([p.text for p in doc.paragraphs])

    def _read_json(self, file_path: str) -> str:
        """Parse a .json file and return it re-serialised with 2-space indent."""
        # 'utf-8-sig' accepts files with or without a BOM; with plain
        # 'utf-8' json.load rejects a BOM-prefixed document.
        with open(file_path, 'r', encoding='utf-8-sig') as f:
            data = json.load(f)
        # ensure_ascii=False keeps non-ASCII characters readable instead of
        # turning them into \uXXXX escape sequences in the extracted text.
        return json.dumps(data, indent=2, ensure_ascii=False)

    def _read_csv(self, file_path: str) -> str:
        """Return a .csv file as text: cells joined by ', ', rows by newlines."""
        # newline='' lets the csv module handle line endings itself;
        # 'utf-8-sig' keeps a BOM out of the first cell.
        with open(file_path, 'r', encoding='utf-8-sig', newline='') as f:
            return "\n".join([", ".join(row) for row in csv.reader(f)])