Spaces:

Ajeya95
/

MyPharmaAI

Sleeping

File size: 6,405 Bytes

f39ba75

# # utils/file_processor.py (superseded PyPDF2-based version, kept commented out for reference)

# import os
# import json
# import csv
# import docx  # From python-docx
# import PyPDF2

# class FileProcessor:
#     """
#     A utility class to process various file types and extract their text content.
#     Supports .txt, .pdf, .docx, .json, and .csv files.
#     """

#     def __init__(self):
#         """Initializes the FileProcessor."""
#         pass

#     def extract_text(self, file_path: str) -> str:
#         """
#         Extracts text content from a given file based on its extension.

#         Args:
#             file_path (str): The full path to the file.

#         Returns:
#             str: The extracted text content, or an empty string if extraction fails.
#         """
#         if not os.path.exists(file_path):
#             print(f"Warning: File not found at {file_path}")
#             return ""

#         # Get the file extension and normalize it
#         _, extension = os.path.splitext(file_path)
#         extension = extension.lower()

#         try:
#             if extension == '.txt':
#                 return self._read_txt(file_path)
#             elif extension == '.pdf':
#                 return self._read_pdf(file_path)
#             elif extension == '.docx':
#                 return self._read_docx(file_path)
#             elif extension == '.json':
#                 return self._read_json(file_path)
#             elif extension == '.csv':
#                 return self._read_csv(file_path)
#             elif extension == '.doc':
#                 return "Legacy .doc files are not supported. Please convert to .docx."
#             else:
#                 print(f"Warning: Unsupported file type: {extension}")
#                 return ""
#         except Exception as e:
#             print(f"Error processing file {file_path}: {e}")
#             return f"Error extracting content from file. It may be corrupted or protected."

#     def _read_txt(self, file_path: str) -> str:
#         """Reads content from a .txt file."""
#         with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
#             return f.read()

#     def _read_pdf(self, file_path: str) -> str:
#         """Reads content from a .pdf file using PyPDF2."""
#         text = []
#         with open(file_path, 'rb') as f:
#             reader = PyPDF2.PdfReader(f)
#             for page in reader.pages:
#                 page_text = page.extract_text()
#                 if page_text:
#                     text.append(page_text)
#         return "\n".join(text)

#     def _read_docx(self, file_path: str) -> str:
#         """Reads content from a .docx file using python-docx."""
#         doc = docx.Document(file_path)
#         text = [p.text for p in doc.paragraphs]
#         return "\n".join(text)

#     def _read_json(self, file_path: str) -> str:
#         """Reads and pretty-prints content from a .json file."""
#         with open(file_path, 'r', encoding='utf-8') as f:
#             data = json.load(f)
#         # Convert JSON object to a nicely formatted string
#         return json.dumps(data, indent=2)

#     def _read_csv(self, file_path: str) -> str:
#         """Reads content from a .csv file and formats it as a string."""
#         text = []
#         with open(file_path, 'r', encoding='utf-8', newline='') as f:
#             reader = csv.reader(f)
#             for row in reader:
#                 text.append(", ".join(row))
#         return "\n".join(text)
# utils/file_processor.py

import os
import json
import csv
import docx
import fitz  # PyMuPDF library

class FileProcessor:
    """
    A utility class to process various file types and extract their text content.

    Supported extensions are .txt, .pdf, .docx, .json and .csv. PDF extraction
    uses the PyMuPDF (fitz) library. Text-based formats are decoded as UTF-8
    and a leading byte-order mark, if present, is stripped.
    """

    def extract_text(self, file_path: str) -> str:
        """
        Extracts text content from a given file based on its extension.

        Args:
            file_path (str): Path to the file to read.

        Returns:
            str: The extracted text. An empty string is returned when the path
            does not exist or the extension is not recognised. A fixed
            explanatory message is returned for legacy .doc files and when the
            reader for the file raises an exception.
        """
        if not os.path.exists(file_path):
            print(f"Warning: File not found at {file_path}")
            return ""

        _, extension = os.path.splitext(file_path)
        extension = extension.lower()

        if extension == '.doc':
            return "Legacy .doc files are not supported. Please convert to .docx."

        # Extension -> reader dispatch table; each reader takes the path and
        # returns the extracted text.
        readers = {
            '.txt': self._read_txt,
            '.pdf': self._read_pdf_with_pymupdf,
            '.docx': self._read_docx,
            '.json': self._read_json,
            '.csv': self._read_csv,
        }
        reader = readers.get(extension)
        if reader is None:
            print(f"Warning: Unsupported file type: {extension}")
            return ""

        try:
            return reader(file_path)
        except Exception as e:
            # Best-effort boundary: any reader failure is reported to the
            # caller as a message rather than propagated.
            print(f"Error processing file {file_path}: {e}")
            return "Error extracting content from file. It may be corrupted or protected."

    def _read_txt(self, file_path: str) -> str:
        """Reads content from a .txt file, dropping undecodable bytes."""
        # 'utf-8-sig' decodes plain UTF-8 too, but strips a leading BOM so it
        # does not show up as '\ufeff' in the extracted text.
        with open(file_path, 'r', encoding='utf-8-sig', errors='ignore') as f:
            return f.read()

    def _read_pdf_with_pymupdf(self, file_path: str) -> str:
        """Reads content from a .pdf file using the PyMuPDF (fitz) library."""
        with fitz.open(file_path) as doc:
            # One text chunk per page, joined with newlines.
            return "\n".join(page.get_text() for page in doc)

    def _read_docx(self, file_path: str) -> str:
        """Reads paragraph text from a .docx file using python-docx."""
        doc = docx.Document(file_path)
        return "\n".join(p.text for p in doc.paragraphs)

    def _read_json(self, file_path: str) -> str:
        """Reads and pretty-prints content from a .json file."""
        # json.load rejects input that starts with a BOM, so decode with
        # 'utf-8-sig' to accept both BOM-prefixed and plain UTF-8 files.
        with open(file_path, 'r', encoding='utf-8-sig') as f:
            data = json.load(f)
        # ensure_ascii=False keeps non-ASCII characters readable instead of
        # emitting \uXXXX escapes into the extracted text.
        return json.dumps(data, indent=2, ensure_ascii=False)

    def _read_csv(self, file_path: str) -> str:
        """Reads a .csv file and returns one ", "-joined line per row."""
        # newline='' lets the csv module handle line endings itself;
        # 'utf-8-sig' keeps a BOM out of the first cell.
        with open(file_path, 'r', encoding='utf-8-sig', newline='') as f:
            return "\n".join(", ".join(row) for row in csv.reader(f))