File size: 6,405 Bytes
f39ba75
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
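# # utils/file_processor.py  (previous PyPDF2-based version, commented out; superseded by the PyMuPDF implementation further down)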

# import os
# import json
# import csv
# import docx  # From python-docx
# import PyPDF2

# class FileProcessor:
#     """
#     A utility class to process various file types and extract their text content.
#     Supports .txt, .pdf, .docx, .json, and .csv files.
#     """

#     def __init__(self):
#         """Initializes the FileProcessor."""
#         pass

#     def extract_text(self, file_path: str) -> str:
#         """
#         Extracts text content from a given file based on its extension.

#         Args:
#             file_path (str): The full path to the file.

#         Returns:
#             str: The extracted text content, or an empty string if extraction fails.
#         """
#         if not os.path.exists(file_path):
#             print(f"Warning: File not found at {file_path}")
#             return ""

#         # Get the file extension and normalize it
#         _, extension = os.path.splitext(file_path)
#         extension = extension.lower()

#         try:
#             if extension == '.txt':
#                 return self._read_txt(file_path)
#             elif extension == '.pdf':
#                 return self._read_pdf(file_path)
#             elif extension == '.docx':
#                 return self._read_docx(file_path)
#             elif extension == '.json':
#                 return self._read_json(file_path)
#             elif extension == '.csv':
#                 return self._read_csv(file_path)
#             elif extension == '.doc':
#                 return "Legacy .doc files are not supported. Please convert to .docx."
#             else:
#                 print(f"Warning: Unsupported file type: {extension}")
#                 return ""
#         except Exception as e:
#             print(f"Error processing file {file_path}: {e}")
#             return f"Error extracting content from file. It may be corrupted or protected."

#     def _read_txt(self, file_path: str) -> str:
#         """Reads content from a .txt file."""
#         with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
#             return f.read()

#     def _read_pdf(self, file_path: str) -> str:
#         """Reads content from a .pdf file using PyPDF2."""
#         text = []
#         with open(file_path, 'rb') as f:
#             reader = PyPDF2.PdfReader(f)
#             for page in reader.pages:
#                 page_text = page.extract_text()
#                 if page_text:
#                     text.append(page_text)
#         return "\n".join(text)

#     def _read_docx(self, file_path: str) -> str:
#         """Reads content from a .docx file using python-docx."""
#         doc = docx.Document(file_path)
#         text = [p.text for p in doc.paragraphs]
#         return "\n".join(text)

#     def _read_json(self, file_path: str) -> str:
#         """Reads and pretty-prints content from a .json file."""
#         with open(file_path, 'r', encoding='utf-8') as f:
#             data = json.load(f)
#         # Convert JSON object to a nicely formatted string
#         return json.dumps(data, indent=2)

#     def _read_csv(self, file_path: str) -> str:
#         """Reads content from a .csv file and formats it as a string."""
#         text = []
#         with open(file_path, 'r', encoding='utf-8', newline='') as f:
#             reader = csv.reader(f)
#             for row in reader:
#                 text.append(", ".join(row))
#         return "\n".join(text)
# utils/file_processor.py

import os
import json
import csv
import docx
import fitz  # PyMuPDF library

class FileProcessor:
    """
    Extract plain-text content from files of several common types.

    Supported extensions are .txt, .pdf, .docx, .json and .csv. PDFs are read
    with the PyMuPDF (``fitz``) library and Word documents with python-docx.
    Text-based formats are decoded as UTF-8; a leading byte-order mark, if
    present, is stripped rather than leaking into the extracted text.
    """

    def extract_text(self, file_path: str) -> str:
        """
        Extract the text content of a file, dispatching on its extension.

        The extension is compared case-insensitively.

        Args:
            file_path (str): Path to the file to read.

        Returns:
            str: The extracted text. An empty string is returned (and a
            warning printed) when the path does not exist or the extension
            is not supported. A fixed explanatory message is returned for
            legacy ``.doc`` files, and a fixed error message is returned
            when the selected reader raises any exception.
        """
        if not os.path.exists(file_path):
            print(f"Warning: File not found at {file_path}")
            return ""

        _, extension = os.path.splitext(file_path)
        extension = extension.lower()

        if extension == '.doc':
            return "Legacy .doc files are not supported. Please convert to .docx."

        readers = {
            '.txt': self._read_txt,
            '.pdf': self._read_pdf_with_pymupdf,
            '.docx': self._read_docx,
            '.json': self._read_json,
            '.csv': self._read_csv,
        }
        reader = readers.get(extension)
        if reader is None:
            print(f"Warning: Unsupported file type: {extension}")
            return ""

        # Best-effort boundary: any reader failure (bad encoding, malformed
        # JSON, unreadable PDF, ...) is reported and turned into a message
        # instead of propagating to the caller.
        try:
            return reader(file_path)
        except Exception as e:
            print(f"Error processing file {file_path}: {e}")
            return "Error extracting content from file. It may be corrupted or protected."

    def _read_txt(self, file_path: str) -> str:
        """Return the content of a .txt file, skipping undecodable bytes."""
        # 'utf-8-sig' behaves like 'utf-8' but drops a leading BOM if present.
        with open(file_path, 'r', encoding='utf-8-sig', errors='ignore') as f:
            return f.read()

    def _read_pdf_with_pymupdf(self, file_path: str) -> str:
        """Return the text of every page of a .pdf file, joined by newlines."""
        with fitz.open(file_path) as doc:
            return "\n".join(page.get_text() for page in doc)

    def _read_docx(self, file_path: str) -> str:
        """Return the paragraph text of a .docx file, one paragraph per line."""
        # NOTE(review): only `doc.paragraphs` is read; text held elsewhere
        # (e.g. tables) is presumably not included - confirm if that matters.
        doc = docx.Document(file_path)
        return "\n".join(p.text for p in doc.paragraphs)

    def _read_json(self, file_path: str) -> str:
        """Parse a .json file and return it pretty-printed with 2-space indent."""
        # 'utf-8-sig' accepts files saved with a BOM, which json.load would
        # otherwise reject.
        with open(file_path, 'r', encoding='utf-8-sig') as f:
            data = json.load(f)
        # ensure_ascii=False keeps non-ASCII text readable instead of
        # rewriting it as \uXXXX escape sequences.
        return json.dumps(data, indent=2, ensure_ascii=False)

    def _read_csv(self, file_path: str) -> str:
        """Return a .csv file as text: cells joined by ', ', rows by newlines."""
        # newline='' lets the csv module handle line endings itself;
        # 'utf-8-sig' keeps a BOM out of the first cell.
        with open(file_path, 'r', encoding='utf-8-sig', newline='') as f:
            return "\n".join(", ".join(row) for row in csv.reader(f))