Spaces:
Sleeping
Sleeping
File size: 6,405 Bytes
f39ba75 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 |
# # utils/file_processor.py
# import os
# import json
# import csv
# import docx # From python-docx
# import PyPDF2
# class FileProcessor:
# """
# A utility class to process various file types and extract their text content.
# Supports .txt, .pdf, .docx, .json, and .csv files.
# """
# def __init__(self):
# """Initializes the FileProcessor."""
# pass
# def extract_text(self, file_path: str) -> str:
# """
# Extracts text content from a given file based on its extension.
# Args:
# file_path (str): The full path to the file.
# Returns:
# str: The extracted text content, or an empty string if extraction fails.
# """
# if not os.path.exists(file_path):
# print(f"Warning: File not found at {file_path}")
# return ""
# # Get the file extension and normalize it
# _, extension = os.path.splitext(file_path)
# extension = extension.lower()
# try:
# if extension == '.txt':
# return self._read_txt(file_path)
# elif extension == '.pdf':
# return self._read_pdf(file_path)
# elif extension == '.docx':
# return self._read_docx(file_path)
# elif extension == '.json':
# return self._read_json(file_path)
# elif extension == '.csv':
# return self._read_csv(file_path)
# elif extension == '.doc':
# return "Legacy .doc files are not supported. Please convert to .docx."
# else:
# print(f"Warning: Unsupported file type: {extension}")
# return ""
# except Exception as e:
# print(f"Error processing file {file_path}: {e}")
# return f"Error extracting content from file. It may be corrupted or protected."
# def _read_txt(self, file_path: str) -> str:
# """Reads content from a .txt file."""
# with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
# return f.read()
# def _read_pdf(self, file_path: str) -> str:
# """Reads content from a .pdf file using PyPDF2."""
# text = []
# with open(file_path, 'rb') as f:
# reader = PyPDF2.PdfReader(f)
# for page in reader.pages:
# page_text = page.extract_text()
# if page_text:
# text.append(page_text)
# return "\n".join(text)
# def _read_docx(self, file_path: str) -> str:
# """Reads content from a .docx file using python-docx."""
# doc = docx.Document(file_path)
# text = [p.text for p in doc.paragraphs]
# return "\n".join(text)
# def _read_json(self, file_path: str) -> str:
# """Reads and pretty-prints content from a .json file."""
# with open(file_path, 'r', encoding='utf-8') as f:
# data = json.load(f)
# # Convert JSON object to a nicely formatted string
# return json.dumps(data, indent=2)
# def _read_csv(self, file_path: str) -> str:
# """Reads content from a .csv file and formats it as a string."""
# text = []
# with open(file_path, 'r', encoding='utf-8', newline='') as f:
# reader = csv.reader(f)
# for row in reader:
# text.append(", ".join(row))
# return "\n".join(text)
# utils/file_processor.py
import os
import json
import csv
import docx
import fitz # PyMuPDF library
class FileProcessor:
    """Extract plain text from a file, picking a reader by file extension.

    Handled extensions: ``.txt``, ``.pdf`` (via PyMuPDF / ``fitz``),
    ``.docx`` (via python-docx), ``.json`` and ``.csv``. A ``.doc`` file
    yields a fixed "not supported" notice instead of content.

    ``extract_text`` never raises for a failing reader: it prints a
    diagnostic and returns a fixed error message string instead.
    """

    # Returned (not raised) when a reader fails for any reason.
    _EXTRACTION_ERROR = "Error extracting content from file. It may be corrupted or protected."

    # Returned for legacy Word documents, which no reader here can parse.
    _LEGACY_DOC_NOTICE = "Legacy .doc files are not supported. Please convert to .docx."

    # "utf-8-sig" decodes plain UTF-8 unchanged but also drops a leading
    # byte-order mark. With plain "utf-8" a BOM makes json.load() fail and
    # leaves a stray U+FEFF at the start of TXT/CSV output.
    _TEXT_ENCODING = "utf-8-sig"

    def extract_text(self, file_path: str) -> str:
        """Return the text content of ``file_path``.

        The extension is compared case-insensitively.

        Args:
            file_path: Path to the file to read.

        Returns:
            The extracted text on success. An empty string (after printing a
            warning) when the path does not exist or the extension is not
            handled. A fixed notice for ``.doc`` files. A fixed error message
            (after printing the exception) when the selected reader raises.
        """
        if not os.path.exists(file_path):
            print(f"Warning: File not found at {file_path}")
            return ""

        extension = os.path.splitext(file_path)[1].lower()

        if extension == '.doc':
            return self._LEGACY_DOC_NOTICE

        readers = {
            '.txt': self._read_txt,
            '.pdf': self._read_pdf_with_pymupdf,
            '.docx': self._read_docx,
            '.json': self._read_json,
            '.csv': self._read_csv,
        }
        reader = readers.get(extension)
        if reader is None:
            print(f"Warning: Unsupported file type: {extension}")
            return ""

        # Deliberately broad: this is the boundary where any reader failure
        # (I/O, decoding, parser errors) is turned into a message string.
        try:
            return reader(file_path)
        except Exception as e:
            print(f"Error processing file {file_path}: {e}")
            return self._EXTRACTION_ERROR

    def _read_txt(self, file_path: str) -> str:
        """Return the full contents of a text file.

        Bytes that are not valid UTF-8 are dropped (``errors='ignore'``)
        rather than raising; a leading BOM is stripped.
        """
        with open(file_path, 'r', encoding=self._TEXT_ENCODING, errors='ignore') as f:
            return f.read()

    def _read_pdf_with_pymupdf(self, file_path: str) -> str:
        """Return the text of every page of a PDF, joined by newlines.

        Uses PyMuPDF (``fitz``): one ``page.get_text()`` result per page.
        """
        with fitz.open(file_path) as doc:
            return "\n".join([page.get_text() for page in doc])

    def _read_docx(self, file_path: str) -> str:
        """Return the text of a .docx file's paragraphs, joined by newlines.

        Only ``doc.paragraphs`` is read.
        """
        doc = docx.Document(file_path)
        return "\n".join(p.text for p in doc.paragraphs)

    def _read_json(self, file_path: str) -> str:
        """Parse a JSON file and return it re-serialized with 2-space indent.

        A leading BOM is stripped before parsing. Invalid JSON raises; the
        exception is handled by ``extract_text``.
        """
        with open(file_path, 'r', encoding=self._TEXT_ENCODING) as f:
            data = json.load(f)
        return json.dumps(data, indent=2)

    def _read_csv(self, file_path: str) -> str:
        """Return a CSV file as text: cells joined by ", ", rows by newlines.

        The file is opened with ``newline=''`` as the ``csv`` module requires,
        and a leading BOM is stripped so it does not end up in the first cell.
        """
        with open(file_path, 'r', encoding=self._TEXT_ENCODING, newline='') as f:
            return "\n".join(", ".join(row) for row in csv.reader(f))