Spaces:

Ajeya95
/

MyPharmaAI

Sleeping

MyPharmaAI / utils /file_processor.py

Ajey95

Restore app source files without FAISS index

f39ba75 7 days ago

6.41 kB

	# # utils/file_processor.py

	# import os
	# import json
	# import csv
	# import docx # From python-docx
	# import PyPDF2

	# class FileProcessor:
	# """
	# A utility class to process various file types and extract their text content.
	# Supports .txt, .pdf, .docx, .json, and .csv files.
	# """

	# def __init__(self):
	# """Initializes the FileProcessor."""
	# pass

	# def extract_text(self, file_path: str) -> str:
	# """
	# Extracts text content from a given file based on its extension.

	# Args:
	# file_path (str): The full path to the file.

	# Returns:
	# str: The extracted text content, or an empty string if extraction fails.
	# """
	# if not os.path.exists(file_path):
	# print(f"Warning: File not found at {file_path}")
	# return ""

	# # Get the file extension and normalize it
	# _, extension = os.path.splitext(file_path)
	# extension = extension.lower()

	# try:
	# if extension == '.txt':
	# return self._read_txt(file_path)
	# elif extension == '.pdf':
	# return self._read_pdf(file_path)
	# elif extension == '.docx':
	# return self._read_docx(file_path)
	# elif extension == '.json':
	# return self._read_json(file_path)
	# elif extension == '.csv':
	# return self._read_csv(file_path)
	# elif extension == '.doc':
	# return "Legacy .doc files are not supported. Please convert to .docx."
	# else:
	# print(f"Warning: Unsupported file type: {extension}")
	# return ""
	# except Exception as e:
	# print(f"Error processing file {file_path}: {e}")
	# return f"Error extracting content from file. It may be corrupted or protected."

	# def _read_txt(self, file_path: str) -> str:
	# """Reads content from a .txt file."""
	# with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
	# return f.read()

	# def _read_pdf(self, file_path: str) -> str:
	# """Reads content from a .pdf file using PyPDF2."""
	# text = []
	# with open(file_path, 'rb') as f:
	# reader = PyPDF2.PdfReader(f)
	# for page in reader.pages:
	# page_text = page.extract_text()
	# if page_text:
	# text.append(page_text)
	# return "\n".join(text)

	# def _read_docx(self, file_path: str) -> str:
	# """Reads content from a .docx file using python-docx."""
	# doc = docx.Document(file_path)
	# text = [p.text for p in doc.paragraphs]
	# return "\n".join(text)

	# def _read_json(self, file_path: str) -> str:
	# """Reads and pretty-prints content from a .json file."""
	# with open(file_path, 'r', encoding='utf-8') as f:
	# data = json.load(f)
	# # Convert JSON object to a nicely formatted string
	# return json.dumps(data, indent=2)

	# def _read_csv(self, file_path: str) -> str:
	# """Reads content from a .csv file and formats it as a string."""
	# text = []
	# with open(file_path, 'r', encoding='utf-8', newline='') as f:
	# reader = csv.reader(f)
	# for row in reader:
	# text.append(", ".join(row))
	# return "\n".join(text)
	# utils/file_processor.py

	import os
	import json
	import csv
	import docx
	import fitz # PyMuPDF library

	class FileProcessor:
	    """Extract plain text from files, choosing a reader by file extension.

	    Handled extensions are .txt, .pdf (read with PyMuPDF), .docx (read with
	    python-docx), .json and .csv. Legacy .doc files are not parsed; a fixed
	    explanatory message is returned for them instead.
	    """

	    def extract_text(self, file_path: str) -> str:
	        """Return the text content of ``file_path``.

	        The reader is selected from the lower-cased file extension, so
	        ``report.PDF`` and ``report.pdf`` are treated the same.

	        Args:
	            file_path: Path of the file to read.

	        Returns:
	            The extracted text. An empty string is returned (after printing a
	            warning) when the path does not exist or the extension is not
	            handled. A fixed human-readable message is returned for .doc
	            files and whenever the selected reader raises.
	        """
	        if not os.path.exists(file_path):
	            print(f"Warning: File not found at {file_path}")
	            return ""

	        _, extension = os.path.splitext(file_path)
	        extension = extension.lower()

	        if extension == '.doc':
	            return "Legacy .doc files are not supported. Please convert to .docx."

	        # Dispatch table: one entry per supported extension.
	        readers = {
	            '.txt': self._read_txt,
	            '.pdf': self._read_pdf_with_pymupdf,
	            '.docx': self._read_docx,
	            '.json': self._read_json,
	            '.csv': self._read_csv,
	        }
	        reader = readers.get(extension)
	        if reader is None:
	            print(f"Warning: Unsupported file type: {extension}")
	            return ""

	        try:
	            return reader(file_path)
	        except Exception as e:
	            # Deliberate best-effort boundary: any reader failure is reported
	            # on stdout and turned into a message instead of propagating.
	            print(f"Error processing file {file_path}: {e}")
	            return "Error extracting content from file. It may be corrupted or protected."

	    def _read_txt(self, file_path: str) -> str:
	        """Return the contents of a .txt file.

	        The file is decoded as UTF-8; undecodable bytes are dropped and a
	        leading byte-order mark, if present, is stripped ('utf-8-sig').
	        """
	        with open(file_path, 'r', encoding='utf-8-sig', errors='ignore') as f:
	            return f.read()

	    def _read_pdf_with_pymupdf(self, file_path: str) -> str:
	        """Return the text of every page of a .pdf, joined by newlines.

	        Uses the PyMuPDF (fitz) library; the document is closed when the
	        ``with`` block exits.
	        """
	        with fitz.open(file_path) as doc:
	            return "\n".join(page.get_text() for page in doc)

	    def _read_docx(self, file_path: str) -> str:
	        """Return the text of each paragraph of a .docx, joined by newlines.

	        Only ``doc.paragraphs`` is read (via python-docx).
	        """
	        doc = docx.Document(file_path)
	        return "\n".join(p.text for p in doc.paragraphs)

	    def _read_json(self, file_path: str) -> str:
	        """Return a .json file re-serialized with two-space indentation.

	        The file is opened as 'utf-8-sig' so a leading byte-order mark does
	        not make ``json.load`` fail, and ``ensure_ascii=False`` keeps
	        non-ASCII characters readable instead of ``\\uXXXX`` escapes.

	        Raises:
	            json.JSONDecodeError: If the file is not valid JSON.
	        """
	        with open(file_path, 'r', encoding='utf-8-sig') as f:
	            data = json.load(f)
	        return json.dumps(data, indent=2, ensure_ascii=False)

	    def _read_csv(self, file_path: str) -> str:
	        """Return a .csv file as text: cells joined by ", ", rows by newlines.

	        The file is opened as 'utf-8-sig' so a leading byte-order mark does
	        not end up inside the first cell; ``newline=''`` lets the csv module
	        handle line endings itself.
	        """
	        with open(file_path, 'r', encoding='utf-8-sig', newline='') as f:
	            return "\n".join(", ".join(row) for row in csv.reader(f))