vizsum-pro / pdf_reader.py
Vartex39's picture
process_input chunk destekli hale getirildi, Claude büyük PDF'leri parça parça işler
a8d7146
raw
history blame
780 Bytes
import fitz # PyMuPDF
def extract_text_chunks_from_pdf(pdf_input, max_chars=4000):
    """Extract the text of a PDF and group it into page-aligned chunks.

    Pages are read in order and accumulated into the current chunk while
    the running length plus the next page's length stays below
    ``max_chars``; otherwise the current chunk is emitted and the page
    starts a new one. Pages are never split, so a single page longer than
    ``max_chars`` produces a chunk that exceeds the limit.

    Args:
        pdf_input: Either a ``str`` path to a PDF file, or an object with a
            ``read()`` method whose result is passed to PyMuPDF as the PDF
            stream.
        max_chars: Soft upper bound on the number of characters per chunk.

    Returns:
        A list of stripped, non-empty text chunks. If anything goes wrong
        (opening, reading, or chunking), a single-element list holding an
        ``"[ERROR] ..."`` message is returned instead of raising.
    """
    try:
        if isinstance(pdf_input, str):
            doc = fitz.open(pdf_input)
        else:
            doc = fitz.open(stream=pdf_input.read(), filetype="pdf")

        # Always release the document, even if text extraction fails
        # part-way through.
        try:
            page_texts = [page.get_text() for page in doc]
        finally:
            doc.close()

        chunks = []
        parts = []  # page texts belonging to the chunk being built
        size = 0  # length of the chunk being built, separators included

        for text in page_texts:
            if size + len(text) < max_chars:
                parts.append(text)
                size += len(text) + 1  # +1 for the "\n" separator
            else:
                chunk = "\n".join(parts).strip()
                # Skip empty chunks (e.g. when the very first page is
                # already over the limit, or pages contain no text).
                if chunk:
                    chunks.append(chunk)
                parts = [text]
                size = len(text)

        chunk = "\n".join(parts).strip()
        if chunk:
            chunks.append(chunk)
        return chunks
    except Exception as e:
        # Callers expect a list of strings, so failures are reported
        # in-band rather than raised.
        return [f"[ERROR] PDF bölme hatası: {str(e)}"]