Vartex39 committed
Commit a8d7146 · Parent: 96bc60c

process_input now supports chunked processing; Claude handles large PDFs piece by piece

Files changed (2):
  1. pdf_reader.py  +17 -18
  2. ui.py  +18 -9
pdf_reader.py CHANGED
@@ -1,29 +1,28 @@
 import fitz  # PyMuPDF
 
-MAX_PAGES = 5  # limit so it does not consume too many tokens
-
-def extract_text_from_pdf(pdf_input):
+def extract_text_chunks_from_pdf(pdf_input, max_chars=4000):
     try:
-        # On Hugging Face, pdf_input is a file-like object (the uploaded file)
-        if hasattr(pdf_input, "read"):
-            doc = fitz.open(stream=pdf_input.read(), filetype="pdf")
-        elif isinstance(pdf_input, str):
+        if isinstance(pdf_input, str):
             doc = fitz.open(pdf_input)
         else:
-            return "[ERROR] Geçersiz PDF girişi"
-
-        total_pages = len(doc)
-        text = ""
+            doc = fitz.open(stream=pdf_input.read(), filetype="pdf")
 
-        for i in range(min(MAX_PAGES, total_pages)):
-            text += doc[i].get_text()
+        chunks = []
+        current_chunk = ""
 
-        doc.close()
+        for page in doc:
+            text = page.get_text()
+            if len(current_chunk) + len(text) < max_chars:
+                current_chunk += "\n" + text
+            else:
+                chunks.append(current_chunk.strip())
+                current_chunk = text
 
-        if total_pages > MAX_PAGES:
-            text += f"\n\n[INFO] PDF {total_pages} sayfa. Yalnızca ilk {MAX_PAGES} sayfa işlendi."
+        if current_chunk:
+            chunks.append(current_chunk.strip())
 
-        return text
+        doc.close()
+        return chunks
 
     except Exception as e:
-        return f"[ERROR] PDF İşleme Hatası: {str(e)}"
+        return [f"[ERROR] PDF bölme hatası: {str(e)}"]
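To sanity-check the new chunker locally, a minimal sketch (the "sample.pdf" path is hypothetical; the max_chars value simply mirrors the default above):

from pdf_reader import extract_text_chunks_from_pdf

# Split a local PDF into roughly 4000-character chunks and inspect their sizes
chunks = extract_text_chunks_from_pdf("sample.pdf", max_chars=4000)
for i, chunk in enumerate(chunks, start=1):
    print(f"chunk {i}: {len(chunk)} characters")

Note that the limit is enforced at page boundaries: a single page whose text already exceeds max_chars still becomes one oversized chunk, because pages are never split internally.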
ui.py CHANGED
@@ -1,29 +1,38 @@
 import gradio as gr
-import tempfile  # This line was missing
+import tempfile
 from ocr_engine import extract_text_from_image
-from pdf_reader import extract_text_from_pdf
+from pdf_reader import extract_text_chunks_from_pdf
 from summarizer import summarize_text
 
 def process_input(pdf, image, manual_text, mode, model_name):
     if pdf is not None:
-        text = extract_text_from_pdf(pdf)
+        text_chunks = extract_text_chunks_from_pdf(pdf)
+        if any("[ERROR]" in chunk for chunk in text_chunks):
+            return text_chunks[0], "", None
     elif image is not None:
         text = extract_text_from_image(image)
+        if "[ERROR]" in text:
+            return text, "", None
+        text_chunks = [text]
     elif manual_text.strip() != "":
-        text = manual_text
+        text_chunks = [manual_text]
     else:
         return "Lütfen bir giriş türü seçin.", "", None
 
-    if "[ERROR]" in text:
-        return text, "", None
+    all_text = "\n\n".join(text_chunks)
+    summaries = []
 
-    summary = summarize_text(text, mode, model_name)
+    for chunk in text_chunks:
+        summary = summarize_text(chunk, mode, model_name)
+        summaries.append(summary)
+
+    full_summary = "\n\n".join(summaries)
 
     temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".txt", mode='w', encoding='utf-8')
-    temp_file.write(summary)
+    temp_file.write(full_summary)
     temp_file.close()
 
-    return text, summary, temp_file.name
+    return all_text, full_summary, temp_file.name
 
 with gr.Blocks() as demo:
     gr.Markdown("## VizSum")
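For reference, a minimal smoke test of the chunked flow outside the Gradio UI (a sketch only: the PDF path, mode string, and model name are placeholders, since the real options come from the controls defined further down in ui.py):

from ui import process_input

# Placeholder mode and model_name values; substitute whatever the app's
# dropdowns actually expose. A path string works here because
# extract_text_chunks_from_pdf also accepts str input.
original_text, full_summary, summary_path = process_input(
    pdf="sample.pdf",
    image=None,
    manual_text="",
    mode="summary",
    model_name="claude-3-haiku",
)
print(full_summary)
print("Summary saved to:", summary_path)

Each chunk is summarized independently and the per-chunk summaries are joined with blank lines, so a large PDF no longer has to fit into a single model call.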