Spaces:
Running
Running
process_input chunk destekli hale getirildi, Claude büyük PDF'leri parça parça işler
Browse files- pdf_reader.py +17 -18
- ui.py +18 -9
pdf_reader.py
CHANGED
@@ -1,29 +1,28 @@
|
|
1 |
import fitz # PyMuPDF
|
2 |
|
3 |
-
|
4 |
-
|
5 |
-
def extract_text_from_pdf(pdf_input):
|
6 |
try:
|
7 |
-
|
8 |
-
if hasattr(pdf_input, "read"):
|
9 |
-
doc = fitz.open(stream=pdf_input.read(), filetype="pdf")
|
10 |
-
elif isinstance(pdf_input, str):
|
11 |
doc = fitz.open(pdf_input)
|
12 |
else:
|
13 |
-
|
14 |
-
|
15 |
-
total_pages = len(doc)
|
16 |
-
text = ""
|
17 |
|
18 |
-
|
19 |
-
|
20 |
|
21 |
-
doc
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
|
23 |
-
if
|
24 |
-
|
25 |
|
26 |
-
|
|
|
27 |
|
28 |
except Exception as e:
|
29 |
-
return f"[ERROR] PDF
|
|
|
1 |
import fitz # PyMuPDF
|
2 |
|
3 |
+
def _split_to_limit(text, limit):
    """Yield consecutive pieces of *text*, each at most *limit* characters.

    *text* is expected to be already stripped and *limit* to be >= 1.
    """
    while len(text) > limit:
        # Prefer breaking on whitespace so words are not cut in half; fall
        # back to a hard cut when the window contains no usable whitespace.
        cut = max(text.rfind(" ", 0, limit + 1), text.rfind("\n", 0, limit + 1))
        if cut <= 0:
            cut = limit
        yield text[:cut].rstrip()
        text = text[cut:].lstrip()
    if text:
        yield text


def _pack_chunks(page_texts, max_chars):
    """Greedily pack page texts into non-empty chunks of at most *max_chars*."""
    chunks = []
    current = ""
    for page_text in page_texts:
        for piece in _split_to_limit(page_text.strip(), max_chars):
            # Same packing rule as before: keep appending while the combined
            # length stays under the limit, otherwise start a new chunk.
            if current and len(current) + len(piece) < max_chars:
                current = f"{current}\n{piece}"
            else:
                if current:
                    chunks.append(current)
                current = piece
    if current:
        chunks.append(current)
    return chunks


def extract_text_chunks_from_pdf(pdf_input, max_chars=4000):
    """Extract the text of a PDF and return it as a list of bounded chunks.

    Args:
        pdf_input: Either a ``str`` (passed to ``fitz.open`` as-is) or an
            object with a ``read()`` method whose result is opened as an
            in-memory PDF stream.
        max_chars: Upper bound on the length of each returned chunk. Must be
            at least 1.

    Returns:
        A list of stripped, non-empty text chunks. Pages are packed together
        while they fit; a page longer than ``max_chars`` is split (at
        whitespace when possible). A PDF with no extractable text yields an
        empty list. On any failure a single-element list is returned whose
        only item starts with ``"[ERROR]"``.
    """
    try:
        if max_chars < 1:
            raise ValueError("max_chars must be at least 1")

        if isinstance(pdf_input, str):
            doc = fitz.open(pdf_input)
        else:
            doc = fitz.open(stream=pdf_input.read(), filetype="pdf")

        try:
            # The generator is fully consumed inside _pack_chunks, i.e. before
            # the document is closed in the finally clause.
            return _pack_chunks((page.get_text() for page in doc), max_chars)
        finally:
            # Close the document even when a page fails to extract.
            doc.close()

    except Exception as e:
        # Errors are reported in-band because the caller checks for the
        # "[ERROR]" marker instead of catching exceptions.
        return [f"[ERROR] PDF bölme hatası: {str(e)}"]
|
ui.py
CHANGED
@@ -1,29 +1,38 @@
|
|
1 |
import gradio as gr
|
2 |
-
import tempfile
|
3 |
from ocr_engine import extract_text_from_image
|
4 |
-
from pdf_reader import
|
5 |
from summarizer import summarize_text
|
6 |
|
7 |
def process_input(pdf, image, manual_text, mode, model_name):
|
8 |
if pdf is not None:
|
9 |
-
|
|
|
|
|
10 |
elif image is not None:
|
11 |
text = extract_text_from_image(image)
|
|
|
|
|
|
|
12 |
elif manual_text.strip() != "":
|
13 |
-
|
14 |
else:
|
15 |
return "Lütfen bir giriş türü seçin.", "", None
|
16 |
|
17 |
-
|
18 |
-
|
19 |
|
20 |
-
|
|
|
|
|
|
|
|
|
21 |
|
22 |
temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".txt", mode='w', encoding='utf-8')
|
23 |
-
temp_file.write(
|
24 |
temp_file.close()
|
25 |
|
26 |
-
return
|
27 |
|
28 |
with gr.Blocks() as demo:
|
29 |
gr.Markdown("## VizSum")
|
|
|
1 |
import gradio as gr
|
2 |
+
import tempfile
|
3 |
from ocr_engine import extract_text_from_image
|
4 |
+
from pdf_reader import extract_text_chunks_from_pdf
|
5 |
from summarizer import summarize_text
|
6 |
|
7 |
def process_input(pdf, image, manual_text, mode, model_name):
    """Turn one of the three inputs into text, summarize it, and save the summary.

    The inputs are checked in priority order: ``pdf``, then ``image``, then
    ``manual_text``. The text is summarized chunk by chunk with
    ``summarize_text(chunk, mode, model_name)`` and the partial summaries are
    joined with blank lines.

    Args:
        pdf: PDF input forwarded to ``extract_text_chunks_from_pdf``, or None.
        image: Image input forwarded to ``extract_text_from_image``, or None.
        manual_text: Text typed by the user; None or blank counts as absent.
        mode: Forwarded unchanged to ``summarize_text``.
        model_name: Forwarded unchanged to ``summarize_text``.

    Returns:
        A 3-tuple ``(text, summary, summary_file_path)``. When no input is
        given, extraction fails, or no text is found, the first item carries
        the message, the summary is ``""`` and the path is ``None``.
    """
    if pdf is not None:
        text_chunks = extract_text_chunks_from_pdf(pdf)
        # The extractor reports failure as a single-element list whose item
        # starts with "[ERROR]". Matching that shape (instead of searching
        # every chunk for the marker) keeps a PDF that merely contains the
        # text "[ERROR]" from being treated as a failure.
        if len(text_chunks) == 1 and text_chunks[0].startswith("[ERROR]"):
            return text_chunks[0], "", None
    elif image is not None:
        text = extract_text_from_image(image)
        if "[ERROR]" in text:
            return text, "", None
        text_chunks = [text]
    elif manual_text and manual_text.strip():
        text_chunks = [manual_text]
    else:
        return "Lütfen bir giriş türü seçin.", "", None

    # Drop blank chunks so the summarizer is never called with empty text.
    text_chunks = [chunk for chunk in text_chunks if chunk.strip()]
    if not text_chunks:
        return "Girdiden metin çıkarılamadı.", "", None

    all_text = "\n\n".join(text_chunks)
    full_summary = "\n\n".join(
        summarize_text(chunk, mode, model_name) for chunk in text_chunks
    )

    # delete=False: the path is returned to the caller, so the file has to
    # outlive this function.
    with tempfile.NamedTemporaryFile(
        delete=False, suffix=".txt", mode="w", encoding="utf-8"
    ) as temp_file:
        temp_file.write(full_summary)

    return all_text, full_summary, temp_file.name
|
36 |
|
37 |
with gr.Blocks() as demo:
|
38 |
gr.Markdown("## VizSum")
|