Vartex39 committed
Commit a8d7146 · Parent: 96bc60c

process_input now supports chunked processing; Claude handles large PDFs piece by piece

Files changed (2):
  1. pdf_reader.py  +17 -18
  2. ui.py  +18 -9
pdf_reader.py CHANGED
@@ -1,29 +1,28 @@
 import fitz  # PyMuPDF
 
-MAX_PAGES = 5  # limit so it does not consume too many tokens
-
-def extract_text_from_pdf(pdf_input):
+def extract_text_chunks_from_pdf(pdf_input, max_chars=4000):
     try:
-        # On Hugging Face, pdf_input is a file-like object (the uploaded file)
-        if hasattr(pdf_input, "read"):
-            doc = fitz.open(stream=pdf_input.read(), filetype="pdf")
-        elif isinstance(pdf_input, str):
+        if isinstance(pdf_input, str):
             doc = fitz.open(pdf_input)
         else:
-            return "[ERROR] Geçersiz PDF girişi"
-
-        total_pages = len(doc)
-        text = ""
+            doc = fitz.open(stream=pdf_input.read(), filetype="pdf")
 
-        for i in range(min(MAX_PAGES, total_pages)):
-            text += doc[i].get_text()
+        chunks = []
+        current_chunk = ""
 
-        doc.close()
+        for page in doc:
+            text = page.get_text()
+            if len(current_chunk) + len(text) < max_chars:
+                current_chunk += "\n" + text
+            else:
+                chunks.append(current_chunk.strip())
+                current_chunk = text
 
-        if total_pages > MAX_PAGES:
-            text += f"\n\n[INFO] PDF {total_pages} sayfa. Yalnızca ilk {MAX_PAGES} sayfa işlendi."
+        if current_chunk:
+            chunks.append(current_chunk.strip())
 
-        return text
+        doc.close()
+        return chunks
 
     except Exception as e:
-        return f"[ERROR] PDF İşleme Hatası: {str(e)}"
+        return [f"[ERROR] PDF bölme hatası: {str(e)}"]
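To sanity-check the new chunker locally, a minimal sketch (the "sample.pdf" path is hypothetical; the max_chars value simply mirrors the default above):

from pdf_reader import extract_text_chunks_from_pdf

# Split a local PDF into roughly 4000-character chunks and inspect their sizes
chunks = extract_text_chunks_from_pdf("sample.pdf", max_chars=4000)
for i, chunk in enumerate(chunks, start=1):
    print(f"chunk {i}: {len(chunk)} characters")

Note that the limit is enforced at page boundaries: a single page whose text already exceeds max_chars still becomes one oversized chunk, because pages are never split internally.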
ui.py CHANGED
@@ -1,29 +1,38 @@
 import gradio as gr
-import tempfile  # This line was missing
+import tempfile
 from ocr_engine import extract_text_from_image
-from pdf_reader import extract_text_from_pdf
+from pdf_reader import extract_text_chunks_from_pdf
 from summarizer import summarize_text
 
 def process_input(pdf, image, manual_text, mode, model_name):
     if pdf is not None:
-        text = extract_text_from_pdf(pdf)
+        text_chunks = extract_text_chunks_from_pdf(pdf)
+        if any("[ERROR]" in chunk for chunk in text_chunks):
+            return text_chunks[0], "", None
     elif image is not None:
         text = extract_text_from_image(image)
+        if "[ERROR]" in text:
+            return text, "", None
+        text_chunks = [text]
     elif manual_text.strip() != "":
-        text = manual_text
+        text_chunks = [manual_text]
     else:
         return "Lütfen bir giriş türü seçin.", "", None
 
-    if "[ERROR]" in text:
-        return text, "", None
+    all_text = "\n\n".join(text_chunks)
+    summaries = []
 
-    summary = summarize_text(text, mode, model_name)
+    for chunk in text_chunks:
+        summary = summarize_text(chunk, mode, model_name)
+        summaries.append(summary)
+
+    full_summary = "\n\n".join(summaries)
 
     temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".txt", mode='w', encoding='utf-8')
-    temp_file.write(summary)
+    temp_file.write(full_summary)
     temp_file.close()
 
-    return text, summary, temp_file.name
+    return all_text, full_summary, temp_file.name
 
 with gr.Blocks() as demo:
     gr.Markdown("## VizSum")
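For reference, a minimal smoke test of the chunked flow outside the Gradio UI (a sketch only: the PDF path, mode string, and model name are placeholders, since the real options come from the controls defined further down in ui.py):

from ui import process_input

# Placeholder mode and model_name values; substitute whatever the app's
# dropdowns actually expose. A path string works here because
# extract_text_chunks_from_pdf also accepts str input.
original_text, full_summary, summary_path = process_input(
    pdf="sample.pdf",
    image=None,
    manual_text="",
    mode="summary",
    model_name="claude-3-haiku",
)
print(full_summary)
print("Summary saved to:", summary_path)

Each chunk is summarized independently and the per-chunk summaries are joined with blank lines, so a large PDF no longer has to fit into a single model call.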