Spaces:

yunuseduran
/

chatpdf

Sleeping

App Files Files Community

yunuseduran committed on Apr 18

Commit

8ab61e6

verified ·

1 Parent(s): 9da60a4

Update app.py

Browse files

Files changed (1) hide show

app.py +191 -98

app.py CHANGED Viewed

@@ -5,23 +5,31 @@ from docx import Document
 from bs4 import BeautifulSoup
 import shutil
 import os
-import PyPDF2  # PDF işleme için subprocess yerine Python kütüphanesi kullanın
-# Setup your API key
 def setup_api_key():
     google_api_key = os.getenv("GOOGLE_API_KEY")
     genai.configure(api_key=google_api_key)
 def upload_file(file_path):
-    print(f"Uploading file...")
-    text_file = genai.upload_file(path=file_path)
-    print(f"Completed upload: {text_file.uri}")
-    return text_file
 def to_markdown(text):
     text = text.replace('•', '  *')
     return markdown.markdown(text)
 def build_model(text_file):
     generation_config = {
         "temperature": 0.2,
@@ -34,118 +42,203 @@ def build_model(text_file):
     model = genai.GenerativeModel(
         model_name="gemini-1.5-flash",
         generation_config=generation_config,
-        system_instruction="""Answer the questions based on the uploaded file.
-        If there is no related info in the file just reply 'I don't know.' """,
     )
     chat_session = model.start_chat(history=[])
-    response = chat_session.send_message(["Summarize the doc in one sentence", text_file])
-    return chat_session
 def chat(chat_session, prompt):
-    response = chat_session.send_message(prompt)
-    return response.text
-def generate_report(chat_session, questions):
-    report_text = ""
-    report_text += f"\n## QUESTIONS & ANSWERS\n"
-    for question in questions:
-        report_text += f"\n## {question}\n"
         answer = chat(chat_session, question)
-        report_text += f"\n{answer}\n"
     return report_text
 def convert_markdown_to_html(report_text):
-    html_text = markdown.markdown(report_text)
-    return html_text
 def add_html_to_word(html_text, doc):
     soup = BeautifulSoup(html_text, 'html.parser')
-    for element in soup:
-        if element.name == 'h1':
-            doc.add_heading(element.get_text(), level=1)
-        elif element.name == 'h2':
-            doc.add_heading(element.get_text(), level=2)
-        elif element.name == 'h3':
-            doc.add_heading(element.get_text(), level=3)
-        elif element.name == 'h4':
-            doc.add_heading(element.get_text(), level=4)
-        elif element.name == 'h5':
-            doc.add_heading(element.get_text(), level=5)
-        elif element.name == 'h6':
-            doc.add_heading(element.get_text(), level=6)
         elif element.name == 'p':
-            doc.add_paragraph(element.get_text())
         elif element.name == 'ul':
-            for li in element.find_all('li'):
                 doc.add_paragraph(li.get_text(), style='List Bullet')
         elif element.name == 'ol':
-            for li in element.find_all('li'):
                 doc.add_paragraph(li.get_text(), style='List Number')
-        elif element.name:
-            doc.add_paragraph(element.get_text())  # For any other tags
 def extract_text_from_pdf(pdf_path):
-    """PDF dosyasından metin çıkarmak için PyPDF2 kullanır"""
     text = ""
-    with open(pdf_path, 'rb') as file:
-        pdf_reader = PyPDF2.PdfReader(file)
-        for page_num in range(len(pdf_reader.pages)):
-            text += pdf_reader.pages[page_num].extract_text() + "\n"
-    return text
-def process_pdf(pdf_file, user_questions):
-    file_name = pdf_file.split('/')[-1]
-    saved_file_path = f"/tmp/{file_name}"
-    shutil.copyfile(pdf_file, saved_file_path)
-    # PDF'den doğrudan metin çıkar
-    text = extract_text_from_pdf(saved_file_path)
-    # Çıkarılan metni bir dosyaya yaz
-    with open("/tmp/text_file.txt", "w", encoding="utf-8") as f:
-        f.write(text)
-    text_file = upload_file("/tmp/text_file.txt")
-    chat_session = build_model(text_file)
-    questions = user_questions.strip().split('\n')
-    report_text = generate_report(chat_session, questions)
-    doc = Document()
-    html_text = convert_markdown_to_html(report_text)
-    add_html_to_word(html_text, doc)
-    doc_name = file_name.replace(".pdf", ".docx")
-    doc_name = "Report_" + doc_name
-    doc.save(f"/tmp/{doc_name}")
-    return html_text, f"/tmp/{doc_name}"
-questions = [
-    "Makalenin yazarları kimlerdir?",
-    "Hangi modeller kullanılmıştır?",
-    "Kaç referans vardır?",
-    "Hangi yılda yayınlanmıştır?"
-]
-questions_str = "\n".join(questions)
-iface = gr.Interface(
-    fn=process_pdf,
-    inputs=[
-        gr.File(label="Upload PDF", type="filepath"),
-        gr.TextArea(label="Enter Questions", placeholder="Type your questions here, one per line.", value=questions_str)
-    ],
-    outputs=[
-        gr.HTML(label="HTML Formatted Report"),
-        gr.File(label="DOCX File Output", type="binary")
-    ],
-    title="Pdflerinizden kısa rapor oluşturma arac @YED",
-    description="Sorularınızı sormak ve cevap almak için PDF'inizi yükleyin."
-)
-setup_api_key()
-# Hugging Face Spaces için önerilen launch konfigürasyonu
-iface.launch(share=True)

 from bs4 import BeautifulSoup
 import shutil
 import os
+import PyPDF2
+import tempfile
+from datetime import datetime
# API key configuration
def setup_api_key():
    """Configure ``genai`` with the key stored in ``GOOGLE_API_KEY``.

    Raises:
        ValueError: If the environment variable is unset, empty, or contains
            only whitespace.
    """
    # Strip before testing: a whitespace-only value is truthy and would
    # otherwise be passed to ``genai.configure`` as if it were a real key.
    google_api_key = (os.getenv("GOOGLE_API_KEY") or "").strip()
    if not google_api_key:
        raise ValueError("GOOGLE_API_KEY çevre değişkeni ayarlanmamış.")
    genai.configure(api_key=google_api_key)
# File upload helper
def upload_file(file_path):
    """Upload a local file with ``genai.upload_file`` and return its handle.

    Args:
        file_path: Path of the local file to upload.

    Returns:
        The object returned by ``genai.upload_file``.

    Raises:
        RuntimeError: If the upload fails. The underlying error is attached
            as ``__cause__``.
    """
    try:
        return genai.upload_file(path=file_path)
    except Exception as exc:
        # RuntimeError is still caught by callers that use ``except Exception``;
        # ``from exc`` keeps the original traceback attached for debugging.
        raise RuntimeError(f"Dosya yükleme hatası: {exc}") from exc
# Bullet text -> rendered markup
def to_markdown(text):
    """Replace ``•`` bullets with Markdown list markers and render the result.

    Returns the output of ``markdown.markdown`` for the converted text.
    """
    bulleted = text.replace('•', '  *')
    rendered = markdown.markdown(bulleted)
    return rendered
+# AI modelini oluşturma
 def build_model(text_file):
     generation_config = {
         "temperature": 0.2,
     model = genai.GenerativeModel(
         model_name="gemini-1.5-flash",
         generation_config=generation_config,
+        system_instruction="""PDF belgesinden yüklenen bilgilere dayanarak soruları cevapla.
+        Belge içinde ilgili bilgi yoksa 'Bu konuda belgede bilgi bulamadım.' diye yanıtla.
+        Cevaplarında mümkün olduğunca belgedeki bilgileri referans ver ve doğru bilgi sağla.""",
     )
     chat_session = model.start_chat(history=[])
+    # Belgeyi özetleyerek başla
+    response = chat_session.send_message(["Bu belgeyi kısaca özetle", text_file])
+    return chat_session, response.text
# Chat helper
def chat(chat_session, prompt):
    """Send ``prompt`` through ``chat_session`` and return the reply text.

    Any exception raised while sending or while reading ``.text`` is turned
    into an error string instead of propagating, so callers always get a str.
    """
    try:
        reply_text = chat_session.send_message(prompt).text
    except Exception as err:
        return f"Yanıt alınamadı: {str(err)}"
    return reply_text
# Report generation
def generate_report(chat_session, questions, summary):
    """Build the Markdown report: title, timestamp, summary, then one Q&A per question.

    Args:
        chat_session: Session object forwarded to ``chat`` for every question.
        questions: Iterable of question strings; blank entries are ignored.
        summary: Text placed under the summary heading.

    Returns:
        The full report as a single Markdown string.
    """
    created_at = datetime.now().strftime('%d.%m.%Y %H:%M')
    sections = [
        "# PDF Belge Analiz Raporu\n\n",
        f"*Oluşturulma tarihi: {created_at}*\n\n",
        f"## Belge Özeti\n\n{summary}\n\n",
        "## Soru ve Cevaplar\n\n",
    ]

    # Filter blanks BEFORE numbering; numbering first and skipping afterwards
    # leaves gaps in the "Soru N" sequence (1, 3, ...).
    asked = [question for question in questions if question.strip()]
    for number, question in enumerate(asked, 1):
        sections.append(f"### Soru {number}: {question}\n\n")
        answer = chat(chat_session, question)
        sections.append(f"{answer}\n\n")

    return "".join(sections)
# Markdown -> HTML conversion
def convert_markdown_to_html(report_text):
    """Render ``report_text`` with the ``tables`` extension and wrap it in a styled ``<div>``."""
    body_html = markdown.markdown(report_text, extensions=['tables'])
    wrapped = f"""
    <div style="font-family: Arial, sans-serif; line-height: 1.6; max-width: 800px; margin: 0 auto; padding: 20px;">
        {body_html}
    </div>
    """
    return wrapped
# Copy rendered HTML into the Word document
def add_html_to_word(html_text, doc):
    """Append the headings, paragraphs and lists found in ``html_text`` to ``doc``.

    Args:
        html_text: HTML markup to convert.
        doc: Target document; only its ``add_heading`` and ``add_paragraph``
            methods are used.
    """
    soup = BeautifulSoup(html_text, 'html.parser')
    heading_tags = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')

    for element in soup.find_all(list(heading_tags) + ['p', 'ul', 'ol']):
        # find_all walks the whole tree, so a <p> or sub-list nested inside an
        # <li> would be visited again after the enclosing list has already
        # written its text via li.get_text(). Skip it to avoid duplicates.
        if element.find_parent('li') is not None:
            continue

        if element.name in heading_tags:
            doc.add_heading(element.get_text(), level=int(element.name[1]))
        elif element.name == 'p':
            # Whitespace-only paragraphs would only add empty lines.
            if element.get_text().strip():
                doc.add_paragraph(element.get_text())
        else:
            list_style = 'List Bullet' if element.name == 'ul' else 'List Number'
            # Direct children only: nested items are part of their parent's text.
            for li in element.find_all('li', recursive=False):
                doc.add_paragraph(li.get_text(), style=list_style)
# Text extraction from a PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page in ``pdf_path``, each followed by a newline.

    Args:
        pdf_path: Path of the PDF file to read.

    Raises:
        RuntimeError: If the file cannot be opened or parsed. The underlying
            error is attached as ``__cause__``.
    """
    try:
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            # ``or ""`` is a defensive guard in case a page yields no text
            # object at all; concatenating None with "\n" would raise.
            page_texts = [
                (page.extract_text() or "") + "\n" for page in pdf_reader.pages
            ]
    except Exception as exc:
        # RuntimeError is still caught by callers that use ``except Exception``.
        raise RuntimeError(f"PDF okuma hatası: {exc}") from exc
    return "".join(page_texts)
# Main processing function
def process_pdf(pdf_file, user_questions, progress=gr.Progress()):
    """Run the whole pipeline for one uploaded PDF and return the report.

    Steps: copy the PDF to a temp directory, extract its text, upload the text,
    start a chat session, answer each question, then render HTML and a DOCX.

    Args:
        pdf_file: Path of the uploaded PDF; a falsy value means nothing was uploaded.
        user_questions: Questions separated by newlines; blank lines are ignored.
        progress: Gradio progress tracker used to report each stage.

    Returns:
        ``(html, docx_path)`` on success, or ``(message, None)`` when no file was
        given or any step failed.
    """
    from html import escape  # local import keeps this block self-contained

    if not pdf_file:
        return "Lütfen bir PDF dosyası yükleyin.", None

    # Progress values are completion fractions in [0, 1], not percentages.
    progress(0, desc="PDF yükleniyor...")

    # Temporary working directory for the PDF copy, extracted text and report
    temp_dir = tempfile.mkdtemp()
    file_name = os.path.basename(pdf_file)
    pdf_path = os.path.join(temp_dir, file_name)
    text_file_path = os.path.join(temp_dir, "extracted_text.txt")
    succeeded = False

    try:
        # Work on a private copy of the uploaded PDF
        shutil.copyfile(pdf_file, pdf_path)

        progress(0.2, desc="PDF'den metin çıkarılıyor...")
        text = extract_text_from_pdf(pdf_path)

        # Write the extracted text to a file so it can be uploaded
        with open(text_file_path, "w", encoding="utf-8") as f:
            f.write(text)

        progress(0.4, desc="Metin dosyası yükleniyor...")
        text_file = upload_file(text_file_path)

        progress(0.6, desc="AI modeli hazırlanıyor...")
        chat_session, summary = build_model(text_file)

        progress(0.7, desc="Sorular işleniyor...")
        # One question per line; tolerate a missing value from the UI
        questions = [
            q.strip() for q in (user_questions or "").split('\n') if q.strip()
        ]

        progress(0.8, desc="Rapor oluşturuluyor...")
        report_text = generate_report(chat_session, questions, summary)

        progress(0.9, desc="Sonuçlar formatlanıyor...")
        report_html = convert_markdown_to_html(report_text)

        # Build the Word document
        doc = Document()
        add_html_to_word(report_html, doc)
        doc_name = f"PDF_Rapor_{datetime.now().strftime('%Y%m%d_%H%M%S')}.docx"
        doc_path = os.path.join(temp_dir, doc_name)
        doc.save(doc_path)

        progress(1.0, desc="Tamamlandı!")
        succeeded = True
        return report_html, doc_path
    except Exception as e:
        # Escape the message: exception text can contain file names or other
        # input-derived characters and is rendered as HTML by the UI.
        error_message = (
            "<div style='color: red; font-weight: bold;'>"
            f"Hata oluştu: {escape(str(e))}</div>"
        )
        return error_message, None
    finally:
        if succeeded:
            # The DOCX must outlive this call so the UI can serve it; remove
            # only the intermediate files.
            for leftover in (pdf_path, text_file_path):
                try:
                    os.remove(leftover)
                except OSError:
                    pass  # best-effort cleanup; a leftover temp file is harmless
        else:
            # Nothing is returned from this directory on failure: drop it all.
            shutil.rmtree(temp_dir, ignore_errors=True)
# Default questions pre-filled in the form, one per line
default_questions = "\n".join([
    "Belgenin ana konusu nedir?",
    "Belgenin yazarları kimlerdir?",
    "Belgedeki önemli bulgular nelerdir?",
    "Kaç sayfa ve bölüm vardır?",
    "Hangi tarihte yayınlanmıştır?",
])
# Gradio interface
# The Markdown texts are kept as named constants so the layout below reads as
# pure structure. Their indentation is part of the string and is unchanged.
_INTRO_MARKDOWN = """
    # 📄 PDF Soru-Cevap Asistanı
    Bu uygulama, yüklediğiniz PDF belgesi üzerinde sorular sormanıza ve detaylı bir rapor almanıza olanak tanır.
    """
_USAGE_MARKDOWN = """
        ### Kullanım Adımları:
        1. PDF dosyanızı yükleyin
        2. Belge hakkında cevaplarını almak istediğiniz soruları yazın
        3. "Rapor Oluştur" düğmesine basın
        4. Oluşturulan raporu HTML olarak görüntüleyin veya DOCX dosyası olarak indirin
        ### İpuçları:
        - Her satıra bir soru yazın
        - Belgenin içeriğiyle ilgili net sorular sorun
        - Büyük PDF'ler için işlem süresi uzayabilir
        """

with gr.Blocks(theme=gr.themes.Soft()) as iface:
    gr.Markdown(_INTRO_MARKDOWN)

    with gr.Row():
        # Left column: inputs and the submit button
        with gr.Column(scale=1):
            pdf_input = gr.File(
                type="filepath",
                file_types=[".pdf"],
                label="PDF Dosyası Yükleyin",
            )
            questions_input = gr.TextArea(
                value=default_questions,
                lines=10,
                label="Sorularınız",
                placeholder="Her satıra bir soru yazın...",
            )
            submit_btn = gr.Button("📝 Rapor Oluştur", variant="primary")

        # Right column: rendered report and downloadable DOCX, one tab each
        with gr.Column(scale=2):
            with gr.Tabs():
                with gr.Tab("HTML Görünüm"):
                    html_output = gr.HTML(label="Rapor Sonucu")
                with gr.Tab("İndirilebilir Dosya"):
                    file_output = gr.File(label="DOCX Rapor")

    with gr.Accordion("Nasıl Kullanılır?", open=False):
        gr.Markdown(_USAGE_MARKDOWN)

    # Wire the button to the pipeline: two inputs in, HTML and file out
    submit_btn.click(
        fn=process_pdf,
        inputs=[pdf_input, questions_input],
        outputs=[html_output, file_output],
        show_progress=True,
    )
# Configure the API key and start the application
if __name__ == "__main__":
    # Guard only the key setup: wrapping launch() as well would report any
    # unrelated ValueError from Gradio as a missing API key.
    try:
        setup_api_key()
    except ValueError as e:
        print(f"Hata: {str(e)}")
        print("Lütfen GOOGLE_API_KEY çevre değişkenini ayarlayın.")
        # Exit non-zero so the failure is visible to whatever started the app.
        raise SystemExit(1)
    iface.launch(share=True)