chatpdf / app.py
yunuseduran's picture
Update app.py
f80bc8c verified
raw
history blame
5.05 kB
import os
import shutil
import tempfile

import google.generativeai as genai
import gradio as gr
import markdown
import PyPDF2  # Use a Python library instead of subprocess for PDF processing
from bs4 import BeautifulSoup
from docx import Document
# Set up your API key
def setup_api_key():
    """Configure the genai client with the GOOGLE_API_KEY environment variable.

    The value is passed through unchanged; when the variable is unset,
    ``None`` is handed to ``genai.configure``.
    """
    genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
def upload_file(file_path):
    """Upload a local file with ``genai.upload_file`` and return the result.

    A progress line is printed before the upload and the uploaded file's
    URI is printed once the call returns.
    """
    print("Uploading file...")
    uploaded = genai.upload_file(path=file_path)
    print(f"Completed upload: {uploaded.uri}")
    return uploaded
def to_markdown(text):
    """Render ``text`` to HTML after rewriting '•' bullets as ' *' markers."""
    with_list_markers = text.replace('•', ' *')
    return markdown.markdown(with_list_markers)
def build_model(text_file):
    """Start a gemini-1.5-flash chat session and send it the uploaded file.

    The model is created with a fixed generation config and a system
    instruction telling it to answer from the uploaded file only. The
    session is returned after one summary request containing ``text_file``
    has been sent.
    """
    instruction = (
        "Answer the questions based on the uploaded file.\n"
        "If there is no related info in the file just reply 'I don't know.' "
    )
    model = genai.GenerativeModel(
        model_name="gemini-1.5-flash",
        generation_config={
            "temperature": 0.2,
            "top_p": 0.95,
            "top_k": 64,
            "max_output_tokens": 8192,
            "response_mime_type": "text/plain",
        },
        system_instruction=instruction,
    )
    session = model.start_chat(history=[])
    # The reply itself is not used; presumably the message is sent so the
    # file is part of the session before the real questions arrive.
    session.send_message(["Summarize the doc in one sentence", text_file])
    return session
def chat(chat_session, prompt):
    """Send ``prompt`` through ``chat_session`` and return the reply's text."""
    return chat_session.send_message(prompt).text
def generate_report(chat_session, questions):
    """Ask every question in order and return a Markdown Q&A report.

    The report starts with a "QUESTIONS & ANSWERS" heading; each question
    is then written as a level-2 heading followed by the answer obtained
    from ``chat``.
    """
    sections = ["\n## QUESTIONS & ANSWERS\n"]
    for question in questions:
        sections.append(f"\n## {question}\n")
        sections.append(f"\n{chat(chat_session, question)}\n")
    return "".join(sections)
def convert_markdown_to_html(report_text):
    """Return the HTML produced by rendering ``report_text`` as Markdown."""
    return markdown.markdown(report_text)
def add_html_to_word(html_text, doc):
    """Append the top-level elements of ``html_text`` to the Word document.

    h1-h6 become headings of the matching level, ``ul``/``ol`` items become
    'List Bullet' / 'List Number' paragraphs, and every other tag
    (including ``p``) becomes a plain paragraph holding the element's text.
    Nodes without a tag name are skipped.
    """
    heading_levels = {f"h{level}": level for level in range(1, 7)}
    list_styles = {'ul': 'List Bullet', 'ol': 'List Number'}

    for node in BeautifulSoup(html_text, 'html.parser'):
        tag = node.name
        if not tag:
            # No tag name (e.g. text sitting between elements): nothing to add.
            continue
        if tag in heading_levels:
            doc.add_heading(node.get_text(), level=heading_levels[tag])
        elif tag in list_styles:
            for item in node.find_all('li'):
                doc.add_paragraph(item.get_text(), style=list_styles[tag])
        else:
            # Paragraphs and any other tag are written as plain paragraphs.
            doc.add_paragraph(node.get_text())
def extract_text_from_pdf(pdf_path):
    """Extract the text of a PDF file using PyPDF2.

    Returns one string in which the text of each page, in page order, is
    followed by a newline.
    """
    with open(pdf_path, 'rb') as pdf_stream:
        reader = PyPDF2.PdfReader(pdf_stream)
        return "".join(page.extract_text() + "\n" for page in reader.pages)
def process_pdf(pdf_file, user_questions):
    """Answer the user's questions about a PDF and build HTML and DOCX reports.

    Args:
        pdf_file: Filesystem path of the uploaded PDF.
        user_questions: Questions separated by newlines. Blank lines and
            surrounding whitespace are ignored.

    Returns:
        A tuple ``(html_text, docx_path)``: the report rendered as HTML and
        the path of the generated ``Report_<name>.docx`` file.

    Raises:
        gr.Error: If no PDF was supplied or no non-blank question was given.
    """
    if not pdf_file:
        raise gr.Error("Please upload a PDF file.")

    # Drop blank lines so that no empty prompt is sent to the model.
    questions = [
        line.strip()
        for line in (user_questions or "").splitlines()
        if line.strip()
    ]
    if not questions:
        raise gr.Error("Please enter at least one question.")

    file_name = os.path.basename(pdf_file)

    # Each request works in its own directory: fixed paths under /tmp would
    # let concurrent requests overwrite each other's text and report files.
    work_dir = tempfile.mkdtemp(prefix="chatpdf_")

    # Extract the text directly from the uploaded PDF; no extra copy is needed.
    text = extract_text_from_pdf(pdf_file)

    # Write the extracted text to a file so it can be uploaded.
    text_path = os.path.join(work_dir, "text_file.txt")
    with open(text_path, "w", encoding="utf-8") as f:
        f.write(text)

    text_file = upload_file(text_path)
    chat_session = build_model(text_file)
    report_text = generate_report(chat_session, questions)

    html_text = convert_markdown_to_html(report_text)
    doc = Document()
    add_html_to_word(html_text, doc)

    # splitext swaps only the final extension, whatever its letter case.
    stem, _ = os.path.splitext(file_name)
    doc_path = os.path.join(work_dir, f"Report_{stem}.docx")
    doc.save(doc_path)
    return html_text, doc_path
# Default questions pre-filled in the UI, one per line (user-facing Turkish text).
questions_str = (
    "Makalenin yazarları kimlerdir?\n"
    "Hangi modeller kullanılmıştır?\n"
    "Kaç referans vardır?\n"
    "Hangi yılda yayınlanmıştır?"
)
questions = questions_str.split("\n")
# Input widgets: the PDF to analyse and the questions to ask about it
# (the text area is pre-filled with the default questions).
report_inputs = [
    gr.File(label="Upload PDF", type="filepath"),
    gr.TextArea(
        label="Enter Questions",
        placeholder="Type your questions here, one per line.",
        value=questions_str,
    ),
]

# Output widgets: the rendered HTML report and the generated DOCX file.
report_outputs = [
    gr.HTML(label="HTML Formatted Report"),
    gr.File(label="DOCX File Output", type="binary"),
]

iface = gr.Interface(
    fn=process_pdf,
    inputs=report_inputs,
    outputs=report_outputs,
    title="Pdflerinizden kısa rapor oluşturma arac @YED",
    description="Sorularınızı sormak ve cevap almak için PDF'inizi yükleyin.",
)

# The API key is configured before the server starts handling requests.
setup_api_key()

# Launch configuration recommended for Hugging Face Spaces
iface.launch(server_name="0.0.0.0", server_port=7860)