Spaces:

yunuseduran
/

chatpdf

Sleeping

App Files Files Community

yunuseduran committed on Aug 6, 2024

Commit

78fae79

verified ·

1 Parent(s): cfa664f

Create app.py

Browse files

Files changed (1) hide show

app.py +138 -0

app.py ADDED Viewed

	@@ -0,0 +1,138 @@

+import os
+import shutil
+import subprocess
+import tempfile
+
+import gradio as gr
+import google.generativeai as genai
+import markdown
+from bs4 import BeautifulSoup
+from docx import Document
+def setup_api_key():
+    google_api_key = os.getenv("GOOGLE_API_KEY")
+    genai.configure(api_key=google_api_key)
+def upload_file(file_path):
+    print(f"Uploading file...")
+    text_file = genai.upload_file(path=file_path)
+    print(f"Completed upload: {text_file.uri}")
+    return text_file
+def to_markdown(text):
+    text = text.replace('•', '  *')
+    return markdown.markdown(text)
+def build_model(text_file):
+    generation_config = {
+        "temperature": 0.2,
+        "top_p": 0.95,
+        "top_k": 64,
+        "max_output_tokens": 8192,
+        "response_mime_type": "text/plain",
+    }
+    model = genai.GenerativeModel(
+        model_name="gemini-1.5-flash",
+        generation_config=generation_config,
+        system_instruction="""Answer the questions based on the uploaded file.
+        If there is no related info in the file just reply 'I don't know.' """,
+    )
+    chat_session = model.start_chat(history=[])
+    response = chat_session.send_message(["Summarize the doc in one sentence", text_file])
+    return chat_session
+def chat(chat_session, prompt):
+    response = chat_session.send_message(prompt)
+    return response.text
+def generate_report(chat_session, questions):
+    report_text = ""
+    report_text += f"\n## QUESTIONS & ANSWERS\n"
+    for question in questions:
+        report_text += f"\n## {question}\n"
+        answer = chat(chat_session, question)
+        report_text += f"\n{answer}\n"
+    return report_text
+def convert_markdown_to_html(report_text):
+    html_text = markdown.markdown(report_text)
+    return html_text
+def add_html_to_word(html_text, doc):
+    soup = BeautifulSoup(html_text, 'html.parser')
+    for element in soup:
+        if element.name == 'h1':
+            doc.add_heading(element.get_text(), level=1)
+        elif element.name == 'h2':
+            doc.add_heading(element.get_text(), level=2)
+        elif element.name == 'h3':
+            doc.add_heading(element.get_text(), level=3)
+        elif element.name == 'h4':
+            doc.add_heading(element.get_text(), level=4)
+        elif element.name == 'h5':
+            doc.add_heading(element.get_text(), level=5)
+        elif element.name == 'h6':
+            doc.add_heading(element.get_text(), level=6)
+        elif element.name == 'p':
+            doc.add_paragraph(element.get_text())
+        elif element.name == 'ul':
+            for li in element.find_all('li'):
+                doc.add_paragraph(li.get_text(), style='List Bullet')
+        elif element.name == 'ol':
+            for li in element.find_all('li'):
+                doc.add_paragraph(li.get_text(), style='List Number')
+        elif element.name:
+            doc.add_paragraph(element.get_text())  # For any other tags
+def process_pdf(pdf_file, user_questions):
+    file_name = pdf_file.name.split('/')[-1]
+    saved_file_path = f"/tmp/{file_name}"
+    shutil.copyfile(pdf_file.name, saved_file_path)
+    subprocess.run(["apt-get", "update"])
+    subprocess.run(["apt-get", "install", "-y", "poppler-utils"])
+    subprocess.run(["pdftotext", saved_file_path, "/tmp/text_file.txt"])
+    text_file = upload_file("/tmp/text_file.txt")
+    chat_session = build_model(text_file)
+    questions = user_questions.strip().split('\n')
+    report_text = generate_report(chat_session, questions)
+    doc = Document()
+    html_text = convert_markdown_to_html(report_text)
+    add_html_to_word(html_text, doc)
+    doc_name = file_name.replace(".pdf", ".docx")
+    doc_name = "Report_" + doc_name
+    doc.save(f"/tmp/{doc_name}")
+    return html_text, f"/tmp/{doc_name}"
+questions = [
+    "Who are the authors of the article?",
+    "What models were used?",
+    "How many references are there?",
+    "In what year was it published?"
+]
+questions_str = "\n".join(questions)
+iface = gr.Interface(
+    fn=process_pdf,
+    inputs=[
+        gr.File(label="Upload PDF", type="file"),
+        gr.TextArea(label="Enter Questions", placeholder="Type your questions here, one per line.", value=questions_str)
+    ],
+    outputs=[
+        gr.HTML(label="HTML Formatted Report"),
+        gr.File(label="DOCX File Output", type="file")
+    ],
+    title="REPORT GENERATOR: ASK YOUR QUESTIONS TO A PDF FILE @YED",
+    description="Upload a PDF to ask questions and get the answers."
+)
+setup_api_key()
+iface.launch()