# Page residue from the Hugging Face Spaces file viewer (not part of the program):
# Space status: Sleeping; file size: 7,067 bytes.
import os
import gradio as gr
from transformers import pipeline
import spacy
import lib.read_pdf
# Initialize spaCy model
# Loaded once at import time; the resulting `nlp` pipeline is shared by
# split_in_sentences(), which reads `doc.sents` from it.
nlp = spacy.load('en_core_web_sm')
# Add the 'sentencizer' component to the loaded pipeline.
# NOTE(review): how this interacts with the sentence boundaries the loaded
# model may already produce depends on the spaCy version - confirm.
nlp.add_pipe('sentencizer')
def split_in_sentences(text):
    """Split ``text`` into sentences with the module-level spaCy pipeline.

    Args:
        text: The string to segment.

    Returns:
        A list with one string per sentence reported by ``nlp(text).sents``,
        each with leading and trailing whitespace removed.
    """
    sentences = []
    for span in nlp(text).sents:
        sentences.append(str(span).strip())
    return sentences
def make_spans(text, results):
    """Pair every sentence of ``text`` with the label of the matching result.

    Args:
        text: The text the results were computed for; it is split again
            into sentences here via ``split_in_sentences``.
        results: An iterable of mappings, each providing a ``'label'`` key.

    Returns:
        A list of ``(sentence, label)`` tuples. Pairing is positional and
        stops at the shorter of the two sequences.
    """
    # Collect the labels first so a malformed result fails before any
    # sentence splitting is done.
    labels = []
    for entry in results:
        labels.append(entry['label'])

    sentences = split_in_sentences(text)
    return [(sentence, label) for sentence, label in zip(sentences, labels)]
# Initialize pipelines
# Both `transformers` pipelines are built once at import time and shared by
# the helper functions in this module.
# `summarizer` is called by summarize_text().
summarizer = pipeline("summarization", model="human-centered-summarization/financial-summarization-pegasus")
# `fin_model` is called by text_to_sentiment() (whole text) and fin_ext()
# (list of sentences); only the 'label' field of its results is used.
fin_model = pipeline("sentiment-analysis", model='yiyanghkust/finbert-tone', tokenizer='yiyanghkust/finbert-tone')
def summarize_text(text):
    """Summarize ``text`` with the module-level ``summarizer`` pipeline.

    Args:
        text: The text to summarize.

    Returns:
        The ``'summary_text'`` value of the first result the pipeline
        returns.
    """
    return summarizer(text)[0]['summary_text']
def text_to_sentiment(text):
    """Classify ``text`` as a whole with the module-level ``fin_model``.

    Args:
        text: The text to classify.

    Returns:
        The ``"label"`` value of the first result the pipeline returns.
    """
    predictions = fin_model(text)
    top_prediction = predictions[0]
    return top_prediction["label"]
def fin_ext(text):
    """Classify ``text`` sentence by sentence with ``fin_model``.

    Args:
        text: The text to analyze.

    Returns:
        The ``(sentence, label)`` pairs built by ``make_spans`` from the
        per-sentence results.
    """
    sentences = split_in_sentences(text)
    per_sentence_results = fin_model(sentences)
    # make_spans splits ``text`` again itself, so it is handed the raw text.
    return make_spans(text, per_sentence_results)
def extract_and_summarize(pdf1, pdf2):
    """Extract the paragraphs of interest from two PDFs in ``PDF_FOLDER``.

    Despite the name, nothing is summarized here: each file is run through
    the ``lib.read_pdf`` helpers (extract, locate a text range, cut it out,
    split into paragraphs) and the resulting paragraphs are returned.

    Args:
        pdf1: File name of the first PDF, relative to ``PDF_FOLDER``.
        pdf2: File name of the second PDF, relative to ``PDF_FOLDER``.

    Returns:
        A ``(paragraphs_1, paragraphs_2)`` tuple with the value returned by
        ``split_text_into_paragraphs`` for each file, or ``([], [])`` when
        either file name is missing or empty.
    """
    if not (pdf1 and pdf2):
        return [], []

    reader = lib.read_pdf
    # Range markers handed to find_text_range: one start keyword and the
    # list of candidate end keywords.
    section_start = "Main risks to"
    section_ends = ["4. Appendix", "Annex:", "4. Annex", "Detailed tables", "ACKNOWLEDGEMENTS", "STATISTICAL ANNEX", "PROSPECTS BY MEMBER STATES"]

    # Each stage handles the first PDF and then the second, which keeps the
    # helper calls in the same order as processing the files side by side.
    paths = [os.path.join(PDF_FOLDER, name) for name in (pdf1, pdf2)]

    # Extract and format paragraphs
    documents = [reader.extract_and_format_paragraphs(path) for path in paths]
    bounds = [
        reader.find_text_range(document, section_start, section_ends)
        for document in documents
    ]
    sections = [
        reader.extract_relevant_text(document, first, last)
        for document, (first, last) in zip(documents, bounds)
    ]
    first_paragraphs, second_paragraphs = [
        reader.split_text_into_paragraphs(section, 0) for section in sections
    ]
    return first_paragraphs, second_paragraphs
# Gradio interface setup
# Folder that holds the selectable PDF files. It is listed by
# get_pdf_files() to fill the dropdowns and joined with the chosen file name
# in extract_and_summarize(). The path is relative, so it is resolved
# against the process working directory.
PDF_FOLDER = "data"
def get_pdf_files(folder):
    """Return the names of the PDF files found directly inside ``folder``.

    The extension is matched case-insensitively (``report.PDF`` counts) and
    the names are sorted, because ``os.listdir`` returns entries in
    arbitrary order and the result feeds a user-facing dropdown.

    Args:
        folder: Path of the directory to list.

    Returns:
        A sorted list of entry names (not full paths) ending in ``.pdf``.

    Raises:
        OSError: Propagated from ``os.listdir`` (for example
            ``FileNotFoundError`` when ``folder`` does not exist).
    """
    return sorted(
        name for name in os.listdir(folder)
        if name.lower().endswith('.pdf')
    )
# Paragraphs most recently extracted for PDF 1 and PDF 2. Both names are
# rebound (via `global`) by update_paragraphs() and indexed by
# process_paragraph_1() / process_paragraph_2() to resolve a dropdown choice.
# NOTE(review): these are module-level globals, so the state is not kept
# per user session - confirm that is acceptable for this app.
stored_paragraphs_1 = []
stored_paragraphs_2 = []
def _lookup_paragraph(choice, paragraphs):
    """Resolve a dropdown label back to the paragraph text it stands for.

    Labels are produced by ``update_paragraphs`` in the form
    ``"Paragraph <n>: <preview>..."`` with ``n`` starting at 1.

    Args:
        choice: The selected dropdown label, or ``None`` / ``""`` when
            nothing is selected.
        paragraphs: The list of paragraphs the label refers to.

    Returns:
        The matching entry of ``paragraphs``, or ``None`` when nothing is
        selected, the label cannot be parsed, or the number is out of range.
    """
    if not choice:
        # An empty dropdown yields None; calling .split on it would raise
        # an AttributeError that the callers never caught.
        return None
    try:
        index = int(choice.split(':')[0].replace('Paragraph ', '')) - 1
    except ValueError:
        return None
    if 0 <= index < len(paragraphs):
        return paragraphs[index]
    return None


def _process_paragraph(choice, paragraphs):
    """Run every analysis on the selected paragraph.

    Returns:
        ``(paragraph, summary, sentiment, fin_spans)`` for a valid
        selection, otherwise ``("Invalid selection", "Error", "Error", [])``.
    """
    text = _lookup_paragraph(choice, paragraphs)
    if text is None:
        return "Invalid selection", "Error", "Error", []
    return text, summarize_text(text), text_to_sentiment(text), fin_ext(text)


def _paragraph_handler(get_paragraphs, analyze, fallback):
    """Build an event handler that runs one analysis on the selection.

    Each button only needs one result, so the handler calls just
    ``analyze`` instead of running all three models and discarding two.

    Args:
        get_paragraphs: Zero-argument callable returning the current
            paragraph list; called on every event so that lists rebound by
            ``update_paragraphs`` are picked up.
        analyze: Callable applied to the selected paragraph text.
        fallback: Value returned when the selection is missing or invalid.
    """
    def handler(choice):
        text = _lookup_paragraph(choice, get_paragraphs())
        return fallback if text is None else analyze(text)
    return handler


def _paragraphs_1():
    """Return the paragraphs currently stored for PDF 1."""
    return stored_paragraphs_1


def _paragraphs_2():
    """Return the paragraphs currently stored for PDF 2."""
    return stored_paragraphs_2


def update_paragraphs(pdf1, pdf2):
    """Extract paragraphs from both PDFs and refresh both paragraph dropdowns.

    Args:
        pdf1: File name selected for PDF 1 (may be ``None``).
        pdf2: File name selected for PDF 2 (may be ``None``).

    Returns:
        Two ``gr.update`` objects, one per paragraph dropdown, carrying the
        new choices. The current selection is cleared because a label left
        over from a previous extraction would index into the new list.
    """
    global stored_paragraphs_1, stored_paragraphs_2
    stored_paragraphs_1, stored_paragraphs_2 = extract_and_summarize(pdf1, pdf2)
    updated_dropdown_1 = [f"Paragraph {i+1}: {p[:100]}..." for i, p in enumerate(stored_paragraphs_1)]
    updated_dropdown_2 = [f"Paragraph {i+1}: {p[:100]}..." for i, p in enumerate(stored_paragraphs_2)]
    return (
        gr.update(choices=updated_dropdown_1, value=None),
        gr.update(choices=updated_dropdown_2, value=None),
    )


def process_paragraph_1(paragraph):
    """Analyze the paragraph of PDF 1 selected by the label ``paragraph``.

    Returns ``(paragraph, summary, sentiment, fin_spans)``, or
    ``("Invalid selection", "Error", "Error", [])`` for an unusable label.
    """
    return _process_paragraph(paragraph, stored_paragraphs_1)


def process_paragraph_2(paragraph):
    """Analyze the paragraph of PDF 2 selected by the label ``paragraph``.

    Returns ``(paragraph, summary, sentiment, fin_spans)``, or
    ``("Invalid selection", "Error", "Error", [])`` for an unusable label.
    """
    return _process_paragraph(paragraph, stored_paragraphs_2)


with gr.Blocks() as demo:
    gr.Markdown("## Financial Report Paragraph Selection and Analysis")

    with gr.Row():
        # Layout for PDF 1 and PDF 2 side by side
        with gr.Column():
            gr.Markdown("### PDF 1 Analysis")
            pdf1 = gr.Dropdown(choices=get_pdf_files(PDF_FOLDER), label="Select PDF 1")
            b1 = gr.Button("Extract and Display Paragraphs")
            paragraph_1_dropdown = gr.Dropdown(label="Select Paragraph from PDF 1")
            selected_paragraph_1 = gr.Textbox(label="Selected Paragraph 1 Content", lines=4)
            summary_textbox_1 = gr.Textbox(label="Summary for PDF 1", lines=4)
            sentiment_textbox_1 = gr.Textbox(label="Classification for PDF 1", lines=4)
            fin_spans_1 = gr.HighlightedText(label="Financial Tone Analysis for PDF 1")
            summarize_btn1 = gr.Button("Summarize Text from PDF 1")
            sentiment_btn1 = gr.Button("Classify Financial Tone from PDF 1")
            analyze_btn1 = gr.Button("Analyze Financial Tone and FLS")

        with gr.Column():
            gr.Markdown("### PDF 2 Analysis")
            pdf2 = gr.Dropdown(choices=get_pdf_files(PDF_FOLDER), label="Select PDF 2")
            b2 = gr.Button("Extract and Display Paragraphs")
            paragraph_2_dropdown = gr.Dropdown(label="Select Paragraph from PDF 2")
            selected_paragraph_2 = gr.Textbox(label="Selected Paragraph 2 Content", lines=4)
            summary_textbox_2 = gr.Textbox(label="Summary for PDF 2", lines=4)
            sentiment_textbox_2 = gr.Textbox(label="Classification for PDF 2", lines=4)
            fin_spans_2 = gr.HighlightedText(label="Financial Tone Analysis for PDF 2")
            summarize_btn2 = gr.Button("Summarize Text from PDF 2")
            sentiment_btn2 = gr.Button("Classify Financial Tone from PDF 2")
            analyze_btn2 = gr.Button("Analyze Financial Tone and FLS")

    # Event wiring happens only after BOTH columns exist: the extract buttons
    # reference components from both columns, and those names must already be
    # bound when .click() is evaluated (wiring b1 inside the first column
    # raised NameError for pdf2 / paragraph_2_dropdown).
    for extract_button in (b1, b2):
        extract_button.click(
            fn=update_paragraphs,
            inputs=[pdf1, pdf2],
            outputs=[paragraph_1_dropdown, paragraph_2_dropdown],
        )

    # Show the full text of the chosen paragraph; blank when nothing is chosen.
    paragraph_1_dropdown.change(
        fn=_paragraph_handler(_paragraphs_1, lambda text: text, ""),
        inputs=paragraph_1_dropdown,
        outputs=selected_paragraph_1,
    )
    paragraph_2_dropdown.change(
        fn=_paragraph_handler(_paragraphs_2, lambda text: text, ""),
        inputs=paragraph_2_dropdown,
        outputs=selected_paragraph_2,
    )

    # PDF 1 analysis buttons.
    summarize_btn1.click(
        fn=_paragraph_handler(_paragraphs_1, summarize_text, "Error"),
        inputs=paragraph_1_dropdown,
        outputs=summary_textbox_1,
    )
    sentiment_btn1.click(
        fn=_paragraph_handler(_paragraphs_1, text_to_sentiment, "Error"),
        inputs=paragraph_1_dropdown,
        outputs=sentiment_textbox_1,
    )
    analyze_btn1.click(
        fn=_paragraph_handler(_paragraphs_1, fin_ext, []),
        inputs=paragraph_1_dropdown,
        outputs=fin_spans_1,
    )

    # PDF 2 analysis buttons.
    summarize_btn2.click(
        fn=_paragraph_handler(_paragraphs_2, summarize_text, "Error"),
        inputs=paragraph_2_dropdown,
        outputs=summary_textbox_2,
    )
    sentiment_btn2.click(
        fn=_paragraph_handler(_paragraphs_2, text_to_sentiment, "Error"),
        inputs=paragraph_2_dropdown,
        outputs=sentiment_textbox_2,
    )
    analyze_btn2.click(
        fn=_paragraph_handler(_paragraphs_2, fin_ext, []),
        inputs=paragraph_2_dropdown,
        outputs=fin_spans_2,
    )

demo.launch()
|