Spaces:

ludigija
/

crosscheck

Sleeping

File size: 3,605 Bytes

import base64
import difflib
import html
import os
from io import BytesIO

import streamlit as st
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from xhtml2pdf import pisa

# Load the SBERT model once and share it across reruns. Streamlit re-executes
# this script from top to bottom on every widget interaction, so without the
# resource cache the model would be re-instantiated each time.
# show_spinner=False keeps this call from emitting any UI element before
# st.set_page_config() runs further down the script.
@st.cache_resource(show_spinner=False)
def _load_sbert_model():
    """Instantiate the ``all-MiniLM-L6-v2`` SentenceTransformer model."""
    return SentenceTransformer('all-MiniLM-L6-v2')


sbert_model = _load_sbert_model()

def compute_sbert_similarity(text1, text2):
    """Return the cosine similarity between the SBERT embeddings of two texts.

    Each text is encoded on its own with the module-level ``sbert_model`` and
    the two resulting vectors are compared with ``cosine_similarity``.

    Args:
        text1: First document as a string.
        text2: Second document as a string.

    Returns:
        The single similarity value taken from the 1x1 similarity matrix.
    """
    # Encode one document per call (in argument order) and unwrap the single
    # embedding from each one-element batch.
    first_vec, second_vec = (
        sbert_model.encode([document])[0] for document in (text1, text2)
    )
    similarity_matrix = cosine_similarity([first_vec], [second_vec])
    return similarity_matrix[0][0]

def compute_tfidf_similarity(text1, text2):
    """Return the cosine similarity between the TF-IDF vectors of two texts.

    The vocabulary is learned from the two texts themselves, so the score only
    reflects the terms they contain.

    Args:
        text1: First document as a string.
        text2: Second document as a string.

    Returns:
        The similarity value from the 1x1 similarity matrix, or ``0.0`` when
        neither text yields any token for the vectorizer to learn from.
    """
    try:
        # fit_transform learns the vocabulary and vectorizes in one pass.
        vectors = TfidfVectorizer().fit_transform([text1, text2])
    except ValueError:
        # The vectorizer raises ValueError ("empty vocabulary") when both
        # texts are empty or contain nothing it tokenizes. Report no overlap
        # instead of crashing; this matches what cosine_similarity gives for
        # an all-zero vector.
        return 0.0
    return cosine_similarity(vectors[0:1], vectors[1:2])[0][0]

def html_diff(a, b):
    """Render a side-by-side HTML diff table for two multi-line strings.

    Args:
        a: Text shown in the left-hand ("Original") column.
        b: Text shown in the right-hand ("Modified") column.

    Returns:
        The HTML ``<table>`` markup produced by ``difflib.HtmlDiff.make_table``
        in context mode, with two lines of context around each change.
    """
    original_lines = a.splitlines()
    modified_lines = b.splitlines()
    table_options = {
        "fromdesc": 'Original',
        "todesc": 'Modified',
        "context": True,
        "numlines": 2,
    }
    return difflib.HtmlDiff().make_table(
        original_lines, modified_lines, **table_options
    )

def convert_html_to_pdf(source_html):
    """Render an HTML string to PDF bytes with xhtml2pdf.

    Args:
        source_html: HTML markup to convert.

    Returns:
        The generated PDF as ``bytes``, or ``None`` when the conversion
        status reports an error.
    """
    buffer = BytesIO()
    outcome = pisa.CreatePDF(source_html, dest=buffer)
    # Only read the buffer back when pisa reported no error.
    return None if outcome.err else buffer.getvalue()

def create_download_link(pdf_data, filename="report.pdf"):
    """Build an HTML anchor that downloads the PDF from an inline data URI.

    Args:
        pdf_data: Raw PDF content as ``bytes``.
        filename: Name suggested to the browser through the ``download``
            attribute.

    Returns:
        An ``<a>`` element string whose ``href`` is a base64 ``data:`` URI
        holding ``pdf_data``.
    """
    encoded_pdf = base64.b64encode(pdf_data).decode("ascii")
    # Escape the filename so a quote or angle bracket in it cannot terminate
    # the attribute and inject markup into the page.
    safe_name = html.escape(filename, quote=True)
    return (
        f'<a href="data:application/pdf;base64,{encoded_pdf}" '
        f'download="{safe_name}">Download PDF Report</a>'
    )

# ---------------------------------------------------------------------------
# Streamlit page: upload two documents, show similarity scores and an HTML
# diff, and optionally export the diff as a PDF.
# ---------------------------------------------------------------------------
st.set_page_config(layout="wide")
st.title("Advanced Document Comparison Tool with Semantic and Syntactic Analysis")

# Two uploaders side by side: original on the left, modified on the right.
col1, col2 = st.columns(2)

with col1:
    uploaded_file1 = st.file_uploader("Upload Original Document", type=["txt", "md"])
with col2:
    uploaded_file2 = st.file_uploader("Upload Modified Document", type=["txt", "md"])

if uploaded_file1 is not None and uploaded_file2 is not None:
    # getvalue() returns the whole buffer regardless of the current read
    # position, so a script rerun (e.g. after clicking the PDF button) still
    # sees the full content; read() would return b"" once the stream has been
    # consumed. errors="replace" keeps a non-UTF-8 upload from aborting the
    # app with UnicodeDecodeError; undecodable bytes become U+FFFD.
    original_text = uploaded_file1.getvalue().decode("utf-8", errors="replace")
    modified_text = uploaded_file2.getvalue().decode("utf-8", errors="replace")

    sbert_score = compute_sbert_similarity(original_text, modified_text)
    tfidf_score = compute_tfidf_similarity(original_text, modified_text)

    html_comparison = html_diff(original_text, modified_text)

    st.markdown("### 🔍 Similarity Scores")
    st.markdown(f"**SBERT Semantic Similarity:** {sbert_score:.4f}")
    st.markdown(f"**TF-IDF Syntactic Similarity:** {tfidf_score:.4f}")

    st.markdown("### 📑 Comparison Result")

    # Standalone HTML document wrapping the diff table; used both for the
    # on-page preview and as the PDF source.
    # NOTE(review): the embedded script synchronises scrolling between two
    # <iframe> elements, but this template does not add any iframes itself,
    # so the script looks inert - confirm before relying on it.
    html_report = f'''
    <html>
    <head>
    <style>
    .diff {{ font-family: Courier; border: 1px solid #ccc; overflow-x: scroll; }}
    .diff th, .diff td {{ padding: 5px; }}
    iframe {{ width: 100%; height: 600px; border: none; }}
    </style>
    <script>
    window.addEventListener("DOMContentLoaded", () => {{
        const iframes = document.querySelectorAll("iframe");
        if (iframes.length === 2) {{
            const syncScroll = (e) => {{
                iframes.forEach((frame) => {{
                    if (frame !== e.target) {{
                        frame.contentWindow.scrollTo(0, e.target.scrollTop);
                    }}
                }});
            }};
            iframes.forEach((iframe) => {{
                iframe.contentWindow.onscroll = syncScroll;
            }});
        }}
    }});
    </script>
    </head>
    <body>
    {html_comparison}
    </body>
    </html>
    '''

    st.components.v1.html(html_report, height=700, scrolling=True)

    if st.button("Generate PDF Report"):
        pdf_bytes = convert_html_to_pdf(html_report)
        if pdf_bytes:
            # The link embeds the PDF as a data URI, so it must be rendered
            # as raw HTML.
            st.markdown(create_download_link(pdf_bytes), unsafe_allow_html=True)
        else:
            st.error("❌ Failed to generate PDF. Check for HTML formatting issues.")