crosscheck / app.py
ludigija's picture
Update app.py
1cb59c0 verified
raw
history blame
3.61 kB
import base64
import difflib
import html
import os
from io import BytesIO

import streamlit as st
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from xhtml2pdf import pisa
# Load the SBERT model once per server process. Streamlit re-executes this
# script from top to bottom on every widget interaction, so an uncached
# module-level constructor call would rebuild the model on each rerun.
# show_spinner=False keeps the cached call from rendering any UI element
# before st.set_page_config() runs further down the script.
@st.cache_resource(show_spinner=False)
def _load_sbert_model():
    """Return the shared 'all-MiniLM-L6-v2' SentenceTransformer instance."""
    return SentenceTransformer('all-MiniLM-L6-v2')


sbert_model = _load_sbert_model()
def compute_sbert_similarity(text1, text2):
    """Return the cosine similarity of the SBERT embeddings of two texts.

    Each text is encoded on its own with the module-level ``sbert_model``;
    the two resulting vectors are then compared with ``cosine_similarity``.

    Args:
        text1: First document as a string.
        text2: Second document as a string.

    Returns:
        The single scalar taken from the 1x1 cosine-similarity matrix.
    """
    # Encode one document per call (each call gets a one-element batch) and
    # keep the only row that comes back.
    first_vec, second_vec = [
        sbert_model.encode([document])[0] for document in (text1, text2)
    ]
    similarity_matrix = cosine_similarity([first_vec], [second_vec])
    return similarity_matrix[0][0]
def compute_tfidf_similarity(text1, text2):
    """Return the cosine similarity of the TF-IDF vectors of two texts.

    A fresh ``TfidfVectorizer`` is fitted on just these two documents, so the
    vocabulary and IDF weights come only from the pair being compared.

    Args:
        text1: First document as a string.
        text2: Second document as a string.

    Returns:
        The single scalar taken from the 1x1 cosine-similarity matrix.
    """
    corpus = [text1, text2]
    tfidf = TfidfVectorizer()
    tfidf.fit(corpus)
    doc_matrix = tfidf.transform(corpus)
    # Slice (rather than index) so each operand stays a 2-D, one-row matrix.
    first_row = doc_matrix[0:1]
    second_row = doc_matrix[1:2]
    return cosine_similarity(first_row, second_row)[0][0]
def html_diff(a, b):
    """Build a side-by-side HTML diff table for two texts.

    Both inputs are split into lines and passed to
    ``difflib.HtmlDiff.make_table`` in context mode with two lines of
    context around each change. The columns are titled 'Original' and
    'Modified'.

    Args:
        a: Original text as a string.
        b: Modified text as a string.

    Returns:
        The HTML ``<table>`` markup produced by ``make_table`` (a table
        fragment, not a complete HTML document).
    """
    original_lines = a.splitlines()
    modified_lines = b.splitlines()
    table_builder = difflib.HtmlDiff()
    return table_builder.make_table(
        original_lines,
        modified_lines,
        fromdesc='Original',
        todesc='Modified',
        context=True,
        numlines=2,
    )
def convert_html_to_pdf(source_html):
    """Render an HTML string to PDF bytes with xhtml2pdf.

    Args:
        source_html: HTML markup to convert.

    Returns:
        The generated PDF as ``bytes``, or ``None`` when the status object
        returned by ``pisa.CreatePDF`` reports an error.
    """
    buffer = BytesIO()
    outcome = pisa.CreatePDF(source_html, dest=buffer)
    return None if outcome.err else buffer.getvalue()
def create_download_link(pdf_data, filename="report.pdf"):
    """Return an HTML anchor that downloads ``pdf_data`` as a PDF file.

    The PDF is embedded in the link itself as a base64 ``data:`` URI.

    Args:
        pdf_data: Raw PDF content as bytes.
        filename: Name offered to the browser for the saved file.

    Returns:
        An ``<a>`` element string with the link text 'Download PDF Report'.
    """
    b64 = base64.b64encode(pdf_data).decode()
    # Escape the filename so characters such as '"', '<' or '&' cannot break
    # out of the attribute; the anchor is rendered as raw HTML by the caller.
    safe_filename = html.escape(filename, quote=True)
    return (
        f'<a href="data:application/pdf;base64,{b64}" '
        f'download="{safe_filename}">Download PDF Report</a>'
    )
# ---------------------------------------------------------------------------
# Streamlit page: upload two text documents, show their SBERT and TF-IDF
# similarity scores, render a side-by-side diff, and optionally export that
# diff as a PDF.
# ---------------------------------------------------------------------------
st.set_page_config(layout="wide")
st.title("Advanced Document Comparison Tool with Semantic and Syntactic Analysis")

col1, col2 = st.columns(2)
with col1:
    uploaded_file1 = st.file_uploader("Upload Original Document", type=["txt", "md"])
with col2:
    uploaded_file2 = st.file_uploader("Upload Modified Document", type=["txt", "md"])

if uploaded_file1 and uploaded_file2:
    try:
        # getvalue() returns the whole upload regardless of the stream's
        # current read position. "utf-8-sig" decodes plain UTF-8 as well and
        # drops a leading BOM, which would otherwise show up as a spurious
        # difference on the first line.
        original_text = uploaded_file1.getvalue().decode("utf-8-sig")
        modified_text = uploaded_file2.getvalue().decode("utf-8-sig")
    except UnicodeDecodeError:
        # Report unreadable input in the page instead of crashing the script.
        st.error("❌ Could not decode the uploaded files. Please upload UTF-8 encoded text.")
        st.stop()

    sbert_score = compute_sbert_similarity(original_text, modified_text)
    tfidf_score = compute_tfidf_similarity(original_text, modified_text)
    html_comparison = html_diff(original_text, modified_text)

    st.markdown("### 🔍 Similarity Scores")
    st.markdown(f"**SBERT Semantic Similarity:** {sbert_score:.4f}")
    st.markdown(f"**TF-IDF Syntactic Similarity:** {tfidf_score:.4f}")
    st.markdown("### 📑 Comparison Result")

    # html_diff() returns only the <table> fragment from HtmlDiff.make_table,
    # which does not carry difflib's stylesheet. The diff_* rules below supply
    # the highlight colours for added / changed / removed text; without them
    # the changes render unmarked.
    # The former iframe CSS rule and scroll-sync <script> were dropped: the
    # report body contains only the diff table and no <iframe> elements, so
    # that code could never take effect.
    html_report = f'''
<html>
<head>
<style>
.diff {{ font-family: Courier; border: 1px solid #ccc; overflow-x: scroll; }}
.diff th, .diff td {{ padding: 5px; }}
.diff_header {{ background-color: #e0e0e0; }}
td.diff_header {{ text-align: right; }}
.diff_next {{ background-color: #c0c0c0; }}
.diff_add {{ background-color: #aaffaa; }}
.diff_chg {{ background-color: #ffff77; }}
.diff_sub {{ background-color: #ffaaaa; }}
</style>
</head>
<body>
{html_comparison}
</body>
</html>
'''
    st.components.v1.html(html_report, height=700, scrolling=True)

    if st.button("Generate PDF Report"):
        pdf_bytes = convert_html_to_pdf(html_report)
        if pdf_bytes:
            st.markdown(create_download_link(pdf_bytes), unsafe_allow_html=True)
        else:
            st.error("❌ Failed to generate PDF. Check for HTML formatting issues.")