# Spaces: Running (hosting-page status text captured along with the source; not part of the program)
import streamlit as st | |
import difflib | |
from sentence_transformers import SentenceTransformer | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
from sklearn.metrics.pairwise import cosine_similarity | |
from xhtml2pdf import pisa | |
import base64 | |
import os | |
from io import BytesIO | |
@st.cache_resource(show_spinner=False)
def _load_sbert_model():
    """Build the SBERT sentence-embedding model once per server process.

    Streamlit re-executes this script from the top on every widget
    interaction; caching the model as a resource avoids re-creating it on
    each rerun. The spinner is disabled so that no Streamlit element is
    emitted before ``st.set_page_config`` runs further down the script.
    """
    return SentenceTransformer('all-MiniLM-L6-v2')


# Shared SBERT model used by compute_sbert_similarity.
sbert_model = _load_sbert_model()
def compute_sbert_similarity(text1, text2):
    """Return the cosine similarity between the SBERT embeddings of two texts.

    Each text is encoded on its own with the module-level ``sbert_model``
    and the two resulting vectors are compared with ``cosine_similarity``.
    """
    first, second = [sbert_model.encode([text])[0] for text in (text1, text2)]
    similarity_matrix = cosine_similarity([first], [second])
    # One row against one row gives a 1x1 matrix; return its only entry.
    return similarity_matrix[0][0]
def compute_tfidf_similarity(text1, text2):
    """Return the cosine similarity between the TF-IDF vectors of two texts.

    The vectorizer is fitted on exactly these two documents, so the score
    reflects how much weighted vocabulary they share.

    Returns 0.0 when the vectorizer cannot build a vocabulary from the two
    texts (for example when both are empty), instead of letting the
    ``ValueError`` raised by scikit-learn propagate to the caller.
    """
    try:
        vectors = TfidfVectorizer().fit_transform([text1, text2])
    except ValueError:
        # Raised by TfidfVectorizer for an empty vocabulary. With no terms
        # at all there is nothing in common, which matches the 0.0 that
        # cosine_similarity reports for all-zero vectors.
        return 0.0
    return cosine_similarity(vectors[0:1], vectors[1:2])[0][0]
def html_diff(a, b):
    """Return an HTML table showing the line-level differences of two texts.

    ``a`` is shown in the column labelled "Original" and ``b`` in the column
    labelled "Modified". Only changed regions are rendered, each with two
    lines of surrounding context.
    """
    original_lines = a.splitlines()
    modified_lines = b.splitlines()
    return difflib.HtmlDiff().make_table(
        original_lines,
        modified_lines,
        fromdesc='Original',
        todesc='Modified',
        context=True,
        numlines=2,
    )
def convert_html_to_pdf(source_html):
    """Render an HTML string to PDF with xhtml2pdf's ``pisa.CreatePDF``.

    Returns the PDF content as bytes, or ``None`` when the conversion status
    reports an error.
    """
    buffer = BytesIO()
    status = pisa.CreatePDF(source_html, dest=buffer)
    return None if status.err else buffer.getvalue()
def create_download_link(pdf_data, filename="report.pdf"):
    """Return an HTML anchor that downloads ``pdf_data`` as ``filename``.

    The PDF bytes are embedded directly in the link as a base64 ``data:``
    URI, so no file is written to disk.
    """
    encoded = base64.b64encode(pdf_data).decode()
    return (
        f'<a href="data:application/pdf;base64,{encoded}" '
        f'download="{filename}">Download PDF Report</a>'
    )
# --- Streamlit page: upload two documents, score them, show and export a diff ---
st.set_page_config(layout="wide")
st.title("Advanced Document Comparison Tool with Semantic and Syntactic Analysis")

# Two side-by-side uploaders: original on the left, modified on the right.
col1, col2 = st.columns(2)
with col1:
    uploaded_file1 = st.file_uploader("Upload Original Document", type=["txt", "md"])
with col2:
    uploaded_file2 = st.file_uploader("Upload Modified Document", type=["txt", "md"])

if uploaded_file1 and uploaded_file2:
    # getvalue() returns the whole upload regardless of the stream position,
    # so a rerun of the script (e.g. after pressing the PDF button) cannot
    # see an already-consumed, empty stream. Undecodable bytes are replaced
    # rather than aborting the page on a non-UTF-8 upload.
    original_text = uploaded_file1.getvalue().decode("utf-8", errors="replace")
    modified_text = uploaded_file2.getvalue().decode("utf-8", errors="replace")

    sbert_score = compute_sbert_similarity(original_text, modified_text)
    tfidf_score = compute_tfidf_similarity(original_text, modified_text)
    html_comparison = html_diff(original_text, modified_text)

    # NOTE(review): the leading "π" in the two headings below and the "β" in
    # the error message further down look like mis-encoded emoji; they are
    # left untouched here — confirm the intended characters.
    st.markdown("### π Similarity Scores")
    st.markdown(f"**SBERT Semantic Similarity:** {sbert_score:.4f}")
    st.markdown(f"**TF-IDF Syntactic Similarity:** {tfidf_score:.4f}")
    st.markdown("### π Comparison Result")

    # Standalone HTML document wrapping the diff table; the same markup is
    # shown in the page and fed to the PDF converter. Braces that belong to
    # CSS/JS are doubled because this is an f-string.
    # NOTE(review): the template itself contains no <iframe>, so the
    # sync-scroll script appears to be a no-op — confirm whether it is needed.
    html_report = f'''
<html>
<head>
<style>
.diff {{ font-family: Courier; border: 1px solid #ccc; overflow-x: scroll; }}
.diff th, .diff td {{ padding: 5px; }}
iframe {{ width: 100%; height: 600px; border: none; }}
</style>
<script>
window.addEventListener("DOMContentLoaded", () => {{
const iframes = document.querySelectorAll("iframe");
if (iframes.length === 2) {{
const syncScroll = (e) => {{
iframes.forEach((frame) => {{
if (frame !== e.target) {{
frame.contentWindow.scrollTo(0, e.target.scrollTop);
}}
}});
}};
iframes.forEach((iframe) => {{
iframe.contentWindow.onscroll = syncScroll;
}});
}}
}});
</script>
</head>
<body>
{html_comparison}
</body>
</html>
'''
    st.components.v1.html(html_report, height=700, scrolling=True)

    if st.button("Generate PDF Report"):
        pdf_bytes = convert_html_to_pdf(html_report)
        if pdf_bytes:
            # The link is raw HTML, so it must be rendered unescaped.
            st.markdown(create_download_link(pdf_bytes), unsafe_allow_html=True)
        else:
            st.error("β Failed to generate PDF. Check for HTML formatting issues.")