Spaces:

ludigija
/

crosscheck

Running

App Files Files Community

crosscheck / app.py

ludigija

Update app.py

1cb59c0 verified 4 months ago

raw

history blame

3.61 kB

	import streamlit as st
	import difflib
	from sentence_transformers import SentenceTransformer
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.metrics.pairwise import cosine_similarity
	from xhtml2pdf import pisa
	import base64
	import os
	from io import BytesIO

	# Load the SBERT model once per server process.
	#
	# Streamlit re-executes this script from the top on every widget interaction,
	# so a bare module-level constructor call rebuilds the model on each rerun.
	# st.cache_resource keeps one shared instance across reruns and sessions.
	# show_spinner=False keeps the loader from emitting any UI element ahead of
	# the st.set_page_config() call further down the script.
	@st.cache_resource(show_spinner=False)
	def _load_sbert_model():
	    """Return the shared 'all-MiniLM-L6-v2' SentenceTransformer instance."""
	    return SentenceTransformer('all-MiniLM-L6-v2')


	sbert_model = _load_sbert_model()

	def compute_sbert_similarity(text1, text2):
	    """Return the cosine similarity between the SBERT embeddings of two texts.

	    Each text is encoded on its own with the module-level ``sbert_model``;
	    the result is the single entry of the resulting 1x1 similarity matrix.
	    """
	    first_vec, second_vec = (
	        sbert_model.encode([document])[0] for document in (text1, text2)
	    )
	    similarity_matrix = cosine_similarity([first_vec], [second_vec])
	    return similarity_matrix[0][0]

	def compute_tfidf_similarity(text1, text2):
	    """Return the cosine similarity between the TF-IDF vectors of two texts.

	    The vectorizer is fitted on exactly these two texts, so the vocabulary
	    and IDF weights come from this pair alone.

	    Returns 0.0 when no vocabulary can be built (for example when both texts
	    are empty) instead of letting the vectorizer's ValueError propagate.
	    """
	    try:
	        # fit_transform is the one-step equivalent of fit() followed by
	        # transform() on the same documents.
	        vectors = TfidfVectorizer().fit_transform([text1, text2])
	    except ValueError:
	        # Empty vocabulary: there is nothing to compare, so report no
	        # overlap, which is also what cosine similarity yields for a zero
	        # vector.
	        return 0.0
	    return cosine_similarity(vectors[0:1], vectors[1:2])[0][0]

	def html_diff(a, b, *, fromdesc='Original', todesc='Modified',
	              context=True, numlines=2):
	    """Return an HTML table showing a side-by-side, line-based diff of two texts.

	    Args:
	        a: Text shown in the left-hand column.
	        b: Text shown in the right-hand column.
	        fromdesc: Column heading for ``a``.
	        todesc: Column heading for ``b``.
	        context: When true, show only changed lines plus surrounding context;
	            when false, show every line.
	        numlines: Number of context lines around each change when ``context``
	            is true.

	    Returns:
	        The ``<table>`` markup produced by ``difflib.HtmlDiff.make_table``.
	        It is a fragment: no surrounding page and no stylesheet are included.
	    """
	    differ = difflib.HtmlDiff()
	    return differ.make_table(
	        a.splitlines(),
	        b.splitlines(),
	        fromdesc=fromdesc,
	        todesc=todesc,
	        context=context,
	        numlines=numlines,
	    )

	def convert_html_to_pdf(source_html):
	    """Render an HTML string to PDF with xhtml2pdf.

	    Returns the PDF content as bytes, or None when the conversion status
	    reports an error.
	    """
	    buffer = BytesIO()
	    outcome = pisa.CreatePDF(source_html, dest=buffer)
	    return None if outcome.err else buffer.getvalue()

	def create_download_link(pdf_data, filename="report.pdf"):
	    """Return an HTML anchor that downloads ``pdf_data`` as a PDF file.

	    Args:
	        pdf_data: Raw PDF bytes; embedded in the link as a base64 ``data:`` URI.
	        filename: Name suggested to the browser for the download. It is
	            HTML-escaped so quotes or angle brackets in it cannot break out
	            of the ``download`` attribute.

	    Returns:
	        The ``<a>`` element as a string.
	    """
	    from html import escape  # local import keeps this fix self-contained

	    b64 = base64.b64encode(pdf_data).decode()
	    safe_name = escape(filename, quote=True)
	    return (
	        f'<a href="data:application/pdf;base64,{b64}" '
	        f'download="{safe_name}">Download PDF Report</a>'
	    )

	# ---------------------------------------------------------------------------
	# Page: upload two text documents, show similarity scores and an HTML diff,
	# and optionally offer the diff as a downloadable PDF.
	# ---------------------------------------------------------------------------
	st.set_page_config(layout="wide")
	st.title("Advanced Document Comparison Tool with Semantic and Syntactic Analysis")


	def _read_upload_as_text(uploaded_file):
	    """Return the upload's full content as text, or None if it is not valid UTF-8."""
	    try:
	        # getvalue() returns the whole buffer regardless of the current read
	        # position, so a rerun cannot see an already-consumed (empty) stream.
	        # NOTE(review): relies on the uploaded file object being BytesIO-like.
	        # "utf-8-sig" decodes plain UTF-8 and also drops a leading BOM, which
	        # would otherwise show up as a spurious difference on the first line.
	        return uploaded_file.getvalue().decode("utf-8-sig")
	    except UnicodeDecodeError:
	        return None


	col1, col2 = st.columns(2)

	with col1:
	    uploaded_file1 = st.file_uploader("Upload Original Document", type=["txt", "md"])
	with col2:
	    uploaded_file2 = st.file_uploader("Upload Modified Document", type=["txt", "md"])

	if uploaded_file1 and uploaded_file2:
	    original_text = _read_upload_as_text(uploaded_file1)
	    modified_text = _read_upload_as_text(uploaded_file2)

	    if original_text is None or modified_text is None:
	        # Report undecodable input instead of crashing with a traceback.
	        st.error("❌ Could not read the uploaded files. Please upload UTF-8 encoded text.")
	        st.stop()

	    sbert_score = compute_sbert_similarity(original_text, modified_text)
	    tfidf_score = compute_tfidf_similarity(original_text, modified_text)

	    html_comparison = html_diff(original_text, modified_text)

	    st.markdown("### 🔍 Similarity Scores")
	    st.markdown(f"SBERT Semantic Similarity: {sbert_score:.4f}")
	    st.markdown(f"TF-IDF Syntactic Similarity: {tfidf_score:.4f}")

	    st.markdown("### 📑 Comparison Result")

	    # The diff table is only a fragment, so the colour rules for its
	    # diff_header / diff_next / diff_add / diff_chg / diff_sub classes must be
	    # supplied here; without them changed lines are not highlighted.
	    # NOTE(review): the script below only acts when the document contains
	    # exactly two iframes; this report body holds just the diff table, so it
	    # appears to be inert. Kept unchanged.
	    html_report = f'''
	    <html>
	    <head>
	    <style>
	    .diff {{ font-family: Courier; border: 1px solid #ccc; overflow-x: scroll; }}
	    .diff th, .diff td {{ padding: 5px; }}
	    .diff_header {{ background-color: #e0e0e0; }}
	    td.diff_header {{ text-align: right; }}
	    .diff_next {{ background-color: #c0c0c0; }}
	    .diff_add {{ background-color: #aaffaa; }}
	    .diff_chg {{ background-color: #ffff77; }}
	    .diff_sub {{ background-color: #ffaaaa; }}
	    iframe {{ width: 100%; height: 600px; border: none; }}
	    </style>
	    <script>
	    window.addEventListener("DOMContentLoaded", () => {{
	        const iframes = document.querySelectorAll("iframe");
	        if (iframes.length === 2) {{
	            const syncScroll = (e) => {{
	                iframes.forEach((frame) => {{
	                    if (frame !== e.target) {{
	                        frame.contentWindow.scrollTo(0, e.target.scrollTop);
	                    }}
	                }});
	            }};
	            iframes.forEach((iframe) => {{
	                iframe.contentWindow.onscroll = syncScroll;
	            }});
	        }}
	    }});
	    </script>
	    </head>
	    <body>
	    {html_comparison}
	    </body>
	    </html>
	    '''

	    st.components.v1.html(html_report, height=700, scrolling=True)

	    if st.button("Generate PDF Report"):
	        pdf_bytes = convert_html_to_pdf(html_report)
	        if pdf_bytes:
	            st.markdown(create_download_link(pdf_bytes), unsafe_allow_html=True)
	        else:
	            st.error("❌ Failed to generate PDF. Check for HTML formatting issues.")