File size: 7,271 Bytes
8c43573 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 |
import os
import time

import streamlit as st
import torch
from pinecone import Pinecone
from transformers import AutoTokenizer, AutoModel
# Page configuration — must be the first Streamlit call executed in the script.
st.set_page_config(
    page_title="Hukuki Döküman Arama (Metin Detay)",
    page_icon="⚖️",
    layout="wide",
    initial_sidebar_state="expanded"
)

# App title and description (Turkish UI: "Legal Document Semantic Search — Full Text").
st.title("⚖️ Hukuki Döküman Semantik Arama Tam Metin")
st.markdown("Bu uygulama, 10.000 hukuki dökümanı içeren bir veritabanında semantik arama yapmanızı sağlar.")
# Initialize Pinecone connection (cached for the lifetime of the server process)
@st.cache_resource
def initialize_pinecone():
    """Create a Pinecone client and return a handle to the "etikos2" index.

    SECURITY FIX: the API key was previously hard-coded in source, leaking a
    live credential into version control. It is now read from the
    PINECONE_API_KEY environment variable, falling back to Streamlit secrets.

    Returns:
        pinecone.Index: handle to the "etikos2" document index.

    Raises:
        RuntimeError: if no API key is configured in either location.
    """
    api_key = os.getenv("PINECONE_API_KEY")
    if not api_key:
        try:
            api_key = st.secrets["PINECONE_API_KEY"]
        except (KeyError, FileNotFoundError):
            api_key = None
    if not api_key:
        raise RuntimeError(
            "PINECONE_API_KEY is not set (environment variable or .streamlit/secrets.toml)."
        )
    pinecone_client = Pinecone(api_key=api_key)
    return pinecone_client.Index("etikos2")
# Load the embedding model and tokenizer (cached across Streamlit reruns)
@st.cache_resource
def load_model():
    """Load the multilingual E5 tokenizer/model pair and place the model on GPU if one exists.

    Returns:
        tuple: (tokenizer, model, device) where device is "cuda" or "cpu".
    """
    checkpoint = "intfloat/multilingual-e5-large"
    # Pick the compute device up front; loading order is independent of this.
    target_device = "cuda" if torch.cuda.is_available() else "cpu"
    tok = AutoTokenizer.from_pretrained(checkpoint)
    encoder = AutoModel.from_pretrained(checkpoint).to(target_device)
    return tok, encoder, target_device
# Encode a free-text query into a normalized embedding vector
def get_query_embedding(query_text, tokenizer, model):
    """Embed *query_text* with an E5-style encoder and return a plain Python list.

    The "query: " prefix required by the intfloat/e5 model family is prepended
    before tokenization. Token embeddings are mean-pooled under the attention
    mask, then L2-normalized for cosine-similarity search.

    Args:
        query_text: raw user query string.
        tokenizer: HuggingFace tokenizer matching the model.
        model: HuggingFace encoder, already moved to its target device.

    Returns:
        list[float]: the unit-length embedding of the query.
    """
    encoded = tokenizer(
        "query: " + query_text,
        padding=True,
        truncation=True,
        return_tensors="pt",
        max_length=1024,
    ).to(model.device)

    with torch.no_grad():
        outputs = model(**encoded)

    # Mask-aware mean pooling over the token dimension; the clamp guards
    # against division by zero for an all-padding (empty) sequence.
    hidden = outputs[0]
    mask = encoded["attention_mask"].unsqueeze(-1).expand(hidden.size()).float()
    pooled = (hidden * mask).sum(1) / mask.sum(1).clamp(min=1e-9)

    # L2-normalize so dot products behave as cosine similarity.
    pooled = torch.nn.functional.normalize(pooled, p=2, dim=1)
    return pooled[0].cpu().numpy().tolist()
# Produce a bounded-length preview of a document body
def get_text_preview(text, max_chars=1000):
    """Return *text* cut to *max_chars* characters, appending "..." when truncated.

    Empty or missing input yields a Turkish "no content available" placeholder.
    """
    if not text:
        return "İçerik mevcut değil."
    return text if len(text) <= max_chars else text[:max_chars] + "..."
# Sidebar configuration: user-tunable search parameters.
st.sidebar.header("Arama Ayarları")
# Number of results to show (min 1, max 30, default 5).
top_k = st.sidebar.slider("Gösterilecek sonuç sayısı:", 1, 30, 5)
# Preview length in characters (min 500, max 3000, default 1000).
preview_length = st.sidebar.slider("Ön izleme uzunluğu (karakter):", 500, 3000, 1000)

# Initialize resources with status indicators; if either the vector DB
# connection or the model load fails, the run is aborted with st.stop().
with st.sidebar:
    st.subheader("Sistem Durumu")
    # Connect to the Pinecone index (cached via @st.cache_resource).
    with st.status("Pinecone bağlantısı kuruluyor...", expanded=True) as status:
        try:
            index = initialize_pinecone()
            status.update(label="Pinecone bağlantısı kuruldu ✅", state="complete", expanded=False)
        except Exception as e:
            status.update(label=f"Pinecone bağlantı hatası ❌: {str(e)}", state="error", expanded=True)
            st.error("Veritabanına bağlanılamadı. Lütfen daha sonra tekrar deneyin.")
            st.stop()
    # Load the embedding model and tokenizer (also cached).
    with st.status("Model yükleniyor...", expanded=True) as status:
        try:
            tokenizer, model, device = load_model()
            status.update(label=f"Model yüklendi ✅ ({device.upper()} kullanılıyor)", state="complete", expanded=False)
        except Exception as e:
            status.update(label=f"Model yükleme hatası ❌: {str(e)}", state="error", expanded=True)
            st.error("Model yüklenemedi. Lütfen daha sonra tekrar deneyin.")
            st.stop()

# Main search interface: free-text query input.
query = st.text_area("Aramak istediğiniz konuyu yazın:", height=100,
                     placeholder="Örnek: Mülkiyet hakkı ile ilgili davalar")

# Search button — triggers the query below when clicked.
search_button = st.button("🔍 Ara", type="primary", use_container_width=True)
# Execute search when the button is clicked and a query is present.
if search_button and query:
    with st.spinner("Arama yapılıyor..."):
        try:
            # Embed the query, then run a vector similarity search; the timer
            # covers both embedding and the Pinecone round-trip.
            start_time = time.time()
            query_embedding = get_query_embedding(query, tokenizer, model)
            # Search Pinecone for the top_k nearest documents with metadata.
            search_results = index.query(
                vector=query_embedding,
                top_k=top_k,
                include_metadata=True
            )
            elapsed_time = time.time() - start_time
            # Display results header with elapsed time.
            st.success(f"Arama tamamlandı! ({elapsed_time:.2f} saniye)")
            if not search_results.matches:
                st.info("Aramanıza uygun sonuç bulunamadı.")
            else:
                st.subheader(f"Arama Sonuçları ({len(search_results.matches)} döküman)")
                # Display each result in a card-style container.
                for i, match in enumerate(search_results.matches):
                    with st.container():
                        col1, col2 = st.columns([4, 1])
                        with col1:
                            # Chamber ("daire") name as the card title.
                            st.markdown(f"### {i+1}. {match.metadata.get('daire', 'Bilinmeyen Daire')}")
                        with col2:
                            # Cosine similarity rendered as a percentage.
                            st.metric(label="Benzerlik", value=f"{match.score*100:.1f}%")
                        st.markdown("**Döküman Bilgileri:**")
                        # Decision/case numbers and date from document metadata.
                        st.markdown(f"""
- **Karar No:** {match.metadata.get('karar_no', 'Belirtilmemiş')}
- **Esas No:** {match.metadata.get('esas_no', 'Belirtilmemiş')}
- **Tarih:** {match.metadata.get('tarih', 'Belirtilmemiş')}
""")
                        # Full text from metadata, falling back to the snippet field.
                        text_content = match.metadata.get('text', match.metadata.get('text_snippet', ''))
                        # Display text content in an expandable section, truncated
                        # to the sidebar-configured preview length.
                        with st.expander("Döküman İçeriği", expanded=True):
                            st.markdown(get_text_preview(text_content, preview_length))
                        # Offer the untruncated text as a download when present.
                        if text_content:
                            st.download_button(
                                label="Tam Metni İndir",
                                data=text_content,
                                file_name=f"karar_{match.metadata.get('karar_no', 'bilinmeyen')}.txt",
                                mime="text/plain"
                            )
                        st.divider()
        except Exception as e:
            # Surface any embedding/query/render failure to the user in-place.
            st.error(f"Arama sırasında bir hata oluştu: {str(e)}")
# Footer: separator plus copyright caption in the sidebar.
st.sidebar.markdown("---")
st.sidebar.caption("© 2023 Hukuki Döküman Arama")