# ── Streamlit must write to /tmp on Spaces ────────────────────────────────────── import os as _os _os.environ["STREAMLIT_CONFIG_DIR"] = "/tmp" _os.environ["STREAMLIT_CACHE_DIR"] = "/tmp" _os.environ["STREAMLIT_CACHE_STORAGE"] = "filesystem" # ── Imports ──────────────────────────────────────────────────────────────────── import os import io import json import streamlit as st import pandas as pd from collections import Counter from pypdf import PdfReader from pyvis.network import Network from knowledge_graph_maker import ( GraphMaker, Ontology, Document, OpenAIClient ) # ── Page setup ────────────────────────────────────────────────────────────────── st.set_page_config(page_title="Knowledge Graph (OpenRouter)", layout="wide") st.title("Knowledge Graph from Text/PDF — OpenRouter") st.caption("Builds a knowledge graph with knowledge-graph-maker via OpenRouter. Pick a model, choose presets, and render via PyVis or Cytoscape.js.") # ── Secrets / env ─────────────────────────────────────────────────────────────── OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY", "") # Preset OpenRouter models (extend as needed) OPENROUTER_MODELS = [ "openai/gpt-oss-20b:free", "moonshotai/kimi-k2:free", "google/gemini-2.0-flash-exp:free", "google/gemma-3-27b-it:free", ] # ---- Preset defaults in session state ---- if "temperature" not in st.session_state: st.session_state.temperature = 0.1 if "top_p" not in st.session_state: st.session_state.top_p = 0.9 # ── Sidebar controls ─────────────────────────────────────────────────────────── with st.sidebar: st.subheader("Model & Generation Settings") model_choice = st.selectbox("OpenRouter model", OPENROUTER_MODELS, index=0) custom_model = st.text_input("Custom model id (optional)", placeholder="e.g. meta-llama/llama-3.1-8b-instruct") st.markdown("### Preset") PRESETS = { "Extractive (stable)": {"temperature": 0.1, "top_p": 0.9, "desc": "Most deterministic; best for IE"}, "Balanced": {"temperature": 0.2, "top_p": 0.9, "desc": "Slightly more recall"}, "Exploratory": {"temperature": 0.4, "top_p": 0.95, "desc": "More ideas; may add noise"}, } preset_names = list(PRESETS.keys()) preset = st.selectbox("Choose a preset", preset_names, index=0, help=PRESETS[preset_names[0]]["desc"]) if st.button("Apply preset"): st.session_state.temperature = PRESETS[preset]["temperature"] st.session_state.top_p = PRESETS[preset]["top_p"] st.toast(f"Applied: {preset}", icon="✅") temperature = st.slider( "Temperature", 0.0, 1.0, key="temperature", step=0.05, help="Lower = more deterministic; higher = more variety" ) top_p = st.slider( "Top-p", 0.0, 1.0, key="top_p", step=0.05, help="Nucleus sampling threshold; 0.9 is a good default" ) st.markdown("### Ontology (labels)") labels_text = st.text_area( "Comma-separated labels", value="Person, Object, Event, Place, Document, Organisation, Action, Miscellanous", height=70, ) relationships_text = st.text_input( "Relationships (comma-separated)", value="Relation between any pair of Entities", ) st.markdown("### Visualization") renderer = st.radio("Renderer", ["PyVis (interactive)", "Cytoscape.js (beta)"], index=0) label_mode = st.radio("Edge labels", ["Always visible", "Tooltip only"], index=0) show_legend = st.checkbox("Show color legend", value=True) # ── Helpers ──────────────────────────────────────────────────────────────────── def parse_labels(text: str): return [lbl.strip() for lbl in text.split(",") if lbl.strip()] or [ "Person", "Object", "Event", "Place", "Document", "Organisation", "Action", "Miscellanous" ] def pdf_to_text(file) -> str: reader = PdfReader(file) parts = [] for page in reader.pages: try: parts.append(page.extract_text() or "") except Exception: continue return "\n".join(parts) def chunk_text(text: str, chars: int = 3500) -> list[Document]: docs = [] for i in range(0, len(text), chars): chunk = text[i:i+chars].strip() if chunk: docs.append(Document(text=chunk, metadata={"chunk_id": i // chars})) return docs def edges_to_rdf(edges): """Convert knowledge-graph-maker edges to RDF-like triples.""" triples = [] for e in edges: s = (e.node_1.name or "").strip() p = (e.relationship or "").strip() or "related_to" o = (e.node_2.name or "").strip() if s and o: triples.append({"subject": s, "predicate": p, "object": o}) return triples from collections import Counter def count_relation_frequency(triples): """Return (freq_triplet, freq_predicate).""" freq_triplet = Counter((t["subject"], t["predicate"], t["object"]) for t in triples) freq_predicate = Counter(t["predicate"] for t in triples) return freq_triplet, freq_predicate # Color bins for predicate frequency COLOR_BINS = [ (8, "#2F3B52", "freq ≥ 8"), (5, "#4E6E9E", "5–7"), (3, "#7FA6F8", "3–4"), (1, "#BFD3FF", "1–2"), ] def color_for_predicate(p, freq_pred): f = freq_pred[p] if f >= 8: return "#2F3B52" if f >= 5: return "#4E6E9E" if f >= 3: return "#7FA6F8" return "#BFD3FF" def render_color_legend(freq_pred): if not freq_pred: return counts = {"≥8":0, "5–7":0, "3–4":0, "1–2":0} for _, f in freq_pred.items(): if f >= 8: counts["≥8"] += 1 elif f >= 5: counts["5–7"] += 1 elif f >= 3: counts["3–4"] += 1 else: counts["1–2"] += 1 st.markdown("#### Legend (predicate frequency → edge color)") cols = st.columns(4) bins_disp = [ ("#2F3B52", "≥8", counts["≥8"]), ("#4E6E9E", "5–7", counts["5–7"]), ("#7FA6F8", "3–4", counts["3–4"]), ("#BFD3FF", "1–2", counts["1–2"]), ] for (c, label, cnt), col in zip(bins_disp, cols): col.markdown( f"""