File size: 5,904 Bytes
ba3b7a5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 |
import io
import json
import os
import tempfile

import pandas as pd
import streamlit as st
from pypdf import PdfReader
from pyvis.network import Network

from knowledge_graph_maker import (
    GraphMaker, Ontology, Document,
    OpenAIClient, GroqClient
)
# --- Page chrome -------------------------------------------------------------
st.set_page_config(page_title="Knowledge Graph Maker", layout="wide")
st.title("Knowledge Graph from Text/PDF (Docker Space)")
st.caption("Uses knowledge-graph-maker with OpenAI or Groq. Paste text or upload a PDF; view the interactive graph below.")
# Choose LLM client based on available env vars
# Keys are read once at startup; missing keys produce "" so truthiness tests work.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
GROQ_API_KEY = os.getenv("GROQ_API_KEY", "")
with st.sidebar:
    st.subheader("Model Settings")
    # Sampling parameters passed straight through to the LLM client.
    temperature = st.slider("Temperature", 0.0, 1.0, 0.1, 0.05)
    top_p = st.slider("Top-p", 0.0, 1.0, 0.5, 0.05)
    # Default the radio to whichever provider has a key configured
    # (OpenAI wins when both or neither are set).
    provider = st.radio("Provider", ["OpenAI", "Groq"], index=0 if OPENAI_API_KEY else 1 if GROQ_API_KEY else 0)
    # Only one of oai_model/groq_model exists per rerun; the generate handler
    # below reads the one matching `provider`, so this is safe in Streamlit's
    # rerun-the-whole-script model.
    if provider == "OpenAI":
        oai_model = st.text_input("OpenAI model", value="gpt-3.5-turbo")
    else:
        groq_model = st.text_input("Groq model", value="mixtral-8x7b-32768")
    st.markdown("### Ontology (labels)")
    # Default ontology: entries may be plain label strings or
    # {label: description} dicts (descriptions guide the extractor).
    default_labels = [
        {"Person": "Person name without adjectives (may appear as name or pronoun)"},
        {"Object": "Avoid the definite article 'the' in name"},
        {"Event": "Events involving multiple people; no verbs like gives/leaves"},
        "Place", "Document", "Organisation", "Action",
        {"Miscellanous": "Important concept that fits none of the above"}
    ]
    # Pre-fill the text area with just the label names (descriptions dropped).
    labels_text = st.text_area("Labels (JSON or comma-separated)", value=", ".join(
        [lbl if isinstance(lbl, str) else list(lbl.keys())[0] for lbl in default_labels]
    ), height=80)
    st.markdown("### Relationships focus")
    relationships_text = st.text_input("Relationships (comma-separated)", value="Relation between any pair of Entities")
def parse_labels(text: str) -> list:
    """Parse the labels text-area into a list of ontology labels.

    The UI advertises "JSON or comma-separated" input, but the original
    implementation only ever split on commas; this version honours both:
    a JSON array (strings or {label: description} dicts) is used as-is,
    anything else falls back to comma splitting. An empty/blank input
    returns the built-in default label names.

    Args:
        text: Raw contents of the labels text area.

    Returns:
        A non-empty list of labels (strings, or dicts when supplied as JSON).
    """
    stripped = text.strip()
    if not stripped:
        # Mirrors the names in `default_labels` (including the original
        # "Miscellanous" spelling, kept for consistency with the ontology).
        return ["Person", "Object", "Event", "Place", "Document", "Organisation", "Action", "Miscellanous"]
    # Try JSON first, as promised by the text-area label.
    try:
        parsed = json.loads(stripped)
        if isinstance(parsed, list):
            # Keep dict entries (label + description) intact; drop falsy items.
            return [lbl for lbl in parsed if lbl]
    except ValueError:
        # Not JSON (json.JSONDecodeError subclasses ValueError) — fall through.
        pass
    return [lbl.strip() for lbl in stripped.split(",") if lbl.strip()]
def split_pdf(file) -> str:
    """Return the concatenated text of every PDF page, newline-joined.

    Pages whose extraction raises are skipped entirely (e.g. scanned or
    malformed pages); pages with no extractable text contribute an empty
    string so page boundaries are still preserved in the output.
    """
    pages_text = []
    for page in PdfReader(file).pages:
        try:
            extracted = page.extract_text() or ""
        except Exception:
            continue  # unreadable page — drop it and move on
        pages_text.append(extracted)
    return "\n".join(pages_text)
def build_graph_documents(text: str) -> list[Document]:
    """Split *text* into fixed-size chunks and wrap each in a Document.

    Uses a ~3500-character window as a rough proxy for 900-1000 tokens.
    Whitespace-only chunks are dropped, but chunk ids always reflect the
    chunk's position in the original text (skipped chunks still consume
    an id).
    """
    CHUNK_CHARS = 3500
    documents = []
    for chunk_id, start in enumerate(range(0, len(text), CHUNK_CHARS)):
        piece = text[start:start + CHUNK_CHARS].strip()
        if piece:
            documents.append(Document(text=piece, metadata={"chunk_id": chunk_id}))
    return documents
def edges_to_pyvis(edges):
    """Build an interactive PyVis network from knowledge-graph edges.

    Nodes are keyed by "label:name" so an entity appearing in several edges
    is added exactly once; the node tooltip shows its ontology label and the
    edge tooltip shows the relationship text.
    """
    net = Network(height="700px", width="100%", bgcolor="#ffffff",
                  font_color="#222222", notebook=False, directed=False)
    seen = set()  # node keys already added to the network
    for edge in edges:
        key_1 = f"{edge.node_1.label}:{edge.node_1.name}"
        key_2 = f"{edge.node_2.label}:{edge.node_2.name}"
        for key, node in ((key_1, edge.node_1), (key_2, edge.node_2)):
            if key not in seen:
                net.add_node(key, label=node.name, title=node.label)
                seen.add(key)
        net.add_edge(key_1, key_2, title=edge.relationship or "", value=1)
    net.toggle_physics(True)
    return net
# Input UI
# Two mutually exclusive sources: pasted text or an uploaded PDF.
tab_text, tab_pdf = st.tabs(["📝 Paste Text", "📄 Upload PDF"])
input_text = ""
with tab_text:
    input_text = st.text_area("Paste your text here", height=220, placeholder="Paste text…")
with tab_pdf:
    pdf_file = st.file_uploader("Upload a PDF", type=["pdf"])
    # A PDF upload wins over any pasted text (this assignment runs later).
    if pdf_file:
        input_text = split_pdf(pdf_file)
# Main action: build the graph when the user clicks the button.
if st.button("Generate Knowledge Graph", type="primary"):
    if not input_text.strip():
        st.warning("Please provide text or a PDF.")
        st.stop()
    # Prepare LLM client for the chosen provider; fail fast if its key is missing.
    if provider == "OpenAI":
        if not OPENAI_API_KEY:
            st.error("OPENAI_API_KEY is not set in the Space Secrets.")
            st.stop()
        llm = OpenAIClient(model=oai_model, temperature=temperature, top_p=top_p)
    else:
        if not GROQ_API_KEY:
            st.error("GROQ_API_KEY is not set in the Space Secrets.")
            st.stop()
        llm = GroqClient(model=groq_model, temperature=temperature, top_p=top_p)
    # Ontology: entity labels plus a free-text relationship focus
    # (falls back to the catch-all phrase when the field is blank).
    ontology = Ontology(
        labels=parse_labels(labels_text),
        relationships=[r.strip() for r in relationships_text.split(",") if r.strip()] or ["Relation between any pair of Entities"]
    )
    st.info("Chunking input and building graph… this may take a bit for longer texts.")
    gm = GraphMaker(ontology=ontology, llm_client=llm, verbose=False)
    docs = build_graph_documents(input_text)
    edges = gm.from_documents(docs, delay_s_between=0)  # tune delay for rate limits
    st.success(f"Graph built with {len(edges)} edges.")
    # Show edges table
    df = pd.DataFrame([{
        "node_1_label": e.node_1.label, "node_1": e.node_1.name,
        "node_2_label": e.node_2.label, "node_2": e.node_2.name,
        "relationship": e.relationship
    } for e in edges])
    st.dataframe(df, use_container_width=True)
    # Render with PyVis inside Streamlit: PyVis can only serialize to an HTML
    # file, so round-trip through a temp directory that cleans itself up.
    net = edges_to_pyvis(edges)
    with tempfile.TemporaryDirectory() as td:
        html_path = os.path.join(td, "graph.html")
        net.save_graph(html_path)
        # Fix: use a context manager so the file handle is closed
        # deterministically (the original open(...).read() leaked it).
        with open(html_path, "r", encoding="utf-8") as fh:
            html_content = fh.read()
        st.components.v1.html(html_content, height=750, scrolling=True)
# Footer (rendered on every run, not only after generation).
st.markdown("---")
st.caption("Built with [knowledge-graph-maker](https://github.com/rahulnyk/knowledge_graph_maker).")
|