Spaces:

dindizz
/

sandhisplitter

Sleeping

App Files Community

dindizz committed 26 days ago

Commit

f120f79

verified ·

1 Parent(s): ef4e2a3

Upload 2 files

Browse files

Files changed (1) hide show

app.py +19 -51

app.py CHANGED Viewed

@@ -1,82 +1,50 @@
 import gradio as gr
-import csv
-# Load lexicon from CSV
-LEXICON = set()
-with open("lexicon.csv", "r", encoding="utf-8") as f:
-    for row in csv.reader(f):
-        if row:
-            LEXICON.add(row[0].strip())
-# --- Expanded Reverse Sandhi Rules ---
 REVERSE_SANDHI_RULES = [
-    ("ा", ["अ+अ"]),          # ā -> a + a
-    ("े", ["अ+इ", "अ+ई"]),  # e -> a+i or a+ī
-    ("ो", ["अ+उ", "अ+ऊ"]),  # o -> a+u or a+ū
-    ("त्त", ["त्+त", "त्+द"]),
     ("ः", ["ः+"]),          # visarga restoration
-    ("ं", ["म्+", "न्+"]),   # anusvara restoration
 ]
 def generate_candidates(word):
     candidates = []
     for i in range(1, len(word)):
         left, right = word[:i], word[i:]
         # Direct split
         if left in LEXICON and right in LEXICON:
             candidates.append((left, right))
-        # Rule-based reverse sandhi
         for ch, expansions in REVERSE_SANDHI_RULES:
             if left.endswith(ch):
                 for exp in expansions:
-                    l_base = left[:-len(ch)] + exp.split("+")[0]
                     r_base = exp.split("+")[1] + right
                     if l_base in LEXICON and r_base in LEXICON:
                         candidates.append((l_base, r_base))
     candidates = list(set(candidates))
     return candidates or [("No plausible split found", "")]
 def sandhi_splitter(word):
-    word = word.strip()
-    if not word:
-        return "Please enter a word."
-    candidates = generate_candidates(word)
     formatted = [" + ".join(c) for c in candidates]
     return "\n".join(formatted)
-# --- Gradio App ---
 with gr.Blocks() as demo:
-    with gr.Row():
-        with gr.Column(scale=3):
-            gr.Markdown("## Sanskrit Sandhi-Splitter (Extended Prototype)")
-            gr.Markdown(
-                "**Instructions:**\n"
-                "1. Enter a **Sanskrit compound word** in Devanagari (e.g. धर्मक्षेत्रे).\n"
-                "2. Click **Split Sandhi** to see possible splits.\n"
-                "3. Candidate splits are based on a small dictionary and reverse sandhi rules.\n\n"
-                "**Contact:** For issues, mail **[email protected]**"
-            )
-            inp = gr.Textbox(label="Compound Word (e.g. धर्मक्षेत्रे)")
-            btn = gr.Button("Split Sandhi")
-            out = gr.Textbox(label="Candidate Splits", lines=5)
-            btn.click(fn=sandhi_splitter, inputs=inp, outputs=out)
-        with gr.Column(scale=1):
-            gr.Markdown(
-                "### How to Use This Tool\n"
-                "- Input any Sanskrit **compound** word.\n"
-                "- Works best with **Devanagari script**.\n"
-                "- Multiple possible splits may appear.\n\n"
-                "### Notes\n"
-                "- Uses **rule-based splitting** + lexicon check.\n"
-                "- Starter lexicon included (~20 entries); extend with full MW dictionary for accuracy.\n\n"
-                "**Support:** [email protected]"
-            )
 if __name__ == "__main__":
     demo.launch()

 import gradio as gr
+# --- Minimal Sanskrit lexicon (extend with real data) ---
 # Set of known word forms. generate_candidates() keeps a split only when
 # both resulting parts are members of this set.
+LEXICON = {
+    "राम", "वन", "गच्छति", "गुरु", "इन्द्र", "तत्", "अपि",
+    "धर्म", "क्षेत्र", "कुरु", "क्षेत्रे"
+}
+# --- Basic Reverse Sandhi Rules ---
 # Each rule is (surface ending, [expansions]). An expansion is written as
 # "left+right": generate_candidates() splits it on "+", puts the text before
 # the "+" in place of the final character of the left part, and prepends the
 # text after the "+" to the right part.
 REVERSE_SANDHI_RULES = [
+    ("ा", ["अ+अ"]),    # ā → a + a
+    ("े", ["अ+इ", "अ+ई"]),  # e → a+i or a+ī
+    ("ो", ["अ+उ", "अ+ऊ"]),  # o → a+u or a+ū
     ("ः", ["ः+"]),          # visarga restoration
 ]
 def generate_candidates(word):
     """Return candidate two-part sandhi splits of *word*.

     Every cut position inside *word* is tried. A split is kept when both
     parts are in ``LEXICON``, either as they stand (direct split) or after
     undoing one rule from ``REVERSE_SANDHI_RULES`` at the end of the left
     part.

     Args:
         word: The compound to analyse (Devanagari text).

     Returns:
         A list of ``(left, right)`` tuples without duplicates, in the order
         they were discovered. If nothing matches, the list holds the single
         placeholder ``("No plausible split found", "")``.
     """
     # In Devanagari a bare consonant letter already carries the vowel "a";
     # the independent letter is only written at the start of a word.
     inherent_vowel = "अ"

     # A dict serves as an insertion-ordered set, so the result order is the
     # same on every run (a plain set would reorder it arbitrarily).
     found = {}

     for i in range(1, len(word)):
         left, right = word[:i], word[i:]

         # Direct split
         if left in LEXICON and right in LEXICON:
             found[(left, right)] = None

         # Apply reverse sandhi substitutions
         for ch, expansions in REVERSE_SANDHI_RULES:
             if not ch or not left.endswith(ch):
                 continue
             # Strip the whole rule ending, not just one code point, so that
             # multi-character endings work too.
             stem = left[:-len(ch)]
             for exp in expansions:
                 l_tail, _, r_head = exp.partition("+")
                 r_base = r_head + right
                 if r_base not in LEXICON:
                     continue
                 # Literal concatenation, as written in the rule table.
                 l_forms = [stem + l_tail]
                 # When the left part ends in "a", that vowel is unwritten:
                 # the stem by itself is the dictionary spelling
                 # (e.g. रामेन्द्र -> राम + इन्द्र).
                 if l_tail == inherent_vowel and stem:
                     l_forms.append(stem)
                 for l_base in l_forms:
                     if l_base in LEXICON:
                         found[(l_base, r_base)] = None

     return list(found) or [("No plausible split found", "")]
 def sandhi_splitter(word):
     """Format the candidate splits of *word* for display.

     Args:
         word: Raw text from the input box; surrounding whitespace is
             ignored and ``None`` is treated as empty.

     Returns:
         ``"Please enter a word."`` for empty input. Otherwise one line per
         candidate, with the parts joined by ``" + "``.
     """
     word = (word or "").strip()
     if not word:
         return "Please enter a word."
     candidates = generate_candidates(word)
     # Skip empty members so the "no split" placeholder, whose second element
     # is "", is shown without a dangling " + " after it.
     formatted = [" + ".join(part for part in c if part) for c in candidates]
     return "\n".join(formatted)
 # Build the Gradio interface: two Markdown headers, an input textbox, an
 # output textbox and a button, created in that order.
 with gr.Blocks() as demo:
+    gr.Markdown("## Sanskrit Sandhi-Splitter (Prototype)")
+    gr.Markdown("Enter a Sanskrit compound word (Devanagari) to see possible splits.")
+    inp = gr.Textbox(label="Compound Word (e.g. धर्मक्षेत्रे)")
+    out = gr.Textbox(label="Candidate Splits")
+    btn = gr.Button("Split Sandhi")
     # Register sandhi_splitter as the click handler, with `inp` as its input
     # component and `out` as its output component.
+    btn.click(fn=sandhi_splitter, inputs=inp, outputs=out)
 # Launch the app only when this file is executed as a script.
 if __name__ == "__main__":
     demo.launch()