File size: 7,239 Bytes
9e6af01
 
17bf2b1
9e6af01
 
 
 
 
 
 
3a12425
3d164c4
538b809
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c10d241
538b809
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9e6af01
 
 
15357c2
538b809
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9e6af01
 
 
 
 
 
 
 
 
 
 
e1402ca
9e6af01
 
 
3d164c4
 
 
 
 
 
 
 
 
9e6af01
 
 
 
 
 
 
 
 
3d164c4
9e6af01
 
 
 
 
 
e1402ca
 
feab015
9e6af01
8e68dc9
9e6af01
 
 
725f05c
 
 
 
 
9e6af01
 
 
3d164c4
9e6af01
 
 
 
3d164c4
 
 
 
 
 
 
f01df71
3d164c4
 
 
9e6af01
 
 
5187cb9
 
 
 
 
 
3a12425
5187cb9
 
 
 
 
 
 
3a12425
5187cb9
3a12425
 
5187cb9
3a12425
 
5187cb9
 
3a12425
5187cb9
 
 
 
 
3a12425
5187cb9
 
 
 
 
 
 
 
 
 
 
 
3a12425
5187cb9
 
 
 
 
 
 
 
 
 
9e6af01
 
 
 
 
 
 
 
3d164c4
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
from flask import Flask, render_template, Response
from sonatoki.ilo import Ilo
from sonatoki.Configs import PrefConfig, CorpusConfig
from atproto import FirehoseSubscribeReposClient, parse_subscribe_repos_message
from atproto import CAR, models
import json
import re
import emoji
import queue
import threading
from werkzeug.serving import run_simple
from threading import Lock
# STL
from typing import List, Type, TypedDict

# PDM
from typing_extensions import NotRequired

# LOCAL
from sonatoki.types import Number
from sonatoki.Filters import (
    Or,
    And,
    Len,
    Not,
    Filter,
    PuName,
    Numeric,
    Syllabic,
    NimiUCSUR,
    Alphabetic,
    NimiKuLili,
    NimiKuSuli,
    ProperName,
    Punctuation,
    LongSyllabic,
    Miscellaneous,
    LongAlphabetic,
    LongProperName,
    FalsePosSyllabic,
    NimiLinkuByUsage,
    NimiLinkuObscure,
    NimiLinkuSandbox,
    NimiLinkuUncommon,
    FalsePosAlphabetic,
)
from sonatoki.Scorers import Scorer, Soften, Voting, PassFail, SoftScaling, SoftPassFail
from sonatoki.Cleaners import Cleaner, ConsecutiveDuplicates
from sonatoki.Tokenizers import Tokenizer, WordTokenizerRe
from sonatoki.Preprocessors import (
    RECOMMENDED_PREPROCESSORS,
    URLs,
    Emoji,
    Codeblock,
    Reference,
    Preprocessor,
    AngleBracketObject,
    Emails
)

# Words that look like Toki Pona but are far more common in other languages.
# NOTE(review): despite the "DICT" in the name this is a set literal, and nothing
# in the visible part of this file references it — confirm whether it is still
# needed or was meant to be wired into the `ilo` filters below.
__DICT_PHONOMATCHES = {
    # Sandbox words are removed from the CorpusConfig if they appear more frequently in English than Toki Pona by a factor of at least 3.
    # In this case, all of these appear more often in English by a factor of at least 10.
    "aka",  # also known as
    "an",  # article
    "api",  # API
    "i",  # 1st person
    "je",  # 1st person pronoun, french
    "kana",  # japanese script
    "me",  # 1st person singular, english
    "ne",  # "no" in several languages
    "nu",  # "new" in english, "now" in dutch
    "omen",  # ominous
    "se",  # spanish particle, english "see"
    "sole",  # singular, of shoe
    "take",  # acquire, perhaps forcefully or without permission
    "ten",  # 10
    "to",  # to, too
    "u",  # no u
    "we",  # 1st person plural, english
    "wi",  # wii and discussions of syllables
    # unexplored candidates for removal
    # "papa",  # father
    # "lo",  # "lo" and "loo"
    # "ewe",  # sheep
    # "pa",  # father- eh?
}


# WSGI application object that the route decorators below attach to.
app = Flask(__name__)

# Shared sonatoki pipeline used to preprocess and score incoming post text.
ilo = Ilo(
    preprocessors=[URLs, Emails, Emoji],
    cleaners=[ConsecutiveDuplicates],
    ignoring_filters=[Numeric, Punctuation],
    scoring_filters=[
        Len(Or(NimiLinkuByUsage(30), NimiUCSUR), max=15),
        Len(And(Syllabic, Not(FalsePosSyllabic)), min=3, max=24),
        # NOTE: These are allowed to pass name and alphabetic below, because they *could* be wrong
        Len(ProperName, min=2, max=24),
        Len(And(Alphabetic, Not(FalsePosAlphabetic)), min=3, max=24),
    ],
    scorer=SoftScaling,
    passing_score=0.8,
)

class JSONExtra(json.JSONEncoder):
    """JSON encoder that falls back to ``repr`` for otherwise unserializable objects."""

    def default(self, obj):
        """Return a JSON-serializable stand-in for ``obj``.

        The base encoder only calls this hook for objects it cannot serialize
        natively. The base implementation is tried first; when it rejects the
        object with ``TypeError``, the object's ``repr`` string is returned.
        """
        try:
            return super().default(obj)
        except TypeError:
            # Only the "not serializable" signal is handled here, so that
            # KeyboardInterrupt / SystemExit and unrelated errors still propagate.
            return repr(obj)

def clean_text(text: str) -> str:
    """Return ``text`` with emoji and http(s) URLs removed and outer whitespace trimmed."""
    without_emoji = emoji.replace_emoji(text, replace='')
    # A URL is "http://" or "https://" followed by everything up to the next whitespace.
    without_urls = re.sub(r'https?://\S+', '', without_emoji)
    return without_urls.strip()

# One queue per open SSE connection; entries are added and removed by the stream generator.
clients = []
# Guards every read and mutation of `clients`.
clients_lock = Lock()


def broadcast_message(msg: dict):
    """Put ``msg`` on the queue of every currently registered SSE client.

    The registry lock is held for the whole fan-out, so the client list
    cannot change while it is being iterated.
    """
    with clients_lock:
        for subscriber_queue in clients:
            subscriber_queue.put(msg)

def process_firehose():
    """Subscribe to the repo firehose and broadcast posts that score as Toki Pona.

    A ``FirehoseSubscribeReposClient`` is created and started with a handler
    that, for every ``create`` operation yielding an ``app.bsky.feed.post``
    record, cleans the post text, scores it with ``ilo`` and, when it passes,
    broadcasts ``{'text': ..., 'url': ...}`` to the connected SSE clients.
    """
    firehose = FirehoseSubscribeReposClient()

    def handle_message(message):
        commit = parse_subscribe_repos_message(message)
        # Anything that is not a Commit message is ignored.
        if not isinstance(commit, models.ComAtprotoSyncSubscribeRepos.Commit):
            return

        car = CAR.from_bytes(commit.blocks)
        for op in commit.ops:
            # Only operations that create a record and carry a CID are of interest.
            if op.action != "create" or not op.cid:
                continue

            record = car.blocks.get(op.cid)
            parsed = models.get_or_create(record, strict=False)
            if not parsed:
                continue
            if parsed.py_type != "app.bsky.feed.post":
                continue

            post_text = record.get("text", "")
            cleaned_text = clean_text(post_text)

            # Skip posts with fewer than three whitespace-separated words.
            if not cleaned_text or len(cleaned_text.split()) < 3:
                continue

            scorecard = ilo._is_toki_pona(ilo.preprocess(cleaned_text))
            # Require a non-empty cleaned token list and a score of at least 0.8.
            if not (scorecard["cleaned"] and scorecard["score"] >= 0.8):
                continue

            # op.path is split on "/" and its second segment identifies the post.
            post_key = op.path.split("/")[1]
            url = f'https://bsky.app/profile/{commit.repo}/post/{post_key}'
            broadcast_message({'text': post_text, 'url': url})

    firehose.start(handle_message)

def generate_sse(heartbeat_seconds: float = 15.0):
    """Yield Server-Sent Events frames for one connected client.

    A private queue is registered in ``clients`` for the lifetime of the
    generator and removed again when the generator is closed or fails.

    Args:
        heartbeat_seconds: Maximum time to wait for a broadcast before
            emitting a keep-alive comment frame.

    Yields:
        ``"data: <json>\\n\\n"`` for every broadcast message, and
        ``": keep-alive\\n\\n"`` whenever no message arrived within
        ``heartbeat_seconds``.
    """
    # Each client gets its own queue.
    q = queue.Queue()
    with clients_lock:
        clients.append(q)
    try:
        while True:
            try:
                message = q.get(timeout=heartbeat_seconds)
            except queue.Empty:
                # Without periodic output a disconnected client is only noticed
                # when the next real message is written, so its queue would stay
                # registered (and keep filling) indefinitely. Lines starting with
                # ":" are SSE comments and are ignored by EventSource.
                yield ": keep-alive\n\n"
                continue
            yield f"data: {json.dumps(message)}\n\n"
    finally:
        with clients_lock:
            clients.remove(q)

@app.route('/')
def index():
    """Return the single-page HTML client that renders the ``/stream`` feed.

    The page opens an ``EventSource`` on ``/stream``, prepends each received
    post to the list and keeps at most 50 entries. Post text comes from
    arbitrary remote users, so the script builds DOM nodes and assigns
    ``textContent`` instead of interpolating into ``innerHTML``.
    """
    return """<!DOCTYPE html>
<html>
<head>
    <title>Toki Pona Live Stream</title>
    <style>
        body {
            font-family: Arial, sans-serif;
            max-width: 800px;
            margin: 0 auto;
            padding: 20px;
            background-color: #f5f5f5;
        }
        .message {
            background: white;
            padding: 15px;
            margin: 10px 0;
            border-radius: 5px;
            box-shadow: 0 2px 4px rgba(0,0,0,0.1);
        }
        a {
            color: #0066cc;
            text-decoration: none;
        }
        h1 {
            text-align: center;
        }
    </style>
</head>
<body>
    <h1>Toki Pona Live Stream</h1>
    <div id="messages"></div>

    <script>
        const evtSource = new EventSource("/stream");
        const messages = document.getElementById('messages');
        
        evtSource.onmessage = function(event) {
            const data = JSON.parse(event.data);
            const messageDiv = document.createElement('div');
            messageDiv.className = 'message';

            // Post text is untrusted: textContent never parses it as HTML.
            const textPara = document.createElement('p');
            textPara.textContent = data.text;

            const link = document.createElement('a');
            link.href = data.url;
            link.target = '_blank';
            link.rel = 'noopener noreferrer';
            link.textContent = 'View on Bluesky';

            messageDiv.appendChild(textPara);
            messageDiv.appendChild(link);
            messages.insertBefore(messageDiv, messages.firstChild);
            
            if (messages.children.length > 50) {
                messages.removeChild(messages.lastChild);
            }
        };
    </script>
</body>
</html>"""

@app.route('/stream')
def stream():
    """Serve the event feed produced by ``generate_sse`` as ``text/event-stream``."""
    event_frames = generate_sse()
    return Response(event_frames, mimetype='text/event-stream')

if __name__ == '__main__':
    # Start the firehose processing in a separate daemon thread so it does not
    # keep the process alive once the server exits.
    threading.Thread(target=process_firehose, daemon=True).start()
    # The server binds to all interfaces, so the interactive debugger (which can
    # execute arbitrary code from the browser) must stay off. The reloader is
    # off as well: it re-executes this script in a child process, which would
    # start a second firehose thread next to the one already running here.
    # threaded=True lets several long-lived SSE clients be served concurrently.
    run_simple('0.0.0.0', 7860, app, use_reloader=False, use_debugger=False, threaded=True)