# Flask app that streams Bluesky firehose posts detected as Toki Pona to
# browsers via Server-Sent Events.
from flask import Flask, render_template, Response
from sonatoki.ilo import Ilo
from sonatoki.Configs import PrefConfig, CorpusConfig
from atproto import FirehoseSubscribeReposClient, parse_subscribe_repos_message
from atproto import CAR, models
import json
import re
import emoji
import queue
import threading
from werkzeug.serving import run_simple
from threading import Lock
# STL
from typing import List, Type, TypedDict
# PDM
from typing_extensions import NotRequired
# LOCAL
from sonatoki.types import Number
from sonatoki.Filters import (
Or,
And,
Len,
Not,
Filter,
PuName,
Numeric,
Syllabic,
NimiUCSUR,
Alphabetic,
NimiKuLili,
NimiKuSuli,
ProperName,
Punctuation,
LongSyllabic,
Miscellaneous,
LongAlphabetic,
LongProperName,
FalsePosSyllabic,
NimiLinkuByUsage,
NimiLinkuObscure,
NimiLinkuSandbox,
NimiLinkuUncommon,
FalsePosAlphabetic,
)
from sonatoki.Scorers import Scorer, Soften, Voting, PassFail, SoftScaling, SoftPassFail
from sonatoki.Cleaners import Cleaner, ConsecutiveDuplicates
from sonatoki.Tokenizers import Tokenizer, WordTokenizerRe
from sonatoki.Preprocessors import (
RECOMMENDED_PREPROCESSORS,
URLs,
Emoji,
Codeblock,
Reference,
Preprocessor,
AngleBracketObject,
Emails
)
# Sandbox words that collide with common words in other languages.
# Sandbox words are removed from the CorpusConfig if they appear more
# frequently in English than Toki Pona by a factor of at least 3.
# In this case, all of these appear more often in English by a factor of at
# least 10.
# NOTE(review): this is a set literal (despite the "DICT" in its name) and
# nothing in the visible part of this file reads it; confirm it is still needed.
__DICT_PHONOMATCHES = {
    "aka",  # also known as
    "an",  # article
    "api",  # API
    "i",  # 1st person
    "je",  # 1st person pronoun, French
    "kana",  # Japanese script
    "me",  # 1st person singular, English
    "ne",  # "no" in several languages
    "nu",  # "new" in English, "now" in Dutch
    "omen",  # ominous
    "se",  # Spanish particle, English "see"
    "sole",  # singular, of shoe
    "take",  # acquire, perhaps forcefully or without permission
    "ten",  # 10
    "to",  # to, too
    "u",  # no u
    "we",  # 1st person plural, English
    "wi",  # wii and discussions of syllables
    # Unexplored candidates for removal:
    # "papa",  # father
    # "lo",  # "lo" and "loo"
    # "ewe",  # sheep
    # "pa",  # father- eh?
}
# WSGI application object; the route decorators in this file attach to it.
app = Flask(__name__)

# Toki Pona detector. URLs, e-mail addresses and emoji are preprocessed away,
# consecutive duplicates are cleaned, numeric and punctuation tokens are
# ignored, and the remaining tokens are scored by the filters listed below.
ilo = Ilo(
    preprocessors=[URLs, Emails, Emoji],
    cleaners=[ConsecutiveDuplicates],
    ignoring_filters=[Numeric, Punctuation],
    scoring_filters=[
        Len(Or(NimiLinkuByUsage(30), NimiUCSUR), max=15),
        Len(And(Syllabic, Not(FalsePosSyllabic)), min=3, max=24),
        # NOTE: These are allowed to pass name and alphabetic below, because
        # they *could* be wrong
        Len(ProperName, min=2, max=24),
        Len(And(Alphabetic, Not(FalsePosAlphabetic)), min=3, max=24),
    ],
    scorer=SoftScaling,
    passing_score=0.8,
)
class JSONExtra(json.JSONEncoder):
    """JSON encoder that falls back to ``repr()`` for unsupported objects."""

    def default(self, obj):
        """Return a JSON-serializable stand-in for *obj*.

        The base encoder only calls this hook for objects it cannot
        serialize itself, and its own implementation raises ``TypeError``.
        That error is turned into the object's ``repr()`` string so that
        encoding never fails on an unknown type.
        """
        try:
            return super().default(obj)
        except TypeError:
            # Only the "not serializable" error is handled; anything else
            # (including KeyboardInterrupt/SystemExit) must propagate.
            return repr(obj)
def clean_text(text: str) -> str:
    """Return *text* with emoji and http(s) URLs removed and the ends trimmed."""
    without_emoji = emoji.replace_emoji(text, replace='')
    without_urls = re.sub(r'https?://\S+', '', without_emoji)
    # A stricter letters-and-whitespace-only filter is kept here, disabled:
    # without_urls = re.sub(r'[^A-Za-z\s]', '', without_urls)
    return without_urls.strip()
# One queue per connected SSE client; every access is guarded by clients_lock.
clients = []
clients_lock = Lock()


def broadcast_message(msg: dict):
    """Put *msg* on the queue of every currently registered SSE client."""
    with clients_lock:
        for subscriber_queue in clients:
            subscriber_queue.put(msg)
def process_firehose():
    """Subscribe to the Bluesky firehose and broadcast posts scored as Toki Pona.

    Creates a firehose client, registers a per-message handler and then
    calls ``client.start``. Every newly created ``app.bsky.feed.post``
    record whose cleaned text has at least three whitespace-separated words
    and whose scorecard passes the 0.8 threshold is handed to
    ``broadcast_message`` as ``{'text': ..., 'url': ...}``.
    """
    client = FirehoseSubscribeReposClient()

    def on_message_handler(message):
        """Handle one firehose frame, ignoring all but new feed posts."""
        commit = parse_subscribe_repos_message(message)
        if not isinstance(commit, models.ComAtprotoSyncSubscribeRepos.Commit):
            return
        # A commit may carry no block data; there is nothing to decode then,
        # and the CAR parser should not be handed an empty payload.
        if not commit.blocks:
            return

        car = CAR.from_bytes(commit.blocks)
        for op in commit.ops:
            if op.action != "create" or not op.cid:
                continue

            raw = car.blocks.get(op.cid)
            if not raw:
                # The referenced record is not present in this CAR.
                continue
            cooked = models.get_or_create(raw, strict=False)
            if not cooked or cooked.py_type != "app.bsky.feed.post":
                continue

            cleaned_text = clean_text(raw.get("text", ""))
            # Very short posts are skipped before scoring.
            if not cleaned_text or len(cleaned_text.split()) < 3:
                continue

            msg = ilo.preprocess(cleaned_text)
            scorecard = ilo._is_toki_pona(msg)
            if not (scorecard["cleaned"] and scorecard["score"] >= 0.8):
                continue

            # NOTE(review): assumes op.path looks like "<collection>/<rkey>"
            # so that index 1 is the record key; confirm.
            url = f'https://bsky.app/profile/{commit.repo}/post/{op.path.split("/")[1]}'
            # The original (uncleaned) text is what gets shown to clients.
            broadcast_message({'text': raw.get("text", ""), 'url': url})

    client.start(on_message_handler)
def generate_sse():
    """Yield Server-Sent-Event ``data:`` frames for a single client.

    A private queue is registered in ``clients`` for the lifetime of the
    generator and removed again when the generator is closed.
    """
    inbox = queue.Queue()
    with clients_lock:
        clients.append(inbox)
    try:
        while True:
            # Blocks until broadcast_message() delivers the next item.
            payload = json.dumps(inbox.get())
            yield f"data: {payload}\n\n"
    finally:
        with clients_lock:
            clients.remove(inbox)
@app.route('/')
def index():
    """Serve the single-page viewer that renders the ``/stream`` SSE feed.

    The page's script builds each message with ``createElement`` and
    ``textContent`` rather than ``innerHTML``: post text comes from
    arbitrary Bluesky users and must be shown as text, never parsed as
    markup. The newest message is inserted first and the list is capped
    at 50 entries.
    """
    return """<!DOCTYPE html>
<html>
<head>
<title>Toki Pona Live Stream</title>
<style>
body {
font-family: Arial, sans-serif;
max-width: 800px;
margin: 0 auto;
padding: 20px;
background-color: #f5f5f5;
}
.message {
background: white;
padding: 15px;
margin: 10px 0;
border-radius: 5px;
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
}
a {
color: #0066cc;
text-decoration: none;
}
h1 {
text-align: center;
}
</style>
</head>
<body>
<h1>Toki Pona Live Stream</h1>
<div id="messages"></div>
<script>
const evtSource = new EventSource("/stream");
const messages = document.getElementById('messages');
evtSource.onmessage = function(event) {
const data = JSON.parse(event.data);
const messageDiv = document.createElement('div');
messageDiv.className = 'message';
const textPara = document.createElement('p');
textPara.textContent = data.text;
const link = document.createElement('a');
link.href = data.url;
link.target = '_blank';
link.rel = 'noopener noreferrer';
link.textContent = 'View on Bluesky';
messageDiv.appendChild(textPara);
messageDiv.appendChild(link);
messages.insertBefore(messageDiv, messages.firstChild);
if (messages.children.length > 50) {
messages.removeChild(messages.lastChild);
}
};
</script>
</body>
</html>"""
@app.route('/stream')
def stream():
    """Return a ``text/event-stream`` response fed by a new ``generate_sse()`` generator."""
    event_stream = generate_sse()
    return Response(event_stream, mimetype='text/event-stream')
if __name__ == '__main__':
    # Start the firehose processing in a separate daemon thread so it does
    # not keep the process alive once the server exits.
    firehose_thread = threading.Thread(target=process_firehose, daemon=True)
    firehose_thread.start()
    # threaded=True lets several long-lived SSE clients be served at once.
    # The reloader stays off because it re-executes this script in a child
    # process, which would start a second firehose thread. The interactive
    # debugger stays off because the server listens on all interfaces.
    run_simple('0.0.0.0', 7860, app, use_reloader=False, use_debugger=False, threaded=True)