|
from flask import Flask, render_template, Response |
|
from sonatoki.ilo import Ilo |
|
from sonatoki.Configs import PrefConfig, CorpusConfig |
|
from atproto import FirehoseSubscribeReposClient, parse_subscribe_repos_message |
|
from atproto import CAR, models |
|
import json |
|
import re |
|
import emoji |
|
import queue |
|
import threading |
|
from werkzeug.serving import run_simple |
|
from threading import Lock |
|
|
|
from typing import List, Type, TypedDict |
|
|
|
|
|
from typing_extensions import NotRequired |
|
|
|
|
|
from sonatoki.types import Number |
|
from sonatoki.Filters import ( |
|
Or, |
|
And, |
|
Len, |
|
Not, |
|
Filter, |
|
PuName, |
|
Numeric, |
|
Syllabic, |
|
NimiUCSUR, |
|
Alphabetic, |
|
NimiKuLili, |
|
NimiKuSuli, |
|
ProperName, |
|
Punctuation, |
|
LongSyllabic, |
|
Miscellaneous, |
|
LongAlphabetic, |
|
LongProperName, |
|
FalsePosSyllabic, |
|
NimiLinkuByUsage, |
|
NimiLinkuObscure, |
|
NimiLinkuSandbox, |
|
NimiLinkuUncommon, |
|
FalsePosAlphabetic, |
|
) |
|
from sonatoki.Scorers import Scorer, Soften, Voting, PassFail, SoftScaling, SoftPassFail |
|
from sonatoki.Cleaners import Cleaner, ConsecutiveDuplicates |
|
from sonatoki.Tokenizers import Tokenizer, WordTokenizerRe |
|
from sonatoki.Preprocessors import ( |
|
RECOMMENDED_PREPROCESSORS, |
|
URLs, |
|
Emoji, |
|
Codeblock, |
|
Reference, |
|
Preprocessor, |
|
AngleBracketObject, |
|
Emails |
|
) |
|
|
|
# Word list named "phonomatches"; nothing in the visible part of this file
# reads it.
# NOTE(review): presumably ordinary non-Toki-Pona words that still look
# phonotactically valid -- confirm before wiring it into a filter.
__DICT_PHONOMATCHES = {
    "aka", "an", "api", "i", "je", "kana",
    "me", "ne", "nu", "omen", "se", "sole",
    "take", "ten", "to", "u", "we", "wi",
}
|
|
|
|
|
# WSGI application object; the route decorators further down register on it.
app = Flask(__name__)

# Toki Pona detector used by the firehose handler, which calls
# ``ilo.preprocess`` and ``ilo._is_toki_pona`` on every candidate post.
ilo = Ilo(
    # Stripped from the text before tokenizing.
    preprocessors=[URLs, Emails, Emoji],
    cleaners=[ConsecutiveDuplicates],
    # Tokens matching these are ignored rather than scored.
    ignoring_filters=[Numeric, Punctuation],
    scoring_filters=[
        Len(Or(NimiLinkuByUsage(30), NimiUCSUR), max=15),
        Len(And(Syllabic, Not(FalsePosSyllabic)), min=3, max=24),
        Len(ProperName, min=2, max=24),
        Len(And(Alphabetic, Not(FalsePosAlphabetic)), min=3, max=24),
    ],
    scorer=SoftScaling,
    # The firehose handler compares the score against this same 0.8 value.
    passing_score=0.8,
)
|
|
|
class JSONExtra(json.JSONEncoder):
    """JSON encoder that falls back to ``repr()`` for unsupported objects."""

    def default(self, obj):
        """Return a JSON-serializable stand-in for *obj*.

        The base encoder raises ``TypeError`` for any object it cannot
        serialize; in that case the object's ``repr()`` string is emitted
        instead of failing the whole dump.

        Only ``TypeError`` is caught, so unrelated exceptions such as
        ``KeyboardInterrupt`` are no longer swallowed.
        """
        try:
            return super().default(obj)
        except TypeError:
            return repr(obj)
|
|
|
def clean_text(text: str) -> str:
    """Return *text* without emoji and http(s) URLs, trimmed of outer whitespace."""
    without_emoji = emoji.replace_emoji(text, replace='')
    without_urls = re.sub(r'https?://\S+', '', without_emoji)
    return without_urls.strip()
|
|
|
# One queue per connected SSE client. The list is touched both by the
# firehose thread (broadcasting) and by request handlers (registering and
# unregistering), so every access goes through ``clients_lock``.
clients = []
clients_lock = Lock()


def broadcast_message(msg: dict):
    """Put *msg* on the queue of every currently registered client."""
    with clients_lock:
        for client_queue in clients:
            client_queue.put(msg)
|
|
|
def process_firehose():
    """Subscribe to the repo firehose and broadcast posts detected as Toki Pona.

    Every "create" operation whose record is an ``app.bsky.feed.post`` is
    cleaned with ``clean_text``, scored with ``ilo`` and, if it passes,
    handed to ``broadcast_message`` as ``{'text': ..., 'url': ...}``.

    NOTE(review): ``client.start`` presumably blocks for the lifetime of the
    subscription; the entry point runs this function in a daemon thread.
    """
    client = FirehoseSubscribeReposClient()

    def on_message_handler(message):
        commit = parse_subscribe_repos_message(message)
        if not isinstance(commit, models.ComAtprotoSyncSubscribeRepos.Commit):
            return

        car = CAR.from_bytes(commit.blocks)
        for op in commit.ops:
            # Only newly created records that carry a CID are of interest.
            if not (op.action == "create" and op.cid):
                continue

            raw = car.blocks.get(op.cid)
            cooked = models.get_or_create(raw, strict=False)
            if not cooked:
                continue
            if cooked.py_type != "app.bsky.feed.post":
                continue

            original_text = raw.get("text", "")
            cleaned_text = clean_text(original_text)

            # Skip posts with fewer than three whitespace-separated words.
            if not cleaned_text or len(cleaned_text.split()) < 3:
                continue

            # ``_is_toki_pona`` is underscore-prefixed in sonatoki; it is
            # used here because the scorecard (not just a bool) is needed.
            scorecard = ilo._is_toki_pona(ilo.preprocess(cleaned_text))
            # Require surviving tokens AND a score at the same 0.8 threshold
            # that ``ilo`` is configured with.
            is_match = scorecard["cleaned"] and scorecard["score"] >= 0.8
            if not is_match:
                continue

            # op.path is split on "/" and the second segment is used as the
            # post's identifier in the public URL.
            post_key = op.path.split("/")[1]
            url = f'https://bsky.app/profile/{commit.repo}/post/{post_key}'
            broadcast_message({'text': original_text, 'url': url})

    client.start(on_message_handler)
|
|
|
def generate_sse(keepalive_seconds: float = 15.0):
    """Yield Server-Sent Events frames for one connected client.

    A private queue is registered in ``clients`` so ``broadcast_message``
    can reach this client, and is unregistered again when the generator is
    closed.

    Args:
        keepalive_seconds: Maximum time to wait for a message before sending
            an SSE comment line instead.

    Yields:
        ``"data: <json>\\n\\n"`` for every broadcast message, and
        ``": keep-alive\\n\\n"`` whenever no message arrived in time.

    Why the keep-alive: matching posts can be rare, and a plain blocking
    ``q.get()`` never yields while idle. The generator then cannot notice
    that it should stop, so the queue stays registered and keeps receiving
    broadcasts. Yielding periodically gives the server a chance to close the
    generator (NOTE(review): this relies on the WSGI server closing the
    iterator once a write to a disconnected client fails). Comment lines
    start with ":" and are ignored by ``EventSource``.
    """
    q = queue.Queue()
    with clients_lock:
        clients.append(q)
    try:
        while True:
            try:
                message = q.get(timeout=keepalive_seconds)
            except queue.Empty:
                yield ": keep-alive\n\n"
                continue
            yield f"data: {json.dumps(message)}\n\n"
    finally:
        # Runs when the generator is closed or garbage-collected.
        with clients_lock:
            clients.remove(q)
|
|
|
@app.route('/')
def index():
    """Serve the single-page viewer for the live stream.

    The page opens an ``EventSource`` on ``/stream`` and prepends one card
    per received message, keeping at most 50 cards.

    Post text comes from arbitrary Bluesky users, so the script builds each
    card with DOM APIs and ``textContent`` instead of interpolating into
    ``innerHTML``; otherwise markup inside a post would be executed in the
    viewer's browser. The link opened in a new tab also carries
    ``rel="noopener noreferrer"``.
    """
    return """<!DOCTYPE html>
<html>
<head>
    <title>Toki Pona Live Stream</title>
    <style>
        body {
            font-family: Arial, sans-serif;
            max-width: 800px;
            margin: 0 auto;
            padding: 20px;
            background-color: #f5f5f5;
        }
        .message {
            background: white;
            padding: 15px;
            margin: 10px 0;
            border-radius: 5px;
            box-shadow: 0 2px 4px rgba(0,0,0,0.1);
        }
        a {
            color: #0066cc;
            text-decoration: none;
        }
        h1 {
            text-align: center;
        }
    </style>
</head>
<body>
    <h1>Toki Pona Live Stream</h1>
    <div id="messages"></div>

    <script>
        const evtSource = new EventSource("/stream");
        const messages = document.getElementById('messages');

        evtSource.onmessage = function(event) {
            const data = JSON.parse(event.data);
            const messageDiv = document.createElement('div');
            messageDiv.className = 'message';

            // textContent: post text is untrusted and must never be parsed as HTML.
            const textP = document.createElement('p');
            textP.textContent = data.text;
            messageDiv.appendChild(textP);

            const link = document.createElement('a');
            link.href = data.url;
            link.target = '_blank';
            link.rel = 'noopener noreferrer';
            link.textContent = 'View on Bluesky';
            messageDiv.appendChild(link);

            messages.insertBefore(messageDiv, messages.firstChild);

            if (messages.children.length > 50) {
                messages.removeChild(messages.lastChild);
            }
        };
    </script>
</body>
</html>"""
|
|
|
@app.route('/stream')
def stream():
    """Serve the live feed as a ``text/event-stream`` response."""
    event_stream = generate_sse()
    return Response(event_stream, mimetype='text/event-stream')
|
|
|
if __name__ == '__main__':
    import os

    from werkzeug.serving import is_running_from_reloader

    # The Werkzeug debugger lets a browser execute code on the server, and
    # this app binds to all interfaces. Debugger and reloader are therefore
    # opt-in (FLASK_DEBUG=1) instead of always on.
    debug = os.environ.get('FLASK_DEBUG', '').strip().lower() in ('1', 'true', 'yes', 'on')

    # With the reloader active this script runs twice: once in a supervising
    # parent and once in the child that actually serves requests. Only the
    # serving process should open a firehose subscription.
    if not debug or is_running_from_reloader():
        threading.Thread(target=process_firehose, daemon=True).start()

    run_simple('0.0.0.0', 7860, app, use_reloader=debug, use_debugger=debug, threaded=True)
|
|