Spaces:
Sleeping
Sleeping
File size: 6,288 Bytes
a9f7f11 bae6c77 a9f7f11 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 |
import os
import gradio as gr
import logging
import json
import hashlib
from pathlib import Path
from fontTools.ttLib import TTFont, TTLibError
from huggingface_hub import HfApi
# Configure the root logger so INFO-level records (and above) are emitted.
logging.basicConfig(level=logging.INFO)
# Shared Hugging Face Hub client.
API = HfApi()
# Hub access token read from the ``TOKEN`` environment variable; ``None`` when it is unset.
TOKEN = os.environ.get("TOKEN")
# Hub repository that files are listed from and uploaded to (addressed with repo_type="dataset").
REPO_ID = "Felix92/docTR-resource-collection"
def get_supported_chars(font_path: Path) -> list[str]:
    """Return the printable characters mapped by a font's ``cmap`` tables.

    Args:
        font_path: Location of the font file to inspect.

    Returns:
        The printable characters, de-duplicated and ordered by code point.
        An empty list is returned when the font cannot be read or parsed;
        the failure is logged instead of being raised.
    """
    try:
        # TTFont keeps the underlying file open for lazy table loading, so it
        # is used as a context manager to guarantee the handle is released.
        with TTFont(font_path) as font:
            code_points = set()
            for table in font["cmap"].tables:
                code_points.update(table.cmap.keys())
        chars = [chr(code_point) for code_point in sorted(code_points)]
        return [char for char in chars if char.isprintable()]
    except TTLibError as e:
        logging.error("Error reading font file %s: %s", font_path, e)
        return []
    except Exception as e:
        # Deliberate best-effort: any other failure (e.g. a missing cmap
        # table) is reported as "no supported characters" to the caller.
        logging.error("Unexpected error reading font file %s: %s", font_path, e)
        return []
def get_sha256(file_path: Path) -> str:
    """Return the hex-encoded SHA-256 digest of the file at *file_path*.

    The file is read in 8192-byte pieces so it never has to be held in
    memory as a whole.
    """
    digest = hashlib.sha256()
    with open(file_path, "rb") as stream:
        # An empty read marks end-of-file and ends the loop.
        while block := stream.read(8192):
            digest.update(block)
    return digest.hexdigest()
def file_exists_on_hub(file_name: str, subfolder: str) -> bool:
    """Report whether the hub repository already holds a matching file.

    A repository file matches when its path starts with
    ``"<subfolder>/<file_name>"``, so *file_name* acts as a prefix rather
    than as an exact name.

    Args:
        file_name: Leading part of the file name to look for.
        subfolder: Repository folder the file is expected in.

    Returns:
        ``True`` if at least one repository path has that prefix.
    """
    wanted_prefix = f"{subfolder}/{file_name}"
    repo_files = API.list_repo_files(repo_id=REPO_ID, repo_type="dataset", token=TOKEN)
    for repo_file in repo_files:
        if repo_file.startswith(wanted_prefix):
            return True
    return False
def _upload_hub(file_path: str, subfolder: str, sha_hash: str) -> None:
    """Upload a local file to the hub repository under a hash-prefixed name.

    The file is stored as ``"<subfolder>/<sha_hash>_<basename>"`` and the
    resulting repository path is logged at INFO level.

    Args:
        file_path: Local path of the file to upload.
        subfolder: Target folder inside the repository.
        sha_hash: Hash string placed in front of the file's base name.
    """
    base_name = Path(file_path).name
    repo_path = f"{subfolder}/{sha_hash}_{base_name}"
    API.upload_file(
        repo_id=REPO_ID,
        repo_type="dataset",
        path_in_repo=repo_path,
        path_or_fileobj=file_path,
        token=TOKEN,
    )
    logging.info(f"Uploaded {repo_path}")
def _centered_notice(text: str):
    """Wrap *text* in the centered ``<h3>`` markup used for status messages."""
    return gr.Markdown(f"<div style='text-align: center;'><h3>{text}</h3></div>")


def _upload_result(text: str):
    """Build the four submit outputs: hide the agree button, show *text*, clear both file inputs."""
    return gr.update(visible=False), _centered_notice(text), gr.update(value=None), gr.update(value=None)


def handle_uploads(font_upload, wordlist_upload, agree):
    """Validate the submitted files and upload them to the hub repository.

    A font is uploaded together with a JSON sidecar that lists its name and
    supported characters; a wordlist is uploaded as-is. Files whose SHA-256
    already appears in the repository are rejected as duplicates.

    Args:
        font_upload: Path of the uploaded font file, or a falsy value if none.
        wordlist_upload: Path of the uploaded wordlist, or a falsy value if none.
        agree: Whether the user accepted the terms and conditions.

    Returns:
        A 4-tuple of component updates, in the order
        ``(agree_button, status message, font input, wordlist input)``.
        Failures are reported through the status message, never raised.
    """
    if not agree:
        # The message must sit in the second slot (the status Markdown); the
        # agree button is left as it is so the user can still accept.
        return (
            gr.update(),
            gr.Markdown("You must agree to the terms and conditions before proceeding."),
            gr.update(value=None),
            gr.update(value=None),
        )

    if not font_upload and not wordlist_upload:
        # Nothing was selected: do not claim a successful upload.
        return _upload_result("Please select a font or a wordlist to upload.")

    try:
        if font_upload:
            font_path = Path(font_upload)
            font_sha = get_sha256(font_path)
            if file_exists_on_hub(font_sha, "fonts"):
                return _upload_result("This font was already uploaded.")

            supported_chars = get_supported_chars(font_path)
            if not supported_chars:
                return _upload_result("No supported characters found in the font file.")

            metadata = {
                "font_name": font_path.stem,
                "supported_characters": supported_chars,
            }
            # The metadata sidecar is written next to the uploaded font file.
            json_path = font_path.with_suffix(".json")
            with open(json_path, "w", encoding="utf-8") as f:
                json.dump(metadata, f, ensure_ascii=False, indent=2)
            json_sha = get_sha256(json_path)

            _upload_hub(str(font_path), "fonts", font_sha)
            _upload_hub(str(json_path), "fonts", json_sha)

        if wordlist_upload:
            wordlist_path = Path(wordlist_upload)
            wordlist_sha = get_sha256(wordlist_path)
            if file_exists_on_hub(wordlist_sha, "wordlists"):
                return _upload_result("This wordlist was already uploaded.")
            _upload_hub(str(wordlist_path), "wordlists", wordlist_sha)

        return _upload_result("Upload was successful! You can upload another item.")
    except Exception as e:
        # Top-level UI boundary: log the traceback and surface the error text.
        logging.exception("Upload failed")
        return _upload_result(f"An error occurred: {e}")
# Build the Gradio UI: an agreement screen first, then the upload form.
with gr.Blocks(fill_height=True) as demo:
    # Terms and conditions text; hidden by toggle_agreement_visibility once accepted.
    agreement_markdown = gr.Markdown(
        """
<div style="text-align: center;">
<h1>File Upload Agreement</h1>
<h3>This is a Hugging Face space for the docTR/OnnxTR community to collect wordlists and fonts for the following project/s:</h3>
<h3><a href="https://github.com/mindee/doctr">docTR</a></h3>
<h3><a href="https://github.com/felixdittrich92/OnnxTR">OnnxTR</a></h3>
</div>
<h3>The uploaded wordlists and fonts will be used to generate synthetic data.</h3>
<h3>All uploaded files can be found here: <a href="https://huggingface.co/datasets/Felix92/docTR-resource-collection">Hugging Face dataset</a></h3>
<br>
<br>
<h3>By uploading a wordlist or font, you explicitly agree to the following terms:</h3>
<h3>1. You affirm that you are the owner or have the necessary rights to upload and share the wordlist or font.</h3>
<h3>2. You agree that the uploaded wordlists / fonts will be made publicly available to everyone.</h3>
<h3>3. You agree that the uploaded wordlists / fonts can be used for any purpose, including commercial use, by any third party.</h3>
"""
    )
    agree_button = gr.Button("I Agree to the Terms and Conditions")
    # Holds whether the terms were accepted; starts as False and is passed to handle_uploads.
    agree_state = gr.State(value=False)
    # Upload form, created hidden; made visible by toggle_agreement_visibility.
    with gr.Column(visible=False) as upload_section:
        # Status area that receives the second output of handle_uploads.
        success_message = gr.Markdown(visible=True)
        font_upload = gr.File(label="Upload Font File [TTF | OTF]", file_types=[".ttf", ".otf"], type="filepath")
        wordlist_upload = gr.File(label="Upload Wordlist [TXT]", file_types=[".txt"], type="filepath")
        submit_button = gr.Button("Submit")

    def toggle_agreement_visibility():
        # Outputs, in order: hide the agreement text, hide the agree button,
        # set agree_state to True, show the upload section.
        return gr.update(visible=False), gr.update(visible=False), True, gr.update(visible=True)

    agree_button.click(fn=toggle_agreement_visibility, inputs=None, outputs=[agreement_markdown, agree_button, agree_state, upload_section])
    # handle_uploads must return one value per entry of ``outputs``, in this order.
    submit_button.click(
        fn=handle_uploads,
        inputs=[font_upload, wordlist_upload, agree_state],
        outputs=[agree_button, success_message, font_upload, wordlist_upload],
    )

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()
|