File size: 6,288 Bytes
a9f7f11
 
 
 
 
 
 
 
 
 
 
 
bae6c77
a9f7f11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
import os
import gradio as gr
import logging
import json
import hashlib
from pathlib import Path
from fontTools.ttLib import TTFont, TTLibError
from huggingface_hub import HfApi

# Log at INFO level so upload confirmations and font-parsing errors are visible.
logging.basicConfig(level=logging.INFO)

# Shared Hugging Face Hub client used by the helpers below.
API = HfApi()
# Access token read from the "TOKEN" environment variable (None when unset).
TOKEN = os.environ.get("TOKEN")
# Dataset repository on the Hub that the helpers below list and upload to.
REPO_ID = "Felix92/docTR-resource-collection"

def get_supported_chars(font_path: Path) -> list[str]:
    """Return the printable characters mapped by a font's ``cmap`` tables.

    Every subtable of the font's ``cmap`` table is inspected and the union of
    their code points is converted to characters, sorted by code point.
    Characters for which ``str.isprintable()`` is false are dropped.

    Args:
        font_path: Path to the font file to inspect.

    Returns:
        The sorted list of printable characters, or an empty list when the
        font cannot be read or parsed (the error is logged, not raised).
    """
    try:
        # Open the font as a context manager so its underlying file is closed
        # deterministically, even when parsing one of the tables fails.
        with TTFont(font_path) as font:
            supported_chars = set()
            for table in font["cmap"].tables:
                supported_chars.update(table.cmap.keys())
        chars = [chr(code_point) for code_point in sorted(supported_chars)]
        return [char for char in chars if char.isprintable()]
    except TTLibError as e:
        logging.error(f"Error reading font file {font_path}: {e}")
        return []
    except Exception as e:
        # Best-effort boundary: any other failure (missing table, invalid
        # code point, I/O error) is reported as "no characters found".
        logging.error(f"Unexpected error reading font file {font_path}: {e}")
        return []

def get_sha256(file_path: Path) -> str:
    """Compute the SHA-256 hex digest of the file at ``file_path``.

    The file is read in binary mode in 8192-byte blocks, so large files are
    hashed without being loaded into memory at once.
    """
    digest = hashlib.sha256()
    with open(file_path, "rb") as stream:
        while block := stream.read(8192):
            digest.update(block)
    return digest.hexdigest()

def file_exists_on_hub(file_name: str, subfolder: str) -> bool:
    """Tell whether the dataset repo already holds a matching file.

    A repo file matches when its path starts with ``<subfolder>/<file_name>``,
    so ``file_name`` may be just the leading part of the stored name.
    """
    wanted_prefix = f"{subfolder}/{file_name}"
    repo_files = API.list_repo_files(
        repo_id=REPO_ID,
        repo_type="dataset",
        token=TOKEN,
    )
    for repo_file in repo_files:
        if repo_file.startswith(wanted_prefix):
            return True
    return False

def _upload_hub(file_path: str, subfolder: str, sha_hash: str) -> None:
    """Upload a local file to the dataset repo as ``<subfolder>/<sha_hash>_<name>``.

    ``<name>`` is the final path component of ``file_path``. The destination
    path is logged once the upload call returns.
    """
    source_name = Path(file_path).name
    destination = f"{subfolder}/{sha_hash}_{source_name}"
    API.upload_file(
        repo_id=REPO_ID,
        repo_type="dataset",
        path_in_repo=destination,
        path_or_fileobj=file_path,
        token=TOKEN,
    )
    logging.info(f"Uploaded {destination}")

def _upload_outputs(message: str):
    """Build the output tuple returned after a submit attempt.

    The order matches the ``outputs`` wired to the submit button: the agree
    button (kept hidden), the status message, then the font and wordlist
    inputs (both cleared). ``message`` is inserted into the HTML as-is.
    """
    return (
        gr.update(visible=False),
        gr.Markdown(f"<div style='text-align: center;'><h3>{message}</h3></div>"),
        gr.update(value=None),
        gr.update(value=None),
    )


def _store_font(font_path: Path) -> "str | None":
    """Upload a font and its character-coverage JSON to the ``fonts`` folder.

    Returns ``None`` on success, otherwise the user-facing reason the font
    was skipped (already on the hub, or no supported characters found).
    """
    font_sha = get_sha256(font_path)
    if file_exists_on_hub(font_sha, "fonts"):
        return "This font was already uploaded."

    supported_chars = get_supported_chars(font_path)
    if not supported_chars:
        return "No supported characters found in the font file."

    metadata = {
        "font_name": font_path.stem,
        "supported_characters": supported_chars,
    }
    # The metadata file sits next to the font and shares its stem, so the
    # uploaded names differ only by hash prefix and suffix.
    json_path = font_path.with_suffix(".json")
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(metadata, f, ensure_ascii=False, indent=2)

    json_sha = get_sha256(json_path)

    _upload_hub(str(font_path), "fonts", font_sha)
    _upload_hub(str(json_path), "fonts", json_sha)
    return None


def _store_wordlist(wordlist_path: Path) -> "str | None":
    """Upload a wordlist to the ``wordlists`` folder.

    Returns ``None`` on success, otherwise the user-facing reason the
    wordlist was skipped (already on the hub).
    """
    wordlist_sha = get_sha256(wordlist_path)
    if file_exists_on_hub(wordlist_sha, "wordlists"):
        return "This wordlist was already uploaded."

    _upload_hub(str(wordlist_path), "wordlists", wordlist_sha)
    return None


def handle_uploads(font_upload, wordlist_upload, agree):
    """Process a submit click: upload the given font and/or wordlist.

    Args:
        font_upload: File path of the uploaded font, or a falsy value when
            no font was provided.
        wordlist_upload: File path of the uploaded wordlist, or a falsy value
            when no wordlist was provided.
        agree: Whether the user accepted the terms and conditions.

    Returns:
        A 4-tuple of updates for ``agree_button``, ``success_message``,
        ``font_upload`` and ``wordlist_upload``, in that order.

    Both files are always processed: a skipped font (duplicate or without
    supported characters) no longer causes a wordlist submitted alongside it
    to be silently discarded. Any exception is logged and reported to the
    user instead of being raised.
    """
    import html

    if not agree:
        # The message belongs in the second slot (success_message); the first
        # slot is the agree button, which is left unchanged.
        return (
            gr.update(),
            gr.Markdown("You must agree to the terms and conditions before proceeding."),
            gr.update(value=None),
            gr.update(value=None),
        )

    if not font_upload and not wordlist_upload:
        # Nothing was selected, so do not report a successful upload.
        return _upload_outputs("Please select a font or a wordlist to upload.")

    problems = []
    uploaded_count = 0
    try:
        if font_upload:
            problem = _store_font(Path(font_upload))
            if problem is None:
                uploaded_count += 1
            else:
                problems.append(problem)

        if wordlist_upload:
            problem = _store_wordlist(Path(wordlist_upload))
            if problem is None:
                uploaded_count += 1
            else:
                problems.append(problem)
    except Exception as e:
        logging.exception("Upload failed")
        # Exception text may contain user-controlled data (e.g. file names)
        # and is rendered as HTML, so escape it.
        return _upload_outputs(f"An error occurred: {html.escape(str(e))}")

    if not problems:
        return _upload_outputs("Upload was successful! You can upload another item.")

    if uploaded_count:
        # One file was skipped but the other one went through.
        problems.append("The other file was uploaded successfully.")
    return _upload_outputs(" ".join(problems))

# Build the UI: an agreement screen first, then the (initially hidden) upload form.
with gr.Blocks(fill_height=True) as demo:
    # Terms text shown until the user clicks the agree button.
    agreement_markdown = gr.Markdown(
        """
        <div style="text-align: center;">
        <h1>File Upload Agreement</h1>

        <h3>This is a Hugging Face space for the docTR/OnnxTR community to collect wordlists and fonts for the following project/s:</h3>

        <h3><a href="https://github.com/mindee/doctr">docTR</a></h3>

        <h3><a href="https://github.com/felixdittrich92/OnnxTR">OnnxTR</a></h3>
        </div>

        <h3>The uploaded wordlists and fonts will be used to generate synthetic data.</h3>

        <h3>All uploaded files can be found here: <a href="https://huggingface.co/datasets/Felix92/docTR-resource-collection">Hugging Face dataset</a></h3>

        <br>
        <br>

        <h3>By uploading a wordlist or font, you explicitly agree to the following terms:</h3>

        <h3>1. You affirm that you are the owner or have the necessary rights to upload and share the wordlist or font.</h3>

        <h3>2. You agree that the uploaded wordlists / fonts will be made publicly available to everyone.</h3>

        <h3>3. You agree that the uploaded wordlists / fonts can be used for any purpose, including commercial use, by any third party.</h3>
        """
    )
    agree_button = gr.Button("I Agree to the Terms and Conditions")
    # Agreement flag passed to handle_uploads; starts as False.
    agree_state = gr.State(value=False)

    # Upload form, hidden until the agreement has been accepted.
    with gr.Column(visible=False) as upload_section:
        # Status area that handle_uploads fills with its result message.
        success_message = gr.Markdown(visible=True)
        font_upload = gr.File(label="Upload Font File [TTF | OTF]", file_types=[".ttf", ".otf"], type="filepath")
        wordlist_upload = gr.File(label="Upload Wordlist [TXT]", file_types=[".txt"], type="filepath")
        submit_button = gr.Button("Submit")

    def toggle_agreement_visibility():
        """Hide the agreement text and button, set the agree flag, show the upload form.

        The returned values map, in order, to ``agreement_markdown``,
        ``agree_button``, ``agree_state`` and ``upload_section``.
        """
        return gr.update(visible=False), gr.update(visible=False), True, gr.update(visible=True)

    agree_button.click(fn=toggle_agreement_visibility, inputs=None, outputs=[agreement_markdown, agree_button, agree_state, upload_section])

    # handle_uploads must return four values in the same order as `outputs`.
    submit_button.click(
        fn=handle_uploads,
        inputs=[font_upload, wordlist_upload, agree_state],
        outputs=[agree_button, success_message, font_upload, wordlist_upload],
    )

# Start the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()