Spaces:
Sleeping
Sleeping
kapllan
committed on
Commit
·
1cdf555
1
Parent(s):
8a95fb3
First commit for migrating the swiss topic modelling space.
Browse files- README.md +3 -3
- app.py +100 -0
- id2label.json +227 -0
- install_packages.py +57 -0
- requirements.txt +21 -0
README.md
CHANGED
|
@@ -1,10 +1,10 @@
|
|
| 1 |
---
|
| 2 |
title: SwissParlTopicModelling
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
colorTo: red
|
| 6 |
sdk: gradio
|
| 7 |
-
sdk_version: 4.
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
---
|
|
|
|
| 1 |
---
|
| 2 |
title: SwissParlTopicModelling
|
| 3 |
+
emoji: 📉
|
| 4 |
+
colorFrom: indigo
|
| 5 |
colorTo: red
|
| 6 |
sdk: gradio
|
| 7 |
+
sdk_version: 4.32.2
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
---
|
app.py
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json as js
import os
import re
from typing import List, Tuple

import fasttext
import gradio as gr
import joblib
import omikuji
from huggingface_hub import snapshot_download

from install_packages import download_model
|
| 12 |
+
|
| 13 |
+
# Fetch the fastText language-identification model that is loaded into lang_model below.
download_model('https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin', 'lid.176.bin')

# Download the model files from Hugging Face
for repo_id in ['kapllan/omikuji-bonsai-parliament-de-spacy', 'kapllan/omikuji-bonsai-parliament-fr-spacy',
                'kapllan/omikuji-bonsai-parliament-it-spacy']:
    # exist_ok=True avoids the check-then-create race of os.path.exists() + os.makedirs().
    os.makedirs(repo_id, exist_ok=True)
    snapshot_download(repo_id=repo_id, local_dir=repo_id)

lang_model = fasttext.load_model('lid.176.bin')

# The label names contain non-ASCII characters (e.g. umlauts), so the encoding is
# stated explicitly instead of relying on the locale default.
with open('./id2label.json', 'r', encoding='utf-8') as f:
    id2label = js.load(f)
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def map_language(language: str) -> str:
    """Translate a language code into its English display name.

    Args:
        language: Language code such as ``'de'``, ``'fr'`` or ``'it'``.

    Returns:
        ``'German'``, ``'French'`` or ``'Italian'`` for the three known codes;
        any other value is returned unchanged.
    """
    language_mapping = {'de': 'German',
                        'it': 'Italian',
                        'fr': 'French'}
    # dict.get with the input as its own default replaces the membership test + if/else.
    return language_mapping.get(language, language)
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
# (vectorizer, model) pairs keyed by language code, so each pair is read from disk
# only once instead of on every request.
_MODEL_CACHE = {}


def find_model(language: str):
    """Return the vectorizer and omikuji model for a language code.

    Args:
        language: Language code; only ``'de'``, ``'fr'`` and ``'it'`` have models.

    Returns:
        A ``(vectorizer, model)`` tuple, or ``(None, None)`` when *language* is
        not one of the three supported codes.
    """
    if language not in ('de', 'fr', 'it'):
        return None, None

    if language not in _MODEL_CACHE:
        model_root = f'./kapllan/omikuji-bonsai-parliament-{language}-spacy'
        # NOTE(review): joblib.load unpickles the file, which can execute arbitrary
        # code; this is only acceptable while the downloaded repository is trusted.
        vectorizer = joblib.load(f'{model_root}/vectorizer')
        model = omikuji.Model.load(f'{model_root}/omikuji-model')
        _MODEL_CACHE[language] = (vectorizer, model)

    return _MODEL_CACHE[language]
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def predict_lang(text: str) -> str:
    """Detect the language of *text* with the module-level fastText model.

    Args:
        text: Document text; may span several lines.

    Returns:
        The top predicted label with its ``__label__`` prefix removed.
    """
    # fastText predicts one line at a time, so line breaks have to go. They are
    # replaced by a space rather than deleted; deleting them fuses the last word
    # of one line with the first word of the next.
    text = text.replace('\n', ' ')
    predictions = lang_model.predict(text, k=1)  # k=1: only the single best label is requested
    language = predictions[0][0]  # first label of the first returned element
    return language.replace('__label__', '')
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def predict_topic(text: str) -> Tuple[List[Tuple[str, float]], str]:
    """Predict topic labels and their scores for a document.

    The language is detected first; the matching vectorizer and model (if any)
    are then used to score the text.

    Args:
        text: The document to classify.

    Returns:
        A ``(results, language)`` tuple. ``results`` is a list of
        ``(label, score)`` pairs and is empty when no model exists for the
        detected language or the vectorized text has no non-zero features.
        ``language`` is the output of ``map_language`` for the detected code.
    """
    language = predict_lang(text)
    vectorizer, model = find_model(language)
    language = map_language(language)

    results = []
    if vectorizer is None:
        return results, language

    for row in vectorizer.transform([text]):
        if row.nnz == 0:  # All zero vector, empty result
            continue
        # The model is fed (column index, value) pairs for the non-zero features.
        feature_values = [(col, row[0, col]) for col in row.nonzero()[1]]
        for subj_id, score in model.predict(feature_values, top_k=1000):
            # id2label is keyed by the label id as a string.
            results.append((id2label[str(subj_id)], score))
    return results, language
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def topic_modeling(text: str, threshold: float) -> Tuple[List[Tuple[str, float]], str]:
    """Return the predicted topics whose score reaches *threshold*.

    Args:
        text: The document to classify.
        threshold: Minimum score a topic needs in order to be kept.

    Returns:
        A ``(topics, language)`` tuple. ``topics`` is a list of
        ``(label, score)`` pairs with ``score >= threshold``, in the order
        ``predict_topic`` produced them; it is empty when the language is not
        German, French or Italian. ``language`` is passed through unchanged.
    """
    topics, language = predict_topic(text)
    if language not in ('German', 'French', 'Italian'):
        return [], language
    # An empty prediction list simply filters to an empty list, so no length check is needed.
    return [topic for topic in topics if topic[1] >= threshold], language
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
# Build the Gradio UI: document input and controls in the first column,
# the result table in the second.
with gr.Blocks() as iface:
    gr.Markdown("# Topic Modeling")
    gr.Markdown("Enter a document and get each topic along with its score.")

    with gr.Row():
        with gr.Column():
            input_text = gr.Textbox(placeholder="Enter a document", lines=10)
            submit_button = gr.Button("Submit")
            threshold_slider = gr.Slider(
                label="Score Threshold",
                minimum=0.0,
                maximum=1.0,
                step=0.01,
                value=0.0,
            )
            language_text = gr.Textbox(
                label="Detected Language",
                placeholder="Detected language will be shown here...",
                lines=1,
                interactive=False,
            )
        with gr.Column():
            output_data = gr.Dataframe(headers=["Label", "Score"])

    # Clicking "Submit" runs topic_modeling on the text and the threshold; its two
    # return values fill the table and the detected-language box, in that order.
    submit_button.click(
        fn=topic_modeling,
        inputs=[input_text, threshold_slider],
        outputs=[output_data, language_text],
    )

# Launch the app
iface.launch(share=True)
|
id2label.json
ADDED
|
@@ -0,0 +1,227 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"0": "AHV",
|
| 3 |
+
"1": "Abfall",
|
| 4 |
+
"2": "Abgabe",
|
| 5 |
+
"3": "Agrarpolitik",
|
| 6 |
+
"4": "Agrarproduktion",
|
| 7 |
+
"5": "Altersfragen",
|
| 8 |
+
"6": "Arbeit",
|
| 9 |
+
"7": "Arbeitslosenversicherung",
|
| 10 |
+
"8": "Arbeitslosigkeit",
|
| 11 |
+
"9": "Arbeitsmarkt",
|
| 12 |
+
"10": "Arbeitsrecht",
|
| 13 |
+
"11": "Armee",
|
| 14 |
+
"12": "Armut/Ungleichheit",
|
| 15 |
+
"13": "Artenvielfalt",
|
| 16 |
+
"14": "Asylpolitik",
|
| 17 |
+
"15": "Auslandschweizer",
|
| 18 |
+
"16": "Ausländerpolitik",
|
| 19 |
+
"17": "Aussenpolitik : Ausland",
|
| 20 |
+
"18": "Aussenpolitik : Schweiz",
|
| 21 |
+
"19": "Aussenwirtschaftspolitik",
|
| 22 |
+
"20": "Bankenkrise",
|
| 23 |
+
"21": "Bauwesen/Immobilien",
|
| 24 |
+
"22": "Behinderung",
|
| 25 |
+
"23": "Berg",
|
| 26 |
+
"24": "Berufliche Vorsorge",
|
| 27 |
+
"25": "Berufsbildung",
|
| 28 |
+
"26": "Beschwerderecht",
|
| 29 |
+
"27": "Beschäftigung und Arbeit",
|
| 30 |
+
"28": "Bewaffnung",
|
| 31 |
+
"29": "Beziehung Schweiz - EU",
|
| 32 |
+
"30": "Bildung",
|
| 33 |
+
"31": "Boden",
|
| 34 |
+
"32": "Bürgerrecht",
|
| 35 |
+
"33": "Datenschutz",
|
| 36 |
+
"34": "Demokratie",
|
| 37 |
+
"35": "Digitalisierung",
|
| 38 |
+
"36": "Diplomatie",
|
| 39 |
+
"37": "Diskriminierung",
|
| 40 |
+
"38": "Elektrizität",
|
| 41 |
+
"39": "Energie",
|
| 42 |
+
"40": "Energiepolitik",
|
| 43 |
+
"41": "Erberecht",
|
| 44 |
+
"42": "Ernährung",
|
| 45 |
+
"43": "Erwerbsersatzordnung",
|
| 46 |
+
"44": "Europapolitik",
|
| 47 |
+
"45": "Europarat",
|
| 48 |
+
"46": "Europarecht",
|
| 49 |
+
"47": "Europäische Union",
|
| 50 |
+
"48": "Europäisches Parlament",
|
| 51 |
+
"49": "Exekutive",
|
| 52 |
+
"50": "Familienfragen",
|
| 53 |
+
"51": "Familienrecht",
|
| 54 |
+
"52": "Familienzulage",
|
| 55 |
+
"53": "Finanzausgleich",
|
| 56 |
+
"54": "Finanzmarkt",
|
| 57 |
+
"55": "Finanzplatz",
|
| 58 |
+
"56": "Finanzrecht",
|
| 59 |
+
"57": "Finanzwesen",
|
| 60 |
+
"58": "Flüchtling",
|
| 61 |
+
"59": "Forschung",
|
| 62 |
+
"60": "Fortpflanzung",
|
| 63 |
+
"61": "Fossile Energie",
|
| 64 |
+
"62": "Föderalismus",
|
| 65 |
+
"63": "Geld- und Währungspolitik",
|
| 66 |
+
"64": "Geldwäscherei",
|
| 67 |
+
"65": "Gentechnologie",
|
| 68 |
+
"66": "Gerichtswesen",
|
| 69 |
+
"67": "Geschichte Ausland",
|
| 70 |
+
"68": "Geschichte Schweiz",
|
| 71 |
+
"69": "Geschlechterfragen",
|
| 72 |
+
"70": "Gesellschaftsfragen",
|
| 73 |
+
"71": "Gesundheit",
|
| 74 |
+
"72": "Gesundheitspolitik",
|
| 75 |
+
"73": "Gewalt",
|
| 76 |
+
"74": "Gewerkschaft",
|
| 77 |
+
"75": "Globalisierung",
|
| 78 |
+
"76": "Grenze",
|
| 79 |
+
"77": "Grundrechte",
|
| 80 |
+
"78": "Güterverkehr",
|
| 81 |
+
"79": "Handel",
|
| 82 |
+
"80": "Heil- und Hilfsmittel",
|
| 83 |
+
"81": "Informatik",
|
| 84 |
+
"82": "Information",
|
| 85 |
+
"83": "Informationswissenschaft",
|
| 86 |
+
"84": "Internationale Politik",
|
| 87 |
+
"85": "Internationales Recht",
|
| 88 |
+
"86": "Internet und soziale Medien",
|
| 89 |
+
"87": "Interventionspolitik",
|
| 90 |
+
"88": "Invalidenversicherung",
|
| 91 |
+
"89": "Jagd und Fischerei",
|
| 92 |
+
"90": "Kapital",
|
| 93 |
+
"91": "Katastrophe",
|
| 94 |
+
"92": "Kernenergie",
|
| 95 |
+
"93": "Kinder- und Jugendfragen",
|
| 96 |
+
"94": "Kinderrechte",
|
| 97 |
+
"95": "Kindes- und Erwachsenenschutzrecht",
|
| 98 |
+
"96": "Klimafragen",
|
| 99 |
+
"97": "Konkursrecht",
|
| 100 |
+
"98": "Konsum",
|
| 101 |
+
"99": "Korruption",
|
| 102 |
+
"100": "Krankenversicherung",
|
| 103 |
+
"101": "Krieg",
|
| 104 |
+
"102": "Krise",
|
| 105 |
+
"103": "Kultur",
|
| 106 |
+
"104": "Landwirtschaft",
|
| 107 |
+
"105": "Luft",
|
| 108 |
+
"106": "Luftfahrt",
|
| 109 |
+
"107": "Lärm",
|
| 110 |
+
"108": "Medien",
|
| 111 |
+
"109": "Medien / Kommunikation",
|
| 112 |
+
"110": "Medienrecht",
|
| 113 |
+
"111": "Medizinalberuf",
|
| 114 |
+
"112": "Menschenrechte",
|
| 115 |
+
"113": "Miet- und Wohnungswesen",
|
| 116 |
+
"114": "Migration",
|
| 117 |
+
"115": "Migrationsbewegung",
|
| 118 |
+
"116": "Mutterschaftsversicherung",
|
| 119 |
+
"117": "Nationalbank",
|
| 120 |
+
"118": "Obligationenrecht",
|
| 121 |
+
"119": "Parlament",
|
| 122 |
+
"120": "Parlament Ausland",
|
| 123 |
+
"121": "Parlament Schweiz",
|
| 124 |
+
"122": "Patient",
|
| 125 |
+
"123": "Personenrecht",
|
| 126 |
+
"124": "Pflege",
|
| 127 |
+
"125": "Post",
|
| 128 |
+
"126": "Presse",
|
| 129 |
+
"127": "Privatversicherung",
|
| 130 |
+
"128": "Produktion",
|
| 131 |
+
"129": "Radio und Fernsehen",
|
| 132 |
+
"130": "Rassismus",
|
| 133 |
+
"131": "Ratsmitglied",
|
| 134 |
+
"132": "Raumplanung",
|
| 135 |
+
"133": "Raumplanung und Wohnungswesen",
|
| 136 |
+
"134": "Recht Allgemein",
|
| 137 |
+
"135": "Rechte und Freiheiten",
|
| 138 |
+
"136": "Rechtswissenschaft",
|
| 139 |
+
"137": "Religionsfragen",
|
| 140 |
+
"138": "Sachenrecht",
|
| 141 |
+
"139": "Sans-Papiers",
|
| 142 |
+
"140": "Schiedsgerichtsbarkeit",
|
| 143 |
+
"141": "Schienenverkehr",
|
| 144 |
+
"142": "Schifffahrt",
|
| 145 |
+
"143": "Schule",
|
| 146 |
+
"144": "Service public",
|
| 147 |
+
"145": "Sicherheitspolitik",
|
| 148 |
+
"146": "Sicherheitspolitik/Friedenspolitik",
|
| 149 |
+
"147": "Soziale Fragen",
|
| 150 |
+
"148": "Sozialer Schutz",
|
| 151 |
+
"149": "Sozialhilfe",
|
| 152 |
+
"150": "Sozialpolitik",
|
| 153 |
+
"151": "Sozialversicherung",
|
| 154 |
+
"152": "Spiel",
|
| 155 |
+
"153": "Spital",
|
| 156 |
+
"154": "Sport",
|
| 157 |
+
"155": "Sprache",
|
| 158 |
+
"156": "Staat",
|
| 159 |
+
"157": "Staatspolitik",
|
| 160 |
+
"158": "Staatssouveränität",
|
| 161 |
+
"159": "Sterben und Tod",
|
| 162 |
+
"160": "Steuer",
|
| 163 |
+
"161": "Steuerhinterziehung",
|
| 164 |
+
"162": "Steuerrecht",
|
| 165 |
+
"163": "Steuerwettbewerb",
|
| 166 |
+
"164": "Stiftung",
|
| 167 |
+
"165": "Strafprozessordnung",
|
| 168 |
+
"166": "Strafrecht",
|
| 169 |
+
"167": "Straftat",
|
| 170 |
+
"168": "Strassenverkehr",
|
| 171 |
+
"169": "Sucht",
|
| 172 |
+
"170": "Telefonie",
|
| 173 |
+
"171": "Terrorismus",
|
| 174 |
+
"172": "Tierschutz",
|
| 175 |
+
"173": "Tierversuch",
|
| 176 |
+
"174": "Tourismus",
|
| 177 |
+
"175": "Umwelt",
|
| 178 |
+
"176": "Umweltpolitik",
|
| 179 |
+
"177": "Umweltschutz",
|
| 180 |
+
"178": "Unfallversicherung",
|
| 181 |
+
"179": "Universität/Hochschule/Fachhochschule",
|
| 182 |
+
"180": "Unternehmen",
|
| 183 |
+
"181": "Urheberrecht",
|
| 184 |
+
"182": "Verfahrensrecht",
|
| 185 |
+
"183": "Verfassung",
|
| 186 |
+
"184": "Vergaberecht",
|
| 187 |
+
"185": "Verkehr",
|
| 188 |
+
"186": "Verkehrspolitik",
|
| 189 |
+
"187": "Vertrag",
|
| 190 |
+
"188": "Verwaltungsrecht",
|
| 191 |
+
"189": "Volksabstimmung",
|
| 192 |
+
"190": "Vorrechte und Immunität",
|
| 193 |
+
"191": "Wahlen",
|
| 194 |
+
"192": "Wald",
|
| 195 |
+
"193": "Wasser",
|
| 196 |
+
"194": "Weiterbildung",
|
| 197 |
+
"195": "Wettbewerb",
|
| 198 |
+
"196": "Wirtschaft",
|
| 199 |
+
"197": "Wirtschaftsleben",
|
| 200 |
+
"198": "Wirtschaftspolitik",
|
| 201 |
+
"199": "Wissenschaft / Forschung",
|
| 202 |
+
"200": "Zivilprozessordnung",
|
| 203 |
+
"201": "Zivilrecht",
|
| 204 |
+
"202": "Zivilschutz und Bevölkerungsschutz/Zivildienst",
|
| 205 |
+
"203": "Zoll",
|
| 206 |
+
"204": "erneuerbare Energie",
|
| 207 |
+
"205": "innere Sicherheit",
|
| 208 |
+
"206": "internationale Beziehungen",
|
| 209 |
+
"207": "internationale Organisation",
|
| 210 |
+
"208": "internationale Politik",
|
| 211 |
+
"209": "internationale Rechtshilfe",
|
| 212 |
+
"210": "internationale Strafjustiz",
|
| 213 |
+
"211": "internationale Zusammenarbeit",
|
| 214 |
+
"212": "internationaler Konflikt",
|
| 215 |
+
"213": "internationales Abkommen",
|
| 216 |
+
"214": "internationales Privatrecht",
|
| 217 |
+
"215": "internationales Recht",
|
| 218 |
+
"216": "internationales humanitäres Recht",
|
| 219 |
+
"217": "kantonales Parlament",
|
| 220 |
+
"218": "politische Partei",
|
| 221 |
+
"219": "politische Rechte",
|
| 222 |
+
"220": "politisches Leben",
|
| 223 |
+
"221": "politisches System",
|
| 224 |
+
"222": "öffentliche Finanzen",
|
| 225 |
+
"223": "öffentliche Verwaltung",
|
| 226 |
+
"224": "öffentlicher Verkehr"
|
| 227 |
+
}
|
install_packages.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import subprocess
|
| 3 |
+
import sys
|
| 4 |
+
|
| 5 |
+
import requests
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def download_model(url, save_path):
    """Download *url* to *save_path*, streaming the body to disk.

    Args:
        url: Address of the file to fetch.
        save_path: Destination path; replaced only after a complete download.

    Returns:
        True if the server answered with status 200 and the file was written,
        False otherwise. A message is printed in both cases.
    """
    # The timeout keeps a stalled server from blocking forever, and the context
    # manager releases the streamed connection when the block is left.
    with requests.get(url, stream=True, timeout=60) as response:
        if response.status_code != 200:
            # Print an error message if the request was not successful
            print(f"Failed to download model. Status code: {response.status_code}")
            return False

        # Write under a temporary name first so an interrupted transfer never
        # leaves a truncated file at save_path.
        partial_path = f"{save_path}.part"
        with open(partial_path, 'wb') as f:
            # 1 MiB chunks: far fewer write calls than 1 KiB chunks for a large file.
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)
        os.replace(partial_path, save_path)

    print("Model downloaded successfully!")
    return True
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def set_tokenizers_parallelism(value):
    """Set TOKENIZERS_PARALLELISM to 'true' or 'false' based on the truthiness of *value*.

    The variable is written to ``os.environ`` and the new setting is printed.
    """
    flag = 'false'
    if value:
        flag = 'true'
    os.environ['TOKENIZERS_PARALLELISM'] = flag
    print(f"TOKENIZERS_PARALLELISM set to {flag}")
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def _run_or_exit(command, success_message, failure_prefix):
    """Run *command*; print *success_message*, or report the failure and exit with status 1."""
    try:
        subprocess.check_call(command)
    except subprocess.CalledProcessError as e:
        # Failures go to stderr so they are not mixed into normal output.
        print(f"{failure_prefix}: {e}", file=sys.stderr)
        sys.exit(1)
    print(success_message)


def install_requirements():
    """Install packages listed in requirements.txt.

    Runs pip through the current interpreter and exits the process with
    status 1 if pip returns a non-zero exit code.
    """
    _run_or_exit(
        [sys.executable, "-m", "pip", "install", "-r", "requirements.txt"],
        "All packages from requirements.txt installed successfully.",
        "Failed to install packages from requirements.txt",
    )


def install_spacy_model(model_name):
    """Install a specific spaCy model.

    Args:
        model_name: Name passed to ``python -m spacy download``.

    Exits the process with status 1 if the download command returns a
    non-zero exit code.
    """
    _run_or_exit(
        [sys.executable, "-m", "spacy", "download", model_name],
        f"spaCy model '{model_name}' installed successfully.",
        f"Failed to install spaCy model '{model_name}'",
    )
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
if __name__ == "__main__":
    # Bootstrap when run as a script: Python packages first, then the three
    # spaCy models, the tokenizer setting, and finally the fastText model file.
    install_requirements()
    for spacy_model in ("de_core_news_lg", "fr_core_news_lg", "it_core_news_lg"):
        install_spacy_model(spacy_model)
    set_tokenizers_parallelism(True)
    download_model('https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin', 'lid.176.bin')
|
requirements.txt
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Requests==2.32.2
|
| 2 |
+
annif
|
| 3 |
+
beautifulsoup4==4.12.3
|
| 4 |
+
datasets==2.14.5
|
| 5 |
+
fasttext==0.9.2
|
| 6 |
+
gradio
|
| 7 |
+
iterative_stratification==0.1.7
|
| 8 |
+
nltk==3.8.1
|
| 9 |
+
numpy==1.24.4
|
| 10 |
+
omikuji==0.5.1
|
| 11 |
+
openpyxl
|
| 12 |
+
pandas==2.2.2
|
| 13 |
+
pytz==2023.3.post1
|
| 14 |
+
scikit_learn==1.3.2
|
| 15 |
+
sentence_transformers==2.2.2
|
| 16 |
+
swissparlpy==0.3.0
|
| 17 |
+
tqdm==4.66.1
|
| 18 |
+
transformers==4.39.3
|
| 19 |
+
spacy==3.7.4
|
| 20 |
+
huggingface_hub
|
| 21 |
+
requests
|