Update app.py

app.py CHANGED
@@ -999,45 +999,31 @@ target_datasets = {
 
 def get_korea_datasets():
     """Search for Korea-related datasets"""
-    … (the old per-term search loop is mostly truncated in the source view)
-                print(f"Found {len(datasets)} datasets for search term '{term}'")
-            else:
-                print(f"Failed to fetch datasets for term '{term}': {response.status_code}")
-        except Exception as e:
-            print(f"Error fetching datasets for term '{term}': {str(e)}")
-
-    # remove duplicates
-    seen_ids = set()
-    unique_datasets = []
-    for dataset in all_korea_datasets:
-        dataset_id = dataset.get('id', '')
-        if dataset_id and dataset_id not in seen_ids:
-            seen_ids.add(dataset_id)
-            unique_datasets.append(dataset)
-
-    print(f"Total unique Korea-related datasets found: {len(unique_datasets)}")
-    return unique_datasets
+    params = {
+        "search": "korea",
+        "full": "True",
+        "limit": 10000
+    }
+
+    try:
+        response = requests.get(
+            "https://huggingface.co/api/datasets",
+            headers={'Accept': 'application/json'},  # use the Accept header instead of Authorization
+            params=params
+        )
+
+        if response.status_code == 200:
+            korea_datasets = response.json()
+            print(f"Found {len(korea_datasets)} Korea-related datasets")
+            return korea_datasets
+        else:
+            print(f"Failed to fetch Korea datasets: {response.status_code}")
+            return []
+    except Exception as e:
+        print(f"Error fetching Korea datasets: {str(e)}")
+        return []
 
-def get_all_datasets(limit=10000):  # default limit increased
+def get_all_datasets(limit=10000):
     """Fetch all datasets together with the Korea-related ones"""
     all_datasets = []
     page_size = 1000
@@ -1049,55 +1035,64 @@ def get_all_datasets(limit=10000):  # default limit increased
             'offset': offset
         }
 
-        … (the old request/response handling is mostly truncated in the source view)
-            else:
-                print(f"Failed to fetch datasets at offset {offset}: {response.status_code}")
-                break
-        except Exception as e:
-            print(f"Error fetching datasets at offset {offset}: {str(e)}")
+        response = requests.get(
+            "https://huggingface.co/api/datasets",
+            headers={'Accept': 'application/json'},  # use the Accept header instead of Authorization
+            params=params
+        )
+
+        if response.status_code == 200:
+            all_datasets.extend(response.json())
+            print(f"Fetched datasets {offset+1} to {offset+len(response.json())}")
+        else:
+            print(f"Failed to fetch datasets at offset {offset}: {response.status_code}")
             break
 
-    # Korea search
-    … (the old merge logic, including an `added_count += 1` counter, is truncated in the source view)
+    # expand the Korea search results the same way
+    korea_params = {
+        "search": "korea",
+        "full": "True",
+        "limit": limit
+    }
+
+    korea_response = requests.get(
+        "https://huggingface.co/api/datasets",
+        headers={'Accept': 'application/json'},  # use the Accept header instead of Authorization
+        params=korea_params
+    )
+
+    if korea_response.status_code == 200:
+        korea_datasets = korea_response.json()
+        print(f"Fetched {len(korea_datasets)} Korea-related datasets")
+
+        # add the Korea datasets while dropping duplicates
+        existing_ids = {dataset.get('id', '') for dataset in all_datasets}
+        for dataset in korea_datasets:
+            if dataset.get('id', '') not in existing_ids:
+                all_datasets.append(dataset)
+                existing_ids.add(dataset.get('id', ''))
 
+    print(f"Total unique datasets: {len(all_datasets)}")
     return all_datasets[:limit]
 
 def get_datasets_data(progress=gr.Progress()):
     def calculate_rank(dataset_id, all_global_datasets, korea_datasets):
+        # check the global rank
         global_rank = next((idx for idx, d in enumerate(all_global_datasets, 1)
                             if d.get('id', '').strip() == dataset_id.strip()), None)
 
+        # is this a Korea dataset?
        is_korea = any(d.get('id', '').strip() == dataset_id.strip() for d in korea_datasets)
 
         if is_korea:
+            # rank within the Korea datasets
             korea_rank = next((idx for idx, d in enumerate(korea_datasets, 1)
                                if d.get('id', '').strip() == dataset_id.strip()), None)
 
             if korea_rank:
-                return min(global_rank or …
+                return min(global_rank or 10001, korea_rank + 1000), True
 
-        return global_rank if global_rank else 'Not in top …
+        return global_rank if global_rank else 'Not in top 10000', is_korea
 
     try:
         progress(0, desc="Fetching datasets...")
@@ -1113,9 +1108,15 @@ def get_datasets_data(progress=gr.Progress()):
         empty_df = pd.DataFrame(columns=['Global Rank', 'Dataset ID', 'Title', 'Downloads', 'Likes', 'Korea Search', 'URL'])
         return fig, error_html, empty_df
 
-    … (one removed line, truncated in the source view)
+    # fetch both the general datasets and the Korea-related ones
+    all_global_datasets = get_all_datasets(limit=10000)
     korea_datasets = get_korea_datasets()
 
+    print(f"Total global datasets fetched: {len(all_global_datasets)}")
+    print(f"Total Korea datasets fetched: {len(korea_datasets)}")
+
+
+
     filtered_datasets = []
     for dataset_id in target_datasets.keys():
         try:
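The rewritten `get_korea_datasets` replaces the old per-term search loop with a single call to the Hub's public dataset listing endpoint. For reference, a minimal standalone sketch of the same request (endpoint and parameters are taken from the diff; the small `limit` here is just to keep the output short):

```python
import requests

# One anonymous search request against the Hugging Face dataset API,
# mirroring the params dict introduced in the diff.
response = requests.get(
    "https://huggingface.co/api/datasets",
    headers={"Accept": "application/json"},  # no Authorization header needed
    params={"search": "korea", "full": "True", "limit": 10},
)
response.raise_for_status()
for ds in response.json():
    # "full" asks the API to include extended metadata such as download counts
    print(ds.get("id"), ds.get("downloads"))
```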
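In `get_all_datasets`, the hunk shows only the tail of the paging `params` dict (`'offset': offset`) and the new request body; the loop that advances `offset` sits outside the diff. A hypothetical reconstruction of that loop, assuming `offset` steps by `page_size` until `limit` is reached (the name `fetch_all_pages` and the loop shape are guesses for illustration, not the file's actual code):

```python
import requests

def fetch_all_pages(limit=10000, page_size=1000):
    """Hypothetical sketch of the paging loop implied by the diff."""
    all_datasets = []
    for offset in range(0, limit, page_size):
        params = {"full": "True", "limit": page_size, "offset": offset}
        response = requests.get(
            "https://huggingface.co/api/datasets",
            headers={"Accept": "application/json"},
            params=params,
        )
        if response.status_code != 200:
            break  # mirror the diff: stop paging on the first failed request
        page = response.json()
        if not page:
            break  # ran out of results before reaching the limit
        all_datasets.extend(page)
    return all_datasets[:limit]
```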
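The new fallback in `calculate_rank` is worth a gloss: a dataset found by the Korea search but absent from the global top 10000 still gets a sortable synthetic rank of `korea_rank + 1000`, with `10001` acting as a sentinel for "missing globally". Two worked examples with illustrative values:

```python
# Ranked 5th in the Korea search but absent from the global top 10000:
global_rank, korea_rank = None, 5
rank = min(global_rank or 10001, korea_rank + 1000)
print(rank)  # 1005 -> the offset Korea rank beats the 10001 sentinel

# Ranked 42nd globally and 3rd in the Korea search:
global_rank, korea_rank = 42, 3
rank = min(global_rank or 10001, korea_rank + 1000)
print(rank)  # 42 -> the real global rank wins
```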