Spaces:
Sleeping
Sleeping
住所検索機能を拡張し、検索結果に都道府県、郡、市区町村、政令市区、大字・町、丁目、小字を追加。データフレームの構造を更新し、住所生成のロジックを改善。処理時間の測定を強化し、重複アドレスを除去する機能を追加。
Browse files
app.py
CHANGED
@@ -15,7 +15,6 @@ import time
|
|
15 |
from contextlib import contextmanager
|
16 |
import numpy as np
|
17 |
|
18 |
-
|
19 |
# .envファイルを読み込む
|
20 |
load_dotenv()
|
21 |
|
@@ -174,16 +173,23 @@ def search_via_milvus(query_vector, top_k, collection_name, thresh=0.0):
|
|
174 |
search_params=search_params,
|
175 |
limit=top_k,
|
176 |
anns_field='embedding',
|
177 |
-
output_fields=['address'],
|
178 |
)[0]
|
179 |
|
180 |
hits = []
|
181 |
for i, result in enumerate(results, start=1):
|
182 |
distance = result['distance']
|
183 |
address = result['entity'].get('address')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
184 |
|
185 |
if distance >= thresh:
|
186 |
-
hits.append([i, distance, address])
|
187 |
|
188 |
return hits
|
189 |
|
@@ -373,42 +379,61 @@ def create_vector_search_tab():
|
|
373 |
preprocessed = preprocess(query_address)
|
374 |
with measure('vector_search'):
|
375 |
hits = vector_search(preprocessed, top_k)
|
376 |
-
|
377 |
-
search_result_df = pd.DataFrame(hits, columns=['Top-k', '類似度', '住所'])
|
378 |
with measure('split_address'):
|
379 |
-
splits =
|
380 |
-
|
381 |
-
'
|
382 |
-
'city':
|
383 |
-
'
|
384 |
-
'
|
|
|
|
|
385 |
}
|
386 |
-
result_df = pd.DataFrame([
|
387 |
with measure('load city_all_file'):
|
388 |
target_dir = Path(r'C:\Users\taish\Development\whispercustom\projects\abr-geocoder\temp\download')
|
389 |
city_all_file = target_dir / 'mt_city_all.csv'
|
390 |
city_all_df = pd.read_csv(city_all_file)
|
391 |
city_all_df_temp = city_all_df[city_all_df['pref'] == splits['pref']]
|
392 |
-
|
|
|
|
|
393 |
if len(lg_codes) > 1:
|
394 |
raise Exception('Too many lg_code')
|
395 |
lg_code = lg_codes[0]
|
396 |
with measure('load parcel_city_file'):
|
397 |
parcel_city_file = target_dir / f'mt_parcel_city{lg_code:06d}.csv'
|
398 |
if not os.path.exists(parcel_city_file):
|
399 |
-
raise gr.Error('Too many lg_code')
|
|
|
400 |
parcel_city_df = pd.read_csv(parcel_city_file)
|
401 |
-
parcel_city_df = parcel_city_df[parcel_city_df['city'].fillna('') + parcel_city_df['ward'].fillna('') == splits['city']]
|
402 |
-
parcel_city_df = parcel_city_df[parcel_city_df['oaza_cho'].fillna('') + parcel_city_df['chome'].fillna('') == splits['town']]
|
403 |
|
404 |
cities = parcel_city_df['city'].fillna('')
|
405 |
wards = parcel_city_df['ward'].fillna('')
|
406 |
oaza_chos = parcel_city_df['oaza_cho'].fillna('')
|
407 |
chomes = parcel_city_df['chome'].fillna('')
|
408 |
koazas = parcel_city_df['koaza'].fillna('')
|
409 |
-
|
410 |
-
|
411 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
412 |
|
413 |
# アドレスを生成
|
414 |
addresses = [
|
@@ -419,6 +444,47 @@ def create_vector_search_tab():
|
|
419 |
cities, wards, oaza_chos, chomes, koazas, prc_num1s, prc_num2s, prc_num3s
|
420 |
)
|
421 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
422 |
with measure('query_embed'):
|
423 |
query_embed = embed_via_multilingual_e5_large([query_address])
|
424 |
with measure('address_embeds'):
|
|
|
15 |
from contextlib import contextmanager
|
16 |
import numpy as np
|
17 |
|
|
|
18 |
# .envファイルを読み込む
|
19 |
load_dotenv()
|
20 |
|
|
|
173 |
search_params=search_params,
|
174 |
limit=top_k,
|
175 |
anns_field='embedding',
|
176 |
+
output_fields=['address', 'pref', 'county', 'city', 'ward', 'oaza_cho', 'chome', 'koaza'],
|
177 |
)[0]
|
178 |
|
179 |
hits = []
|
180 |
for i, result in enumerate(results, start=1):
|
181 |
distance = result['distance']
|
182 |
address = result['entity'].get('address')
|
183 |
+
pref = result['entity'].get('pref')
|
184 |
+
county = result['entity'].get('county')
|
185 |
+
city = result['entity'].get('city')
|
186 |
+
ward = result['entity'].get('ward')
|
187 |
+
oaza_cho = result['entity'].get('oaza_cho')
|
188 |
+
chome = result['entity'].get('chome')
|
189 |
+
koaza = result['entity'].get('koaza')
|
190 |
|
191 |
if distance >= thresh:
|
192 |
+
hits.append([i, distance, address, pref, county, city, ward, oaza_cho, chome, koaza])
|
193 |
|
194 |
return hits
|
195 |
|
|
|
379 |
preprocessed = preprocess(query_address)
|
380 |
with measure('vector_search'):
|
381 |
hits = vector_search(preprocessed, top_k)
|
382 |
+
search_result_df = pd.DataFrame(hits, columns=['Top-k', '類似度', '住所', '都道府県', '郡', '市区町村', '政令市区', '大字・町', '丁目', '小字'])
|
|
|
383 |
with measure('split_address'):
|
384 |
+
splits = {
|
385 |
+
'pref': hits[0][3],
|
386 |
+
'county': hits[0][4],
|
387 |
+
'city': hits[0][5],
|
388 |
+
'ward': hits[0][6],
|
389 |
+
'oaza_cho': hits[0][7],
|
390 |
+
'chome': hits[0][8],
|
391 |
+
'koaza': hits[0][9],
|
392 |
}
|
393 |
+
result_df = pd.DataFrame([splits.values()], columns=splits.keys())
|
394 |
with measure('load city_all_file'):
|
395 |
target_dir = Path(r'C:\Users\taish\Development\whispercustom\projects\abr-geocoder\temp\download')
|
396 |
city_all_file = target_dir / 'mt_city_all.csv'
|
397 |
city_all_df = pd.read_csv(city_all_file)
|
398 |
city_all_df_temp = city_all_df[city_all_df['pref'] == splits['pref']]
|
399 |
+
city_name1 = city_all_df_temp['county'].fillna('') + city_all_df_temp['city'].fillna('') + city_all_df_temp['ward'].fillna('')
|
400 |
+
city_name2 = splits['county'] + splits['city'] + splits['ward']
|
401 |
+
lg_codes = city_all_df_temp[city_name1 == city_name2]['lg_code'].values
|
402 |
if len(lg_codes) > 1:
|
403 |
raise Exception('Too many lg_code')
|
404 |
lg_code = lg_codes[0]
|
405 |
with measure('load parcel_city_file'):
|
406 |
parcel_city_file = target_dir / f'mt_parcel_city{lg_code:06d}.csv'
|
407 |
if not os.path.exists(parcel_city_file):
|
408 |
+
# raise gr.Error('Too many lg_code')
|
409 |
+
raise Exception('Too many lg_code')
|
410 |
parcel_city_df = pd.read_csv(parcel_city_file)
|
|
|
|
|
411 |
|
412 |
cities = parcel_city_df['city'].fillna('')
|
413 |
wards = parcel_city_df['ward'].fillna('')
|
414 |
oaza_chos = parcel_city_df['oaza_cho'].fillna('')
|
415 |
chomes = parcel_city_df['chome'].fillna('')
|
416 |
koazas = parcel_city_df['koaza'].fillna('')
|
417 |
+
|
418 |
+
city_name1 = cities + wards
|
419 |
+
city_name2 = splits['county'] + splits['city'] + splits['ward']
|
420 |
+
city_mask = city_name1 == city_name2
|
421 |
+
|
422 |
+
town_name1 = oaza_chos + chomes
|
423 |
+
town_name2 = splits['oaza_cho'] + splits['chome']
|
424 |
+
town_mask = town_name1 == town_name2
|
425 |
+
|
426 |
+
koaza_mask = koazas == splits['koaza']
|
427 |
+
parcel_city_df_filtered = parcel_city_df[city_mask & town_mask & koaza_mask]
|
428 |
+
|
429 |
+
cities = parcel_city_df_filtered['city'].fillna('')
|
430 |
+
wards = parcel_city_df_filtered['ward'].fillna('')
|
431 |
+
oaza_chos = parcel_city_df_filtered['oaza_cho'].fillna('')
|
432 |
+
chomes = parcel_city_df_filtered['chome'].fillna('')
|
433 |
+
koazas = parcel_city_df_filtered['koaza'].fillna('')
|
434 |
+
prc_num1s = parcel_city_df_filtered['prc_num1'].fillna(9999).astype(int).astype(str).replace('9999', '')
|
435 |
+
prc_num2s = parcel_city_df_filtered['prc_num2'].fillna(9999).astype(int).astype(str).replace('9999', '')
|
436 |
+
prc_num3s = parcel_city_df_filtered['prc_num3'].fillna(9999).astype(int).astype(str).replace('9999', '')
|
437 |
|
438 |
# アドレスを生成
|
439 |
addresses = [
|
|
|
444 |
cities, wards, oaza_chos, chomes, koazas, prc_num1s, prc_num2s, prc_num3s
|
445 |
)
|
446 |
]
|
447 |
+
with measure('load rsdtdsp_file'):
|
448 |
+
pref_code = ('%06d' % lg_code)[0:2]
|
449 |
+
rsdtdsp_dir = Path(rf'G:\マイドライブ\Development\Dataset\Misc\japanese_address\rsdt\original')
|
450 |
+
rsdtdsp_file = rsdtdsp_dir / f'mt_rsdtdsp_rsdt_pref{pref_code}.csv\mt_rsdtdsp_rsdt_pref{pref_code}.csv'
|
451 |
+
if not os.path.exists(rsdtdsp_file):
|
452 |
+
# raise gr.Error(f'Not found: {rsdtdsp_file}')
|
453 |
+
raise Exception(f'Not found: {rsdtdsp_file}')
|
454 |
+
rsdtdsp_df = pd.read_csv(rsdtdsp_file)
|
455 |
+
|
456 |
+
city_name1 = rsdtdsp_df['city'].fillna('') + rsdtdsp_df['ward'].fillna('')
|
457 |
+
city_name2 = splits['county'] + splits['city'] + splits['ward']
|
458 |
+
city_mask = city_name1 == city_name2
|
459 |
+
|
460 |
+
town_name1 = rsdtdsp_df['oaza_cho'].fillna('') + rsdtdsp_df['chome'].fillna('')
|
461 |
+
town_name2 = splits['oaza_cho'] + splits['chome']
|
462 |
+
town_mask = town_name1 == town_name2
|
463 |
+
|
464 |
+
koaza_mask = rsdtdsp_df['koaza'].fillna('') == splits['koaza']
|
465 |
+
|
466 |
+
rsdtdsp_df_filtered = rsdtdsp_df[city_mask & town_mask & koaza_mask]
|
467 |
+
|
468 |
+
cities = rsdtdsp_df_filtered['city'].fillna('')
|
469 |
+
wards = rsdtdsp_df_filtered['ward'].fillna('')
|
470 |
+
oaza_chos = rsdtdsp_df_filtered['oaza_cho'].fillna('')
|
471 |
+
chomes = rsdtdsp_df_filtered['chome'].fillna('')
|
472 |
+
koazas = rsdtdsp_df_filtered['koaza'].fillna('')
|
473 |
+
blk_nums = rsdtdsp_df_filtered['blk_num'].fillna(9999).astype(int).astype(str).replace('9999', '')
|
474 |
+
rsdt_nums = rsdtdsp_df_filtered['rsdt_num'].fillna(9999).astype(int).astype(str).replace('9999', '')
|
475 |
+
rsdt_num2s = rsdtdsp_df_filtered['rsdt_num2'].fillna(9999).astype(int).astype(str).replace('9999', '')
|
476 |
+
|
477 |
+
# アドレスを生成
|
478 |
+
addresses += [
|
479 |
+
f"{splits['pref']}{city}{ward}{oaza_cho}{chome}{koaza}{blk_num}" +
|
480 |
+
(f"-{rsdt_num}" if rsdt_num else '') +
|
481 |
+
(f"-{rsdt_num2}" if rsdt_num2 else '')
|
482 |
+
for city, ward, oaza_cho, chome, koaza, blk_num, rsdt_num, rsdt_num2 in zip(
|
483 |
+
cities, wards, oaza_chos, chomes, koazas, blk_nums, rsdt_nums, rsdt_num2s
|
484 |
+
)
|
485 |
+
]
|
486 |
+
|
487 |
+
addresses = list(set(addresses)) # 重複を除去
|
488 |
with measure('query_embed'):
|
489 |
query_embed = embed_via_multilingual_e5_large([query_address])
|
490 |
with measure('address_embeds'):
|