matsuap committed on
Commit
7dffd7c
·
1 Parent(s): 63887f7

住所検索機能を拡張し、検索結果に都道府県、郡、市区町村、政令市区、大字・町、丁目、小字を追加。データフレームの構造を更新し、住所生成のロジックを改善。処理時間の測定を強化し、重複アドレスを除去する機能を追加。

Browse files
Files changed (1) hide show
  1. app.py +85 -19
app.py CHANGED
@@ -15,7 +15,6 @@ import time
15
  from contextlib import contextmanager
16
  import numpy as np
17
 
18
-
19
  # .envファイルを読み込む
20
  load_dotenv()
21
 
@@ -174,16 +173,23 @@ def search_via_milvus(query_vector, top_k, collection_name, thresh=0.0):
174
  search_params=search_params,
175
  limit=top_k,
176
  anns_field='embedding',
177
- output_fields=['address'],
178
  )[0]
179
 
180
  hits = []
181
  for i, result in enumerate(results, start=1):
182
  distance = result['distance']
183
  address = result['entity'].get('address')
 
 
 
 
 
 
 
184
 
185
  if distance >= thresh:
186
- hits.append([i, distance, address])
187
 
188
  return hits
189
 
@@ -373,42 +379,61 @@ def create_vector_search_tab():
373
  preprocessed = preprocess(query_address)
374
  with measure('vector_search'):
375
  hits = vector_search(preprocessed, top_k)
376
- normalized = hits[0][-1]
377
- search_result_df = pd.DataFrame(hits, columns=['Top-k', '類似度', '住所'])
378
  with measure('split_address'):
379
- splits = split_address(normalized)
380
- data = {
381
- 'pref': splits['pref'],
382
- 'city': splits['city'],
383
- 'town': splits['town'],
384
- 'addr': splits['addr'],
 
 
385
  }
386
- result_df = pd.DataFrame([data])
387
  with measure('load city_all_file'):
388
  target_dir = Path(r'C:\Users\taish\Development\whispercustom\projects\abr-geocoder\temp\download')
389
  city_all_file = target_dir / 'mt_city_all.csv'
390
  city_all_df = pd.read_csv(city_all_file)
391
  city_all_df_temp = city_all_df[city_all_df['pref'] == splits['pref']]
392
- lg_codes = city_all_df_temp[city_all_df_temp['county'].fillna('') + city_all_df_temp['city'].fillna('') + city_all_df_temp['ward'].fillna('') == splits['city']]['lg_code'].values
 
 
393
  if len(lg_codes) > 1:
394
  raise Exception('Too many lg_code')
395
  lg_code = lg_codes[0]
396
  with measure('load parcel_city_file'):
397
  parcel_city_file = target_dir / f'mt_parcel_city{lg_code:06d}.csv'
398
  if not os.path.exists(parcel_city_file):
399
- raise gr.Error('Too many lg_code')
 
400
  parcel_city_df = pd.read_csv(parcel_city_file)
401
- parcel_city_df = parcel_city_df[parcel_city_df['city'].fillna('') + parcel_city_df['ward'].fillna('') == splits['city']]
402
- parcel_city_df = parcel_city_df[parcel_city_df['oaza_cho'].fillna('') + parcel_city_df['chome'].fillna('') == splits['town']]
403
 
404
  cities = parcel_city_df['city'].fillna('')
405
  wards = parcel_city_df['ward'].fillna('')
406
  oaza_chos = parcel_city_df['oaza_cho'].fillna('')
407
  chomes = parcel_city_df['chome'].fillna('')
408
  koazas = parcel_city_df['koaza'].fillna('')
409
- prc_num1s = parcel_city_df['prc_num1'].fillna(9999).astype(int).astype(str).replace('9999', '')
410
- prc_num2s = parcel_city_df['prc_num2'].fillna(9999).astype(int).astype(str).replace('9999', '')
411
- prc_num3s = parcel_city_df['prc_num3'].fillna(9999).astype(int).astype(str).replace('9999', '')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
412
 
413
  # アドレスを生成
414
  addresses = [
@@ -419,6 +444,47 @@ def create_vector_search_tab():
419
  cities, wards, oaza_chos, chomes, koazas, prc_num1s, prc_num2s, prc_num3s
420
  )
421
  ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
422
  with measure('query_embed'):
423
  query_embed = embed_via_multilingual_e5_large([query_address])
424
  with measure('address_embeds'):
 
15
  from contextlib import contextmanager
16
  import numpy as np
17
 
 
18
  # .envファイルを読み込む
19
  load_dotenv()
20
 
 
173
  search_params=search_params,
174
  limit=top_k,
175
  anns_field='embedding',
176
+ output_fields=['address', 'pref', 'county', 'city', 'ward', 'oaza_cho', 'chome', 'koaza'],
177
  )[0]
178
 
179
  hits = []
180
  for i, result in enumerate(results, start=1):
181
  distance = result['distance']
182
  address = result['entity'].get('address')
183
+ pref = result['entity'].get('pref')
184
+ county = result['entity'].get('county')
185
+ city = result['entity'].get('city')
186
+ ward = result['entity'].get('ward')
187
+ oaza_cho = result['entity'].get('oaza_cho')
188
+ chome = result['entity'].get('chome')
189
+ koaza = result['entity'].get('koaza')
190
 
191
  if distance >= thresh:
192
+ hits.append([i, distance, address, pref, county, city, ward, oaza_cho, chome, koaza])
193
 
194
  return hits
195
 
 
379
  preprocessed = preprocess(query_address)
380
  with measure('vector_search'):
381
  hits = vector_search(preprocessed, top_k)
382
+ search_result_df = pd.DataFrame(hits, columns=['Top-k', '類似度', '住所', '都道府県', '郡', '市区町村', '政令市区', '大字・町', '丁目', '小字'])
 
383
  with measure('split_address'):
384
+ splits = {
385
+ 'pref': hits[0][3],
386
+ 'county': hits[0][4],
387
+ 'city': hits[0][5],
388
+ 'ward': hits[0][6],
389
+ 'oaza_cho': hits[0][7],
390
+ 'chome': hits[0][8],
391
+ 'koaza': hits[0][9],
392
  }
393
+ result_df = pd.DataFrame([splits.values()], columns=splits.keys())
394
  with measure('load city_all_file'):
395
  target_dir = Path(r'C:\Users\taish\Development\whispercustom\projects\abr-geocoder\temp\download')
396
  city_all_file = target_dir / 'mt_city_all.csv'
397
  city_all_df = pd.read_csv(city_all_file)
398
  city_all_df_temp = city_all_df[city_all_df['pref'] == splits['pref']]
399
+ city_name1 = city_all_df_temp['county'].fillna('') + city_all_df_temp['city'].fillna('') + city_all_df_temp['ward'].fillna('')
400
+ city_name2 = splits['county'] + splits['city'] + splits['ward']
401
+ lg_codes = city_all_df_temp[city_name1 == city_name2]['lg_code'].values
402
  if len(lg_codes) > 1:
403
  raise Exception('Too many lg_code')
404
  lg_code = lg_codes[0]
405
  with measure('load parcel_city_file'):
406
  parcel_city_file = target_dir / f'mt_parcel_city{lg_code:06d}.csv'
407
  if not os.path.exists(parcel_city_file):
408
+ # raise gr.Error('Too many lg_code')
409
+ raise Exception('Too many lg_code')
410
  parcel_city_df = pd.read_csv(parcel_city_file)
 
 
411
 
412
  cities = parcel_city_df['city'].fillna('')
413
  wards = parcel_city_df['ward'].fillna('')
414
  oaza_chos = parcel_city_df['oaza_cho'].fillna('')
415
  chomes = parcel_city_df['chome'].fillna('')
416
  koazas = parcel_city_df['koaza'].fillna('')
417
+
418
+ city_name1 = cities + wards
419
+ city_name2 = splits['county'] + splits['city'] + splits['ward']
420
+ city_mask = city_name1 == city_name2
421
+
422
+ town_name1 = oaza_chos + chomes
423
+ town_name2 = splits['oaza_cho'] + splits['chome']
424
+ town_mask = town_name1 == town_name2
425
+
426
+ koaza_mask = koazas == splits['koaza']
427
+ parcel_city_df_filtered = parcel_city_df[city_mask & town_mask & koaza_mask]
428
+
429
+ cities = parcel_city_df_filtered['city'].fillna('')
430
+ wards = parcel_city_df_filtered['ward'].fillna('')
431
+ oaza_chos = parcel_city_df_filtered['oaza_cho'].fillna('')
432
+ chomes = parcel_city_df_filtered['chome'].fillna('')
433
+ koazas = parcel_city_df_filtered['koaza'].fillna('')
434
+ prc_num1s = parcel_city_df_filtered['prc_num1'].fillna(9999).astype(int).astype(str).replace('9999', '')
435
+ prc_num2s = parcel_city_df_filtered['prc_num2'].fillna(9999).astype(int).astype(str).replace('9999', '')
436
+ prc_num3s = parcel_city_df_filtered['prc_num3'].fillna(9999).astype(int).astype(str).replace('9999', '')
437
 
438
  # アドレスを生成
439
  addresses = [
 
444
  cities, wards, oaza_chos, chomes, koazas, prc_num1s, prc_num2s, prc_num3s
445
  )
446
  ]
447
+ with measure('load rsdtdsp_file'):
448
+ pref_code = ('%06d' % lg_code)[0:2]
449
+ rsdtdsp_dir = Path(rf'G:\マイドライブ\Development\Dataset\Misc\japanese_address\rsdt\original')
450
+ rsdtdsp_file = rsdtdsp_dir / f'mt_rsdtdsp_rsdt_pref{pref_code}.csv\mt_rsdtdsp_rsdt_pref{pref_code}.csv'
451
+ if not os.path.exists(rsdtdsp_file):
452
+ # raise gr.Error(f'Not found: {rsdtdsp_file}')
453
+ raise Exception(f'Not found: {rsdtdsp_file}')
454
+ rsdtdsp_df = pd.read_csv(rsdtdsp_file)
455
+
456
+ city_name1 = rsdtdsp_df['city'].fillna('') + rsdtdsp_df['ward'].fillna('')
457
+ city_name2 = splits['county'] + splits['city'] + splits['ward']
458
+ city_mask = city_name1 == city_name2
459
+
460
+ town_name1 = rsdtdsp_df['oaza_cho'].fillna('') + rsdtdsp_df['chome'].fillna('')
461
+ town_name2 = splits['oaza_cho'] + splits['chome']
462
+ town_mask = town_name1 == town_name2
463
+
464
+ koaza_mask = rsdtdsp_df['koaza'].fillna('') == splits['koaza']
465
+
466
+ rsdtdsp_df_filtered = rsdtdsp_df[city_mask & town_mask & koaza_mask]
467
+
468
+ cities = rsdtdsp_df_filtered['city'].fillna('')
469
+ wards = rsdtdsp_df_filtered['ward'].fillna('')
470
+ oaza_chos = rsdtdsp_df_filtered['oaza_cho'].fillna('')
471
+ chomes = rsdtdsp_df_filtered['chome'].fillna('')
472
+ koazas = rsdtdsp_df_filtered['koaza'].fillna('')
473
+ blk_nums = rsdtdsp_df_filtered['blk_num'].fillna(9999).astype(int).astype(str).replace('9999', '')
474
+ rsdt_nums = rsdtdsp_df_filtered['rsdt_num'].fillna(9999).astype(int).astype(str).replace('9999', '')
475
+ rsdt_num2s = rsdtdsp_df_filtered['rsdt_num2'].fillna(9999).astype(int).astype(str).replace('9999', '')
476
+
477
+ # アドレスを生成
478
+ addresses += [
479
+ f"{splits['pref']}{city}{ward}{oaza_cho}{chome}{koaza}{blk_num}" +
480
+ (f"-{rsdt_num}" if rsdt_num else '') +
481
+ (f"-{rsdt_num2}" if rsdt_num2 else '')
482
+ for city, ward, oaza_cho, chome, koaza, blk_num, rsdt_num, rsdt_num2 in zip(
483
+ cities, wards, oaza_chos, chomes, koazas, blk_nums, rsdt_nums, rsdt_num2s
484
+ )
485
+ ]
486
+
487
+ addresses = list(set(addresses)) # 重複を除去
488
  with measure('query_embed'):
489
  query_embed = embed_via_multilingual_e5_large([query_address])
490
  with measure('address_embeds'):