matsuap committed on
Commit
05dae99
·
1 Parent(s): c826c92

住所取得機能を追加し、都道府県、郡、市区町村、政令市区、大字・町、丁目、小字を基にアドレスを生成するロジックを実装。重複アドレスの除去機能を強化し、コサイン類似度を用いた検索結果の精度を向上。

Browse files
Files changed (1) hide show
  1. app.py +189 -98
app.py CHANGED
@@ -261,6 +261,150 @@ def search_via_milvus(query_vector, top_k, collection_name, thresh=0.0):
261
 
262
  return hits
263
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
264
  # =========================
265
  # FastAPI definition
266
  # =========================
@@ -282,8 +426,39 @@ def normalize_address(query_address):
282
  with measure('preprocess'):
283
  preprocessed = preprocess(query_address)
284
  with measure('vector_search'):
285
- result = vector_search(preprocessed, top_k=1)[0][-1]
286
- return result
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
287
 
288
  # =========================
289
  # Gradio tabs definition
@@ -459,103 +634,19 @@ def create_vector_search_tab():
459
  'koaza': hits[0][9],
460
  }
461
  result_df = pd.DataFrame([splits.values()], columns=splits.keys())
462
- with measure('load city_all_file'):
463
- city_all_file = TARGET_DIR / 'mt_city_all.csv'
464
- city_all_df = pd.read_csv(city_all_file)
465
- city_all_df_temp = city_all_df[city_all_df['pref'] == splits['pref']]
466
- city_name1 = city_all_df_temp['county'].fillna('') + city_all_df_temp['city'].fillna('') + city_all_df_temp['ward'].fillna('')
467
- city_name2 = splits['county'] + splits['city'] + splits['ward']
468
- lg_codes = city_all_df_temp[city_name1 == city_name2]['lg_code'].values
469
- if len(lg_codes) > 1:
470
- raise Exception('Too many lg_code')
471
- lg_code = lg_codes[0]
472
- with measure('load parcel_city_file'):
473
- parcel_city_file = TARGET_DIR / 'parcel' / f'mt_parcel_city{lg_code:06d}.csv'
474
- if not os.path.exists(parcel_city_file):
475
- # raise gr.Error('Too many lg_code')
476
- raise Exception('Too many lg_code')
477
- parcel_city_df = pd.read_csv(parcel_city_file)
478
-
479
- cities = parcel_city_df['city'].fillna('')
480
- wards = parcel_city_df['ward'].fillna('')
481
- oaza_chos = parcel_city_df['oaza_cho'].fillna('')
482
- chomes = parcel_city_df['chome'].fillna('')
483
- koazas = parcel_city_df['koaza'].fillna('')
484
-
485
- city_name1 = cities + wards
486
- city_name2 = splits['county'] + splits['city'] + splits['ward']
487
- city_mask = city_name1 == city_name2
488
-
489
- town_name1 = oaza_chos + chomes
490
- town_name2 = splits['oaza_cho'] + splits['chome']
491
- town_mask = town_name1 == town_name2
492
-
493
- koaza_mask = koazas == splits['koaza']
494
- parcel_city_df_filtered = parcel_city_df[city_mask & town_mask & koaza_mask]
495
-
496
- cities = parcel_city_df_filtered['city'].fillna('')
497
- wards = parcel_city_df_filtered['ward'].fillna('')
498
- oaza_chos = parcel_city_df_filtered['oaza_cho'].fillna('')
499
- chomes = parcel_city_df_filtered['chome'].fillna('')
500
- koazas = parcel_city_df_filtered['koaza'].fillna('')
501
- prc_num1s = parcel_city_df_filtered['prc_num1'].fillna(9999).astype(int).astype(str).replace('9999', '')
502
- prc_num2s = parcel_city_df_filtered['prc_num2'].fillna(9999).astype(int).astype(str).replace('9999', '')
503
- prc_num3s = parcel_city_df_filtered['prc_num3'].fillna(9999).astype(int).astype(str).replace('9999', '')
504
-
505
- # アドレスを生成
506
- addresses = [
507
- f"{splits['pref']}{city}{ward}{oaza_cho}{chome}{koaza}{prc_num1}" +
508
- (f"-{prc_num2}" if prc_num2 else '') +
509
- (f"-{prc_num3}" if prc_num3 else '')
510
- for city, ward, oaza_cho, chome, koaza, prc_num1, prc_num2, prc_num3 in zip(
511
- cities, wards, oaza_chos, chomes, koazas, prc_num1s, prc_num2s, prc_num3s
512
- )
513
- ]
514
- with measure('load rsdtdsp_file'):
515
- pref_code = ('%06d' % lg_code)[0:2]
516
- rsdtdsp_file = TARGET_DIR / 'rsdt' / f'mt_rsdtdsp_rsdt_pref{pref_code}.csv'
517
- if not os.path.exists(rsdtdsp_file):
518
- # raise gr.Error(f'Not found: {rsdtdsp_file}')
519
- raise Exception(f'Not found: {rsdtdsp_file}')
520
- rsdtdsp_df = pd.read_csv(rsdtdsp_file)
521
-
522
- city_name1 = rsdtdsp_df['city'].fillna('') + rsdtdsp_df['ward'].fillna('')
523
- city_name2 = splits['county'] + splits['city'] + splits['ward']
524
- city_mask = city_name1 == city_name2
525
-
526
- town_name1 = rsdtdsp_df['oaza_cho'].fillna('') + rsdtdsp_df['chome'].fillna('')
527
- town_name2 = splits['oaza_cho'] + splits['chome']
528
- town_mask = town_name1 == town_name2
529
-
530
- koaza_mask = rsdtdsp_df['koaza'].fillna('') == splits['koaza']
531
-
532
- rsdtdsp_df_filtered = rsdtdsp_df[city_mask & town_mask & koaza_mask]
533
-
534
- cities = rsdtdsp_df_filtered['city'].fillna('')
535
- wards = rsdtdsp_df_filtered['ward'].fillna('')
536
- oaza_chos = rsdtdsp_df_filtered['oaza_cho'].fillna('')
537
- chomes = rsdtdsp_df_filtered['chome'].fillna('')
538
- koazas = rsdtdsp_df_filtered['koaza'].fillna('')
539
- blk_nums = rsdtdsp_df_filtered['blk_num'].fillna(9999).astype(int).astype(str).replace('9999', '')
540
- rsdt_nums = rsdtdsp_df_filtered['rsdt_num'].fillna(9999).astype(int).astype(str).replace('9999', '')
541
- rsdt_num2s = rsdtdsp_df_filtered['rsdt_num2'].fillna(9999).astype(int).astype(str).replace('9999', '')
542
-
543
- # アドレスを生成
544
- addresses += [
545
- f"{splits['pref']}{city}{ward}{oaza_cho}{chome}{koaza}{blk_num}" +
546
- (f"-{rsdt_num}" if rsdt_num else '') +
547
- (f"-{rsdt_num2}" if rsdt_num2 else '')
548
- for city, ward, oaza_cho, chome, koaza, blk_num, rsdt_num, rsdt_num2 in zip(
549
- cities, wards, oaza_chos, chomes, koazas, blk_nums, rsdt_nums, rsdt_num2s
550
- )
551
- ]
552
-
553
  addresses = list(set(addresses)) # 重複を除去
554
- with measure('query_embed'):
555
- query_embed = embed_via_multilingual_e5_large([query_address])
556
- with measure('address_embeds'):
557
- address_embeds = embed_via_multilingual_e5_large(addresses)
558
-
559
  with measure('cosine'):
560
  # コサイン類似度を計算
561
  similarities = cosine_similarity(query_embed, address_embeds)
 
261
 
262
  return hits
263
 
264
def get_lg_code(pref, county, city, ward):
    """Look up the ``lg_code`` of the municipality described by the arguments.

    Reads ``mt_city_all.csv`` from ``TARGET_DIR``, keeps the rows whose
    ``pref`` column equals *pref*, and compares the concatenation of each
    row's ``county``, ``city`` and ``ward`` columns (missing cells are
    treated as empty strings) with ``county + city + ward``.

    Args:
        pref: Prefecture name.
        county: County name ('' when not applicable).
        city: City / town / village name.
        ward: Ward name ('' when not applicable).

    Returns:
        The ``lg_code`` value of the single matching row.

    Raises:
        IndexError: If no row matches the given names.
        Exception: If more than one row matches.
    """
    city_all_df = pd.read_csv(TARGET_DIR / 'mt_city_all.csv')
    in_pref = city_all_df[city_all_df['pref'] == pref]

    # Compare on the full concatenated name so that blank county/ward cells
    # do not need special-casing.
    row_names = (
        in_pref['county'].fillna('')
        + in_pref['city'].fillna('')
        + in_pref['ward'].fillna('')
    )
    wanted_name = county + city + ward
    lg_codes = in_pref[row_names == wanted_name]['lg_code'].values

    if len(lg_codes) == 0:
        # Previously this surfaced as an opaque "index 0 is out of bounds"
        # error from lg_codes[0]; keep the exception type, explain the cause.
        raise IndexError(f'No lg_code found for: {pref}{wanted_name}')
    if len(lg_codes) > 1:
        raise Exception('Too many lg_code')
    return lg_codes[0]
274
+
275
def get_addresses_with_parcel(pref, county, city, ward, oaza_cho, chome, koaza):
    """Build address strings from the parcel CSV of one municipality.

    Resolves the municipality's ``lg_code`` via ``get_lg_code``, loads
    ``parcel/mt_parcel_city{lg_code:06d}.csv`` from ``TARGET_DIR``, keeps the
    rows matching the given city/ward, oaza_cho/chome and koaza, and renders
    each remaining row as
    ``{pref}{city}{ward}{oaza_cho}{chome}{koaza}{prc_num1}[-{prc_num2}][-{prc_num3}]``.

    Args:
        pref: Prefecture name; used as the prefix of every returned address.
        county: County name ('' when not applicable).
        city: City / town / village name.
        ward: Ward name ('' when not applicable).
        oaza_cho: Oaza / town name.
        chome: Chome name ('' when not applicable).
        koaza: Koaza name ('' when not applicable).

    Returns:
        A list of address strings, one per matching CSV row (may be empty and
        may contain duplicates).

    Raises:
        gr.Error: If the parcel CSV for the resolved ``lg_code`` is missing.
    """
    lg_code = get_lg_code(pref, county, city, ward)
    parcel_city_file = TARGET_DIR / 'parcel' / f'mt_parcel_city{lg_code:06d}.csv'
    if not os.path.exists(parcel_city_file):
        # The old message ('Too many lg_code') was a copy-paste slip; report
        # the actual problem, in the same form as the rsdtdsp loader.
        raise gr.Error(f'Not found: {parcel_city_file}')
    parcel_city_df = pd.read_csv(parcel_city_file)

    def text(frame, column):
        # Missing cells take part in concatenation as empty strings.
        return frame[column].fillna('')

    def number(frame, column):
        # Missing cells become '', everything else an integer string.  This
        # avoids the former 9999 sentinel, which blanked real 9999 values.
        return ['' if pd.isna(value) else str(int(value)) for value in frame[column]]

    # NOTE(review): the CSV side concatenates only city + ward while the
    # query side also includes county — confirm against the CSV contents.
    city_mask = (text(parcel_city_df, 'city') + text(parcel_city_df, 'ward')) == (county + city + ward)
    town_mask = (text(parcel_city_df, 'oaza_cho') + text(parcel_city_df, 'chome')) == (oaza_cho + chome)
    koaza_mask = text(parcel_city_df, 'koaza') == koaza
    matched = parcel_city_df[city_mask & town_mask & koaza_mask]

    rows = zip(
        text(matched, 'city'),
        text(matched, 'ward'),
        text(matched, 'oaza_cho'),
        text(matched, 'chome'),
        text(matched, 'koaza'),
        number(matched, 'prc_num1'),
        number(matched, 'prc_num2'),
        number(matched, 'prc_num3'),
    )

    # Build the address strings; the 2nd/3rd parcel numbers are appended
    # with a hyphen only when present.
    return [
        f"{pref}{_city}{_ward}{_oaza_cho}{_chome}{_koaza}{_prc_num1}"
        + (f"-{_prc_num2}" if _prc_num2 else '')
        + (f"-{_prc_num3}" if _prc_num3 else '')
        for _city, _ward, _oaza_cho, _chome, _koaza, _prc_num1, _prc_num2, _prc_num3 in rows
    ]
317
+
318
# Prefecture names in code order: get_pref_code() returns position + 1,
# so the order of this list must not change.
pref_names = [
    '北海道', '青森県', '岩手県', '宮城県', '秋田県', '山形県', '福島県',
    '茨城県', '栃木県', '群馬県', '埼玉県', '千葉県', '東京都', '神奈川県',
    '新潟県', '富山県', '石川県', '福井県', '山梨県', '長野県',
    '岐阜県', '静岡県', '愛知県', '三重県',
    '滋賀県', '京都府', '大阪府', '兵庫県', '奈良県', '和歌山県',
    '鳥取県', '島根県', '岡山県', '広島県', '山口県',
    '徳島県', '香川県', '愛媛県', '高知県',
    '福岡県', '佐賀県', '長崎県', '熊本県', '大分県', '宮崎県', '鹿児島県', '沖縄県',
]


def get_pref_code(pref):
    """Return the 1-based code of a prefecture.

    Args:
        pref: Prefecture name exactly as listed in ``pref_names``.

    Returns:
        The position of *pref* in ``pref_names`` plus one (1..47).

    Raises:
        ValueError: If *pref* is not one of the names in ``pref_names``.
    """
    try:
        return pref_names.index(pref) + 1
    except ValueError:
        # Same exception type as list.index(), but say what was wrong.
        raise ValueError(f'Unknown prefecture name: {pref!r}') from None
370
+
371
def get_addresses_with_rsdtdsp(pref, county, city, ward, oaza_cho, chome, koaza):
    """Build address strings from the per-prefecture rsdtdsp CSV.

    Loads ``rsdt/mt_rsdtdsp_rsdt_pref{pref_code:02d}.csv`` from
    ``TARGET_DIR`` (``pref_code`` comes from ``get_pref_code``), keeps the
    rows matching the given city/ward, oaza_cho/chome and koaza, and renders
    each remaining row as
    ``{pref}{city}{ward}{oaza_cho}{chome}{koaza}{blk_num}[-{rsdt_num}][-{rsdt_num2}]``.

    Args:
        pref: Prefecture name; selects the CSV and prefixes every address.
        county: County name ('' when not applicable).
        city: City / town / village name.
        ward: Ward name ('' when not applicable).
        oaza_cho: Oaza / town name.
        chome: Chome name ('' when not applicable).
        koaza: Koaza name ('' when not applicable).

    Returns:
        A list of address strings, one per matching CSV row (may be empty and
        may contain duplicates).

    Raises:
        gr.Error: If the CSV for the prefecture is missing.
    """
    pref_code = get_pref_code(pref)
    rsdtdsp_file = TARGET_DIR / 'rsdt' / f'mt_rsdtdsp_rsdt_pref{pref_code:02d}.csv'
    if not os.path.exists(rsdtdsp_file):
        raise gr.Error(f'Not found: {rsdtdsp_file}')
    rsdtdsp_df = pd.read_csv(rsdtdsp_file)

    def text(frame, column):
        # Missing cells take part in concatenation as empty strings.
        return frame[column].fillna('')

    def number(frame, column):
        # Missing cells become '', everything else an integer string.  This
        # avoids the former 9999 sentinel, which blanked real 9999 values.
        return ['' if pd.isna(value) else str(int(value)) for value in frame[column]]

    # NOTE(review): the CSV side concatenates only city + ward while the
    # query side also includes county — confirm against the CSV contents.
    city_mask = (text(rsdtdsp_df, 'city') + text(rsdtdsp_df, 'ward')) == (county + city + ward)
    town_mask = (text(rsdtdsp_df, 'oaza_cho') + text(rsdtdsp_df, 'chome')) == (oaza_cho + chome)
    koaza_mask = text(rsdtdsp_df, 'koaza') == koaza
    matched = rsdtdsp_df[city_mask & town_mask & koaza_mask]

    rows = zip(
        text(matched, 'city'),
        text(matched, 'ward'),
        text(matched, 'oaza_cho'),
        text(matched, 'chome'),
        text(matched, 'koaza'),
        number(matched, 'blk_num'),
        number(matched, 'rsdt_num'),
        number(matched, 'rsdt_num2'),
    )

    # Build the address strings; the residence numbers are appended with a
    # hyphen only when present.
    return [
        f"{pref}{_city}{_ward}{_oaza_cho}{_chome}{_koaza}{_blk_num}"
        + (f"-{_rsdt_num}" if _rsdt_num else '')
        + (f"-{_rsdt_num2}" if _rsdt_num2 else '')
        for _city, _ward, _oaza_cho, _chome, _koaza, _blk_num, _rsdt_num, _rsdt_num2 in rows
    ]
407
+
408
  # =========================
409
  # FastAPI definition
410
  # =========================
 
426
  with measure('preprocess'):
427
  preprocessed = preprocess(query_address)
428
  with measure('vector_search'):
429
+ hits = vector_search(preprocessed, 1)
430
+ with measure('split_address'):
431
+ splits = {
432
+ 'pref': hits[0][3],
433
+ 'county': hits[0][4],
434
+ 'city': hits[0][5],
435
+ 'ward': hits[0][6],
436
+ 'oaza_cho': hits[0][7],
437
+ 'chome': hits[0][8],
438
+ 'koaza': hits[0][9],
439
+ }
440
+ with measure('get_addresses_with_parcel'):
441
+ addresses = get_addresses_with_parcel(
442
+ splits['pref'], splits['county'], splits['city'], splits['ward'],
443
+ splits['oaza_cho'], splits['chome'], splits['koaza'])
444
+ with measure('get_addresses_with_rsdtdsp'):
445
+ addresses += get_addresses_with_rsdtdsp(
446
+ splits['pref'], splits['county'], splits['city'], splits['ward'],
447
+ splits['oaza_cho'], splits['chome'], splits['koaza'])
448
+ addresses = list(set(addresses)) # 重複を除去
449
+ with measure('embed_via_multilingual_e5_large'):
450
+ embeds = embed_via_multilingual_e5_large([query_address] + addresses)
451
+ query_embed = [embeds[0]]
452
+ address_embeds = embeds[1:]
453
+ with measure('cosine'):
454
+ # コサイン類似度を計算
455
+ similarities = cosine_similarity(query_embed, address_embeds)
456
+
457
+ best_match_indices = np.argsort(similarities[0])[-1:][::-1] # 上位Kのインデックスを取得
458
+ best_addresses = [addresses[i] for i in best_match_indices]
459
+
460
+ best_address = best_addresses[0]
461
+ return best_address
462
 
463
  # =========================
464
  # Gradio tabs definition
 
634
  'koaza': hits[0][9],
635
  }
636
  result_df = pd.DataFrame([splits.values()], columns=splits.keys())
637
+ with measure('get_addresses_with_parcel'):
638
+ addresses = get_addresses_with_parcel(
639
+ splits['pref'], splits['county'], splits['city'], splits['ward'],
640
+ splits['oaza_cho'], splits['chome'], splits['koaza'])
641
+ with measure('get_addresses_with_rsdtdsp'):
642
+ addresses += get_addresses_with_rsdtdsp(
643
+ splits['pref'], splits['county'], splits['city'], splits['ward'],
644
+ splits['oaza_cho'], splits['chome'], splits['koaza'])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
645
  addresses = list(set(addresses)) # 重複を除去
646
+ with measure('embed_via_multilingual_e5_large'):
647
+ embeds = embed_via_multilingual_e5_large([query_address] + addresses)
648
+ query_embed = [embeds[0]]
649
+ address_embeds = embeds[1:]
 
650
  with measure('cosine'):
651
  # コサイン類似度を計算
652
  similarities = cosine_similarity(query_embed, address_embeds)