matsuap committed on
Commit
d77ce72
·
1 Parent(s): 150a877

Google検索APIを統合し、住所正規化処理を改善する新しいエンドポイントを追加しました。住所のスペル修正機能を実装し、Gradioタブに新しい機能を追加しました。また、正規化処理のバージョン2を実装し、類似度計算を強化しました。

Browse files
Files changed (1) hide show
  1. app.py +152 -3
app.py CHANGED
@@ -37,6 +37,9 @@ VECTOR_SEARCH_TOKEN = os.environ.get('VECTOR_SEARCH_TOKEN')
37
  VECTOR_SEARCH_COLLECTION_NAME = os.environ.get('VECTOR_SEARCH_COLLECTION_NAME')
38
  VECTOR_SEARCH_COLLECTION_NAME_V2 = os.environ.get('VECTOR_SEARCH_COLLECTION_NAME_V2')
39
 
 
 
 
40
  MILVUS_CLIENT = MilvusClient(uri=VECTOR_SEARCH_ENDPOINT, token=VECTOR_SEARCH_TOKEN)
41
  print(f"Connected to DB: {VECTOR_SEARCH_ENDPOINT} successfully")
42
 
@@ -134,6 +137,16 @@ def measure(label="処理"):
134
  end = time.time()
135
  print(f"{label} 実行時間: {end - start:.6f} 秒")
136
 
 
 
 
 
 
 
 
 
 
 
137
  def convert_zenkaku_to_hankaku(text):
138
  zenkaku_numbers = '0123456789'
139
  hankaku_numbers = '0123456789'
@@ -497,6 +510,57 @@ def normalize_address(query_address):
497
  best_address = best_addresses[0]
498
  return best_address + splitted['building']
499
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
500
 
501
  # =========================
502
  # FastAPI definition
@@ -514,7 +578,6 @@ app = FastAPI(
514
  # ---------------------------
515
  # リクエスト・レスポンス定義
516
  # ---------------------------
517
-
518
  class CompareAddressesRequest(BaseModel):
519
  address1: str = Field(..., description="比較する最初の住所", example="東京 墨田区 押上 1丁目1-1")
520
  address2: str = Field(..., description="比較する2番目の住所", example="東京 墨田区 押上 1-1-1")
@@ -532,7 +595,6 @@ class NormalizeAddressResponse(BaseModel):
532
  # ---------------------------
533
  # エンドポイント定義
534
  # ---------------------------
535
-
536
  @app.post(
537
  "/compare-two-addresses",
538
  response_model=CompareAddressesResponse,
@@ -559,7 +621,6 @@ async def compare_two_addresses_api(request: CompareAddressesRequest):
559
  result = compare_two_addresses(request.address1, request.address2)
560
  return {"result": result}
561
 
562
-
563
  @app.post(
564
  "/normalize-address",
565
  response_model=NormalizeAddressResponse,
@@ -585,11 +646,37 @@ async def normalize_address_api(request: NormalizeAddressRequest):
585
  normalized = normalize_address(request.query_address)
586
  return {"normalized": normalized}
587
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
588
 
589
  # =========================
590
  # Gradio tabs definition
591
  # =========================
592
  examples = [
 
593
  '東京都荒川区1−5−6荒川マンション102',
594
  '福岡市中央区天神1の11の2',
595
  '私の住所は京都府京都市右京区太秦青木元町4-10です。',
@@ -685,6 +772,22 @@ def create_function_test_tab():
685
  inputs=[in_tb],
686
  outputs=[out_tb],
687
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
688
  def create_split_address_tab():
689
  with gr.Tab("split_address"):
690
  in_tb = gr.Textbox(label='住所', placeholder='住所を入力してください')
@@ -767,10 +870,23 @@ def create_function_test_tab():
767
  inputs=[in_tb],
768
  outputs=[out_df],
769
  )
 
 
 
 
 
 
 
 
 
 
 
770
 
771
  with gr.Tab("関数テスト"):
772
  create_normalize_address_tab()
 
773
  create_compare_two_addresses_tab()
 
774
  create_get_addresses_with_parcel_tab()
775
  create_vector_search()
776
  create_remove_left_of_pref_tab()
@@ -899,9 +1015,42 @@ def create_vector_search_tab():
899
  outputs=[search_result_df, chiban_result_df, result_tb, result_df],
900
  )
901
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
902
  with gr.Blocks() as demo:
903
  create_function_test_tab()
904
  create_vector_search_tab()
 
905
  create_digital_agency_tab()
906
 
907
  app = gr.mount_gradio_app(app, demo, path='/')
 
37
  VECTOR_SEARCH_COLLECTION_NAME = os.environ.get('VECTOR_SEARCH_COLLECTION_NAME')
38
  VECTOR_SEARCH_COLLECTION_NAME_V2 = os.environ.get('VECTOR_SEARCH_COLLECTION_NAME_V2')
39
 
40
+ GOOGLE_SEARCH_API_KEY = os.environ.get('GOOGLE_SEARCH_API_KEY')
41
+ GOOGLE_SEARCH_ENGINE_ID = os.environ.get('GOOGLE_SEARCH_ENGINE_ID')
42
+
43
  MILVUS_CLIENT = MilvusClient(uri=VECTOR_SEARCH_ENDPOINT, token=VECTOR_SEARCH_TOKEN)
44
  print(f"Connected to DB: {VECTOR_SEARCH_ENDPOINT} successfully")
45
 
 
137
  end = time.time()
138
  print(f"{label} 実行時間: {end - start:.6f} 秒")
139
 
140
def get_spelling(query_address):
    """Return Google's spelling suggestion for an address string.

    Sends the query to the Google Custom Search JSON API and reads
    ``spelling.correctedQuery`` from the JSON response.

    Args:
        query_address: Address text to check.

    Returns:
        The corrected query string, or ``''`` when the input is blank, the
        response carries no suggestion, or the request fails.
    """
    if not query_address or not query_address.strip():
        # Nothing to correct; skip the network round trip.
        return ''

    # Hand the query to requests via ``params`` so it is percent-encoded.
    # Reserved characters such as '&', '#' or '+' inside an address would
    # otherwise be misread as query-string syntax.
    params = {
        'key': GOOGLE_SEARCH_API_KEY,
        'cx': GOOGLE_SEARCH_ENGINE_ID,
        'q': query_address,
    }
    try:
        # A timeout keeps a stalled connection from hanging the caller.
        response = requests.get(
            'https://www.googleapis.com/customsearch/v1',
            params=params,
            timeout=10,
        )
        results = response.json()
    except (requests.RequestException, ValueError) as e:
        # An empty string already means "no suggestion", so report the
        # failure and fall back to it instead of aborting the caller.
        print(f"get_spelling failed: {e}")
        return ''

    return results.get('spelling', {}).get('correctedQuery', '')
149
+
150
  def convert_zenkaku_to_hankaku(text):
151
  zenkaku_numbers = '0123456789'
152
  hankaku_numbers = '0123456789'
 
510
  best_address = best_addresses[0]
511
  return best_address + splitted['building']
512
 
513
# Matches the character "の" only when a digit sits directly on both sides.
_NO_BETWEEN_DIGITS = re.compile(r'(?<=\d)の(?=\d)')


def convert_no_to_hyphen(query_address):
    """Replace every "の" that separates two digits with a hyphen.

    Args:
        query_address: Address text, e.g. ``'天神1の11の2'``.

    Returns:
        The text with digit-の-digit separators rewritten as ``'-'``
        (``'天神1-11-2'``); any other "の" is left untouched.
    """
    return _NO_BETWEEN_DIGITS.sub('-', query_address)
515
+
516
def normalize_address_v2(query_address, top_k=1):
    """Normalize an address (v2 pipeline) and return the intermediate results.

    Pipeline, each stage timed with ``measure``:
      1. convert full-width characters to half-width,
      2. split the text into address and building parts,
      3. apply the spelling suggestion from ``get_spelling`` when one exists,
      4. rewrite digit-の-digit separators as hyphens,
      5. preprocess and vector-search for the single best hit,
      6. collect candidate addresses for that hit (parcel + rsdtdsp lookups),
      7. embed the query and candidates and rank them by cosine similarity.

    Args:
        query_address: Raw address text.
        top_k: Number of best-matching candidates to return.

    Returns:
        Tuple ``(splitted, hits, splits, best_addresses, best_similarities)``:
        the address/building split (its ``'address'`` entry holds the corrected
        text), the vector-search hits, the components read from the top hit,
        the best candidate addresses (best first), and their similarities.

    Raises:
        ValueError: If the vector search returns no hit, or no candidate
            address exists for the matched area.
    """
    with measure('convert_zenkaku_to_hankaku'):
        query_address = convert_zenkaku_to_hankaku(query_address)
    with measure('split_address_building_with_gpt'):
        splitted = split_address_building_with_gpt(query_address)
    with measure('get_spelling'):
        spelling = get_spelling(splitted['address'])
    if spelling:
        # Only overwrite the address when a correction was actually suggested.
        splitted['address'] = spelling
    # Label this stage so the timing log line is identifiable.
    with measure('convert_no_to_hyphen'):
        splitted['address'] = convert_no_to_hyphen(splitted['address'])
    with measure('preprocess'):
        preprocessed = preprocess(splitted['address'])
    with measure('vector_search'):
        hits = vector_search(preprocessed, 1)
    if len(hits) == 0:
        # Fail with a clear message instead of an opaque IndexError below.
        raise ValueError(f"No vector search hit for address: {splitted['address']!r}")

    with measure('split_address'):
        # Positions 3-9 of a hit row carry the address components.
        top_hit = hits[0]
        splits = {
            'pref': top_hit[3],
            'county': top_hit[4],
            'city': top_hit[5],
            'ward': top_hit[6],
            'oaza_cho': top_hit[7],
            'chome': top_hit[8],
            'koaza': top_hit[9],
        }
    area = (
        splits['pref'], splits['county'], splits['city'], splits['ward'],
        splits['oaza_cho'], splits['chome'], splits['koaza'],
    )
    with measure('get_addresses_with_parcel'):
        parcel_addresses = get_addresses_with_parcel(*area)
    with measure('get_addresses_with_rsdtdsp'):
        rsdtdsp_addresses = get_addresses_with_rsdtdsp(*area)

    # De-duplicate while keeping first-seen order so results are reproducible
    # across runs, and without mutating the lists the helpers returned.
    addresses = list(dict.fromkeys([*parcel_addresses, *rsdtdsp_addresses]))
    if not addresses:
        raise ValueError(f"No candidate addresses found for: {splitted['address']!r}")

    with measure('embed_via_multilingual_e5_large'):
        # Embed the query together with the candidates in one call.
        embeds = embed_via_multilingual_e5_large([splitted['address']] + addresses)
        query_embed = [embeds[0]]
        address_embeds = embeds[1:]
    with measure('cosine'):
        # Cosine similarity between the query and every candidate.
        similarities = cosine_similarity(query_embed, address_embeds)

    # Indices of the top-k candidates, highest similarity first.
    best_match_indices = np.argsort(similarities[0])[-top_k:][::-1]
    best_addresses = [addresses[i] for i in best_match_indices]
    best_similarities = similarities[0][best_match_indices]

    return splitted, hits, splits, best_addresses, best_similarities
563
+
564
 
565
  # =========================
566
  # FastAPI definition
 
578
  # ---------------------------
579
  # リクエスト・レスポンス定義
580
  # ---------------------------
 
581
  class CompareAddressesRequest(BaseModel):
582
  address1: str = Field(..., description="比較する最初の住所", example="東京 墨田区 押上 1丁目1-1")
583
  address2: str = Field(..., description="比較する2番目の住所", example="東京 墨田区 押上 1-1-1")
 
595
  # ---------------------------
596
  # エンドポイント定義
597
  # ---------------------------
 
598
  @app.post(
599
  "/compare-two-addresses",
600
  response_model=CompareAddressesResponse,
 
621
  result = compare_two_addresses(request.address1, request.address2)
622
  return {"result": result}
623
 
 
624
  @app.post(
625
  "/normalize-address",
626
  response_model=NormalizeAddressResponse,
 
646
  normalized = normalize_address(request.query_address)
647
  return {"normalized": normalized}
648
 
649
@app.post(
    "/normalize-address-v2",
    response_model=NormalizeAddressResponse,
    summary="住所を正規化する",
    description="指定された住所を正規化し、正規化後の住所を返します。",
    responses={
        200: {
            "description": "正規化結果の返却",
            "content": {
                "application/json": {
                    "example": {
                        "normalized": "東京都千代田区一丁目1番"
                    }
                }
            }
        }
    }
)
async def normalize_address_v2_api(request: NormalizeAddressRequest):
    """
    Normalize an address with the v2 pipeline.

    - **query_address**: address to normalize

    Returns the best-matching normalized address followed by the building
    part split off from the input, matching what `normalize_address` and the
    v2 Gradio tabs produce.
    """
    splitted, _hits, _splits, best_addresses, _similarities = normalize_address_v2(request.query_address)
    # Re-attach the building part; returning only the matched address would
    # drop it from the response.
    return {"normalized": best_addresses[0] + splitted['building']}
673
+
674
 
675
  # =========================
676
  # Gradio tabs definition
677
  # =========================
678
  examples = [
679
+ '東京都中央区みなと3の12の10、プレサンスロゼ東京港301。',
680
  '東京都荒川区1−5−6荒川マンション102',
681
  '福岡市中央区天神1の11の2',
682
  '私の住所は京都府京都市右京区太秦青木元町4-10です。',
 
772
  inputs=[in_tb],
773
  outputs=[out_tb],
774
  )
775
def create_normalize_address__v2_tab():
    """Build the Gradio tab that runs ``normalize_address_v2`` on one address.

    The tab holds an input textbox with the shared examples, an output
    textbox, and a run button wired to the callback below.
    """
    with gr.Tab("normalize_address_v2"):
        in_tb = gr.Textbox(label='住所', placeholder='住所を入力してください')
        gr.Examples(examples=examples, inputs=[in_tb])
        out_tb = gr.Textbox(label='アウトプット')
        exe_button = gr.Button(value='実行', variant='primary')

        def f(query_address):
            # Only the address/building split and the ranked candidates are
            # needed here; the other pipeline outputs are discarded.
            split_parts, _hits, _splits, candidates, _scores = normalize_address_v2(query_address)
            top_candidate = candidates[0]
            return top_candidate + split_parts['building']

        exe_button.click(fn=f, inputs=[in_tb], outputs=[out_tb])
791
  def create_split_address_tab():
792
  with gr.Tab("split_address"):
793
  in_tb = gr.Textbox(label='住所', placeholder='住所を入力してください')
 
870
  inputs=[in_tb],
871
  outputs=[out_df],
872
  )
873
def create_get_spelling_tab():
    """Build the Gradio tab that runs ``get_spelling`` on one address.

    The tab holds an input textbox with the shared examples, an output
    textbox, and a run button that passes the input to ``get_spelling``.
    """
    # Label the tab after the function under test, like the sibling tabs,
    # rather than after this builder function.
    with gr.Tab("get_spelling"):
        in_tb = gr.Textbox(label='住所', placeholder='住所を入力してください')
        gr.Examples(examples=examples, inputs=[in_tb])
        out_tb = gr.Textbox(label='アウトプット')
        exe_button = gr.Button(value='実行', variant='primary')
        exe_button.click(
            fn=get_spelling,
            inputs=[in_tb],
            outputs=[out_tb],
        )
884
 
885
  with gr.Tab("関数テスト"):
886
  create_normalize_address_tab()
887
+ create_normalize_address__v2_tab()
888
  create_compare_two_addresses_tab()
889
+ create_get_spelling_tab()
890
  create_get_addresses_with_parcel_tab()
891
  create_vector_search()
892
  create_remove_left_of_pref_tab()
 
1015
  outputs=[search_result_df, chiban_result_df, result_tb, result_df],
1016
  )
1017
 
1018
def create_vector_search_v2_tab():
    """Build the "ベクトル検索V2" Gradio tab around ``normalize_address_v2``.

    The tab takes an address and a top-k value, and shows the normalized
    address, the components of the matched area, the vector-search hits, and
    the ranked candidate addresses.
    """
    with gr.Tab("ベクトル検索V2"):
        with gr.Row():
            with gr.Column():
                address_input = gr.Textbox(label='住所', placeholder='検索したい住所を入力してください')
                gr.Examples(examples=examples, inputs=[address_input])
                top_k_input = gr.Slider(minimum=1, maximum=100, step=1, value=5, label='検索数top-k')
                search_button = gr.Button(value='検索', variant='primary')
        result_tb = gr.Textbox(label='正規化後')
        result_df = gr.Dataframe(label="正規化後(分割)", wrap=True)
        search_result_df = gr.Dataframe(label="町丁目まで検索結果")
        chiban_result_df = gr.Dataframe(label="地番・住居表示検索結果")

        def search_address(query_address, top_k):
            # Local frames get their own names so they do not shadow the
            # Gradio components defined above.
            splitted, hits, splits, best_addresses, best_similarities = normalize_address_v2(query_address, top_k)
            building = splitted['building']

            hit_columns = ['Top-k', '類似度', '住所', '都道府県', '郡', '市区町村', '政令市区', '大字・町', '丁目', '小字']
            hits_frame = pd.DataFrame(hits, columns=hit_columns)

            # One-row table of the components read from the top hit.
            splits_frame = pd.DataFrame([splits.values()], columns=splits.keys())

            # Ranked candidates, each with the building part re-attached.
            ranked = {
                'Top-k': range(1, len(best_similarities) + 1),
                '類似度': best_similarities,
                '住所': [candidate + building for candidate in best_addresses],
            }
            candidates_frame = pd.DataFrame(ranked)

            normalized = best_addresses[0] + building
            return hits_frame, candidates_frame, normalized, splits_frame

        search_button.click(
            fn=search_address,
            inputs=[address_input, top_k_input],
            outputs=[search_result_df, chiban_result_df, result_tb, result_df],
        )
1049
+
1050
  with gr.Blocks() as demo:
1051
  create_function_test_tab()
1052
  create_vector_search_tab()
1053
+ create_vector_search_v2_tab()
1054
  create_digital_agency_tab()
1055
 
1056
  app = gr.mount_gradio_app(app, demo, path='/')