matsuap committed on
Commit
d934afb
·
1 Parent(s): 5a18e3b

住所処理機能において、住所と建物名を分割する新しい関数を追加し、検索処理での住所の取り扱いを改善しました。正規表現を用いて住所の解析を行い、結果を適切に処理するように修正しました。

Browse files
Files changed (1) hide show
  1. app.py +35 -3
app.py CHANGED
@@ -16,6 +16,7 @@ from dotenv import load_dotenv
16
  import time
17
  from contextlib import contextmanager
18
  import numpy as np
 
19
 
20
  # .envファイルを読み込む
21
  load_dotenv()
@@ -122,6 +123,35 @@ def measure(label="処理"):
122
  end = time.time()
123
  print(f"{label} 実行時間: {end - start:.6f} 秒")
124
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
125
  def split_address(normalized_address):
126
  splits = normalize(normalized_address)
127
  return splits
@@ -735,8 +765,10 @@ def create_vector_search_tab():
735
  chiban_result_df = gr.Dataframe(label="地番・住居表示検索結果")
736
 
737
  def search_address(query_address, top_k):
 
 
738
  with measure('preprocess'):
739
- preprocessed = preprocess(query_address)
740
  with measure('vector_search'):
741
  hits = vector_search(preprocessed, top_k)
742
  search_result_df = pd.DataFrame(hits, columns=['Top-k', '類似度', '住所', '都道府県', '郡', '市区町村', '政令市区', '大字・町', '丁目', '小字'])
@@ -761,7 +793,7 @@ def create_vector_search_tab():
761
  splits['oaza_cho'], splits['chome'], splits['koaza'])
762
  addresses = list(set(addresses)) # 重複を除去
763
  with measure('embed_via_multilingual_e5_large'):
764
- embeds = embed_via_multilingual_e5_large([query_address] + addresses)
765
  query_embed = [embeds[0]]
766
  address_embeds = embeds[1:]
767
  with measure('cosine'):
@@ -775,7 +807,7 @@ def create_vector_search_tab():
775
  chiban_result_df = pd.DataFrame({
776
  'Top-k': range(1, top_k + 1),
777
  '類似度': best_similarities,
778
- '住所': best_addresses
779
  })
780
 
781
  best_address = best_addresses[0]
 
16
  import time
17
  from contextlib import contextmanager
18
  import numpy as np
19
+ import re
20
 
21
  # .envファイルを読み込む
22
  load_dotenv()
 
123
  end = time.time()
124
  print(f"{label} 実行時間: {end - start:.6f} 秒")
125
 
126
+ import re
127
+
128
# Building blocks for ADDRESS_REGEX.  Code points are written as \uXXXX
# escapes (understood by the ``re`` module) so the pattern keeps working even
# if the source file is passed through Unicode normalization.
#
# One or more digits, ASCII or full-width (U+FF10-U+FF19).
_ADDRESS_DIGITS = r'[0-9\uFF10-\uFF19]+'
# Hyphen-like separators typed between block numbers: '-', katakana long
# vowel marks (U+30FC, U+FF70), minus sign (U+2212), en/em dash
# (U+2013/U+2014), horizontal bar (U+2015), hyphen (U+2010) and full-width
# hyphen-minus (U+FF0D).
_ADDRESS_HYPHEN = r'[-\u30FC\uFF70\u2212\u2013\u2014\u2015\u2010\uFF0D]'
# A number optionally followed by any count of "<hyphen><number>" groups.
_ADDRESS_NUMBER_RUN = rf'{_ADDRESS_DIGITS}(?:{_ADDRESS_HYPHEN}{_ADDRESS_DIGITS})*'

# Splits a Japanese address into the street-address part (group ``address``)
# and everything after it (group ``building``).  The address part must contain
# a prefecture suffix, a municipality suffix and at least one number.
ADDRESS_REGEX = re.compile(
    r'^'
    r'(?P<address>'
    r'.+?[都道府県]'                 # prefecture
    r'.+?[市区町村]'                 # city / ward / town / village
    r'.*?'                           # town name etc. (lazy match)
    rf'{_ADDRESS_NUMBER_RUN}'        # leading lot number and "-number" repeats
    r'(?:(?:丁目|番地|番|号)'        # unit suffix: chome / banchi / ban / go
    rf'(?:{_ADDRESS_NUMBER_RUN})?'   # optionally followed by more numbers
    r')*'                            # the suffix unit may repeat
    r')'
    r'(?P<building>.*)'              # the remainder is captured as the building name
    r'$'
)
145
+
146
def split_address_building(address: str) -> dict:
    """Split an address string into its address part and building name.

    The string is matched against ``ADDRESS_REGEX``.

    Args:
        address: The raw address string to split.

    Returns:
        A dict with the keys ``'address'`` and ``'building'``.  On a match,
        both values are the corresponding named groups with surrounding
        whitespace stripped.  When the pattern does not match, ``'address'``
        is the input exactly as given and ``'building'`` is an empty string.
    """
    matched = ADDRESS_REGEX.match(address)
    if matched is None:
        # Unparseable input: treat the whole string as the address.
        return {'address': address, 'building': ''}
    groups = matched.groupdict()
    return {key: groups[key].strip() for key in ('address', 'building')}
154
+
155
  def split_address(normalized_address):
156
  splits = normalize(normalized_address)
157
  return splits
 
765
  chiban_result_df = gr.Dataframe(label="地番・住居表示検索結果")
766
 
767
  def search_address(query_address, top_k):
768
+ with measure('split_address_building'):
769
+ splitted = split_address_building(query_address)
770
  with measure('preprocess'):
771
+ preprocessed = preprocess(splitted['address'])
772
  with measure('vector_search'):
773
  hits = vector_search(preprocessed, top_k)
774
  search_result_df = pd.DataFrame(hits, columns=['Top-k', '類似度', '住所', '都道府県', '郡', '市区町村', '政令市区', '大字・町', '丁目', '小字'])
 
793
  splits['oaza_cho'], splits['chome'], splits['koaza'])
794
  addresses = list(set(addresses)) # 重複を除去
795
  with measure('embed_via_multilingual_e5_large'):
796
+ embeds = embed_via_multilingual_e5_large([splitted['address']] + addresses)
797
  query_embed = [embeds[0]]
798
  address_embeds = embeds[1:]
799
  with measure('cosine'):
 
807
  chiban_result_df = pd.DataFrame({
808
  'Top-k': range(1, top_k + 1),
809
  '類似度': best_similarities,
810
+ '住所': best_addresses + splitted['building']
811
  })
812
 
813
  best_address = best_addresses[0]