Spaces:
Sleeping
Sleeping
住所処理機能において、住所と建物名を分割する新しい関数を追加し、検索処理での住所の取り扱いを改善しました。正規表現を用いて住所の解析を行い、結果を適切に処理するように修正しました。
Browse files
app.py
CHANGED
@@ -16,6 +16,7 @@ from dotenv import load_dotenv
|
|
16 |
import time
|
17 |
from contextlib import contextmanager
|
18 |
import numpy as np
|
|
|
19 |
|
20 |
# .envファイルを読み込む
|
21 |
load_dotenv()
|
@@ -122,6 +123,35 @@ def measure(label="処理"):
|
|
122 |
end = time.time()
|
123 |
print(f"{label} 実行時間: {end - start:.6f} 秒")
|
124 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
125 |
def split_address(normalized_address):
    """Split an address by delegating to ``normalize``.

    Args:
        normalized_address: The address value handed straight to
            ``normalize``.

    Returns:
        Whatever ``normalize`` returns for that input, unchanged.
    """
    # NOTE(review): the shape of the returned value is defined by
    # ``normalize`` and is not visible here - confirm against its docs.
    return normalize(normalized_address)
|
@@ -735,8 +765,10 @@ def create_vector_search_tab():
|
|
735 |
chiban_result_df = gr.Dataframe(label="地番・住居表示検索結果")
|
736 |
|
737 |
def search_address(query_address, top_k):
|
|
|
|
|
738 |
with measure('preprocess'):
|
739 |
-
preprocessed = preprocess(
|
740 |
with measure('vector_search'):
|
741 |
hits = vector_search(preprocessed, top_k)
|
742 |
search_result_df = pd.DataFrame(hits, columns=['Top-k', '類似度', '住所', '都道府県', '郡', '市区町村', '政令市区', '大字・町', '丁目', '小字'])
|
@@ -761,7 +793,7 @@ def create_vector_search_tab():
|
|
761 |
splits['oaza_cho'], splits['chome'], splits['koaza'])
|
762 |
addresses = list(set(addresses)) # 重複を除去
|
763 |
with measure('embed_via_multilingual_e5_large'):
|
764 |
-
embeds = embed_via_multilingual_e5_large([
|
765 |
query_embed = [embeds[0]]
|
766 |
address_embeds = embeds[1:]
|
767 |
with measure('cosine'):
|
@@ -775,7 +807,7 @@ def create_vector_search_tab():
|
|
775 |
chiban_result_df = pd.DataFrame({
|
776 |
'Top-k': range(1, top_k + 1),
|
777 |
'類似度': best_similarities,
|
778 |
-
'住所': best_addresses
|
779 |
})
|
780 |
|
781 |
best_address = best_addresses[0]
|
|
|
16 |
import time
|
17 |
from contextlib import contextmanager
|
18 |
import numpy as np
|
19 |
+
import re
|
20 |
|
21 |
# .envファイルを読み込む
|
22 |
load_dotenv()
|
|
|
123 |
end = time.time()
|
124 |
print(f"{label} 実行時間: {end - start:.6f} 秒")
|
125 |
|
126 |
+
import re
|
127 |
+
|
128 |
+
# Building blocks for ADDRESS_REGEX.
# Digits: ASCII 0-9 plus full-width U+FF10..U+FF19. The full-width range is
# written as an escape so it survives Unicode normalization of this file.
_ADDRESS_DIGITS = r'[0-9\uFF10-\uFF19]'
# Separators accepted between number groups: ASCII hyphen, the long-vowel
# mark, minus sign, en dash, full-width hyphen-minus (U+FF0D) and hyphen
# (U+2010).
_ADDRESS_HYPHENS = r'[-ー−–\uFF0D\u2010]'
# One run of digits optionally followed by repeated "<hyphen><digits>".
_ADDRESS_NUMBER_RUN = rf'{_ADDRESS_DIGITS}+(?:{_ADDRESS_HYPHENS}{_ADDRESS_DIGITS}+)*'

# Splits a Japanese address string into two named groups:
#   address  - everything up to and including the lot/house number
#   building - whatever text follows the number (may be empty)
# The pattern only matches when the string contains a prefecture suffix
# character, then a municipality suffix character, then at least one digit.
ADDRESS_REGEX = re.compile(
    r'^'
    r'(?P<address>'
    r'.+?[都道府県]'              # prefecture
    r'.+?[市区町村]'              # municipality
    r'.*?'                        # town name etc. (lazy match)
    rf'{_ADDRESS_NUMBER_RUN}'     # leading lot number, e.g. "1-2-3"
    r'(?:(?:丁目|番地|番|号)'     # a chome / banchi / ban / go suffix ...
    rf'(?:{_ADDRESS_NUMBER_RUN})?'  # ... optionally followed by more numbers
    r')*'                         # the suffix unit may repeat any number of times
    r')'
    r'(?P<building>.*)'           # the remainder is captured as the building name
    r'$'
)
|
145 |
+
|
146 |
+
def split_address_building(address: str) -> dict:
    """Separate an address string into its address part and building name.

    Args:
        address: The raw address text to split.

    Returns:
        A dict with the keys ``'address'`` and ``'building'``. When
        ``ADDRESS_REGEX`` matches, both values are the corresponding named
        groups with surrounding whitespace stripped. When it does not match,
        ``'address'`` is the input unchanged and ``'building'`` is ``''``.
    """
    matched = ADDRESS_REGEX.match(address)
    if matched is None:
        # No recognizable structure: treat the whole input as the address.
        return {'address': address, 'building': ''}
    return {part: matched.group(part).strip() for part in ('address', 'building')}
|
154 |
+
|
155 |
def split_address(normalized_address):
    """Split an address by delegating to ``normalize``.

    Args:
        normalized_address: The address value handed straight to
            ``normalize``.

    Returns:
        Whatever ``normalize`` returns for that input, unchanged.
    """
    # NOTE(review): the shape of the returned value is defined by
    # ``normalize`` and is not visible here - confirm against its docs.
    return normalize(normalized_address)
|
|
|
765 |
chiban_result_df = gr.Dataframe(label="地番・住居表示検索結果")
|
766 |
|
767 |
def search_address(query_address, top_k):
|
768 |
+
with measure('split_address_building'):
|
769 |
+
splitted = split_address_building(query_address)
|
770 |
with measure('preprocess'):
|
771 |
+
preprocessed = preprocess(splitted['address'])
|
772 |
with measure('vector_search'):
|
773 |
hits = vector_search(preprocessed, top_k)
|
774 |
search_result_df = pd.DataFrame(hits, columns=['Top-k', '類似度', '住所', '都道府県', '郡', '市区町村', '政令市区', '大字・町', '丁目', '小字'])
|
|
|
793 |
splits['oaza_cho'], splits['chome'], splits['koaza'])
|
794 |
addresses = list(set(addresses)) # 重複を除去
|
795 |
with measure('embed_via_multilingual_e5_large'):
|
796 |
+
embeds = embed_via_multilingual_e5_large([splitted['address']] + addresses)
|
797 |
query_embed = [embeds[0]]
|
798 |
address_embeds = embeds[1:]
|
799 |
with measure('cosine'):
|
|
|
807 |
chiban_result_df = pd.DataFrame({
|
808 |
'Top-k': range(1, top_k + 1),
|
809 |
'類似度': best_similarities,
|
810 |
+
'住所': best_addresses + splitted['building']
|
811 |
})
|
812 |
|
813 |
best_address = best_addresses[0]
|