Spaces:
Sleeping
Sleeping
Google検索APIを統合し、住所正規化処理を改善する新しいエンドポイントを追加しました。住所のスペル修正機能を実装し、Gradioタブに新しい機能を追加しました。また、正規化処理のバージョン2を実装し、類似度計算を強化しました。
Browse files
app.py
CHANGED
@@ -37,6 +37,9 @@ VECTOR_SEARCH_TOKEN = os.environ.get('VECTOR_SEARCH_TOKEN')
|
|
37 |
VECTOR_SEARCH_COLLECTION_NAME = os.environ.get('VECTOR_SEARCH_COLLECTION_NAME')
|
38 |
VECTOR_SEARCH_COLLECTION_NAME_V2 = os.environ.get('VECTOR_SEARCH_COLLECTION_NAME_V2')
|
39 |
|
|
|
|
|
|
|
40 |
MILVUS_CLIENT = MilvusClient(uri=VECTOR_SEARCH_ENDPOINT, token=VECTOR_SEARCH_TOKEN)
|
41 |
print(f"Connected to DB: {VECTOR_SEARCH_ENDPOINT} successfully")
|
42 |
|
@@ -134,6 +137,16 @@ def measure(label="処理"):
|
|
134 |
end = time.time()
|
135 |
print(f"{label} 実行時間: {end - start:.6f} 秒")
|
136 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
137 |
def convert_zenkaku_to_hankaku(text):
|
138 |
zenkaku_numbers = '0123456789'
|
139 |
hankaku_numbers = '0123456789'
|
@@ -497,6 +510,57 @@ def normalize_address(query_address):
|
|
497 |
best_address = best_addresses[0]
|
498 |
return best_address + splitted['building']
|
499 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
500 |
|
501 |
# =========================
|
502 |
# FastAPI definition
|
@@ -514,7 +578,6 @@ app = FastAPI(
|
|
514 |
# ---------------------------
|
515 |
# リクエスト・レスポンス定義
|
516 |
# ---------------------------
|
517 |
-
|
518 |
class CompareAddressesRequest(BaseModel):
|
519 |
address1: str = Field(..., description="比較する最初の住所", example="東京 墨田区 押上 1丁目1-1")
|
520 |
address2: str = Field(..., description="比較する2番目の住所", example="東京 墨田区 押上 1-1-1")
|
@@ -532,7 +595,6 @@ class NormalizeAddressResponse(BaseModel):
|
|
532 |
# ---------------------------
|
533 |
# エンドポイント定義
|
534 |
# ---------------------------
|
535 |
-
|
536 |
@app.post(
|
537 |
"/compare-two-addresses",
|
538 |
response_model=CompareAddressesResponse,
|
@@ -559,7 +621,6 @@ async def compare_two_addresses_api(request: CompareAddressesRequest):
|
|
559 |
result = compare_two_addresses(request.address1, request.address2)
|
560 |
return {"result": result}
|
561 |
|
562 |
-
|
563 |
@app.post(
|
564 |
"/normalize-address",
|
565 |
response_model=NormalizeAddressResponse,
|
@@ -585,11 +646,37 @@ async def normalize_address_api(request: NormalizeAddressRequest):
|
|
585 |
normalized = normalize_address(request.query_address)
|
586 |
return {"normalized": normalized}
|
587 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
588 |
|
589 |
# =========================
|
590 |
# Gradio tabs definition
|
591 |
# =========================
|
592 |
examples = [
|
|
|
593 |
'東京都荒川区1−5−6荒川マンション102',
|
594 |
'福岡市中央区天神1の11の2',
|
595 |
'私の住所は京都府京都市右京区太秦青木元町4-10です。',
|
@@ -685,6 +772,22 @@ def create_function_test_tab():
|
|
685 |
inputs=[in_tb],
|
686 |
outputs=[out_tb],
|
687 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
688 |
def create_split_address_tab():
|
689 |
with gr.Tab("split_address"):
|
690 |
in_tb = gr.Textbox(label='住所', placeholder='住所を入力してください')
|
@@ -767,10 +870,23 @@ def create_function_test_tab():
|
|
767 |
inputs=[in_tb],
|
768 |
outputs=[out_df],
|
769 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
770 |
|
771 |
with gr.Tab("関数テスト"):
|
772 |
create_normalize_address_tab()
|
|
|
773 |
create_compare_two_addresses_tab()
|
|
|
774 |
create_get_addresses_with_parcel_tab()
|
775 |
create_vector_search()
|
776 |
create_remove_left_of_pref_tab()
|
@@ -899,9 +1015,42 @@ def create_vector_search_tab():
|
|
899 |
outputs=[search_result_df, chiban_result_df, result_tb, result_df],
|
900 |
)
|
901 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
902 |
with gr.Blocks() as demo:
|
903 |
create_function_test_tab()
|
904 |
create_vector_search_tab()
|
|
|
905 |
create_digital_agency_tab()
|
906 |
|
907 |
app = gr.mount_gradio_app(app, demo, path='/')
|
|
|
37 |
VECTOR_SEARCH_COLLECTION_NAME = os.environ.get('VECTOR_SEARCH_COLLECTION_NAME')
|
38 |
VECTOR_SEARCH_COLLECTION_NAME_V2 = os.environ.get('VECTOR_SEARCH_COLLECTION_NAME_V2')
|
39 |
|
40 |
+
GOOGLE_SEARCH_API_KEY = os.environ.get('GOOGLE_SEARCH_API_KEY')
|
41 |
+
GOOGLE_SEARCH_ENGINE_ID = os.environ.get('GOOGLE_SEARCH_ENGINE_ID')
|
42 |
+
|
43 |
MILVUS_CLIENT = MilvusClient(uri=VECTOR_SEARCH_ENDPOINT, token=VECTOR_SEARCH_TOKEN)
|
44 |
print(f"Connected to DB: {VECTOR_SEARCH_ENDPOINT} successfully")
|
45 |
|
|
|
137 |
end = time.time()
|
138 |
print(f"{label} 実行時間: {end - start:.6f} 秒")
|
139 |
|
140 |
+
def get_spelling(query_address):
    """Ask the Google Custom Search API for a spelling correction of an address.

    Args:
        query_address: Address text sent as the search query (``q``).

    Returns:
        The ``spelling.correctedQuery`` value of the JSON response, or an
        empty string when the response carries no correction or the request
        or JSON decoding fails.
    """
    # Hand the query to requests via ``params`` so it is URL-encoded; raw
    # interpolation into the URL breaks on '&', '#', spaces and similar.
    request_params = {
        'key': GOOGLE_SEARCH_API_KEY,
        'cx': GOOGLE_SEARCH_ENGINE_ID,
        'q': query_address,
    }

    try:
        # A timeout keeps a stalled connection from blocking the caller forever.
        response = requests.get(
            'https://www.googleapis.com/customsearch/v1',
            params=request_params,
            timeout=10,
        )
        results = response.json()
    except (requests.RequestException, ValueError) as e:
        # The correction is optional: report the failure and return "no
        # correction" so callers can keep using the original text.
        print(f"get_spelling failed: {e}")
        return ''

    return results.get('spelling', {}).get('correctedQuery', '')
|
149 |
+
|
150 |
def convert_zenkaku_to_hankaku(text):
|
151 |
zenkaku_numbers = '0123456789'
|
152 |
hankaku_numbers = '0123456789'
|
|
|
510 |
best_address = best_addresses[0]
|
511 |
return best_address + splitted['building']
|
512 |
|
513 |
+
def convert_no_to_hyphen(query_address):
    """Replace every 「の」 that sits directly between two digits with a hyphen.

    Args:
        query_address: Address text to rewrite.

    Returns:
        The text with digit-の-digit separators turned into '-'; any other
        「の」 is left untouched.
    """
    # Lookbehind/lookahead keep the surrounding digits out of the match, so
    # only the separator character itself is replaced.
    digit_separator = re.compile(r'(?<=\d)の(?=\d)')
    return digit_separator.sub('-', query_address)
|
515 |
+
|
516 |
+
def normalize_address_v2(query_address, top_k=1):
    """Normalize an address and rank candidate addresses by embedding similarity.

    Steps, each timed with ``measure``:
      1. convert full-width characters with ``convert_zenkaku_to_hankaku``;
      2. split address and building with ``split_address_building_with_gpt``;
      3. replace the address with ``get_spelling``'s correction when one is
         returned;
      4. turn digit-の-digit into hyphens with ``convert_no_to_hyphen``;
      5. ``preprocess`` the address and fetch the top ``vector_search`` hit;
      6. collect candidates from ``get_addresses_with_parcel`` and
         ``get_addresses_with_rsdtdsp`` for that hit's components;
      7. embed the query address and the candidates and rank the candidates
         by cosine similarity.

    Args:
        query_address: Raw address text.
        top_k: Number of best-matching candidates to return.

    Returns:
        A tuple ``(splitted, hits, splits, best_addresses, best_similarities)``:
        the address/building split (its ``'address'`` entry is overwritten by
        steps 3 and 4), the vector search hits, the components taken from the
        first hit, the ``top_k`` candidates in descending similarity order,
        and their similarity scores.
    """
    with measure('convert_zenkaku_to_hankaku'):
        query_address = convert_zenkaku_to_hankaku(query_address)
    with measure('split_address_building_with_gpt'):
        splitted = split_address_building_with_gpt(query_address)
    with measure('get_spelling'):
        spelling = get_spelling(splitted['address'])
        # An empty result means no correction was returned; keep the original.
        if spelling:
            splitted['address'] = spelling
    # This step used to be timed under an empty label, which made its line in
    # the timing output unidentifiable.
    with measure('convert_no_to_hyphen'):
        splitted['address'] = convert_no_to_hyphen(splitted['address'])
    with measure('preprocess'):
        preprocessed = preprocess(splitted['address'])
    with measure('vector_search'):
        hits = vector_search(preprocessed, 1)
    with measure('split_address'):
        # Positions 3..9 of the first hit hold the address components, in
        # this order.
        component_names = ('pref', 'county', 'city', 'ward', 'oaza_cho', 'chome', 'koaza')
        top_hit = hits[0]
        splits = {
            name: top_hit[position]
            for position, name in enumerate(component_names, start=3)
        }
    with measure('get_addresses_with_parcel'):
        addresses = get_addresses_with_parcel(
            splits['pref'], splits['county'], splits['city'], splits['ward'],
            splits['oaza_cho'], splits['chome'], splits['koaza'])
    with measure('get_addresses_with_rsdtdsp'):
        addresses += get_addresses_with_rsdtdsp(
            splits['pref'], splits['county'], splits['city'], splits['ward'],
            splits['oaza_cho'], splits['chome'], splits['koaza'])
        # Drop duplicates while keeping first-seen order. A plain set() would
        # shuffle the candidates differently on every run (string hashing is
        # randomized), making ties in similarity resolve non-deterministically.
        addresses = list(dict.fromkeys(addresses))
    with measure('embed_via_multilingual_e5_large'):
        # Embed the query and all candidates in one call; index 0 is the query.
        embeds = embed_via_multilingual_e5_large([splitted['address']] + addresses)
        query_embed = [embeds[0]]
        address_embeds = embeds[1:]
    with measure('cosine'):
        # Cosine similarity between the query and every candidate.
        similarities = cosine_similarity(query_embed, address_embeds)

    # argsort is ascending: take the last top_k indices and reverse them so
    # the best match comes first.
    best_match_indices = np.argsort(similarities[0])[-top_k:][::-1]
    best_addresses = [addresses[i] for i in best_match_indices]
    best_similarities = similarities[0][best_match_indices]

    return splitted, hits, splits, best_addresses, best_similarities
|
563 |
+
|
564 |
|
565 |
# =========================
|
566 |
# FastAPI definition
|
|
|
578 |
# ---------------------------
|
579 |
# リクエスト・レスポンス定義
|
580 |
# ---------------------------
|
|
|
581 |
class CompareAddressesRequest(BaseModel):
|
582 |
address1: str = Field(..., description="比較する最初の住所", example="東京 墨田区 押上 1丁目1-1")
|
583 |
address2: str = Field(..., description="比較する2番目の住所", example="東京 墨田区 押上 1-1-1")
|
|
|
595 |
# ---------------------------
|
596 |
# エンドポイント定義
|
597 |
# ---------------------------
|
|
|
598 |
@app.post(
|
599 |
"/compare-two-addresses",
|
600 |
response_model=CompareAddressesResponse,
|
|
|
621 |
result = compare_two_addresses(request.address1, request.address2)
|
622 |
return {"result": result}
|
623 |
|
|
|
624 |
@app.post(
|
625 |
"/normalize-address",
|
626 |
response_model=NormalizeAddressResponse,
|
|
|
646 |
normalized = normalize_address(request.query_address)
|
647 |
return {"normalized": normalized}
|
648 |
|
649 |
+
@app.post(
    "/normalize-address-v2",
    response_model=NormalizeAddressResponse,
    summary="住所を正規化する",
    description="指定された住所を正規化し、正規化後の住所を返します。",
    responses={
        200: {
            "description": "正規化結果の返却",
            "content": {
                "application/json": {
                    "example": {
                        "normalized": "東京都千代田区一丁目1番"
                    }
                }
            }
        }
    }
)
async def normalize_address_v2_api(request: NormalizeAddressRequest):
    """
    Normalize an address with ``normalize_address_v2``.

    - **query_address**: address to normalize

    Returns ``{"normalized": ...}`` holding the best-matching address followed
    by the building part split off from the input.
    """
    splitted, _hits, _splits, best_addresses, _similarities = normalize_address_v2(request.query_address)
    # Re-attach the building part so this endpoint agrees with
    # /normalize-address and with the normalize_address_v2 Gradio tab, which
    # both return the matched address plus the building.
    return {"normalized": best_addresses[0] + splitted['building']}
|
673 |
+
|
674 |
|
675 |
# =========================
|
676 |
# Gradio tabs definition
|
677 |
# =========================
|
678 |
examples = [
|
679 |
+
'東京都中央区みなと3の12の10、プレサンスロゼ東京港301。',
|
680 |
'東京都荒川区1−5−6荒川マンション102',
|
681 |
'福岡市中央区天神1の11の2',
|
682 |
'私の住所は京都府京都市右京区太秦青木元町4-10です。',
|
|
|
772 |
inputs=[in_tb],
|
773 |
outputs=[out_tb],
|
774 |
)
|
775 |
+
def create_normalize_address__v2_tab():
    """Build the "normalize_address_v2" tab.

    The tab has an address textbox with the shared examples, an output
    textbox, and a run button. Clicking the button shows the best match from
    ``normalize_address_v2`` followed by the building part of the input.
    """
    def run_normalization(query_address):
        # Only the address/building split and the ranked candidates are
        # needed here; the other returned values are ignored.
        split_result, _hits, _splits, candidates, _scores = normalize_address_v2(query_address)
        top_candidate = candidates[0]
        return top_candidate + split_result['building']

    with gr.Tab("normalize_address_v2"):
        address_box = gr.Textbox(label='住所', placeholder='住所を入力してください')
        gr.Examples(examples=examples, inputs=[address_box])
        result_box = gr.Textbox(label='アウトプット')
        run_button = gr.Button(value='実行', variant='primary')

        run_button.click(
            fn=run_normalization,
            inputs=[address_box],
            outputs=[result_box],
        )
|
791 |
def create_split_address_tab():
|
792 |
with gr.Tab("split_address"):
|
793 |
in_tb = gr.Textbox(label='住所', placeholder='住所を入力してください')
|
|
|
870 |
inputs=[in_tb],
|
871 |
outputs=[out_df],
|
872 |
)
|
873 |
+
def create_get_spelling_tab():
    """Build the tab that runs ``get_spelling`` on an address.

    The tab has an address textbox with the shared examples, an output
    textbox, and a run button wired directly to ``get_spelling``.
    """
    # Label the tab with the function under test, as the sibling tabs do
    # ("normalize_address_v2", "split_address"); it previously showed this
    # builder's own name.
    with gr.Tab("get_spelling"):
        in_tb = gr.Textbox(label='住所', placeholder='住所を入力してください')
        gr.Examples(examples=examples, inputs=[in_tb])
        out_tb = gr.Textbox(label='アウトプット')
        exe_button = gr.Button(value='実行', variant='primary')
        exe_button.click(
            fn=get_spelling,
            inputs=[in_tb],
            outputs=[out_tb],
        )
|
884 |
|
885 |
with gr.Tab("関数テスト"):
|
886 |
create_normalize_address_tab()
|
887 |
+
create_normalize_address__v2_tab()
|
888 |
create_compare_two_addresses_tab()
|
889 |
+
create_get_spelling_tab()
|
890 |
create_get_addresses_with_parcel_tab()
|
891 |
create_vector_search()
|
892 |
create_remove_left_of_pref_tab()
|
|
|
1015 |
outputs=[search_result_df, chiban_result_df, result_tb, result_df],
|
1016 |
)
|
1017 |
|
1018 |
+
def create_vector_search_v2_tab():
    """Build the "ベクトル検索V2" tab backed by ``normalize_address_v2``.

    The tab takes an address and a top-k value and shows the best normalized
    address, the components of the first vector search hit, the vector search
    hits, and the ranked candidate addresses.

    NOTE(review): the nesting of the Row/Column containers was reconstructed
    from a listing without indentation - confirm which components belong
    inside the column.
    """
    hit_columns = ['Top-k', '類似度', '住所', '都道府県', '郡', '市区町村', '政令市区', '大字・町', '丁目', '小字']

    def search_address(query_address, top_k):
        splitted, hits, splits, best_addresses, best_similarities = normalize_address_v2(query_address, top_k)

        # Table of the raw vector search hits.
        hits_table = pd.DataFrame(hits, columns=hit_columns)
        # Single-row table with the components taken from the first hit.
        components_table = pd.DataFrame([splits.values()], columns=splits.keys())
        # Ranked candidates, each with the building part re-attached.
        ranked_with_building = [candidate + splitted['building'] for candidate in best_addresses]
        candidates_table = pd.DataFrame({
            'Top-k': range(1, len(best_similarities) + 1),
            '類似度': best_similarities,
            '住所': ranked_with_building,
        })
        top_address = best_addresses[0] + splitted['building']

        # Order must match the ``outputs`` list of the click handler below.
        return hits_table, candidates_table, top_address, components_table

    with gr.Tab("ベクトル検索V2"):
        with gr.Row():
            with gr.Column():
                address_input = gr.Textbox(label='住所', placeholder='検索したい住所を入力してください')
                gr.Examples(examples=examples, inputs=[address_input])
                top_k_input = gr.Slider(minimum=1, maximum=100, step=1, value=5, label='検索数top-k')
                search_button = gr.Button(value='検索', variant='primary')
                result_tb = gr.Textbox(label='正規化後')
                result_df = gr.Dataframe(label="正規化後(分割)", wrap=True)
                search_result_df = gr.Dataframe(label="町丁目まで検索結果")
                chiban_result_df = gr.Dataframe(label="地番・住居表示検索結果")

        search_button.click(
            fn=search_address,
            inputs=[address_input, top_k_input],
            outputs=[search_result_df, chiban_result_df, result_tb, result_df],
        )
|
1049 |
+
|
1050 |
with gr.Blocks() as demo:
|
1051 |
create_function_test_tab()
|
1052 |
create_vector_search_tab()
|
1053 |
+
create_vector_search_v2_tab()
|
1054 |
create_digital_agency_tab()
|
1055 |
|
1056 |
app = gr.mount_gradio_app(app, demo, path='/')
|