Spaces:
Sleeping
Sleeping
Azure OpenAI APIを統合し、住所を分割する新しい関数を追加。全角から半角への変換機能を実装し、Gradioタブに新しい機能を追加しました。また、依存関係にopenaiを追加しました。
Browse files
- app.py +81 -7
- requirements.txt +2 -1
app.py
CHANGED
@@ -17,6 +17,8 @@ import time
|
|
17 |
from contextlib import contextmanager
|
18 |
import numpy as np
|
19 |
import re
|
|
|
|
|
20 |
|
21 |
# .envファイルを読み込む
|
22 |
load_dotenv()
|
@@ -48,6 +50,15 @@ prefs = [
|
|
48 |
'徳島県', '香川県', '愛媛県', '高知県', '福岡県', '佐賀県', '長崎県',
|
49 |
'熊本県', '大分県', '宮崎県', '鹿児島県', '沖縄県'
|
50 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
|
52 |
# ----------------------------
|
53 |
# Download mt_city_all.csv
|
@@ -123,7 +134,14 @@ def measure(label="処理"):
|
|
123 |
end = time.time()
|
124 |
print(f"{label} 実行時間: {end - start:.6f} 秒")
|
125 |
|
126 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
127 |
|
128 |
ADDRESS_REGEX = re.compile(
|
129 |
r'^'
|
@@ -152,6 +170,31 @@ def split_address_building(address: str) -> dict:
|
|
152 |
'building': m.group('building').strip()
|
153 |
}
|
154 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
155 |
def split_address(normalized_address):
|
156 |
splits = normalize(normalized_address)
|
157 |
return splits
|
@@ -413,8 +456,10 @@ def compare_two_addresses(address1, address2):
|
|
413 |
return result
|
414 |
|
415 |
def normalize_address(query_address):
|
416 |
-
with measure('
|
417 |
-
|
|
|
|
|
418 |
with measure('preprocess'):
|
419 |
preprocessed = preprocess(splitted['address'])
|
420 |
with measure('vector_search'):
|
@@ -545,6 +590,8 @@ async def normalize_address_api(request: NormalizeAddressRequest):
|
|
545 |
# Gradio tabs definition
|
546 |
# =========================
|
547 |
examples = [
|
|
|
|
|
548 |
'私の住所は京都府京都市右京区太秦青木元町4-10です。',
|
549 |
'京都府京都市右京区太秦青木元町4-10',
|
550 |
'京都府京都市右京区太秦青木元町4-10ダックス101号室',
|
@@ -641,6 +688,7 @@ def create_function_test_tab():
|
|
641 |
def create_split_address_tab():
|
642 |
with gr.Tab("split_address"):
|
643 |
in_tb = gr.Textbox(label='住所', placeholder='住所を入力してください')
|
|
|
644 |
out_tb = gr.Textbox(label='アウトプット')
|
645 |
exe_button = gr.Button(value='実行', variant='primary')
|
646 |
exe_button.click(
|
@@ -648,6 +696,28 @@ def create_function_test_tab():
|
|
648 |
inputs=[in_tb],
|
649 |
outputs=[out_tb],
|
650 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
651 |
def create_vector_search():
|
652 |
def f(query_address, top_k):
|
653 |
with measure('preprocess'):
|
@@ -708,6 +778,8 @@ def create_function_test_tab():
|
|
708 |
create_remove_filler_tab()
|
709 |
create_preprocess_tab()
|
710 |
create_split_address_tab()
|
|
|
|
|
711 |
|
712 |
def create_digital_agency_tab():
|
713 |
with gr.Tab("デジ庁API"):
|
@@ -767,8 +839,10 @@ def create_vector_search_tab():
|
|
767 |
chiban_result_df = gr.Dataframe(label="地番・住居表示検索結果")
|
768 |
|
769 |
def search_address(query_address, top_k):
|
770 |
-
with measure('
|
771 |
-
|
|
|
|
|
772 |
with measure('preprocess'):
|
773 |
preprocessed = preprocess(splitted['address'])
|
774 |
with measure('vector_search'):
|
@@ -809,10 +883,10 @@ def create_vector_search_tab():
|
|
809 |
chiban_result_df = pd.DataFrame({
|
810 |
'Top-k': range(1, top_k + 1),
|
811 |
'類似度': best_similarities,
|
812 |
-
'住所':
|
813 |
})
|
814 |
|
815 |
-
best_address = best_addresses[0]
|
816 |
|
817 |
return search_result_df, chiban_result_df, best_address, result_df
|
818 |
|
|
|
17 |
from contextlib import contextmanager
|
18 |
import numpy as np
|
19 |
import re
|
20 |
+
import os
|
21 |
+
from openai import AzureOpenAI
|
22 |
|
23 |
# .envファイルを読み込む
|
24 |
load_dotenv()
|
|
|
50 |
'徳島県', '香川県', '愛媛県', '高知県', '福岡県', '佐賀県', '長崎県',
|
51 |
'熊本県', '大分県', '宮崎県', '鹿児島県', '沖縄県'
|
52 |
]
|
53 |
+
|
54 |
+
# ----------------------------
|
55 |
+
# Azure OpenAI API
|
56 |
+
# ----------------------------
|
57 |
+
client = AzureOpenAI(
|
58 |
+
api_key=os.getenv("AZURE_OPENAI_API_KEY"),
|
59 |
+
api_version="2025-03-01-preview",
|
60 |
+
azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT")
|
61 |
+
)
|
62 |
|
63 |
# ----------------------------
|
64 |
# Download mt_city_all.csv
|
|
|
134 |
end = time.time()
|
135 |
print(f"{label} 実行時間: {end - start:.6f} 秒")
|
136 |
|
137 |
+
# Full-width digits U+FF10..U+FF19, generated from code points rather than typed
# literally so the mapping survives tools that NFKC-normalize source text
# (which would silently turn the table into an identity mapping).
_ZENKAKU_DIGITS = ''.join(chr(0xFF10 + offset) for offset in range(10))

# Hyphen look-alikes commonly typed between address numbers:
#   U+FF0D full-width hyphen-minus, U+2212 minus sign,
#   U+2010..U+2015 hyphen / dash variants.
# U+30FC (katakana prolonged sound mark) is deliberately NOT included: it is a
# real letter inside building names and must not become a hyphen.
_HYPHEN_LIKE = '\uff0d\u2212\u2010\u2011\u2012\u2013\u2014\u2015'

# Built once at import time; str.translate then does a single pass per call.
_ZENKAKU_TO_HANKAKU = str.maketrans(
    _ZENKAKU_DIGITS + _HYPHEN_LIKE,
    '0123456789' + '-' * len(_HYPHEN_LIKE),
)


def convert_zenkaku_to_hankaku(text):
    """Convert full-width digits and hyphen-like characters to ASCII.

    Args:
        text: Input string (e.g. a raw address). ``None`` or an empty string
            is accepted and yields an empty string.

    Returns:
        A copy of ``text`` where full-width digits are replaced by ``0``-``9``
        and hyphen look-alikes are replaced by ``-``. All other characters
        are left untouched.
    """
    if not text:
        return ''
    return text.translate(_ZENKAKU_TO_HANKAKU)
|
145 |
|
146 |
ADDRESS_REGEX = re.compile(
|
147 |
r'^'
|
|
|
170 |
'building': m.group('building').strip()
|
171 |
}
|
172 |
|
173 |
+
def split_address_building_with_gpt(query_address: str) -> dict:
    """Split an address string into its address part and building part via the LLM.

    Sends ``query_address`` to the module-level Azure OpenAI ``client`` and asks
    for a structured response with two string fields.

    Args:
        query_address: The address text to split.

    Returns:
        A dict with the keys ``'address'`` and ``'building'``. If the response
        carries no parsed payload, the whole input is returned as ``'address'``
        with an empty ``'building'`` so callers can still proceed.

    Raises:
        Whatever the OpenAI client raises on transport/authentication errors;
        those are not caught here.
    """
    class SplittedAddress(BaseModel):
        # Structured-output schema the model is asked to fill in.
        address: str
        building: str

    response = client.responses.parse(
        model="gpt-4o-mini",
        input=[
            {
                "role": "system",
                # Task-specific instruction (the previous text was an unrelated
                # placeholder about extracting event information).
                "content": (
                    "You split a Japanese postal address into two parts. "
                    "`address` is the part from the prefecture through the "
                    "block/lot number. `building` is the remaining building "
                    "name, floor and room number. Use an empty string for "
                    "`building` when there is none. Do not add, drop or "
                    "rewrite any characters."
                ),
            },
            {
                "role": "user",
                "content": f"与えられた住所をaddressとbuildingに分けろ:{query_address}",
            },
        ],
        text_format=SplittedAddress,
    )

    parsed = response.output_parsed
    if parsed is None:
        # No structured payload (e.g. refusal or unparsable output): degrade to
        # "everything is the address" instead of failing with AttributeError.
        return {
            'address': query_address,
            'building': '',
        }

    return {
        'address': parsed.address,
        'building': parsed.building,
    }
|
196 |
+
|
197 |
+
|
198 |
def split_address(normalized_address):
|
199 |
splits = normalize(normalized_address)
|
200 |
return splits
|
|
|
456 |
return result
|
457 |
|
458 |
def normalize_address(query_address):
|
459 |
+
with measure('convert_zenkaku_to_hankaku'):
|
460 |
+
query_address = convert_zenkaku_to_hankaku(query_address)
|
461 |
+
with measure('split_address_building_with_gpt'):
|
462 |
+
splitted = split_address_building_with_gpt(query_address)
|
463 |
with measure('preprocess'):
|
464 |
preprocessed = preprocess(splitted['address'])
|
465 |
with measure('vector_search'):
|
|
|
590 |
# Gradio tabs definition
|
591 |
# =========================
|
592 |
examples = [
|
593 |
+
'東京都荒川区1−5−6荒川マンション102',
|
594 |
+
'福岡市中央区天神1の11の2',
|
595 |
'私の住所は京都府京都市右京区太秦青木元町4-10です。',
|
596 |
'京都府京都市右京区太秦青木元町4-10',
|
597 |
'京都府京都市右京区太秦青木元町4-10ダックス101号室',
|
|
|
688 |
def create_split_address_tab():
|
689 |
with gr.Tab("split_address"):
|
690 |
in_tb = gr.Textbox(label='住所', placeholder='住所を入力してください')
|
691 |
+
gr.Examples(examples=examples, inputs=[in_tb])
|
692 |
out_tb = gr.Textbox(label='アウトプット')
|
693 |
exe_button = gr.Button(value='実行', variant='primary')
|
694 |
exe_button.click(
|
|
|
696 |
inputs=[in_tb],
|
697 |
outputs=[out_tb],
|
698 |
)
|
699 |
+
def create_split_address_building_tab():
    """Build the Gradio tab that runs ``split_address_building_with_gpt`` on user input."""
    with gr.Tab("split_address_building_with_gpt"):
        # Components are created in display order: input, examples, output, button.
        address_input = gr.Textbox(
            label='住所',
            placeholder='住所を入力してください',
        )
        gr.Examples(examples=examples, inputs=[address_input])
        result_output = gr.Textbox(label='アウトプット')

        # Clicking the button feeds the input textbox to the splitter and
        # shows the returned value in the output textbox.
        gr.Button(value='実行', variant='primary').click(
            fn=split_address_building_with_gpt,
            inputs=[address_input],
            outputs=[result_output],
        )
|
710 |
+
def create_convert_zenkaku_to_hankaku_tab():
    """Build the Gradio tab that runs ``convert_zenkaku_to_hankaku`` on user input."""
    with gr.Tab("convert_zenkaku_to_hankaku"):
        # Components are created in display order: input, examples, output, button.
        address_input = gr.Textbox(
            label='住所',
            placeholder='住所を入力してください',
        )
        gr.Examples(examples=examples, inputs=[address_input])
        result_output = gr.Textbox(label='アウトプット')

        # Clicking the button passes the input text through the converter and
        # writes the converted text to the output textbox.
        gr.Button(value='実行', variant='primary').click(
            fn=convert_zenkaku_to_hankaku,
            inputs=[address_input],
            outputs=[result_output],
        )
|
721 |
def create_vector_search():
|
722 |
def f(query_address, top_k):
|
723 |
with measure('preprocess'):
|
|
|
778 |
create_remove_filler_tab()
|
779 |
create_preprocess_tab()
|
780 |
create_split_address_tab()
|
781 |
+
create_split_address_building_tab()
|
782 |
+
create_convert_zenkaku_to_hankaku_tab()
|
783 |
|
784 |
def create_digital_agency_tab():
|
785 |
with gr.Tab("デジ庁API"):
|
|
|
839 |
chiban_result_df = gr.Dataframe(label="地番・住居表示検索結果")
|
840 |
|
841 |
def search_address(query_address, top_k):
|
842 |
+
with measure('convert_zenkaku_to_hankaku'):
|
843 |
+
query_address = convert_zenkaku_to_hankaku(query_address)
|
844 |
+
with measure('split_address_building_with_gpt'):
|
845 |
+
splitted = split_address_building_with_gpt(query_address)
|
846 |
with measure('preprocess'):
|
847 |
preprocessed = preprocess(splitted['address'])
|
848 |
with measure('vector_search'):
|
|
|
883 |
chiban_result_df = pd.DataFrame({
|
884 |
'Top-k': range(1, top_k + 1),
|
885 |
'類似度': best_similarities,
|
886 |
+
'住所': [best_address + splitted['building'] for best_address in best_addresses]
|
887 |
})
|
888 |
|
889 |
+
best_address = best_addresses[0] + splitted['building']
|
890 |
|
891 |
return search_result_df, chiban_result_df, best_address, result_df
|
892 |
|
requirements.txt
CHANGED
@@ -9,4 +9,5 @@ spacy
|
|
9 |
normalize-japanese-addresses
|
10 |
ginza
|
11 |
ja-ginza
|
12 |
-
scikit-learn
|
|
|
|
9 |
normalize-japanese-addresses
|
10 |
ginza
|
11 |
ja-ginza
|
12 |
+
scikit-learn
|
13 |
+
openai
|