matsuap committed on
Commit
43d5699
·
1 Parent(s): 9dcbb35

Azure OpenAI APIを統合し、住所を分割する新しい関数を追加。全角から半角への変換機能を実装し、Gradioタブに新しい機能を追加しました。また、依存関係にopenaiを追加しました。

Browse files
Files changed (2) hide show
  1. app.py +81 -7
  2. requirements.txt +2 -1
app.py CHANGED
@@ -17,6 +17,8 @@ import time
17
  from contextlib import contextmanager
18
  import numpy as np
19
  import re
 
 
20
 
21
  # .envファイルを読み込む
22
  load_dotenv()
@@ -48,6 +50,15 @@ prefs = [
48
  '徳島県', '香川県', '愛媛県', '高知県', '福岡県', '佐賀県', '長崎県',
49
  '熊本県', '大分県', '宮崎県', '鹿児島県', '沖縄県'
50
  ]
 
 
 
 
 
 
 
 
 
51
 
52
  # ----------------------------
53
  # Download mt_city_all.csv
@@ -123,7 +134,14 @@ def measure(label="処理"):
123
  end = time.time()
124
  print(f"{label} 実行時間: {end - start:.6f} 秒")
125
 
126
- import re
 
 
 
 
 
 
 
127
 
128
  ADDRESS_REGEX = re.compile(
129
  r'^'
@@ -152,6 +170,31 @@ def split_address_building(address: str) -> dict:
152
  'building': m.group('building').strip()
153
  }
154
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
  def split_address(normalized_address):
156
  splits = normalize(normalized_address)
157
  return splits
@@ -413,8 +456,10 @@ def compare_two_addresses(address1, address2):
413
  return result
414
 
415
  def normalize_address(query_address):
416
- with measure('split_address_building'):
417
- splitted = split_address_building(query_address)
 
 
418
  with measure('preprocess'):
419
  preprocessed = preprocess(splitted['address'])
420
  with measure('vector_search'):
@@ -545,6 +590,8 @@ async def normalize_address_api(request: NormalizeAddressRequest):
545
  # Gradio tabs definition
546
  # =========================
547
  examples = [
 
 
548
  '私の住所は京都府京都市右京区太秦青木元町4-10です。',
549
  '京都府京都市右京区太秦青木元町4-10',
550
  '京都府京都市右京区太秦青木元町4-10ダックス101号室',
@@ -641,6 +688,7 @@ def create_function_test_tab():
641
  def create_split_address_tab():
642
  with gr.Tab("split_address"):
643
  in_tb = gr.Textbox(label='住所', placeholder='住所を入力してください')
 
644
  out_tb = gr.Textbox(label='アウトプット')
645
  exe_button = gr.Button(value='実行', variant='primary')
646
  exe_button.click(
@@ -648,6 +696,28 @@ def create_function_test_tab():
648
  inputs=[in_tb],
649
  outputs=[out_tb],
650
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
651
  def create_vector_search():
652
  def f(query_address, top_k):
653
  with measure('preprocess'):
@@ -708,6 +778,8 @@ def create_function_test_tab():
708
  create_remove_filler_tab()
709
  create_preprocess_tab()
710
  create_split_address_tab()
 
 
711
 
712
  def create_digital_agency_tab():
713
  with gr.Tab("デジ庁API"):
@@ -767,8 +839,10 @@ def create_vector_search_tab():
767
  chiban_result_df = gr.Dataframe(label="地番・住居表示検索結果")
768
 
769
  def search_address(query_address, top_k):
770
- with measure('split_address_building'):
771
- splitted = split_address_building(query_address)
 
 
772
  with measure('preprocess'):
773
  preprocessed = preprocess(splitted['address'])
774
  with measure('vector_search'):
@@ -809,10 +883,10 @@ def create_vector_search_tab():
809
  chiban_result_df = pd.DataFrame({
810
  'Top-k': range(1, top_k + 1),
811
  '類似度': best_similarities,
812
- '住所': best_addresses + splitted['building']
813
  })
814
 
815
- best_address = best_addresses[0]
816
 
817
  return search_result_df, chiban_result_df, best_address, result_df
818
 
 
17
  from contextlib import contextmanager
18
  import numpy as np
19
  import re
20
+ import os
21
+ from openai import AzureOpenAI
22
 
23
  # .envファイルを読み込む
24
  load_dotenv()
 
50
  '徳島県', '香川県', '愛媛県', '高知県', '福岡県', '佐賀県', '長崎県',
51
  '熊本県', '大分県', '宮崎県', '鹿児島県', '沖縄県'
52
  ]
53
+
54
+ # ----------------------------
55
+ # Azure OpenAI API
56
+ # ----------------------------
57
# Shared Azure OpenAI client used by the GPT-based helpers in this module.
# The API key and endpoint are read from environment variables; a missing
# variable yields None here and is left for AzureOpenAI to handle.
client = AzureOpenAI(
    azure_endpoint=os.environ.get("AZURE_OPENAI_ENDPOINT"),
    api_key=os.environ.get("AZURE_OPENAI_API_KEY"),
    api_version="2025-03-01-preview",
)
62
 
63
  # ----------------------------
64
  # Download mt_city_all.csv
 
134
  end = time.time()
135
  print(f"{label} 実行時間: {end - start:.6f} 秒")
136
 
137
# Translation table (code point -> code point) for str.translate.
# It is built from explicit code points instead of literal full-width
# characters so that it keeps working even if this source text is ever
# passed through a Unicode normalizer that folds full-width forms to ASCII.
_ZENKAKU_TO_HANKAKU_TABLE = {
    # FULLWIDTH DIGIT ZERO..NINE (U+FF10..U+FF19) -> '0'..'9'
    **{0xFF10 + offset: ord('0') + offset for offset in range(10)},
    0xFF0D: ord('-'),  # FULLWIDTH HYPHEN-MINUS
    0x2212: ord('-'),  # MINUS SIGN, commonly typed as a separator in addresses
}


def convert_zenkaku_to_hankaku(text):
    """Convert full-width digits and hyphens in *text* to their ASCII forms.

    Only the ten full-width digits, the full-width hyphen-minus (U+FF0D) and
    the minus sign (U+2212) are rewritten; every other character, including
    the katakana long-vowel mark, is returned unchanged.

    Args:
        text: The string to convert.

    Returns:
        A new string with the characters above replaced by '0'-'9' and '-'.
    """
    return text.translate(_ZENKAKU_TO_HANKAKU_TABLE)
145
 
146
  ADDRESS_REGEX = re.compile(
147
  r'^'
 
170
  'building': m.group('building').strip()
171
  }
172
 
173
def split_address_building_with_gpt(query_address: str) -> dict:
    """Split an address string into address and building parts via Azure OpenAI.

    The module-level ``client`` is asked for a structured reply that is parsed
    into a small pydantic schema with ``address`` and ``building`` fields.

    Args:
        query_address: The raw address string to split.

    Returns:
        A dict with the keys ``'address'`` and ``'building'``. If the reply
        could not be parsed into the schema, the whole input is returned as
        ``'address'`` with an empty ``'building'``.
    """
    class SplittedAddress(BaseModel):
        # Schema the model's reply is parsed into.
        address: str
        building: str

    response = client.responses.parse(
        # NOTE(review): presumably this must match the Azure deployment name — confirm.
        model="gpt-4o-mini",
        input=[
            {
                "role": "system",
                # Describes the actual task; the previous prompt asked for
                # "event information", which is unrelated to address splitting.
                "content": (
                    "You split a Japanese postal address into two parts: "
                    "'address' (everything up to and including the block/lot "
                    "number) and 'building' (building name, floor and room "
                    "number). Use an empty string for 'building' when the "
                    "input contains no building information."
                ),
            },
            {
                "role": "user",
                "content": f"与えられた住所をaddressとbuildingに分けろ:{query_address}",
            },
        ],
        text_format=SplittedAddress,
    )

    parsed = response.output_parsed
    if parsed is None:
        # No parsed payload (e.g. the model refused or returned no structured
        # text): fall back to the unsplit input instead of raising
        # AttributeError, so callers can still index both keys.
        return {
            'address': query_address,
            'building': '',
        }

    return {
        'address': parsed.address,
        'building': parsed.building,
    }
196
+
197
+
198
  def split_address(normalized_address):
199
  splits = normalize(normalized_address)
200
  return splits
 
456
  return result
457
 
458
  def normalize_address(query_address):
459
+ with measure('convert_zenkaku_to_hankaku'):
460
+ query_address = convert_zenkaku_to_hankaku(query_address)
461
+ with measure('split_address_building_with_gpt'):
462
+ splitted = split_address_building_with_gpt(query_address)
463
  with measure('preprocess'):
464
  preprocessed = preprocess(splitted['address'])
465
  with measure('vector_search'):
 
590
  # Gradio tabs definition
591
  # =========================
592
  examples = [
593
+ '東京都荒川区1−5−6荒川マンション102',
594
+ '福岡市中央区天神1の11の2',
595
  '私の住所は京都府京都市右京区太秦青木元町4-10です。',
596
  '京都府京都市右京区太秦青木元町4-10',
597
  '京都府京都市右京区太秦青木元町4-10ダックス101号室',
 
688
  def create_split_address_tab():
689
  with gr.Tab("split_address"):
690
  in_tb = gr.Textbox(label='住所', placeholder='住所を入力してください')
691
+ gr.Examples(examples=examples, inputs=[in_tb])
692
  out_tb = gr.Textbox(label='アウトプット')
693
  exe_button = gr.Button(value='実行', variant='primary')
694
  exe_button.click(
 
696
  inputs=[in_tb],
697
  outputs=[out_tb],
698
  )
699
def create_split_address_building_tab():
    """Build the Gradio tab that runs split_address_building_with_gpt on one input."""
    with gr.Tab("split_address_building_with_gpt"):
        address_box = gr.Textbox(label='住所', placeholder='住所を入力してください')
        # Offer the shared `examples` list as sample values for the input box.
        gr.Examples(examples=examples, inputs=[address_box])
        result_box = gr.Textbox(label='アウトプット')
        # Clicking the button feeds the input box to the splitter and shows
        # its return value in the output box.
        gr.Button(value='実行', variant='primary').click(
            fn=split_address_building_with_gpt,
            inputs=[address_box],
            outputs=[result_box],
        )
710
def create_convert_zenkaku_to_hankaku_tab():
    """Build the Gradio tab that runs convert_zenkaku_to_hankaku on one input."""
    with gr.Tab("convert_zenkaku_to_hankaku"):
        address_box = gr.Textbox(label='住所', placeholder='住所を入力してください')
        # Offer the shared `examples` list as sample values for the input box.
        gr.Examples(examples=examples, inputs=[address_box])
        result_box = gr.Textbox(label='アウトプット')
        # Clicking the button feeds the input box to the converter and shows
        # the converted string in the output box.
        gr.Button(value='実行', variant='primary').click(
            fn=convert_zenkaku_to_hankaku,
            inputs=[address_box],
            outputs=[result_box],
        )
721
  def create_vector_search():
722
  def f(query_address, top_k):
723
  with measure('preprocess'):
 
778
  create_remove_filler_tab()
779
  create_preprocess_tab()
780
  create_split_address_tab()
781
+ create_split_address_building_tab()
782
+ create_convert_zenkaku_to_hankaku_tab()
783
 
784
  def create_digital_agency_tab():
785
  with gr.Tab("デジ庁API"):
 
839
  chiban_result_df = gr.Dataframe(label="地番・住居表示検索結果")
840
 
841
  def search_address(query_address, top_k):
842
+ with measure('convert_zenkaku_to_hankaku'):
843
+ query_address = convert_zenkaku_to_hankaku(query_address)
844
+ with measure('split_address_building_with_gpt'):
845
+ splitted = split_address_building_with_gpt(query_address)
846
  with measure('preprocess'):
847
  preprocessed = preprocess(splitted['address'])
848
  with measure('vector_search'):
 
883
  chiban_result_df = pd.DataFrame({
884
  'Top-k': range(1, top_k + 1),
885
  '類似度': best_similarities,
886
+ '住所': [best_address + splitted['building'] for best_address in best_addresses]
887
  })
888
 
889
+ best_address = best_addresses[0] + splitted['building']
890
 
891
  return search_result_df, chiban_result_df, best_address, result_df
892
 
requirements.txt CHANGED
@@ -9,4 +9,5 @@ spacy
9
  normalize-japanese-addresses
10
  ginza
11
  ja-ginza
12
- scikit-learn
 
 
9
  normalize-japanese-addresses
10
  ginza
11
  ja-ginza
12
+ scikit-learn
13
+ openai