Spaces:

AtPeak
/

japanese-address-search-v2

Sleeping

App Files Files Community

japanese-address-search-v2 / app.py

matsuap

Google検索APIを統合し、住所正規化処理を改善する新しいエンドポイントを追加しました。住所のスペル修正機能を実装し、Gradioタブに新しい機能を追加しました。また、正規化処理のバージョン2を実装し、類似度計算を強化しました。

d77ce72 3 months ago

raw

history blame contribute delete

45.6 kB

	import gradio as gr
	import zipfile
	from tqdm import tqdm
	from sklearn.metrics.pairwise import cosine_similarity
	from pathlib import Path
	import spacy
	from normalize_japanese_addresses import normalize
	from enum import Enum
	import time
	import requests
	import pandas as pd
	import os
	from fastapi import FastAPI, Request
	from pymilvus import MilvusClient
	from dotenv import load_dotenv
	import time
	from contextlib import contextmanager
	import numpy as np
	import re
	import os
	from openai import AzureOpenAI

	# .envファイルを読み込む
	load_dotenv()

	# =========================
	# Global variables
	# =========================
	TARGET_DIR = Path(os.environ.get('TARGET_DIR'))

	HUGGING_FACE_TOKEN = os.environ.get('HUGGING_FACE_TOKEN')
	EMBEDDING_MODEL_ENDPOINT = os.environ.get('EMBEDDING_MODEL_ENDPOINT')
	ABRG_ENDPOINT = os.environ.get('ABRG_ENDPOINT')

	VECTOR_SEARCH_ENDPOINT = os.environ.get('VECTOR_SEARCH_ENDPOINT')
	VECTOR_SEARCH_TOKEN = os.environ.get('VECTOR_SEARCH_TOKEN')
	VECTOR_SEARCH_COLLECTION_NAME = os.environ.get('VECTOR_SEARCH_COLLECTION_NAME')
	VECTOR_SEARCH_COLLECTION_NAME_V2 = os.environ.get('VECTOR_SEARCH_COLLECTION_NAME_V2')

	GOOGLE_SEARCH_API_KEY = os.environ.get('GOOGLE_SEARCH_API_KEY')
	GOOGLE_SEARCH_ENGINE_ID = os.environ.get('GOOGLE_SEARCH_ENGINE_ID')

	MILVUS_CLIENT = MilvusClient(uri=VECTOR_SEARCH_ENDPOINT, token=VECTOR_SEARCH_TOKEN)
	print(f"Connected to DB: {VECTOR_SEARCH_ENDPOINT} successfully")

	# 47都道府県のリスト
	prefs = [
	'北海道', '青森県', '岩手県', '宮城県', '秋田県', '山形県', '福島県',
	'茨城県', '栃木県', '群馬県', '埼玉県', '千葉県', '東京都', '神奈川県',
	'新潟県', '富山県', '石川県', '福井県', '山梨県', '長野県', '岐阜県',
	'静岡県', '愛知県', '三重県', '滋賀県', '京都府', '大阪府', '兵庫県',
	'奈良県', '和歌山県', '鳥取県', '島根県', '岡山県', '広島県', '山口県',
	'徳島県', '香川県', '愛媛県', '高知県', '福岡県', '佐賀県', '長崎県',
	'熊本県', '大分県', '宮崎県', '鹿児島県', '沖縄県'
	]

	# ----------------------------
	# Azure OpenAI API
	# ----------------------------
	client = AzureOpenAI(
	api_key=os.getenv("AZURE_OPENAI_API_KEY"),
	api_version="2025-03-01-preview",
	azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT")
	)

	# ----------------------------
	# Download mt_city_all.csv
	# ----------------------------
	temp_dir = Path('temp')
	temp_dir.mkdir(exist_ok=True)

	city_all_url = 'https://catalog.registries.digital.go.jp/rsc/address/mt_city_all.csv.zip'
	zip_file_path = temp_dir / 'mt_city_all.csv.zip'

	# すでにファイルが存在する場合はダウンロードをスキップ
	if not os.path.exists(zip_file_path):
	# ZIPファイルをダウンロード
	response = requests.get(city_all_url)
	with open(zip_file_path, 'wb') as f:
	f.write(response.content)

	# target_dir直下に解凍
	with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
	zip_ref.extractall(TARGET_DIR)

	# ------------------------------------
	# Download mt_parcel_cityXXXXXX.csv
	# ------------------------------------
	city_all_file = TARGET_DIR / 'mt_city_all.csv'
	city_all_df = pd.read_csv(city_all_file)
	lg_codes = city_all_df['lg_code'].tolist()
	print('lg_codes', len(lg_codes))

	for lg_code in tqdm(lg_codes):
	parcel_url = f'https://catalog.registries.digital.go.jp/rsc/address/mt_parcel_city{lg_code:06d}.csv.zip'
	zip_file_path = temp_dir / f'mt_parcel_city{lg_code:06d}.csv.zip'

	if not os.path.exists(TARGET_DIR / 'parcel' / f'mt_parcel_city{lg_code:06d}.csv'):
	response = requests.get(parcel_url)
	if response.status_code == 200: # URLが存在する場合のみ処理を続ける
	with open(zip_file_path, 'wb') as f:
	f.write(response.content)
	with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
	zip_ref.extractall(TARGET_DIR / 'parcel')
	time.sleep(0.2) # ダウンロードごとに200msのスリープを入れる

	# ------------------------------------
	# Download mt_rsdtdsp_rsdt_prefXX.csv
	# ------------------------------------
	pref_codes = list(set([('%06d' % lg_code)[0:2] for lg_code in lg_codes]))

	for pref_code in tqdm(pref_codes):
	rsdt_url = f'https://catalog.registries.digital.go.jp/rsc/address/mt_rsdtdsp_rsdt_pref{pref_code}.csv.zip'
	zip_file_path = temp_dir / f'mt_rsdtdsp_rsdt_pref{pref_code}.csv.zip'

	if not os.path.exists(TARGET_DIR / 'rsdt' / f'mt_rsdtdsp_rsdt_pref{pref_code}.csv'):
	response = requests.get(rsdt_url)
	if response.status_code == 200: # URLが存在する場合のみ処理を続ける
	with open(zip_file_path, 'wb') as f:
	f.write(response.content)
	with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
	zip_ref.extractall(TARGET_DIR / 'rsdt')
	time.sleep(0.2) # ダウンロードごとに200msのスリープを入れる

	# 一時ディレクトリを削除
	for file in temp_dir.iterdir():
	file.unlink()
	temp_dir.rmdir()

	# =========================
	# Utilitiy functions
	# =========================
	@contextmanager
	def measure(label="処理"):
	start = time.time()
	yield
	end = time.time()
	print(f"{label} 実行時間: {end - start:.6f} 秒")

	def get_spelling(query_address):
	# APIリクエストを作成
	url = f'https://www.googleapis.com/customsearch/v1?key={GOOGLE_SEARCH_API_KEY}&cx={GOOGLE_SEARCH_ENGINE_ID}&q={query_address}'

	# リクエストを送信
	response = requests.get(url)
	results = response.json()

	return results.get('spelling', {}).get('correctedQuery', '')

	def convert_zenkaku_to_hankaku(text):
	zenkaku_numbers = '０１２３４５６７８９'
	hankaku_numbers = '0123456789'
	zenkaku_hyphens = '－'
	hankaku_hyphens = '-'

	translation_table = str.maketrans(zenkaku_numbers + zenkaku_hyphens, hankaku_numbers + hankaku_hyphens)
	return text.translate(translation_table)

	ADDRESS_REGEX = re.compile(
	r'^'
	r'(?P<address>'
	r'.+?[都道府県]' # 都道府県
	r'.+?[市区町村]' # 市区町村
	r'.*?' # 町名など（最小マッチ）
	r'[0-9０-９]+' # 番地の先頭数字
	r'(?:[-ー−–][0-9０-９]+)*' # 「-数字」の繰返し
	r'(?:(?:丁目\|番地\|番\|号)' # 「丁目」「番地」「番」「号」
	r'(?:[0-9０-９]+' # のあとに続く数字
	r'(?:[-ー−–][0-9０-９]+)*' # 「-数字」の繰返し
	r')?'
	r')*' # 上記ユニットを何度でも繰返し
	r')'
	r'(?P<building>.*)' # 残りを建物名としてキャプチャ
	r'$'
	)

	def split_address_building(address: str) -> dict:
	m = ADDRESS_REGEX.match(address)
	if not m:
	return {'address': address, 'building': ''}
	return {
	'address': m.group('address').strip(),
	'building': m.group('building').strip()
	}

	def split_address_building_with_gpt(query_address: str) -> dict:
	class SplittedAddress(BaseModel):
	address: str
	building: str

	response = client.responses.parse(
	model="gpt-4o-mini",
	input=[
	{"role": "system", "content": "Extract the event information."},
	{
	"role": "user",
	"content": f"与えられた住所をaddressとbuildingに分けろ：{query_address}",
	},
	],
	text_format=SplittedAddress,
	)

	response = response.output_parsed

	return {
	'address': response.address,
	'building': response.building,
	}


	def split_address(normalized_address):
	splits = normalize(normalized_address)
	return splits

	def compare(normalized_address1, normalized_address2):
	split1 = split_address(normalized_address1)
	split2 = split_address(normalized_address2)

	result = {
	'pref': False,
	'city': False,
	'town': False,
	'addr': False,
	}

	for key in result.keys():
	if split1[key] == split2[key]:
	result[key] = True

	return all(result.values())

	def vector_search(query_address, top_k):
	wait_time = 30
	max_retries = 5
	for attempt in range(max_retries):
	try:
	with measure('vector_search - embed_via_multilingual_e5_large'):
	query_embeds = embed_via_multilingual_e5_large([query_address])
	break # 成功した場合はループを抜ける

	except InferenceEndpointError as e:
	if e.code == InferenceEndpointErrorCode.SERVICE_UNAVAILABLE:
	if attempt < max_retries - 1:
	gr.Warning(f"{InferenceEndpointErrorCode.SERVICE_UNAVAILABLE}: 埋め込みモデルの推論エンドポイントが起動中です。{wait_time}秒後にリトライします。", duration=wait_time)
	time.sleep(wait_time) # 30秒待機
	else:
	raise gr.Error(f"{InferenceEndpointErrorCode.SERVICE_UNAVAILABLE}: 最大リトライ回数に達しました。しばらくしてから再度実行してみてください。")

	elif e.code == InferenceEndpointErrorCode.INVALID_STATE:
	raise gr.Error(f"{InferenceEndpointErrorCode.INVALID_STATE}: 埋め込みモデルの推論エンドポイントが停止中です。再起動するよう管理者に問い合わせてください。")

	elif e.code == InferenceEndpointErrorCode.UNKNOWN_ERROR:
	raise gr.Error(f"{InferenceEndpointErrorCode.UNKNOWN_ERROR}: {e.message}")

	with measure('vector_search - search_via_milvus'):
	hits = search_via_milvus(query_embeds[0], top_k, VECTOR_SEARCH_COLLECTION_NAME)
	return hits

	def replace_circle(input_text):
	output_text = input_text.replace('◯', '0')
	return output_text

	def remove_filler(input_text: str) -> str:
	"""
	GiNZAを用いて日本語テキストからフィラーを除去する関数。

	Parameters:
	text (str): 入力テキスト。

	Returns:
	str: フィラーを除去したテキスト。
	"""
	# GiNZAモデルの読み込み
	nlp = spacy.load("ja_ginza")

	# テキストの解析
	doc = nlp(input_text)

	# フィラーを除去したテキストの生成
	cleaned_text = ''.join([token.text for token in doc if token.tag_ != "感動詞-フィラー"])

	return cleaned_text

	def remove_left_of_pref(text):
	for pref in prefs:
	pref_index = text.find(pref)
	if pref_index != -1:
	return text[pref_index:] # 都道府県名の位置から右側の文字列を返す
	return text # 都道府県名が見つからない場合は元のテキストを返す

	def preprocess(input_text):
	output_text = remove_left_of_pref(input_text)
	output_text = replace_circle(output_text)
	output_text = remove_filler(output_text)
	return output_text

	class InferenceEndpointErrorCode(Enum):
	INVALID_STATE = 400
	SERVICE_UNAVAILABLE = 503
	UNKNOWN_ERROR = 520

	class InferenceEndpointError(Exception):
	def __init__(self, code: InferenceEndpointErrorCode, message="エラー"):
	self.code = code
	self.message = message
	super().__init__(self.message)

	def embed_via_multilingual_e5_large(query_addresses):
	headers = {
	"Accept": "application/json",
	"Authorization": f"Bearer {HUGGING_FACE_TOKEN}",
	"Content-Type": "application/json"
	}

	all_responses = []
	for i in range(0, len(query_addresses), 2048):
	chunk = query_addresses[i:i + 2048]
	response = requests.post(EMBEDDING_MODEL_ENDPOINT, headers=headers, json={"inputs": chunk})
	response_json = response.json()

	if 'error' in response_json:
	if response_json['error'] == 'Bad Request: Invalid state':
	raise InferenceEndpointError(InferenceEndpointErrorCode.INVALID_STATE, "Bad Request: Invalid state")
	elif response_json['error'] == '503 Service Unavailable':
	raise InferenceEndpointError(InferenceEndpointErrorCode.SERVICE_UNAVAILABLE, "Service Unavailable")
	else:
	raise InferenceEndpointError(InferenceEndpointErrorCode.UNKNOWN_ERROR, response_json['error'])

	all_responses.extend(response_json)

	return all_responses

	def search_via_milvus(query_vector, top_k, collection_name, thresh=0.0):
	search_params = {"metric_type": "COSINE", "params": {"nprobe": 10}} # MiniLM系はCOSINE推奨

	results = MILVUS_CLIENT.search(
	collection_name=collection_name,
	data=[query_vector],
	search_params=search_params,
	limit=top_k,
	anns_field='embedding',
	output_fields=['address', 'pref', 'county', 'city', 'ward', 'oaza_cho', 'chome', 'koaza'],
	)[0]

	hits = []
	for i, result in enumerate(results, start=1):
	distance = result['distance']
	address = result['entity'].get('address')
	pref = result['entity'].get('pref')
	county = result['entity'].get('county')
	city = result['entity'].get('city')
	ward = result['entity'].get('ward')
	oaza_cho = result['entity'].get('oaza_cho')
	chome = result['entity'].get('chome')
	koaza = result['entity'].get('koaza')

	if distance >= thresh:
	hits.append([i, distance, address, pref, county, city, ward, oaza_cho, chome, koaza])

	return hits

	def get_lg_code(pref, county, city, ward):
	city_all_file = TARGET_DIR / 'mt_city_all.csv'
	city_all_df = pd.read_csv(city_all_file)
	city_all_df_temp = city_all_df[city_all_df['pref'] == pref]
	city_name1 = city_all_df_temp['county'].fillna('') + city_all_df_temp['city'].fillna('') + city_all_df_temp['ward'].fillna('')
	city_name2 = county + city + ward
	lg_codes = city_all_df_temp[city_name1 == city_name2]['lg_code'].values
	if len(lg_codes) > 1:
	raise Exception('Too many lg_code')
	return lg_codes[0]

	def get_addresses_with_parcel(pref, county, city, ward, oaza_cho, chome, koaza):
	lg_code = get_lg_code(pref, county, city, ward)
	parcel_city_file = TARGET_DIR / 'parcel' / f'mt_parcel_city{lg_code:06d}.csv'
	if not os.path.exists(parcel_city_file):
	raise gr.Error('Not found: ', parcel_city_file)
	parcel_city_df = pd.read_csv(parcel_city_file)

	cities = parcel_city_df['city'].fillna('')
	wards = parcel_city_df['ward'].fillna('')
	oaza_chos = parcel_city_df['oaza_cho'].fillna('')
	chomes = parcel_city_df['chome'].fillna('')
	koazas = parcel_city_df['koaza'].fillna('')

	city_name1 = cities + wards
	city_name2 = county + city + ward
	city_mask = city_name1 == city_name2

	town_name1 = oaza_chos + chomes
	town_name2 = oaza_cho + chome
	town_mask = town_name1 == town_name2

	koaza_mask = koazas == koaza
	parcel_city_df_filtered = parcel_city_df[city_mask & town_mask & koaza_mask]

	if len(parcel_city_df_filtered) == 0:
	return [pref + county + city + ward + oaza_cho + chome + koaza]

	cities = parcel_city_df_filtered['city'].fillna('')
	wards = parcel_city_df_filtered['ward'].fillna('')
	oaza_chos = parcel_city_df_filtered['oaza_cho'].fillna('')
	chomes = parcel_city_df_filtered['chome'].fillna('')
	koazas = parcel_city_df_filtered['koaza'].fillna('')
	prc_num1s = parcel_city_df_filtered['prc_num1'].fillna(9999).astype(int).astype(str).replace('9999', '')
	prc_num2s = parcel_city_df_filtered['prc_num2'].fillna(9999).astype(int).astype(str).replace('9999', '')
	prc_num3s = parcel_city_df_filtered['prc_num3'].fillna(9999).astype(int).astype(str).replace('9999', '')

	# アドレスを生成
	return [
	f"{pref}{_city}{_ward}{_oaza_cho}{_chome}{_koaza}{_prc_num1}" +
	(f"-{_prc_num2}" if _prc_num2 else '') +
	(f"-{_prc_num3}" if _prc_num3 else '')
	for _city, _ward, _oaza_cho, _chome, _koaza, _prc_num1, _prc_num2, _prc_num3 in zip(
	cities, wards, oaza_chos, chomes, koazas, prc_num1s, prc_num2s, prc_num3s
	)
	]
	def get_pref_code(pref):
	return prefs.index(pref) + 1

	def get_addresses_with_rsdtdsp(pref, county, city, ward, oaza_cho, chome, koaza):
	pref_code = get_pref_code(pref)
	rsdtdsp_file = TARGET_DIR / 'rsdt' / f'mt_rsdtdsp_rsdt_pref{pref_code:02d}.csv'
	if not os.path.exists(rsdtdsp_file):
	raise gr.Error(f'Not found: {rsdtdsp_file}')
	rsdtdsp_df = pd.read_csv(rsdtdsp_file)

	city_name1 = rsdtdsp_df['city'].fillna('') + rsdtdsp_df['ward'].fillna('')
	city_name2 = county + city + ward
	city_mask = city_name1 == city_name2

	town_name1 = rsdtdsp_df['oaza_cho'].fillna('') + rsdtdsp_df['chome'].fillna('')
	town_name2 = oaza_cho + chome
	town_mask = town_name1 == town_name2

	koaza_mask = rsdtdsp_df['koaza'].fillna('') == koaza

	rsdtdsp_df_filtered = rsdtdsp_df[city_mask & town_mask & koaza_mask]

	if len(rsdtdsp_df_filtered) == 0:
	return [pref + county + city + ward + oaza_cho + chome + koaza]

	cities = rsdtdsp_df_filtered['city'].fillna('')
	wards = rsdtdsp_df_filtered['ward'].fillna('')
	oaza_chos = rsdtdsp_df_filtered['oaza_cho'].fillna('')
	chomes = rsdtdsp_df_filtered['chome'].fillna('')
	koazas = rsdtdsp_df_filtered['koaza'].fillna('')
	blk_nums = rsdtdsp_df_filtered['blk_num'].fillna(9999).astype(int).astype(str).replace('9999', '')
	rsdt_nums = rsdtdsp_df_filtered['rsdt_num'].fillna(9999).astype(int).astype(str).replace('9999', '')
	rsdt_num2s = rsdtdsp_df_filtered['rsdt_num2'].fillna(9999).astype(int).astype(str).replace('9999', '')

	# アドレスを生成
	return [
	f"{pref}{_city}{_ward}{_oaza_cho}{_chome}{_koaza}{_blk_num}" +
	(f"-{_rsdt_num}" if _rsdt_num else '') +
	(f"-{_rsdt_num2}" if _rsdt_num2 else '')
	for _city, _ward, _oaza_cho, _chome, _koaza, _blk_num, _rsdt_num, _rsdt_num2 in zip(
	cities, wards, oaza_chos, chomes, koazas, blk_nums, rsdt_nums, rsdt_num2s)
	]

	def compare_two_addresses(address1, address2):
	preprocessed1 = preprocess(address1)
	preprocessed2 = preprocess(address2)
	hits1 = vector_search(preprocessed1, top_k=1)
	hits2 = vector_search(preprocessed2, top_k=1)
	normalized1 = hits1[0][-1]
	normalized2 = hits2[0][-1]
	result = compare(normalized1, normalized2)
	return result

	def normalize_address(query_address):
	with measure('convert_zenkaku_to_hankaku'):
	query_address = convert_zenkaku_to_hankaku(query_address)
	with measure('split_address_building_with_gpt'):
	splitted = split_address_building_with_gpt(query_address)
	with measure('preprocess'):
	preprocessed = preprocess(splitted['address'])
	with measure('vector_search'):
	hits = vector_search(preprocessed, 1)
	with measure('split_address'):
	splits = {
	'pref': hits[0][3],
	'county': hits[0][4],
	'city': hits[0][5],
	'ward': hits[0][6],
	'oaza_cho': hits[0][7],
	'chome': hits[0][8],
	'koaza': hits[0][9],
	}
	with measure('get_addresses_with_parcel'):
	addresses = get_addresses_with_parcel(
	splits['pref'], splits['county'], splits['city'], splits['ward'],
	splits['oaza_cho'], splits['chome'], splits['koaza'])
	with measure('get_addresses_with_rsdtdsp'):
	addresses += get_addresses_with_rsdtdsp(
	splits['pref'], splits['county'], splits['city'], splits['ward'],
	splits['oaza_cho'], splits['chome'], splits['koaza'])
	addresses = list(set(addresses)) # 重複を除去
	with measure('embed_via_multilingual_e5_large'):
	embeds = embed_via_multilingual_e5_large([splitted['address']] + addresses)
	query_embed = [embeds[0]]
	address_embeds = embeds[1:]
	with measure('cosine'):
	# コサイン類似度を計算
	similarities = cosine_similarity(query_embed, address_embeds)

	best_match_indices = np.argsort(similarities[0])[-1:][::-1] # 上位Kのインデックスを取得
	best_addresses = [addresses[i] for i in best_match_indices]

	best_address = best_addresses[0]
	return best_address + splitted['building']

	def convert_no_to_hyphen(query_address):
	return re.sub(r'(?<=\d)の(?=\d)', '-', query_address)

	def normalize_address_v2(query_address, top_k=1):
	with measure('convert_zenkaku_to_hankaku'):
	query_address = convert_zenkaku_to_hankaku(query_address)
	with measure('split_address_building_with_gpt'):
	splitted = split_address_building_with_gpt(query_address)
	with measure('get_spelling'):
	spelling = get_spelling(splitted['address'])
	if spelling:
	splitted['address'] = spelling
	with measure(''):
	splitted['address'] = convert_no_to_hyphen(splitted['address'])
	with measure('preprocess'):
	preprocessed = preprocess(splitted['address'])
	with measure('vector_search'):
	hits = vector_search(preprocessed, 1)
	with measure('split_address'):
	splits = {
	'pref': hits[0][3],
	'county': hits[0][4],
	'city': hits[0][5],
	'ward': hits[0][6],
	'oaza_cho': hits[0][7],
	'chome': hits[0][8],
	'koaza': hits[0][9],
	}
	with measure('get_addresses_with_parcel'):
	addresses = get_addresses_with_parcel(
	splits['pref'], splits['county'], splits['city'], splits['ward'],
	splits['oaza_cho'], splits['chome'], splits['koaza'])
	with measure('get_addresses_with_rsdtdsp'):
	addresses += get_addresses_with_rsdtdsp(
	splits['pref'], splits['county'], splits['city'], splits['ward'],
	splits['oaza_cho'], splits['chome'], splits['koaza'])
	addresses = list(set(addresses)) # 重複を除去
	with measure('embed_via_multilingual_e5_large'):
	embeds = embed_via_multilingual_e5_large([splitted['address']] + addresses)
	query_embed = [embeds[0]]
	address_embeds = embeds[1:]
	with measure('cosine'):
	# コサイン類似度を計算
	similarities = cosine_similarity(query_embed, address_embeds)

	best_match_indices = np.argsort(similarities[0])[-top_k:][::-1] # 上位Kのインデックスを取得
	best_addresses = [addresses[i] for i in best_match_indices]
	best_similarities = similarities[0][best_match_indices]

	return splitted, hits, splits, best_addresses, best_similarities


	# =========================
	# FastAPI definition
	# =========================
	from fastapi import FastAPI
	from pydantic import BaseModel, Field
	from typing import Literal

	app = FastAPI(
	title="住所処理API",
	description="住所の正規化・比較を行うAPIです。",
	version="1.0.0"
	)

	# ---------------------------
	# リクエスト・レスポンス定義
	# ---------------------------
	class CompareAddressesRequest(BaseModel):
	address1: str = Field(..., description="比較する最初の住所", example="東京墨田区押上１丁目１-1")
	address2: str = Field(..., description="比較する2番目の住所", example="東京墨田区押上 1-１-1")

	class CompareAddressesResponse(BaseModel):
	result: Literal[True, False] = Field(..., description="比較結果", example=True)

	class NormalizeAddressRequest(BaseModel):
	query_address: str = Field(..., description="正規化する住所", example="東京墨田区押上１丁目１-1")

	class NormalizeAddressResponse(BaseModel):
	normalized: str = Field(..., description="正規化された住所", example="東京都墨田区押上一丁目1-1")


	# ---------------------------
	# エンドポイント定義
	# ---------------------------
	@app.post(
	"/compare-two-addresses",
	response_model=CompareAddressesResponse,
	summary="2つの住所を比較する",
	description="2つの住所を比較し、一致するかどうかを返します。",
	responses={
	200: {
	"description": "比較結果を Bool 値 (true/false) として返す",
	"content": {
	"application/json": {
	"example": {
	"result": True
	}
	}
	}
	}
	}
	)
	async def compare_two_addresses_api(request: CompareAddressesRequest):
	"""
	- address1: 比較する最初の住所
	- address2: 比較する2番目の住所
	"""
	result = compare_two_addresses(request.address1, request.address2)
	return {"result": result}

	@app.post(
	"/normalize-address",
	response_model=NormalizeAddressResponse,
	summary="住所を正規化する",
	description="指定された住所を正規化し、正規化後の住所を返します。",
	responses={
	200: {
	"description": "正規化結果の返却",
	"content": {
	"application/json": {
	"example": {
	"normalized": "東京都千代田区一丁目1番"
	}
	}
	}
	}
	}
	)
	async def normalize_address_api(request: NormalizeAddressRequest):
	"""
	- query_address: 正規化する住所
	"""
	normalized = normalize_address(request.query_address)
	return {"normalized": normalized}

	@app.post(
	"/normalize-address-v2",
	response_model=NormalizeAddressResponse,
	summary="住所を正規化する",
	description="指定された住所を正規化し、正規化後の住所を返します。",
	responses={
	200: {
	"description": "正規化結果の返却",
	"content": {
	"application/json": {
	"example": {
	"normalized": "東京都千代田区一丁目1番"
	}
	}
	}
	}
	}
	)
	async def normalize_address_v2_api(request: NormalizeAddressRequest):
	"""
	- query_address: 正規化する住所
	"""
	_, __, ___, bests, _____ = normalize_address_v2(request.query_address)
	return {"normalized": bests[0]}


	# =========================
	# Gradio tabs definition
	# =========================
	examples = [
	'東京都中央区みなと3の12の10、プレサンスロゼ東京港301。',
	'東京都荒川区１−5−6荒川マンション102',
	'福岡市中央区天神1の11の2',
	'私の住所は京都府京都市右京区太秦青木元町４－１０です。',
	'京都府京都市右京区太秦青木元町４－１０',
	'京都府京都市右京区太秦青木元町４－１０ダックス101号室',
	'京都府宇治市伊勢田町名木１－１－４ダックス101号室',
	'東京都渋谷区道玄坂1-12-1',
	'私の住所は東京都渋谷区道玄坂1-12-1です。',
	'私の住所は東京都しぶや道玄坂1の12の1です。',
	'東京都渋谷区道玄坂1の12の1で契約しています。',
	'秋田県秋田市山王四丁目1番1号です。',
	'東京墨田区押上１丁目１',
	'三重県伊勢市宇治館町',
	'住所は 030-0803 青森県青森市安方１丁目１−４０になります。',
	'東京都大島町差木地字クダッチ',
	'前橋市大手町１丁目1番地１',
	'東京都渋谷区表参道の3の5の6。',
	'琉球圏尾張町3の5の6に住んでます。',
	'3254987の場所です。',
	'大阪府でした。',
	'1940923の東京都渋谷区道玄坂一丁目。渋谷マークシティウェスト23階です。',
	'名前は山田太郎です。',
	'はい。名古屋、あ、愛知県名古屋市南里2の3の4だと思います。',
	'ー',
	'少し待ってください。',
	]

	def create_function_test_tab():
	def create_remove_left_of_pref_tab():
	with gr.Tab("remove_left_of_pref"):
	in_tb = gr.Textbox(label='インプット', placeholder='テキストを入力してください')
	gr.Examples(examples=examples, inputs=[in_tb])
	out_tb = gr.Textbox(label='アウトプット')
	exe_button = gr.Button(value='実行', variant='primary')
	exe_button.click(
	fn=remove_left_of_pref,
	inputs=[in_tb],
	outputs=[out_tb],
	)
	def create_replace_circle_tab():
	with gr.Tab("replace_circle"):
	in_tb = gr.Textbox(label='インプット', placeholder='テキストを入力してください')
	gr.Examples(examples=examples, inputs=[in_tb])
	out_tb = gr.Textbox(label='アウトプット')
	exe_button = gr.Button(value='実行', variant='primary')
	exe_button.click(
	fn=replace_circle,
	inputs=[in_tb],
	outputs=[out_tb],
	)
	def create_remove_filler_tab():
	with gr.Tab("remove_filler"):
	in_tb = gr.Textbox(label='インプット', placeholder='テキストを入力してください')
	gr.Examples(examples=examples, inputs=[in_tb])
	out_tb = gr.Textbox(label='アウトプット')
	exe_button = gr.Button(value='実行', variant='primary')
	exe_button.click(
	fn=remove_filler,
	inputs=[in_tb],
	outputs=[out_tb],
	)
	def create_preprocess_tab():
	with gr.Tab("preprocess"):
	in_tb = gr.Textbox(label='インプット', placeholder='テキストを入力してください')
	gr.Examples(examples=examples, inputs=[in_tb])
	out_tb = gr.Textbox(label='アウトプット')
	exe_button = gr.Button(value='実行', variant='primary')
	exe_button.click(
	fn=preprocess,
	inputs=[in_tb],
	outputs=[out_tb],
	)
	def create_compare_two_addresses_tab():
	with gr.Tab("compare_two_addresses"):
	in_tb1 = gr.Textbox(label='住所1 (顧客が発言した住所)', placeholder='住所を入力してください')
	gr.Examples(examples=examples, inputs=[in_tb1])
	in_tb2 = gr.Textbox(label='住所2 (CRM 内に格納されている住所)', placeholder='住所を入力してください')
	out_tb = gr.Textbox(label='アウトプット')
	exe_button = gr.Button(value='実行', variant='primary')
	exe_button.click(
	fn=compare_two_addresses,
	inputs=[in_tb1, in_tb2],
	outputs=[out_tb],
	)
	def create_normalize_address_tab():
	with gr.Tab("normalize_address"):
	in_tb = gr.Textbox(label='住所', placeholder='住所を入力してください')
	gr.Examples(examples=examples, inputs=[in_tb])
	out_tb = gr.Textbox(label='アウトプット')
	exe_button = gr.Button(value='実行', variant='primary')
	exe_button.click(
	fn=normalize_address,
	inputs=[in_tb],
	outputs=[out_tb],
	)
	def create_normalize_address__v2_tab():
	with gr.Tab("normalize_address_v2"):
	in_tb = gr.Textbox(label='住所', placeholder='住所を入力してください')
	gr.Examples(examples=examples, inputs=[in_tb])
	out_tb = gr.Textbox(label='アウトプット')
	exe_button = gr.Button(value='実行', variant='primary')

	def f(query_address):
	splitted, __, ___, bests, _____ = normalize_address_v2(query_address)
	return bests[0] + splitted['building']

	exe_button.click(
	fn=f,
	inputs=[in_tb],
	outputs=[out_tb],
	)
	def create_split_address_tab():
	with gr.Tab("split_address"):
	in_tb = gr.Textbox(label='住所', placeholder='住所を入力してください')
	gr.Examples(examples=examples, inputs=[in_tb])
	out_tb = gr.Textbox(label='アウトプット')
	exe_button = gr.Button(value='実行', variant='primary')
	exe_button.click(
	fn=split_address,
	inputs=[in_tb],
	outputs=[out_tb],
	)
	def create_split_address_building_tab():
	with gr.Tab("split_address_building_with_gpt"):
	in_tb = gr.Textbox(label='住所', placeholder='住所を入力してください')
	gr.Examples(examples=examples, inputs=[in_tb])
	out_tb = gr.Textbox(label='アウトプット')
	exe_button = gr.Button(value='実行', variant='primary')
	exe_button.click(
	fn=split_address_building_with_gpt,
	inputs=[in_tb],
	outputs=[out_tb],
	)
	def create_convert_zenkaku_to_hankaku_tab():
	with gr.Tab("convert_zenkaku_to_hankaku"):
	in_tb = gr.Textbox(label='住所', placeholder='住所を入力してください')
	gr.Examples(examples=examples, inputs=[in_tb])
	out_tb = gr.Textbox(label='アウトプット')
	exe_button = gr.Button(value='実行', variant='primary')
	exe_button.click(
	fn=convert_zenkaku_to_hankaku,
	inputs=[in_tb],
	outputs=[out_tb],
	)
	def create_vector_search():
	def f(query_address, top_k):
	with measure('preprocess'):
	preprocessed = preprocess(query_address)
	with measure('vector_search'):
	hits = vector_search(preprocessed, top_k=top_k)
	return pd.DataFrame(hits, columns=['Top-k', '類似度', '住所', '都道府県', '郡', '市区町村', '政令市区', '大字・町', '丁目', '小字'])
	with gr.Tab("vector_search"):
	in_tb = gr.Textbox(label='住所', placeholder='住所を入力してください')
	gr.Examples(examples=examples, inputs=[in_tb])
	top_k_input = gr.Slider(minimum=1, maximum=100, step=1, value=5, label='検索数top-k')
	out_df = gr.Dataframe(label="アウトプット", wrap=True)
	exe_button = gr.Button(value='実行', variant='primary')
	exe_button.click(
	fn=f,
	inputs=[in_tb, top_k_input],
	outputs=[out_df],
	)
	def create_get_addresses_with_parcel_tab():
	def f(query_address):
	with measure('preprocess'):
	preprocessed = preprocess(query_address)
	with measure('vector_search'):
	hits = vector_search(preprocessed, top_k=1)
	with measure('split_address'):
	splits = {
	'pref': hits[0][3],
	'county': hits[0][4],
	'city': hits[0][5],
	'ward': hits[0][6],
	'oaza_cho': hits[0][7],
	'chome': hits[0][8],
	'koaza': hits[0][9],
	}
	with measure('get_addresses_with_parcel'):
	addresses = get_addresses_with_parcel(
	splits['pref'], splits['county'], splits['city'], splits['ward'],
	splits['oaza_cho'], splits['chome'], splits['koaza'])
	return pd.DataFrame(addresses, columns=['住所'])
	with gr.Tab("get_addresses_with_parcel"):
	in_tb = gr.Textbox(label='住所', placeholder='住所を入力してください')
	gr.Examples(examples=examples, inputs=[in_tb])
	out_df = gr.Dataframe(label="アウトプット", wrap=True)
	exe_button = gr.Button(value='実行', variant='primary')
	exe_button.click(
	fn=f,
	inputs=[in_tb],
	outputs=[out_df],
	)
	def create_get_spelling_tab():
	with gr.Tab("create_get_spelling_tab"):
	in_tb = gr.Textbox(label='住所', placeholder='住所を入力してください')
	gr.Examples(examples=examples, inputs=[in_tb])
	out_tb = gr.Textbox(label='アウトプット')
	exe_button = gr.Button(value='実行', variant='primary')
	exe_button.click(
	fn=get_spelling,
	inputs=[in_tb],
	outputs=[out_tb],
	)

	with gr.Tab("関数テスト"):
	create_normalize_address_tab()
	create_normalize_address__v2_tab()
	create_compare_two_addresses_tab()
	create_get_spelling_tab()
	create_get_addresses_with_parcel_tab()
	create_vector_search()
	create_remove_left_of_pref_tab()
	create_replace_circle_tab()
	create_remove_filler_tab()
	create_preprocess_tab()
	create_split_address_tab()
	create_split_address_building_tab()
	create_convert_zenkaku_to_hankaku_tab()

	def create_digital_agency_tab():
	with gr.Tab("デジ庁API"):
	with gr.Row():
	with gr.Column():
	address_input_tab2 = gr.Textbox(label='住所', placeholder='検索したい住所を入力してください')
	gr.Examples(examples=examples, inputs=[address_input_tab2])
	search_button_tab2 = gr.Button(value='検索', variant='primary')
	result_tb = gr.Textbox(label='正規化後')
	result_df = gr.Dataframe(label="正規化後（分割）", wrap=True)

	def normalize_address_via_abrg_geocode(query_address):
	query_address = preprocess(query_address)

	url = f'{ABRG_ENDPOINT}/geocode?address={query_address}'
	response = requests.get(url)
	result = response.json()[0]['result']

	normalized = result['output']
	data = {
	'pref': result['pref'],
	'county': result['county'],
	'city': result['city'],
	'ward': result['ward'],
	'oaza_cho': result['oaza_cho'],
	'chome': result['chome'],
	'koaza': result['koaza'],
	'blk_num': result['blk_num'],
	'rsdt_num': result['rsdt_num'],
	'rsdt_num2': result['rsdt_num2'],
	'prc_num1': result['prc_num1'],
	'prc_num2': result['prc_num2'],
	'prc_num3': result['prc_num3'],
	'others': ''.join(result['others'])
	}
	df = pd.DataFrame([data])

	return normalized, df

	search_button_tab2.click(
	fn=normalize_address_via_abrg_geocode,
	inputs=[address_input_tab2],
	outputs=[result_tb, result_df],
	)

	def create_vector_search_tab():
	with gr.Tab("ベクトル検索"):
	with gr.Row():
	with gr.Column():
	address_input = gr.Textbox(label='住所', placeholder='検索したい住所を入力してください')
	gr.Examples(examples=examples, inputs=[address_input])
	top_k_input = gr.Slider(minimum=1, maximum=100, step=1, value=5, label='検索数top-k')
	search_button = gr.Button(value='検索', variant='primary')
	result_tb = gr.Textbox(label='正規化後')
	result_df = gr.Dataframe(label="正規化後（分割）", wrap=True)
	search_result_df = gr.Dataframe(label="町丁目まで検索結果")
	chiban_result_df = gr.Dataframe(label="地番・住居表示検索結果")

	def search_address(query_address, top_k):
	with measure('convert_zenkaku_to_hankaku'):
	query_address = convert_zenkaku_to_hankaku(query_address)
	with measure('split_address_building_with_gpt'):
	splitted = split_address_building_with_gpt(query_address)
	with measure('preprocess'):
	preprocessed = preprocess(splitted['address'])
	with measure('vector_search'):
	hits = vector_search(preprocessed, top_k)
	search_result_df = pd.DataFrame(hits, columns=['Top-k', '類似度', '住所', '都道府県', '郡', '市区町村', '政令市区', '大字・町', '丁目', '小字'])
	with measure('split_address'):
	splits = {
	'pref': hits[0][3],
	'county': hits[0][4],
	'city': hits[0][5],
	'ward': hits[0][6],
	'oaza_cho': hits[0][7],
	'chome': hits[0][8],
	'koaza': hits[0][9],
	}
	result_df = pd.DataFrame([splits.values()], columns=splits.keys())
	with measure('get_addresses_with_parcel'):
	addresses = get_addresses_with_parcel(
	splits['pref'], splits['county'], splits['city'], splits['ward'],
	splits['oaza_cho'], splits['chome'], splits['koaza'])
	with measure('get_addresses_with_rsdtdsp'):
	addresses += get_addresses_with_rsdtdsp(
	splits['pref'], splits['county'], splits['city'], splits['ward'],
	splits['oaza_cho'], splits['chome'], splits['koaza'])
	addresses = list(set(addresses)) # 重複を除去
	with measure('embed_via_multilingual_e5_large'):
	embeds = embed_via_multilingual_e5_large([splitted['address']] + addresses)
	query_embed = [embeds[0]]
	address_embeds = embeds[1:]
	with measure('cosine'):
	# コサイン類似度を計算
	similarities = cosine_similarity(query_embed, address_embeds)

	best_match_indices = np.argsort(similarities[0])[-top_k:][::-1] # 上位Kのインデックスを取得
	best_addresses = [addresses[i] for i in best_match_indices]
	best_similarities = similarities[0][best_match_indices]
	print(top_k)
	print('len(best_similarities)', len(best_similarities))
	print('len(best_addresses)', len(best_addresses))

	chiban_result_df = pd.DataFrame({
	'Top-k': range(1, len(best_similarities) + 1),
	'類似度': best_similarities,
	'住所': [best_address + splitted['building'] for best_address in best_addresses]
	})

	best_address = best_addresses[0] + splitted['building']

	return search_result_df, chiban_result_df, best_address, result_df

	search_button.click(
	fn=search_address,
	inputs=[address_input, top_k_input],
	outputs=[search_result_df, chiban_result_df, result_tb, result_df],
	)

	def create_vector_search_v2_tab():
	with gr.Tab("ベクトル検索V2"):
	with gr.Row():
	with gr.Column():
	address_input = gr.Textbox(label='住所', placeholder='検索したい住所を入力してください')
	gr.Examples(examples=examples, inputs=[address_input])
	top_k_input = gr.Slider(minimum=1, maximum=100, step=1, value=5, label='検索数top-k')
	search_button = gr.Button(value='検索', variant='primary')
	result_tb = gr.Textbox(label='正規化後')
	result_df = gr.Dataframe(label="正規化後（分割）", wrap=True)
	search_result_df = gr.Dataframe(label="町丁目まで検索結果")
	chiban_result_df = gr.Dataframe(label="地番・住居表示検索結果")

	def search_address(query_address, top_k):
	splitted, hits, splits, best_addresses, best_similarities = normalize_address_v2(query_address, top_k)
	search_result_df = pd.DataFrame(hits, columns=['Top-k', '類似度', '住所', '都道府県', '郡', '市区町村', '政令市区', '大字・町', '丁目', '小字'])
	result_df = pd.DataFrame([splits.values()], columns=splits.keys())
	chiban_result_df = pd.DataFrame({
	'Top-k': range(1, len(best_similarities) + 1),
	'類似度': best_similarities,
	'住所': [best_address + splitted['building'] for best_address in best_addresses]
	})
	best_address = best_addresses[0] + splitted['building']

	return search_result_df, chiban_result_df, best_address, result_df

	search_button.click(
	fn=search_address,
	inputs=[address_input, top_k_input],
	outputs=[search_result_df, chiban_result_df, result_tb, result_df],
	)

	with gr.Blocks() as demo:
	create_function_test_tab()
	create_vector_search_tab()
	create_vector_search_v2_tab()
	create_digital_agency_tab()

	app = gr.mount_gradio_app(app, demo, path='/')