Spaces:

AtPeak
/

japanese-address-search-v2

Sleeping

App Files Files Community

matsuap committed on May 1

Commit

05dae99

1 Parent(s): c826c92

住所取得機能を追加し、都道府県、郡、市区町村、政令市区、大字・町、丁目、小字を基にアドレスを生成するロジックを実装。重複アドレスの除去機能を強化し、コサイン類似度を用いた検索結果の精度を向上。

Browse files

Files changed (1) hide show

app.py +189 -98

app.py CHANGED Viewed

@@ -261,6 +261,150 @@ def search_via_milvus(query_vector, top_k, collection_name, thresh=0.0):
     return hits
 # =========================
 #  FastAPI definition
 # =========================
@@ -282,8 +426,39 @@ def normalize_address(query_address):
     with measure('preprocess'):
         preprocessed = preprocess(query_address)
     with measure('vector_search'):
-        result = vector_search(preprocessed, top_k=1)[0][-1]
-    return result
 # =========================
 #  Gradio tabs definition
@@ -459,103 +634,19 @@ def create_vector_search_tab():
                     'koaza': hits[0][9],
                 }
                 result_df = pd.DataFrame([splits.values()], columns=splits.keys())
-            with measure('load city_all_file'):
-                city_all_file = TARGET_DIR / 'mt_city_all.csv'
-                city_all_df = pd.read_csv(city_all_file)
-                city_all_df_temp = city_all_df[city_all_df['pref'] == splits['pref']]
-                city_name1 = city_all_df_temp['county'].fillna('') + city_all_df_temp['city'].fillna('') + city_all_df_temp['ward'].fillna('')
-                city_name2 = splits['county'] + splits['city'] + splits['ward']
-                lg_codes = city_all_df_temp[city_name1 == city_name2]['lg_code'].values
-                if len(lg_codes) > 1:
-                    raise Exception('Too many lg_code')
-                lg_code = lg_codes[0]
-            with measure('load parcel_city_file'):
-                parcel_city_file = TARGET_DIR / 'parcel' / f'mt_parcel_city{lg_code:06d}.csv'
-                if not os.path.exists(parcel_city_file):
-                    # raise gr.Error('Too many lg_code')
-                    raise Exception('Too many lg_code')
-                parcel_city_df = pd.read_csv(parcel_city_file)
-                cities = parcel_city_df['city'].fillna('')
-                wards = parcel_city_df['ward'].fillna('')
-                oaza_chos = parcel_city_df['oaza_cho'].fillna('')
-                chomes = parcel_city_df['chome'].fillna('')
-                koazas = parcel_city_df['koaza'].fillna('')
-                city_name1 = cities + wards
-                city_name2 = splits['county'] + splits['city'] + splits['ward']
-                city_mask = city_name1 == city_name2
-                town_name1 = oaza_chos + chomes
-                town_name2 = splits['oaza_cho'] + splits['chome']
-                town_mask = town_name1 == town_name2
-                koaza_mask = koazas == splits['koaza']
-                parcel_city_df_filtered = parcel_city_df[city_mask & town_mask & koaza_mask]
-                cities = parcel_city_df_filtered['city'].fillna('')
-                wards = parcel_city_df_filtered['ward'].fillna('')
-                oaza_chos = parcel_city_df_filtered['oaza_cho'].fillna('')
-                chomes = parcel_city_df_filtered['chome'].fillna('')
-                koazas = parcel_city_df_filtered['koaza'].fillna('')
-                prc_num1s = parcel_city_df_filtered['prc_num1'].fillna(9999).astype(int).astype(str).replace('9999', '')
-                prc_num2s = parcel_city_df_filtered['prc_num2'].fillna(9999).astype(int).astype(str).replace('9999', '')
-                prc_num3s = parcel_city_df_filtered['prc_num3'].fillna(9999).astype(int).astype(str).replace('9999', '')
-                # アドレスを生成
-                addresses = [
-                    f"{splits['pref']}{city}{ward}{oaza_cho}{chome}{koaza}{prc_num1}" +
-                    (f"-{prc_num2}" if prc_num2 else '') +
-                    (f"-{prc_num3}" if prc_num3 else '')
-                    for city, ward, oaza_cho, chome, koaza, prc_num1, prc_num2, prc_num3 in zip(
-                        cities, wards, oaza_chos, chomes, koazas, prc_num1s, prc_num2s, prc_num3s
-                    )
-                ]
-            with measure('load rsdtdsp_file'):
-                pref_code = ('%06d' % lg_code)[0:2]
-                rsdtdsp_file = TARGET_DIR / 'rsdt' / f'mt_rsdtdsp_rsdt_pref{pref_code}.csv'
-                if not os.path.exists(rsdtdsp_file):
-                    # raise gr.Error(f'Not found: {rsdtdsp_file}')
-                    raise Exception(f'Not found: {rsdtdsp_file}')
-                rsdtdsp_df = pd.read_csv(rsdtdsp_file)
-                city_name1 = rsdtdsp_df['city'].fillna('') + rsdtdsp_df['ward'].fillna('')
-                city_name2 = splits['county'] + splits['city'] + splits['ward']
-                city_mask = city_name1 == city_name2
-                town_name1 = rsdtdsp_df['oaza_cho'].fillna('') + rsdtdsp_df['chome'].fillna('')
-                town_name2 = splits['oaza_cho'] + splits['chome']
-                town_mask = town_name1 == town_name2
-                koaza_mask = rsdtdsp_df['koaza'].fillna('') == splits['koaza']
-                rsdtdsp_df_filtered = rsdtdsp_df[city_mask & town_mask & koaza_mask]
-                cities = rsdtdsp_df_filtered['city'].fillna('')
-                wards = rsdtdsp_df_filtered['ward'].fillna('')
-                oaza_chos = rsdtdsp_df_filtered['oaza_cho'].fillna('')
-                chomes = rsdtdsp_df_filtered['chome'].fillna('')
-                koazas = rsdtdsp_df_filtered['koaza'].fillna('')
-                blk_nums = rsdtdsp_df_filtered['blk_num'].fillna(9999).astype(int).astype(str).replace('9999', '')
-                rsdt_nums = rsdtdsp_df_filtered['rsdt_num'].fillna(9999).astype(int).astype(str).replace('9999', '')
-                rsdt_num2s = rsdtdsp_df_filtered['rsdt_num2'].fillna(9999).astype(int).astype(str).replace('9999', '')
-                # アドレスを生成
-                addresses += [
-                    f"{splits['pref']}{city}{ward}{oaza_cho}{chome}{koaza}{blk_num}" +
-                    (f"-{rsdt_num}" if rsdt_num else '') +
-                    (f"-{rsdt_num2}" if rsdt_num2 else '')
-                    for city, ward, oaza_cho, chome, koaza, blk_num, rsdt_num, rsdt_num2 in zip(
-                        cities, wards, oaza_chos, chomes, koazas, blk_nums, rsdt_nums, rsdt_num2s
-                    )
-                ]
                 addresses = list(set(addresses))  # 重複を除去
-            with measure('query_embed'):
-                query_embed = embed_via_multilingual_e5_large([query_address])
-            with measure('address_embeds'):
-                address_embeds = embed_via_multilingual_e5_large(addresses)
             with measure('cosine'):
                 # コサイン類似度を計算
                 similarities = cosine_similarity(query_embed, address_embeds)

     return hits
def get_lg_code(pref, county, city, ward):
    """Look up the ``lg_code`` of one municipality in ``mt_city_all.csv``.

    The CSV is read from ``TARGET_DIR`` on every call. Rows are first
    narrowed to the given prefecture, then matched on the concatenation
    ``county + city + ward``; missing CSV cells are treated as empty strings.

    Args:
        pref: Prefecture name, compared against the ``pref`` column.
        county: County name, or ``''`` when there is none.
        city: City name, or ``''`` when there is none.
        ward: Ward name, or ``''`` when there is none.

    Returns:
        The ``lg_code`` value of the single matching row.

    Raises:
        IndexError: If no row matches the requested municipality.
        Exception: If more than one row matches.
    """
    city_all_df = pd.read_csv(TARGET_DIR / 'mt_city_all.csv')
    pref_rows = city_all_df[city_all_df['pref'] == pref]

    # Compare full municipality names; NaN cells must become '' first,
    # otherwise the string concatenation would yield NaN for the whole row.
    row_names = (
        pref_rows['county'].fillna('')
        + pref_rows['city'].fillna('')
        + pref_rows['ward'].fillna('')
    )
    wanted_name = county + city + ward
    lg_codes = pref_rows[row_names == wanted_name]['lg_code'].values

    if len(lg_codes) > 1:
        raise Exception('Too many lg_code')
    if len(lg_codes) == 0:
        # Previously this fell through to ``lg_codes[0]`` and surfaced as an
        # anonymous "index 0 is out of bounds"; keep the exception type but
        # say which municipality could not be resolved.
        raise IndexError(f'No lg_code found for {pref}{wanted_name}')
    return lg_codes[0]
def get_addresses_with_parcel(pref, county, city, ward, oaza_cho, chome, koaza):
    """Build candidate address strings from a municipality's parcel CSV.

    Resolves the municipality's ``lg_code`` via ``get_lg_code``, reads
    ``parcel/mt_parcel_city<lg_code as 6 digits>.csv`` under ``TARGET_DIR``,
    keeps the rows whose ``city + ward``, ``oaza_cho + chome`` and ``koaza``
    equal the requested values (missing CSV cells count as empty strings),
    and renders every remaining row as::

        <pref><city><ward><oaza_cho><chome><koaza><prc_num1>[-<prc_num2>][-<prc_num3>]

    Args:
        pref: Prefecture name; used for the lg_code lookup and as the
            prefix of every generated address.
        county: County name, or ``''``.
        city: City name, or ``''``.
        ward: Ward name, or ``''``.
        oaza_cho: Oaza / town name, or ``''``.
        chome: Chome, or ``''``.
        koaza: Koaza, or ``''``.

    Returns:
        A list with one address string per matching CSV row (duplicates are
        not removed here); empty when no row matches.

    Raises:
        gr.Error: If the parcel CSV for the municipality does not exist.
        Exception: Whatever ``get_lg_code`` raises for an unresolved or
            ambiguous municipality.
    """
    def _number_text(value):
        # Blank for a missing cell, otherwise the integer in decimal. The
        # earlier fillna(9999)/replace('9999', '') round trip also blanked
        # parcel numbers that really are 9999.
        return '' if pd.isna(value) else str(int(value))

    lg_code = get_lg_code(pref, county, city, ward)
    parcel_city_file = TARGET_DIR / 'parcel' / f'mt_parcel_city{lg_code:06d}.csv'
    if not os.path.exists(parcel_city_file):
        # The old message here was a copy-paste of 'Too many lg_code'.
        raise gr.Error(f'Not found: {parcel_city_file}')
    parcel_city_df = pd.read_csv(parcel_city_file)

    # Fill the name columns once and reuse them for both filtering and output.
    names = parcel_city_df[['city', 'ward', 'oaza_cho', 'chome', 'koaza']].fillna('')

    # NOTE(review): the requested side includes `county` while the CSV side
    # is city + ward only, so a non-empty county can only match if the CSV's
    # `city` column already embeds the county name - confirm against the data.
    city_mask = (names['city'] + names['ward']) == (county + city + ward)
    town_mask = (names['oaza_cho'] + names['chome']) == (oaza_cho + chome)
    koaza_mask = names['koaza'] == koaza
    keep = city_mask & town_mask & koaza_mask

    matched_names = names[keep]
    matched_rows = parcel_city_df[keep]
    prc_num1s = matched_rows['prc_num1'].map(_number_text)
    prc_num2s = matched_rows['prc_num2'].map(_number_text)
    prc_num3s = matched_rows['prc_num3'].map(_number_text)

    # Generate the addresses; the 2nd/3rd numbers are appended only if present.
    return [
        f"{pref}{_city}{_ward}{_oaza_cho}{_chome}{_koaza}{_prc_num1}"
        + (f"-{_prc_num2}" if _prc_num2 else '')
        + (f"-{_prc_num3}" if _prc_num3 else '')
        for _city, _ward, _oaza_cho, _chome, _koaza, _prc_num1, _prc_num2, _prc_num3 in zip(
            matched_names['city'],
            matched_names['ward'],
            matched_names['oaza_cho'],
            matched_names['chome'],
            matched_names['koaza'],
            prc_num1s,
            prc_num2s,
            prc_num3s,
        )
    ]
# Prefecture names in prefecture-code order: the name at index i has
# code i + 1 (北海道 = 1 ... 沖縄県 = 47). get_pref_code relies on this order.
pref_names = [
    '北海道',
    '青森県',
    '岩手県',
    '宮城県',
    '秋田県',
    '山形県',
    '福島県',
    '茨城県',
    '栃木県',
    '群馬県',
    '埼玉県',
    '千葉県',
    '東京都',
    '神奈川県',
    '新潟県',
    '富山県',
    '石川県',
    '福井県',
    '山梨県',
    '長野県',
    '岐阜県',
    '静岡県',
    '愛知県',
    '三重県',
    '滋賀県',
    '京都府',
    '大阪府',
    '兵庫県',
    '奈良県',
    '和歌山県',
    '鳥取県',
    '島根県',
    '岡山県',
    '広島県',
    '山口県',
    '徳島県',
    '香川県',
    '愛媛県',
    '高知県',
    '福岡県',
    '佐賀県',
    '長崎県',
    '熊本県',
    '大分県',
    '宮崎県',
    '鹿児島県',
    '沖縄県',
]


def get_pref_code(pref):
    """Return the 1-based prefecture code for a prefecture name.

    Args:
        pref: Full prefecture name exactly as listed in ``pref_names``
            (including the 都/道/府/県 suffix).

    Returns:
        The position of ``pref`` in ``pref_names`` plus one (1..47).

    Raises:
        ValueError: If ``pref`` is not one of the names in ``pref_names``.
    """
    try:
        return pref_names.index(pref) + 1
    except ValueError:
        # Same exception type as list.index, but say what was looked up
        # instead of the generic "... is not in list".
        raise ValueError(f'Unknown prefecture name: {pref!r}') from None
def get_addresses_with_rsdtdsp(pref, county, city, ward, oaza_cho, chome, koaza):
    """Build candidate address strings from a prefecture's rsdtdsp CSV.

    Reads ``rsdt/mt_rsdtdsp_rsdt_pref<2-digit prefecture code>.csv`` under
    ``TARGET_DIR``, keeps the rows whose ``city + ward``,
    ``oaza_cho + chome`` and ``koaza`` equal the requested values (missing
    CSV cells count as empty strings), and renders every remaining row as::

        <pref><city><ward><oaza_cho><chome><koaza><blk_num>[-<rsdt_num>][-<rsdt_num2>]

    Args:
        pref: Prefecture name; selects the CSV via ``get_pref_code`` and is
            the prefix of every generated address.
        county: County name, or ``''``.
        city: City name, or ``''``.
        ward: Ward name, or ``''``.
        oaza_cho: Oaza / town name, or ``''``.
        chome: Chome, or ``''``.
        koaza: Koaza, or ``''``.

    Returns:
        A list with one address string per matching CSV row (duplicates are
        not removed here); empty when no row matches.

    Raises:
        gr.Error: If the prefecture's rsdtdsp CSV does not exist.
        ValueError: Propagated from ``get_pref_code`` for an unknown
            prefecture name.
    """
    def _number_text(value):
        # Blank for a missing cell, otherwise the integer in decimal. The
        # earlier fillna(9999)/replace('9999', '') round trip also blanked
        # numbers that really are 9999.
        return '' if pd.isna(value) else str(int(value))

    pref_code = get_pref_code(pref)
    rsdtdsp_file = TARGET_DIR / 'rsdt' / f'mt_rsdtdsp_rsdt_pref{pref_code:02d}.csv'
    if not os.path.exists(rsdtdsp_file):
        raise gr.Error(f'Not found: {rsdtdsp_file}')
    rsdtdsp_df = pd.read_csv(rsdtdsp_file)

    # Fill the name columns once and reuse them for both filtering and output.
    names = rsdtdsp_df[['city', 'ward', 'oaza_cho', 'chome', 'koaza']].fillna('')

    # NOTE(review): the requested side includes `county` while the CSV side
    # is city + ward only, so a non-empty county can only match if the CSV's
    # `city` column already embeds the county name - confirm against the data.
    city_mask = (names['city'] + names['ward']) == (county + city + ward)
    town_mask = (names['oaza_cho'] + names['chome']) == (oaza_cho + chome)
    koaza_mask = names['koaza'] == koaza
    keep = city_mask & town_mask & koaza_mask

    matched_names = names[keep]
    matched_rows = rsdtdsp_df[keep]
    blk_nums = matched_rows['blk_num'].map(_number_text)
    rsdt_nums = matched_rows['rsdt_num'].map(_number_text)
    rsdt_num2s = matched_rows['rsdt_num2'].map(_number_text)

    # Generate the addresses; the 2nd/3rd numbers are appended only if present.
    return [
        f"{pref}{_city}{_ward}{_oaza_cho}{_chome}{_koaza}{_blk_num}"
        + (f"-{_rsdt_num}" if _rsdt_num else '')
        + (f"-{_rsdt_num2}" if _rsdt_num2 else '')
        for _city, _ward, _oaza_cho, _chome, _koaza, _blk_num, _rsdt_num, _rsdt_num2 in zip(
            matched_names['city'],
            matched_names['ward'],
            matched_names['oaza_cho'],
            matched_names['chome'],
            matched_names['koaza'],
            blk_nums,
            rsdt_nums,
            rsdt_num2s,
        )
    ]
 # =========================
 #  FastAPI definition
 # =========================
     with measure('preprocess'):
         preprocessed = preprocess(query_address)
     with measure('vector_search'):
+        hits = vector_search(preprocessed, 1)
+    with measure('split_address'):
+        splits = {
+            'pref': hits[0][3],
+            'county': hits[0][4],
+            'city': hits[0][5],
+            'ward': hits[0][6],
+            'oaza_cho': hits[0][7],
+            'chome': hits[0][8],
+            'koaza': hits[0][9],
+        }
+    with measure('get_addresses_with_parcel'):
+        addresses = get_addresses_with_parcel(
+                splits['pref'], splits['county'], splits['city'], splits['ward'],
+                splits['oaza_cho'], splits['chome'], splits['koaza'])
+    with measure('get_addresses_with_rsdtdsp'):
+        addresses += get_addresses_with_rsdtdsp(
+                splits['pref'], splits['county'], splits['city'], splits['ward'],
+                splits['oaza_cho'], splits['chome'], splits['koaza'])
+        addresses = list(set(addresses))  # 重複を除去
+    with measure('embed_via_multilingual_e5_large'):
+        embeds = embed_via_multilingual_e5_large([query_address] + addresses)
+        query_embed = [embeds[0]]
+        address_embeds = embeds[1:]
+    with measure('cosine'):
+        # コサイン類似度を計算
+        similarities = cosine_similarity(query_embed, address_embeds)
+        best_match_indices = np.argsort(similarities[0])[-1:][::-1]  # 上位Kのインデックスを取得
+        best_addresses = [addresses[i] for i in best_match_indices]
+        best_address = best_addresses[0]
+    return best_address
 # =========================
 #  Gradio tabs definition
                     'koaza': hits[0][9],
                 }
                 result_df = pd.DataFrame([splits.values()], columns=splits.keys())
+            with measure('get_addresses_with_parcel'):
+                addresses = get_addresses_with_parcel(
+                        splits['pref'], splits['county'], splits['city'], splits['ward'],
+                        splits['oaza_cho'], splits['chome'], splits['koaza'])
+            with measure('get_addresses_with_rsdtdsp'):
+                addresses += get_addresses_with_rsdtdsp(
+                        splits['pref'], splits['county'], splits['city'], splits['ward'],
+                        splits['oaza_cho'], splits['chome'], splits['koaza'])
                 addresses = list(set(addresses))  # 重複を除去
+            with measure('embed_via_multilingual_e5_large'):
+                embeds = embed_via_multilingual_e5_large([query_address] + addresses)
+                query_embed = [embeds[0]]
+                address_embeds = embeds[1:]
             with measure('cosine'):
                 # コサイン類似度を計算
                 similarities = cosine_similarity(query_embed, address_embeds)