Spaces:
Sleeping
Sleeping
change service
Browse files
config.py
CHANGED
|
@@ -12,6 +12,7 @@ UPLOAD_DIR = os.path.join(BASE_DIR, "uploads")
|
|
| 12 |
OUTPUT_DIR = os.path.join(BASE_DIR, "outputs")
|
| 13 |
SUBJECT_DATA_FILE = os.path.join(DATA_DIR, "subjectData.csv")
|
| 14 |
SAMPLE_DATA_FILE = os.path.join(DATA_DIR, "sampleData.csv")
|
|
|
|
| 15 |
# Model Names
|
| 16 |
MODEL_NAME = "Detomo/cl-nagoya-sup-simcse-ja-for-standard-name-v0_9_10"
|
| 17 |
SENTENCE_EMBEDDING_FILE = os.path.join(
|
|
|
|
| 12 |
OUTPUT_DIR = os.path.join(BASE_DIR, "outputs")
|
| 13 |
SUBJECT_DATA_FILE = os.path.join(DATA_DIR, "subjectData.csv")
|
| 14 |
SAMPLE_DATA_FILE = os.path.join(DATA_DIR, "sampleData.csv")
|
| 15 |
+
STANDARD_NAME_MAP_DATA_FILE = os.path.join(DATA_DIR, "standardNameMapData.csv")
|
| 16 |
# Model Names
|
| 17 |
MODEL_NAME = "Detomo/cl-nagoya-sup-simcse-ja-for-standard-name-v0_9_10"
|
| 18 |
SENTENCE_EMBEDDING_FILE = os.path.join(
|
data/anchor_name_sentence_sentence_embeddings(cl-nagoya-sup-simcse-ja-for-standard-name-v0_9_10).pkl
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e25d41cf2b9ab5b90f2c0e7e0f5d0ec31499f7dcb252de64d7af20ab63e91750
|
| 3 |
+
size 12073124
|
data/standardNameMapData.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
routes/predict.py
CHANGED
|
@@ -8,7 +8,7 @@ from auth import get_current_user
|
|
| 8 |
from services.sentence_transformer_service import SentenceTransformerService, sentence_transformer_service
|
| 9 |
from data_lib.input_name_data import InputNameData
|
| 10 |
from data_lib.base_name_data import COL_NAME_SENTENCE
|
| 11 |
-
from mapping_lib.
|
| 12 |
from config import UPLOAD_DIR, OUTPUT_DIR
|
| 13 |
|
| 14 |
router = APIRouter()
|
|
@@ -39,36 +39,32 @@ async def predict(
|
|
| 39 |
try:
|
| 40 |
# Process input data
|
| 41 |
start_time = time.time()
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
similarity_matrix = sentence_service.sentenceTransformerHelper.create_similarity_matrix_from_embeddings(
|
| 50 |
-
sentence_service.sample_name_sentence_embeddings,
|
| 51 |
-
input_name_sentence_embeddings
|
| 52 |
-
)
|
| 53 |
-
|
| 54 |
# Map standard names
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
|
|
|
| 64 |
|
| 65 |
# Create output dataframe and save to CSV
|
| 66 |
column_to_keep = ['ファイル名', 'シート名', '行', '科目', '中科目', '分類', '名称', '摘要', '備考']
|
| 67 |
output_df = inputData.dataframe[column_to_keep].copy()
|
| 68 |
output_df.reset_index(drop=False, inplace=True)
|
| 69 |
-
output_df.loc[:, "出力_科目"] = df_predicted["
|
| 70 |
-
output_df.loc[:, "出力_項目名"] = df_predicted["
|
| 71 |
-
output_df.loc[:, "出力_確率度"] = df_predicted["
|
| 72 |
|
| 73 |
# Save with utf_8_sig encoding for Japanese Excel compatibility
|
| 74 |
output_df.to_csv(output_file_path, index=False, encoding="utf_8_sig")
|
|
|
|
| 8 |
from services.sentence_transformer_service import SentenceTransformerService, sentence_transformer_service
|
| 9 |
from data_lib.input_name_data import InputNameData
|
| 10 |
from data_lib.base_name_data import COL_NAME_SENTENCE
|
| 11 |
+
from mapping_lib.name_mapper import NameMapper
|
| 12 |
from config import UPLOAD_DIR, OUTPUT_DIR
|
| 13 |
|
| 14 |
router = APIRouter()
|
|
|
|
| 39 |
try:
|
| 40 |
# Process input data
|
| 41 |
start_time = time.time()
|
| 42 |
+
try:
|
| 43 |
+
inputData = InputNameData(sentence_service.dic_standard_subject)
|
| 44 |
+
inputData.load_data_from_csv(input_file_path)
|
| 45 |
+
inputData.process_data(sentence_service.sentenceTransformerHelper)
|
| 46 |
+
except Exception as e:
|
| 47 |
+
print(f"Error processing input data: {e}")
|
| 48 |
+
raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
# Map standard names
|
| 50 |
+
try:
|
| 51 |
+
nameMapper = NameMapper(
|
| 52 |
+
sentence_service.sentenceTransformerHelper,
|
| 53 |
+
sentence_service.standardNameMapData,
|
| 54 |
+
top_count=3
|
| 55 |
+
)
|
| 56 |
+
df_predicted = nameMapper.predict(inputData)
|
| 57 |
+
except Exception as e:
|
| 58 |
+
print(f"Error mapping standard names: {e}")
|
| 59 |
+
raise HTTPException(status_code=500, detail=str(e))
|
| 60 |
|
| 61 |
# Create output dataframe and save to CSV
|
| 62 |
column_to_keep = ['ファイル名', 'シート名', '行', '科目', '中科目', '分類', '名称', '摘要', '備考']
|
| 63 |
output_df = inputData.dataframe[column_to_keep].copy()
|
| 64 |
output_df.reset_index(drop=False, inplace=True)
|
| 65 |
+
output_df.loc[:, "出力_科目"] = df_predicted["標準科目"]
|
| 66 |
+
output_df.loc[:, "出力_項目名"] = df_predicted["標準項目名"]
|
| 67 |
+
output_df.loc[:, "出力_確率度"] = df_predicted["基準名称類似度"]
|
| 68 |
|
| 69 |
# Save with utf_8_sig encoding for Japanese Excel compatibility
|
| 70 |
output_df.to_csv(output_file_path, index=False, encoding="utf_8_sig")
|
services/sentence_transformer_service.py
CHANGED
|
@@ -2,18 +2,18 @@ import pickle
|
|
| 2 |
from config import (
|
| 3 |
MODEL_NAME,
|
| 4 |
SENTENCE_EMBEDDING_FILE,
|
| 5 |
-
|
| 6 |
)
|
| 7 |
from sentence_transformer_lib.sentence_transformer_helper import SentenceTransformerHelper
|
| 8 |
from data_lib.subject_data import SubjectData
|
| 9 |
-
from data_lib.
|
| 10 |
|
| 11 |
|
| 12 |
class SentenceTransformerService:
|
| 13 |
def __init__(self):
|
| 14 |
self.sentenceTransformerHelper = None
|
| 15 |
self.dic_standard_subject = None
|
| 16 |
-
self.
|
| 17 |
self.sampleData = None
|
| 18 |
|
| 19 |
def load_model_data(self):
|
|
@@ -34,14 +34,13 @@ class SentenceTransformerService:
|
|
| 34 |
|
| 35 |
# Load pre-computed embeddings and similarities
|
| 36 |
with open(SENTENCE_EMBEDDING_FILE, "rb") as f:
|
| 37 |
-
self.
|
| 38 |
|
| 39 |
# Load and process sample data
|
| 40 |
-
self.
|
| 41 |
-
self.
|
| 42 |
-
self.
|
| 43 |
-
|
| 44 |
-
|
| 45 |
print("Models and data loaded successfully")
|
| 46 |
|
| 47 |
# Global instance (singleton)
|
|
|
|
| 2 |
from config import (
|
| 3 |
MODEL_NAME,
|
| 4 |
SENTENCE_EMBEDDING_FILE,
|
| 5 |
+
STANDARD_NAME_MAP_DATA_FILE, SUBJECT_DATA_FILE
|
| 6 |
)
|
| 7 |
from sentence_transformer_lib.sentence_transformer_helper import SentenceTransformerHelper
|
| 8 |
from data_lib.subject_data import SubjectData
|
| 9 |
+
from data_lib.standard_name_map_data import StandardNameMapData
|
| 10 |
|
| 11 |
|
| 12 |
class SentenceTransformerService:
|
| 13 |
def __init__(self):
|
| 14 |
self.sentenceTransformerHelper = None
|
| 15 |
self.dic_standard_subject = None
|
| 16 |
+
self.anchor_name_sentence_embeddings = None
|
| 17 |
self.sampleData = None
|
| 18 |
|
| 19 |
def load_model_data(self):
|
|
|
|
| 34 |
|
| 35 |
# Load pre-computed embeddings and similarities
|
| 36 |
with open(SENTENCE_EMBEDDING_FILE, "rb") as f:
|
| 37 |
+
self.anchor_name_sentence_embeddings = pickle.load(f)
|
| 38 |
|
| 39 |
# Load and process sample data
|
| 40 |
+
self.standardNameMapData = StandardNameMapData()
|
| 41 |
+
self.standardNameMapData.load_data_from_csv(STANDARD_NAME_MAP_DATA_FILE)
|
| 42 |
+
self.standardNameMapData.process_data(self.anchor_name_sentence_embeddings)
|
| 43 |
+
|
|
|
|
| 44 |
print("Models and data loaded successfully")
|
| 45 |
|
| 46 |
# Global instance (singleton)
|