Update app.py
app.py CHANGED
@@ -37,7 +37,7 @@ spaBERT_model.load_state_dict(pre_trained_model, strict=False)
 spaBERT_model.to(device)
 spaBERT_model.eval()

-#
+#Load data using SpatialDataset
 spatialDataset = PbfMapDataset(data_file_path = data_file_path,
                                tokenizer = bert_tokenizer,
                                max_token_len = 256, #Originally 300
@@ -51,6 +51,48 @@ spatialDataset = PbfMapDataset(data_file_path = data_file_path,

 data_loader = DataLoader(spatialDataset, batch_size=1, num_workers=0, shuffle=False, pin_memory=False, drop_last=False) #FIXME: with num_workers > 0, worker processes do not stop after iteration finishes

+#Pre-acquire the SpaBERT embeddings for all geo-entities within our dataset
+def process_entity(batch, model, device):
+    input_ids = batch['masked_input'].to(device)
+    attention_mask = batch['attention_mask'].to(device)
+    position_list_x = batch['norm_lng_list'].to(device)
+    position_list_y = batch['norm_lat_list'].to(device)
+    sent_position_ids = batch['sent_position_ids'].to(device)
+    pseudo_sentence = batch['pseudo_sentence'].to(device)
+
+    # Convert the tensor to a list of token IDs and decode them into a readable sentence
+    pseudo_sentence_decoded = bert_tokenizer.decode(pseudo_sentence[0].tolist(), skip_special_tokens=False)
+
+    with torch.no_grad():
+        outputs = model(#input_ids=input_ids,
+                        input_ids=pseudo_sentence,
+                        attention_mask=attention_mask,
+                        sent_position_ids=sent_position_ids,
+                        position_list_x=position_list_x,
+                        position_list_y=position_list_y)
+        #NOTE: we feed the pseudo_sentence as input_ids and omit the masked input here. Verify that this is correct.
+
+    embeddings = outputs.hidden_states[-1].to(device)
+
+    # Extract the [CLS] token embedding (first token)
+    embedding = embeddings[:, 0, :].detach()  # [batch_size, hidden_size]
+
+    #pivot_token_len = batch['pivot_token_len'].item()
+    #pivot_embeddings = embeddings[:, :pivot_token_len, :]
+
+    #return pivot_embeddings.cpu().numpy(), input_ids.cpu().numpy()
+    return embedding.cpu().numpy(), input_ids.cpu().numpy()
+
+all_embeddings = []
+for batch in data_loader:
+    embeddings, input_ids = process_entity(batch, spaBERT_model, device)
+    all_embeddings.append(embeddings)
+
+st.write("SpaBERT Embedding shape:", all_embeddings[0].shape)
+st.write("SpaBERT Embedding:", all_embeddings[0])
+
+
+
 #Get BERT Embedding for review
 def get_bert_embedding(review_text):
     #tokenize review
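With the diff applied, all_embeddings holds one [1, hidden_size] array per geo-entity. For review purposes, here is a minimal sketch of how these vectors could be consumed downstream, stacked into a matrix for cosine-similarity lookups; the most_similar helper and query_embedding are illustrative names, not part of this commit:

import numpy as np

# Stack the per-entity [1, hidden_size] arrays into [num_entities, hidden_size]
embedding_matrix = np.vstack(all_embeddings)

def most_similar(query_embedding, matrix, top_k=5):
    # Cosine similarity: normalize both sides, then one matrix-vector product
    q = query_embedding.ravel()
    q = q / np.linalg.norm(q)
    m = matrix / np.linalg.norm(matrix, axis=1, keepdims=True)
    scores = m @ q
    return np.argsort(-scores)[:top_k]  # indices of the top_k nearest entities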
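One caveat worth flagging on the added code: outputs.hidden_states[-1] assumes the model returns hidden states at all. If spaBERT_model follows the Hugging Face convention (an assumption; the model construction is outside this diff), hidden_states is None unless the config enables it, and the subscript would raise a TypeError. A hedged sketch of the per-call override, if the forward signature supports it:

# Assumption: spaBERT_model accepts the standard Hugging Face
# output_hidden_states flag; if it does not, enable
# output_hidden_states=True in the model config before loading instead.
outputs = spaBERT_model(input_ids=pseudo_sentence,
                        attention_mask=attention_mask,
                        sent_position_ids=sent_position_ids,
                        position_list_x=position_list_x,
                        position_list_y=position_list_y,
                        output_hidden_states=True)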