RE_UPLOAD-REBUILD-RESTART
Browse files- analysis.ipynb +539 -0
analysis.ipynb
ADDED
|
@@ -0,0 +1,539 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "markdown",
|
| 5 |
+
"metadata": {
|
| 6 |
+
"id": "IsB9l3mBIGUN"
|
| 7 |
+
},
|
| 8 |
+
"source": [
|
| 9 |
+
"## Analysis"
|
| 10 |
+
]
|
| 11 |
+
},
|
| 12 |
+
{
|
| 13 |
+
"cell_type": "code",
|
| 14 |
+
"execution_count": null,
|
| 15 |
+
"metadata": {},
|
| 16 |
+
"outputs": [],
|
| 17 |
+
"source": [
|
| 18 |
+
"%load_ext autoreload\n",
|
| 19 |
+
"%autoreload 2"
|
| 20 |
+
]
|
| 21 |
+
},
|
| 22 |
+
{
|
| 23 |
+
"cell_type": "code",
|
| 24 |
+
"execution_count": null,
|
| 25 |
+
"metadata": {},
|
| 26 |
+
"outputs": [],
|
| 27 |
+
"source": [
|
| 28 |
+
"import pandas as pd\n",
|
| 29 |
+
"from PIL import Image\n",
|
| 30 |
+
"from scipy.stats import pearsonr\n",
|
| 31 |
+
"from utils.get_unique_values import get_unique_values\n",
|
| 32 |
+
"from utils.remove_duplicates import unzip_fn\n",
|
| 33 |
+
"from utils.show_tile_images import show_tile_images\n",
|
| 34 |
+
"import zipfile\n",
|
| 35 |
+
"import json\n",
|
| 36 |
+
"from utils.visualize_bboxes_on_image import draw_text_on_image\n",
|
| 37 |
+
"import numpy as np\n",
|
| 38 |
+
"from sklearn.metrics.pairwise import cosine_similarity\n",
|
| 39 |
+
"import matplotlib.pyplot as plt\n",
|
| 40 |
+
"import tqdm as tqdm\n",
|
| 41 |
+
"from functools import cache\n",
|
| 42 |
+
"from utils.flatten import flatten"
|
| 43 |
+
]
|
| 44 |
+
},
|
| 45 |
+
{
|
| 46 |
+
"cell_type": "code",
|
| 47 |
+
"execution_count": null,
|
| 48 |
+
"metadata": {
|
| 49 |
+
"id": "5l6iv7ZrIGUP"
|
| 50 |
+
},
|
| 51 |
+
"outputs": [],
|
| 52 |
+
"source": [
|
| 53 |
+
"# !GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/spaces/mckabue/document-similarity-search-using-visual-layout-features --depth=1\n",
|
| 54 |
+
"\n",
|
| 55 |
+
"# !wget https://huggingface.co/spaces/mckabue/document-similarity-search-using-visual-layout-features/resolve/main/data/processed/RVL-CDIP-invoice/vectors.json.zip -P ./data/processed/RVL-CDIP-invoice/\n",
|
| 56 |
+
"\n",
|
| 57 |
+
"\n",
|
| 58 |
+
"\n",
|
| 59 |
+
"# import sys\n",
|
| 60 |
+
"# sys.path.insert(0, './document-similarity-search-using-visual-layout-features')"
|
| 61 |
+
]
|
| 62 |
+
},
|
| 63 |
+
{
|
| 64 |
+
"cell_type": "code",
|
| 65 |
+
"execution_count": null,
|
| 66 |
+
"metadata": {
|
| 67 |
+
"id": "172P8Ey8ytD9"
|
| 68 |
+
},
|
| 69 |
+
"outputs": [],
|
| 70 |
+
"source": [
|
| 71 |
+
"# import os\n",
|
| 72 |
+
"# vectors_chunks = os.listdir('/content/document-similarity-search-using-visual-layout-features/data/processed/RVL-CDIP-invoice/vectors.json.zip.chunks')\n",
|
| 73 |
+
"# vectors_chunks.sort(key=lambda x: int(x.split('-')[0]))\n",
|
| 74 |
+
"# vectors_chunks"
|
| 75 |
+
]
|
| 76 |
+
},
|
| 77 |
+
{
|
| 78 |
+
"cell_type": "code",
|
| 79 |
+
"execution_count": null,
|
| 80 |
+
"metadata": {
|
| 81 |
+
"id": "ZZD9JBaWa_T_"
|
| 82 |
+
},
|
| 83 |
+
"outputs": [],
|
| 84 |
+
"source": [
|
| 85 |
+
"vectors_df = pd.read_json('./data/local-data/processed/RVL-CDIP-invoice/vectors.json.zip')\n",
|
| 86 |
+
"vectors_df"
|
| 87 |
+
]
|
| 88 |
+
},
|
| 89 |
+
{
|
| 90 |
+
"cell_type": "code",
|
| 91 |
+
"execution_count": null,
|
| 92 |
+
"metadata": {},
|
| 93 |
+
"outputs": [],
|
| 94 |
+
"source": [
|
| 95 |
+
"# https://gemini.google.com/app/8cd4389df12d29e6\n",
|
| 96 |
+
"\n",
|
| 97 |
+
"# https://chat.openai.com/c/a345a9ec-9238-4089-a6c0-bb4d375148eb"
|
| 98 |
+
]
|
| 99 |
+
},
|
| 100 |
+
{
|
| 101 |
+
"cell_type": "markdown",
|
| 102 |
+
"metadata": {
|
| 103 |
+
"id": "X0n7rBnZIGUQ"
|
| 104 |
+
},
|
| 105 |
+
"source": [
|
| 106 |
+
"### Correlation"
|
| 107 |
+
]
|
| 108 |
+
},
|
| 109 |
+
{
|
| 110 |
+
"cell_type": "code",
|
| 111 |
+
"execution_count": null,
|
| 112 |
+
"metadata": {},
|
| 113 |
+
"outputs": [],
|
| 114 |
+
"source": [
|
| 115 |
+
"unique_values = get_unique_values(start=0.17, end=1, count=10*1000)\n",
|
| 116 |
+
"\n",
|
| 117 |
+
"def get_stats(index: int):\n",
|
| 118 |
+
" vectors = vectors_df.loc[index, 'vectors']\n",
|
| 119 |
+
" weighted_vectors = vectors_df.loc[index, 'weighted_vectors']\n",
|
| 120 |
+
" reduced_vectors = vectors_df.loc[index, 'reduced_vectors']\n",
|
| 121 |
+
" reduced_weighted_vectors = vectors_df.loc[index, 'reduced_weighted_vectors']\n",
|
| 122 |
+
" non_zero_vectors, non_zero_uniques = unzip_fn([(vector, unique) for vector, unique in zip(vectors, unique_values) if vector > 0]) if len([i for i in vectors if i > 0]) > 0 else ([], [])\n",
|
| 123 |
+
"\n",
|
| 124 |
+
" non_zero_vectors__uniques = pearsonr(non_zero_vectors, non_zero_uniques) if len(non_zero_vectors) > 0 else [0,1]\n",
|
| 125 |
+
" vectors___unique_values = pearsonr(vectors, unique_values)\n",
|
| 126 |
+
" vectors___weighted_vectors = pearsonr(vectors, weighted_vectors)\n",
|
| 127 |
+
" vectors___reduced_vectors = pearsonr(vectors, reduced_vectors)\n",
|
| 128 |
+
" vectors___reduced_weighted_vectors = pearsonr(vectors, reduced_weighted_vectors)\n",
|
| 129 |
+
" weighted_vectors___reduced_vectors = pearsonr(weighted_vectors, reduced_vectors)\n",
|
| 130 |
+
" weighted_vectors___reduced_weighted_vectors = pearsonr(weighted_vectors, reduced_weighted_vectors)\n",
|
| 131 |
+
" reduced_vectors___reduced_weighted_vectors = pearsonr(reduced_vectors, reduced_weighted_vectors)\n",
|
| 132 |
+
"\n",
|
| 133 |
+
" return {\n",
|
| 134 |
+
" 'non_zero_vectors__uniques': non_zero_vectors__uniques,\n",
|
| 135 |
+
" 'vectors___unique_values': vectors___unique_values,\n",
|
| 136 |
+
" 'vectors___weighted_vectors': vectors___weighted_vectors,\n",
|
| 137 |
+
" 'vectors___reduced_vectors': vectors___reduced_vectors,\n",
|
| 138 |
+
" 'vectors___reduced_weighted_vectors': vectors___reduced_weighted_vectors,\n",
|
| 139 |
+
" 'weighted_vectors___reduced_vectors': weighted_vectors___reduced_vectors,\n",
|
| 140 |
+
" 'weighted_vectors___reduced_weighted_vectors': weighted_vectors___reduced_weighted_vectors,\n",
|
| 141 |
+
" 'reduced_vectors___reduced_weighted_vectors': reduced_vectors___reduced_weighted_vectors,\n",
|
| 142 |
+
" }\n",
|
| 143 |
+
"\n",
|
| 144 |
+
"from matplotlib import pyplot as plt\n",
|
| 145 |
+
"from scipy.signal import convolve\n",
|
| 146 |
+
"kernel = np.array([0.25, 0.5, 0.25]) # 3-tap centre-weighted smoothing kernel (weights sum to 1)\n",
|
| 147 |
+
"\n",
|
| 148 |
+
"def smooth_vector(vector):\n",
|
| 149 |
+
" # Perform convolution\n",
|
| 150 |
+
" smoothed_vector = convolve(vector, kernel, mode='same') / sum(kernel)\n",
|
| 151 |
+
" return smoothed_vector\n",
|
| 152 |
+
"\n",
|
| 153 |
+
"def get_modified_stats(image_1_index: int, image_2_index: int, vector_column: str = 'vectors', plot = False):\n",
|
| 154 |
+
" image_1_values = vectors_df.loc[image_1_index, vector_column]\n",
|
| 155 |
+
" image_2_values = vectors_df.loc[image_2_index, vector_column]\n",
|
| 156 |
+
"\n",
|
| 157 |
+
" image_1_matrix = np.array(image_1_values)\n",
|
| 158 |
+
" image_2_matrix = np.array(image_2_values)\n",
|
| 159 |
+
"\n",
|
| 160 |
+
" vector_1_zero_indices = image_1_matrix == 0\n",
|
| 161 |
+
" vector_2_zero_indices = image_2_matrix == 0\n",
|
| 162 |
+
"\n",
|
| 163 |
+
" image_1_matrix[vector_1_zero_indices] = unique_values[vector_1_zero_indices]\n",
|
| 164 |
+
" image_2_matrix[vector_2_zero_indices] = unique_values[vector_2_zero_indices]\n",
|
| 165 |
+
"\n",
|
| 166 |
+
" _old_pearsonr = pearsonr(image_1_values, image_2_values)\n",
|
| 167 |
+
" [[_old_cosine_similarity]] = cosine_similarity([image_1_values], [image_2_values])\n",
|
| 168 |
+
" _pearsonr = pearsonr(image_1_matrix, image_2_matrix)\n",
|
| 169 |
+
" [[_cosine_similarity]] = cosine_similarity([image_1_matrix], [image_2_matrix])\n",
|
| 170 |
+
"\n",
|
| 171 |
+
" image_1_matrix_smooth = smooth_vector(image_1_matrix)\n",
|
| 172 |
+
" image_2_matrix_smooth = smooth_vector(image_2_matrix)\n",
|
| 173 |
+
" _pearsonr_smooth = pearsonr(image_1_matrix_smooth, image_2_matrix_smooth)\n",
|
| 174 |
+
" [[_cosine_similarity_smooth]] = cosine_similarity([image_1_matrix_smooth], [image_2_matrix_smooth])\n",
|
| 175 |
+
"\n",
|
| 176 |
+
" permuted_indices = np.random.permutation(len(image_1_matrix))\n",
|
| 177 |
+
" _pearsonr_random = pearsonr(image_1_matrix[permuted_indices], image_2_matrix[permuted_indices])\n",
|
| 178 |
+
" [[_cosine_similarity_random]] = cosine_similarity([image_1_matrix[permuted_indices]], [image_2_matrix[permuted_indices]])\n",
|
| 179 |
+
"\n",
|
| 180 |
+
" if plot:\n",
|
| 181 |
+
" plt.figure(figsize=(12, 6))\n",
|
| 182 |
+
" plt.plot(image_1_values, label='image_1_values', color = 'red')\n",
|
| 183 |
+
" plt.plot(image_1_matrix_smooth, label='image_1_matrix_smooth', color = 'blue')\n",
|
| 184 |
+
" # plt.plot(image_1_matrix, label='image_1_matrix', linestyle='--', color = 'blue')\n",
|
| 185 |
+
" # plt.plot(image_1_matrix_smooth, label='image_1_matrix_smooth', linestyle='--', color = \"green\")\n",
|
| 186 |
+
" plt.show()\n",
|
| 187 |
+
"\n",
|
| 188 |
+
" return {\n",
|
| 189 |
+
" 'old_pearsonr' : f'{round(_old_pearsonr.statistic, 4)} - {_old_pearsonr.pvalue}',\n",
|
| 190 |
+
" 'old_cosine_similarity' : round(_old_cosine_similarity, 4),\n",
|
| 191 |
+
" 'pearsonr' : f'{round(_pearsonr.statistic, 4)} - {_pearsonr.pvalue}',\n",
|
| 192 |
+
" 'cosine_similarity' : round(_cosine_similarity, 4),\n",
|
| 193 |
+
" 'pearsonr_smooth' : f'{round(_pearsonr_smooth.statistic, 4)} - {_pearsonr_smooth.pvalue}',\n",
|
| 194 |
+
" 'cosine_similarity_smooth' : round(_cosine_similarity_smooth, 4),\n",
|
| 195 |
+
" 'pearsonr_random' : f'{round(_pearsonr_random.statistic, 4)} - {_pearsonr_random.pvalue}',\n",
|
| 196 |
+
" 'cosine_similarity_random' : round(_cosine_similarity_random, 4),\n",
|
| 197 |
+
" }\n"
|
| 198 |
+
]
|
| 199 |
+
},
|
| 200 |
+
{
|
| 201 |
+
"cell_type": "code",
|
| 202 |
+
"execution_count": null,
|
| 203 |
+
"metadata": {},
|
| 204 |
+
"outputs": [],
|
| 205 |
+
"source": [
|
| 206 |
+
"get_stats(19569)"
|
| 207 |
+
]
|
| 208 |
+
},
|
| 209 |
+
{
|
| 210 |
+
"cell_type": "code",
|
| 211 |
+
"execution_count": null,
|
| 212 |
+
"metadata": {},
|
| 213 |
+
"outputs": [],
|
| 214 |
+
"source": [
|
| 215 |
+
"correlation_results = []\n",
|
| 216 |
+
"for i in tqdm.tqdm(range(len(correlation_results), len(vectors_df))):\n",
|
| 217 |
+
" correlation_results.append(get_stats(i))"
|
| 218 |
+
]
|
| 219 |
+
},
|
| 220 |
+
{
|
| 221 |
+
"cell_type": "code",
|
| 222 |
+
"execution_count": null,
|
| 223 |
+
"metadata": {},
|
| 224 |
+
"outputs": [],
|
| 225 |
+
"source": [
|
| 226 |
+
"columns = list(correlation_results[0].keys())\n",
|
| 227 |
+
"fig, axes = plt.subplots(4, 2, figsize=(12, 12))\n",
|
| 228 |
+
"axes = axes.flatten()\n",
|
| 229 |
+
"for i, column in enumerate(columns):\n",
|
| 230 |
+
" ax = axes[i]\n",
|
| 231 |
+
" ax.hist([j[column][0] for j in correlation_results], bins=100)\n",
|
| 232 |
+
" ax.set_title(column)"
|
| 233 |
+
]
|
| 234 |
+
},
|
| 235 |
+
{
|
| 236 |
+
"cell_type": "code",
|
| 237 |
+
"execution_count": null,
|
| 238 |
+
"metadata": {},
|
| 239 |
+
"outputs": [],
|
| 240 |
+
"source": [
|
| 241 |
+
"def correlation_fn(index: int):\n",
|
| 242 |
+
" vectors = vectors_df.loc[index, 'vectors']\n",
|
| 243 |
+
" weighted_vectors = vectors_df.loc[index, 'weighted_vectors']\n",
|
| 244 |
+
" reduced_vectors = vectors_df.loc[index, 'reduced_vectors']\n",
|
| 245 |
+
" reduced_weighted_vectors = vectors_df.loc[index, 'reduced_weighted_vectors']\n",
|
| 246 |
+
" return {\n",
|
| 247 |
+
" 'vectors vs weighted_vectors': pearsonr(vectors, weighted_vectors),\n",
|
| 248 |
+
" 'vectors vs reduced_vectors': pearsonr(vectors, reduced_vectors),\n",
|
| 249 |
+
" 'vectors vs reduced_weighted_vectors': pearsonr(vectors, reduced_weighted_vectors),\n",
|
| 250 |
+
" 'weighted_vectors vs reduced_vectors': pearsonr(weighted_vectors, reduced_vectors),\n",
|
| 251 |
+
" 'weighted_vectors vs reduced_weighted_vectors': pearsonr(weighted_vectors, reduced_weighted_vectors),\n",
|
| 252 |
+
" 'reduced_vectors vs reduced_weighted_vectors': pearsonr(reduced_vectors, reduced_weighted_vectors),\n",
|
| 253 |
+
" }\n",
|
| 254 |
+
"\n",
|
| 255 |
+
"correlation_results_2 = [correlation_fn(i) for i in tqdm.tqdm(range(len(vectors_df)))]"
|
| 256 |
+
]
|
| 257 |
+
},
|
| 258 |
+
{
|
| 259 |
+
"cell_type": "code",
|
| 260 |
+
"execution_count": null,
|
| 261 |
+
"metadata": {},
|
| 262 |
+
"outputs": [],
|
| 263 |
+
"source": [
|
| 264 |
+
"import matplotlib.pyplot as plt\n",
|
| 265 |
+
"\n",
|
| 266 |
+
"columns = list(correlation_results_2[0].keys())\n",
|
| 267 |
+
"fig, axes = plt.subplots(6, 2, figsize=(24, 24))\n",
|
| 268 |
+
"axes = axes.flatten()\n",
|
| 269 |
+
"for i, column in enumerate(columns):\n",
|
| 270 |
+
" ax = axes[i]\n",
|
| 271 |
+
" corr = [j[column][0] for j in correlation_results_2]\n",
|
| 272 |
+
" pvalues = [j[column][1] for j in correlation_results_2]\n",
|
| 273 |
+
" # ax.hist([j[column][0] for j in correlation_results_2], bins=100)\n",
|
| 274 |
+
" ax.plot(range(0, len(corr)), corr, label='Correlation', color='blue')\n",
|
| 275 |
+
" # ax.plot(range(0, len(pvalues)), pvalues, label='pvalues', color='red')\n",
|
| 276 |
+
" ax.set_title(column)"
|
| 277 |
+
]
|
| 278 |
+
},
|
| 279 |
+
{
|
| 280 |
+
"cell_type": "code",
|
| 281 |
+
"execution_count": null,
|
| 282 |
+
"metadata": {},
|
| 283 |
+
"outputs": [],
|
| 284 |
+
"source": [
|
| 285 |
+
"import matplotlib.pyplot as plt\n",
|
| 286 |
+
"\n",
|
| 287 |
+
"columns = list(correlation_results_2[0].keys())\n",
|
| 288 |
+
"fig, axes = plt.subplots(3, 2, figsize=(24, 24))\n",
|
| 289 |
+
"axes = axes.flatten()\n",
|
| 290 |
+
"for i, column in enumerate(columns):\n",
|
| 291 |
+
" ax = axes[i]\n",
|
| 292 |
+
" corr = [j[column][0] for j in correlation_results_2]\n",
|
| 293 |
+
" pvalues = [j[column][1] for j in correlation_results_2]\n",
|
| 294 |
+
" ax.plot(range(0, len(corr)), corr, label='correlation', color='blue')\n",
|
| 295 |
+
" ax.plot(range(0, len(pvalues)), pvalues, label='p-value', color='red')\n",
|
| 296 |
+
" ax.legend(bbox_to_anchor=(1, 0.1), loc='lower right')\n",
|
| 297 |
+
" ax.set_ylabel('correlation & p-value')\n",
|
| 298 |
+
" ax.set_xlabel(f'images - {column}')\n",
|
| 299 |
+
" ax.set_title(column)\n",
|
| 300 |
+
"\n",
|
| 301 |
+
"fig.savefig('/Users/charleskabue/Downloads/vector-correlations.png')"
|
| 302 |
+
]
|
| 303 |
+
},
|
| 304 |
+
{
|
| 305 |
+
"cell_type": "markdown",
|
| 306 |
+
"metadata": {},
|
| 307 |
+
"source": [
|
| 308 |
+
"<hr/>"
|
| 309 |
+
]
|
| 310 |
+
},
|
| 311 |
+
{
|
| 312 |
+
"cell_type": "code",
|
| 313 |
+
"execution_count": null,
|
| 314 |
+
"metadata": {},
|
| 315 |
+
"outputs": [],
|
| 316 |
+
"source": [
|
| 317 |
+
"# vector_columns = ['vectors_column', 'weighted_vectors_column', 'reduced_vectors_column', 'reduced_weighted_vectors_column']\n",
|
| 318 |
+
"# similarities_json = {}\n",
|
| 319 |
+
"# for vector_column in tqdm.tqdm(vector_columns):\n",
|
| 320 |
+
"# with zipfile.ZipFile(f'./data/local-data/processed/RVL-CDIP-invoice/cosine_similarity_scores/{vector_column}.json.zip', \"r\") as zip_ref:\n",
|
| 321 |
+
"# similarity_vectors_json = json.loads(zip_ref.read(zip_ref.filelist[0].filename))\n",
|
| 322 |
+
"# similarities_json[vector_column] = similarity_vectors_json\n",
|
| 323 |
+
"@cache\n",
|
| 324 |
+
"def get_similarities(filter, vector_column: str = 'vectors_column'):\n",
|
| 325 |
+
" with zipfile.ZipFile(f'./data/local-data/processed/RVL-CDIP-invoice/cosine_similarity_scores/{vector_column}.json.zip', \"r\") as zip_ref:\n",
|
| 326 |
+
" similarity_vectors_json = json.loads(zip_ref.read(zip_ref.filelist[0].filename))\n",
|
| 327 |
+
" results = [value for value in tqdm.tqdm(similarity_vectors_json) if (filter(value) if filter else True)]\n",
|
| 328 |
+
" results.sort(key=lambda similarity: similarity['cosine_similarity_score'], reverse=True)\n",
|
| 329 |
+
" similarity_vectors_json = None\n",
|
| 330 |
+
" return results"
|
| 331 |
+
]
|
| 332 |
+
},
|
| 333 |
+
{
|
| 334 |
+
"cell_type": "code",
|
| 335 |
+
"execution_count": null,
|
| 336 |
+
"metadata": {},
|
| 337 |
+
"outputs": [],
|
| 338 |
+
"source": []
|
| 339 |
+
},
|
| 340 |
+
{
|
| 341 |
+
"cell_type": "code",
|
| 342 |
+
"execution_count": null,
|
| 343 |
+
"metadata": {},
|
| 344 |
+
"outputs": [],
|
| 345 |
+
"source": [
|
| 346 |
+
"duplicates_matches = get_similarities(\n",
|
| 347 |
+
" lambda similarity: similarity['cosine_similarity_score'] < 1 and similarity['document_image_1'] == similarity['document_image_2'], \n",
|
| 348 |
+
" 'reduced_weighted_vectors_column')\n",
|
| 349 |
+
"\n",
|
| 350 |
+
"len(duplicates_matches)"
|
| 351 |
+
]
|
| 352 |
+
},
|
| 353 |
+
{
|
| 354 |
+
"cell_type": "code",
|
| 355 |
+
"execution_count": null,
|
| 356 |
+
"metadata": {},
|
| 357 |
+
"outputs": [],
|
| 358 |
+
"source": [
|
| 359 |
+
"top_matches = get_similarities(\n",
|
| 360 |
+
" lambda similarity: similarity['cosine_similarity_score'] > 0.8 and similarity['document_image_1'] != similarity['document_image_2'], \n",
|
| 361 |
+
" 'reduced_weighted_vectors_column')"
|
| 362 |
+
]
|
| 363 |
+
},
|
| 364 |
+
{
|
| 365 |
+
"cell_type": "code",
|
| 366 |
+
"execution_count": null,
|
| 367 |
+
"metadata": {},
|
| 368 |
+
"outputs": [],
|
| 369 |
+
"source": [
|
| 370 |
+
"def get_image(filename: str):\n",
|
| 371 |
+
" return Image.open(f'./data/local-data/raw/RVL-CDIP-invoice/{filename}')\n",
|
| 372 |
+
"\n",
|
| 373 |
+
"def print_matches(matches, *, per_side = 1, figsize = None, startistics = True):\n",
|
| 374 |
+
" images = [\n",
|
| 375 |
+
" [\n",
|
| 376 |
+
" get_image(match['document_image_1']), \n",
|
| 377 |
+
" get_image(match['document_image_2']),\n",
|
| 378 |
+
" \n",
|
| 379 |
+
" ] + ([\n",
|
| 380 |
+
" draw_text_on_image(\n",
|
| 381 |
+
" Image.new(\"RGB\", (800, 1200), 'white'),\n",
|
| 382 |
+
" [100, 100],\n",
|
| 383 |
+
" json.dumps(\n",
|
| 384 |
+
" get_modified_stats(\n",
|
| 385 |
+
" int(match['document_image_1'].split('.')[0]), \n",
|
| 386 |
+
" int(match['document_image_2'].split('.')[0]), \n",
|
| 387 |
+
" 'vectors'), \n",
|
| 388 |
+
" indent=4),\n",
|
| 389 |
+
" label_text_size=40,\n",
|
| 390 |
+
" label_fill_color='white')\n",
|
| 391 |
+
" ] if startistics else [])\n",
|
| 392 |
+
" for match\n",
|
| 393 |
+
" in matches\n",
|
| 394 |
+
" ]\n",
|
| 395 |
+
" titles = [\n",
|
| 396 |
+
" [\n",
|
| 397 |
+
" f\"{match['document_image_1']}, Similarity - {round(match['cosine_similarity_score'], 4)}\" if startistics else match['document_image_1'],\n",
|
| 398 |
+
" match['document_image_2'],\n",
|
| 399 |
+
" ] + (['More Statistics'] if startistics else [])\n",
|
| 400 |
+
" for match\n",
|
| 401 |
+
" in matches\n",
|
| 402 |
+
" ]\n",
|
| 403 |
+
" width_parts = len(images[0]) * per_side\n",
|
| 404 |
+
" tile_image = show_tile_images(\n",
|
| 405 |
+
" images = flatten(images),\n",
|
| 406 |
+
" titles = flatten(titles),\n",
|
| 407 |
+
" width_parts = width_parts,\n",
|
| 408 |
+
" figsize = figsize or (10.2 * width_parts, 30 * (len(images) / width_parts)),\n",
|
| 409 |
+
" space = 2,\n",
|
| 410 |
+
" pad = True,\n",
|
| 411 |
+
" figcolor = '#d3eddd',\n",
|
| 412 |
+
" title_color = 'white',\n",
|
| 413 |
+
" title_background_color = 'black',\n",
|
| 414 |
+
" title_font_size = 25)\n",
|
| 415 |
+
" return tile_image\n",
|
| 416 |
+
"\n",
|
| 417 |
+
"len([i for i in top_matches if i['cosine_similarity_score'] >= 1])"
|
| 418 |
+
]
|
| 419 |
+
},
|
| 420 |
+
{
|
| 421 |
+
"cell_type": "code",
|
| 422 |
+
"execution_count": null,
|
| 423 |
+
"metadata": {},
|
| 424 |
+
"outputs": [],
|
| 425 |
+
"source": [
|
| 426 |
+
"print_matches(top_matches[0:28])"
|
| 427 |
+
]
|
| 428 |
+
},
|
| 429 |
+
{
|
| 430 |
+
"cell_type": "code",
|
| 431 |
+
"execution_count": null,
|
| 432 |
+
"metadata": {},
|
| 433 |
+
"outputs": [],
|
| 434 |
+
"source": [
|
| 435 |
+
"index = 44\n",
|
| 436 |
+
"print(top_matches[index]['document_image_1'] + ' - ' + top_matches[index]['document_image_2'])\n",
|
| 437 |
+
"draw_text_on_image(\n",
|
| 438 |
+
" print_matches([top_matches[index]], figsize=(10, 7)),\n",
|
| 439 |
+
" [330, 335],\n",
|
| 440 |
+
" f\"cosine similarity - {round(top_matches[index]['cosine_similarity_score'], 4)}\",\n",
|
| 441 |
+
" label_text_size=30,\n",
|
| 442 |
+
" label_fill_color='black',\n",
|
| 443 |
+
" label_text_color='white',\n",
|
| 444 |
+
" label_rotate_angle = 90,\n",
|
| 445 |
+
" label_text_padding = 2\n",
|
| 446 |
+
")"
|
| 447 |
+
]
|
| 448 |
+
},
|
| 449 |
+
{
|
| 450 |
+
"cell_type": "code",
|
| 451 |
+
"execution_count": null,
|
| 452 |
+
"metadata": {},
|
| 453 |
+
"outputs": [],
|
| 454 |
+
"source": [
|
| 455 |
+
"print(duplicates_matches[0])\n",
|
| 456 |
+
"print_matches(duplicates_matches[:10])"
|
| 457 |
+
]
|
| 458 |
+
},
|
| 459 |
+
{
|
| 460 |
+
"cell_type": "code",
|
| 461 |
+
"execution_count": null,
|
| 462 |
+
"metadata": {},
|
| 463 |
+
"outputs": [],
|
| 464 |
+
"source": [
|
| 465 |
+
"from main import app\n",
|
| 466 |
+
"import os\n",
|
| 467 |
+
"\n",
|
| 468 |
+
"model_path = '../detectron2-layout-parser/model_final.pth'\n",
|
| 469 |
+
"config_path = '../detectron2-layout-parser/config.yaml'\n",
|
| 470 |
+
"\n",
|
| 471 |
+
"examples = [f'./demo-examples/{filename}' for filename in os.listdir('./demo-examples/')]\n",
|
| 472 |
+
"app(model_path=model_path, config_path=config_path, examples=examples, debug=True)"
|
| 473 |
+
]
|
| 474 |
+
},
|
| 475 |
+
{
|
| 476 |
+
"cell_type": "code",
|
| 477 |
+
"execution_count": null,
|
| 478 |
+
"metadata": {},
|
| 479 |
+
"outputs": [],
|
| 480 |
+
"source": [
|
| 481 |
+
"import os\n",
|
| 482 |
+
"from PIL import Image\n",
|
| 483 |
+
"import layoutparser as lp\n",
|
| 484 |
+
"from utils.get_features import get_features\n",
|
| 485 |
+
"\n",
|
| 486 |
+
"documents = os.listdir('./data/local-data/raw/RVL-CDIP-invoice')\n",
|
| 487 |
+
"# model_path = './model/trained_model/model_final.pth'\n",
|
| 488 |
+
"# config_path = './model/trained_model/config.yaml'\n",
|
| 489 |
+
"model_path = '../detectron2-layout-parser/model_final.pth'\n",
|
| 490 |
+
"config_path = '../detectron2-layout-parser/config.yaml'\n",
|
| 491 |
+
"label_map = {0: 'Caption', 1: 'Footnote', 2: 'Formula', 3: 'List-item', \n",
|
| 492 |
+
" 4: 'Page-footer', 5: 'Page-header', 6: 'Picture', \n",
|
| 493 |
+
" 7: 'Section-header', 8: 'Table', 9: 'Text', 10: 'Title'}\n",
|
| 494 |
+
"model = lp.Detectron2LayoutModel(\n",
|
| 495 |
+
" config_path=config_path,\n",
|
| 496 |
+
" model_path=model_path,\n",
|
| 497 |
+
" label_map=label_map)\n",
|
| 498 |
+
"\n",
|
| 499 |
+
"for document in documents[0:1]:\n",
|
| 500 |
+
" features = get_features(\n",
|
| 501 |
+
" image=Image.open(f'./data/local-data/raw/RVL-CDIP-invoice/{document}'),\n",
|
| 502 |
+
" model=model,\n",
|
| 503 |
+
" label_names=list(label_map.values()),\n",
|
| 504 |
+
" width_parts=100,\n",
|
| 505 |
+
" height_parts=100)"
|
| 506 |
+
]
|
| 507 |
+
},
|
| 508 |
+
{
|
| 509 |
+
"cell_type": "markdown",
|
| 510 |
+
"metadata": {},
|
| 511 |
+
"source": [
|
| 512 |
+
"<hr/>"
|
| 513 |
+
]
|
| 514 |
+
}
|
| 515 |
+
],
|
| 516 |
+
"metadata": {
|
| 517 |
+
"colab": {
|
| 518 |
+
"provenance": []
|
| 519 |
+
},
|
| 520 |
+
"kernelspec": {
|
| 521 |
+
"display_name": "Python 3",
|
| 522 |
+
"name": "python3"
|
| 523 |
+
},
|
| 524 |
+
"language_info": {
|
| 525 |
+
"codemirror_mode": {
|
| 526 |
+
"name": "ipython",
|
| 527 |
+
"version": 3
|
| 528 |
+
},
|
| 529 |
+
"file_extension": ".py",
|
| 530 |
+
"mimetype": "text/x-python",
|
| 531 |
+
"name": "python",
|
| 532 |
+
"nbconvert_exporter": "python",
|
| 533 |
+
"pygments_lexer": "ipython3",
|
| 534 |
+
"version": "3.10.13"
|
| 535 |
+
}
|
| 536 |
+
},
|
| 537 |
+
"nbformat": 4,
|
| 538 |
+
"nbformat_minor": 0
|
| 539 |
+
}
|