{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.10.14","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kaggle":{"accelerator":"gpu","dataSources":[{"sourceId":9407065,"sourceType":"datasetVersion","datasetId":5711615}],"dockerImageVersionId":30762,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":true}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"# This Python 3 environment comes with many helpful analytics libraries installed\n# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python\n# For example, here's several helpful packages to load\n\nimport numpy as np # linear algebra\nimport pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n\n# Input data files are available in the read-only \"../input/\" directory\n# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory\n\nimport os\nfor dirname, _, filenames in os.walk('/kaggle/input'):\n    for filename in filenames:\n        print(os.path.join(dirname, filename))\n\n# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using \"Save & Run All\" \n# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current 
session","metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","jupyter":{"source_hidden":true},"execution":{"iopub.status.busy":"2024-09-16T03:32:51.651185Z","iopub.execute_input":"2024-09-16T03:32:51.651963Z","iopub.status.idle":"2024-09-16T03:32:51.660806Z","shell.execute_reply.started":"2024-09-16T03:32:51.651918Z","shell.execute_reply":"2024-09-16T03:32:51.659896Z"},"trusted":true},"execution_count":26,"outputs":[{"name":"stdout","text":"/kaggle/input/ocrsampledataner/train50000ocred.csv\n/kaggle/input/ocrsampledataner/train50000.csv\n","output_type":"stream"}]},
{"cell_type":"code","source":"import pandas as pd\nimport os","metadata":{"execution":{"iopub.status.busy":"2024-09-16T03:32:51.662421Z","iopub.execute_input":"2024-09-16T03:32:51.662773Z","iopub.status.idle":"2024-09-16T03:32:51.672588Z","shell.execute_reply.started":"2024-09-16T03:32:51.662733Z","shell.execute_reply":"2024-09-16T03:32:51.671659Z"},"trusted":true},"execution_count":27,"outputs":[]},
{"cell_type":"code","source":"# Read the CSV into raw_df and leave it untouched, so the cells that derive df from it can be re-run.\ndatasetDir = '/kaggle/input/ocrsampledataner/'\nraw_df = pd.read_csv(os.path.join(datasetDir, 'train50000ocred.csv'))","metadata":{"execution":{"iopub.status.busy":"2024-09-16T03:32:51.674245Z","iopub.execute_input":"2024-09-16T03:32:51.674576Z","iopub.status.idle":"2024-09-16T03:32:52.497978Z","shell.execute_reply.started":"2024-09-16T03:32:51.674541Z","shell.execute_reply":"2024-09-16T03:32:52.496917Z"},"trusted":true},"execution_count":28,"outputs":[]},
{"cell_type":"code","source":"# Build the working frame from raw_df instead of rebinding df from itself: DataFrame.drop raises\n# KeyError for labels that are no longer present, so the old form failed when this cell was run twice.\ndropped_columns = ['Unnamed: 0', 'image_link', 'imageName', 'ocrdata', 'result']\ndf = raw_df.drop(columns = dropped_columns).reset_index(drop = True)","metadata":{"execution":{"iopub.status.busy":"2024-09-16T03:32:52.499784Z","iopub.execute_input":"2024-09-16T03:32:52.500114Z","iopub.status.idle":"2024-09-16T03:32:52.518678Z","shell.execute_reply.started":"2024-09-16T03:32:52.500080Z","shell.execute_reply":"2024-09-16T03:32:52.517724Z"},"trusted":true},"execution_count":29,"outputs":[]},
{"cell_type":"code","source":"df","metadata":{"execution":{"iopub.status.busy":"2024-09-16T03:32:52.520042Z","iopub.execute_input":"2024-09-16T03:32:52.520405Z","iopub.status.idle":"2024-09-16T03:32:52.536535Z","shell.execute_reply.started":"2024-09-16T03:32:52.520355Z","shell.execute_reply":"2024-09-16T03:32:52.535623Z"},"trusted":true},"execution_count":30,"outputs":[{"execution_count":30,"output_type":"execute_result","data":{"text/plain":" group_id entity_name entity_value \\\n0 997176 wattage 3.0 kilowatt \n1 403664 width 9.0 centimetre \n2 681445 height 11.8 inch \n3 599772 height 20.63 inch \n4 767202 item_weight 120.0 gram \n... ... ... ... \n49995 145452 height 6.0 centimetre \n49996 664736 width 6.6 inch \n49997 952470 depth 8.5 inch \n49998 459516 item_weight 550.0 milligram \n49999 653767 depth 45.0 centimetre \n\n cleandata \n0 ['3kW', '8'] \n1 ['34 cm', '9cm'] \n2 ['19.5cm(7.6in)', '30cm (11.8in)'] \n3 ['615mm/24.21in', '459mm/18.07in', '250mm/9.84... \n4 ['20Gm', '120Gms', '120Gm', '120Gm', '120Gm', ... \n... ... \n49995 ['2.36inch/6.0cm', '.77inch/4.5cm', '2.75inch/... \n49996 ['15.7\"', '2.2', '6.6\"', 'CO2'] \n49997 ['8.5\"', '15\"', '12.5\"', '8.7\"', '6.2\"'] \n49998 ['ServingSize:2 Capsules/Servings Per Containe... \n49999 ['45cm/18in', '45cm/18in', '45cm/18in', '45cm/... \n\n[50000 rows x 4 columns]","text/html":"
\n | group_id | \nentity_name | \nentity_value | \ncleandata | \n
---|---|---|---|---|
0 | \n997176 | \nwattage | \n3.0 kilowatt | \n['3kW', '8'] | \n
1 | \n403664 | \nwidth | \n9.0 centimetre | \n['34 cm', '9cm'] | \n
2 | \n681445 | \nheight | \n11.8 inch | \n['19.5cm(7.6in)', '30cm (11.8in)'] | \n
3 | \n599772 | \nheight | \n20.63 inch | \n['615mm/24.21in', '459mm/18.07in', '250mm/9.84... | \n
4 | \n767202 | \nitem_weight | \n120.0 gram | \n['20Gm', '120Gms', '120Gm', '120Gm', '120Gm', ... | \n
... | \n... | \n... | \n... | \n... | \n
49995 | \n145452 | \nheight | \n6.0 centimetre | \n['2.36inch/6.0cm', '.77inch/4.5cm', '2.75inch/... | \n
49996 | \n664736 | \nwidth | \n6.6 inch | \n['15.7\"', '2.2', '6.6\"', 'CO2'] | \n
49997 | \n952470 | \ndepth | \n8.5 inch | \n['8.5\"', '15\"', '12.5\"', '8.7\"', '6.2\"'] | \n
49998 | \n459516 | \nitem_weight | \n550.0 milligram | \n['ServingSize:2 Capsules/Servings Per Containe... | \n
49999 | \n653767 | \ndepth | \n45.0 centimetre | \n['45cm/18in', '45cm/18in', '45cm/18in', '45cm/... | \n
50000 rows × 4 columns
\n\n | group_id | \nentity_name | \nentity_value | \ncleandata | \ncleaned_data | \n
---|---|---|---|---|---|
0 | \n997176 | \nwattage | \n3.0 kilowatt | \n['3kW', '8'] | \n[['3 kw', '8']] | \n
1 | \n403664 | \nwidth | \n9.0 centimetre | \n['34 cm', '9cm'] | \n[['34 centimetre', '9 centimetre']] | \n
2 | \n681445 | \nheight | \n11.8 inch | \n['19.5cm(7.6in)', '30cm (11.8in)'] | \n[['19.5 centimetre(7.6 inch)', '30 centimetre ... | \n
3 | \n599772 | \nheight | \n20.63 inch | \n['615mm/24.21in', '459mm/18.07in', '250mm/9.84... | \n[['615 millimetre/24.21 inch', '459 millimetre... | \n
4 | \n767202 | \nitem_weight | \n120.0 gram | \n['20Gm', '120Gms', '120Gm', '120Gm', '120Gm', ... | \n[['20 gm', '120 gms', '120 gm', '120 gm', '120... | \n
... | \n... | \n... | \n... | \n... | \n... | \n
49995 | \n145452 | \nheight | \n6.0 centimetre | \n['2.36inch/6.0cm', '.77inch/4.5cm', '2.75inch/... | \n[['2.36 inch/6.0 centimetre', '.77 inch/4.5 ce... | \n
49996 | \n664736 | \nwidth | \n6.6 inch | \n['15.7\"', '2.2', '6.6\"', 'CO2'] | \n[['15.7\"', '2.2', '6.6\"', 'co2']] | \n
49997 | \n952470 | \ndepth | \n8.5 inch | \n['8.5\"', '15\"', '12.5\"', '8.7\"', '6.2\"'] | \n[['8.5\"', '15\"', '12.5\"', '8.7\"', '6.2\"']] | \n
49998 | \n459516 | \nitem_weight | \n550.0 milligram | \n['ServingSize:2 Capsules/Servings Per Containe... | \n[['servingsize:2 capsules/servings per contain... | \n
49999 | \n653767 | \ndepth | \n45.0 centimetre | \n['45cm/18in', '45cm/18in', '45cm/18in', '45cm/... | \n[['45 centimetre/18 inch', '45 centimetre/18 i... | \n
50000 rows × 5 columns
\n