first commit
- html/front_layout.html +117 -0
- json/app_column_config.json +71 -0
- json/app_dataset_cols.json +32 -0
- json/cat_cols.json +25 -0
- json/cat_to_num_maps.json +169 -0
- json/col_names_map.json +114 -0
- json/col_names_map_sorted.json +114 -0
- json/cols_tabla_equity.json +15 -0
- json/embeddings_excluded_words.json +39 -0
- json/gamma_params.json +7 -0
- json/ignore_columns.json +96 -0
- json/industry_lists.json +169 -0
- json/nn_search_metrics.json +15 -0
- json/numeric_columns.json +86 -0
- json/semantic_search_params.json +8 -0
- requirements.txt +12 -0
- src/__init__.py +15 -0
- src/app_utils.py +69 -0
- src/env_options.py +42 -0
- src/front_dataset_handler.py +385 -0
- src/semantic_search.py +124 -0
html/front_layout.html
ADDED
@@ -0,0 +1,117 @@
+<!-- html/front_layout.html -->
+<h1 style="text-align:center;margin-bottom:15px;margin-left:10px">
+  Swift Stock Screener
+</h1>
+<p style="margin-left:10px">
+  Browse and search over 12,000 stocks. Search assets by theme, filter, sort, analyze, and get ideas to build portfolios and indices. Search by <b>ticker symbol</b> to display a list of ranked related companies. Enter any keyword in <b>thematic search</b> to search by theme. Click on <u>country names</u> or <u>GICS sectors</u> for strict filtering. <b>Reset</b> the search and <b>sort</b> all assets by any of the displayed metrics.
+
+<style>
+/* Button with constrained size */
+.small-btn {
+  /*width: 140px;*/
+  max-width: 140px;
+  /*min-width: 140px;*/
+}
+
+/* Pagination label */
+.pagination-label {
+  flex: 0 0 auto;
+  width: auto;
+  margin: 0 8px; /* small horizontal gap */
+}
+
+/* cap the Gradio table + keep pagination row below */
+.clickable-columns .dataframe-container {
+  max-height: calc(100vh - 300px); /* adjust px to match header+controls height */
+  overflow-y: auto;
+}
+
+/* Filterable columns (click on the cell) */
+.clickable-columns tbody td:nth-child(3),
+.clickable-columns tbody td:nth-child(4) {
+  color: #1a0dab;             /* link blue for light theme */
+  text-decoration: underline; /* underline */
+  cursor: pointer;            /* pointer cursor */
+}
+
+@media (prefers-color-scheme: dark) {
+  .clickable-columns tbody td:nth-child(3),
+  .clickable-columns tbody td:nth-child(4) {
+    color: #8ab4f8; /* lighter blue for dark theme */
+  }
+}
+
+.clickable-columns span.negative-value {
+  color: red;
+}
+
+/* make the table use fixed layout so width rules apply */
+.clickable-columns table {
+  table-layout: fixed;
+}
+
+/* COLUMN WIDTH CONFIGURATION */
+/* Ticker */
+.clickable-columns table th:nth-child(1),
+.clickable-columns table td:nth-child(1) {
+  min-width: 40px; max-width: 100px;
+  overflow: hidden;
+}
+.clickable-columns table th:nth-child(2),
+.clickable-columns table td:nth-child(2) {
+  min-width: 75px; max-width: 220px;
+  overflow: hidden;
+}
+.clickable-columns table th:nth-child(3),
+.clickable-columns table td:nth-child(3) {
+  min-width: 70px; max-width: 160px;
+  overflow: hidden;
+}
+.clickable-columns table th:nth-child(4),
+.clickable-columns table td:nth-child(4) {
+  min-width: 70px; max-width: 200px;
+  overflow: hidden;
+}
+.clickable-columns table th:nth-child(5),
+.clickable-columns table td:nth-child(5) {
+  min-width: 60px; max-width: 80px;
+  overflow: hidden;
+}
+/* 1yr return */
+.clickable-columns table th:nth-child(6),
+.clickable-columns table td:nth-child(6) {
+  min-width: 60px; max-width: 80px;
+  overflow: hidden;
+}
+.clickable-columns table th:nth-child(7),
+.clickable-columns table td:nth-child(7) {
+  min-width: 70px; max-width: 100px;
+  overflow: hidden;
+}
+.clickable-columns table th:nth-child(8),
+.clickable-columns table td:nth-child(8) {
+  min-width: 70px; max-width: 100px;
+  overflow: hidden;
+}
+.clickable-columns table th:nth-child(9),
+.clickable-columns table td:nth-child(9) {
+  min-width: 70px; max-width: 100px;
+  overflow: hidden;
+}
+.clickable-columns table th:nth-child(10),
+.clickable-columns table td:nth-child(10) {
+  min-width: 70px; max-width: 100px;
+  overflow: hidden;
+}
+.clickable-columns table th:nth-child(11),
+.clickable-columns table td:nth-child(11) {
+  min-width: 60px; max-width: 70px;
+  overflow: hidden;
+}
+.clickable-columns table th:nth-child(12),
+.clickable-columns table td:nth-child(12) {
+  min-width: 50px; max-width: 70px;
+  overflow: hidden;
+}
+
+</style>
json/app_column_config.json
ADDED
@@ -0,0 +1,71 @@
+{
+    "app_dataset_cols": [
+        "ticker",
+        "security",
+        "country",
+        "sector",
+        "marketCap",
+        "ret_365",
+        "vol_365",
+        "trailingPE",
+        "revenueGrowth",
+        "dividendYield",
+        "beta",
+        "beta_norm",
+        "category",
+        "country_num_norm",
+        "debtToEquity_norm",
+        "fullTimeEmployees_norm",
+        "fundFamily",
+        "fundInceptionDate",
+        "industryDisp_num_norm",
+        "marketCap_norm",
+        "netExpenseRatio",
+        "quoteType",
+        "ret_365_norm",
+        "revenueGrowth_norm",
+        "sectorDisp_num_norm",
+        "totalAssets",
+        "trailingPE_norm",
+        "vol_365_norm",
+        "longBusinessSummary",
+        "embeddings"
+    ],
+    "variables_busq_norm": [
+        "beta_norm",
+        "country_num_norm",
+        "debtToEquity_norm",
+        "fullTimeEmployees_norm",
+        "industryDisp_num_norm",
+        "marketCap_norm",
+        "ret_365_norm",
+        "revenueGrowth_norm",
+        "sectorDisp_num_norm",
+        "trailingPE_norm",
+        "vol_365_norm"
+    ],
+    "cols_tabla_equity": [
+        "ticker",
+        "security",
+        "country",
+        "sector",
+        "marketCap",
+        "ret_365",
+        "vol_365",
+        "trailingPE",
+        "revenueGrowth",
+        "dividendYield",
+        "beta"
+    ],
+    "cols_tabla_etfs": [
+        "ticker",
+        "security",
+        "category",
+        "ret_365",
+        "vol_365",
+        "totalAssets",
+        "netExpenseRatio",
+        "fundInceptionDate",
+        "fundFamily"
+    ]
+}
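These JSON configs are read back by `FrontDatasetHandler` in `src/front_dataset_handler.py` (see below), which pulls each list out by its top-level key. A minimal sketch of that loading pattern, assuming the files live under a local `json/` directory:

import json
import os

JSON_PATH = "json"  # assumed location of the config files

def load_key(filename: str, key: str):
    # Each config file wraps its payload under a single top-level key.
    with open(os.path.join(JSON_PATH, filename), "r") as f:
        return json.load(f)[key]

app_dataset_cols = load_key("app_column_config.json", "app_dataset_cols")
cols_tabla_equity = load_key("app_column_config.json", "cols_tabla_equity")
print(len(app_dataset_cols), "app columns;", len(cols_tabla_equity), "equity table columns")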
json/app_dataset_cols.json
ADDED
@@ -0,0 +1,32 @@
+{
+    "app_dataset_cols": [
+        "ticker",
+        "security",
+        "country",
+        "sector",
+        "marketCap",
+        "ret_365",
+        "vol_365",
+        "trailingPE",
+        "revenueGrowth",
+        "dividendYield",
+        "beta",
+        "industryDisp_num_norm",
+        "sectorDisp_num_norm",
+        "country_num_norm",
+        "ret_365_norm",
+        "vol_365_norm",
+        "marketCap_norm",
+        "beta_norm",
+        "revenueGrowth_norm",
+        "debtToEquity_norm",
+        "fullTimeEmployees_norm",
+        "trailingPE_norm",
+        "category",
+        "fundFamily",
+        "totalAssets",
+        "netExpenseRatio",
+        "quoteType",
+        "embeddings"
+    ]
+}
json/cat_cols.json
ADDED
@@ -0,0 +1,25 @@
+{
+    "cat_cols": [
+        "country",
+        "industryDisp",
+        "legalType",
+        "sector",
+        "state",
+        "exchange",
+        "exchangeTimezoneShortName",
+        "zip",
+        "exchangeTimezoneName",
+        "category",
+        "industryKey",
+        "currency",
+        "quoteType",
+        "industry",
+        "fullExchangeName",
+        "city",
+        "sectorKey",
+        "market",
+        "financialCurrency",
+        "recommendationKey",
+        "sectorDisp"
+    ]
+}
json/cat_to_num_maps.json
ADDED
@@ -0,0 +1,169 @@
+{
+    "sector_num_map": {
+        "Technology": 0,
+        "Healthcare": 1,
+        "Utilities": 2,
+        "Industrials": 3,
+        "Basic Materials": 4,
+        "Consumer Cyclical": 5,
+        "Consumer Defensive": 6,
+        "Energy": 7,
+        "Communication Services": 8,
+        "Financial Services": 9,
+        "Real Estate": 10
+    },
+    "dummy_num_dict": {
+        "abcd": 0,
+        "efgh": 1,
+        "ijkl": 2,
+        "mnop": 3,
+        "qrst": 4
+    },
+    "industry_num_map": {
+        "Thermal Coal": 0,
+        "Oil & Gas Integrated": 1,
+        "Oil & Gas Refining & Marketing": 2,
+        "Uranium": 3,
+        "Oil & Gas Equipment & Services": 4,
+        "Oil & Gas E&P": 5,
+        "Oil & Gas Midstream": 6,
+        "Oil & Gas Drilling": 7,
+        "Aluminum": 8,
+        "Steel": 9,
+        "Specialty Chemicals": 10,
+        "Chemicals": 11,
+        "Paper & Paper Products": 12,
+        "Lumber & Wood Production": 13,
+        "Building Materials": 14,
+        "Agricultural Inputs": 15,
+        "Other Industrial Metals & Mining": 16,
+        "Coking Coal": 17,
+        "Copper": 18,
+        "Other Precious Metals & Mining": 19,
+        "Gold": 20,
+        "Silver": 21,
+        "Marine Shipping": 22,
+        "Integrated Freight & Logistics": 23,
+        "Trucking": 24,
+        "Railroads": 25,
+        "Airlines": 26,
+        "Farm & Heavy Construction Machinery": 27,
+        "Industrial Distribution": 28,
+        "Rental & Leasing Services": 29,
+        "Aerospace & Defense": 30,
+        "Specialty Industrial Machinery": 31,
+        "Waste Management": 32,
+        "Electrical Equipment & Parts": 33,
+        "Airports & Air Services": 34,
+        "Pollution & Treatment Controls": 35,
+        "Conglomerates": 36,
+        "Tools & Accessories": 37,
+        "Engineering & Construction": 38,
+        "Metal Fabrication": 39,
+        "Building Products & Equipment": 40,
+        "Specialty Business Services": 41,
+        "Security & Protection Services": 42,
+        "Consulting Services": 43,
+        "Staffing & Employment Services": 44,
+        "Business Equipment & Supplies": 45,
+        "Infrastructure Operations": 46,
+        "Utilities - Regulated Electric": 47,
+        "Utilities - Independent Power Producers": 48,
+        "Utilities - Diversified": 49,
+        "Utilities - Regulated Gas": 50,
+        "Utilities - Renewable": 51,
+        "Utilities - Regulated Water": 52,
+        "Recreational Vehicles": 53,
+        "Auto Manufacturers": 54,
+        "Auto & Truck Dealerships": 55,
+        "Auto Parts": 56,
+        "Footwear & Accessories": 57,
+        "Apparel Manufacturing": 58,
+        "Specialty Retail": 59,
+        "Furnishings, Fixtures & Appliances": 60,
+        "Luxury Goods": 61,
+        "Internet Retail": 62,
+        "Travel Services": 63,
+        "Leisure": 64,
+        "Packaging & Containers": 65,
+        "Home Improvement Retail": 66,
+        "Apparel Retail": 67,
+        "Textile Manufacturing": 68,
+        "Department Stores": 69,
+        "Residential Construction": 70,
+        "Lodging": 71,
+        "Restaurants": 72,
+        "Gambling": 73,
+        "Personal Services": 74,
+        "Resorts & Casinos": 75,
+        "Confectioners": 76,
+        "Beverages - Non - Alcoholic": 77,
+        "Packaged Foods": 78,
+        "Food Distribution": 79,
+        "Household & Personal Products": 80,
+        "Discount Stores": 81,
+        "Grocery Stores": 82,
+        "Tobacco": 83,
+        "Beverages - Wineries & Distilleries": 84,
+        "Beverages - Brewers": 85,
+        "Farm Products": 86,
+        "Education & Training Services": 87,
+        "Electronics & Computer Distribution": 88,
+        "Computer Hardware": 89,
+        "Semiconductors": 90,
+        "Electronic Components": 91,
+        "Semiconductor Equipment & Materials": 92,
+        "Consumer Electronics": 93,
+        "Communication Equipment": 94,
+        "Scientific & Technical Instruments": 95,
+        "Information Technology Services": 96,
+        "Solar": 97,
+        "Software - Application": 98,
+        "Software - Infrastructure": 99,
+        "Broadcasting": 100,
+        "Telecom Services": 101,
+        "Advertising Agencies": 102,
+        "Entertainment": 103,
+        "Publishing": 104,
+        "Internet Content & Information": 105,
+        "Electronic Gaming & Multimedia": 106,
+        "Medical Distribution": 107,
+        "Drug Manufacturers - General": 108,
+        "Pharmaceutical Retailers": 109,
+        "Drug Manufacturers - Specialty & Generic": 110,
+        "Medical Instruments & Supplies": 111,
+        "Health Information Services": 112,
+        "Medical Devices": 113,
+        "Healthcare Plans": 114,
+        "Diagnostics & Research": 115,
+        "Biotechnology": 116,
+        "Medical Care Facilities": 117,
+        "Banks - Diversified": 118,
+        "Banks - Regional": 119,
+        "Financial Conglomerates": 120,
+        "Credit Services": 121,
+        "Insurance - Reinsurance": 122,
+        "Mortgage Finance": 123,
+        "Insurance - Diversified": 124,
+        "Capital Markets": 125,
+        "Insurance - Life": 126,
+        "Insurance - Specialty": 127,
+        "Insurance - Property & Casualty": 128,
+        "Financial Data & Stock Exchanges": 129,
+        "Insurance Brokers": 130,
+        "Asset Management": 131,
+        "Shell Companies": 132,
+        "REIT - Mortgage": 133,
+        "REIT - Healthcare Facilities": 134,
+        "REIT - Retail": 135,
+        "REIT - Diversified": 136,
+        "REIT - Residential": 137,
+        "REIT - Office": 138,
+        "REIT - Industrial": 139,
+        "REIT - Hotel & Motel": 140,
+        "Real Estate - Diversified": 141,
+        "Real Estate Services": 142,
+        "REIT - Specialty": 143,
+        "Real Estate - Development": 144
+    }
+}
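`front_dataset_handler.py` (below) turns these category labels into the `*_num` columns by mapping each label to its integer code. A minimal sketch of that idea, using the `sector_num_map` above on a toy DataFrame with a `sector` column (the handler itself applies the maps to `sectorDisp` and `industryDisp`):

import json
import pandas as pd

with open("json/cat_to_num_maps.json", "r") as f:
    maps = json.load(f)

df = pd.DataFrame({"ticker": ["AAA", "BBB"], "sector": ["Technology", "Energy"]})  # toy data
# Map each sector label to its integer code; unknown labels become NaN.
df["sector_num"] = df["sector"].map(maps["sector_num_map"])
print(df)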
json/col_names_map.json
ADDED
@@ -0,0 +1,114 @@
+{
+    "col_names_map": {
+        "52WeekChange": "52-Week Change",
+        "asset_age": "Years Listed",
+        "averageAnalystRating": "Avg. Analyst Rating",
+        "averageVolume": "Avg. Volume",
+        "beta": "Beta",
+        "beta3Year": "Beta 3-Year",
+        "bookValue": "Book Value",
+        "category": "Category",
+        "city": "City",
+        "country": "Country",
+        "currency": "Currency",
+        "currentRatio": "Current Ratio",
+        "debtToEquity": "Debt To Equity",
+        "dividendRate": "Dividend Rate",
+        "dividendYield": "Dividend Yield",
+        "earningsGrowth": "Earnings Growth",
+        "earningsQuarterlyGrowth": "Earnings Quarterly Growth",
+        "ebitda": "EBITDA",
+        "ebitdaMargins": "EBITDA Margins",
+        "enterpriseToEbitda": "Enterprise To EBITDA",
+        "enterpriseToRevenue": "Enterprise To Revenue",
+        "enterpriseValue": "Enterprise Value",
+        "epsCurrentYear": "EPS Current Year",
+        "epsForward": "EPS Forward",
+        "exchange": "Exchange",
+        "exchangeTimezoneName": "Exchange Timezone Name",
+        "exchangeTimezoneShortName": "Exchange Timezone Short Name",
+        "fiftyDayAverageChange": "50-Day Avg. Change",
+        "fiftyDayAverageChangePercent": "50-Day Avg. Change Percent",
+        "fiftyTwoWeekHighChangePercent": "52-Week High Change Percent",
+        "fiftyTwoWeekLowChange": "52-Week Low Change",
+        "fiftyTwoWeekLowChangePercent": "52-Week Low Change Percent",
+        "financialCurrency": "Financial Currency",
+        "firstTradeDateMilliseconds": "First Trade Date (ms)",
+        "fiveYearAvgDividendYield": "5-Year Avg Dividend Yield",
+        "floatShares": "Float Shares",
+        "forwardPE": "Forward PE",
+        "freeCashflow": "Free Cashflow",
+        "fullExchangeName": "Full Exchange Name",
+        "fullTimeEmployees": "Full Time Employees",
+        "fundInceptionDate": "Fund Inception Date",
+        "grossMargins": "Gross Margins",
+        "grossProfits": "Gross Profits",
+        "heldPercentInsiders": "Held Percent Insiders",
+        "heldPercentInstitutions": "Held Percent Institutions",
+        "ind_sust": "Similarity Index",
+        "industry": "GICS Industry",
+        "industryDisp": "GICS Industry",
+        "industryKey": "GICS Industry Key",
+        "legalType": "Legal Type",
+        "market": "Market",
+        "marketCap": "Market Cap",
+        "navPrice": "NAV Price",
+        "netAssets": "Net Assets",
+        "netExpenseRatio": "Net Expense Ratio",
+        "netIncomeToCommon": "Net Income To Common",
+        "numberOfAnalystOpinions": "Number Of Analyst Opinions",
+        "operatingCashflow": "Operating Cashflow",
+        "operatingMargins": "Operating Margins",
+        "overallRisk": "Overall Risk",
+        "payoutRatio": "Payout Ratio",
+        "preMarketChange": "Pre-Market Change",
+        "preMarketChangePercent": "Pre-Market Change Percent",
+        "preMarketPrice": "Pre-Market Price",
+        "previousClose": "Previous Close",
+        "priceEpsCurrentYear": "Price/EPS Current Year",
+        "priceToBook": "Price To Book",
+        "priceToSalesTrailing12Months": "Price To Sales Trailing 12 Months",
+        "profitMargins": "Profit Margins",
+        "quickRatio": "Quick Ratio",
+        "quoteType": "Quote Type",
+        "recommendationKey": "Recommendation Key",
+        "recommendationMean": "Recommendation Mean",
+        "regularMarketChangePercent": "Regular Market Change Percent",
+        "ret_365": "1yr Return",
+        "returnOnAssets": "Return On Assets",
+        "returnOnEquity": "Return On Equity",
+        "revenueGrowth": "Revenue Growth",
+        "revenuePerShare": "Revenue Per Share",
+        "sector": "GICS Sector",
+        "sectorDisp": "GICS Sector",
+        "sectorKey": "GICS Sector Key",
+        "security": "Name",
+        "sharesOutstanding": "Shares Outstanding",
+        "sharesPercentSharesOut": "Shares Percent Shares Out",
+        "sharesShort": "Shares Short",
+        "sharesShortPriorMonth": "Shares Short Prior Month",
+        "shortPercentOfFloat": "Short Percent Of Float",
+        "shortRatio": "Short Ratio",
+        "state": "State",
+        "threeYearAverageReturn": "3yr Avg. Return",
+        "ticker": "Ticker",
+        "totalAssets": "Total Assets",
+        "totalCash": "Total Cash",
+        "totalCashPerShare": "Total Cash Per Share",
+        "totalDebt": "Total Debt",
+        "totalRevenue": "Total Revenue",
+        "trailingAnnualDividendRate": "Trailing Annual Dividend Rate",
+        "trailingAnnualDividendYield": "Trailing Annual Dividend Yield",
+        "trailingEps": "Trailing EPS",
+        "trailingPE": "Trailing PE",
+        "trailingPegRatio": "Trailing PEG Ratio",
+        "trailingThreeMonthNavReturns": "Trailing 3-Month NAV Returns",
+        "trailingThreeMonthReturns": "Trailing 3-Month Returns",
+        "twoHundredDayAverageChange": "200-Day Avg. Change",
+        "twoHundredDayAverageChangePercent": "200-Day Avg. Change Percent",
+        "vol_365": "Volatility",
+        "yield": "Yield",
+        "ytdReturn": "YTD Return",
+        "zip": "Zip"
+    }
+}
json/col_names_map_sorted.json
ADDED
@@ -0,0 +1,114 @@
+{
+    "col_names_map": {
+        "52WeekChange": "52-Week Change",
+        "asset_age": "Years Listed",
+        "averageAnalystRating": "Avg. Analyst Rating",
+        "averageVolume": "Avg. Volume",
+        "beta": "Beta",
+        "beta3Year": "Beta 3-Year",
+        "bookValue": "Book Value",
+        "category": "Category",
+        "city": "City",
+        "country": "Country",
+        "currency": "Currency",
+        "currentRatio": "Current Ratio",
+        "debtToEquity": "Debt To Equity",
+        "dividendRate": "Dividend Rate",
+        "dividendYield": "Dividend Yield",
+        "earningsGrowth": "Earnings Growth",
+        "earningsQuarterlyGrowth": "Earnings Quarterly Growth",
+        "ebitda": "EBITDA",
+        "ebitdaMargins": "EBITDA Margins",
+        "enterpriseToEbitda": "Enterprise To EBITDA",
+        "enterpriseToRevenue": "Enterprise To Revenue",
+        "enterpriseValue": "Enterprise Value",
+        "epsCurrentYear": "EPS Current Year",
+        "epsForward": "EPS Forward",
+        "exchange": "Exchange",
+        "exchangeTimezoneName": "Exchange Timezone Name",
+        "exchangeTimezoneShortName": "Exchange Timezone Short Name",
+        "fiftyDayAverageChange": "50-Day Avg. Change",
+        "fiftyDayAverageChangePercent": "50-Day Avg. Change Percent",
+        "fiftyTwoWeekHighChangePercent": "52-Week High Change Percent",
+        "fiftyTwoWeekLowChange": "52-Week Low Change",
+        "fiftyTwoWeekLowChangePercent": "52-Week Low Change Percent",
+        "financialCurrency": "Financial Currency",
+        "firstTradeDateMilliseconds": "First Trade Date (ms)",
+        "fiveYearAvgDividendYield": "5-Year Avg Dividend Yield",
+        "floatShares": "Float Shares",
+        "forwardPE": "Forward PE",
+        "freeCashflow": "Free Cashflow",
+        "fullExchangeName": "Full Exchange Name",
+        "fullTimeEmployees": "Full Time Employees",
+        "fundInceptionDate": "Fund Inception Date",
+        "grossMargins": "Gross Margins",
+        "grossProfits": "Gross Profits",
+        "heldPercentInsiders": "Held Percent Insiders",
+        "heldPercentInstitutions": "Held Percent Institutions",
+        "ind_sust": "Similarity Index",
+        "industry": "Sector",
+        "industryDisp": "Industry",
+        "industryKey": "Industry Key",
+        "legalType": "Legal Type",
+        "market": "Market",
+        "marketCap": "Market Cap",
+        "navPrice": "NAV Price",
+        "netAssets": "Net Assets",
+        "netExpenseRatio": "Net Expense Ratio",
+        "netIncomeToCommon": "Net Income To Common",
+        "numberOfAnalystOpinions": "Number Of Analyst Opinions",
+        "operatingCashflow": "Operating Cashflow",
+        "operatingMargins": "Operating Margins",
+        "overallRisk": "Overall Risk",
+        "payoutRatio": "Payout Ratio",
+        "preMarketChange": "Pre-Market Change",
+        "preMarketChangePercent": "Pre-Market Change Percent",
+        "preMarketPrice": "Pre-Market Price",
+        "previousClose": "Previous Close",
+        "priceEpsCurrentYear": "Price/EPS Current Year",
+        "priceToBook": "Price To Book",
+        "priceToSalesTrailing12Months": "Price To Sales Trailing 12 Months",
+        "profitMargins": "Profit Margins",
+        "quickRatio": "Quick Ratio",
+        "quoteType": "Quote Type",
+        "recommendationKey": "Recommendation Key",
+        "recommendationMean": "Recommendation Mean",
+        "regularMarketChangePercent": "Regular Market Change Percent",
+        "ret_365": "1yr Return",
+        "returnOnAssets": "Return On Assets",
+        "returnOnEquity": "Return On Equity",
+        "revenueGrowth": "Revenue Growth",
+        "revenuePerShare": "Revenue Per Share",
+        "sector": "Sector",
+        "sectorDisp": "Sector",
+        "sectorKey": "Sector Key",
+        "security": "Name",
+        "sharesOutstanding": "Shares Outstanding",
+        "sharesPercentSharesOut": "Shares Percent Shares Out",
+        "sharesShort": "Shares Short",
+        "sharesShortPriorMonth": "Shares Short Prior Month",
+        "shortPercentOfFloat": "Short Percent Of Float",
+        "shortRatio": "Short Ratio",
+        "state": "State",
+        "threeYearAverageReturn": "3yr Avg. Return",
+        "ticker": "Ticker",
+        "totalAssets": "Total Assets",
+        "totalCash": "Total Cash",
+        "totalCashPerShare": "Total Cash Per Share",
+        "totalDebt": "Total Debt",
+        "totalRevenue": "Total Revenue",
+        "trailingAnnualDividendRate": "Trailing Annual Dividend Rate",
+        "trailingAnnualDividendYield": "Trailing Annual Dividend Yield",
+        "trailingEps": "Trailing EPS",
+        "trailingPE": "Trailing PE",
+        "trailingPegRatio": "Trailing PEG Ratio",
+        "trailingThreeMonthNavReturns": "Trailing 3-Month NAV Returns",
+        "trailingThreeMonthReturns": "Trailing 3-Month Returns",
+        "twoHundredDayAverageChange": "200-Day Avg. Change",
+        "twoHundredDayAverageChangePercent": "200-Day Avg. Change Percent",
+        "vol_365": "Volatility",
+        "yield": "Yield",
+        "ytdReturn": "YTD Return",
+        "zip": "Zip"
+    }
+}
json/cols_tabla_equity.json
ADDED
@@ -0,0 +1,15 @@
+{
+    "cols_tabla_equity": [
+        "ticker",
+        "security",
+        "country",
+        "sector",
+        "marketCap",
+        "ret_365",
+        "vol_365",
+        "trailingPE",
+        "revenueGrowth",
+        "dividendYield",
+        "beta"
+    ]
+}
json/embeddings_excluded_words.json
ADDED
@@ -0,0 +1,39 @@
+{
+    "excluded_words": [
+        "us",
+        "china",
+        "japan",
+        "russia",
+        "india",
+        "europe",
+        "company",
+        "operates",
+        "provides",
+        "offers",
+        "headquartered",
+        "based",
+        "incorporated",
+        "together",
+        "founded",
+        "business",
+        "businesses",
+        "companies",
+        "customers",
+        "clients",
+        "under",
+        "co",
+        "inc",
+        "nv",
+        "ltd",
+        "limited",
+        "normal market conditions",
+        "the fund will normally invest",
+        "the fund invests",
+        "normal circumstances",
+        "at least",
+        "of the fund",
+        "seeks to achieve",
+        "through its subsidiaries",
+        "with its subsidiaries"
+    ]
+}
json/gamma_params.json
ADDED
@@ -0,0 +1,7 @@
+{
+    "shape": 1000,
+    "loc": -8,
+    "scale": 0.009,
+    "max_dist": 2,
+    "precision_cdf": 1000
+}
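`front_dataset_handler.py` imports `scipy.stats.gamma`, and `col_names_map.json` exposes `ind_sust` as a "Similarity Index", which suggests these parameters shape how a nearest-neighbour distance becomes a 0-1 similarity score. The exact use is not shown in this commit; the sketch below is only an assumption of how `shape`, `loc`, `scale`, `max_dist` and `precision_cdf` could fit together.

import json
import numpy as np
from scipy.stats import gamma

with open("json/gamma_params.json", "r") as f:
    p = json.load(f)

# Hypothetical: precompute the gamma CDF on a grid of distances in [0, max_dist]
# with precision_cdf points, then map a search distance to a similarity score.
grid = np.linspace(0, p["max_dist"], p["precision_cdf"])
cdf = gamma.cdf(grid, a=p["shape"], loc=p["loc"], scale=p["scale"])

def similarity(dist: float) -> float:
    # Smaller distance -> higher similarity; clamp to the precomputed range.
    idx = min(int(dist / p["max_dist"] * (p["precision_cdf"] - 1)), p["precision_cdf"] - 1)
    return float(1.0 - cdf[idx])

print(similarity(0.5))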
json/ignore_columns.json
ADDED
@@ -0,0 +1,96 @@
+{
+    "ignore_columns": [
+        "address1",
+        "phone",
+        "governanceEpochDate",
+        "maxAge",
+        "tradeable",
+        "SandP52WeekChange",
+        "language",
+        "region",
+        "typeDisp",
+        "quoteSourceName",
+        "esgPopulated",
+        "postMarketTime",
+        "regularMarketTime",
+        "marketState",
+        "exchangeDataDelayedBy",
+        "cryptoTradeable",
+        "postMarketChangePercent",
+        "postMarketPrice",
+        "postMarketChange",
+        "isEarningsDateEstimate",
+        "gmtOffSetMilliseconds",
+        "preMarketTime",
+        "preMarketPrice",
+        "preMarketChange",
+        "preMarketChangePercent",
+        "governanceEpochDate",
+        "compensationAsOfEpochDate",
+        "sharesShortPreviousMonthDate",
+        "dateShortInterest",
+        "dividendDate",
+        "earningsTimestamp",
+        "earningsTimestampStart",
+        "earningsTimestampEnd",
+        "earningsCallTimestampStart",
+        "earningsCallTimestampEnd",
+        "priceHint",
+        "triggerable",
+        "customPriceAlertConfidence",
+        "messageBoardId",
+        "hasPrePostMarketData",
+        "sourceInterval",
+        "open",
+        "dayLow",
+        "dayHigh",
+        "regularMarketPreviousClose",
+        "bid",
+        "ask",
+        "bidSize",
+        "askSize",
+        "regularMarketOpen",
+        "regularMarketDayLow",
+        "regularMarketDayHigh",
+        "twoHundredDayAverage",
+        "lastDividendValue",
+        "targetHighPrice",
+        "targetLowPrice",
+        "targetMeanPrice",
+        "targetMedianPrice",
+        "regularMarketPrice",
+        "regularMarketChangePercentfiftyTwoWeekLowChange",
+        "fiftyTwoWeekHighChange",
+        "fiftyTwoWeekLow",
+        "fiftyTwoWeekHigh",
+        "fiftyDayAverage",
+        "dividendRatefiftyDayAverage",
+        "regularMarketChange",
+        "exDividendDate",
+        "lastFiscalYearEnd",
+        "nextFiscalYearEnd",
+        "mostRecentQuarter",
+        "nameChangeDate",
+        "lastSplitDate",
+        "lastDividendDate",
+        "earningsCallTimestampStart",
+        "earningsCallTimestampEnd",
+        "regularMarketVolume",
+        "volume",
+        "averageDailyVolume10Day",
+        "averageDailyVolume3Month",
+        "epsTrailingTwelveMonths",
+        "averageVolume10days",
+        "auditRisk",
+        "boardRisk",
+        "compensationRisk",
+        "shareHolderRightsRisk",
+        "epsTrailingTwelveMonths",
+        "currentPrice",
+        "forwardEps",
+        "impliedSharesOutstanding",
+        "averageDailyVolume10Day",
+        "volume",
+        "fiftyTwoWeekChangePercent"
+    ]
+}
json/industry_lists.json
ADDED
@@ -0,0 +1,169 @@
+{
+    "Technology": [
+        "Consumer Electronics",
+        "Software - Infrastructure",
+        "Semiconductors",
+        "Software - Application",
+        "Semiconductor Equipment & Materials",
+        "Communication Equipment",
+        "Information Technology Services",
+        "Scientific & Technical Instruments",
+        "Computer Hardware",
+        "Electronic Components",
+        "Solar",
+        "Electronics & Computer Distribution"
+    ],
+    "Consumer Cyclical": [
+        "Internet Retail",
+        "Auto Manufacturers",
+        "Home Improvement Retail",
+        "Luxury Goods",
+        "Restaurants",
+        "Apparel Retail",
+        "Travel Services",
+        "Footwear & Accessories",
+        "Auto Parts",
+        "Furnishings, Fixtures & Appliances",
+        "Lodging",
+        "Specialty Retail",
+        "Gambling",
+        "Residential Construction",
+        "Leisure",
+        "Auto & Truck Dealerships",
+        "Personal Services",
+        "Resorts & Casinos",
+        "Packaging & Containers",
+        "Department Stores",
+        "Apparel Manufacturing",
+        "Textile Manufacturing",
+        "Recreational Vehicles"
+    ],
+    "Communication Services": [
+        "Internet Content & Information",
+        "Entertainment",
+        "Telecom Services",
+        "Electronic Gaming & Multimedia",
+        "Advertising Agencies",
+        "Publishing",
+        "Broadcasting"
+    ],
+    "Energy": [
+        "Oil & Gas Integrated",
+        "Oil & Gas Refining & Marketing",
+        "Oil & Gas E&P",
+        "Thermal Coal",
+        "Oil & Gas Midstream",
+        "Oil & Gas Equipment & Services",
+        "Uranium",
+        "Oil & Gas Drilling"
+    ],
+    "Financial Services": [
+        "Insurance - Diversified",
+        "Banks - Diversified",
+        "Credit Services",
+        "Capital Markets",
+        "Banks - Regional",
+        "Asset Management",
+        "Insurance - Property & Casualty",
+        "Financial Data & Stock Exchanges",
+        "Insurance - Life",
+        "Insurance Brokers",
+        "Insurance - Reinsurance",
+        "Mortgage Finance",
+        "Financial Conglomerates",
+        "Insurance - Specialty",
+        "Shell Companies"
+    ],
+    "Healthcare": [
+        "Drug Manufacturers - General",
+        "Healthcare Plans",
+        "Medical Devices",
+        "Medical Instruments & Supplies",
+        "Diagnostics & Research",
+        "Biotechnology",
+        "Medical Distribution",
+        "Medical Care Facilities",
+        "Drug Manufacturers - Specialty & Generic",
+        "Health Information Services",
+        "Pharmaceutical Retailers"
+    ],
+    "Consumer Defensive": [
+        "Discount Stores",
+        "Household & Personal Products",
+        "Beverages - Non - Alcoholic",
+        "Packaged Foods",
+        "Beverages - Wineries & Distilleries",
+        "Tobacco",
+        "Beverages - Brewers",
+        "Confectioners",
+        "Grocery Stores",
+        "Food Distribution",
+        "Farm Products",
+        "Education & Training Services"
+    ],
+    "Basic Materials": [
+        "Specialty Chemicals",
+        "Other Industrial Metals & Mining",
+        "Copper",
+        "Gold",
+        "Building Materials",
+        "Chemicals",
+        "Agricultural Inputs",
+        "Steel",
+        "Paper & Paper Products",
+        "Aluminum",
+        "Other Precious Metals & Mining",
+        "Lumber & Wood Production",
+        "Silver",
+        "Coking Coal"
+    ],
+    "Industrials": [
+        "Aerospace & Defense",
+        "Specialty Industrial Machinery",
+        "Farm & Heavy Construction Machinery",
+        "Electrical Equipment & Parts",
+        "Conglomerates",
+        "Railroads",
+        "Specialty Business Services",
+        "Waste Management",
+        "Integrated Freight & Logistics",
+        "Building Products & Equipment",
+        "Engineering & Construction",
+        "Industrial Distribution",
+        "Consulting Services",
+        "Rental & Leasing Services",
+        "Airports & Air Services",
+        "Infrastructure Operations",
+        "Trucking",
+        "Security & Protection Services",
+        "Marine Shipping",
+        "Airlines",
+        "Pollution & Treatment Controls",
+        "Tools & Accessories",
+        "Metal Fabrication",
+        "Staffing & Employment Services",
+        "Business Equipment & Supplies"
+    ],
+    "Utilities": [
+        "Utilities - Regulated Electric",
+        "Utilities - Diversified",
+        "Utilities - Renewable",
+        "Utilities - Independent Power Producers",
+        "Utilities - Regulated Water",
+        "Utilities - Regulated Gas"
+    ],
+    "Real Estate": [
+        "REIT - Specialty",
+        "REIT - Healthcare Facilities",
+        "REIT - Industrial",
+        "REIT - Retail",
+        "Real Estate - Diversified",
+        "Real Estate Services",
+        "REIT - Diversified",
+        "REIT - Residential",
+        "Real Estate - Development",
+        "REIT - Office",
+        "REIT - Mortgage",
+        "REIT - Hotel & Motel"
+    ]
+}
json/nn_search_metrics.json
ADDED
@@ -0,0 +1,15 @@
+{
+    "nn_search_metrics": [
+        "industryDisp_num_norm",
+        "sectorDisp_num_norm",
+        "country_num_norm",
+        "ret_365_norm",
+        "vol_365_norm",
+        "marketCap_norm",
+        "beta_norm",
+        "revenueGrowth_norm",
+        "debtToEquity_norm",
+        "fullTimeEmployees_norm",
+        "trailingPE_norm"
+    ]
+}
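These normalized columns are the feature vector for the "related companies" ranking; `front_dataset_handler.py` imports `sklearn.neighbors.NearestNeighbors` but the query code itself is not part of this diff, so the following is only a rough sketch, assuming an `app_dataset` DataFrame that already contains the `*_norm` columns and a `ticker` column:

import json
import pandas as pd
from sklearn.neighbors import NearestNeighbors

with open("json/nn_search_metrics.json", "r") as f:
    metrics = json.load(f)["nn_search_metrics"]

def related_tickers(app_dataset: pd.DataFrame, ticker: str, k: int = 10) -> list[str]:
    # Drop rows with missing features, fit on the normalized metrics, query one row.
    feats = app_dataset.dropna(subset=metrics)
    nn = NearestNeighbors(n_neighbors=k + 1).fit(feats[metrics].values)
    row = feats.loc[feats["ticker"] == ticker, metrics].values
    _, idx = nn.kneighbors(row)
    hits = feats.iloc[idx[0]]["ticker"].tolist()
    return [t for t in hits if t != ticker][:k]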
json/numeric_columns.json
ADDED
@@ -0,0 +1,86 @@
+{
+    "numeric_columns": [
+        "netAssets",
+        "threeYearAverageReturn",
+        "fiveYearAvgDividendYield",
+        "preMarketPrice",
+        "payoutRatio",
+        "heldPercentInstitutions",
+        "epsForward",
+        "sharesShort",
+        "preMarketChange",
+        "fiftyTwoWeekLowChange",
+        "enterpriseToEbitda",
+        "quickRatio",
+        "yield",
+        "operatingMargins",
+        "firstTradeDateMilliseconds",
+        "priceEpsCurrentYear",
+        "bookValue",
+        "forwardPE",
+        "profitMargins",
+        "netIncomeToCommon",
+        "priceToSalesTrailing12Months",
+        "currentRatio",
+        "ebitda",
+        "beta3Year",
+        "ebitdaMargins",
+        "trailingAnnualDividendYield",
+        "trailingThreeMonthNavReturns",
+        "sharesOutstanding",
+        "trailingPE",
+        "totalDebt",
+        "netExpenseRatio",
+        "dividendRate",
+        "totalAssets",
+        "heldPercentInsiders",
+        "trailingPegRatio",
+        "totalRevenue",
+        "totalCashPerShare",
+        "previousClose",
+        "returnOnAssets",
+        "revenuePerShare",
+        "enterpriseValue",
+        "debtToEquity",
+        "epsCurrentYear",
+        "dividendYield",
+        "revenueGrowth",
+        "52WeekChange",
+        "shortRatio",
+        "numberOfAnalystOpinions",
+        "operatingCashflow",
+        "sharesShortPriorMonth",
+        "twoHundredDayAverageChangePercent",
+        "grossProfits",
+        "sharesPercentSharesOut",
+        "overallRisk",
+        "priceToBook",
+        "trailingThreeMonthReturns",
+        "returnOnEquity",
+        "fiftyTwoWeekLowChangePercent",
+        "fullTimeEmployees",
+        "floatShares",
+        "regularMarketChangePercent",
+        "marketCap",
+        "averageVolume",
+        "trailingAnnualDividendRate",
+        "earningsGrowth",
+        "trailingEps",
+        "grossMargins",
+        "fiftyDayAverageChangePercent",
+        "shortPercentOfFloat",
+        "fiftyDayAverageChange",
+        "ytdReturn",
+        "preMarketChangePercent",
+        "earningsQuarterlyGrowth",
+        "fiftyTwoWeekHighChangePercent",
+        "freeCashflow",
+        "recommendationMean",
+        "fundInceptionDate",
+        "navPrice",
+        "beta",
+        "totalCash",
+        "enterpriseToRevenue",
+        "twoHundredDayAverageChange"
+    ]
+}
json/semantic_search_params.json
ADDED
@@ -0,0 +1,8 @@
+{
+    "semantic_search_params": {
+        "k": 2000,
+        "brevity_penalty": 0.1,
+        "reward_for_literal": 0.03,
+        "partial_match_factor": 0.8
+    }
+}
requirements.txt
ADDED
@@ -0,0 +1,12 @@
+transformers==4.44.2
+sentence-transformers
+torch
+scikit-learn
+scipy
+numpy
+pandas
+datasets
+duckdb
+pathlib
+json
+gradio
src/__init__.py
ADDED
@@ -0,0 +1,15 @@
+# src/__init__.py
+from importlib import import_module
+import sys
+
+# Aggregate core sub-modules so a caller imports the package once instead of listing each file.
+__all__ = [
+    "front_dataset_handler",
+    "env_options",
+    "semantic_search"
+]
+
+for _mod in __all__:
+    mod = import_module(f".{_mod}", __name__)
+    globals()[_mod] = mod
+    sys.modules[_mod] = mod  # Pre-register bare names so intra-package imports (e.g., `import front_dataset_handler`) succeed.
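A quick illustration of what the loop above buys the app code: after the package is imported once, the sub-modules are reachable both as attributes of `src` and under their bare names, which is what keeps `import front_dataset_handler`-style references inside the package working. A minimal sketch, assuming the repository root is on `sys.path` and the dependencies are installed:

import src

# Attribute access via the package...
handler_cls = src.front_dataset_handler.FrontDatasetHandler

# ...and the bare module name was pre-registered in sys.modules by __init__.py.
import front_dataset_handler
assert front_dataset_handler is src.front_dataset_handler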
src/app_utils.py
ADDED
@@ -0,0 +1,69 @@
+import pandas as pd
+from typing import Sequence, Any
+
+import re
+
+_NEG_COLOR = "red"
+
+def format_large_number(n, decimals=2):
+    if n >= 1e12:
+        return f'{n / 1e12:.{decimals}f} T'
+    elif n >= 1e9:
+        return f'{n / 1e9:.{decimals}f} B'
+    elif n >= 1e6:
+        return f'{n / 1e6:.{decimals}f} M'
+    else:
+        return str(n)
+
+def format_results(df: pd.DataFrame, rename_columns: dict) -> pd.DataFrame:
+    # Similarity index scaled to 100
+    if "ind_sust" in df.columns:
+        df["ind_sust"] = df["ind_sust"].apply(lambda x: "-" if pd.isna(x) else int(round(x * 100, 0)))
+    # 1 decimal
+    for col in ["trailingPE", "beta"]:
+        if col in df.columns:
+            df[col] = df[col].apply(lambda x: "-" if pd.isna(x) else f"{x:.1f}")
+
+    # 2 decimals
+    if "Search dist." in df.columns:
+        df["Search dist."] = df["Search dist."].apply(lambda n: "-" if pd.isna(n) else f"{n:.2f}")
+
+    # Large monetary amounts
+    if "marketCap" in df.columns:
+        df["marketCap"] = df["marketCap"].apply(lambda n: "-" if pd.isna(n) else format_large_number(n, 1))
+    # Percentages, 1 decimal
+    for col in ["ret_365", "revenueGrowth"]:
+        if col in df.columns:
+            df[col] = df[col].apply(lambda x: "-" if pd.isna(x) or x == 0 else f"{(x * 100):.1f}%")
+    # Percentages, 1 decimal (source value is already a percentage)
+    for col in ["dividendYield"]:
+        if col in df.columns:
+            df[col] = df[col].apply(lambda x: "-" if pd.isna(x) else f"{round(x, 1)}%")
+    # Volatility
+    if "vol_365" in df.columns:
+        df["vol_365"] = df["vol_365"].apply(lambda x: "-" if pd.isna(x) or x == 0 else f"{x:.4f}")
+
+    # Return the dataframe with its columns renamed
+    return df.rename(columns=rename_columns)
+
+
+def random_ticker(df: pd.DataFrame) -> str:
+    return df["ticker"].sample(n=1).values[0]
+
+def styler_negative_red(df: pd.DataFrame, cols: list[str] | None = None):
+    """
+    Returns a Styler that paints negative numeric values in *cols*.
+    Columns absent in *df* are ignored.
+    """
+    cols = [c for c in (cols or df.columns) if c in df.columns]
+
+    def _style(v):
+        try:
+            num = float(re.sub(r"[ %,TMB]", "", str(v)))
+            if num < 0:
+                return f"color:{_NEG_COLOR}"
+        except ValueError:
+            pass
+        return ""
+
+    return df.style.applymap(_style, subset=cols)
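A minimal sketch of how these helpers could be chained when rendering a table page, assuming the `col_names_map` from `json/col_names_map.json` as the `rename_columns` argument and a toy slice of the app dataset (note that `Styler.applymap`, used above, is renamed `Styler.map` in newer pandas releases):

import json
import pandas as pd
from src.app_utils import format_results, styler_negative_red

with open("json/col_names_map.json", "r") as f:
    col_names_map = json.load(f)["col_names_map"]

page = pd.DataFrame({
    "ticker": ["AAA", "BBB"],
    "marketCap": [1.2e9, 3.4e12],
    "ret_365": [0.15, -0.08],
})  # toy slice of the app dataset
formatted = format_results(page.copy(), col_names_map)   # pretty numbers + display names
styled = styler_negative_red(formatted, ["1yr Return"])  # negatives in red for the return column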
src/env_options.py
ADDED
@@ -0,0 +1,42 @@
+import sys
+import os
+import torch
+import transformers
+from typing import List, Dict
+
+def check_env(colab:bool=False, use_dotenv:bool=True, dotenv_path:str=None, colab_secrets:dict=None, env_tokens:List[str]=None) -> Dict[str, str]:
+    # Checking versions and GPU availability:
+    print(f"Python version: {sys.version}")
+    print(f"PyTorch version: {torch.__version__}")
+    print(f"Transformers version: {transformers.__version__}")
+    if torch.cuda.is_available():
+        print(f"CUDA device: {torch.cuda.get_device_name(0)}")
+        print(f"CUDA Version: {torch.version.cuda}")
+        print(f"FlashAttention available: {torch.backends.cuda.flash_sdp_enabled()}")
+    else:
+        print("No CUDA device available")
+
+    if use_dotenv:
+        from dotenv import load_dotenv
+        load_dotenv(dotenv_path)  # path to your dotenv file
+        print(f"Retrieving token(s) from {dotenv_path} or environment variables")
+
+    def mask_token(token, unmasked_chars=4):
+        return token[:unmasked_chars] + '*' * (len(token) - unmasked_chars*2) + token[-unmasked_chars:]
+
+    tokens = {}
+    for token_name in env_tokens or []:
+        if use_dotenv:
+            token = os.getenv(token_name)
+        elif colab:
+            token = colab_secrets.get(token_name)
+        else:
+            token = os.environ.get(token_name)
+
+        if token is None:
+            print(f"{token_name} not found in the provided .env file or environment variables")
+        else:
+            print(f"Using {token_name}: {mask_token(token)}")
+            tokens[token_name] = token
+
+    return tokens
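A short usage sketch; the secret name `HF_TOKEN` below is only an example for illustration, not something defined in this commit:

from src.env_options import check_env

# Print environment/GPU info and pull secrets from a local .env file.
# "HF_TOKEN" is a hypothetical secret name.
tokens = check_env(use_dotenv=True, dotenv_path=".env", env_tokens=["HF_TOKEN"])
hf_token = tokens.get("HF_TOKEN")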
src/front_dataset_handler.py
ADDED
@@ -0,0 +1,385 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import numpy as np
|
3 |
+
import pandas as pd
|
4 |
+
from sklearn.neighbors import NearestNeighbors
|
5 |
+
from sklearn.preprocessing import QuantileTransformer
|
6 |
+
from scipy.stats import gamma
|
7 |
+
import json
|
8 |
+
|
9 |
+
class FrontDatasetHandler:
|
10 |
+
def __init__(self, maestro: pd.DataFrame=None, precios_cierre: pd.DataFrame=None, app_dataset: pd.DataFrame=None,
|
11 |
+
json_path: str = None, pickle_path: str = None):
|
12 |
+
self.maestro = maestro
|
13 |
+
self.app_dataset = app_dataset # Dataframe preprocesado para la app
|
14 |
+
self.pickle_path = pickle_path
|
15 |
+
# Extraemos los ficheros JSON para la creación del dataset de la app si no se ha pasado como argumento:
|
16 |
+
if self.app_dataset is None and json_path is not None:
|
17 |
+
with open(os.path.join(json_path, "ignore_columns.json"), "r") as f:
|
18 |
+
self.ignore_columns = json.load(f)['ignore_columns']
|
19 |
+
print(f"ignore_columns: {self.ignore_columns}")
|
20 |
+
with open(os.path.join(json_path, "numeric_columns.json"), "r") as f:
|
21 |
+
self.numeric_columns = json.load(f)['numeric_columns']
|
22 |
+
print(f"numeric_columns: {self.numeric_columns}")
|
23 |
+
with open(os.path.join(json_path, "app_column_config.json"), "rb") as f:
|
24 |
+
self.app_dataset_cols = json.load(f)['app_dataset_cols']
|
25 |
+
print(f"app_dataset_cols: {self.app_dataset_cols}")
|
26 |
+
|
27 |
+
with open(os.path.join(json_path, "cat_to_num_maps.json"), "r") as f:
|
28 |
+
num_maps = json.load(f)
|
29 |
+
self.sector_num_map = num_maps['sector_num_map']
|
30 |
+
self.industry_num_map = num_maps['industry_num_map']
|
31 |
+
|
32 |
+
self.norm_columns = None
|
33 |
+
if maestro is not None:
|
34 |
+
maestro.drop(columns=self.ignore_columns, inplace=True, errors='ignore')
|
35 |
+
self.precios_cierre = precios_cierre # Sólo necesario cuando se requiere preprocesar el dataset para la app
|
36 |
+
self.rend_diario_log = None
|
37 |
+
self.precios_cierre_fh = None
|
38 |
+
self.rendimientos_y_volatilidad = None
|
39 |
+
self.mapeos_var_categoricas = None
|
40 |
+
self.activos_descartados = []
|
41 |
+
self.quantile_scaler = None
|
42 |
+
|
43 |
+
def filtra_y_homogeneiza(self, n_dias=366, n_dias_descartar=1, min_dias=100):
|
44 |
+
if self.precios_cierre.index.name != 'date':
|
45 |
+
self.precios_cierre.set_index('date', inplace=True)
|
46 |
+
self.precios_cierre.columns.name = 'ticker'
|
47 |
+
end_date = self.precios_cierre.index.max()
|
48 |
+
start_date = end_date - pd.Timedelta(days=n_dias)
|
49 |
+
|
50 |
+
# Filtrar datos dentro del rango de fechas
|
51 |
+
precios_cierre_fh = self.precios_cierre.loc[start_date:end_date].copy()
|
52 |
+
|
53 |
+
# Descartar los últimos n_dias_descartar
|
54 |
+
if n_dias_descartar > 0:
|
55 |
+
dates_to_drop = precios_cierre_fh.index.sort_values(ascending=False)[:n_dias_descartar]
|
56 |
+
precios_cierre_fh.drop(dates_to_drop, inplace=True)
|
57 |
+
|
58 |
+
precios_cierre_fh.ffill(axis=0, inplace=True) # Se rellenan los datos vacíos con el dato del día anterior
|
59 |
+
self.activos_descartados = precios_cierre_fh.columns[precios_cierre_fh.notna().sum(axis=0) < min_dias].tolist()
|
60 |
+
precios_cierre_fh.drop(columns=self.activos_descartados, inplace=True)
|
61 |
+
self.precios_cierre = precios_cierre_fh
|
62 |
+
return
|
63 |
+
|
64 |
+
def calcula_rendimientos_y_volatilidad(self, n_dias=365, umbral_max=0.3, umbral_min=-0.3):
|
65 |
+
end_date = self.precios_cierre.index.max()
|
66 |
+
start_date = end_date - pd.Timedelta(days=n_dias)
|
67 |
+
# Dado que la tabla no siempre incluye fechas de fin de semana o festivos, se busca la fecha más cercana anterior a start_date
|
68 |
+
if start_date not in self.precios_cierre.index:
|
69 |
+
previous_dates = self.precios_cierre.index[self.precios_cierre.index < start_date]
|
70 |
+
if len(previous_dates) > 0:
|
71 |
+
start_date = previous_dates.max()
|
72 |
+
else:
|
73 |
+
raise ValueError(f"No hay datos históricos suficientes ({n_dias}, {end_date})")
|
74 |
+
_df_rend_y_vol = self.precios_cierre.loc[start_date:end_date].copy()
|
75 |
+
|
76 |
+
_df_rend_y_vol.dropna(how='all', inplace=True) #####
|
77 |
+
# Reemplazar valores cero y negativos (errores de formato) por el siguiente valor más pequeño positivo
|
78 |
+
_df_rend_y_vol[_df_rend_y_vol <= 0] = np.nextafter(0, 1)
|
79 |
+
if self.activos_descartados:
|
80 |
+
_df_rend_y_vol = _df_rend_y_vol.drop(columns=[col for col in self.activos_descartados if col in _df_rend_y_vol.columns])
|
81 |
+
if len(_df_rend_y_vol) == 0:
|
82 |
+
raise ValueError(f"No hay datos disponibles en el rango de {n_dias} días")
|
83 |
+
|
84 |
+
|
85 |
+
_rend_diario_log = np.log(_df_rend_y_vol).diff()
|
86 |
+
_rend_diario_log = _rend_diario_log.iloc[1:] # Eliminar la primera fila
|
87 |
+
# _rend_diario_log.dropna(how='all', inplace=True)
|
88 |
+
print(f'Datos rentabilidad ({n_dias} días) con outliers: {_rend_diario_log.shape}')
|
89 |
+
# Identificar activos a descartar (outliers)
|
90 |
+
_activos_outliers = _rend_diario_log.columns[((_rend_diario_log > umbral_max) | (_rend_diario_log < umbral_min)).any()].tolist()
|
91 |
+
self.activos_descartados.extend([asset for asset in _activos_outliers if asset not in self.activos_descartados])
|
92 |
+
# Descartar activos con rentabilidades atípicas
|
93 |
+
_rend_diario_log = _rend_diario_log.loc[:, ~((_rend_diario_log > umbral_max) | (_rend_diario_log < umbral_min)).any()]
|
94 |
+
print(f'Datos rentabilidad sin outliers: {_rend_diario_log.shape}')
|
95 |
+
|
96 |
+
self.rend_diario_log = _rend_diario_log.copy()
|
97 |
+
|
98 |
+
# Inicializar rendimientos_y_volatilidad si no existe
|
99 |
+
if self.rendimientos_y_volatilidad is None:
|
100 |
+
self.rendimientos_y_volatilidad = pd.DataFrame(columns=_rend_diario_log.columns)
|
101 |
+
# print(f'INIT: Tabla rendimientos {n_dias}: {self.rendimientos_y_volatilidad.shape}')
|
102 |
+
else:
|
103 |
+
# Mantener solo los activos que están en _rend_diario_log
|
104 |
+
self.rendimientos_y_volatilidad = self.rendimientos_y_volatilidad.loc[:, _rend_diario_log.columns]
|
105 |
+
# print(f'Tabla rendimientos {n_dias}: {self.rendimientos_y_volatilidad.shape}')
|
106 |
+
|
107 |
+
# Añadir nuevas columnas para el n_dias actual
|
108 |
+
self.rendimientos_y_volatilidad.loc[f'ret_log_{n_dias}'] = np.sum(_rend_diario_log, axis=0)
|
109 |
+
self.rendimientos_y_volatilidad.loc[f'ret_{n_dias}'] = (_df_rend_y_vol.ffill().bfill().iloc[-1] / _df_rend_y_vol.ffill().bfill().iloc[0]) - 1
|
110 |
+
self.rendimientos_y_volatilidad.loc[f'vol_{n_dias}'] = _rend_diario_log.var()**0.5
|
111 |
+
|
112 |
+
return
|
113 |
+
|
114 |
+
    def cruza_maestro(self):
        _rets_y_vol_maestro = self.rendimientos_y_volatilidad.T.reset_index().copy()
        _columns_to_merge = [col for col in _rets_y_vol_maestro.columns if col not in self.maestro.columns]
        if len(_columns_to_merge) > 0:
            _maestro_v2 = self.maestro.merge(_rets_y_vol_maestro, left_on='ticker', right_on='ticker')
            _maestro_v2 = _maestro_v2.replace([float('inf'), float('-inf')], np.nan)
            self.maestro = _maestro_v2
        else:
            raise ValueError("No hay nuevas columnas para cruzar con el dataframe maestro")
        return

    def _cat_to_num_(self, df, cat, pre_map=None):
        """
        Converts a categorical column of a DataFrame into numeric values by assigning an integer to each category.
        If no mapping (`pre_map`) is given, it assigns 0 to the most frequent category, 1 to the next most frequent, and so on.
        If a mapping (`pre_map`) is given, that mapping is used for the conversion.

        Parameters
        ----------
        df : pandas.DataFrame
            DataFrame containing the categorical column to convert.
        cat : str
            Name of the categorical column to convert.
        pre_map : dict, optional
            Dictionary mapping each category to a numeric value. If not given, the mapping is generated automatically.

        Returns
        -------
        pandas.DataFrame
            DataFrame with two columns: the original categorical column and a column with the assigned numeric values.
        """
        if not pre_map:
            pivot = pd.pivot_table(df, index=[cat], aggfunc='size')
            df_sorted = pivot.sort_values(ascending=False).reset_index(name='count')
            df_sorted[cat + '_num'] = range(len(df_sorted))
        else:
            df_sorted = pd.DataFrame({cat: list(pre_map.keys()), cat + '_num': list(pre_map.values())})
        return df_sorted

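# Minimal standalone sketch of the frequency-based mapping produced by _cat_to_num_
# (the DataFrame below is illustrative only):
import pandas as pd

df = pd.DataFrame({'country': ['US', 'US', 'DE', 'US', 'DE', 'JP']})
pivot = pd.pivot_table(df, index=['country'], aggfunc='size')
mapping = pivot.sort_values(ascending=False).reset_index(name='count')
mapping['country_num'] = range(len(mapping))
print(mapping)  # US -> 0, DE -> 1, JP -> 2 (most frequent category gets 0)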
    def var_categorica_a_numerica(self, cat_cols):
        for col in cat_cols:
            if col == 'sectorDisp':
                globals()[f"pt_{col}"] = self._cat_to_num_(self.maestro, col, self.sector_num_map)
            elif col == 'industryDisp':
                globals()[f"pt_{col}"] = self._cat_to_num_(self.maestro, col, self.industry_num_map)
            else:
                globals()[f"pt_{col}"] = self._cat_to_num_(self.maestro, col)  # Build a DataFrame with the frequency-based mapping of each categorical variable
        self.mapeos_var_categoricas = [globals()[f"pt_{col}"] for col in cat_cols]  # List of DataFrames with the mappings of each of the categorical variables

        _maestro = self.maestro.copy()
        for col, pt in zip(cat_cols, self.mapeos_var_categoricas):
            _maestro[col] = _maestro[col].astype(str)
            pt[col] = pt[col].astype(str)
            # Build a dictionary with each categorical value and its numeric equivalent
            mapping_dict = dict(zip(pt[col], pt[col + '_num']))
            _maestro[col + '_num'] = _maestro[col].map(mapping_dict)
            _maestro[col + '_num'] = pd.to_numeric(_maestro[col + '_num'], errors='coerce')

        self.maestro = _maestro
        return

    def normaliza_por_cuantiles(self):
        maestro_copy = self.maestro.copy()
        numeric_columns = maestro_copy.select_dtypes(include=np.number).columns
        self.quantile_scaler = QuantileTransformer(output_distribution='uniform')
        variables_numericas = [col for col in numeric_columns if not col.endswith('_norm')]
        all_na_cols = [col for col in variables_numericas if maestro_copy[col].isna().all()]
        variables_numericas = [col for col in variables_numericas if col not in all_na_cols]
        self.norm_columns = ['{}_norm'.format(var) for var in variables_numericas]
        maestro_copy[self.norm_columns] = self.quantile_scaler.fit_transform(maestro_copy[variables_numericas])
        maestro_copy[self.norm_columns] = maestro_copy[self.norm_columns].clip(0, 1)
        self.maestro = maestro_copy
        return

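# Minimal standalone sketch of the quantile normalisation used above: each numeric
# column is mapped to its empirical quantile in [0, 1] (toy data, illustrative only):
import numpy as np
import pandas as pd
from sklearn.preprocessing import QuantileTransformer

df = pd.DataFrame({'marketCap': [1e9, 5e9, 2e10, 1e11, np.nan]})
scaler = QuantileTransformer(output_distribution='uniform', n_quantiles=4)
df['marketCap_norm'] = scaler.fit_transform(df[['marketCap']]).ravel().clip(0, 1)
print(df)  # NaN stays NaN; the rest land on their quantile in [0, 1]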
    def var_estandar_z(self):
        maestro_copy = self.maestro.copy()
        numeric_columns = maestro_copy.select_dtypes(include=np.number).columns
        variables_numericas = [col for col in numeric_columns if not col.endswith('_std')]
        variables_num_std = ['{}_std'.format(var) for var in variables_numericas]

        def estandarizar(x):
            # Standardise to a z-score: subtract the mean and divide by the standard deviation
            mean_val = x.mean()
            std_val = x.std()
            if pd.isna(std_val) or std_val == 0:
                return pd.Series(0.0, index=x.index, name=x.name)
            else:
                normalized_series = (x - mean_val) / std_val
                return normalized_series.fillna(0.0)

        normalized_data = maestro_copy[variables_numericas].apply(estandarizar, axis=0)
        maestro_copy[variables_num_std] = normalized_data
        self.maestro = maestro_copy
        return

    def configura_distr_prob(self, shape, loc, scale, max_dist, precision_cdf):
        x = np.linspace(0, max_dist, num=precision_cdf)
        y_pdf = gamma.pdf(x, shape, loc, scale)
        y_cdf = gamma.cdf(x, shape, loc, scale)
        return y_pdf, y_cdf

    def calculos_y_ajustes_dataset_activos(self):
        maestro_copy = self.maestro.copy()
        # Convert to numeric format the columns that cause problems
        for column in self.numeric_columns:
            if column in maestro_copy.columns:
                maestro_copy[column] = pd.to_numeric(maestro_copy[column], errors='coerce')
                # print(f"Columna {column} convertida a {maestro_copy[column].dtype}")
        # Standardisation of the different kinds of NaN
        # maestro_copy = maestro_copy.replace([None, np.nan, np.inf, -np.inf], pd.NA)
        # Age of the fund in years:
        if self.precios_cierre is not None and not self.precios_cierre.index.empty:
            _most_recent_date = self.precios_cierre.index.max().date()
        else:
            _most_recent_date = pd.Timestamp.today().date()
        # maestro_copy['firstTradeDateMilliseconds'] = pd.to_datetime(maestro_copy['firstTradeDateMilliseconds']).dt.date
        maestro_copy['firstTradeDateMilliseconds'] = pd.to_datetime(maestro_copy['firstTradeDateMilliseconds'], unit='ms', errors='coerce').dt.date
        maestro_copy['asset_age'] = maestro_copy['firstTradeDateMilliseconds'].apply(
            lambda x: ((_most_recent_date - x).days / 365) if pd.notnull(x) and hasattr(x, 'day') else 0
        ).astype(int)
        # Values outside these bounds are treated as data errors and set to NA
        outlier_thresholds = {
            'beta': (-100, 100),
            'dividendYield': (-1, 100),
            'fiveYearAvgDividendYield': (-1, 100),
            'trailingAnnualDividendYield': (-1, 100),
            'quickRatio': (-1, 500),
            'currentRatio': (-1, 500),
            'ebitda': (-1e12, 1e12),
            'grossProfits': (-1e12, 1e12),
        }
        for column, (lower_bound, upper_bound) in outlier_thresholds.items():
            maestro_copy.loc[(maestro_copy[column] < lower_bound) | (maestro_copy[column] > upper_bound), column] = pd.NA
        self.maestro = maestro_copy.copy()
        return

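# Minimal standalone sketch of the firstTradeDateMilliseconds -> asset_age conversion
# above (toy values; 1_092_873_600_000 ms is roughly mid-August 2004):
import pandas as pd

df = pd.DataFrame({'firstTradeDateMilliseconds': [1_092_873_600_000, None]})
first_trade = pd.to_datetime(df['firstTradeDateMilliseconds'], unit='ms', errors='coerce').dt.date
today = pd.Timestamp.today().date()
asset_age = first_trade.apply(
    lambda x: (today - x).days / 365 if pd.notnull(x) else 0
).astype(int)
print(asset_age)  # whole years since the first trade, 0 when the date is missing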
    def filtra_df_activos(self, df, isin_target, filtros, debug=False):
        '''
        LEGACY
        Returns a filtered DataFrame, without altering its order, removing unwanted characteristics, for use in the substitute-asset search application.
        The characteristics and values to filter are those of a target fund given by its ISIN.
        For example, if clean_share is False in filtros, the final DataFrame will not include any more assets with the same clean_share value as the target ISIN.
        Arguments:
            df (pandas.core.frame.DataFrame): Master DataFrame of assets
            isin_target (str): ISIN of the target fund
            # caracteristicas (list): List of str with the names of the characteristics
            filtros (dict): Dictionary whose keys are the characteristics and whose values are True if they should be kept
            debug (bool, optional): Print debugging information. Defaults to False.
        Returns:
            df_filt (pandas.core.frame.DataFrame): Filtered DataFrame
        '''
        # fondo_target = df[df['isin'] == isin_target].iloc[0]
        fondo_target = df[df['ticker'] == isin_target].iloc[0]
        if debug: print(f'Tamaño inicial: {df.shape}')

        car_numericas = ['ret_365', 'vol_365', 'marketCap', 'asset_age']

        # for feature in caracteristicas[2:]:
        for feature in list(filtros.keys()):
            value = fondo_target[feature]
            if debug: print(f'{feature} = {value}')

            # Check whether this characteristic should be filtered out
            if feature in filtros and not filtros[feature]:
                if debug: print(f'FILTRO: {feature} != {value}')
                df = df[df[feature] != value]

            # Apply additional filters for numeric variables
            if feature in car_numericas:
                if feature == 'ret_365':
                    if debug: print(f'FILTRO NUMÉRICO: {feature} > {value}')
                    df = df[df[feature] > value]
                elif feature == 'vol_365':
                    if debug: print(f'FILTRO NUMÉRICO: {feature} < {value}')
                    df = df[df[feature] < value]
                elif feature == 'asset_age':
                    if debug: print(f'FILTRO NUMÉRICO: {feature} > {value}')
                    df = df[df[feature] > value]
                elif feature == 'marketCap':
                    if debug: print(f'FILTRO NUMÉRICO: {feature} < {value}')
                    df = df[df[feature] < value]

        df_filt = df
        if debug: print(f'Tamaño final: {df_filt.shape}')
        return df_filt

    def calcula_ind_sust(self, dist, y_cdf, precision_cdf, max_dist):
        try:
            idx = int((precision_cdf / max_dist) * dist)
            idx = min(idx, len(y_cdf) - 1)
            norm_dist = y_cdf[idx]
            ind_sust = max(0.0, 1.0 - norm_dist)
        except IndexError:
            ind_sust = 0
        return ind_sust


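# Minimal standalone sketch of how a raw neighbour distance becomes the substitutability
# index above: the gamma CDF (see configura_distr_prob) maps a distance in [0, max_dist]
# to a percentile, and the index is its complement (the parameters below are illustrative):
import numpy as np
from scipy.stats import gamma

shape, loc, scale = 2.0, 0.0, 1.0        # illustrative gamma parameters
max_dist, precision_cdf = 10.0, 1000
x = np.linspace(0, max_dist, num=precision_cdf)
y_cdf = gamma.cdf(x, shape, loc, scale)

dist = 1.5                               # a hypothetical neighbour distance
idx = min(int((precision_cdf / max_dist) * dist), len(y_cdf) - 1)
ind_sust = max(0.0, 1.0 - y_cdf[idx])    # closer neighbours -> index nearer to 1
print(round(ind_sust, 3))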
    def vecinos_cercanos(self, df, variables_busq, caracteristicas, target_ticker, y_cdf, precision_cdf, max_dist, n_neighbors, filtros):
        if target_ticker not in df['ticker'].values:
            return f"Error: '{target_ticker}' no encontrado en la base de datos"
        target_row = df[df['ticker'] == target_ticker]
        if not target_row.index.isin(df.index).any():
            df = pd.concat([df, target_row], ignore_index=True)
        # print(f'DF original: {df.shape}')
        X = df[variables_busq]
        model = NearestNeighbors(n_neighbors=n_neighbors)  ##### try with more neighbours and filter afterwards #####
        model.fit(X)
        target_row = df[df['ticker'] == target_ticker][variables_busq]
        # model.kneighbors returns two 2-D arrays with the nearest neighbours and their distances:
        distances, indices = model.kneighbors(target_row)
        # combined_columns = list(set(caracteristicas + variables_busq))
        neighbors_df = df.iloc[indices[0]][caracteristicas]
        neighbors_df['distance'] = distances[0]
        ind_sust = np.array([self.calcula_ind_sust(dist, y_cdf, precision_cdf, max_dist) for dist in distances[0]])

        neighbors_df['ind_sust'] = ind_sust
        neighbors_df = neighbors_df.sort_values(by='distance', ascending=True)
        target_row = neighbors_df[neighbors_df['ticker'] == target_ticker]

        # Apply the exclusion filters:
        ### Code pending removal/modification (legacy from the funds application)
        neighbors_df = self.filtra_df_activos(df=neighbors_df, isin_target=target_ticker, filtros=filtros)
        ####################

        # If the filters removed the selected asset, put it back in the first position of the DataFrame:
        if not target_row.index.isin(neighbors_df.index).any():
            neighbors_df = pd.concat([pd.DataFrame(target_row), neighbors_df], ignore_index=True)
        # print(f'DF filtrado: {neighbors_df.shape}')
        # Use the ticker as the index:
        neighbors_df.set_index('ticker', inplace=True)
        return neighbors_df

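# Minimal standalone sketch of the NearestNeighbors call that drives the search above
# (toy 2-D feature matrix; column names are illustrative):
import pandas as pd
from sklearn.neighbors import NearestNeighbors

X = pd.DataFrame({'ret_365_norm': [0.10, 0.20, 0.80, 0.85],
                  'vol_365_norm': [0.30, 0.25, 0.70, 0.72]})
model = NearestNeighbors(n_neighbors=3)
model.fit(X)
distances, indices = model.kneighbors(X.iloc[[2]])  # neighbours of the third row
print(indices[0], distances[0])  # the query row itself comes back first with distance 0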
    def format_large_number(self, n, decimals=2):
        if n >= 1e12:
            return f'{n / 1e12:.{decimals}f} T'
        elif n >= 1e9:
            return f'{n / 1e9:.{decimals}f} B'
        elif n >= 1e6:
            return f'{n / 1e6:.{decimals}f} M'
        else:
            return str(n)

    def trae_embeddings_desde_pkl(self, embeddings_df_file_name='df_with_embeddings.pkl', embeddings_col_name='embeddings'):
        embeddings_df = pd.read_pickle(os.path.join(self.pickle_path, embeddings_df_file_name))
        self.maestro = self.maestro.merge(
            embeddings_df[['ticker', embeddings_col_name]],
            on='ticker',
            how='left'
        )
        print(f"Agregados embeddings {self.maestro.shape}")
        return

    def procesa_app_dataset(self, periodo=366, n_dias_descartar=1, min_dias=250, umbrales_rend=(-0.3, +0.3), periodos_metricas=[60, 365],
                            cat_cols=['industryDisp', 'sectorDisp', 'country', 'city', 'exchange', 'financialCurrency', 'quoteType'],
                            embeddings_df_file_name='df_with_embeddings.pkl', embeddings_col_name='embeddings'):
        if self.app_dataset is not None:
            print("app_dataset already exists, skipping processing")
            return

        self.filtra_y_homogeneiza(n_dias=periodo, n_dias_descartar=n_dias_descartar, min_dias=min_dias)

        for periodo_metricas in periodos_metricas:
            self.calcula_rendimientos_y_volatilidad(n_dias=periodo_metricas, umbral_max=umbrales_rend[1], umbral_min=umbrales_rend[0])
        self.cruza_maestro()
        self.var_categorica_a_numerica(cat_cols)

        self.calculos_y_ajustes_dataset_activos()
        self.normaliza_por_cuantiles()
        self.trae_embeddings_desde_pkl(embeddings_df_file_name=embeddings_df_file_name, embeddings_col_name=embeddings_col_name)
        app_dataset = self.maestro.copy()
        app_dataset = app_dataset.fillna({col: 0.5 for col in self.norm_columns})
        # Final column filtering to reduce the dataset:
        self.app_dataset = app_dataset[self.app_dataset_cols].copy()
        print(f"app_dataset preparado: {self.app_dataset.shape}")
        return
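# Minimal end-to-end sketch of how the pipeline above might be driven. The class name
# `FrontDatasetHandler` and its construction are assumptions for illustration only;
# the real entry point is wired up elsewhere in the app:
#
#   from src.front_dataset_handler import FrontDatasetHandler
#
#   fdh = FrontDatasetHandler(...)   # hypothetical construction
#   fdh.procesa_app_dataset()        # returns/volatility -> merge into maestro ->
#                                    # categorical encoding -> numeric clean-up ->
#                                    # quantile normalisation -> embeddings merge ->
#                                    # final column selection into fdh.app_dataset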
src/semantic_search.py
ADDED
@@ -0,0 +1,124 @@
import duckdb
from sentence_transformers import SentenceTransformer
import pandas as pd
import re

def duckdb_vss_local(
    model: SentenceTransformer,
    duckdb_connection: duckdb.DuckDBPyConnection,
    query: str,
    k: int = 1000,
    brevity_penalty: float = 0.0,
    reward_for_literal: float = 0.0,
    partial_match_factor: float = 0.5,
    table_name: str = "maestro_vector_table",
    embedding_column: str = "vec",
):

    query_vector = model.encode(query)
    embedding_dim = model.get_sentence_embedding_dimension()

    sql = f"""
        SELECT
            *,
            array_cosine_distance(
                {embedding_column}::float[{embedding_dim}],
                {query_vector.tolist()}::float[{embedding_dim}]
            ) as distance
        FROM {table_name}
        ORDER BY distance
        LIMIT {k}
    """
    result = duckdb_connection.sql(sql).to_df()
    # Use the "debug" parameters of the helpers below to keep the intermediate columns:
    if brevity_penalty > 0:
        result = penalize_short_summaries(result, factor=brevity_penalty, distance_column='distance',
                                          summary_column='longBusinessSummary', debug=False)
    if reward_for_literal > 0:
        result = reward_literals(result, query, factor=reward_for_literal,
                                 partial_match_factor=partial_match_factor, distance_column='distance',
                                 summary_column='longBusinessSummary', debug=False)

    return result

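# Minimal usage sketch for duckdb_vss_local, assuming an in-memory DuckDB table whose
# "vec" column holds precomputed sentence embeddings. The model name and the toy table
# below are illustrative; the real app loads maestro_vector_table elsewhere:
import duckdb
import pandas as pd
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')          # assumed embedding model
summaries = pd.DataFrame({
    'ticker': ['AAA', 'BBB'],
    'longBusinessSummary': ['Designs solar panels and batteries.',
                            'Operates a chain of coffee shops.'],
})
summaries['vec'] = model.encode(summaries['longBusinessSummary'].tolist()).tolist()

con = duckdb.connect()
dim = model.get_sentence_embedding_dimension()
con.sql(f"CREATE TABLE maestro_vector_table AS "
        f"SELECT ticker, longBusinessSummary, vec::FLOAT[{dim}] AS vec FROM summaries")

hits = duckdb_vss_local(model, con, query='renewable energy', k=2,
                        brevity_penalty=0.1, reward_for_literal=0.1)
print(hits[['ticker', 'distance']])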
def penalize_short_summaries(
    df: pd.DataFrame,
    factor: float = 0.1,
    distance_column: str = 'distance',
    summary_column: str = 'longBusinessSummary',
    debug: bool = True
) -> pd.DataFrame:

    result_df = df.copy()
    result_df['summary_length'] = result_df[summary_column].apply(
        lambda x: len(str(x)) if pd.notna(x) else 0
    )
    avg_length = max(1.0, result_df['summary_length'].mean())
    max_dist = result_df[distance_column].max()

    result_df['percent_shorter'] = result_df['summary_length'].apply(
        lambda x: max(0, (avg_length - x) / avg_length)
    )
    result_df['orig_distance'] = result_df[distance_column]
    # Penalise proportionally to how much shorter than average the summary is (scaled by factor)
    result_df[distance_column] = result_df.apply(
        lambda row: min(max_dist, row[distance_column] + (row['percent_shorter'] * factor)),
        axis=1
    )

    if not debug:
        result_df = result_df.drop(['orig_distance', 'summary_length', 'percent_shorter'], axis=1)

    result_df = result_df.sort_values(by=distance_column, ascending=True)
    return result_df

def reward_literals(
    df: pd.DataFrame,
    query: str,
    factor: float = 0.1,
    partial_match_factor: float = 0.5,
    distance_column: str = 'distance',
    summary_column: str = 'longBusinessSummary',
    debug: bool = True
) -> pd.DataFrame:

    result_df = df.copy()
    query_lower = query.lower().strip()

    def count_phrase_occurrences(summary):
        if pd.isna(summary):
            return 0
        summary_lower = str(summary).lower()

        # Count exact matches (whole words)
        exact_pattern = r'\b' + re.escape(query_lower) + r'\b'
        exact_count = len(re.findall(exact_pattern, summary_lower))

        # Count partial matches depending on the type of query
        if ' ' in query_lower:  # The query contains several words
            # For phrases, count how many times the phrase appears in the text
            partial_pattern = re.escape(query_lower)
            partial_count = len(re.findall(partial_pattern, summary_lower))
        else:
            # For single-word queries, look for the query as a substring inside words
            partial_pattern = r'\b\w*' + re.escape(query_lower) + r'\w*\b'
            partial_count = len(re.findall(partial_pattern, summary_lower))

        # Subtract the exact matches from the partial ones to avoid double counting
        partial_count = partial_count - exact_count

        # Partial matches are down-weighted:
        return exact_count + (partial_count * partial_match_factor)

    result_df['term_occurrences'] = result_df[summary_column].apply(count_phrase_occurrences)
    result_df['orig_distance'] = result_df[distance_column]
    result_df[distance_column] = result_df.apply(
        lambda row: max(0, row[distance_column] - (row['term_occurrences'] * factor)),
        axis=1
    )
    if not debug:
        result_df = result_df.drop(['orig_distance', 'term_occurrences'], axis=1)
    result_df = result_df.sort_values(by=distance_column, ascending=True)

    return result_df
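# Small illustration of the two re-ranking helpers defined above, on made-up rows:
toy = pd.DataFrame({
    'longBusinessSummary': ['Makes solar inverters and solar trackers.', 'Short note.'],
    'distance': [0.45, 0.30],
})
# The unusually short summary gets its distance increased (capped at the max distance in the frame):
print(penalize_short_summaries(toy, factor=0.3, debug=False))
# Rows that literally mention the query term get their distance reduced:
print(reward_literals(toy, query='solar', factor=0.1, debug=False))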