reddgr commited on
Commit
8f074bc
·
1 Parent(s): e6c7897

first commit

Browse files
html/front_layout.html ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!-- html/front_layout.html -->
2
+ <h1 style="text-align:center;margin-bottom:15px;margin-left:10px">
3
+ Swift Stock Screener
4
+ </h1>
5
+ <p style="margin-left:10px">
6
+ Browse and search over 12,000 stocks. Search assets by theme, filter, sort, analyze, and get ideas to build portfolios and indices. Search by <b>ticker symbol</b> to display a list of ranked related companies. Enter any keyword in <b>thematic search</b> to search by theme. Click on <u>country names</u> or <u>GICS sectors</u> for strict filtering. <b>Reset</b> the search and <b>sort</b> all assets by any of the displayed metrics.
7
+
8
+ <style>
9
+ /* Botón de tamaño contenido */
10
+ .small-btn {
11
+ /*width: 140px;*/
12
+ max-width: 140px;
13
+ /*min-width: 140px;*/
14
+ }
15
+
16
+ /* Etiqueta de paginación */
17
+ .pagination-label {
18
+ flex: 0 0 auto;
19
+ width: auto;
20
+ margin: 0 8px; /* small horizontal gap */
21
+ }
22
+
23
+ /* cap the Gradio table + keep pagination row below */
24
+ .clickable-columns .dataframe-container {
25
+ max-height: calc(100vh - 300px); /* adjust px to match header+controls height */
26
+ overflow-y: auto;
27
+ }
28
+
29
+ /* Columnas filtrables (click en la celda) */
30
+ .clickable-columns tbody td:nth-child(3),
31
+ .clickable-columns tbody td:nth-child(4) {
32
+ color: #1a0dab; /* link blue for light theme */
33
+ text-decoration: underline; /* underline */
34
+ cursor: pointer; /* pointer cursor */
35
+ }
36
+
37
+ @media (prefers-color-scheme: dark) {
38
+ .clickable-columns tbody td:nth-child(3),
39
+ .clickable-columns tbody td:nth-child(4) {
40
+ color: #8ab4f8; /* lighter blue for dark theme */
41
+ }
42
+ }
43
+
44
+ .clickable-columns span.negative-value {
45
+ color: red;
46
+ }
47
+
48
+ /* make the table use fixed layout so width rules apply */
49
+ .clickable-columns table {
50
+ table-layout: fixed;
51
+ }
52
+
53
+ /* CONFIGURACIÓN DE ANCHO DE COLUMNAS */
54
+ /* Ticker */
55
+ .clickable-columns table th:nth-child(1),
56
+ .clickable-columns table td:nth-child(1) {
57
+ min-width: 40px; max-width: 100px;
58
+ overflow: hidden;
59
+ }
60
+ .clickable-columns table th:nth-child(2),
61
+ .clickable-columns table td:nth-child(2) {
62
+ min-width: 75px; max-width: 220px;
63
+ overflow: hidden;
64
+ }
65
+ .clickable-columns table th:nth-child(3),
66
+ .clickable-columns table td:nth-child(3) {
67
+ min-width: 70px; max-width: 160px;
68
+ overflow: hidden;
69
+ }
70
+ .clickable-columns table th:nth-child(4),
71
+ .clickable-columns table td:nth-child(4) {
72
+ min-width: 70px; max-width: 200px;
73
+ overflow: hidden;
74
+ }
75
+ .clickable-columns table th:nth-child(5),
76
+ .clickable-columns table td:nth-child(5) {
77
+ min-width: 60px; max-width: 80px;
78
+ overflow: hidden;
79
+ }
80
+ /* 1yr return */
81
+ .clickable-columns table th:nth-child(6),
82
+ .clickable-columns table td:nth-child(6) {
83
+ min-width: 60px; max-width: 80px;
84
+ overflow: hidden;
85
+ }
86
+ .clickable-columns table th:nth-child(7),
87
+ .clickable-columns table td:nth-child(7) {
88
+ min-width: 70px; max-width: 100px;
89
+ overflow: hidden;
90
+ }
91
+ .clickable-columns table th:nth-child(8),
92
+ .clickable-columns table td:nth-child(8) {
93
+ min-width: 70px; max-width: 100px;
94
+ overflow: hidden;
95
+ }
96
+ .clickable-columns table th:nth-child(9),
97
+ .clickable-columns table td:nth-child(9) {
98
+ min-width: 70px; max-width: 100px;
99
+ overflow: hidden;
100
+ }
101
+ .clickable-columns table th:nth-child(10),
102
+ .clickable-columns table td:nth-child(10) {
103
+ min-width: 70px; max-width: 100px;
104
+ overflow: hidden;
105
+ }
106
+ .clickable-columns table th:nth-child(11),
107
+ .clickable-columns table td:nth-child(11) {
108
+ min-width: 60px; max-width: 70px;
109
+ overflow: hidden;
110
+ }
111
+ .clickable-columns table th:nth-child(12),
112
+ .clickable-columns table td:nth-child(12) {
113
+ min-width: 50px; max-width: 70px;
114
+ overflow: hidden;
115
+ }
116
+
117
+ </style>
json/app_column_config.json ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "app_dataset_cols": [
3
+ "ticker",
4
+ "security",
5
+ "country",
6
+ "sector",
7
+ "marketCap",
8
+ "ret_365",
9
+ "vol_365",
10
+ "trailingPE",
11
+ "revenueGrowth",
12
+ "dividendYield",
13
+ "beta",
14
+ "beta_norm",
15
+ "category",
16
+ "country_num_norm",
17
+ "debtToEquity_norm",
18
+ "fullTimeEmployees_norm",
19
+ "fundFamily",
20
+ "fundInceptionDate",
21
+ "industryDisp_num_norm",
22
+ "marketCap_norm",
23
+ "netExpenseRatio",
24
+ "quoteType",
25
+ "ret_365_norm",
26
+ "revenueGrowth_norm",
27
+ "sectorDisp_num_norm",
28
+ "totalAssets",
29
+ "trailingPE_norm",
30
+ "vol_365_norm",
31
+ "longBusinessSummary",
32
+ "embeddings"
33
+ ],
34
+ "variables_busq_norm": [
35
+ "beta_norm",
36
+ "country_num_norm",
37
+ "debtToEquity_norm",
38
+ "fullTimeEmployees_norm",
39
+ "industryDisp_num_norm",
40
+ "marketCap_norm",
41
+ "ret_365_norm",
42
+ "revenueGrowth_norm",
43
+ "sectorDisp_num_norm",
44
+ "trailingPE_norm",
45
+ "vol_365_norm"
46
+ ],
47
+ "cols_tabla_equity": [
48
+ "ticker",
49
+ "security",
50
+ "country",
51
+ "sector",
52
+ "marketCap",
53
+ "ret_365",
54
+ "vol_365",
55
+ "trailingPE",
56
+ "revenueGrowth",
57
+ "dividendYield",
58
+ "beta"
59
+ ],
60
+ "cols_tabla_etfs": [
61
+ "ticker",
62
+ "security",
63
+ "category",
64
+ "ret_365",
65
+ "vol_365",
66
+ "totalAssets",
67
+ "netExpenseRatio",
68
+ "fundInceptionDate",
69
+ "fundFamily"
70
+ ]
71
+ }
json/app_dataset_cols.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "app_dataset_cols": [
3
+ "ticker",
4
+ "security",
5
+ "country",
6
+ "sector",
7
+ "marketCap",
8
+ "ret_365",
9
+ "vol_365",
10
+ "trailingPE",
11
+ "revenueGrowth",
12
+ "dividendYield",
13
+ "beta",
14
+ "industryDisp_num_norm",
15
+ "sectorDisp_num_norm",
16
+ "country_num_norm",
17
+ "ret_365_norm",
18
+ "vol_365_norm",
19
+ "marketCap_norm",
20
+ "beta_norm",
21
+ "revenueGrowth_norm",
22
+ "debtToEquity_norm",
23
+ "fullTimeEmployees_norm",
24
+ "trailingPE_norm",
25
+ "category",
26
+ "fundFamily",
27
+ "totalAssets",
28
+ "netExpenseRatio",
29
+ "quoteType",
30
+ "embeddings"
31
+ ]
32
+ }
json/cat_cols.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cat_cols": [
3
+ "country",
4
+ "industryDisp",
5
+ "legalType",
6
+ "sector",
7
+ "state",
8
+ "exchange",
9
+ "exchangeTimezoneShortName",
10
+ "zip",
11
+ "exchangeTimezoneName",
12
+ "category",
13
+ "industryKey",
14
+ "currency",
15
+ "quoteType",
16
+ "industry",
17
+ "fullExchangeName",
18
+ "city",
19
+ "sectorKey",
20
+ "market",
21
+ "financialCurrency",
22
+ "recommendationKey",
23
+ "sectorDisp"
24
+ ]
25
+ }
json/cat_to_num_maps.json ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "sector_num_map": {
3
+ "Technology": 0,
4
+ "Healthcare": 1,
5
+ "Utilities": 2,
6
+ "Industrials": 3,
7
+ "Basic Materials": 4,
8
+ "Consumer Cyclical": 5,
9
+ "Consumer Defensive": 6,
10
+ "Energy": 7,
11
+ "Communication Services": 8,
12
+ "Financial Services": 9,
13
+ "Real Estate": 10
14
+ },
15
+ "dummy_num_dict": {
16
+ "abcd": 0,
17
+ "efgh": 1,
18
+ "ijkl": 2,
19
+ "mnop": 3,
20
+ "qrst": 4
21
+ },
22
+ "industry_num_map": {
23
+ "Thermal Coal": 0,
24
+ "Oil & Gas Integrated": 1,
25
+ "Oil & Gas Refining & Marketing": 2,
26
+ "Uranium": 3,
27
+ "Oil & Gas Equipment & Services": 4,
28
+ "Oil & Gas E&P": 5,
29
+ "Oil & Gas Midstream": 6,
30
+ "Oil & Gas Drilling": 7,
31
+ "Aluminum": 8,
32
+ "Steel": 9,
33
+ "Specialty Chemicals": 10,
34
+ "Chemicals": 11,
35
+ "Paper & Paper Products": 12,
36
+ "Lumber & Wood Production": 13,
37
+ "Building Materials": 14,
38
+ "Agricultural Inputs": 15,
39
+ "Other Industrial Metals & Mining": 16,
40
+ "Coking Coal": 17,
41
+ "Copper": 18,
42
+ "Other Precious Metals & Mining": 19,
43
+ "Gold": 20,
44
+ "Silver": 21,
45
+ "Marine Shipping": 22,
46
+ "Integrated Freight & Logistics": 23,
47
+ "Trucking": 24,
48
+ "Railroads": 25,
49
+ "Airlines": 26,
50
+ "Farm & Heavy Construction Machinery": 27,
51
+ "Industrial Distribution": 28,
52
+ "Rental & Leasing Services": 29,
53
+ "Aerospace & Defense": 30,
54
+ "Specialty Industrial Machinery": 31,
55
+ "Waste Management": 32,
56
+ "Electrical Equipment & Parts": 33,
57
+ "Airports & Air Services": 34,
58
+ "Pollution & Treatment Controls": 35,
59
+ "Conglomerates": 36,
60
+ "Tools & Accessories": 37,
61
+ "Engineering & Construction": 38,
62
+ "Metal Fabrication": 39,
63
+ "Building Products & Equipment": 40,
64
+ "Specialty Business Services": 41,
65
+ "Security & Protection Services": 42,
66
+ "Consulting Services": 43,
67
+ "Staffing & Employment Services": 44,
68
+ "Business Equipment & Supplies": 45,
69
+ "Infrastructure Operations": 46,
70
+ "Utilities - Regulated Electric": 47,
71
+ "Utilities - Independent Power Producers": 48,
72
+ "Utilities - Diversified": 49,
73
+ "Utilities - Regulated Gas": 50,
74
+ "Utilities - Renewable": 51,
75
+ "Utilities - Regulated Water": 52,
76
+ "Recreational Vehicles": 53,
77
+ "Auto Manufacturers": 54,
78
+ "Auto & Truck Dealerships": 55,
79
+ "Auto Parts": 56,
80
+ "Footwear & Accessories": 57,
81
+ "Apparel Manufacturing": 58,
82
+ "Specialty Retail": 59,
83
+ "Furnishings, Fixtures & Appliances": 60,
84
+ "Luxury Goods": 61,
85
+ "Internet Retail": 62,
86
+ "Travel Services": 63,
87
+ "Leisure": 64,
88
+ "Packaging & Containers": 65,
89
+ "Home Improvement Retail": 66,
90
+ "Apparel Retail": 67,
91
+ "Textile Manufacturing": 68,
92
+ "Department Stores": 69,
93
+ "Residential Construction": 70,
94
+ "Lodging": 71,
95
+ "Restaurants": 72,
96
+ "Gambling": 73,
97
+ "Personal Services": 74,
98
+ "Resorts & Casinos": 75,
99
+ "Confectioners": 76,
100
+ "Beverages - Non - Alcoholic": 77,
101
+ "Packaged Foods": 78,
102
+ "Food Distribution": 79,
103
+ "Household & Personal Products": 80,
104
+ "Discount Stores": 81,
105
+ "Grocery Stores": 82,
106
+ "Tobacco": 83,
107
+ "Beverages - Wineries & Distilleries": 84,
108
+ "Beverages - Brewers": 85,
109
+ "Farm Products": 86,
110
+ "Education & Training Services": 87,
111
+ "Electronics & Computer Distribution": 88,
112
+ "Computer Hardware": 89,
113
+ "Semiconductors": 90,
114
+ "Electronic Components": 91,
115
+ "Semiconductor Equipment & Materials": 92,
116
+ "Consumer Electronics": 93,
117
+ "Communication Equipment": 94,
118
+ "Scientific & Technical Instruments": 95,
119
+ "Information Technology Services": 96,
120
+ "Solar": 97,
121
+ "Software - Application": 98,
122
+ "Software - Infrastructure": 99,
123
+ "Broadcasting": 100,
124
+ "Telecom Services": 101,
125
+ "Advertising Agencies": 102,
126
+ "Entertainment": 103,
127
+ "Publishing": 104,
128
+ "Internet Content & Information": 105,
129
+ "Electronic Gaming & Multimedia": 106,
130
+ "Medical Distribution": 107,
131
+ "Drug Manufacturers - General": 108,
132
+ "Pharmaceutical Retailers": 109,
133
+ "Drug Manufacturers - Specialty & Generic": 110,
134
+ "Medical Instruments & Supplies": 111,
135
+ "Health Information Services": 112,
136
+ "Medical Devices": 113,
137
+ "Healthcare Plans": 114,
138
+ "Diagnostics & Research": 115,
139
+ "Biotechnology": 116,
140
+ "Medical Care Facilities": 117,
141
+ "Banks - Diversified": 118,
142
+ "Banks - Regional": 119,
143
+ "Financial Conglomerates": 120,
144
+ "Credit Services": 121,
145
+ "Insurance - Reinsurance": 122,
146
+ "Mortgage Finance": 123,
147
+ "Insurance - Diversified": 124,
148
+ "Capital Markets": 125,
149
+ "Insurance - Life": 126,
150
+ "Insurance - Specialty": 127,
151
+ "Insurance - Property & Casualty": 128,
152
+ "Financial Data & Stock Exchanges": 129,
153
+ "Insurance Brokers": 130,
154
+ "Asset Management": 131,
155
+ "Shell Companies": 132,
156
+ "REIT - Mortgage": 133,
157
+ "REIT - Healthcare Facilities": 134,
158
+ "REIT - Retail": 135,
159
+ "REIT - Diversified": 136,
160
+ "REIT - Residential": 137,
161
+ "REIT - Office": 138,
162
+ "REIT - Industrial": 139,
163
+ "REIT - Hotel & Motel": 140,
164
+ "Real Estate - Diversified": 141,
165
+ "Real Estate Services": 142,
166
+ "REIT - Specialty": 143,
167
+ "Real Estate - Development": 144
168
+ }
169
+ }
json/col_names_map.json ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "col_names_map": {
3
+ "52WeekChange": "52-Week Change",
4
+ "asset_age": "Years Listed",
5
+ "averageAnalystRating": "Avg. Analyst Rating",
6
+ "averageVolume": "Avg. Volume",
7
+ "beta": "Beta",
8
+ "beta3Year": "Beta 3-Year",
9
+ "bookValue": "Book Value",
10
+ "category": "Category",
11
+ "city": "City",
12
+ "country": "Country",
13
+ "currency": "Currency",
14
+ "currentRatio": "Current Ratio",
15
+ "debtToEquity": "Debt To Equity",
16
+ "dividendRate": "Dividend Rate",
17
+ "dividendYield": "Dividend Yield",
18
+ "earningsGrowth": "Earnings Growth",
19
+ "earningsQuarterlyGrowth": "Earnings Quarterly Growth",
20
+ "ebitda": "EBITDA",
21
+ "ebitdaMargins": "EBITDA Margins",
22
+ "enterpriseToEbitda": "Enterprise To EBITDA",
23
+ "enterpriseToRevenue": "Enterprise To Revenue",
24
+ "enterpriseValue": "Enterprise Value",
25
+ "epsCurrentYear": "EPS Current Year",
26
+ "epsForward": "EPS Forward",
27
+ "exchange": "Exchange",
28
+ "exchangeTimezoneName": "Exchange Timezone Name",
29
+ "exchangeTimezoneShortName": "Exchange Timezone Short Name",
30
+ "fiftyDayAverageChange": "50-Day Avg. Change",
31
+ "fiftyDayAverageChangePercent": "50-Day Avg. Change Percent",
32
+ "fiftyTwoWeekHighChangePercent": "52-Week High Change Percent",
33
+ "fiftyTwoWeekLowChange": "52-Week Low Change",
34
+ "fiftyTwoWeekLowChangePercent": "52-Week Low Change Percent",
35
+ "financialCurrency": "Financial Currency",
36
+ "firstTradeDateMilliseconds": "First Trade Date (ms)",
37
+ "fiveYearAvgDividendYield": "5-Year Avg Dividend Yield",
38
+ "floatShares": "Float Shares",
39
+ "forwardPE": "Forward PE",
40
+ "freeCashflow": "Free Cashflow",
41
+ "fullExchangeName": "Full Exchange Name",
42
+ "fullTimeEmployees": "Full Time Employees",
43
+ "fundInceptionDate": "Fund Inception Date",
44
+ "grossMargins": "Gross Margins",
45
+ "grossProfits": "Gross Profits",
46
+ "heldPercentInsiders": "Held Percent Insiders",
47
+ "heldPercentInstitutions": "Held Percent Institutions",
48
+ "ind_sust": "Similarity Index",
49
+ "industry": "GICS Industry",
50
+ "industryDisp": "GICS Industry",
51
+ "industryKey": "GICS Industry Key",
52
+ "legalType": "Legal Type",
53
+ "market": "Market",
54
+ "marketCap": "Market Cap",
55
+ "navPrice": "NAV Price",
56
+ "netAssets": "Net Assets",
57
+ "netExpenseRatio": "Net Expense Ratio",
58
+ "netIncomeToCommon": "Net Income To Common",
59
+ "numberOfAnalystOpinions": "Number Of Analyst Opinions",
60
+ "operatingCashflow": "Operating Cashflow",
61
+ "operatingMargins": "Operating Margins",
62
+ "overallRisk": "Overall Risk",
63
+ "payoutRatio": "Payout Ratio",
64
+ "preMarketChange": "Pre-Market Change",
65
+ "preMarketChangePercent": "Pre-Market Change Percent",
66
+ "preMarketPrice": "Pre-Market Price",
67
+ "previousClose": "Previous Close",
68
+ "priceEpsCurrentYear": "Price/EPS Current Year",
69
+ "priceToBook": "Price To Book",
70
+ "priceToSalesTrailing12Months": "Price To Sales Trailing 12 Months",
71
+ "profitMargins": "Profit Margins",
72
+ "quickRatio": "Quick Ratio",
73
+ "quoteType": "Quote Type",
74
+ "recommendationKey": "Recommendation Key",
75
+ "recommendationMean": "Recommendation Mean",
76
+ "regularMarketChangePercent": "Regular Market Change Percent",
77
+ "ret_365": "1yr Return",
78
+ "returnOnAssets": "Return On Assets",
79
+ "returnOnEquity": "Return On Equity",
80
+ "revenueGrowth": "Revenue Growth",
81
+ "revenuePerShare": "Revenue Per Share",
82
+ "sector": "GICS Sector",
83
+ "sectorDisp": "GICS Sector",
84
+ "sectorKey": "GICS Sector Key",
85
+ "security": "Name",
86
+ "sharesOutstanding": "Shares Outstanding",
87
+ "sharesPercentSharesOut": "Shares Percent Shares Out",
88
+ "sharesShort": "Shares Short",
89
+ "sharesShortPriorMonth": "Shares Short Prior Month",
90
+ "shortPercentOfFloat": "Short Percent Of Float",
91
+ "shortRatio": "Short Ratio",
92
+ "state": "State",
93
+ "threeYearAverageReturn": "3yr Avg. Return",
94
+ "ticker": "Ticker",
95
+ "totalAssets": "Total Assets",
96
+ "totalCash": "Total Cash",
97
+ "totalCashPerShare": "Total Cash Per Share",
98
+ "totalDebt": "Total Debt",
99
+ "totalRevenue": "Total Revenue",
100
+ "trailingAnnualDividendRate": "Trailing Annual Dividend Rate",
101
+ "trailingAnnualDividendYield": "Trailing Annual Dividend Yield",
102
+ "trailingEps": "Trailing EPS",
103
+ "trailingPE": "Trailing PE",
104
+ "trailingPegRatio": "Trailing PEG Ratio",
105
+ "trailingThreeMonthNavReturns": "Trailing 3-Month NAV Returns",
106
+ "trailingThreeMonthReturns": "Trailing 3-Month Returns",
107
+ "twoHundredDayAverageChange": "200-Day Avg. Change",
108
+ "twoHundredDayAverageChangePercent": "200-Day Avg. Change Percent",
109
+ "vol_365": "Volatility",
110
+ "yield": "Yield",
111
+ "ytdReturn": "YTD Return",
112
+ "zip": "Zip"
113
+ }
114
+ }
json/col_names_map_sorted.json ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "col_names_map": {
3
+ "52WeekChange": "52-Week Change",
4
+ "asset_age": "Years Listed",
5
+ "averageAnalystRating": "Avg. Analyst Rating",
6
+ "averageVolume": "Avg. Volume",
7
+ "beta": "Beta",
8
+ "beta3Year": "Beta 3-Year",
9
+ "bookValue": "Book Value",
10
+ "category": "Category",
11
+ "city": "City",
12
+ "country": "Country",
13
+ "currency": "Currency",
14
+ "currentRatio": "Current Ratio",
15
+ "debtToEquity": "Debt To Equity",
16
+ "dividendRate": "Dividend Rate",
17
+ "dividendYield": "Dividend Yield",
18
+ "earningsGrowth": "Earnings Growth",
19
+ "earningsQuarterlyGrowth": "Earnings Quarterly Growth",
20
+ "ebitda": "EBITDA",
21
+ "ebitdaMargins": "EBITDA Margins",
22
+ "enterpriseToEbitda": "Enterprise To EBITDA",
23
+ "enterpriseToRevenue": "Enterprise To Revenue",
24
+ "enterpriseValue": "Enterprise Value",
25
+ "epsCurrentYear": "EPS Current Year",
26
+ "epsForward": "EPS Forward",
27
+ "exchange": "Exchange",
28
+ "exchangeTimezoneName": "Exchange Timezone Name",
29
+ "exchangeTimezoneShortName": "Exchange Timezone Short Name",
30
+ "fiftyDayAverageChange": "50-Day Avg. Change",
31
+ "fiftyDayAverageChangePercent": "50-Day Avg. Change Percent",
32
+ "fiftyTwoWeekHighChangePercent": "52-Week High Change Percent",
33
+ "fiftyTwoWeekLowChange": "52-Week Low Change",
34
+ "fiftyTwoWeekLowChangePercent": "52-Week Low Change Percent",
35
+ "financialCurrency": "Financial Currency",
36
+ "firstTradeDateMilliseconds": "First Trade Date (ms)",
37
+ "fiveYearAvgDividendYield": "5-Year Avg Dividend Yield",
38
+ "floatShares": "Float Shares",
39
+ "forwardPE": "Forward PE",
40
+ "freeCashflow": "Free Cashflow",
41
+ "fullExchangeName": "Full Exchange Name",
42
+ "fullTimeEmployees": "Full Time Employees",
43
+ "fundInceptionDate": "Fund Inception Date",
44
+ "grossMargins": "Gross Margins",
45
+ "grossProfits": "Gross Profits",
46
+ "heldPercentInsiders": "Held Percent Insiders",
47
+ "heldPercentInstitutions": "Held Percent Institutions",
48
+ "ind_sust": "Similarity Index",
49
+ "industry": "Sector",
50
+ "industryDisp": "Industry",
51
+ "industryKey": "Industry Key",
52
+ "legalType": "Legal Type",
53
+ "market": "Market",
54
+ "marketCap": "Market Cap",
55
+ "navPrice": "NAV Price",
56
+ "netAssets": "Net Assets",
57
+ "netExpenseRatio": "Net Expense Ratio",
58
+ "netIncomeToCommon": "Net Income To Common",
59
+ "numberOfAnalystOpinions": "Number Of Analyst Opinions",
60
+ "operatingCashflow": "Operating Cashflow",
61
+ "operatingMargins": "Operating Margins",
62
+ "overallRisk": "Overall Risk",
63
+ "payoutRatio": "Payout Ratio",
64
+ "preMarketChange": "Pre-Market Change",
65
+ "preMarketChangePercent": "Pre-Market Change Percent",
66
+ "preMarketPrice": "Pre-Market Price",
67
+ "previousClose": "Previous Close",
68
+ "priceEpsCurrentYear": "Price/EPS Current Year",
69
+ "priceToBook": "Price To Book",
70
+ "priceToSalesTrailing12Months": "Price To Sales Trailing 12 Months",
71
+ "profitMargins": "Profit Margins",
72
+ "quickRatio": "Quick Ratio",
73
+ "quoteType": "Quote Type",
74
+ "recommendationKey": "Recommendation Key",
75
+ "recommendationMean": "Recommendation Mean",
76
+ "regularMarketChangePercent": "Regular Market Change Percent",
77
+ "ret_365": "1yr Return",
78
+ "returnOnAssets": "Return On Assets",
79
+ "returnOnEquity": "Return On Equity",
80
+ "revenueGrowth": "Revenue Growth",
81
+ "revenuePerShare": "Revenue Per Share",
82
+ "sector": "Sector",
83
+ "sectorDisp": "Sector",
84
+ "sectorKey": "Sector Key",
85
+ "security": "Name",
86
+ "sharesOutstanding": "Shares Outstanding",
87
+ "sharesPercentSharesOut": "Shares Percent Shares Out",
88
+ "sharesShort": "Shares Short",
89
+ "sharesShortPriorMonth": "Shares Short Prior Month",
90
+ "shortPercentOfFloat": "Short Percent Of Float",
91
+ "shortRatio": "Short Ratio",
92
+ "state": "State",
93
+ "threeYearAverageReturn": "3yr Avg. Return",
94
+ "ticker": "Ticker",
95
+ "totalAssets": "Total Assets",
96
+ "totalCash": "Total Cash",
97
+ "totalCashPerShare": "Total Cash Per Share",
98
+ "totalDebt": "Total Debt",
99
+ "totalRevenue": "Total Revenue",
100
+ "trailingAnnualDividendRate": "Trailing Annual Dividend Rate",
101
+ "trailingAnnualDividendYield": "Trailing Annual Dividend Yield",
102
+ "trailingEps": "Trailing EPS",
103
+ "trailingPE": "Trailing PE",
104
+ "trailingPegRatio": "Trailing PEG Ratio",
105
+ "trailingThreeMonthNavReturns": "Trailing 3-Month NAV Returns",
106
+ "trailingThreeMonthReturns": "Trailing 3-Month Returns",
107
+ "twoHundredDayAverageChange": "200-Day Avg. Change",
108
+ "twoHundredDayAverageChangePercent": "200-Day Avg. Change Percent",
109
+ "vol_365": "Volatility",
110
+ "yield": "Yield",
111
+ "ytdReturn": "YTD Return",
112
+ "zip": "Zip"
113
+ }
114
+ }
json/cols_tabla_equity.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cols_tabla_equity": [
3
+ "ticker",
4
+ "security",
5
+ "country",
6
+ "sector",
7
+ "marketCap",
8
+ "ret_365",
9
+ "vol_365",
10
+ "trailingPE",
11
+ "revenueGrowth",
12
+ "dividendYield",
13
+ "beta"
14
+ ]
15
+ }
json/embeddings_excluded_words.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "excluded_words": [
3
+ "us",
4
+ "china",
5
+ "japan",
6
+ "russia",
7
+ "india",
8
+ "europe",
9
+ "company",
10
+ "operates",
11
+ "provides",
12
+ "offers",
13
+ "headquartered",
14
+ "based",
15
+ "incorporated",
16
+ "together",
17
+ "founded",
18
+ "business",
19
+ "businesses",
20
+ "companies",
21
+ "customers",
22
+ "clients",
23
+ "under",
24
+ "co",
25
+ "inc",
26
+ "nv",
27
+ "ltd",
28
+ "limited",
29
+ "normal market conditions",
30
+ "the fund will normally invest",
31
+ "the fund invests",
32
+ "normal circumstances",
33
+ "at least",
34
+ "of the fund",
35
+ "seeks to achieve",
36
+ "through its subsidiaries",
37
+ "with its subsidiaries"
38
+ ]
39
+ }
json/gamma_params.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "shape": 1000,
3
+ "loc": -8,
4
+ "scale": 0.009,
5
+ "max_dist": 2,
6
+ "precision_cdf": 1000
7
+ }
json/ignore_columns.json ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "ignore_columns": [
3
+ "address1",
4
+ "phone",
5
+ "governanceEpochDate",
6
+ "maxAge",
7
+ "tradeable",
8
+ "SandP52WeekChange",
9
+ "language",
10
+ "region",
11
+ "typeDisp",
12
+ "quoteSourceName",
13
+ "esgPopulated",
14
+ "postMarketTime",
15
+ "regularMarketTime",
16
+ "marketState",
17
+ "exchangeDataDelayedBy",
18
+ "cryptoTradeable",
19
+ "postMarketChangePercent",
20
+ "postMarketPrice",
21
+ "postMarketChange",
22
+ "isEarningsDateEstimate",
23
+ "gmtOffSetMilliseconds",
24
+ "preMarketTime",
25
+ "preMarketPrice",
26
+ "preMarketChange",
27
+ "preMarketChangePercent",
28
+ "governanceEpochDate",
29
+ "compensationAsOfEpochDate",
30
+ "sharesShortPreviousMonthDate",
31
+ "dateShortInterest",
32
+ "dividendDate",
33
+ "earningsTimestamp",
34
+ "earningsTimestampStart",
35
+ "earningsTimestampEnd",
36
+ "earningsCallTimestampStart",
37
+ "earningsCallTimestampEnd",
38
+ "priceHint",
39
+ "triggerable",
40
+ "customPriceAlertConfidence",
41
+ "messageBoardId",
42
+ "hasPrePostMarketData",
43
+ "sourceInterval",
44
+ "open",
45
+ "dayLow",
46
+ "dayHigh",
47
+ "regularMarketPreviousClose",
48
+ "bid",
49
+ "ask",
50
+ "bidSize",
51
+ "askSize",
52
+ "regularMarketOpen",
53
+ "regularMarketDayLow",
54
+ "regularMarketDayHigh",
55
+ "twoHundredDayAverage",
56
+ "lastDividendValue",
57
+ "targetHighPrice",
58
+ "targetLowPrice",
59
+ "targetMeanPrice",
60
+ "targetMedianPrice",
61
+ "regularMarketPrice",
62
+ "regularMarketChangePercentfiftyTwoWeekLowChange",
63
+ "fiftyTwoWeekHighChange",
64
+ "fiftyTwoWeekLow",
65
+ "fiftyTwoWeekHigh",
66
+ "fiftyDayAverage",
67
+ "dividendRatefiftyDayAverage",
68
+ "regularMarketChange",
69
+ "exDividendDate",
70
+ "lastFiscalYearEnd",
71
+ "nextFiscalYearEnd",
72
+ "mostRecentQuarter",
73
+ "nameChangeDate",
74
+ "lastSplitDate",
75
+ "lastDividendDate",
76
+ "earningsCallTimestampStart",
77
+ "earningsCallTimestampEnd",
78
+ "regularMarketVolume",
79
+ "volume",
80
+ "averageDailyVolume10Day",
81
+ "averageDailyVolume3Month",
82
+ "epsTrailingTwelveMonths",
83
+ "averageVolume10days",
84
+ "auditRisk",
85
+ "boardRisk",
86
+ "compensationRisk",
87
+ "shareHolderRightsRisk",
88
+ "epsTrailingTwelveMonths",
89
+ "currentPrice",
90
+ "forwardEps",
91
+ "impliedSharesOutstanding",
92
+ "averageDailyVolume10Day",
93
+ "volume",
94
+ "fiftyTwoWeekChangePercent"
95
+ ]
96
+ }
json/industry_lists.json ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "Technology": [
3
+ "Consumer Electronics",
4
+ "Software - Infrastructure",
5
+ "Semiconductors",
6
+ "Software - Application",
7
+ "Semiconductor Equipment & Materials",
8
+ "Communication Equipment",
9
+ "Information Technology Services",
10
+ "Scientific & Technical Instruments",
11
+ "Computer Hardware",
12
+ "Electronic Components",
13
+ "Solar",
14
+ "Electronics & Computer Distribution"
15
+ ],
16
+ "Consumer Cyclical": [
17
+ "Internet Retail",
18
+ "Auto Manufacturers",
19
+ "Home Improvement Retail",
20
+ "Luxury Goods",
21
+ "Restaurants",
22
+ "Apparel Retail",
23
+ "Travel Services",
24
+ "Footwear & Accessories",
25
+ "Auto Parts",
26
+ "Furnishings, Fixtures & Appliances",
27
+ "Lodging",
28
+ "Specialty Retail",
29
+ "Gambling",
30
+ "Residential Construction",
31
+ "Leisure",
32
+ "Auto & Truck Dealerships",
33
+ "Personal Services",
34
+ "Resorts & Casinos",
35
+ "Packaging & Containers",
36
+ "Department Stores",
37
+ "Apparel Manufacturing",
38
+ "Textile Manufacturing",
39
+ "Recreational Vehicles"
40
+ ],
41
+ "Communication Services": [
42
+ "Internet Content & Information",
43
+ "Entertainment",
44
+ "Telecom Services",
45
+ "Electronic Gaming & Multimedia",
46
+ "Advertising Agencies",
47
+ "Publishing",
48
+ "Broadcasting"
49
+ ],
50
+ "Energy": [
51
+ "Oil & Gas Integrated",
52
+ "Oil & Gas Refining & Marketing",
53
+ "Oil & Gas E&P",
54
+ "Thermal Coal",
55
+ "Oil & Gas Midstream",
56
+ "Oil & Gas Equipment & Services",
57
+ "Uranium",
58
+ "Oil & Gas Drilling"
59
+ ],
60
+ "Financial Services": [
61
+ "Insurance - Diversified",
62
+ "Banks - Diversified",
63
+ "Credit Services",
64
+ "Capital Markets",
65
+ "Banks - Regional",
66
+ "Asset Management",
67
+ "Insurance - Property & Casualty",
68
+ "Financial Data & Stock Exchanges",
69
+ "Insurance - Life",
70
+ "Insurance Brokers",
71
+ "Insurance - Reinsurance",
72
+ "Mortgage Finance",
73
+ "Financial Conglomerates",
74
+ "Insurance - Specialty",
75
+ "Shell Companies"
76
+ ],
77
+ "Healthcare": [
78
+ "Drug Manufacturers - General",
79
+ "Healthcare Plans",
80
+ "Medical Devices",
81
+ "Medical Instruments & Supplies",
82
+ "Diagnostics & Research",
83
+ "Biotechnology",
84
+ "Medical Distribution",
85
+ "Medical Care Facilities",
86
+ "Drug Manufacturers - Specialty & Generic",
87
+ "Health Information Services",
88
+ "Pharmaceutical Retailers"
89
+ ],
90
+ "Consumer Defensive": [
91
+ "Discount Stores",
92
+ "Household & Personal Products",
93
+ "Beverages - Non - Alcoholic",
94
+ "Packaged Foods",
95
+ "Beverages - Wineries & Distilleries",
96
+ "Tobacco",
97
+ "Beverages - Brewers",
98
+ "Confectioners",
99
+ "Grocery Stores",
100
+ "Food Distribution",
101
+ "Farm Products",
102
+ "Education & Training Services"
103
+ ],
104
+ "Basic Materials": [
105
+ "Specialty Chemicals",
106
+ "Other Industrial Metals & Mining",
107
+ "Copper",
108
+ "Gold",
109
+ "Building Materials",
110
+ "Chemicals",
111
+ "Agricultural Inputs",
112
+ "Steel",
113
+ "Paper & Paper Products",
114
+ "Aluminum",
115
+ "Other Precious Metals & Mining",
116
+ "Lumber & Wood Production",
117
+ "Silver",
118
+ "Coking Coal"
119
+ ],
120
+ "Industrials": [
121
+ "Aerospace & Defense",
122
+ "Specialty Industrial Machinery",
123
+ "Farm & Heavy Construction Machinery",
124
+ "Electrical Equipment & Parts",
125
+ "Conglomerates",
126
+ "Railroads",
127
+ "Specialty Business Services",
128
+ "Waste Management",
129
+ "Integrated Freight & Logistics",
130
+ "Building Products & Equipment",
131
+ "Engineering & Construction",
132
+ "Industrial Distribution",
133
+ "Consulting Services",
134
+ "Rental & Leasing Services",
135
+ "Airports & Air Services",
136
+ "Infrastructure Operations",
137
+ "Trucking",
138
+ "Security & Protection Services",
139
+ "Marine Shipping",
140
+ "Airlines",
141
+ "Pollution & Treatment Controls",
142
+ "Tools & Accessories",
143
+ "Metal Fabrication",
144
+ "Staffing & Employment Services",
145
+ "Business Equipment & Supplies"
146
+ ],
147
+ "Utilities": [
148
+ "Utilities - Regulated Electric",
149
+ "Utilities - Diversified",
150
+ "Utilities - Renewable",
151
+ "Utilities - Independent Power Producers",
152
+ "Utilities - Regulated Water",
153
+ "Utilities - Regulated Gas"
154
+ ],
155
+ "Real Estate": [
156
+ "REIT - Specialty",
157
+ "REIT - Healthcare Facilities",
158
+ "REIT - Industrial",
159
+ "REIT - Retail",
160
+ "Real Estate - Diversified",
161
+ "Real Estate Services",
162
+ "REIT - Diversified",
163
+ "REIT - Residential",
164
+ "Real Estate - Development",
165
+ "REIT - Office",
166
+ "REIT - Mortgage",
167
+ "REIT - Hotel & Motel"
168
+ ]
169
+ }
json/nn_search_metrics.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "nn_search_metrics": [
3
+ "industryDisp_num_norm",
4
+ "sectorDisp_num_norm",
5
+ "country_num_norm",
6
+ "ret_365_norm",
7
+ "vol_365_norm",
8
+ "marketCap_norm",
9
+ "beta_norm",
10
+ "revenueGrowth_norm",
11
+ "debtToEquity_norm",
12
+ "fullTimeEmployees_norm",
13
+ "trailingPE_norm"
14
+ ]
15
+ }
json/numeric_columns.json ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "numeric_columns": [
3
+ "netAssets",
4
+ "threeYearAverageReturn",
5
+ "fiveYearAvgDividendYield",
6
+ "preMarketPrice",
7
+ "payoutRatio",
8
+ "heldPercentInstitutions",
9
+ "epsForward",
10
+ "sharesShort",
11
+ "preMarketChange",
12
+ "fiftyTwoWeekLowChange",
13
+ "enterpriseToEbitda",
14
+ "quickRatio",
15
+ "yield",
16
+ "operatingMargins",
17
+ "firstTradeDateMilliseconds",
18
+ "priceEpsCurrentYear",
19
+ "bookValue",
20
+ "forwardPE",
21
+ "profitMargins",
22
+ "netIncomeToCommon",
23
+ "priceToSalesTrailing12Months",
24
+ "currentRatio",
25
+ "ebitda",
26
+ "beta3Year",
27
+ "ebitdaMargins",
28
+ "trailingAnnualDividendYield",
29
+ "trailingThreeMonthNavReturns",
30
+ "sharesOutstanding",
31
+ "trailingPE",
32
+ "totalDebt",
33
+ "netExpenseRatio",
34
+ "dividendRate",
35
+ "totalAssets",
36
+ "heldPercentInsiders",
37
+ "trailingPegRatio",
38
+ "totalRevenue",
39
+ "totalCashPerShare",
40
+ "previousClose",
41
+ "returnOnAssets",
42
+ "revenuePerShare",
43
+ "enterpriseValue",
44
+ "debtToEquity",
45
+ "epsCurrentYear",
46
+ "dividendYield",
47
+ "revenueGrowth",
48
+ "52WeekChange",
49
+ "shortRatio",
50
+ "numberOfAnalystOpinions",
51
+ "operatingCashflow",
52
+ "sharesShortPriorMonth",
53
+ "twoHundredDayAverageChangePercent",
54
+ "grossProfits",
55
+ "sharesPercentSharesOut",
56
+ "overallRisk",
57
+ "priceToBook",
58
+ "trailingThreeMonthReturns",
59
+ "returnOnEquity",
60
+ "fiftyTwoWeekLowChangePercent",
61
+ "fullTimeEmployees",
62
+ "floatShares",
63
+ "regularMarketChangePercent",
64
+ "marketCap",
65
+ "averageVolume",
66
+ "trailingAnnualDividendRate",
67
+ "earningsGrowth",
68
+ "trailingEps",
69
+ "grossMargins",
70
+ "fiftyDayAverageChangePercent",
71
+ "shortPercentOfFloat",
72
+ "fiftyDayAverageChange",
73
+ "ytdReturn",
74
+ "preMarketChangePercent",
75
+ "earningsQuarterlyGrowth",
76
+ "fiftyTwoWeekHighChangePercent",
77
+ "freeCashflow",
78
+ "recommendationMean",
79
+ "fundInceptionDate",
80
+ "navPrice",
81
+ "beta",
82
+ "totalCash",
83
+ "enterpriseToRevenue",
84
+ "twoHundredDayAverageChange"
85
+ ]
86
+ }
json/semantic_search_params.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "semantic_search_params": {
3
+ "k": 2000,
4
+ "brevity_penalty": 0.1,
5
+ "reward_for_literal": 0.03,
6
+ "partial_match_factor": 0.8
7
+ }
8
+ }
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
transformers==4.44.2
sentence-transformers
torch
scikit-learn
scipy
numpy
pandas
datasets
duckdb
gradio
# NOTE: `pathlib` and `json` were removed from this list: both are part of the
# Python standard library and must not be pip-installed. There is no `json`
# distribution on PyPI, and the PyPI `pathlib` package is an obsolete Python 2
# backport that can shadow the stdlib module.
src/__init__.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# src/__init__.py
from importlib import import_module
import sys

# Aggregate core sub-modules so a caller imports the package once instead of
# listing each file.
__all__ = [
    "front_dataset_handler",
    "env_options",
    "semantic_search",
]

for _mod_name in __all__:
    _mod = import_module(f".{_mod_name}", __name__)
    globals()[_mod_name] = _mod
    # Pre-register the bare name so intra-package imports
    # (e.g. `import front_dataset_handler`) resolve to the loaded module.
    sys.modules[_mod_name] = _mod

# Clean up the loop helpers so they do not leak as public package attributes.
del _mod_name, _mod
src/app_utils.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from typing import Sequence, Any
3
+
4
+ import re
5
+
6
+ _NEG_COLOR = "red"
7
+
8
def format_large_number(n, decimals=2):
    """Format *n* with a T/B/M suffix (e.g. ``1.5e9`` -> ``'1.50 B'``).

    Values whose magnitude is below one million are returned via ``str``.
    Negative values keep their sign in front of the suffixed figure (the
    previous version compared the signed value and always fell through to
    ``str`` for negatives).
    """
    sign = "-" if n < 0 else ""
    magnitude = abs(n)
    if magnitude >= 1e12:
        return f"{sign}{magnitude / 1e12:.{decimals}f} T"
    if magnitude >= 1e9:
        return f"{sign}{magnitude / 1e9:.{decimals}f} B"
    if magnitude >= 1e6:
        return f"{sign}{magnitude / 1e6:.{decimals}f} M"
    return str(n)
17
+
18
def format_results(df: pd.DataFrame, rename_columns: dict) -> pd.DataFrame:
    """Apply display formatting to the known metric columns and rename them.

    Missing values render as ``"-"``; for the ratio/volatility columns a
    value of exactly zero is also shown as missing. The frame is modified
    column by column and returned under the display names in
    *rename_columns*.
    """

    def _fmt(value, formatter):
        # Shared "dash for NaN, otherwise format" helper.
        return "-" if pd.isna(value) else formatter(value)

    # Substitutability index, rescaled to a 0-100 integer.
    if "ind_sust" in df.columns:
        df["ind_sust"] = df["ind_sust"].apply(
            lambda x: _fmt(x, lambda v: int(round(v * 100, 0)))
        )

    # One decimal place.
    for column in ("trailingPE", "beta"):
        if column in df.columns:
            df[column] = df[column].apply(lambda x: _fmt(x, lambda v: f"{v:.1f}"))

    # Two decimal places.
    if "Search dist." in df.columns:
        df["Search dist."] = df["Search dist."].apply(
            lambda n: _fmt(n, lambda v: f"{v:.2f}")
        )

    # Large monetary amounts.
    if "marketCap" in df.columns:
        df["marketCap"] = df["marketCap"].apply(
            lambda n: _fmt(n, lambda v: format_large_number(v, 1))
        )

    # Ratios shown as percentages, one decimal (zero treated as missing).
    for column in ("ret_365", "revenueGrowth"):
        if column in df.columns:
            df[column] = df[column].apply(
                lambda x: "-" if pd.isna(x) or x == 0 else f"{(x * 100):.1f}%"
            )

    # Values already expressed as percentages in the source data.
    if "dividendYield" in df.columns:
        df["dividendYield"] = df["dividendYield"].apply(
            lambda x: _fmt(x, lambda v: f"{round(v, 1)}%")
        )

    # Volatility, four decimals (zero treated as missing).
    if "vol_365" in df.columns:
        df["vol_365"] = df["vol_365"].apply(
            lambda x: "-" if pd.isna(x) or x == 0 else f"{x:.4f}"
        )

    # Return the frame under the display column names.
    return df.rename(columns=rename_columns)
48
+
49
+
50
def random_ticker(df: pd.DataFrame) -> str:
    """Pick one ticker symbol uniformly at random from the ``ticker`` column."""
    return df["ticker"].sample(n=1).iloc[0]
52
+
53
def styler_negative_red(df: pd.DataFrame, cols: list[str] | None = None):
    """
    Return a Styler that paints negative numeric values in *cols*.

    Cell values are parsed after stripping the display decorations added by
    ``format_results`` (spaces, '%', ',' and the T/M/B suffixes). Columns
    absent in *df* are ignored; ``cols=None`` (or an empty list) means all
    columns.
    """
    cols = [c for c in (cols or df.columns) if c in df.columns]

    def _style(v):
        try:
            num = float(re.sub(r"[ %,TMB]", "", str(v)))
            if num < 0:
                return f"color:{_NEG_COLOR}"
        except ValueError:
            # Non-numeric cells (e.g. the "-" placeholder) get no styling.
            pass
        return ""

    # pandas >= 2.1 renamed Styler.applymap to Styler.map; prefer the new
    # name to avoid the FutureWarning while staying compatible with older
    # pandas versions.
    apply_elementwise = getattr(df.style, "map", None) or df.style.applymap
    return apply_elementwise(_style, subset=cols)
src/env_options.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import os
3
+ import torch
4
+ import transformers
5
+ from typing import List, Dict
6
+
7
def check_env(colab: bool = False, use_dotenv: bool = True, dotenv_path: str | None = None,
              colab_secrets: dict | None = None, env_tokens: List[str] | None = None) -> Dict[str, str]:
    """Print runtime/GPU diagnostics and collect the requested secret tokens.

    Parameters
    ----------
    colab : when True (and dotenv is disabled), look tokens up in *colab_secrets*.
    use_dotenv : load a ``.env`` file first and read tokens from the environment.
    dotenv_path : optional path handed to ``dotenv.load_dotenv``.
    colab_secrets : mapping of secret names to values (Colab only); may be None.
    env_tokens : names of the tokens to retrieve.

    Returns
    -------
    dict mapping each *found* token name to its (unmasked) value; missing
    tokens are reported on stdout and omitted.
    """
    # Checking versions and GPU availability:
    print(f"Python version: {sys.version}")
    print(f"PyTorch version: {torch.__version__}")
    print(f"Transformers version: {transformers.__version__}")
    if torch.cuda.is_available():
        print(f"CUDA device: {torch.cuda.get_device_name(0)}")
        print(f"CUDA Version: {torch.version.cuda}")
        print(f"FlashAttention available: {torch.backends.cuda.flash_sdp_enabled()}")
    else:
        print("No CUDA device available")

    if use_dotenv:
        from dotenv import load_dotenv
        load_dotenv(dotenv_path)  # path to your dotenv file
        print(f"Retrieving token(s) from {dotenv_path} or environment variables")

    def mask_token(token, unmasked_chars=4):
        # Never echo more characters than the token actually hides: short
        # tokens are fully masked instead of being printed almost verbatim
        # (the previous version produced zero or "negative" stars for
        # tokens of length <= 2 * unmasked_chars).
        if len(token) <= unmasked_chars * 2:
            return '*' * len(token)
        return token[:unmasked_chars] + '*' * (len(token) - unmasked_chars * 2) + token[-unmasked_chars:]

    tokens = {}
    for token_name in env_tokens or []:
        if use_dotenv:
            token = os.getenv(token_name)
        elif colab:
            # Guard against a missing secrets mapping instead of raising
            # AttributeError on None.
            token = (colab_secrets or {}).get(token_name)
        else:
            token = os.environ.get(token_name)

        if token is None:
            print(f"{token_name} not found in the provided .env file or environment variables")
        else:
            print(f"Using {token_name}: {mask_token(token)}")
            tokens[token_name] = token

    return tokens
src/front_dataset_handler.py ADDED
@@ -0,0 +1,385 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import numpy as np
3
+ import pandas as pd
4
+ from sklearn.neighbors import NearestNeighbors
5
+ from sklearn.preprocessing import QuantileTransformer
6
+ from scipy.stats import gamma
7
+ import json
8
+
9
class FrontDatasetHandler:
    """Builds and enriches the dataset used by the front-end app: price-based
    return/volatility metrics, categorical-to-numeric encodings, quantile
    normalization, embedding attachment and nearest-neighbour search over the
    master asset table (``maestro``)."""

    def __init__(self, maestro: pd.DataFrame=None, precios_cierre: pd.DataFrame=None, app_dataset: pd.DataFrame=None,
                 json_path: str = None, pickle_path: str = None):
        """Store the input frames and, when the app dataset still has to be
        built, load the JSON column-configuration files from *json_path*."""
        self.maestro = maestro
        self.app_dataset = app_dataset  # Dataframe already preprocessed for the app
        self.pickle_path = pickle_path
        # Load the JSON files needed to build the app dataset when it was not passed as an argument:
        if self.app_dataset is None and json_path is not None:
            with open(os.path.join(json_path, "ignore_columns.json"), "r") as f:
                self.ignore_columns = json.load(f)['ignore_columns']
            print(f"ignore_columns: {self.ignore_columns}")
            with open(os.path.join(json_path, "numeric_columns.json"), "r") as f:
                self.numeric_columns = json.load(f)['numeric_columns']
            print(f"numeric_columns: {self.numeric_columns}")
            with open(os.path.join(json_path, "app_column_config.json"), "rb") as f:
                self.app_dataset_cols = json.load(f)['app_dataset_cols']
            print(f"app_dataset_cols: {self.app_dataset_cols}")

            with open(os.path.join(json_path, "cat_to_num_maps.json"), "r") as f:
                num_maps = json.load(f)
            self.sector_num_map = num_maps['sector_num_map']
            self.industry_num_map = num_maps['industry_num_map']

        self.norm_columns = None
        if maestro is not None:
            maestro.drop(columns=self.ignore_columns, inplace=True, errors='ignore')
        self.precios_cierre = precios_cierre  # Only needed when the app dataset has to be preprocessed
        self.rend_diario_log = None
        self.precios_cierre_fh = None
        self.rendimientos_y_volatilidad = None
        self.mapeos_var_categoricas = None
        self.activos_descartados = []
        self.quantile_scaler = None

    def filtra_y_homogeneiza(self, n_dias=366, n_dias_descartar=1, min_dias=100):
        """Trim the close-price table to the last *n_dias*, drop the most
        recent *n_dias_descartar* sessions, forward-fill gaps and discard
        assets with fewer than *min_dias* valid observations."""
        if self.precios_cierre.index.name != 'date':
            self.precios_cierre.set_index('date', inplace=True)
        self.precios_cierre.columns.name = 'ticker'
        end_date = self.precios_cierre.index.max()
        start_date = end_date - pd.Timedelta(days=n_dias)

        # Keep only the rows inside the date window
        precios_cierre_fh = self.precios_cierre.loc[start_date:end_date].copy()

        # Discard the most recent n_dias_descartar sessions
        if n_dias_descartar > 0:
            dates_to_drop = precios_cierre_fh.index.sort_values(ascending=False)[:n_dias_descartar]
            precios_cierre_fh.drop(dates_to_drop, inplace=True)

        precios_cierre_fh.ffill(axis=0, inplace=True)  # Fill gaps with the previous day's close
        self.activos_descartados = precios_cierre_fh.columns[precios_cierre_fh.notna().sum(axis=0) < min_dias].tolist()
        precios_cierre_fh.drop(columns=self.activos_descartados, inplace=True)
        self.precios_cierre = precios_cierre_fh
        return

    def calcula_rendimientos_y_volatilidad(self, n_dias=365, umbral_max=0.3, umbral_min=-0.3):
        """Compute log returns, simple returns and volatility over the last
        *n_dias* and append them as rows of ``rendimientos_y_volatilidad``.
        Assets with any daily log return outside (umbral_min, umbral_max)
        are flagged as outliers and discarded."""
        end_date = self.precios_cierre.index.max()
        start_date = end_date - pd.Timedelta(days=n_dias)
        # The table does not always include weekends/holidays, so fall back to the closest date before start_date
        if start_date not in self.precios_cierre.index:
            previous_dates = self.precios_cierre.index[self.precios_cierre.index < start_date]
            if len(previous_dates) > 0:
                start_date = previous_dates.max()
            else:
                raise ValueError(f"No hay datos históricos suficientes ({n_dias}, {end_date})")
        _df_rend_y_vol = self.precios_cierre.loc[start_date:end_date].copy()

        _df_rend_y_vol.dropna(how='all', inplace=True)  #####
        # Replace zero and negative values (formatting errors) with the smallest positive float
        _df_rend_y_vol[_df_rend_y_vol <= 0] = np.nextafter(0, 1)
        if self.activos_descartados:
            _df_rend_y_vol = _df_rend_y_vol.drop(columns=[col for col in self.activos_descartados if col in _df_rend_y_vol.columns])
        if len(_df_rend_y_vol) == 0:
            raise ValueError(f"No hay datos disponibles en el rango de {n_dias} días")


        _rend_diario_log = np.log(_df_rend_y_vol).diff()
        _rend_diario_log = _rend_diario_log.iloc[1:]  # Drop the first row (diff leaves it as NaN)
        # _rend_diario_log.dropna(how='all', inplace=True)
        print(f'Datos rentabilidad ({n_dias} días) con outliers: {_rend_diario_log.shape}')
        # Identify assets to discard (outliers)
        _activos_outliers = _rend_diario_log.columns[((_rend_diario_log > umbral_max) | (_rend_diario_log < umbral_min)).any()].tolist()
        self.activos_descartados.extend([asset for asset in _activos_outliers if asset not in self.activos_descartados])
        # Drop assets with abnormal returns
        _rend_diario_log = _rend_diario_log.loc[:, ~((_rend_diario_log > umbral_max) | (_rend_diario_log < umbral_min)).any()]
        print(f'Datos rentabilidad sin outliers: {_rend_diario_log.shape}')

        self.rend_diario_log = _rend_diario_log.copy()

        # Initialize rendimientos_y_volatilidad if it does not exist yet
        if self.rendimientos_y_volatilidad is None:
            self.rendimientos_y_volatilidad = pd.DataFrame(columns=_rend_diario_log.columns)
            # print(f'INIT: Tabla rendimientos {n_dias}: {self.rendimientos_y_volatilidad.shape}')
        else:
            # Keep only the assets that are present in _rend_diario_log
            self.rendimientos_y_volatilidad = self.rendimientos_y_volatilidad.loc[:, _rend_diario_log.columns]
            # print(f'Tabla rendimientos {n_dias}: {self.rendimientos_y_volatilidad.shape}')

        # Append the new rows for the current n_dias
        self.rendimientos_y_volatilidad.loc[f'ret_log_{n_dias}'] = np.sum(_rend_diario_log, axis=0)
        self.rendimientos_y_volatilidad.loc[f'ret_{n_dias}'] = (_df_rend_y_vol.ffill().bfill().iloc[-1] / _df_rend_y_vol.ffill().bfill().iloc[0]) - 1
        self.rendimientos_y_volatilidad.loc[f'vol_{n_dias}'] = _rend_diario_log.var()**0.5

        return

    def cruza_maestro(self):
        """Merge the computed return/volatility rows into ``maestro`` by
        ticker, replacing infinities with NaN. Raises if no new columns
        would be added."""
        _rets_y_vol_maestro = self.rendimientos_y_volatilidad.T.reset_index().copy()
        _columns_to_merge = [col for col in _rets_y_vol_maestro.columns if col not in self.maestro.columns]
        if len(_columns_to_merge) > 0:
            _maestro_v2 = self.maestro.merge(_rets_y_vol_maestro, left_on='ticker', right_on='ticker')
            _maestro_v2 = _maestro_v2.replace([float('inf'), float('-inf')], np.nan)
            self.maestro = _maestro_v2
        else:
            raise ValueError("No hay nuevas columnas para cruzar con el dataframe maestro")
        return

    def _cat_to_num_(self, df, cat, pre_map=None):
        """
        Map a categorical column of a DataFrame to integer codes.

        Without *pre_map*, codes are assigned by frequency: 0 for the most
        frequent category, 1 for the next most frequent, and so on. With
        *pre_map*, that mapping is used for the conversion instead.

        Parameters
        ----------
        df : pandas.DataFrame
            DataFrame containing the categorical column to transform.
        cat : str
            Name of the categorical column to transform.
        pre_map : dict, optional
            Mapping from each category to a numeric code. Generated
            automatically when not provided.

        Returns
        -------
        pandas.DataFrame
            Two columns: the original categorical column and a column with
            the assigned numeric codes (``<cat>_num``).
        """
        if not pre_map:
            pivot = pd.pivot_table(df, index=[cat], aggfunc='size')
            df_sorted = pivot.sort_values(ascending=False).reset_index(name='count')
            df_sorted[cat + '_num'] = range(len(df_sorted))
        else:
            df_sorted = pd.DataFrame({cat: list(pre_map.keys()), cat + '_num': list(pre_map.values())})
        return df_sorted

    def var_categorica_a_numerica(self, cat_cols):
        """Add a numeric companion column (``<col>_num``) for every
        categorical column in *cat_cols*, using the pre-defined
        sector/industry maps where available and frequency-based codes
        otherwise."""
        for col in cat_cols:
            if col == 'sectorDisp':
                globals()[f"pt_{col}"] = self._cat_to_num_(self.maestro, col, self.sector_num_map)
            elif col == 'industryDisp':
                globals()[f"pt_{col}"] = self._cat_to_num_(self.maestro, col, self.industry_num_map)
            else:
                globals()[f"pt_{col}"] = self._cat_to_num_(self.maestro, col)  # One mapping dataframe per categorical variable, coded by frequency
        self.mapeos_var_categoricas = [globals()[f"pt_{col}"] for col in cat_cols]  # List of dataframes with the mapping of each categorical variable

        _maestro = self.maestro.copy()
        for col, pt in zip(cat_cols, self.mapeos_var_categoricas):
            _maestro[col] = _maestro[col].astype(str)
            pt[col] = pt[col].astype(str)
            # Build a dict from each category to its numeric equivalent
            mapping_dict = dict(zip(pt[col], pt[col + '_num']))
            _maestro[col + '_num'] = _maestro[col].map(mapping_dict)
            _maestro[col + '_num'] = pd.to_numeric(_maestro[col + '_num'], errors='coerce')

        self.maestro = _maestro
        return

    def normaliza_por_cuantiles(self):
        """Quantile-normalize every numeric column of ``maestro`` into
        [0, 1], adding ``<col>_norm`` columns; all-NaN columns are skipped.
        The fitted scaler is kept in ``quantile_scaler``."""
        maestro_copy = self.maestro.copy()
        numeric_columns = maestro_copy.select_dtypes(include=np.number).columns
        self.quantile_scaler = QuantileTransformer(output_distribution='uniform')
        variables_numericas = [col for col in numeric_columns if not col.endswith('_norm')]
        all_na_cols = [col for col in variables_numericas if maestro_copy[col].isna().all()]
        variables_numericas = [col for col in variables_numericas if col not in all_na_cols]
        self.norm_columns = ['{}_norm'.format(var) for var in variables_numericas]
        maestro_copy[self.norm_columns] = self.quantile_scaler.fit_transform(maestro_copy[variables_numericas])
        maestro_copy[self.norm_columns] = maestro_copy[self.norm_columns].clip(0, 1)
        self.maestro = maestro_copy
        return

    def var_estandar_z(self):
        """Add z-scored companion columns (``<col>_std``) for every numeric
        column; constant or empty columns standardize to 0.0."""
        maestro_copy = self.maestro.copy()
        numeric_columns = maestro_copy.select_dtypes(include=np.number).columns
        variables_numericas = [col for col in numeric_columns if not col.endswith('_std')]
        variables_num_std = ['{}_std'.format(var) for var in variables_numericas]

        def estandarizar(x):
            # z-score: subtract the mean and divide by the standard deviation
            mean_val = x.mean()
            std_val = x.std()
            if pd.isna(std_val) or std_val == 0:
                return pd.Series(0.0, index=x.index, name=x.name)
            else:
                normalized_series = (x - mean_val) / std_val
                return normalized_series.fillna(0.0)

        normalized_data = maestro_copy[variables_numericas].apply(estandarizar, axis=0)
        maestro_copy[variables_num_std] = normalized_data
        self.maestro = maestro_copy
        return

    def configura_distr_prob(self, shape, loc, scale, max_dist, precision_cdf):
        """Sample the gamma PDF and CDF on a regular grid over
        [0, max_dist] with *precision_cdf* points; the CDF is later used to
        turn neighbour distances into substitutability scores."""
        x = np.linspace(0, max_dist, num=precision_cdf)
        y_pdf = gamma.pdf(x, shape, loc, scale )
        y_cdf = gamma.cdf(x, shape, loc, scale )
        return y_pdf, y_cdf

    def calculos_y_ajustes_dataset_activos(self):
        """Coerce problematic columns to numeric, derive the asset age in
        years from the first trade date, and null out values outside
        per-column plausibility thresholds."""
        maestro_copy = self.maestro.copy()
        # Convert to numeric the columns that are known to cause trouble
        for column in self.numeric_columns:
            if column in maestro_copy.columns:
                maestro_copy[column] = pd.to_numeric(maestro_copy[column], errors='coerce')
                # print(f"Columna {column} convertida a {maestro_copy[column].dtype}")
        # Standardization of the different NaN flavours
        # maestro_copy = maestro_copy.replace([None, np.nan, np.inf, -np.inf], pd.NA)
        # Asset age in years:
        if self.precios_cierre is not None and not self.precios_cierre.index.empty:
            _most_recent_date = self.precios_cierre.index.max().date()
        else:
            _most_recent_date = pd.Timestamp.today().date()
        # maestro_copy['firstTradeDateMilliseconds'] = pd.to_datetime(maestro_copy['firstTradeDateMilliseconds']).dt.date
        maestro_copy['firstTradeDateMilliseconds'] = pd.to_datetime(maestro_copy['firstTradeDateMilliseconds'], unit='ms', errors='coerce').dt.date
        maestro_copy['asset_age'] = maestro_copy['firstTradeDateMilliseconds'].apply(
            lambda x: ((_most_recent_date - x).days / 365) if pd.notnull(x) and hasattr(x, 'day') else 0
        ).astype(int)
        # Per-column (lower, upper) bounds; values outside are treated as data errors
        outlier_thresholds = {
            'beta': (-100, 100),
            'dividendYield': (-1,100),
            'fiveYearAvgDividendYield': (-1,100),
            'trailingAnnualDividendYield': (-1,100),
            'quickRatio': (-1, 500),
            'currentRatio': (-1, 500),
            'ebitda': (-1e12, 1e12),
            'grossProfits': (-1e12, 1e12),
        }
        for column, (lower_bound, upper_bound) in outlier_thresholds.items():
            maestro_copy.loc[(maestro_copy[column] < lower_bound) | (maestro_copy[column] > upper_bound), column] = pd.NA
        self.maestro = maestro_copy.copy()
        return

    def filtra_df_activos(self, df, isin_target, filtros, debug=False):
        '''
        LEGACY
        Returns a filtered dataframe, preserving order, removing unwanted
        characteristics; used by the substitute-asset search application.
        The characteristics/values to filter are those of a target fund
        identified by its ISIN. For example, if clean_share is False in
        *filtros*, the final dataframe will keep no other assets sharing
        the target ISIN's clean_share value.
        Arguments:
            df (pandas.core.frame.DataFrame): Master asset dataframe
            isin_target (str): ISIN of the target fund
            filtros (dict): Keys are characteristics; value True keeps them
            debug (bool, optional): Print debug information. Defaults to False.
        Returns:
            df_filt (pandas.core.frame.DataFrame): Filtered dataframe
        '''
        # fondo_target = df[df['isin'] == isin_target].iloc[0]
        fondo_target = df[df['ticker'] == isin_target].iloc[0]
        if debug: print(f'Tamaño inicial: {df.shape}')

        car_numericas = ['ret_365', 'vol_365', 'marketCap', 'asset_age']

        # for feature in caracteristicas[2:]:
        for feature in list(filtros.keys()):
            value = fondo_target[feature]
            if debug: print(f'{feature} = {value}')

            # Check whether this characteristic must be filtered out
            if feature in filtros and not filtros[feature]:
                if debug: print(f'FILTRO: {feature} != {value}')
                df = df[df[feature] != value]

            # Apply the additional numeric filters
            if feature in car_numericas:
                if feature == 'ret_365':
                    if debug: print(f'FILTRO NUMÉRICO: {feature} > {value}')
                    df = df[df[feature] > value]
                elif feature == 'vol_365':
                    if debug: print(f'FILTRO NUMÉRICO: {feature} < {value}')
                    df = df[df[feature] < value]
                elif feature == 'asset_age':
                    if debug: print(f'FILTRO NUMÉRICO: {feature} > {value}')
                    df = df[df[feature] > value]
                elif feature == 'marketCap':
                    # NOTE(review): the debug message says '>' but the filter keeps values BELOW the target — confirm which is intended
                    if debug: print(f'FILTRO NUMÉRICO: {feature} > {value}')
                    df = df[df[feature] < value]

        df_filt = df
        if debug: print(f'Tamaño final: {df_filt.shape}')
        return df_filt

    def calcula_ind_sust (self, dist, y_cdf, precision_cdf, max_dist):
        """Map a neighbour distance to a substitutability score in [0, 1]
        as 1 - CDF(dist), looked up on the precomputed *y_cdf* grid;
        distances beyond the grid clamp to the last CDF value."""
        try:
            idx = int((precision_cdf / max_dist) * dist)
            idx = min(idx, len(y_cdf) - 1)
            norm_dist = y_cdf[idx]
            ind_sust = max(0.0, 1.0 - norm_dist)
        except IndexError:
            ind_sust = 0
        return ind_sust


    def vecinos_cercanos(self, df, variables_busq, caracteristicas, target_ticker, y_cdf, precision_cdf, max_dist, n_neighbors, filtros):
        """Return the *n_neighbors* nearest assets to *target_ticker* in the
        feature space *variables_busq*, annotated with distance and
        substitutability score, after applying the exclusion filters.
        Returns an error string when the ticker is unknown."""
        if target_ticker not in df['ticker'].values:
            return f"Error: '{target_ticker}' no encontrado en la base de datos"
        target_row = df[df['ticker'] == target_ticker]
        # NOTE(review): `~index.isin(...)` yields a one-element boolean array used as a truth value — presumably "target not in df"; confirm
        if ~target_row.index.isin(df.index):
            df = pd.concat([df, target_row], ignore_index=True)
        # print(f'DF original: {df.shape}')
        X = df[variables_busq]
        model = NearestNeighbors(n_neighbors=n_neighbors)  ##### try with more neighbours and filter afterwards #######
        model.fit(X)
        target_row = df[df['ticker'] == target_ticker][variables_busq]
        # model.kneighbors returns two 2-D arrays: the nearest neighbours and their distances
        distances, indices = model.kneighbors(target_row)
        # combined_columns = list(set(caracteristicas + variables_busq))
        neighbors_df = df.iloc[indices[0]][caracteristicas]
        neighbors_df['distance'] = distances[0]
        ind_sust = np.array([self.calcula_ind_sust(dist, y_cdf, precision_cdf, max_dist) for dist in distances[0]])

        neighbors_df['ind_sust'] = ind_sust
        neighbors_df = neighbors_df.sort_values(by='distance', ascending=True)
        target_row = neighbors_df[neighbors_df['ticker'] == target_ticker]

        # Apply the exclusion filters:
        ### Code pending removal/modification (legacy from the funds application)
        neighbors_df = self.filtra_df_activos (df = neighbors_df, isin_target = target_ticker, filtros = filtros)
        ####################

        # If filtering removed the selected asset, put it back at the first position of the dataframe:
        if ~target_row.index.isin(neighbors_df.index):
            neighbors_df = pd.concat([pd.DataFrame(target_row), neighbors_df], ignore_index=True)
        # print(f'DF filtrado: {neighbors_df.shape}')
        # Use the ticker as index:
        neighbors_df.set_index('ticker', inplace = True)
        return neighbors_df

    def format_large_number(self, n, decimals=2):
        """Format *n* with a T/B/M suffix; smaller values fall back to str."""
        if n >= 1e12:
            return f'{n / 1e12:.{decimals}f} T'
        elif n >= 1e9:
            return f'{n / 1e9:.{decimals}f} B'
        elif n >= 1e6:
            return f'{n / 1e6:.{decimals}f} M'
        else:
            return str(n)

    def trae_embeddings_desde_pkl(self, embeddings_df_file_name='df_with_embeddings.pkl', embeddings_col_name='embeddings'):
        """Left-join precomputed sentence embeddings (pickled dataframe under
        ``pickle_path``) into ``maestro`` by ticker."""
        embeddings_df = pd.read_pickle(os.path.join(self.pickle_path, embeddings_df_file_name))
        self.maestro = self.maestro.merge(
            embeddings_df[['ticker', embeddings_col_name]],
            on='ticker',
            how='left'
        )
        print(f"Agregados embeddings {self.maestro.shape}")
        return

    def procesa_app_dataset(self, periodo=366, n_dias_descartar=1, min_dias=250, umbrales_rend=(-0.3, +0.3), periodos_metricas=[60, 365],
                            cat_cols = ['industryDisp', 'sectorDisp', 'country', 'city', 'exchange', 'financialCurrency', 'quoteType'],
                            embeddings_df_file_name='df_with_embeddings.pkl', embeddings_col_name='embeddings'):
        """Full preprocessing pipeline: filter/homogenize prices, compute the
        return/volatility metrics for each period, merge them into the master
        table, encode categoricals, normalize, attach embeddings and keep
        only the columns the app needs. Skips if ``app_dataset`` exists.

        NOTE(review): the list default arguments are shared across calls —
        safe only while no caller mutates them.
        """
        if self.app_dataset is not None:
            print("app_dataset already exists, skipping processing")
            return

        self.filtra_y_homogeneiza(n_dias=periodo, n_dias_descartar=n_dias_descartar, min_dias=min_dias)

        for periodo_metricas in periodos_metricas:
            self.calcula_rendimientos_y_volatilidad(n_dias=periodo_metricas, umbral_max=umbrales_rend[1], umbral_min=umbrales_rend[0])
        self.cruza_maestro()
        self.var_categorica_a_numerica(cat_cols)

        self.calculos_y_ajustes_dataset_activos()
        self.normaliza_por_cuantiles()
        self.trae_embeddings_desde_pkl(embeddings_df_file_name=embeddings_df_file_name, embeddings_col_name=embeddings_col_name)
        app_dataset = self.maestro.copy()
        app_dataset = app_dataset.fillna({col: 0.5 for col in self.norm_columns})
        # Final column filtering to shrink the dataset:
        self.app_dataset = app_dataset[self.app_dataset_cols].copy()
        print(f"app_dataset preparado: {self.app_dataset.shape}")
        return
src/semantic_search.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import duckdb
2
+ from sentence_transformers import SentenceTransformer
3
+ import pandas as pd
4
+ import re
5
+
6
def duckdb_vss_local(
    model: SentenceTransformer,
    duckdb_connection: duckdb.DuckDBPyConnection,
    query: str,
    k: int = 1000,
    brevity_penalty: float = 0.0,
    reward_for_literal: float = 0.0,
    partial_match_factor: float = 0.5,
    table_name: str = "maestro_vector_table",
    embedding_column: str = "vec",
):
    """Vector-similarity search over a local DuckDB table.

    Encodes ``query`` with ``model``, ranks the rows of ``table_name`` by
    cosine distance between ``embedding_column`` and the query vector, and
    optionally re-ranks the result with a short-summary penalty and/or a
    literal-match reward.

    Parameters
    ----------
    model: sentence-transformers model used to embed the query.
    duckdb_connection: open DuckDB connection that holds ``table_name``.
    query: free-text search string.
    k: maximum number of rows returned.
    brevity_penalty: if > 0, penalize rows with short summaries
        (see ``penalize_short_summaries``).
    reward_for_literal: if > 0, reward rows whose summary literally contains
        the query (see ``reward_literals``).
    partial_match_factor: weight given to partial literal matches.
    table_name, embedding_column: identifiers of the vector table/column.

    Returns a pandas DataFrame sorted by ascending ``distance``.
    """
    # Identifiers cannot be bound as SQL parameters, so they are interpolated
    # into the statement; restrict them to word characters to rule out SQL
    # injection through these arguments.
    for identifier in (table_name, embedding_column):
        if not re.fullmatch(r"\w+", identifier):
            raise ValueError(f"invalid SQL identifier: {identifier!r}")

    query_vector = model.encode(query)
    embedding_dim = model.get_sentence_embedding_dimension()

    sql = f"""
    SELECT
        *,
        array_cosine_distance(
            {embedding_column}::float[{embedding_dim}],
            ?::float[{embedding_dim}]
        ) as distance
    FROM {table_name}
    ORDER BY distance
    LIMIT {int(k)}
    """
    # Bind the query vector as a prepared-statement parameter instead of
    # serializing hundreds of floats into the SQL text.
    result = duckdb_connection.execute(sql, [query_vector.tolist()]).df()

    # Optional re-ranking stages (flip their "debug" parameters to inspect
    # the intermediate columns).
    if brevity_penalty > 0:
        result = penalize_short_summaries(result, factor=brevity_penalty, distance_column='distance',
                                          summary_column='longBusinessSummary', debug=False)
    if reward_for_literal > 0:
        result = reward_literals(result, query, factor=reward_for_literal,
                                 partial_match_factor=partial_match_factor, distance_column='distance',
                                 summary_column='longBusinessSummary', debug=False)

    return result
43
+
44
def penalize_short_summaries(
    df: pd.DataFrame,
    factor: float = 0.1,
    distance_column: str = 'distance',
    summary_column: str = 'longBusinessSummary',
    debug: bool = True
) -> pd.DataFrame:
    """Penalize rows whose business summary is shorter than average.

    Each row's distance is increased by ``factor`` times the fraction by
    which its summary is shorter than the mean summary length, capped at the
    maximum distance already present in ``df``.

    Parameters
    ----------
    df: result frame containing ``distance_column`` and ``summary_column``.
    factor: penalty weight per unit of relative shortness.
    distance_column: name of the distance column to adjust.
    summary_column: name of the text column whose length is measured.
    debug: keep the intermediate columns (``orig_distance``,
        ``summary_length``, ``percent_shorter``) when True.

    Returns the frame re-sorted by ascending ``distance_column``.
    """
    result_df = df.copy()
    # NaN summaries count as length 0 (maximum penalty).
    result_df['summary_length'] = result_df[summary_column].apply(
        lambda x: len(str(x)) if pd.notna(x) else 0
    )
    # Guard against a zero mean (all summaries empty) to avoid div-by-zero.
    avg_length = max(1.0, result_df['summary_length'].mean())
    # BUG FIX: the original read the hard-coded 'distance' column here,
    # which broke callers passing a custom distance_column.
    max_dist = result_df[distance_column].max()

    # Fraction by which each summary is shorter than average (0 if longer).
    result_df['percent_shorter'] = (
        (avg_length - result_df['summary_length']) / avg_length
    ).clip(lower=0)
    result_df['orig_distance'] = result_df[distance_column]
    # Penalize proportionally to the relative shortness, never pushing a row
    # beyond the worst distance already observed.
    result_df[distance_column] = (
        result_df[distance_column] + result_df['percent_shorter'] * factor
    ).clip(upper=max_dist)

    if not debug:
        result_df = result_df.drop(['orig_distance', 'summary_length', 'percent_shorter'], axis=1)

    result_df = result_df.sort_values(by=distance_column, ascending=True)
    return result_df
74
+
75
def reward_literals(
    df: pd.DataFrame,
    query: str,
    factor: float = 0.1,
    partial_match_factor: float = 0.5,
    distance_column: str = 'distance',
    summary_column: str = 'longBusinessSummary',
    debug: bool = True
) -> pd.DataFrame:
    """Lower the distance of rows whose summary literally contains the query.

    Exact (whole-word/whole-phrase) occurrences count fully; partial
    occurrences are weighted by ``partial_match_factor``. Each occurrence
    point subtracts ``factor`` from the row's distance, floored at 0.

    Parameters
    ----------
    df: result frame containing ``distance_column`` and ``summary_column``.
    query: literal text to look for (matched case-insensitively).
    factor: distance reduction per occurrence point.
    partial_match_factor: weight of a partial (non-exact) occurrence.
    distance_column: name of the distance column to adjust.
    summary_column: name of the text column searched.
    debug: keep the intermediate columns (``orig_distance``,
        ``term_occurrences``) when True.

    Returns the frame re-sorted by ascending ``distance_column``.
    """
    result_df = df.copy()
    needle = query.lower().strip()

    # Hoist the compiled patterns out of the per-row scoring function.
    exact_re = re.compile(r'\b' + re.escape(needle) + r'\b')
    if ' ' in needle:
        # Multi-word query: partial matches are plain substring occurrences.
        partial_re = re.compile(re.escape(needle))
    else:
        # Single word: also count words containing the query as a substring.
        partial_re = re.compile(r'\b\w*' + re.escape(needle) + r'\w*\b')

    def occurrence_score(summary):
        if pd.isna(summary):
            return 0
        text = str(summary).lower()
        exact_hits = len(exact_re.findall(text))
        # Subtract exact hits so they are not counted twice.
        partial_hits = len(partial_re.findall(text)) - exact_hits
        # Partial matches are down-weighted.
        return exact_hits + partial_hits * partial_match_factor

    result_df['term_occurrences'] = result_df[summary_column].apply(occurrence_score)
    result_df['orig_distance'] = result_df[distance_column]
    # Reward occurrences, never letting a distance drop below zero.
    result_df[distance_column] = (
        result_df[distance_column] - result_df['term_occurrences'] * factor
    ).clip(lower=0)

    if not debug:
        result_df = result_df.drop(['orig_distance', 'term_occurrences'], axis=1)
    result_df = result_df.sort_values(by=distance_column, ascending=True)

    return result_df
124
+