reddgr commited on
Commit
8f074bc
·
1 Parent(s): e6c7897

first commit

Browse files
html/front_layout.html ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!-- html/front_layout.html -->
2
+ <h1 style="text-align:center;margin-bottom:15px;margin-left:10px">
3
+ Swift Stock Screener
4
+ </h1>
5
+ <p style="margin-left:10px">
6
+ Browse and search over 12,000 stocks. Search assets by theme, filter, sort, analyze, and get ideas to build portfolios and indices. Search by <b>ticker symbol</b> to display a list of ranked related companies. Enter any keyword in <b>thematic search</b> to search by theme. Click on <u>country names</u> or <u>GICS sectors</u> for strict filtering. <b>Reset</b> the search and <b>sort</b> all assets by any of the displayed metrics.
7
+
8
+ <style>
9
+ /* Botón de tamaño contenido */
10
+ .small-btn {
11
+ /*width: 140px;*/
12
+ max-width: 140px;
13
+ /*min-width: 140px;*/
14
+ }
15
+
16
+ /* Etiqueta de paginación */
17
+ .pagination-label {
18
+ flex: 0 0 auto;
19
+ width: auto;
20
+ margin: 0 8px; /* small horizontal gap */
21
+ }
22
+
23
+ /* cap the Gradio table + keep pagination row below */
24
+ .clickable-columns .dataframe-container {
25
+ max-height: calc(100vh - 300px); /* adjust px to match header+controls height */
26
+ overflow-y: auto;
27
+ }
28
+
29
+ /* Columnas filtrables (click en la celda) */
30
+ .clickable-columns tbody td:nth-child(3),
31
+ .clickable-columns tbody td:nth-child(4) {
32
+ color: #1a0dab; /* link blue for light theme */
33
+ text-decoration: underline; /* underline */
34
+ cursor: pointer; /* pointer cursor */
35
+ }
36
+
37
+ @media (prefers-color-scheme: dark) {
38
+ .clickable-columns tbody td:nth-child(3),
39
+ .clickable-columns tbody td:nth-child(4) {
40
+ color: #8ab4f8; /* lighter blue for dark theme */
41
+ }
42
+ }
43
+
44
+ .clickable-columns span.negative-value {
45
+ color: red;
46
+ }
47
+
48
+ /* make the table use fixed layout so width rules apply */
49
+ .clickable-columns table {
50
+ table-layout: fixed;
51
+ }
52
+
53
+ /* CONFIGURACIÓN DE ANCHO DE COLUMNAS */
54
+ /* Ticker */
55
+ .clickable-columns table th:nth-child(1),
56
+ .clickable-columns table td:nth-child(1) {
57
+ min-width: 40px; max-width: 100px;
58
+ overflow: hidden;
59
+ }
60
+ .clickable-columns table th:nth-child(2),
61
+ .clickable-columns table td:nth-child(2) {
62
+ min-width: 75px; max-width: 220px;
63
+ overflow: hidden;
64
+ }
65
+ .clickable-columns table th:nth-child(3),
66
+ .clickable-columns table td:nth-child(3) {
67
+ min-width: 70px; max-width: 160px;
68
+ overflow: hidden;
69
+ }
70
+ .clickable-columns table th:nth-child(4),
71
+ .clickable-columns table td:nth-child(4) {
72
+ min-width: 70px; max-width: 200px;
73
+ overflow: hidden;
74
+ }
75
+ .clickable-columns table th:nth-child(5),
76
+ .clickable-columns table td:nth-child(5) {
77
+ min-width: 60px; max-width: 80px;
78
+ overflow: hidden;
79
+ }
80
+ /* 1yr return */
81
+ .clickable-columns table th:nth-child(6),
82
+ .clickable-columns table td:nth-child(6) {
83
+ min-width: 60px; max-width: 80px;
84
+ overflow: hidden;
85
+ }
86
+ .clickable-columns table th:nth-child(7),
87
+ .clickable-columns table td:nth-child(7) {
88
+ min-width: 70px; max-width: 100px;
89
+ overflow: hidden;
90
+ }
91
+ .clickable-columns table th:nth-child(8),
92
+ .clickable-columns table td:nth-child(8) {
93
+ min-width: 70px; max-width: 100px;
94
+ overflow: hidden;
95
+ }
96
+ .clickable-columns table th:nth-child(9),
97
+ .clickable-columns table td:nth-child(9) {
98
+ min-width: 70px; max-width: 100px;
99
+ overflow: hidden;
100
+ }
101
+ .clickable-columns table th:nth-child(10),
102
+ .clickable-columns table td:nth-child(10) {
103
+ min-width: 70px; max-width: 100px;
104
+ overflow: hidden;
105
+ }
106
+ .clickable-columns table th:nth-child(11),
107
+ .clickable-columns table td:nth-child(11) {
108
+ min-width: 60px; max-width: 70px;
109
+ overflow: hidden;
110
+ }
111
+ .clickable-columns table th:nth-child(12),
112
+ .clickable-columns table td:nth-child(12) {
113
+ min-width: 50px; max-width: 70px;
114
+ overflow: hidden;
115
+ }
116
+
117
+ </style>
json/app_column_config.json ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "app_dataset_cols": [
3
+ "ticker",
4
+ "security",
5
+ "country",
6
+ "sector",
7
+ "marketCap",
8
+ "ret_365",
9
+ "vol_365",
10
+ "trailingPE",
11
+ "revenueGrowth",
12
+ "dividendYield",
13
+ "beta",
14
+ "beta_norm",
15
+ "category",
16
+ "country_num_norm",
17
+ "debtToEquity_norm",
18
+ "fullTimeEmployees_norm",
19
+ "fundFamily",
20
+ "fundInceptionDate",
21
+ "industryDisp_num_norm",
22
+ "marketCap_norm",
23
+ "netExpenseRatio",
24
+ "quoteType",
25
+ "ret_365_norm",
26
+ "revenueGrowth_norm",
27
+ "sectorDisp_num_norm",
28
+ "totalAssets",
29
+ "trailingPE_norm",
30
+ "vol_365_norm",
31
+ "longBusinessSummary",
32
+ "embeddings"
33
+ ],
34
+ "variables_busq_norm": [
35
+ "beta_norm",
36
+ "country_num_norm",
37
+ "debtToEquity_norm",
38
+ "fullTimeEmployees_norm",
39
+ "industryDisp_num_norm",
40
+ "marketCap_norm",
41
+ "ret_365_norm",
42
+ "revenueGrowth_norm",
43
+ "sectorDisp_num_norm",
44
+ "trailingPE_norm",
45
+ "vol_365_norm"
46
+ ],
47
+ "cols_tabla_equity": [
48
+ "ticker",
49
+ "security",
50
+ "country",
51
+ "sector",
52
+ "marketCap",
53
+ "ret_365",
54
+ "vol_365",
55
+ "trailingPE",
56
+ "revenueGrowth",
57
+ "dividendYield",
58
+ "beta"
59
+ ],
60
+ "cols_tabla_etfs": [
61
+ "ticker",
62
+ "security",
63
+ "category",
64
+ "ret_365",
65
+ "vol_365",
66
+ "totalAssets",
67
+ "netExpenseRatio",
68
+ "fundInceptionDate",
69
+ "fundFamily"
70
+ ]
71
+ }
json/app_dataset_cols.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "app_dataset_cols": [
3
+ "ticker",
4
+ "security",
5
+ "country",
6
+ "sector",
7
+ "marketCap",
8
+ "ret_365",
9
+ "vol_365",
10
+ "trailingPE",
11
+ "revenueGrowth",
12
+ "dividendYield",
13
+ "beta",
14
+ "industryDisp_num_norm",
15
+ "sectorDisp_num_norm",
16
+ "country_num_norm",
17
+ "ret_365_norm",
18
+ "vol_365_norm",
19
+ "marketCap_norm",
20
+ "beta_norm",
21
+ "revenueGrowth_norm",
22
+ "debtToEquity_norm",
23
+ "fullTimeEmployees_norm",
24
+ "trailingPE_norm",
25
+ "category",
26
+ "fundFamily",
27
+ "totalAssets",
28
+ "netExpenseRatio",
29
+ "quoteType",
30
+ "embeddings"
31
+ ]
32
+ }
json/cat_cols.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cat_cols": [
3
+ "country",
4
+ "industryDisp",
5
+ "legalType",
6
+ "sector",
7
+ "state",
8
+ "exchange",
9
+ "exchangeTimezoneShortName",
10
+ "zip",
11
+ "exchangeTimezoneName",
12
+ "category",
13
+ "industryKey",
14
+ "currency",
15
+ "quoteType",
16
+ "industry",
17
+ "fullExchangeName",
18
+ "city",
19
+ "sectorKey",
20
+ "market",
21
+ "financialCurrency",
22
+ "recommendationKey",
23
+ "sectorDisp"
24
+ ]
25
+ }
json/cat_to_num_maps.json ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "sector_num_map": {
3
+ "Technology": 0,
4
+ "Healthcare": 1,
5
+ "Utilities": 2,
6
+ "Industrials": 3,
7
+ "Basic Materials": 4,
8
+ "Consumer Cyclical": 5,
9
+ "Consumer Defensive": 6,
10
+ "Energy": 7,
11
+ "Communication Services": 8,
12
+ "Financial Services": 9,
13
+ "Real Estate": 10
14
+ },
15
+ "dummy_num_dict": {
16
+ "abcd": 0,
17
+ "efgh": 1,
18
+ "ijkl": 2,
19
+ "mnop": 3,
20
+ "qrst": 4
21
+ },
22
+ "industry_num_map": {
23
+ "Thermal Coal": 0,
24
+ "Oil & Gas Integrated": 1,
25
+ "Oil & Gas Refining & Marketing": 2,
26
+ "Uranium": 3,
27
+ "Oil & Gas Equipment & Services": 4,
28
+ "Oil & Gas E&P": 5,
29
+ "Oil & Gas Midstream": 6,
30
+ "Oil & Gas Drilling": 7,
31
+ "Aluminum": 8,
32
+ "Steel": 9,
33
+ "Specialty Chemicals": 10,
34
+ "Chemicals": 11,
35
+ "Paper & Paper Products": 12,
36
+ "Lumber & Wood Production": 13,
37
+ "Building Materials": 14,
38
+ "Agricultural Inputs": 15,
39
+ "Other Industrial Metals & Mining": 16,
40
+ "Coking Coal": 17,
41
+ "Copper": 18,
42
+ "Other Precious Metals & Mining": 19,
43
+ "Gold": 20,
44
+ "Silver": 21,
45
+ "Marine Shipping": 22,
46
+ "Integrated Freight & Logistics": 23,
47
+ "Trucking": 24,
48
+ "Railroads": 25,
49
+ "Airlines": 26,
50
+ "Farm & Heavy Construction Machinery": 27,
51
+ "Industrial Distribution": 28,
52
+ "Rental & Leasing Services": 29,
53
+ "Aerospace & Defense": 30,
54
+ "Specialty Industrial Machinery": 31,
55
+ "Waste Management": 32,
56
+ "Electrical Equipment & Parts": 33,
57
+ "Airports & Air Services": 34,
58
+ "Pollution & Treatment Controls": 35,
59
+ "Conglomerates": 36,
60
+ "Tools & Accessories": 37,
61
+ "Engineering & Construction": 38,
62
+ "Metal Fabrication": 39,
63
+ "Building Products & Equipment": 40,
64
+ "Specialty Business Services": 41,
65
+ "Security & Protection Services": 42,
66
+ "Consulting Services": 43,
67
+ "Staffing & Employment Services": 44,
68
+ "Business Equipment & Supplies": 45,
69
+ "Infrastructure Operations": 46,
70
+ "Utilities - Regulated Electric": 47,
71
+ "Utilities - Independent Power Producers": 48,
72
+ "Utilities - Diversified": 49,
73
+ "Utilities - Regulated Gas": 50,
74
+ "Utilities - Renewable": 51,
75
+ "Utilities - Regulated Water": 52,
76
+ "Recreational Vehicles": 53,
77
+ "Auto Manufacturers": 54,
78
+ "Auto & Truck Dealerships": 55,
79
+ "Auto Parts": 56,
80
+ "Footwear & Accessories": 57,
81
+ "Apparel Manufacturing": 58,
82
+ "Specialty Retail": 59,
83
+ "Furnishings, Fixtures & Appliances": 60,
84
+ "Luxury Goods": 61,
85
+ "Internet Retail": 62,
86
+ "Travel Services": 63,
87
+ "Leisure": 64,
88
+ "Packaging & Containers": 65,
89
+ "Home Improvement Retail": 66,
90
+ "Apparel Retail": 67,
91
+ "Textile Manufacturing": 68,
92
+ "Department Stores": 69,
93
+ "Residential Construction": 70,
94
+ "Lodging": 71,
95
+ "Restaurants": 72,
96
+ "Gambling": 73,
97
+ "Personal Services": 74,
98
+ "Resorts & Casinos": 75,
99
+ "Confectioners": 76,
100
+ "Beverages - Non - Alcoholic": 77,
101
+ "Packaged Foods": 78,
102
+ "Food Distribution": 79,
103
+ "Household & Personal Products": 80,
104
+ "Discount Stores": 81,
105
+ "Grocery Stores": 82,
106
+ "Tobacco": 83,
107
+ "Beverages - Wineries & Distilleries": 84,
108
+ "Beverages - Brewers": 85,
109
+ "Farm Products": 86,
110
+ "Education & Training Services": 87,
111
+ "Electronics & Computer Distribution": 88,
112
+ "Computer Hardware": 89,
113
+ "Semiconductors": 90,
114
+ "Electronic Components": 91,
115
+ "Semiconductor Equipment & Materials": 92,
116
+ "Consumer Electronics": 93,
117
+ "Communication Equipment": 94,
118
+ "Scientific & Technical Instruments": 95,
119
+ "Information Technology Services": 96,
120
+ "Solar": 97,
121
+ "Software - Application": 98,
122
+ "Software - Infrastructure": 99,
123
+ "Broadcasting": 100,
124
+ "Telecom Services": 101,
125
+ "Advertising Agencies": 102,
126
+ "Entertainment": 103,
127
+ "Publishing": 104,
128
+ "Internet Content & Information": 105,
129
+ "Electronic Gaming & Multimedia": 106,
130
+ "Medical Distribution": 107,
131
+ "Drug Manufacturers - General": 108,
132
+ "Pharmaceutical Retailers": 109,
133
+ "Drug Manufacturers - Specialty & Generic": 110,
134
+ "Medical Instruments & Supplies": 111,
135
+ "Health Information Services": 112,
136
+ "Medical Devices": 113,
137
+ "Healthcare Plans": 114,
138
+ "Diagnostics & Research": 115,
139
+ "Biotechnology": 116,
140
+ "Medical Care Facilities": 117,
141
+ "Banks - Diversified": 118,
142
+ "Banks - Regional": 119,
143
+ "Financial Conglomerates": 120,
144
+ "Credit Services": 121,
145
+ "Insurance - Reinsurance": 122,
146
+ "Mortgage Finance": 123,
147
+ "Insurance - Diversified": 124,
148
+ "Capital Markets": 125,
149
+ "Insurance - Life": 126,
150
+ "Insurance - Specialty": 127,
151
+ "Insurance - Property & Casualty": 128,
152
+ "Financial Data & Stock Exchanges": 129,
153
+ "Insurance Brokers": 130,
154
+ "Asset Management": 131,
155
+ "Shell Companies": 132,
156
+ "REIT - Mortgage": 133,
157
+ "REIT - Healthcare Facilities": 134,
158
+ "REIT - Retail": 135,
159
+ "REIT - Diversified": 136,
160
+ "REIT - Residential": 137,
161
+ "REIT - Office": 138,
162
+ "REIT - Industrial": 139,
163
+ "REIT - Hotel & Motel": 140,
164
+ "Real Estate - Diversified": 141,
165
+ "Real Estate Services": 142,
166
+ "REIT - Specialty": 143,
167
+ "Real Estate - Development": 144
168
+ }
169
+ }
json/col_names_map.json ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "col_names_map": {
3
+ "52WeekChange": "52-Week Change",
4
+ "asset_age": "Years Listed",
5
+ "averageAnalystRating": "Avg. Analyst Rating",
6
+ "averageVolume": "Avg. Volume",
7
+ "beta": "Beta",
8
+ "beta3Year": "Beta 3-Year",
9
+ "bookValue": "Book Value",
10
+ "category": "Category",
11
+ "city": "City",
12
+ "country": "Country",
13
+ "currency": "Currency",
14
+ "currentRatio": "Current Ratio",
15
+ "debtToEquity": "Debt To Equity",
16
+ "dividendRate": "Dividend Rate",
17
+ "dividendYield": "Dividend Yield",
18
+ "earningsGrowth": "Earnings Growth",
19
+ "earningsQuarterlyGrowth": "Earnings Quarterly Growth",
20
+ "ebitda": "EBITDA",
21
+ "ebitdaMargins": "EBITDA Margins",
22
+ "enterpriseToEbitda": "Enterprise To EBITDA",
23
+ "enterpriseToRevenue": "Enterprise To Revenue",
24
+ "enterpriseValue": "Enterprise Value",
25
+ "epsCurrentYear": "EPS Current Year",
26
+ "epsForward": "EPS Forward",
27
+ "exchange": "Exchange",
28
+ "exchangeTimezoneName": "Exchange Timezone Name",
29
+ "exchangeTimezoneShortName": "Exchange Timezone Short Name",
30
+ "fiftyDayAverageChange": "50-Day Avg. Change",
31
+ "fiftyDayAverageChangePercent": "50-Day Avg. Change Percent",
32
+ "fiftyTwoWeekHighChangePercent": "52-Week High Change Percent",
33
+ "fiftyTwoWeekLowChange": "52-Week Low Change",
34
+ "fiftyTwoWeekLowChangePercent": "52-Week Low Change Percent",
35
+ "financialCurrency": "Financial Currency",
36
+ "firstTradeDateMilliseconds": "First Trade Date (ms)",
37
+ "fiveYearAvgDividendYield": "5-Year Avg Dividend Yield",
38
+ "floatShares": "Float Shares",
39
+ "forwardPE": "Forward PE",
40
+ "freeCashflow": "Free Cashflow",
41
+ "fullExchangeName": "Full Exchange Name",
42
+ "fullTimeEmployees": "Full Time Employees",
43
+ "fundInceptionDate": "Fund Inception Date",
44
+ "grossMargins": "Gross Margins",
45
+ "grossProfits": "Gross Profits",
46
+ "heldPercentInsiders": "Held Percent Insiders",
47
+ "heldPercentInstitutions": "Held Percent Institutions",
48
+ "ind_sust": "Similarity Index",
49
+ "industry": "GICS Industry",
50
+ "industryDisp": "GICS Industry",
51
+ "industryKey": "GICS Industry Key",
52
+ "legalType": "Legal Type",
53
+ "market": "Market",
54
+ "marketCap": "Market Cap",
55
+ "navPrice": "NAV Price",
56
+ "netAssets": "Net Assets",
57
+ "netExpenseRatio": "Net Expense Ratio",
58
+ "netIncomeToCommon": "Net Income To Common",
59
+ "numberOfAnalystOpinions": "Number Of Analyst Opinions",
60
+ "operatingCashflow": "Operating Cashflow",
61
+ "operatingMargins": "Operating Margins",
62
+ "overallRisk": "Overall Risk",
63
+ "payoutRatio": "Payout Ratio",
64
+ "preMarketChange": "Pre-Market Change",
65
+ "preMarketChangePercent": "Pre-Market Change Percent",
66
+ "preMarketPrice": "Pre-Market Price",
67
+ "previousClose": "Previous Close",
68
+ "priceEpsCurrentYear": "Price/EPS Current Year",
69
+ "priceToBook": "Price To Book",
70
+ "priceToSalesTrailing12Months": "Price To Sales Trailing 12 Months",
71
+ "profitMargins": "Profit Margins",
72
+ "quickRatio": "Quick Ratio",
73
+ "quoteType": "Quote Type",
74
+ "recommendationKey": "Recommendation Key",
75
+ "recommendationMean": "Recommendation Mean",
76
+ "regularMarketChangePercent": "Regular Market Change Percent",
77
+ "ret_365": "1yr Return",
78
+ "returnOnAssets": "Return On Assets",
79
+ "returnOnEquity": "Return On Equity",
80
+ "revenueGrowth": "Revenue Growth",
81
+ "revenuePerShare": "Revenue Per Share",
82
+ "sector": "GICS Sector",
83
+ "sectorDisp": "GICS Sector",
84
+ "sectorKey": "GICS Sector Key",
85
+ "security": "Name",
86
+ "sharesOutstanding": "Shares Outstanding",
87
+ "sharesPercentSharesOut": "Shares Percent Shares Out",
88
+ "sharesShort": "Shares Short",
89
+ "sharesShortPriorMonth": "Shares Short Prior Month",
90
+ "shortPercentOfFloat": "Short Percent Of Float",
91
+ "shortRatio": "Short Ratio",
92
+ "state": "State",
93
+ "threeYearAverageReturn": "3yr Avg. Return",
94
+ "ticker": "Ticker",
95
+ "totalAssets": "Total Assets",
96
+ "totalCash": "Total Cash",
97
+ "totalCashPerShare": "Total Cash Per Share",
98
+ "totalDebt": "Total Debt",
99
+ "totalRevenue": "Total Revenue",
100
+ "trailingAnnualDividendRate": "Trailing Annual Dividend Rate",
101
+ "trailingAnnualDividendYield": "Trailing Annual Dividend Yield",
102
+ "trailingEps": "Trailing EPS",
103
+ "trailingPE": "Trailing PE",
104
+ "trailingPegRatio": "Trailing PEG Ratio",
105
+ "trailingThreeMonthNavReturns": "Trailing 3-Month NAV Returns",
106
+ "trailingThreeMonthReturns": "Trailing 3-Month Returns",
107
+ "twoHundredDayAverageChange": "200-Day Avg. Change",
108
+ "twoHundredDayAverageChangePercent": "200-Day Avg. Change Percent",
109
+ "vol_365": "Volatility",
110
+ "yield": "Yield",
111
+ "ytdReturn": "YTD Return",
112
+ "zip": "Zip"
113
+ }
114
+ }
json/col_names_map_sorted.json ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "col_names_map": {
3
+ "52WeekChange": "52-Week Change",
4
+ "asset_age": "Years Listed",
5
+ "averageAnalystRating": "Avg. Analyst Rating",
6
+ "averageVolume": "Avg. Volume",
7
+ "beta": "Beta",
8
+ "beta3Year": "Beta 3-Year",
9
+ "bookValue": "Book Value",
10
+ "category": "Category",
11
+ "city": "City",
12
+ "country": "Country",
13
+ "currency": "Currency",
14
+ "currentRatio": "Current Ratio",
15
+ "debtToEquity": "Debt To Equity",
16
+ "dividendRate": "Dividend Rate",
17
+ "dividendYield": "Dividend Yield",
18
+ "earningsGrowth": "Earnings Growth",
19
+ "earningsQuarterlyGrowth": "Earnings Quarterly Growth",
20
+ "ebitda": "EBITDA",
21
+ "ebitdaMargins": "EBITDA Margins",
22
+ "enterpriseToEbitda": "Enterprise To EBITDA",
23
+ "enterpriseToRevenue": "Enterprise To Revenue",
24
+ "enterpriseValue": "Enterprise Value",
25
+ "epsCurrentYear": "EPS Current Year",
26
+ "epsForward": "EPS Forward",
27
+ "exchange": "Exchange",
28
+ "exchangeTimezoneName": "Exchange Timezone Name",
29
+ "exchangeTimezoneShortName": "Exchange Timezone Short Name",
30
+ "fiftyDayAverageChange": "50-Day Avg. Change",
31
+ "fiftyDayAverageChangePercent": "50-Day Avg. Change Percent",
32
+ "fiftyTwoWeekHighChangePercent": "52-Week High Change Percent",
33
+ "fiftyTwoWeekLowChange": "52-Week Low Change",
34
+ "fiftyTwoWeekLowChangePercent": "52-Week Low Change Percent",
35
+ "financialCurrency": "Financial Currency",
36
+ "firstTradeDateMilliseconds": "First Trade Date (ms)",
37
+ "fiveYearAvgDividendYield": "5-Year Avg Dividend Yield",
38
+ "floatShares": "Float Shares",
39
+ "forwardPE": "Forward PE",
40
+ "freeCashflow": "Free Cashflow",
41
+ "fullExchangeName": "Full Exchange Name",
42
+ "fullTimeEmployees": "Full Time Employees",
43
+ "fundInceptionDate": "Fund Inception Date",
44
+ "grossMargins": "Gross Margins",
45
+ "grossProfits": "Gross Profits",
46
+ "heldPercentInsiders": "Held Percent Insiders",
47
+ "heldPercentInstitutions": "Held Percent Institutions",
48
+ "ind_sust": "Similarity Index",
49
+ "industry": "Sector",
50
+ "industryDisp": "Industry",
51
+ "industryKey": "Industry Key",
52
+ "legalType": "Legal Type",
53
+ "market": "Market",
54
+ "marketCap": "Market Cap",
55
+ "navPrice": "NAV Price",
56
+ "netAssets": "Net Assets",
57
+ "netExpenseRatio": "Net Expense Ratio",
58
+ "netIncomeToCommon": "Net Income To Common",
59
+ "numberOfAnalystOpinions": "Number Of Analyst Opinions",
60
+ "operatingCashflow": "Operating Cashflow",
61
+ "operatingMargins": "Operating Margins",
62
+ "overallRisk": "Overall Risk",
63
+ "payoutRatio": "Payout Ratio",
64
+ "preMarketChange": "Pre-Market Change",
65
+ "preMarketChangePercent": "Pre-Market Change Percent",
66
+ "preMarketPrice": "Pre-Market Price",
67
+ "previousClose": "Previous Close",
68
+ "priceEpsCurrentYear": "Price/EPS Current Year",
69
+ "priceToBook": "Price To Book",
70
+ "priceToSalesTrailing12Months": "Price To Sales Trailing 12 Months",
71
+ "profitMargins": "Profit Margins",
72
+ "quickRatio": "Quick Ratio",
73
+ "quoteType": "Quote Type",
74
+ "recommendationKey": "Recommendation Key",
75
+ "recommendationMean": "Recommendation Mean",
76
+ "regularMarketChangePercent": "Regular Market Change Percent",
77
+ "ret_365": "1yr Return",
78
+ "returnOnAssets": "Return On Assets",
79
+ "returnOnEquity": "Return On Equity",
80
+ "revenueGrowth": "Revenue Growth",
81
+ "revenuePerShare": "Revenue Per Share",
82
+ "sector": "Sector",
83
+ "sectorDisp": "Sector",
84
+ "sectorKey": "Sector Key",
85
+ "security": "Name",
86
+ "sharesOutstanding": "Shares Outstanding",
87
+ "sharesPercentSharesOut": "Shares Percent Shares Out",
88
+ "sharesShort": "Shares Short",
89
+ "sharesShortPriorMonth": "Shares Short Prior Month",
90
+ "shortPercentOfFloat": "Short Percent Of Float",
91
+ "shortRatio": "Short Ratio",
92
+ "state": "State",
93
+ "threeYearAverageReturn": "3yr Avg. Return",
94
+ "ticker": "Ticker",
95
+ "totalAssets": "Total Assets",
96
+ "totalCash": "Total Cash",
97
+ "totalCashPerShare": "Total Cash Per Share",
98
+ "totalDebt": "Total Debt",
99
+ "totalRevenue": "Total Revenue",
100
+ "trailingAnnualDividendRate": "Trailing Annual Dividend Rate",
101
+ "trailingAnnualDividendYield": "Trailing Annual Dividend Yield",
102
+ "trailingEps": "Trailing EPS",
103
+ "trailingPE": "Trailing PE",
104
+ "trailingPegRatio": "Trailing PEG Ratio",
105
+ "trailingThreeMonthNavReturns": "Trailing 3-Month NAV Returns",
106
+ "trailingThreeMonthReturns": "Trailing 3-Month Returns",
107
+ "twoHundredDayAverageChange": "200-Day Avg. Change",
108
+ "twoHundredDayAverageChangePercent": "200-Day Avg. Change Percent",
109
+ "vol_365": "Volatility",
110
+ "yield": "Yield",
111
+ "ytdReturn": "YTD Return",
112
+ "zip": "Zip"
113
+ }
114
+ }
json/cols_tabla_equity.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cols_tabla_equity": [
3
+ "ticker",
4
+ "security",
5
+ "country",
6
+ "sector",
7
+ "marketCap",
8
+ "ret_365",
9
+ "vol_365",
10
+ "trailingPE",
11
+ "revenueGrowth",
12
+ "dividendYield",
13
+ "beta"
14
+ ]
15
+ }
json/embeddings_excluded_words.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "excluded_words": [
3
+ "us",
4
+ "china",
5
+ "japan",
6
+ "russia",
7
+ "india",
8
+ "europe",
9
+ "company",
10
+ "operates",
11
+ "provides",
12
+ "offers",
13
+ "headquartered",
14
+ "based",
15
+ "incorporated",
16
+ "together",
17
+ "founded",
18
+ "business",
19
+ "businesses",
20
+ "companies",
21
+ "customers",
22
+ "clients",
23
+ "under",
24
+ "co",
25
+ "inc",
26
+ "nv",
27
+ "ltd",
28
+ "limited",
29
+ "normal market conditions",
30
+ "the fund will normally invest",
31
+ "the fund invests",
32
+ "normal circumstances",
33
+ "at least",
34
+ "of the fund",
35
+ "seeks to achieve",
36
+ "through its subsidiaries",
37
+ "with its subsidiaries"
38
+ ]
39
+ }
json/gamma_params.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "shape": 1000,
3
+ "loc": -8,
4
+ "scale": 0.009,
5
+ "max_dist": 2,
6
+ "precision_cdf": 1000
7
+ }
json/ignore_columns.json ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "ignore_columns": [
3
+ "address1",
4
+ "phone",
5
+ "governanceEpochDate",
6
+ "maxAge",
7
+ "tradeable",
8
+ "SandP52WeekChange",
9
+ "language",
10
+ "region",
11
+ "typeDisp",
12
+ "quoteSourceName",
13
+ "esgPopulated",
14
+ "postMarketTime",
15
+ "regularMarketTime",
16
+ "marketState",
17
+ "exchangeDataDelayedBy",
18
+ "cryptoTradeable",
19
+ "postMarketChangePercent",
20
+ "postMarketPrice",
21
+ "postMarketChange",
22
+ "isEarningsDateEstimate",
23
+ "gmtOffSetMilliseconds",
24
+ "preMarketTime",
25
+ "preMarketPrice",
26
+ "preMarketChange",
27
+ "preMarketChangePercent",
28
+ "governanceEpochDate",
29
+ "compensationAsOfEpochDate",
30
+ "sharesShortPreviousMonthDate",
31
+ "dateShortInterest",
32
+ "dividendDate",
33
+ "earningsTimestamp",
34
+ "earningsTimestampStart",
35
+ "earningsTimestampEnd",
36
+ "earningsCallTimestampStart",
37
+ "earningsCallTimestampEnd",
38
+ "priceHint",
39
+ "triggerable",
40
+ "customPriceAlertConfidence",
41
+ "messageBoardId",
42
+ "hasPrePostMarketData",
43
+ "sourceInterval",
44
+ "open",
45
+ "dayLow",
46
+ "dayHigh",
47
+ "regularMarketPreviousClose",
48
+ "bid",
49
+ "ask",
50
+ "bidSize",
51
+ "askSize",
52
+ "regularMarketOpen",
53
+ "regularMarketDayLow",
54
+ "regularMarketDayHigh",
55
+ "twoHundredDayAverage",
56
+ "lastDividendValue",
57
+ "targetHighPrice",
58
+ "targetLowPrice",
59
+ "targetMeanPrice",
60
+ "targetMedianPrice",
61
+ "regularMarketPrice",
62
+ "regularMarketChangePercentfiftyTwoWeekLowChange",
63
+ "fiftyTwoWeekHighChange",
64
+ "fiftyTwoWeekLow",
65
+ "fiftyTwoWeekHigh",
66
+ "fiftyDayAverage",
67
+ "dividendRatefiftyDayAverage",
68
+ "regularMarketChange",
69
+ "exDividendDate",
70
+ "lastFiscalYearEnd",
71
+ "nextFiscalYearEnd",
72
+ "mostRecentQuarter",
73
+ "nameChangeDate",
74
+ "lastSplitDate",
75
+ "lastDividendDate",
76
+ "earningsCallTimestampStart",
77
+ "earningsCallTimestampEnd",
78
+ "regularMarketVolume",
79
+ "volume",
80
+ "averageDailyVolume10Day",
81
+ "averageDailyVolume3Month",
82
+ "epsTrailingTwelveMonths",
83
+ "averageVolume10days",
84
+ "auditRisk",
85
+ "boardRisk",
86
+ "compensationRisk",
87
+ "shareHolderRightsRisk",
88
+ "epsTrailingTwelveMonths",
89
+ "currentPrice",
90
+ "forwardEps",
91
+ "impliedSharesOutstanding",
92
+ "averageDailyVolume10Day",
93
+ "volume",
94
+ "fiftyTwoWeekChangePercent"
95
+ ]
96
+ }
json/industry_lists.json ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "Technology": [
3
+ "Consumer Electronics",
4
+ "Software - Infrastructure",
5
+ "Semiconductors",
6
+ "Software - Application",
7
+ "Semiconductor Equipment & Materials",
8
+ "Communication Equipment",
9
+ "Information Technology Services",
10
+ "Scientific & Technical Instruments",
11
+ "Computer Hardware",
12
+ "Electronic Components",
13
+ "Solar",
14
+ "Electronics & Computer Distribution"
15
+ ],
16
+ "Consumer Cyclical": [
17
+ "Internet Retail",
18
+ "Auto Manufacturers",
19
+ "Home Improvement Retail",
20
+ "Luxury Goods",
21
+ "Restaurants",
22
+ "Apparel Retail",
23
+ "Travel Services",
24
+ "Footwear & Accessories",
25
+ "Auto Parts",
26
+ "Furnishings, Fixtures & Appliances",
27
+ "Lodging",
28
+ "Specialty Retail",
29
+ "Gambling",
30
+ "Residential Construction",
31
+ "Leisure",
32
+ "Auto & Truck Dealerships",
33
+ "Personal Services",
34
+ "Resorts & Casinos",
35
+ "Packaging & Containers",
36
+ "Department Stores",
37
+ "Apparel Manufacturing",
38
+ "Textile Manufacturing",
39
+ "Recreational Vehicles"
40
+ ],
41
+ "Communication Services": [
42
+ "Internet Content & Information",
43
+ "Entertainment",
44
+ "Telecom Services",
45
+ "Electronic Gaming & Multimedia",
46
+ "Advertising Agencies",
47
+ "Publishing",
48
+ "Broadcasting"
49
+ ],
50
+ "Energy": [
51
+ "Oil & Gas Integrated",
52
+ "Oil & Gas Refining & Marketing",
53
+ "Oil & Gas E&P",
54
+ "Thermal Coal",
55
+ "Oil & Gas Midstream",
56
+ "Oil & Gas Equipment & Services",
57
+ "Uranium",
58
+ "Oil & Gas Drilling"
59
+ ],
60
+ "Financial Services": [
61
+ "Insurance - Diversified",
62
+ "Banks - Diversified",
63
+ "Credit Services",
64
+ "Capital Markets",
65
+ "Banks - Regional",
66
+ "Asset Management",
67
+ "Insurance - Property & Casualty",
68
+ "Financial Data & Stock Exchanges",
69
+ "Insurance - Life",
70
+ "Insurance Brokers",
71
+ "Insurance - Reinsurance",
72
+ "Mortgage Finance",
73
+ "Financial Conglomerates",
74
+ "Insurance - Specialty",
75
+ "Shell Companies"
76
+ ],
77
+ "Healthcare": [
78
+ "Drug Manufacturers - General",
79
+ "Healthcare Plans",
80
+ "Medical Devices",
81
+ "Medical Instruments & Supplies",
82
+ "Diagnostics & Research",
83
+ "Biotechnology",
84
+ "Medical Distribution",
85
+ "Medical Care Facilities",
86
+ "Drug Manufacturers - Specialty & Generic",
87
+ "Health Information Services",
88
+ "Pharmaceutical Retailers"
89
+ ],
90
+ "Consumer Defensive": [
91
+ "Discount Stores",
92
+ "Household & Personal Products",
93
+ "Beverages - Non - Alcoholic",
94
+ "Packaged Foods",
95
+ "Beverages - Wineries & Distilleries",
96
+ "Tobacco",
97
+ "Beverages - Brewers",
98
+ "Confectioners",
99
+ "Grocery Stores",
100
+ "Food Distribution",
101
+ "Farm Products",
102
+ "Education & Training Services"
103
+ ],
104
+ "Basic Materials": [
105
+ "Specialty Chemicals",
106
+ "Other Industrial Metals & Mining",
107
+ "Copper",
108
+ "Gold",
109
+ "Building Materials",
110
+ "Chemicals",
111
+ "Agricultural Inputs",
112
+ "Steel",
113
+ "Paper & Paper Products",
114
+ "Aluminum",
115
+ "Other Precious Metals & Mining",
116
+ "Lumber & Wood Production",
117
+ "Silver",
118
+ "Coking Coal"
119
+ ],
120
+ "Industrials": [
121
+ "Aerospace & Defense",
122
+ "Specialty Industrial Machinery",
123
+ "Farm & Heavy Construction Machinery",
124
+ "Electrical Equipment & Parts",
125
+ "Conglomerates",
126
+ "Railroads",
127
+ "Specialty Business Services",
128
+ "Waste Management",
129
+ "Integrated Freight & Logistics",
130
+ "Building Products & Equipment",
131
+ "Engineering & Construction",
132
+ "Industrial Distribution",
133
+ "Consulting Services",
134
+ "Rental & Leasing Services",
135
+ "Airports & Air Services",
136
+ "Infrastructure Operations",
137
+ "Trucking",
138
+ "Security & Protection Services",
139
+ "Marine Shipping",
140
+ "Airlines",
141
+ "Pollution & Treatment Controls",
142
+ "Tools & Accessories",
143
+ "Metal Fabrication",
144
+ "Staffing & Employment Services",
145
+ "Business Equipment & Supplies"
146
+ ],
147
+ "Utilities": [
148
+ "Utilities - Regulated Electric",
149
+ "Utilities - Diversified",
150
+ "Utilities - Renewable",
151
+ "Utilities - Independent Power Producers",
152
+ "Utilities - Regulated Water",
153
+ "Utilities - Regulated Gas"
154
+ ],
155
+ "Real Estate": [
156
+ "REIT - Specialty",
157
+ "REIT - Healthcare Facilities",
158
+ "REIT - Industrial",
159
+ "REIT - Retail",
160
+ "Real Estate - Diversified",
161
+ "Real Estate Services",
162
+ "REIT - Diversified",
163
+ "REIT - Residential",
164
+ "Real Estate - Development",
165
+ "REIT - Office",
166
+ "REIT - Mortgage",
167
+ "REIT - Hotel & Motel"
168
+ ]
169
+ }
json/nn_search_metrics.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "nn_search_metrics": [
3
+ "industryDisp_num_norm",
4
+ "sectorDisp_num_norm",
5
+ "country_num_norm",
6
+ "ret_365_norm",
7
+ "vol_365_norm",
8
+ "marketCap_norm",
9
+ "beta_norm",
10
+ "revenueGrowth_norm",
11
+ "debtToEquity_norm",
12
+ "fullTimeEmployees_norm",
13
+ "trailingPE_norm"
14
+ ]
15
+ }
json/numeric_columns.json ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "numeric_columns": [
3
+ "netAssets",
4
+ "threeYearAverageReturn",
5
+ "fiveYearAvgDividendYield",
6
+ "preMarketPrice",
7
+ "payoutRatio",
8
+ "heldPercentInstitutions",
9
+ "epsForward",
10
+ "sharesShort",
11
+ "preMarketChange",
12
+ "fiftyTwoWeekLowChange",
13
+ "enterpriseToEbitda",
14
+ "quickRatio",
15
+ "yield",
16
+ "operatingMargins",
17
+ "firstTradeDateMilliseconds",
18
+ "priceEpsCurrentYear",
19
+ "bookValue",
20
+ "forwardPE",
21
+ "profitMargins",
22
+ "netIncomeToCommon",
23
+ "priceToSalesTrailing12Months",
24
+ "currentRatio",
25
+ "ebitda",
26
+ "beta3Year",
27
+ "ebitdaMargins",
28
+ "trailingAnnualDividendYield",
29
+ "trailingThreeMonthNavReturns",
30
+ "sharesOutstanding",
31
+ "trailingPE",
32
+ "totalDebt",
33
+ "netExpenseRatio",
34
+ "dividendRate",
35
+ "totalAssets",
36
+ "heldPercentInsiders",
37
+ "trailingPegRatio",
38
+ "totalRevenue",
39
+ "totalCashPerShare",
40
+ "previousClose",
41
+ "returnOnAssets",
42
+ "revenuePerShare",
43
+ "enterpriseValue",
44
+ "debtToEquity",
45
+ "epsCurrentYear",
46
+ "dividendYield",
47
+ "revenueGrowth",
48
+ "52WeekChange",
49
+ "shortRatio",
50
+ "numberOfAnalystOpinions",
51
+ "operatingCashflow",
52
+ "sharesShortPriorMonth",
53
+ "twoHundredDayAverageChangePercent",
54
+ "grossProfits",
55
+ "sharesPercentSharesOut",
56
+ "overallRisk",
57
+ "priceToBook",
58
+ "trailingThreeMonthReturns",
59
+ "returnOnEquity",
60
+ "fiftyTwoWeekLowChangePercent",
61
+ "fullTimeEmployees",
62
+ "floatShares",
63
+ "regularMarketChangePercent",
64
+ "marketCap",
65
+ "averageVolume",
66
+ "trailingAnnualDividendRate",
67
+ "earningsGrowth",
68
+ "trailingEps",
69
+ "grossMargins",
70
+ "fiftyDayAverageChangePercent",
71
+ "shortPercentOfFloat",
72
+ "fiftyDayAverageChange",
73
+ "ytdReturn",
74
+ "preMarketChangePercent",
75
+ "earningsQuarterlyGrowth",
76
+ "fiftyTwoWeekHighChangePercent",
77
+ "freeCashflow",
78
+ "recommendationMean",
79
+ "fundInceptionDate",
80
+ "navPrice",
81
+ "beta",
82
+ "totalCash",
83
+ "enterpriseToRevenue",
84
+ "twoHundredDayAverageChange"
85
+ ]
86
+ }
json/semantic_search_params.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "semantic_search_params": {
3
+ "k": 2000,
4
+ "brevity_penalty": 0.1,
5
+ "reward_for_literal": 0.03,
6
+ "partial_match_factor": 0.8
7
+ }
8
+ }
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
transformers==4.44.2
sentence-transformers
torch
scikit-learn
scipy
numpy
pandas
datasets
duckdb
gradio
# NOTE: `pathlib` and `json` were removed from this list: both are part of the
# Python standard library and must not be pip-installed. There is no `json`
# distribution on PyPI, and the PyPI `pathlib` package is an obsolete Python 2
# backport that can shadow the stdlib module.
src/__init__.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# src/__init__.py
from importlib import import_module
import sys

# Aggregate core sub-modules so a caller imports the package once instead of
# listing each file.
__all__ = [
    "front_dataset_handler",
    "env_options",
    "semantic_search",
]

for _mod_name in __all__:
    _mod = import_module(f".{_mod_name}", __name__)
    globals()[_mod_name] = _mod
    # Pre-register the bare name so intra-package imports
    # (e.g. `import front_dataset_handler`) resolve to the loaded module.
    sys.modules[_mod_name] = _mod

# Clean up the loop helpers so they do not leak as public package attributes.
del _mod_name, _mod
src/app_utils.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from typing import Sequence, Any
3
+
4
+ import re
5
+
6
+ _NEG_COLOR = "red"
7
+
8
def format_large_number(n, decimals=2):
    """Format *n* with a T/B/M suffix (e.g. ``1.5e9`` -> ``'1.50 B'``).

    Values whose magnitude is below one million are returned via ``str``.
    Negative values keep their sign in front of the suffixed figure (the
    previous version compared the signed value and always fell through to
    ``str`` for negatives).
    """
    sign = "-" if n < 0 else ""
    magnitude = abs(n)
    if magnitude >= 1e12:
        return f"{sign}{magnitude / 1e12:.{decimals}f} T"
    if magnitude >= 1e9:
        return f"{sign}{magnitude / 1e9:.{decimals}f} B"
    if magnitude >= 1e6:
        return f"{sign}{magnitude / 1e6:.{decimals}f} M"
    return str(n)
17
+
18
def format_results(df: pd.DataFrame, rename_columns: dict) -> pd.DataFrame:
    """Apply display formatting to the known metric columns and rename them.

    Missing values render as ``"-"``; for the ratio/volatility columns a
    value of exactly zero is also shown as missing. The frame is modified
    column by column and returned under the display names in
    *rename_columns*.
    """

    def _fmt(value, formatter):
        # Shared "dash for NaN, otherwise format" helper.
        return "-" if pd.isna(value) else formatter(value)

    # Substitutability index, rescaled to a 0-100 integer.
    if "ind_sust" in df.columns:
        df["ind_sust"] = df["ind_sust"].apply(
            lambda x: _fmt(x, lambda v: int(round(v * 100, 0)))
        )

    # One decimal place.
    for column in ("trailingPE", "beta"):
        if column in df.columns:
            df[column] = df[column].apply(lambda x: _fmt(x, lambda v: f"{v:.1f}"))

    # Two decimal places.
    if "Search dist." in df.columns:
        df["Search dist."] = df["Search dist."].apply(
            lambda n: _fmt(n, lambda v: f"{v:.2f}")
        )

    # Large monetary amounts.
    if "marketCap" in df.columns:
        df["marketCap"] = df["marketCap"].apply(
            lambda n: _fmt(n, lambda v: format_large_number(v, 1))
        )

    # Ratios shown as percentages, one decimal (zero treated as missing).
    for column in ("ret_365", "revenueGrowth"):
        if column in df.columns:
            df[column] = df[column].apply(
                lambda x: "-" if pd.isna(x) or x == 0 else f"{(x * 100):.1f}%"
            )

    # Values already expressed as percentages in the source data.
    if "dividendYield" in df.columns:
        df["dividendYield"] = df["dividendYield"].apply(
            lambda x: _fmt(x, lambda v: f"{round(v, 1)}%")
        )

    # Volatility, four decimals (zero treated as missing).
    if "vol_365" in df.columns:
        df["vol_365"] = df["vol_365"].apply(
            lambda x: "-" if pd.isna(x) or x == 0 else f"{x:.4f}"
        )

    # Return the frame under the display column names.
    return df.rename(columns=rename_columns)
48
+
49
+
50
def random_ticker(df: pd.DataFrame) -> str:
    """Pick one ticker symbol uniformly at random from the ``ticker`` column."""
    return df["ticker"].sample(n=1).iloc[0]
52
+
53
def styler_negative_red(df: pd.DataFrame, cols: list[str] | None = None):
    """
    Return a Styler that paints negative numeric values in *cols*.

    Cell values are parsed after stripping the display decorations added by
    ``format_results`` (spaces, '%', ',' and the T/M/B suffixes). Columns
    absent in *df* are ignored; ``cols=None`` (or an empty list) means all
    columns.
    """
    cols = [c for c in (cols or df.columns) if c in df.columns]

    def _style(v):
        try:
            num = float(re.sub(r"[ %,TMB]", "", str(v)))
            if num < 0:
                return f"color:{_NEG_COLOR}"
        except ValueError:
            # Non-numeric cells (e.g. the "-" placeholder) get no styling.
            pass
        return ""

    # pandas >= 2.1 renamed Styler.applymap to Styler.map; prefer the new
    # name to avoid the FutureWarning while staying compatible with older
    # pandas versions.
    apply_elementwise = getattr(df.style, "map", None) or df.style.applymap
    return apply_elementwise(_style, subset=cols)
src/env_options.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import os
3
+ import torch
4
+ import transformers
5
+ from typing import List, Dict
6
+
7
def check_env(colab: bool = False, use_dotenv: bool = True, dotenv_path: str | None = None,
              colab_secrets: dict | None = None, env_tokens: List[str] | None = None) -> Dict[str, str]:
    """Print runtime/GPU diagnostics and collect the requested secret tokens.

    Parameters
    ----------
    colab : when True (and dotenv is disabled), look tokens up in *colab_secrets*.
    use_dotenv : load a ``.env`` file first and read tokens from the environment.
    dotenv_path : optional path handed to ``dotenv.load_dotenv``.
    colab_secrets : mapping of secret names to values (Colab only); may be None.
    env_tokens : names of the tokens to retrieve.

    Returns
    -------
    dict mapping each *found* token name to its (unmasked) value; missing
    tokens are reported on stdout and omitted.
    """
    # Checking versions and GPU availability:
    print(f"Python version: {sys.version}")
    print(f"PyTorch version: {torch.__version__}")
    print(f"Transformers version: {transformers.__version__}")
    if torch.cuda.is_available():
        print(f"CUDA device: {torch.cuda.get_device_name(0)}")
        print(f"CUDA Version: {torch.version.cuda}")
        print(f"FlashAttention available: {torch.backends.cuda.flash_sdp_enabled()}")
    else:
        print("No CUDA device available")

    if use_dotenv:
        from dotenv import load_dotenv
        load_dotenv(dotenv_path)  # path to your dotenv file
        print(f"Retrieving token(s) from {dotenv_path} or environment variables")

    def mask_token(token, unmasked_chars=4):
        # Never echo more characters than the token actually hides: short
        # tokens are fully masked instead of being printed almost verbatim
        # (the previous version produced zero or "negative" stars for
        # tokens of length <= 2 * unmasked_chars).
        if len(token) <= unmasked_chars * 2:
            return '*' * len(token)
        return token[:unmasked_chars] + '*' * (len(token) - unmasked_chars * 2) + token[-unmasked_chars:]

    tokens = {}
    for token_name in env_tokens or []:
        if use_dotenv:
            token = os.getenv(token_name)
        elif colab:
            # Guard against a missing secrets mapping instead of raising
            # AttributeError on None.
            token = (colab_secrets or {}).get(token_name)
        else:
            token = os.environ.get(token_name)

        if token is None:
            print(f"{token_name} not found in the provided .env file or environment variables")
        else:
            print(f"Using {token_name}: {mask_token(token)}")
            tokens[token_name] = token

    return tokens
src/front_dataset_handler.py ADDED
@@ -0,0 +1,385 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import numpy as np
3
+ import pandas as pd
4
+ from sklearn.neighbors import NearestNeighbors
5
+ from sklearn.preprocessing import QuantileTransformer
6
+ from scipy.stats import gamma
7
+ import json
8
+
9
class FrontDatasetHandler:
    """Builds and enriches the dataset used by the front-end app: price-based
    return/volatility metrics, categorical-to-numeric encodings, quantile
    normalization, embedding attachment and nearest-neighbour search over the
    master asset table (``maestro``)."""

    def __init__(self, maestro: pd.DataFrame=None, precios_cierre: pd.DataFrame=None, app_dataset: pd.DataFrame=None,
                 json_path: str = None, pickle_path: str = None):
        """Store the input frames and, when the app dataset still has to be
        built, load the JSON column-configuration files from *json_path*."""
        self.maestro = maestro
        self.app_dataset = app_dataset  # Dataframe already preprocessed for the app
        self.pickle_path = pickle_path
        # Load the JSON files needed to build the app dataset when it was not passed as an argument:
        if self.app_dataset is None and json_path is not None:
            with open(os.path.join(json_path, "ignore_columns.json"), "r") as f:
                self.ignore_columns = json.load(f)['ignore_columns']
            print(f"ignore_columns: {self.ignore_columns}")
            with open(os.path.join(json_path, "numeric_columns.json"), "r") as f:
                self.numeric_columns = json.load(f)['numeric_columns']
            print(f"numeric_columns: {self.numeric_columns}")
            with open(os.path.join(json_path, "app_column_config.json"), "rb") as f:
                self.app_dataset_cols = json.load(f)['app_dataset_cols']
            print(f"app_dataset_cols: {self.app_dataset_cols}")

            with open(os.path.join(json_path, "cat_to_num_maps.json"), "r") as f:
                num_maps = json.load(f)
            self.sector_num_map = num_maps['sector_num_map']
            self.industry_num_map = num_maps['industry_num_map']

        self.norm_columns = None
        if maestro is not None:
            maestro.drop(columns=self.ignore_columns, inplace=True, errors='ignore')
        self.precios_cierre = precios_cierre  # Only needed when the app dataset has to be preprocessed
        self.rend_diario_log = None
        self.precios_cierre_fh = None
        self.rendimientos_y_volatilidad = None
        self.mapeos_var_categoricas = None
        self.activos_descartados = []
        self.quantile_scaler = None

    def filtra_y_homogeneiza(self, n_dias=366, n_dias_descartar=1, min_dias=100):
        """Trim the close-price table to the last *n_dias*, drop the most
        recent *n_dias_descartar* sessions, forward-fill gaps and discard
        assets with fewer than *min_dias* valid observations."""
        if self.precios_cierre.index.name != 'date':
            self.precios_cierre.set_index('date', inplace=True)
        self.precios_cierre.columns.name = 'ticker'
        end_date = self.precios_cierre.index.max()
        start_date = end_date - pd.Timedelta(days=n_dias)

        # Keep only the rows inside the date window
        precios_cierre_fh = self.precios_cierre.loc[start_date:end_date].copy()

        # Discard the most recent n_dias_descartar sessions
        if n_dias_descartar > 0:
            dates_to_drop = precios_cierre_fh.index.sort_values(ascending=False)[:n_dias_descartar]
            precios_cierre_fh.drop(dates_to_drop, inplace=True)

        precios_cierre_fh.ffill(axis=0, inplace=True)  # Fill gaps with the previous day's close
        self.activos_descartados = precios_cierre_fh.columns[precios_cierre_fh.notna().sum(axis=0) < min_dias].tolist()
        precios_cierre_fh.drop(columns=self.activos_descartados, inplace=True)
        self.precios_cierre = precios_cierre_fh
        return

    def calcula_rendimientos_y_volatilidad(self, n_dias=365, umbral_max=0.3, umbral_min=-0.3):
        """Compute log returns, simple returns and volatility over the last
        *n_dias* and append them as rows of ``rendimientos_y_volatilidad``.
        Assets with any daily log return outside (umbral_min, umbral_max)
        are flagged as outliers and discarded."""
        end_date = self.precios_cierre.index.max()
        start_date = end_date - pd.Timedelta(days=n_dias)
        # The table does not always include weekends/holidays, so fall back to the closest date before start_date
        if start_date not in self.precios_cierre.index:
            previous_dates = self.precios_cierre.index[self.precios_cierre.index < start_date]
            if len(previous_dates) > 0:
                start_date = previous_dates.max()
            else:
                raise ValueError(f"No hay datos históricos suficientes ({n_dias}, {end_date})")
        _df_rend_y_vol = self.precios_cierre.loc[start_date:end_date].copy()

        _df_rend_y_vol.dropna(how='all', inplace=True)  #####
        # Replace zero and negative values (formatting errors) with the smallest positive float
        _df_rend_y_vol[_df_rend_y_vol <= 0] = np.nextafter(0, 1)
        if self.activos_descartados:
            _df_rend_y_vol = _df_rend_y_vol.drop(columns=[col for col in self.activos_descartados if col in _df_rend_y_vol.columns])
        if len(_df_rend_y_vol) == 0:
            raise ValueError(f"No hay datos disponibles en el rango de {n_dias} días")


        _rend_diario_log = np.log(_df_rend_y_vol).diff()
        _rend_diario_log = _rend_diario_log.iloc[1:]  # Drop the first row (diff leaves it as NaN)
        # _rend_diario_log.dropna(how='all', inplace=True)
        print(f'Datos rentabilidad ({n_dias} días) con outliers: {_rend_diario_log.shape}')
        # Identify assets to discard (outliers)
        _activos_outliers = _rend_diario_log.columns[((_rend_diario_log > umbral_max) | (_rend_diario_log < umbral_min)).any()].tolist()
        self.activos_descartados.extend([asset for asset in _activos_outliers if asset not in self.activos_descartados])
        # Drop assets with abnormal returns
        _rend_diario_log = _rend_diario_log.loc[:, ~((_rend_diario_log > umbral_max) | (_rend_diario_log < umbral_min)).any()]
        print(f'Datos rentabilidad sin outliers: {_rend_diario_log.shape}')

        self.rend_diario_log = _rend_diario_log.copy()

        # Initialize rendimientos_y_volatilidad if it does not exist yet
        if self.rendimientos_y_volatilidad is None:
            self.rendimientos_y_volatilidad = pd.DataFrame(columns=_rend_diario_log.columns)
            # print(f'INIT: Tabla rendimientos {n_dias}: {self.rendimientos_y_volatilidad.shape}')
        else:
            # Keep only the assets that are present in _rend_diario_log
            self.rendimientos_y_volatilidad = self.rendimientos_y_volatilidad.loc[:, _rend_diario_log.columns]
            # print(f'Tabla rendimientos {n_dias}: {self.rendimientos_y_volatilidad.shape}')

        # Append the new rows for the current n_dias
        self.rendimientos_y_volatilidad.loc[f'ret_log_{n_dias}'] = np.sum(_rend_diario_log, axis=0)
        self.rendimientos_y_volatilidad.loc[f'ret_{n_dias}'] = (_df_rend_y_vol.ffill().bfill().iloc[-1] / _df_rend_y_vol.ffill().bfill().iloc[0]) - 1
        self.rendimientos_y_volatilidad.loc[f'vol_{n_dias}'] = _rend_diario_log.var()**0.5

        return

    def cruza_maestro(self):
        """Merge the computed return/volatility rows into ``maestro`` by
        ticker, replacing infinities with NaN. Raises if no new columns
        would be added."""
        _rets_y_vol_maestro = self.rendimientos_y_volatilidad.T.reset_index().copy()
        _columns_to_merge = [col for col in _rets_y_vol_maestro.columns if col not in self.maestro.columns]
        if len(_columns_to_merge) > 0:
            _maestro_v2 = self.maestro.merge(_rets_y_vol_maestro, left_on='ticker', right_on='ticker')
            _maestro_v2 = _maestro_v2.replace([float('inf'), float('-inf')], np.nan)
            self.maestro = _maestro_v2
        else:
            raise ValueError("No hay nuevas columnas para cruzar con el dataframe maestro")
        return

    def _cat_to_num_(self, df, cat, pre_map=None):
        """
        Map a categorical column of a DataFrame to integer codes.

        Without *pre_map*, codes are assigned by frequency: 0 for the most
        frequent category, 1 for the next most frequent, and so on. With
        *pre_map*, that mapping is used for the conversion instead.

        Parameters
        ----------
        df : pandas.DataFrame
            DataFrame containing the categorical column to transform.
        cat : str
            Name of the categorical column to transform.
        pre_map : dict, optional
            Mapping from each category to a numeric code. Generated
            automatically when not provided.

        Returns
        -------
        pandas.DataFrame
            Two columns: the original categorical column and a column with
            the assigned numeric codes (``<cat>_num``).
        """
        if not pre_map:
            pivot = pd.pivot_table(df, index=[cat], aggfunc='size')
            df_sorted = pivot.sort_values(ascending=False).reset_index(name='count')
            df_sorted[cat + '_num'] = range(len(df_sorted))
        else:
            df_sorted = pd.DataFrame({cat: list(pre_map.keys()), cat + '_num': list(pre_map.values())})
        return df_sorted

    def var_categorica_a_numerica(self, cat_cols):
        """Add a numeric companion column (``<col>_num``) for every
        categorical column in *cat_cols*, using the pre-defined
        sector/industry maps where available and frequency-based codes
        otherwise."""
        for col in cat_cols:
            if col == 'sectorDisp':
                globals()[f"pt_{col}"] = self._cat_to_num_(self.maestro, col, self.sector_num_map)
            elif col == 'industryDisp':
                globals()[f"pt_{col}"] = self._cat_to_num_(self.maestro, col, self.industry_num_map)
            else:
                globals()[f"pt_{col}"] = self._cat_to_num_(self.maestro, col)  # One mapping dataframe per categorical variable, coded by frequency
        self.mapeos_var_categoricas = [globals()[f"pt_{col}"] for col in cat_cols]  # List of dataframes with the mapping of each categorical variable

        _maestro = self.maestro.copy()
        for col, pt in zip(cat_cols, self.mapeos_var_categoricas):
            _maestro[col] = _maestro[col].astype(str)
            pt[col] = pt[col].astype(str)
            # Build a dict from each category to its numeric equivalent
            mapping_dict = dict(zip(pt[col], pt[col + '_num']))
            _maestro[col + '_num'] = _maestro[col].map(mapping_dict)
            _maestro[col + '_num'] = pd.to_numeric(_maestro[col + '_num'], errors='coerce')

        self.maestro = _maestro
        return

    def normaliza_por_cuantiles(self):
        """Quantile-normalize every numeric column of ``maestro`` into
        [0, 1], adding ``<col>_norm`` columns; all-NaN columns are skipped.
        The fitted scaler is kept in ``quantile_scaler``."""
        maestro_copy = self.maestro.copy()
        numeric_columns = maestro_copy.select_dtypes(include=np.number).columns
        self.quantile_scaler = QuantileTransformer(output_distribution='uniform')
        variables_numericas = [col for col in numeric_columns if not col.endswith('_norm')]
        all_na_cols = [col for col in variables_numericas if maestro_copy[col].isna().all()]
        variables_numericas = [col for col in variables_numericas if col not in all_na_cols]
        self.norm_columns = ['{}_norm'.format(var) for var in variables_numericas]
        maestro_copy[self.norm_columns] = self.quantile_scaler.fit_transform(maestro_copy[variables_numericas])
        maestro_copy[self.norm_columns] = maestro_copy[self.norm_columns].clip(0, 1)
        self.maestro = maestro_copy
        return

    def var_estandar_z(self):
        """Add z-scored companion columns (``<col>_std``) for every numeric
        column; constant or empty columns standardize to 0.0."""
        maestro_copy = self.maestro.copy()
        numeric_columns = maestro_copy.select_dtypes(include=np.number).columns
        variables_numericas = [col for col in numeric_columns if not col.endswith('_std')]
        variables_num_std = ['{}_std'.format(var) for var in variables_numericas]

        def estandarizar(x):
            # z-score: subtract the mean and divide by the standard deviation
            mean_val = x.mean()
            std_val = x.std()
            if pd.isna(std_val) or std_val == 0:
                return pd.Series(0.0, index=x.index, name=x.name)
            else:
                normalized_series = (x - mean_val) / std_val
                return normalized_series.fillna(0.0)

        normalized_data = maestro_copy[variables_numericas].apply(estandarizar, axis=0)
        maestro_copy[variables_num_std] = normalized_data
        self.maestro = maestro_copy
        return

    def configura_distr_prob(self, shape, loc, scale, max_dist, precision_cdf):
        """Sample the gamma PDF and CDF on a regular grid over
        [0, max_dist] with *precision_cdf* points; the CDF is later used to
        turn neighbour distances into substitutability scores."""
        x = np.linspace(0, max_dist, num=precision_cdf)
        y_pdf = gamma.pdf(x, shape, loc, scale )
        y_cdf = gamma.cdf(x, shape, loc, scale )
        return y_pdf, y_cdf

    def calculos_y_ajustes_dataset_activos(self):
        """Coerce problematic columns to numeric, derive the asset age in
        years from the first trade date, and null out values outside
        per-column plausibility thresholds."""
        maestro_copy = self.maestro.copy()
        # Convert to numeric the columns that are known to cause trouble
        for column in self.numeric_columns:
            if column in maestro_copy.columns:
                maestro_copy[column] = pd.to_numeric(maestro_copy[column], errors='coerce')
                # print(f"Columna {column} convertida a {maestro_copy[column].dtype}")
        # Standardization of the different NaN flavours
        # maestro_copy = maestro_copy.replace([None, np.nan, np.inf, -np.inf], pd.NA)
        # Asset age in years:
        if self.precios_cierre is not None and not self.precios_cierre.index.empty:
            _most_recent_date = self.precios_cierre.index.max().date()
        else:
            _most_recent_date = pd.Timestamp.today().date()
        # maestro_copy['firstTradeDateMilliseconds'] = pd.to_datetime(maestro_copy['firstTradeDateMilliseconds']).dt.date
        maestro_copy['firstTradeDateMilliseconds'] = pd.to_datetime(maestro_copy['firstTradeDateMilliseconds'], unit='ms', errors='coerce').dt.date
        maestro_copy['asset_age'] = maestro_copy['firstTradeDateMilliseconds'].apply(
            lambda x: ((_most_recent_date - x).days / 365) if pd.notnull(x) and hasattr(x, 'day') else 0
        ).astype(int)
        # Per-column (lower, upper) bounds; values outside are treated as data errors
        outlier_thresholds = {
            'beta': (-100, 100),
            'dividendYield': (-1,100),
            'fiveYearAvgDividendYield': (-1,100),
            'trailingAnnualDividendYield': (-1,100),
            'quickRatio': (-1, 500),
            'currentRatio': (-1, 500),
            'ebitda': (-1e12, 1e12),
            'grossProfits': (-1e12, 1e12),
        }
        for column, (lower_bound, upper_bound) in outlier_thresholds.items():
            maestro_copy.loc[(maestro_copy[column] < lower_bound) | (maestro_copy[column] > upper_bound), column] = pd.NA
        self.maestro = maestro_copy.copy()
        return

    def filtra_df_activos(self, df, isin_target, filtros, debug=False):
        '''
        LEGACY
        Returns a filtered dataframe, preserving order, removing unwanted
        characteristics; used by the substitute-asset search application.
        The characteristics/values to filter are those of a target fund
        identified by its ISIN. For example, if clean_share is False in
        *filtros*, the final dataframe will keep no other assets sharing
        the target ISIN's clean_share value.
        Arguments:
            df (pandas.core.frame.DataFrame): Master asset dataframe
            isin_target (str): ISIN of the target fund
            filtros (dict): Keys are characteristics; value True keeps them
            debug (bool, optional): Print debug information. Defaults to False.
        Returns:
            df_filt (pandas.core.frame.DataFrame): Filtered dataframe
        '''
        # fondo_target = df[df['isin'] == isin_target].iloc[0]
        fondo_target = df[df['ticker'] == isin_target].iloc[0]
        if debug: print(f'Tamaño inicial: {df.shape}')

        car_numericas = ['ret_365', 'vol_365', 'marketCap', 'asset_age']

        # for feature in caracteristicas[2:]:
        for feature in list(filtros.keys()):
            value = fondo_target[feature]
            if debug: print(f'{feature} = {value}')

            # Check whether this characteristic must be filtered out
            if feature in filtros and not filtros[feature]:
                if debug: print(f'FILTRO: {feature} != {value}')
                df = df[df[feature] != value]

            # Apply the additional numeric filters
            if feature in car_numericas:
                if feature == 'ret_365':
                    if debug: print(f'FILTRO NUMÉRICO: {feature} > {value}')
                    df = df[df[feature] > value]
                elif feature == 'vol_365':
                    if debug: print(f'FILTRO NUMÉRICO: {feature} < {value}')
                    df = df[df[feature] < value]
                elif feature == 'asset_age':
                    if debug: print(f'FILTRO NUMÉRICO: {feature} > {value}')
                    df = df[df[feature] > value]
                elif feature == 'marketCap':
                    # NOTE(review): the debug message says '>' but the filter keeps values BELOW the target — confirm which is intended
                    if debug: print(f'FILTRO NUMÉRICO: {feature} > {value}')
                    df = df[df[feature] < value]

        df_filt = df
        if debug: print(f'Tamaño final: {df_filt.shape}')
        return df_filt

    def calcula_ind_sust (self, dist, y_cdf, precision_cdf, max_dist):
        """Map a neighbour distance to a substitutability score in [0, 1]
        as 1 - CDF(dist), looked up on the precomputed *y_cdf* grid;
        distances beyond the grid clamp to the last CDF value."""
        try:
            idx = int((precision_cdf / max_dist) * dist)
            idx = min(idx, len(y_cdf) - 1)
            norm_dist = y_cdf[idx]
            ind_sust = max(0.0, 1.0 - norm_dist)
        except IndexError:
            ind_sust = 0
        return ind_sust


    def vecinos_cercanos(self, df, variables_busq, caracteristicas, target_ticker, y_cdf, precision_cdf, max_dist, n_neighbors, filtros):
        """Return the *n_neighbors* nearest assets to *target_ticker* in the
        feature space *variables_busq*, annotated with distance and
        substitutability score, after applying the exclusion filters.
        Returns an error string when the ticker is unknown."""
        if target_ticker not in df['ticker'].values:
            return f"Error: '{target_ticker}' no encontrado en la base de datos"
        target_row = df[df['ticker'] == target_ticker]
        # NOTE(review): `~index.isin(...)` yields a one-element boolean array used as a truth value — presumably "target not in df"; confirm
        if ~target_row.index.isin(df.index):
            df = pd.concat([df, target_row], ignore_index=True)
        # print(f'DF original: {df.shape}')
        X = df[variables_busq]
        model = NearestNeighbors(n_neighbors=n_neighbors)  ##### try with more neighbours and filter afterwards #######
        model.fit(X)
        target_row = df[df['ticker'] == target_ticker][variables_busq]
        # model.kneighbors returns two 2-D arrays: the nearest neighbours and their distances
        distances, indices = model.kneighbors(target_row)
        # combined_columns = list(set(caracteristicas + variables_busq))
        neighbors_df = df.iloc[indices[0]][caracteristicas]
        neighbors_df['distance'] = distances[0]
        ind_sust = np.array([self.calcula_ind_sust(dist, y_cdf, precision_cdf, max_dist) for dist in distances[0]])

        neighbors_df['ind_sust'] = ind_sust
        neighbors_df = neighbors_df.sort_values(by='distance', ascending=True)
        target_row = neighbors_df[neighbors_df['ticker'] == target_ticker]

        # Apply the exclusion filters:
        ### Code pending removal/modification (legacy from the funds application)
        neighbors_df = self.filtra_df_activos (df = neighbors_df, isin_target = target_ticker, filtros = filtros)
        ####################

        # If filtering removed the selected asset, put it back at the first position of the dataframe:
        if ~target_row.index.isin(neighbors_df.index):
            neighbors_df = pd.concat([pd.DataFrame(target_row), neighbors_df], ignore_index=True)
        # print(f'DF filtrado: {neighbors_df.shape}')
        # Use the ticker as index:
        neighbors_df.set_index('ticker', inplace = True)
        return neighbors_df

    def format_large_number(self, n, decimals=2):
        """Format *n* with a T/B/M suffix; smaller values fall back to str."""
        if n >= 1e12:
            return f'{n / 1e12:.{decimals}f} T'
        elif n >= 1e9:
            return f'{n / 1e9:.{decimals}f} B'
        elif n >= 1e6:
            return f'{n / 1e6:.{decimals}f} M'
        else:
            return str(n)

    def trae_embeddings_desde_pkl(self, embeddings_df_file_name='df_with_embeddings.pkl', embeddings_col_name='embeddings'):
        """Left-join precomputed sentence embeddings (pickled dataframe under
        ``pickle_path``) into ``maestro`` by ticker."""
        embeddings_df = pd.read_pickle(os.path.join(self.pickle_path, embeddings_df_file_name))
        self.maestro = self.maestro.merge(
            embeddings_df[['ticker', embeddings_col_name]],
            on='ticker',
            how='left'
        )
        print(f"Agregados embeddings {self.maestro.shape}")
        return

    def procesa_app_dataset(self, periodo=366, n_dias_descartar=1, min_dias=250, umbrales_rend=(-0.3, +0.3), periodos_metricas=[60, 365],
                            cat_cols = ['industryDisp', 'sectorDisp', 'country', 'city', 'exchange', 'financialCurrency', 'quoteType'],
                            embeddings_df_file_name='df_with_embeddings.pkl', embeddings_col_name='embeddings'):
        """Full preprocessing pipeline: filter/homogenize prices, compute the
        return/volatility metrics for each period, merge them into the master
        table, encode categoricals, normalize, attach embeddings and keep
        only the columns the app needs. Skips if ``app_dataset`` exists.

        NOTE(review): the list default arguments are shared across calls —
        safe only while no caller mutates them.
        """
        if self.app_dataset is not None:
            print("app_dataset already exists, skipping processing")
            return

        self.filtra_y_homogeneiza(n_dias=periodo, n_dias_descartar=n_dias_descartar, min_dias=min_dias)

        for periodo_metricas in periodos_metricas:
            self.calcula_rendimientos_y_volatilidad(n_dias=periodo_metricas, umbral_max=umbrales_rend[1], umbral_min=umbrales_rend[0])
        self.cruza_maestro()
        self.var_categorica_a_numerica(cat_cols)

        self.calculos_y_ajustes_dataset_activos()
        self.normaliza_por_cuantiles()
        self.trae_embeddings_desde_pkl(embeddings_df_file_name=embeddings_df_file_name, embeddings_col_name=embeddings_col_name)
        app_dataset = self.maestro.copy()
        app_dataset = app_dataset.fillna({col: 0.5 for col in self.norm_columns})
        # Final column filtering to shrink the dataset:
        self.app_dataset = app_dataset[self.app_dataset_cols].copy()
        print(f"app_dataset preparado: {self.app_dataset.shape}")
        return
src/semantic_search.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import duckdb
2
+ from sentence_transformers import SentenceTransformer
3
+ import pandas as pd
4
+ import re
5
+
6
def duckdb_vss_local(
    model: SentenceTransformer,
    duckdb_connection: duckdb.DuckDBPyConnection,
    query: str,
    k: int = 1000,
    brevity_penalty: float = 0.0,
    reward_for_literal: float = 0.0,
    partial_match_factor: float = 0.5,
    table_name: str = "maestro_vector_table",
    embedding_column: str = "vec",
):
    """Vector-similarity search over a local DuckDB table.

    Encodes ``query`` with ``model``, ranks the rows of ``table_name`` by
    cosine distance between ``embedding_column`` and the query vector, and
    optionally re-ranks the result with a short-summary penalty and/or a
    literal-match reward.

    Parameters
    ----------
    model: sentence-transformers model used to embed the query.
    duckdb_connection: open DuckDB connection that holds ``table_name``.
    query: free-text search string.
    k: maximum number of rows returned.
    brevity_penalty: if > 0, penalize rows with short summaries
        (see ``penalize_short_summaries``).
    reward_for_literal: if > 0, reward rows whose summary literally contains
        the query (see ``reward_literals``).
    partial_match_factor: weight given to partial literal matches.
    table_name, embedding_column: identifiers of the vector table/column.

    Returns a pandas DataFrame sorted by ascending ``distance``.
    """
    # Identifiers cannot be bound as SQL parameters, so they are interpolated
    # into the statement; restrict them to word characters to rule out SQL
    # injection through these arguments.
    for identifier in (table_name, embedding_column):
        if not re.fullmatch(r"\w+", identifier):
            raise ValueError(f"invalid SQL identifier: {identifier!r}")

    query_vector = model.encode(query)
    embedding_dim = model.get_sentence_embedding_dimension()

    sql = f"""
    SELECT
        *,
        array_cosine_distance(
            {embedding_column}::float[{embedding_dim}],
            ?::float[{embedding_dim}]
        ) as distance
    FROM {table_name}
    ORDER BY distance
    LIMIT {int(k)}
    """
    # Bind the query vector as a prepared-statement parameter instead of
    # serializing hundreds of floats into the SQL text.
    result = duckdb_connection.execute(sql, [query_vector.tolist()]).df()

    # Optional re-ranking stages (flip their "debug" parameters to inspect
    # the intermediate columns).
    if brevity_penalty > 0:
        result = penalize_short_summaries(result, factor=brevity_penalty, distance_column='distance',
                                          summary_column='longBusinessSummary', debug=False)
    if reward_for_literal > 0:
        result = reward_literals(result, query, factor=reward_for_literal,
                                 partial_match_factor=partial_match_factor, distance_column='distance',
                                 summary_column='longBusinessSummary', debug=False)

    return result
43
+
44
def penalize_short_summaries(
    df: pd.DataFrame,
    factor: float = 0.1,
    distance_column: str = 'distance',
    summary_column: str = 'longBusinessSummary',
    debug: bool = True
) -> pd.DataFrame:
    """Penalize rows whose business summary is shorter than average.

    Each row's distance is increased by ``factor`` times the fraction by
    which its summary is shorter than the mean summary length, capped at the
    maximum distance already present in ``df``.

    Parameters
    ----------
    df: result frame containing ``distance_column`` and ``summary_column``.
    factor: penalty weight per unit of relative shortness.
    distance_column: name of the distance column to adjust.
    summary_column: name of the text column whose length is measured.
    debug: keep the intermediate columns (``orig_distance``,
        ``summary_length``, ``percent_shorter``) when True.

    Returns the frame re-sorted by ascending ``distance_column``.
    """
    result_df = df.copy()
    # NaN summaries count as length 0 (maximum penalty).
    result_df['summary_length'] = result_df[summary_column].apply(
        lambda x: len(str(x)) if pd.notna(x) else 0
    )
    # Guard against a zero mean (all summaries empty) to avoid div-by-zero.
    avg_length = max(1.0, result_df['summary_length'].mean())
    # BUG FIX: the original read the hard-coded 'distance' column here,
    # which broke callers passing a custom distance_column.
    max_dist = result_df[distance_column].max()

    # Fraction by which each summary is shorter than average (0 if longer).
    result_df['percent_shorter'] = (
        (avg_length - result_df['summary_length']) / avg_length
    ).clip(lower=0)
    result_df['orig_distance'] = result_df[distance_column]
    # Penalize proportionally to the relative shortness, never pushing a row
    # beyond the worst distance already observed.
    result_df[distance_column] = (
        result_df[distance_column] + result_df['percent_shorter'] * factor
    ).clip(upper=max_dist)

    if not debug:
        result_df = result_df.drop(['orig_distance', 'summary_length', 'percent_shorter'], axis=1)

    result_df = result_df.sort_values(by=distance_column, ascending=True)
    return result_df
74
+
75
def reward_literals(
    df: pd.DataFrame,
    query: str,
    factor: float = 0.1,
    partial_match_factor: float = 0.5,
    distance_column: str = 'distance',
    summary_column: str = 'longBusinessSummary',
    debug: bool = True
) -> pd.DataFrame:
    """Lower the distance of rows whose summary literally contains the query.

    Exact (whole-word/whole-phrase) occurrences count fully; partial
    occurrences are weighted by ``partial_match_factor``. Each occurrence
    point subtracts ``factor`` from the row's distance, floored at 0.

    Parameters
    ----------
    df: result frame containing ``distance_column`` and ``summary_column``.
    query: literal text to look for (matched case-insensitively).
    factor: distance reduction per occurrence point.
    partial_match_factor: weight of a partial (non-exact) occurrence.
    distance_column: name of the distance column to adjust.
    summary_column: name of the text column searched.
    debug: keep the intermediate columns (``orig_distance``,
        ``term_occurrences``) when True.

    Returns the frame re-sorted by ascending ``distance_column``.
    """
    result_df = df.copy()
    needle = query.lower().strip()

    # Hoist the compiled patterns out of the per-row scoring function.
    exact_re = re.compile(r'\b' + re.escape(needle) + r'\b')
    if ' ' in needle:
        # Multi-word query: partial matches are plain substring occurrences.
        partial_re = re.compile(re.escape(needle))
    else:
        # Single word: also count words containing the query as a substring.
        partial_re = re.compile(r'\b\w*' + re.escape(needle) + r'\w*\b')

    def occurrence_score(summary):
        if pd.isna(summary):
            return 0
        text = str(summary).lower()
        exact_hits = len(exact_re.findall(text))
        # Subtract exact hits so they are not counted twice.
        partial_hits = len(partial_re.findall(text)) - exact_hits
        # Partial matches are down-weighted.
        return exact_hits + partial_hits * partial_match_factor

    result_df['term_occurrences'] = result_df[summary_column].apply(occurrence_score)
    result_df['orig_distance'] = result_df[distance_column]
    # Reward occurrences, never letting a distance drop below zero.
    result_df[distance_column] = (
        result_df[distance_column] - result_df['term_occurrences'] * factor
    ).clip(lower=0)

    if not debug:
        result_df = result_df.drop(['orig_distance', 'term_occurrences'], axis=1)
    result_df = result_df.sort_values(by=distance_column, ascending=True)

    return result_df
124
+