n0v33n
committed on
Commit
·
f5262f7
1
Parent(s):
c63ec90
Added new Schema
Browse files
app.py
CHANGED
@@ -4,32 +4,64 @@ import polars as pl
|
|
4 |
import gradio as gr
|
5 |
import google.generativeai as genai
|
6 |
|
7 |
-
#
|
8 |
-
|
9 |
-
|
10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
|
12 |
-
#
|
13 |
-
parquet_files = sorted([f for f in os.listdir(PARQUET_PATH) if f.endswith(".parquet")])
|
14 |
-
schema_dict = {}
|
15 |
file_map = {}
|
16 |
|
17 |
-
for file in parquet_files:
|
18 |
-
name = file.replace(".parquet", "")
|
19 |
-
path = os.path.join(PARQUET_PATH, file)
|
20 |
-
# Read only schema using polars
|
21 |
-
schema_dict[name] = list(pl.read_parquet(path, n_rows=0).columns)
|
22 |
-
file_map[name] = path
|
23 |
-
|
24 |
# === Build schema as prompt context ===
|
25 |
schema_description = "\n\n".join(
|
26 |
[f"### {name}\n{', '.join(cols)}" for name, cols in schema_dict.items()]
|
27 |
)
|
28 |
|
|
|
29 |
context_prompt = f"""
|
30 |
You are a helpful assistant that helps users understand which parts of the Meta-Kaggle dataset they need for their analysis.
|
31 |
|
32 |
-
Below is the dataset schema (
|
33 |
|
34 |
{schema_description}
|
35 |
|
@@ -60,61 +92,61 @@ def guide_user(prompt):
|
|
60 |
result = model.generate_content(full_prompt)
|
61 |
return result.text.strip()
|
62 |
|
63 |
-
# === Custom CSS for Gradio UI ===
|
64 |
-
css = """
|
65 |
-
body {
|
66 |
-
|
67 |
-
|
68 |
-
}
|
69 |
-
.gradio-container {
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
}
|
75 |
-
.gr-block {
|
76 |
-
|
77 |
-
}
|
78 |
-
h1 {
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
}
|
83 |
-
h2, p.subtitle, .subheading, .description {
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
}
|
88 |
-
input[type="text"], textarea, .output-text {
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
}
|
95 |
-
button {
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
}
|
104 |
-
button:hover {
|
105 |
-
|
106 |
-
}
|
107 |
-
.output-text {
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
}
|
113 |
-
"""
|
114 |
|
115 |
|
116 |
# === Launch Gradio UI ===
|
117 |
-
with gr.Blocks(
|
118 |
with gr.Column():
|
119 |
gr.Markdown("# Meta-Kaggle Dataset Navigator")
|
120 |
gr.Markdown("Ask which CSV files and columns you need for your analysis!")
|
|
|
4 |
import gradio as gr
|
5 |
import google.generativeai as genai
|
6 |
|
7 |
+
# === Predefined schema from user ===
|
8 |
+
schema_dict = {
|
9 |
+
"KernelTags": ['Id', 'KernelId', 'TagId'],
|
10 |
+
"ModelVariations": ['Id', 'ModelId', 'CurrentVariationSlug', 'ModelFramework', 'CurrentModelVariationVersionId', 'LicenseName', 'BaseModelVariationId', 'CurrentDatasourceVersionId'],
|
11 |
+
"KernelVersionCompetitionSources": ['Id', 'KernelVersionId', 'SourceCompetitionId'],
|
12 |
+
"Datasets": ['Id', 'CreatorUserId', 'OwnerUserId', 'OwnerOrganizationId', 'CurrentDatasetVersionId', 'CurrentDatasourceVersionId', 'ForumId', 'Type', 'CreationDate', 'LastActivityDate', 'TotalViews', 'TotalDownloads', 'TotalVotes', 'TotalKernels', 'Medal', 'MedalAwardDate'],
|
13 |
+
"KernelVersionKernelSources": ['Id', 'KernelVersionId', 'SourceKernelVersionId'],
|
14 |
+
"KernelVotes": ['Id', 'UserId', 'KernelVersionId', 'VoteDate'],
|
15 |
+
"Submissions": ['Id', 'SubmittedUserId', 'TeamId', 'SourceKernelVersionId', 'SubmissionDate', 'ScoreDate', 'IsAfterDeadline', 'IsSelected', 'PublicScoreLeaderboardDisplay', 'PublicScoreFullPrecision', 'PrivateScoreLeaderboardDisplay', 'PrivateScoreFullPrecision'],
|
16 |
+
"KernelLanguages": ['Id', 'Name', 'DisplayName', 'IsNotebook'],
|
17 |
+
"Users": ['Id', 'UserName', 'DisplayName', 'RegisterDate', 'PerformanceTier', 'Country', 'LocationSharingOptOut'],
|
18 |
+
"ForumMessageVotes": ['Id', 'ForumMessageId', 'FromUserId', 'ToUserId', 'VoteDate'],
|
19 |
+
"Competitions": ['Id', 'Slug', 'Title', 'Subtitle', 'HostSegmentTitle', 'ForumId', 'OrganizationId', 'EnabledDate', 'DeadlineDate', 'ProhibitNewEntrantsDeadlineDate', 'TeamMergerDeadlineDate', 'TeamModelDeadlineDate', 'ModelSubmissionDeadlineDate', 'FinalLeaderboardHasBeenVerified', 'HasKernels', 'OnlyAllowKernelSubmissions', 'HasLeaderboard', 'LeaderboardPercentage', 'ScoreTruncationNumDecimals', 'EvaluationAlgorithmAbbreviation', 'EvaluationAlgorithmName', 'EvaluationAlgorithmDescription', 'EvaluationAlgorithmIsMax', 'MaxDailySubmissions', 'NumScoredSubmissions', 'MaxTeamSize', 'BanTeamMergers', 'EnableTeamModels', 'RewardType', 'RewardQuantity', 'NumPrizes', 'UserRankMultiplier', 'CanQualifyTiers', 'TotalTeams', 'TotalCompetitors', 'TotalSubmissions', 'LicenseName', 'Overview', 'Rules', 'DatasetDescription', 'TotalCompressedBytes', 'TotalUncompressedBytes', 'ValidationSetName', 'ValidationSetValue', 'EnableSubmissionModelHashes', 'EnableSubmissionModelAttachments', 'HostName', 'CompetitionTypeId'],
|
20 |
+
"DatasetTaskSubmissions": ['Id', 'DatasetTaskId', 'SubmittedUserId', 'CreationDate', 'KernelId', 'DatasetId', 'AcceptedDate'],
|
21 |
+
"UserAchievements": ['Id', 'UserId', 'AchievementType', 'Tier', 'TierAchievementDate', 'Points', 'CurrentRanking', 'HighestRanking', 'TotalGold', 'TotalSilver', 'TotalBronze'],
|
22 |
+
"UserOrganizations": ['Id', 'UserId', 'OrganizationId', 'JoinDate'],
|
23 |
+
"Teams": ['Id', 'CompetitionId', 'TeamLeaderId', 'TeamName', 'ScoreFirstSubmittedDate', 'LastSubmissionDate', 'PublicLeaderboardSubmissionId', 'PrivateLeaderboardSubmissionId', 'IsBenchmark', 'Medal', 'MedalAwardDate', 'PublicLeaderboardRank', 'PrivateLeaderboardRank', 'WriteUpForumTopicId'],
|
24 |
+
"UserFollowers": ['Id', 'UserId', 'FollowingUserId', 'CreationDate'],
|
25 |
+
"CompetitionTags": ['Id', 'CompetitionId', 'TagId'],
|
26 |
+
"Kernels": ['Id', 'AuthorUserId', 'CurrentKernelVersionId', 'ForkParentKernelVersionId', 'ForumTopicId', 'FirstKernelVersionId', 'CreationDate', 'EvaluationDate', 'MadePublicDate', 'IsProjectLanguageTemplate', 'CurrentUrlSlug', 'Medal', 'MedalAwardDate', 'TotalViews', 'TotalComments', 'TotalVotes'],
|
27 |
+
"Organizations": ['Id', 'Name', 'Slug', 'CreationDate', 'Description'],
|
28 |
+
"Datasources": ['Id', 'CreatorUserId', 'CreationDate', 'Type', 'CurrentDatasourceVersionId'],
|
29 |
+
"ModelVersions": ['Id', 'ModelId', 'Title', 'Subtitle', 'ModelCard', 'CreationDate', 'OriginalPublishDate', 'CreatorUserId', 'ProvenanceSources'],
|
30 |
+
"ForumTopics": ['Id', 'ForumId', 'KernelId', 'LastForumMessageId', 'FirstForumMessageId', 'CreationDate', 'LastCommentDate', 'Title', 'IsSticky', 'TotalViews', 'Score', 'TotalMessages', 'TotalReplies'],
|
31 |
+
"DatasetVersions": ['Id', 'DatasetId', 'DatasourceVersionId', 'CreatorUserId', 'LicenseName', 'CreationDate', 'VersionNumber', 'Title', 'Slug', 'Subtitle', 'Description', 'VersionNotes', 'TotalCompressedBytes', 'TotalUncompressedBytes'],
|
32 |
+
"ModelVotes": ['Id', 'UserId', 'ModelId', 'VoteDate'],
|
33 |
+
"DatasetVotes": ['Id', 'UserId', 'DatasetVersionId', 'VoteDate'],
|
34 |
+
"TeamMemberships": ['Id', 'TeamId', 'UserId', 'RequestDate'],
|
35 |
+
"Forums": ['Id', 'ParentForumId', 'Title'],
|
36 |
+
"KernelVersions": ['Id', 'ScriptId', 'ParentScriptVersionId', 'ScriptLanguageId', 'AuthorUserId', 'CreationDate', 'VersionNumber', 'Title', 'EvaluationDate', 'IsChange', 'TotalLines', 'LinesInsertedFromPrevious', 'LinesChangedFromPrevious', 'LinesUnchangedFromPrevious', 'LinesInsertedFromFork', 'LinesDeletedFromFork', 'LinesChangedFromFork', 'LinesUnchangedFromFork', 'TotalVotes', 'IsInternetEnabled', 'RunningTimeInMilliseconds', 'AcceleratorTypeId', 'DockerImage'],
|
37 |
+
"ModelVariationVersions": ['Id', 'ModelVariationId', 'ModelVersionId', 'DatasourceVersionId', 'CreationDate', 'VariationOverview', 'VariationUsage', 'FineTunable', 'SourceUrl', 'SourceOrganizationName'],
|
38 |
+
"ForumMessages": ['Id', 'ForumTopicId', 'PostUserId', 'PostDate', 'ReplyToForumMessageId', 'Message', 'RawMarkdown', 'Medal', 'MedalAwardDate'],
|
39 |
+
"KernelVersionDatasetSources": ['Id', 'KernelVersionId', 'SourceDatasetVersionId'],
|
40 |
+
"Episodes": ['Id', 'Type', 'CompetitionId', 'CreateTime', 'EndTime'],
|
41 |
+
"EpisodeAgents": ['Id', 'EpisodeId', 'Index', 'Reward', 'State', 'SubmissionId', 'InitialConfidence', 'InitialScore', 'UpdatedConfidence', 'UpdatedScore'],
|
42 |
+
"KernelAcceleratorTypes": ['Id', 'Label'],
|
43 |
+
"KernelVersionModelSources": ['Id', 'KernelVersionId', 'SourceModelVariationVersionId', 'SourceModelVariationId'],
|
44 |
+
"ForumMessageReactions": ['Id', 'ForumMessageId', 'FromUserId', 'ReactionType', 'ReactionDate'],
|
45 |
+
"Tags": ['Id', 'ParentTagId', 'Name', 'Slug', 'FullPath', 'Description', 'DatasetCount', 'CompetitionCount', 'KernelCount'],
|
46 |
+
"DatasetTasks": ['Id', 'DatasetId', 'OwnerUserId', 'CreationDate', 'Description', 'ForumId', 'Title', 'Subtitle', 'Deadline', 'TotalVotes'],
|
47 |
+
"Models": ['Id', 'OwnerUserId', 'OwnerOrganizationId', 'CurrentModelVersionId', 'ForumId', 'CreationDate', 'TotalViews', 'TotalDownloads', 'TotalVotes', 'TotalKernels', 'CurrentSlug'],
|
48 |
+
"DatasetTags": ['Id', 'DatasetId', 'TagId'],
|
49 |
+
"ModelTags": ['Id', 'ModelId', 'TagId'],
|
50 |
+
}
|
51 |
|
52 |
+
# Skip file_map since no file reading is needed
|
|
|
|
|
53 |
file_map = {}
|
54 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
55 |
# === Build schema as prompt context ===
|
56 |
schema_description = "\n\n".join(
|
57 |
[f"### {name}\n{', '.join(cols)}" for name, cols in schema_dict.items()]
|
58 |
)
|
59 |
|
60 |
+
|
61 |
context_prompt = f"""
|
62 |
You are a helpful assistant that helps users understand which parts of the Meta-Kaggle dataset they need for their analysis.
|
63 |
|
64 |
+
Below is the dataset schema (CSV files):
|
65 |
|
66 |
{schema_description}
|
67 |
|
|
|
92 |
result = model.generate_content(full_prompt)
|
93 |
return result.text.strip()
|
94 |
|
95 |
+
# # === Custom CSS for Gradio UI ===
|
96 |
+
# css = """
|
97 |
+
# body {
|
98 |
+
# background-color: #e6ecf3;
|
99 |
+
# font-family: 'Segoe UI', sans-serif;
|
100 |
+
# }
|
101 |
+
# .gradio-container {
|
102 |
+
# width: 100% !important;
|
103 |
+
# max-width: 100% !important;
|
104 |
+
# padding: 40px 10%;
|
105 |
+
# background-color: #ffffff;
|
106 |
+
# }
|
107 |
+
# .gr-block {
|
108 |
+
# width: 100%;
|
109 |
+
# }
|
110 |
+
# h1 {
|
111 |
+
# color: #1967d2;
|
112 |
+
# text-align: center;
|
113 |
+
# font-size: 2rem;
|
114 |
+
# }
|
115 |
+
# h2, p.subtitle, .subheading, .description {
|
116 |
+
# color: #333;
|
117 |
+
# text-align: center;
|
118 |
+
# margin-bottom: 20px;
|
119 |
+
# }
|
120 |
+
# input[type="text"], textarea, .output-text {
|
121 |
+
# border: 1px solid #ccc;
|
122 |
+
# border-radius: 6px;
|
123 |
+
# padding: 12px;
|
124 |
+
# width: 100%;
|
125 |
+
# box-sizing: border-box;
|
126 |
+
# }
|
127 |
+
# button {
|
128 |
+
# background-color: #1967d2;
|
129 |
+
# color: white;
|
130 |
+
# border-radius: 6px;
|
131 |
+
# padding: 12px 24px;
|
132 |
+
# font-size: 1rem;
|
133 |
+
# border: none;
|
134 |
+
# cursor: pointer;
|
135 |
+
# }
|
136 |
+
# button:hover {
|
137 |
+
# background-color: #1450a3;
|
138 |
+
# }
|
139 |
+
# .output-text {
|
140 |
+
# background-color: #f7f9fc;
|
141 |
+
# border: 1px solid #ccd6e0;
|
142 |
+
# border-radius: 6px;
|
143 |
+
# padding: 15px;
|
144 |
+
# }
|
145 |
+
# """
|
146 |
|
147 |
|
148 |
# === Launch Gradio UI ===
|
149 |
+
with gr.Blocks() as demo:
|
150 |
with gr.Column():
|
151 |
gr.Markdown("# Meta-Kaggle Dataset Navigator")
|
152 |
gr.Markdown("Ask which CSV files and columns you need for your analysis!")
|