n0v33n committed on
Commit
f5262f7
·
1 Parent(s): c63ec90

Added new Schema

Browse files
Files changed (1) hide show
  1. app.py +99 -67
app.py CHANGED
@@ -4,32 +4,64 @@ import polars as pl
4
  import gradio as gr
5
  import google.generativeai as genai
6
 
7
- # Download Meta-Kaggle dataset in parquet format
8
- PARQUET_PATH = kagglehub.dataset_download("bwandowando/meta-kaggle-ported-to-parquet-format")
9
- print("✅ Downloaded Meta-Kaggle parquet data.")
10
- print("📂 PARQUET_PATH =", PARQUET_PATH)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
 
12
- # === Load schema from parquet files ===
13
- parquet_files = sorted([f for f in os.listdir(PARQUET_PATH) if f.endswith(".parquet")])
14
- schema_dict = {}
15
  file_map = {}
16
 
17
- for file in parquet_files:
18
- name = file.replace(".parquet", "")
19
- path = os.path.join(PARQUET_PATH, file)
20
- # Read only schema using polars
21
- schema_dict[name] = list(pl.read_parquet(path, n_rows=0).columns)
22
- file_map[name] = path
23
-
24
  # === Build schema as prompt context ===
25
  schema_description = "\n\n".join(
26
  [f"### {name}\n{', '.join(cols)}" for name, cols in schema_dict.items()]
27
  )
28
 
 
29
  context_prompt = f"""
30
  You are a helpful assistant that helps users understand which parts of the Meta-Kaggle dataset they need for their analysis.
31
 
32
- Below is the dataset schema (parquet files):
33
 
34
  {schema_description}
35
 
@@ -60,61 +92,61 @@ def guide_user(prompt):
60
  result = model.generate_content(full_prompt)
61
  return result.text.strip()
62
 
63
- # === Custom CSS for Gradio UI ===
64
- css = """
65
- body {
66
- background-color: #e6ecf3;
67
- font-family: 'Segoe UI', sans-serif;
68
- }
69
- .gradio-container {
70
- width: 100% !important;
71
- max-width: 100% !important;
72
- padding: 40px 10%;
73
- background-color: #ffffff;
74
- }
75
- .gr-block {
76
- width: 100%;
77
- }
78
- h1 {
79
- color: #1967d2;
80
- text-align: center;
81
- font-size: 2rem;
82
- }
83
- h2, p.subtitle, .subheading, .description {
84
- color: #333;
85
- text-align: center;
86
- margin-bottom: 20px;
87
- }
88
- input[type="text"], textarea, .output-text {
89
- border: 1px solid #ccc;
90
- border-radius: 6px;
91
- padding: 12px;
92
- width: 100%;
93
- box-sizing: border-box;
94
- }
95
- button {
96
- background-color: #1967d2;
97
- color: white;
98
- border-radius: 6px;
99
- padding: 12px 24px;
100
- font-size: 1rem;
101
- border: none;
102
- cursor: pointer;
103
- }
104
- button:hover {
105
- background-color: #1450a3;
106
- }
107
- .output-text {
108
- background-color: #f7f9fc;
109
- border: 1px solid #ccd6e0;
110
- border-radius: 6px;
111
- padding: 15px;
112
- }
113
- """
114
 
115
 
116
  # === Launch Gradio UI ===
117
- with gr.Blocks(css=css) as demo:
118
  with gr.Column():
119
  gr.Markdown("# Meta-Kaggle Dataset Navigator")
120
  gr.Markdown("Ask which CSV files and columns you need for your analysis!")
 
4
  import gradio as gr
5
  import google.generativeai as genai
6
 
7
+ # === Predefined schema from user ===
8
+ schema_dict = {
9
+ "KernelTags": ['Id', 'KernelId', 'TagId'],
10
+ "ModelVariations": ['Id', 'ModelId', 'CurrentVariationSlug', 'ModelFramework', 'CurrentModelVariationVersionId', 'LicenseName', 'BaseModelVariationId', 'CurrentDatasourceVersionId'],
11
+ "KernelVersionCompetitionSources": ['Id', 'KernelVersionId', 'SourceCompetitionId'],
12
+ "Datasets": ['Id', 'CreatorUserId', 'OwnerUserId', 'OwnerOrganizationId', 'CurrentDatasetVersionId', 'CurrentDatasourceVersionId', 'ForumId', 'Type', 'CreationDate', 'LastActivityDate', 'TotalViews', 'TotalDownloads', 'TotalVotes', 'TotalKernels', 'Medal', 'MedalAwardDate'],
13
+ "KernelVersionKernelSources": ['Id', 'KernelVersionId', 'SourceKernelVersionId'],
14
+ "KernelVotes": ['Id', 'UserId', 'KernelVersionId', 'VoteDate'],
15
+ "Submissions": ['Id', 'SubmittedUserId', 'TeamId', 'SourceKernelVersionId', 'SubmissionDate', 'ScoreDate', 'IsAfterDeadline', 'IsSelected', 'PublicScoreLeaderboardDisplay', 'PublicScoreFullPrecision', 'PrivateScoreLeaderboardDisplay', 'PrivateScoreFullPrecision'],
16
+ "KernelLanguages": ['Id', 'Name', 'DisplayName', 'IsNotebook'],
17
+ "Users": ['Id', 'UserName', 'DisplayName', 'RegisterDate', 'PerformanceTier', 'Country', 'LocationSharingOptOut'],
18
+ "ForumMessageVotes": ['Id', 'ForumMessageId', 'FromUserId', 'ToUserId', 'VoteDate'],
19
+ "Competitions": ['Id', 'Slug', 'Title', 'Subtitle', 'HostSegmentTitle', 'ForumId', 'OrganizationId', 'EnabledDate', 'DeadlineDate', 'ProhibitNewEntrantsDeadlineDate', 'TeamMergerDeadlineDate', 'TeamModelDeadlineDate', 'ModelSubmissionDeadlineDate', 'FinalLeaderboardHasBeenVerified', 'HasKernels', 'OnlyAllowKernelSubmissions', 'HasLeaderboard', 'LeaderboardPercentage', 'ScoreTruncationNumDecimals', 'EvaluationAlgorithmAbbreviation', 'EvaluationAlgorithmName', 'EvaluationAlgorithmDescription', 'EvaluationAlgorithmIsMax', 'MaxDailySubmissions', 'NumScoredSubmissions', 'MaxTeamSize', 'BanTeamMergers', 'EnableTeamModels', 'RewardType', 'RewardQuantity', 'NumPrizes', 'UserRankMultiplier', 'CanQualifyTiers', 'TotalTeams', 'TotalCompetitors', 'TotalSubmissions', 'LicenseName', 'Overview', 'Rules', 'DatasetDescription', 'TotalCompressedBytes', 'TotalUncompressedBytes', 'ValidationSetName', 'ValidationSetValue', 'EnableSubmissionModelHashes', 'EnableSubmissionModelAttachments', 'HostName', 'CompetitionTypeId'],
20
+ "DatasetTaskSubmissions": ['Id', 'DatasetTaskId', 'SubmittedUserId', 'CreationDate', 'KernelId', 'DatasetId', 'AcceptedDate'],
21
+ "UserAchievements": ['Id', 'UserId', 'AchievementType', 'Tier', 'TierAchievementDate', 'Points', 'CurrentRanking', 'HighestRanking', 'TotalGold', 'TotalSilver', 'TotalBronze'],
22
+ "UserOrganizations": ['Id', 'UserId', 'OrganizationId', 'JoinDate'],
23
+ "Teams": ['Id', 'CompetitionId', 'TeamLeaderId', 'TeamName', 'ScoreFirstSubmittedDate', 'LastSubmissionDate', 'PublicLeaderboardSubmissionId', 'PrivateLeaderboardSubmissionId', 'IsBenchmark', 'Medal', 'MedalAwardDate', 'PublicLeaderboardRank', 'PrivateLeaderboardRank', 'WriteUpForumTopicId'],
24
+ "UserFollowers": ['Id', 'UserId', 'FollowingUserId', 'CreationDate'],
25
+ "CompetitionTags": ['Id', 'CompetitionId', 'TagId'],
26
+ "Kernels": ['Id', 'AuthorUserId', 'CurrentKernelVersionId', 'ForkParentKernelVersionId', 'ForumTopicId', 'FirstKernelVersionId', 'CreationDate', 'EvaluationDate', 'MadePublicDate', 'IsProjectLanguageTemplate', 'CurrentUrlSlug', 'Medal', 'MedalAwardDate', 'TotalViews', 'TotalComments', 'TotalVotes'],
27
+ "Organizations": ['Id', 'Name', 'Slug', 'CreationDate', 'Description'],
28
+ "Datasources": ['Id', 'CreatorUserId', 'CreationDate', 'Type', 'CurrentDatasourceVersionId'],
29
+ "ModelVersions": ['Id', 'ModelId', 'Title', 'Subtitle', 'ModelCard', 'CreationDate', 'OriginalPublishDate', 'CreatorUserId', 'ProvenanceSources'],
30
+ "ForumTopics": ['Id', 'ForumId', 'KernelId', 'LastForumMessageId', 'FirstForumMessageId', 'CreationDate', 'LastCommentDate', 'Title', 'IsSticky', 'TotalViews', 'Score', 'TotalMessages', 'TotalReplies'],
31
+ "DatasetVersions": ['Id', 'DatasetId', 'DatasourceVersionId', 'CreatorUserId', 'LicenseName', 'CreationDate', 'VersionNumber', 'Title', 'Slug', 'Subtitle', 'Description', 'VersionNotes', 'TotalCompressedBytes', 'TotalUncompressedBytes'],
32
+ "ModelVotes": ['Id', 'UserId', 'ModelId', 'VoteDate'],
33
+ "DatasetVotes": ['Id', 'UserId', 'DatasetVersionId', 'VoteDate'],
34
+ "TeamMemberships": ['Id', 'TeamId', 'UserId', 'RequestDate'],
35
+ "Forums": ['Id', 'ParentForumId', 'Title'],
36
+ "KernelVersions": ['Id', 'ScriptId', 'ParentScriptVersionId', 'ScriptLanguageId', 'AuthorUserId', 'CreationDate', 'VersionNumber', 'Title', 'EvaluationDate', 'IsChange', 'TotalLines', 'LinesInsertedFromPrevious', 'LinesChangedFromPrevious', 'LinesUnchangedFromPrevious', 'LinesInsertedFromFork', 'LinesDeletedFromFork', 'LinesChangedFromFork', 'LinesUnchangedFromFork', 'TotalVotes', 'IsInternetEnabled', 'RunningTimeInMilliseconds', 'AcceleratorTypeId', 'DockerImage'],
37
+ "ModelVariationVersions": ['Id', 'ModelVariationId', 'ModelVersionId', 'DatasourceVersionId', 'CreationDate', 'VariationOverview', 'VariationUsage', 'FineTunable', 'SourceUrl', 'SourceOrganizationName'],
38
+ "ForumMessages": ['Id', 'ForumTopicId', 'PostUserId', 'PostDate', 'ReplyToForumMessageId', 'Message', 'RawMarkdown', 'Medal', 'MedalAwardDate'],
39
+ "KernelVersionDatasetSources": ['Id', 'KernelVersionId', 'SourceDatasetVersionId'],
40
+ "Episodes": ['Id', 'Type', 'CompetitionId', 'CreateTime', 'EndTime'],
41
+ "EpisodeAgents": ['Id', 'EpisodeId', 'Index', 'Reward', 'State', 'SubmissionId', 'InitialConfidence', 'InitialScore', 'UpdatedConfidence', 'UpdatedScore'],
42
+ "KernelAcceleratorTypes": ['Id', 'Label'],
43
+ "KernelVersionModelSources": ['Id', 'KernelVersionId', 'SourceModelVariationVersionId', 'SourceModelVariationId'],
44
+ "ForumMessageReactions": ['Id', 'ForumMessageId', 'FromUserId', 'ReactionType', 'ReactionDate'],
45
+ "Tags": ['Id', 'ParentTagId', 'Name', 'Slug', 'FullPath', 'Description', 'DatasetCount', 'CompetitionCount', 'KernelCount'],
46
+ "DatasetTasks": ['Id', 'DatasetId', 'OwnerUserId', 'CreationDate', 'Description', 'ForumId', 'Title', 'Subtitle', 'Deadline', 'TotalVotes'],
47
+ "Models": ['Id', 'OwnerUserId', 'OwnerOrganizationId', 'CurrentModelVersionId', 'ForumId', 'CreationDate', 'TotalViews', 'TotalDownloads', 'TotalVotes', 'TotalKernels', 'CurrentSlug'],
48
+ "DatasetTags": ['Id', 'DatasetId', 'TagId'],
49
+ "ModelTags": ['Id', 'ModelId', 'TagId'],
50
+ }
51
 
52
+ # Skip file_map since no file reading is needed
 
 
53
  file_map = {}
54
 
 
 
 
 
 
 
 
55
  # === Build schema as prompt context ===
56
  schema_description = "\n\n".join(
57
  [f"### {name}\n{', '.join(cols)}" for name, cols in schema_dict.items()]
58
  )
59
 
60
+
61
  context_prompt = f"""
62
  You are a helpful assistant that helps users understand which parts of the Meta-Kaggle dataset they need for their analysis.
63
 
64
+ Below is the dataset schema (CSV files):
65
 
66
  {schema_description}
67
 
 
92
  result = model.generate_content(full_prompt)
93
  return result.text.strip()
94
 
95
+ # # === Custom CSS for Gradio UI ===
96
+ # css = """
97
+ # body {
98
+ # background-color: #e6ecf3;
99
+ # font-family: 'Segoe UI', sans-serif;
100
+ # }
101
+ # .gradio-container {
102
+ # width: 100% !important;
103
+ # max-width: 100% !important;
104
+ # padding: 40px 10%;
105
+ # background-color: #ffffff;
106
+ # }
107
+ # .gr-block {
108
+ # width: 100%;
109
+ # }
110
+ # h1 {
111
+ # color: #1967d2;
112
+ # text-align: center;
113
+ # font-size: 2rem;
114
+ # }
115
+ # h2, p.subtitle, .subheading, .description {
116
+ # color: #333;
117
+ # text-align: center;
118
+ # margin-bottom: 20px;
119
+ # }
120
+ # input[type="text"], textarea, .output-text {
121
+ # border: 1px solid #ccc;
122
+ # border-radius: 6px;
123
+ # padding: 12px;
124
+ # width: 100%;
125
+ # box-sizing: border-box;
126
+ # }
127
+ # button {
128
+ # background-color: #1967d2;
129
+ # color: white;
130
+ # border-radius: 6px;
131
+ # padding: 12px 24px;
132
+ # font-size: 1rem;
133
+ # border: none;
134
+ # cursor: pointer;
135
+ # }
136
+ # button:hover {
137
+ # background-color: #1450a3;
138
+ # }
139
+ # .output-text {
140
+ # background-color: #f7f9fc;
141
+ # border: 1px solid #ccd6e0;
142
+ # border-radius: 6px;
143
+ # padding: 15px;
144
+ # }
145
+ # """
146
 
147
 
148
  # === Launch Gradio UI ===
149
+ with gr.Blocks() as demo:
150
  with gr.Column():
151
  gr.Markdown("# Meta-Kaggle Dataset Navigator")
152
  gr.Markdown("Ask which CSV files and columns you need for your analysis!")