Spaces:

Asura05
/

Meta-Kaggle-Dataset-Navigator

Sleeping

App Files Files Community

n0v33n committed on Jun 27

Commit

f5262f7

1 Parent(s): c63ec90

Added new Schema

Browse files

Files changed (1) hide show

app.py +99 -67

app.py CHANGED Viewed

@@ -4,32 +4,64 @@ import polars as pl
 import gradio as gr
 import google.generativeai as genai
-# Download Meta-Kaggle dataset in parquet format
-PARQUET_PATH = kagglehub.dataset_download("bwandowando/meta-kaggle-ported-to-parquet-format")
-print("✅ Downloaded Meta-Kaggle parquet data.")
-print("📂 PARQUET_PATH =", PARQUET_PATH)
-# === Load schema from parquet files ===
-parquet_files = sorted([f for f in os.listdir(PARQUET_PATH) if f.endswith(".parquet")])
-schema_dict = {}
 file_map = {}
-for file in parquet_files:
-    name = file.replace(".parquet", "")
-    path = os.path.join(PARQUET_PATH, file)
-    # Read only schema using polars
-    schema_dict[name] = list(pl.read_parquet(path, n_rows=0).columns)
-    file_map[name] = path
 # === Build schema as prompt context ===
 schema_description = "\n\n".join(
     [f"### {name}\n{', '.join(cols)}" for name, cols in schema_dict.items()]
 )
 context_prompt = f"""
 You are a helpful assistant that helps users understand which parts of the Meta-Kaggle dataset they need for their analysis.
-Below is the dataset schema (parquet files):
 {schema_description}
@@ -60,61 +92,61 @@ def guide_user(prompt):
     result = model.generate_content(full_prompt)
     return result.text.strip()
-# === Custom CSS for Gradio UI ===
-css = """
-body {
-    background-color: #e6ecf3;
-    font-family: 'Segoe UI', sans-serif;
-}
-.gradio-container {
-    width: 100% !important;
-    max-width: 100% !important;
-    padding: 40px 10%;
-    background-color: #ffffff;
-}
-.gr-block {
-    width: 100%;
-}
-h1 {
-    color: #1967d2;
-    text-align: center;
-    font-size: 2rem;
-}
-h2, p.subtitle, .subheading, .description {
-    color: #333;
-    text-align: center;
-    margin-bottom: 20px;
-}
-input[type="text"], textarea, .output-text {
-    border: 1px solid #ccc;
-    border-radius: 6px;
-    padding: 12px;
-    width: 100%;
-    box-sizing: border-box;
-}
-button {
-    background-color: #1967d2;
-    color: white;
-    border-radius: 6px;
-    padding: 12px 24px;
-    font-size: 1rem;
-    border: none;
-    cursor: pointer;
-}
-button:hover {
-    background-color: #1450a3;
-}
-.output-text {
-    background-color: #f7f9fc;
-    border: 1px solid #ccd6e0;
-    border-radius: 6px;
-    padding: 15px;
-}
-"""
 # === Launch Gradio UI ===
-with gr.Blocks(css=css) as demo:
     with gr.Column():
         gr.Markdown("# Meta-Kaggle Dataset Navigator")
         gr.Markdown("Ask which CSV files and columns you need for your analysis!")

 import gradio as gr
 import google.generativeai as genai
+# === Predefined schema (hard-coded: table name -> list of column names) ===
+schema_dict = {
+    "KernelTags": ['Id', 'KernelId', 'TagId'],
+    "ModelVariations": ['Id', 'ModelId', 'CurrentVariationSlug', 'ModelFramework', 'CurrentModelVariationVersionId', 'LicenseName', 'BaseModelVariationId', 'CurrentDatasourceVersionId'],
+    "KernelVersionCompetitionSources": ['Id', 'KernelVersionId', 'SourceCompetitionId'],
+    "Datasets": ['Id', 'CreatorUserId', 'OwnerUserId', 'OwnerOrganizationId', 'CurrentDatasetVersionId', 'CurrentDatasourceVersionId', 'ForumId', 'Type', 'CreationDate', 'LastActivityDate', 'TotalViews', 'TotalDownloads', 'TotalVotes', 'TotalKernels', 'Medal', 'MedalAwardDate'],
+    "KernelVersionKernelSources": ['Id', 'KernelVersionId', 'SourceKernelVersionId'],
+    "KernelVotes": ['Id', 'UserId', 'KernelVersionId', 'VoteDate'],
+    "Submissions": ['Id', 'SubmittedUserId', 'TeamId', 'SourceKernelVersionId', 'SubmissionDate', 'ScoreDate', 'IsAfterDeadline', 'IsSelected', 'PublicScoreLeaderboardDisplay', 'PublicScoreFullPrecision', 'PrivateScoreLeaderboardDisplay', 'PrivateScoreFullPrecision'],
+    "KernelLanguages": ['Id', 'Name', 'DisplayName', 'IsNotebook'],
+    "Users": ['Id', 'UserName', 'DisplayName', 'RegisterDate', 'PerformanceTier', 'Country', 'LocationSharingOptOut'],
+    "ForumMessageVotes": ['Id', 'ForumMessageId', 'FromUserId', 'ToUserId', 'VoteDate'],
+    "Competitions": ['Id', 'Slug', 'Title', 'Subtitle', 'HostSegmentTitle', 'ForumId', 'OrganizationId', 'EnabledDate', 'DeadlineDate', 'ProhibitNewEntrantsDeadlineDate', 'TeamMergerDeadlineDate', 'TeamModelDeadlineDate', 'ModelSubmissionDeadlineDate', 'FinalLeaderboardHasBeenVerified', 'HasKernels', 'OnlyAllowKernelSubmissions', 'HasLeaderboard', 'LeaderboardPercentage', 'ScoreTruncationNumDecimals', 'EvaluationAlgorithmAbbreviation', 'EvaluationAlgorithmName', 'EvaluationAlgorithmDescription', 'EvaluationAlgorithmIsMax', 'MaxDailySubmissions', 'NumScoredSubmissions', 'MaxTeamSize', 'BanTeamMergers', 'EnableTeamModels', 'RewardType', 'RewardQuantity', 'NumPrizes', 'UserRankMultiplier', 'CanQualifyTiers', 'TotalTeams', 'TotalCompetitors', 'TotalSubmissions', 'LicenseName', 'Overview', 'Rules', 'DatasetDescription', 'TotalCompressedBytes', 'TotalUncompressedBytes', 'ValidationSetName', 'ValidationSetValue', 'EnableSubmissionModelHashes', 'EnableSubmissionModelAttachments', 'HostName', 'CompetitionTypeId'],
+    "DatasetTaskSubmissions": ['Id', 'DatasetTaskId', 'SubmittedUserId', 'CreationDate', 'KernelId', 'DatasetId', 'AcceptedDate'],
+    "UserAchievements": ['Id', 'UserId', 'AchievementType', 'Tier', 'TierAchievementDate', 'Points', 'CurrentRanking', 'HighestRanking', 'TotalGold', 'TotalSilver', 'TotalBronze'],
+    "UserOrganizations": ['Id', 'UserId', 'OrganizationId', 'JoinDate'],
+    "Teams": ['Id', 'CompetitionId', 'TeamLeaderId', 'TeamName', 'ScoreFirstSubmittedDate', 'LastSubmissionDate', 'PublicLeaderboardSubmissionId', 'PrivateLeaderboardSubmissionId', 'IsBenchmark', 'Medal', 'MedalAwardDate', 'PublicLeaderboardRank', 'PrivateLeaderboardRank', 'WriteUpForumTopicId'],
+    "UserFollowers": ['Id', 'UserId', 'FollowingUserId', 'CreationDate'],
+    "CompetitionTags": ['Id', 'CompetitionId', 'TagId'],
+    "Kernels": ['Id', 'AuthorUserId', 'CurrentKernelVersionId', 'ForkParentKernelVersionId', 'ForumTopicId', 'FirstKernelVersionId', 'CreationDate', 'EvaluationDate', 'MadePublicDate', 'IsProjectLanguageTemplate', 'CurrentUrlSlug', 'Medal', 'MedalAwardDate', 'TotalViews', 'TotalComments', 'TotalVotes'],
+    "Organizations": ['Id', 'Name', 'Slug', 'CreationDate', 'Description'],
+    "Datasources": ['Id', 'CreatorUserId', 'CreationDate', 'Type', 'CurrentDatasourceVersionId'],
+    "ModelVersions": ['Id', 'ModelId', 'Title', 'Subtitle', 'ModelCard', 'CreationDate', 'OriginalPublishDate', 'CreatorUserId', 'ProvenanceSources'],
+    "ForumTopics": ['Id', 'ForumId', 'KernelId', 'LastForumMessageId', 'FirstForumMessageId', 'CreationDate', 'LastCommentDate', 'Title', 'IsSticky', 'TotalViews', 'Score', 'TotalMessages', 'TotalReplies'],
+    "DatasetVersions": ['Id', 'DatasetId', 'DatasourceVersionId', 'CreatorUserId', 'LicenseName', 'CreationDate', 'VersionNumber', 'Title', 'Slug', 'Subtitle', 'Description', 'VersionNotes', 'TotalCompressedBytes', 'TotalUncompressedBytes'],
+    "ModelVotes": ['Id', 'UserId', 'ModelId', 'VoteDate'],
+    "DatasetVotes": ['Id', 'UserId', 'DatasetVersionId', 'VoteDate'],
+    "TeamMemberships": ['Id', 'TeamId', 'UserId', 'RequestDate'],
+    "Forums": ['Id', 'ParentForumId', 'Title'],
+    "KernelVersions": ['Id', 'ScriptId', 'ParentScriptVersionId', 'ScriptLanguageId', 'AuthorUserId', 'CreationDate', 'VersionNumber', 'Title', 'EvaluationDate', 'IsChange', 'TotalLines', 'LinesInsertedFromPrevious', 'LinesChangedFromPrevious', 'LinesUnchangedFromPrevious', 'LinesInsertedFromFork', 'LinesDeletedFromFork', 'LinesChangedFromFork', 'LinesUnchangedFromFork', 'TotalVotes', 'IsInternetEnabled', 'RunningTimeInMilliseconds', 'AcceleratorTypeId', 'DockerImage'],
+    "ModelVariationVersions": ['Id', 'ModelVariationId', 'ModelVersionId', 'DatasourceVersionId', 'CreationDate', 'VariationOverview', 'VariationUsage', 'FineTunable', 'SourceUrl', 'SourceOrganizationName'],
+    "ForumMessages": ['Id', 'ForumTopicId', 'PostUserId', 'PostDate', 'ReplyToForumMessageId', 'Message', 'RawMarkdown', 'Medal', 'MedalAwardDate'],
+    "KernelVersionDatasetSources": ['Id', 'KernelVersionId', 'SourceDatasetVersionId'],
+    "Episodes": ['Id', 'Type', 'CompetitionId', 'CreateTime', 'EndTime'],
+    "EpisodeAgents": ['Id', 'EpisodeId', 'Index', 'Reward', 'State', 'SubmissionId', 'InitialConfidence', 'InitialScore', 'UpdatedConfidence', 'UpdatedScore'],
+    "KernelAcceleratorTypes": ['Id', 'Label'],
+    "KernelVersionModelSources": ['Id', 'KernelVersionId', 'SourceModelVariationVersionId', 'SourceModelVariationId'],
+    "ForumMessageReactions": ['Id', 'ForumMessageId', 'FromUserId', 'ReactionType', 'ReactionDate'],
+    "Tags": ['Id', 'ParentTagId', 'Name', 'Slug', 'FullPath', 'Description', 'DatasetCount', 'CompetitionCount', 'KernelCount'],
+    "DatasetTasks": ['Id', 'DatasetId', 'OwnerUserId', 'CreationDate', 'Description', 'ForumId', 'Title', 'Subtitle', 'Deadline', 'TotalVotes'],
+    "Models": ['Id', 'OwnerUserId', 'OwnerOrganizationId', 'CurrentModelVersionId', 'ForumId', 'CreationDate', 'TotalViews', 'TotalDownloads', 'TotalVotes', 'TotalKernels', 'CurrentSlug'],
+    "DatasetTags": ['Id', 'DatasetId', 'TagId'],
+    "ModelTags": ['Id', 'ModelId', 'TagId'],
+}
+# file_map stays empty: the schema above is hard-coded, so no files are read
 file_map = {}
 # === Build schema as prompt context ===
 schema_description = "\n\n".join(
     [f"### {name}\n{', '.join(cols)}" for name, cols in schema_dict.items()]
 )
 context_prompt = f"""
 You are a helpful assistant that helps users understand which parts of the Meta-Kaggle dataset they need for their analysis.
+Below is the dataset schema (CSV files):
 {schema_description}
     result = model.generate_content(full_prompt)
     return result.text.strip()
+# # === Custom CSS for Gradio UI ===
+# css = """
+# body {
+#     background-color: #e6ecf3;
+#     font-family: 'Segoe UI', sans-serif;
+# }
+# .gradio-container {
+#     width: 100% !important;
+#     max-width: 100% !important;
+#     padding: 40px 10%;
+#     background-color: #ffffff;
+# }
+# .gr-block {
+#     width: 100%;
+# }
+# h1 {
+#     color: #1967d2;
+#     text-align: center;
+#     font-size: 2rem;
+# }
+# h2, p.subtitle, .subheading, .description {
+#     color: #333;
+#     text-align: center;
+#     margin-bottom: 20px;
+# }
+# input[type="text"], textarea, .output-text {
+#     border: 1px solid #ccc;
+#     border-radius: 6px;
+#     padding: 12px;
+#     width: 100%;
+#     box-sizing: border-box;
+# }
+# button {
+#     background-color: #1967d2;
+#     color: white;
+#     border-radius: 6px;
+#     padding: 12px 24px;
+#     font-size: 1rem;
+#     border: none;
+#     cursor: pointer;
+# }
+# button:hover {
+#     background-color: #1450a3;
+# }
+# .output-text {
+#     background-color: #f7f9fc;
+#     border: 1px solid #ccd6e0;
+#     border-radius: 6px;
+#     padding: 15px;
+# }
+# """
 # === Launch Gradio UI ===
+with gr.Blocks() as demo:
     with gr.Column():
         gr.Markdown("# Meta-Kaggle Dataset Navigator")
         gr.Markdown("Ask which CSV files and columns you need for your analysis!")