Spaces:

markojak
/

tt-creators

Runtime error

App Files Files Community

markojak committed on Mar 5

Commit

46e6e62

verified ·

1 Parent(s): f7f46aa

Upload folder using huggingface_hub

Browse files

Files changed (4) hide show

.gradio/certificate.pem +31 -0
README.md +80 -8
creators.py +506 -0
requirements.txt +4 -0

.gradio/certificate.pem ADDED Viewed

	@@ -0,0 +1,31 @@

+-----BEGIN CERTIFICATE-----
+MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
+TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
+cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
+WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
+ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
+MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
+h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
+0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
+A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
+T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
+B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
+B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
+KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
+OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
+jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
+qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
+rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
+HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
+hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
+ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
+3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
+NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
+ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
+TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
+jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
+oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
+4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
+mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
+emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
+-----END CERTIFICATE-----

README.md CHANGED Viewed

@@ -1,13 +1,85 @@
 ---
-title: Tt Creators
-emoji: 🦀
-colorFrom: purple
-colorTo: blue
 sdk: gradio
 sdk_version: 5.20.0
-app_file: app.py
-pinned: false
-short_description: TT-Creators Exploration
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: tt-creators
+app_file: creators.py
 sdk: gradio
 sdk_version: 5.20.0
 ---
+# TikTok Creator Analyzer
+A Gradio-based tool for analyzing TikTok creator profiles from CSV files.
+## Features
+- Efficiently loads and processes millions of TikTok creator profiles
+- Caches data in Parquet format for faster subsequent loads
+- Tracks processed files to avoid reprocessing the same data
+- Incrementally updates the database when new files are added
+- Advanced search with multiple filters:
+  - Follower count range (min/max)
+  - Video count range (min/max)
+  - Keywords in signature
+  - Region filter
+  - "Has Email" filter to find profiles with contact information
+- Download search results as CSV
+- Network accessible interface (binds to 0.0.0.0)
+- Shareable via temporary public URL
+## Installation
+1. Install the required dependencies:
+```bash
+pip install -r requirements.txt
+```
+2. Make sure your CSV files are in the correct location (`../data/tiktok_profiles/`)
+## Usage
+Run the script:
+```bash
+python creators.py
+```
+The first run will:
+1. Load all CSV files from the data directory
+2. Combine them into a single dataset
+3. Save the combined data as a Parquet file for faster loading in the future
+4. Track which files have been processed to avoid duplicates
+5. Launch a Gradio web interface for searching and analyzing the data
+Subsequent runs will:
+1. Load the existing data from the Parquet file
+2. Check for new CSV files that haven't been processed yet
+3. If new files exist, process only those files and update the database
+4. Launch the Gradio interface with the updated data
+The interface will be accessible from:
+- Other machines on your network at: `http://your-ip-address:7860`
+- A temporary public URL that will be displayed in the console (thanks to `share=True`)
+## Maintenance
+The application includes a Maintenance tab that shows:
+- How many files have been processed
+- When the database was last updated
+- An option to force reload all files (useful if you suspect data corruption)
+## Data Format
+The CSV files should have the following columns:
+- id
+- unique_id
+- follower_count
+- nickname
+- video_count
+- following_count
+- signature
+- email
+- bio_link
+- updated_at
+- tt_seller
+- region
+- language
+- url

creators.py ADDED Viewed

	@@ -0,0 +1,506 @@

+#!/usr/bin/env python3
+import os
+import glob
+import pandas as pd
+import gradio as gr
+import time
+import pyarrow as pa
+import pyarrow.parquet as pq
+import json
+from pathlib import Path
+# Configuration
+DATA_DIR = Path("../data/tiktok_profiles")
+CACHE_FILE = Path("../data/tiktok_profiles_combined.parquet")
+PROCESSED_FILES_LOG = Path("../data/processed_files.json")
+COLUMNS = [
+    "id",
+    "unique_id",
+    "follower_count",
+    "nickname",
+    "video_count",
+    "following_count",
+    "signature",
+    "email",
+    "bio_link",
+    "updated_at",
+    "tt_seller",
+    "region",
+    "language",
+    "url",
+]
def get_processed_files():
    """
    Return the names of the CSV files that have already been ingested.

    The names are read from the JSON log at PROCESSED_FILES_LOG. A missing,
    undecodable, or non-list log is treated as "nothing processed yet", so a
    damaged log leads to files being re-read instead of a crash at startup.

    Returns:
        A set of file names; empty when there is no usable log.
    """
    try:
        with open(PROCESSED_FILES_LOG, "r", encoding="utf-8") as f:
            logged_names = json.load(f)
    except FileNotFoundError:
        # First run: nothing has been logged yet.
        return set()
    except (json.JSONDecodeError, UnicodeDecodeError) as err:
        print(f"Ignoring unreadable processed-files log {PROCESSED_FILES_LOG}: {err}")
        return set()
    if not isinstance(logged_names, list):
        # The writer always stores a JSON list; anything else is not ours.
        print(f"Ignoring processed-files log {PROCESSED_FILES_LOG}: expected a JSON list")
        return set()
    return set(logged_names)
def update_processed_files(processed_files):
    """
    Persist the names of the ingested CSV files to the JSON log.

    Args:
        processed_files: Iterable of file names to record (a set of str in
            this module). The log at PROCESSED_FILES_LOG is overwritten.
    """
    PROCESSED_FILES_LOG.parent.mkdir(parents=True, exist_ok=True)
    with open(PROCESSED_FILES_LOG, "w", encoding="utf-8") as f:
        # Sorted so the log content is stable from run to run; set iteration
        # order is not.
        json.dump(sorted(processed_files), f)
# Column dtypes applied to every profile CSV. Identifiers stay strings and the
# nullable "Int64" dtype keeps counts integral when values are missing.
_CSV_DTYPES = {
    "id": "str",
    "unique_id": "str",
    "follower_count": "Int64",
    "nickname": "str",
    "video_count": "Int64",
    "following_count": "Int64",
    "signature": "str",
    "email": "str",
    "bio_link": "str",
    "updated_at": "str",
    "tt_seller": "str",
    "region": "str",
    "language": "str",
    "url": "str",
}


def _read_profile_csv(path):
    """Read one profile CSV with the shared column dtypes."""
    return pd.read_csv(path, dtype=_CSV_DTYPES, low_memory=False)


def _empty_profiles():
    """Return a zero-row DataFrame with the expected columns and dtypes."""
    return pd.DataFrame(
        {name: pd.Series(dtype=dtype) for name, dtype in _CSV_DTYPES.items()}
    )


def load_data(force_reload=False):
    """
    Load the creator dataset from the Parquet cache and/or the CSV files.

    When the cache exists and force_reload is False, the cache is read and
    only CSV files missing from the processed-files log are appended to it.
    Otherwise every CSV in DATA_DIR is read and the cache is rebuilt. In both
    cases rows are de-duplicated on unique_id, keeping the last occurrence.

    Args:
        force_reload: If True, reprocess all files regardless of whether
            they've been processed before.

    Returns:
        A pandas DataFrame with all profiles. If a full load finds no CSV
        files, an empty DataFrame with the expected columns is returned and
        neither the cache nor the log is written.
    """
    start_time = time.time()
    # Sorted by path so "keep last" de-duplication does not depend on the
    # arbitrary order in which the filesystem lists the files.
    all_csv_files = {file.name: file for file in sorted(DATA_DIR.glob("*.csv"))}

    # Incremental path: start from the cache and add only unseen files.
    if CACHE_FILE.exists() and not force_reload:
        print(f"Loading data from cache file: {CACHE_FILE}")
        df = pd.read_parquet(CACHE_FILE)
        processed_files = get_processed_files()
        new_files = [
            path for name, path in all_csv_files.items() if name not in processed_files
        ]
        if not new_files:
            print(
                f"No new files to process. Data loaded in {time.time() - start_time:.2f} seconds"
            )
            return df
        print(f"Found {len(new_files)} new files to process")
        new_dfs = []
        for i, file in enumerate(new_files):
            print(f"Loading new file {i+1}/{len(new_files)}: {file.name}")
            new_dfs.append(_read_profile_csv(file))
            processed_files.add(file.name)
        print("Combining new data with existing data...")
        # Cached rows first so that rows from new files win on duplicates.
        df = pd.concat([df, *new_dfs], ignore_index=True)
        df = df.drop_duplicates(subset=["unique_id"], keep="last")
        print(f"Saving updated data to {CACHE_FILE}")
        df.to_parquet(CACHE_FILE, index=False)
        update_processed_files(processed_files)
        print(f"Data loaded and updated in {time.time() - start_time:.2f} seconds")
        return df

    # Full path: no cache yet, or a rebuild was requested.
    print(f"Loading data from CSV files in {DATA_DIR}")
    csv_files = list(all_csv_files.values())
    total_files = len(csv_files)
    print(f"Found {total_files} CSV files")
    if not csv_files:
        # pd.concat raises ValueError on an empty list; return an empty,
        # correctly-typed dataset instead of crashing at startup.
        print(f"No CSV files found in {DATA_DIR}; returning an empty dataset")
        return _empty_profiles()

    dfs = []
    processed_files = set()
    for i, file in enumerate(csv_files):
        # Progress is reported for every tenth file to keep the output short.
        if i % 10 == 0:
            print(f"Loading file {i+1}/{total_files}: {file.name}")
        dfs.append(_read_profile_csv(file))
        processed_files.add(file.name)
    print("Combining all dataframes...")
    df = pd.concat(dfs, ignore_index=True)
    df = df.drop_duplicates(subset=["unique_id"], keep="last")
    print(f"Saving combined data to {CACHE_FILE}")
    CACHE_FILE.parent.mkdir(exist_ok=True)
    df.to_parquet(CACHE_FILE, index=False)
    update_processed_files(processed_files)
    print(f"Data loaded and cached in {time.time() - start_time:.2f} seconds")
    return df
def search_by_username(df, username):
    """
    Search for profiles whose username (unique_id) contains the given text.

    The match is a case-insensitive literal substring test: characters such
    as "." or "(" in the query are matched as themselves, not as regular
    expression syntax.

    Args:
        df: DataFrame with a "unique_id" column.
        username: Text to look for; an empty or None query matches nothing.

    Returns:
        At most the first 100 matching rows, or an empty DataFrame when no
        query was given.
    """
    if not username:
        return pd.DataFrame()
    matches = (
        df["unique_id"]
        .str.lower()
        .str.contains(username.lower(), na=False, regex=False)
    )
    return df[matches].head(100)  # Limit results to prevent UI overload
def search_by_nickname(df, nickname):
    """
    Search for profiles whose nickname contains the given text.

    The match is a case-insensitive literal substring test, so punctuation in
    the query (for example "+", "*" or "(") is matched as-is instead of being
    interpreted as a regular expression.

    Args:
        df: DataFrame with a "nickname" column.
        nickname: Text to look for; an empty or None query matches nothing.

    Returns:
        At most the first 100 matching rows, or an empty DataFrame when no
        query was given.
    """
    if not nickname:
        return pd.DataFrame()
    matches = (
        df["nickname"]
        .str.lower()
        .str.contains(nickname.lower(), na=False, regex=False)
    )
    return df[matches].head(100)  # Limit results to prevent UI overload
def search_by_follower_count(df, min_followers, max_followers):
    """
    Return profiles whose follower count lies within an inclusive range.

    A bound given as None is replaced by 0 (lower) or by the largest follower
    count present in the data (upper). At most 100 rows are returned.
    """
    followers = df["follower_count"]
    lower = 0 if min_followers is None else min_followers
    upper = followers.max() if max_followers is None else max_followers
    at_least_lower = followers >= lower
    at_most_upper = followers <= upper
    in_range = df[at_least_lower & at_most_upper]
    # Cap the output so the UI is not overloaded.
    return in_range.head(100)
def _format_count(num):
    """Render a count as "N/A", a plain number, or a one-decimal K/M figure."""
    if pd.isna(num):
        return "N/A"
    # Switch to millions from 999,950 upward: those values would otherwise
    # round to the awkward "1000.0K" in the thousands branch.
    if num >= 999_950:
        return f"{num/1_000_000:.1f}M"
    if num >= 1_000:
        return f"{num/1_000:.1f}K"
    return str(num)


def format_results(df):
    """
    Format search results for display.

    Args:
        df: DataFrame with "follower_count", "video_count" and
            "following_count" columns. It is not modified.

    Returns:
        A copy of df in which the three count columns are replaced by
        human-readable strings (for example "1.5K", "2.3M", "N/A"). For an
        empty input, an empty DataFrame with the same columns is returned.
    """
    if df.empty:
        # Return an empty DataFrame with the same columns instead of a string
        return pd.DataFrame(columns=df.columns)
    display_df = df.copy()
    for column in ("follower_count", "video_count", "following_count"):
        display_df[column] = display_df[column].apply(_format_count)
    return display_df
def combined_search(
    df,
    min_followers,
    max_followers,
    min_videos,
    max_videos,
    signature_query,
    region,
    has_email,
):
    """
    Filter profiles by every criterion that was supplied.

    Args:
        df: DataFrame with "follower_count", "video_count", "signature",
            "region" and "email" columns. It is not modified.
        min_followers, max_followers: Inclusive follower-count bounds; None
            disables the bound.
        min_videos, max_videos: Inclusive video-count bounds; None disables
            the bound.
        signature_query: Text that must occur in the signature. It is matched
            case-insensitively as a literal substring, not as a regular
            expression. Empty or None disables the filter.
        region: Region that must match exactly, ignoring case. Empty or None
            disables the filter.
        has_email: If true, keep only rows with a non-null, non-empty email.

    Returns:
        A new DataFrame with at most the first 1000 matching rows.
    """
    # Boolean indexing already produces new frames, so the full dataset is
    # not copied up front; only the capped result is copied at the end.
    results = df
    if min_followers is not None:
        results = results[results["follower_count"] >= min_followers]
    if max_followers is not None:
        results = results[results["follower_count"] <= max_followers]
    if min_videos is not None:
        results = results[results["video_count"] >= min_videos]
    if max_videos is not None:
        results = results[results["video_count"] <= max_videos]
    if signature_query:
        results = results[
            results["signature"]
            .str.lower()
            .str.contains(signature_query.lower(), na=False, regex=False)
        ]
    if region:
        results = results[results["region"].str.lower() == region.lower()]
    if has_email:
        results = results[results["email"].notna() & (results["email"] != "")]
    # Limit to 1000 results to prevent UI overload
    return results.head(1000).copy()
def _slider_bounds(series, floor, ceiling):
    """Clamp a numeric column's observed range to [floor, ceiling] for slider limits."""
    observed_min = series.min()
    observed_max = series.max()
    # min()/max() are missing when the column is empty or entirely null, and
    # int() of a missing value raises.
    lower = floor if pd.isna(observed_min) else max(floor, int(observed_min))
    upper = ceiling if pd.isna(observed_max) else min(ceiling, int(observed_max))
    # Keep minimum <= maximum even when every value sits below the floor.
    return lower, max(lower, upper)


def _format_stat(value, spec):
    """Format a summary statistic, showing "N/A" when the value is missing."""
    return "N/A" if pd.isna(value) else format(value, spec)


def create_interface(df):
    """
    Create the Gradio interface.

    The UI has four tabs: an overview of the 100 most-followed profiles, an
    advanced search with CSV download, summary statistics, and a maintenance
    page showing the state of the processed-files log and the cache file.

    Args:
        df: DataFrame of creator profiles. It is captured by the event
            handlers, so the UI always queries this exact object.

    Returns:
        The gr.Blocks interface, not yet launched.
    """
    import tempfile  # needed only by the CSV download handler below

    # Slider limits: follower counts clamped to 1,000..10,000,000 and video
    # counts to 1..10,000.
    min_followers_global, max_followers_global = _slider_bounds(
        df["follower_count"], 1000, 10000000
    )
    min_videos_global, max_videos_global = _slider_bounds(
        df["video_count"], 1, 10000
    )
    # Get unique regions for dropdown
    regions = sorted(df["region"].dropna().unique().tolist())
    regions = [""] + regions  # Add empty option
    with gr.Blocks(title="TikTok Creator Analyzer") as interface:
        gr.Markdown("# TikTok Creator Analyzer")
        gr.Markdown(f"Database contains {len(df):,} creator profiles")
        # Show top 100 profiles by default
        top_profiles = df.sort_values(by="follower_count", ascending=False).head(100)
        default_view = format_results(top_profiles)
        with gr.Tab("Overview"):
            gr.Markdown("## Top 100 Profiles by Follower Count")
            overview_results = gr.Dataframe(value=default_view, label="Top Profiles")
            refresh_btn = gr.Button("Refresh")
            refresh_btn.click(
                fn=lambda: format_results(
                    df.sort_values(by="follower_count", ascending=False).head(100)
                ),
                inputs=[],
                outputs=overview_results,
            )
        with gr.Tab("Advanced Search"):
            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("### Follower Count")
                    min_followers_slider = gr.Slider(
                        minimum=min_followers_global,
                        maximum=max_followers_global,
                        value=min_followers_global,
                        step=1000,
                        label="Minimum Followers",
                        interactive=True,
                    )
                    max_followers_slider = gr.Slider(
                        minimum=min_followers_global,
                        maximum=max_followers_global,
                        value=max_followers_global,
                        step=1000,
                        label="Maximum Followers",
                        interactive=True,
                    )
                    gr.Markdown("### Video Count")
                    min_videos_slider = gr.Slider(
                        minimum=min_videos_global,
                        maximum=max_videos_global,
                        value=min_videos_global,
                        step=10,
                        label="Minimum Videos",
                        interactive=True,
                    )
                    max_videos_slider = gr.Slider(
                        minimum=min_videos_global,
                        maximum=max_videos_global,
                        value=max_videos_global,
                        step=10,
                        label="Maximum Videos",
                        interactive=True,
                    )
                with gr.Column(scale=1):
                    signature_input = gr.Textbox(label="Keywords in Signature")
                    region_input = gr.Dropdown(label="Region", choices=regions)
                    has_email_checkbox = gr.Checkbox(label="Has Email", value=False)
                    search_btn = gr.Button("Search", variant="primary", size="lg")
            results_count = gr.Markdown("### Results: 0 profiles found")
            # Create a dataframe with download button
            with gr.Row():
                search_results = gr.Dataframe(label="Results")
                download_btn = gr.Button("Download Results as CSV")

            # Function to update results count
            def update_results_count(results_df):
                count = len(results_df)
                return f"### Results: {count:,} profiles found"

            # Function to perform search and update results
            def perform_search(
                min_followers,
                max_followers,
                min_videos,
                max_videos,
                signature,
                region,
                has_email,
            ):
                results = combined_search(
                    df,
                    min_followers,
                    max_followers,
                    min_videos,
                    max_videos,
                    signature,
                    region,
                    has_email,
                )
                formatted_results = format_results(results)
                count_text = update_results_count(results)
                return formatted_results, count_text

            # Function to download results as CSV
            def download_results(results_df):
                if (
                    results_df is None
                    or results_df.empty
                    or "unique_id" not in results_df.columns
                ):
                    return None
                # The table shows formatted counts ("1.5K"); look the rows up
                # again in df so the export contains the original values.
                download_df = df[df["unique_id"].isin(results_df["unique_id"])]
                # A uniquely named temp file per request, so simultaneous
                # downloads cannot overwrite each other's export.
                with tempfile.NamedTemporaryFile(
                    mode="w",
                    suffix=".csv",
                    prefix="tt_creators_",
                    delete=False,
                    newline="",
                    encoding="utf-8",
                ) as tmp:
                    download_df.to_csv(tmp, index=False)
                return tmp.name

            # Connect the search button
            search_btn.click(
                fn=perform_search,
                inputs=[
                    min_followers_slider,
                    max_followers_slider,
                    min_videos_slider,
                    max_videos_slider,
                    signature_input,
                    region_input,
                    has_email_checkbox,
                ],
                outputs=[search_results, results_count],
            )
            # Connect the download button
            download_file = gr.File(label="Download")
            download_btn.click(
                fn=download_results,
                inputs=[search_results],
                outputs=[download_file],
            )
        with gr.Tab("Statistics"):
            gr.Markdown("## Database Statistics")
            # Calculate some basic statistics; missing values (for example on
            # an empty dataset) are shown as "N/A" instead of failing to format.
            total_creators = len(df)
            total_followers = _format_stat(df["follower_count"].sum(), ",")
            avg_followers = _format_stat(df["follower_count"].mean(), ",.2f")
            median_followers = _format_stat(df["follower_count"].median(), ",")
            max_followers = _format_stat(df["follower_count"].max(), ",")
            stats_md = f"""
            - Total Creators: {total_creators:,}
            - Total Followers: {total_followers}
            - Average Followers: {avg_followers}
            - Median Followers: {median_followers}
            - Max Followers: {max_followers}
            """
            gr.Markdown(stats_md)
        with gr.Tab("Maintenance"):
            gr.Markdown("## Database Maintenance")
            # Get processed files info
            processed_files = get_processed_files()
            maintenance_md = f"""
            - Total processed files: {len(processed_files)}
            - Last update: {time.ctime(CACHE_FILE.stat().st_mtime) if CACHE_FILE.exists() else 'Never'}
            """
            gr.Markdown(maintenance_md)
            with gr.Row():
                force_reload_btn = gr.Button("Force Reload All Files")
                reload_status = gr.Markdown("Click to reload all files from scratch")

            # NOTE(review): this handler only updates the status text; it does
            # not call load_data(force_reload=True) itself.
            def reload_all_files():
                return "Reloading all files... This may take a while. Please restart the application."

            force_reload_btn.click(
                fn=reload_all_files, inputs=[], outputs=reload_status
            )
    return interface
def main():
    """Load the creator dataset, build the Gradio UI and start serving it."""
    print("Loading TikTok creator data...")
    profiles = load_data()
    print(f"Loaded {len(profiles):,} creator profiles")
    # Listen on all network interfaces and request a public share link.
    launch_options = {"share": True, "server_name": "0.0.0.0"}
    app = create_interface(profiles)
    app.launch(**launch_options)
+if __name__ == "__main__":
+    main()

requirements.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+pandas
+gradio
+pyarrow
+pip-chillpython