File size: 2,300 Bytes
1b73364 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 |
# generate the 100K record CSV file
#
import polars as pl
import numpy as np
from datetime import datetime, timedelta
def generate(nrows: int, filename: str):
names = np.asarray(
[
"Laptop",
"Smartphone",
"Desk",
"Chair",
"Monitor",
"Printer",
"Paper",
"Pen",
"Notebook",
"Coffee Maker",
"Cabinet",
"Plastic Cups",
]
)
categories = np.asarray(
[
"Electronics",
"Electronics",
"Office",
"Office",
"Electronics",
"Electronics",
"Stationery",
"Stationery",
"Stationery",
"Electronics",
"Office",
"Sundry",
]
)
product_id = np.random.randint(len(names), size=nrows)
quantity = np.random.randint(1, 11, size=nrows)
price = np.random.randint(199, 10000, size=nrows) / 100
# Generate random dates between 2010-01-01 and 2023-12-31
start_date = datetime(2010, 1, 1)
end_date = datetime(2023, 12, 31)
date_range = (end_date - start_date).days
# Create random dates as np.array and convert to string format
order_dates = np.array([(start_date + timedelta(days=np.random.randint(0, date_range))).strftime('%Y-%m-%d') for _ in range(nrows)])
# Define columns
columns = {
"order_id": np.arange(nrows),
"order_date": order_dates,
"customer_id": np.random.randint(100, 1000, size=nrows),
"customer_name": [f"Customer_{i}" for i in np.random.randint(2**15, size=nrows)],
"product_id": product_id + 200,
"product_names": names[product_id],
"categories": categories[product_id],
"quantity": quantity,
"price": price,
"total": price * quantity,
}
# Create Polars DataFrame and write to CSV with explicit delimiter
df = pl.DataFrame(columns)
df.write_csv(filename, separator=',',include_header=True) # Ensure comma is used as the delimiter
# Generate 100,000 rows of data with random order_date and save to CSV
generate(100_000, "D:/Python_Projects/dashboard/sales_data.csv") |