|
import pandas as pd |
|
import numpy as np |
|
import matplotlib.pyplot as plt |
|
import seaborn as sns |
|
|
|
# Apply a seaborn-like dark grid to all Matplotlib figures.
# The style sheet 'seaborn-darkgrid' was renamed to 'seaborn-v0_8-darkgrid' in
# Matplotlib 3.6 and the old alias was removed in 3.8, so use whichever name
# this installation actually provides (preferring the current one).
_darkgrid_style = next(
    (
        style_name
        for style_name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid')
        if style_name in plt.style.available
    ),
    None,
)
if _darkgrid_style is not None:
    plt.style.use(_darkgrid_style)
else:
    # Neither style sheet is bundled: fall back to seaborn's own dark grid.
    sns.set_style('darkgrid')
|
|
|
# --- Load the listings data --------------------------------------------------
# Absolute path of the CSV to analyse (a '/content/...' location).
dataset_path = '/content/real_estate_texas_500_2024.csv'

# Read the whole file into a DataFrame; every later step works on `df`.
df = pd.read_csv(filepath_or_buffer=dataset_path)

# Preview the first five rows (rendered when run as a notebook cell).
df.head(5)
|
|
|
# --- Clean the data -----------------------------------------------------------
# 1. Fill missing listing prices with the column mean (the mean is taken over
#    all rows, i.e. before any rows are discarded below).
# 2. Drop the 'baths_full_calc' column.
# 3. Discard rows that have no 'text' value.
df = (
    df.assign(
        listPrice=lambda frame: frame['listPrice'].fillna(frame['listPrice'].mean())
    )
    .drop(columns=['baths_full_calc'])
    .dropna(subset=['text'])
)

# Print column dtypes and non-null counts to confirm the result of the cleaning.
df.info()
|
|
|
# --- Distribution of listing prices --------------------------------------------
# 30-bin histogram of 'listPrice' with a kernel-density curve overlaid.
plt.figure(figsize=(10, 6))
sns.histplot(data=df, x='listPrice', bins=30, kde=True).set(
    title='Distribution of Listing Prices',
    xlabel='Listing Price ($)',
    ylabel='Frequency',
)
plt.show()
|
|
|
# Descriptive statistics of the listing price, as returned by Series.describe().
price_summary = df['listPrice'].describe()

# Same statistics laid out as a single-row table (one column per statistic).
price_summary_df = price_summary.to_frame().T

# Show the table (rendered when run as a notebook cell).
price_summary_df
|
|
|
# --- Number of listings per property type ---------------------------------------
# Horizontal bars: one bar per distinct value of the 'type' column.
plt.figure(figsize=(10, 6))
sns.countplot(data=df, y='type', palette='Set2').set(
    title='Count of Property Types',
    xlabel='Count',
    ylabel='Property Type',
)
plt.show()
|
|
|
# Tabulate the number of listings per property type, most frequent first.
# (The original computed this identical table twice in a row; once is enough.)
# rename_axis / reset_index(name=...) label both columns explicitly, so the
# result does not depend on the default column names that value_counts()
# produces, which differ between pandas 1.x and 2.x.
type_counts = (
    df['type']
    .value_counts()
    .rename_axis('Property Type')
    .reset_index(name='Count')
)

# Show the table (rendered when run as a notebook cell).
type_counts
|
|
|
# Per-construction-year summary: mean listing price and mean square footage,
# both rounded to two decimals and ordered by 'year_built'.
#
# The original built this table twice. The first copy assigned the sorted
# result to a misspelled name ('yearly_summay'), so the sort was silently
# discarded; only the second copy was correct. A single pipeline replaces both.
yearly_summary = (
    df.groupby('year_built')
    .agg(
        Average_Listing_Price=('listPrice', 'mean'),
        Average_Square_Footage=('sqft', 'mean'),
    )
    # Only the two aggregated columns exist at this point ('year_built' is
    # still the index), so this rounds exactly the averages.
    .round(2)
    .reset_index()
    .sort_values(by='year_built')
)

# Show the table (rendered when run as a notebook cell).
yearly_summary
|
|
|
from sklearn.feature_extraction.text import CountVectorizer |
|
from sklearn.decomposition import LatentDirichletAllocation |
|
|
|
# --- Topic modelling on the listing descriptions --------------------------------
# Bag-of-words counts over the 'text' column: English stop words removed, and
# terms kept only if they appear in at least 2 documents and in at most 95%
# of them.
vectorizer = CountVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.95,
)
dtm = vectorizer.fit_transform(df['text'])

# Fit a 5-topic LDA model on the document-term matrix; the fixed seed keeps
# the fitted topics reproducible between runs.
lda = LatentDirichletAllocation(n_components=5, random_state=42).fit(dtm)
|
|
|
def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted terms of every topic in a fitted topic model.

    For each row of ``model.components_`` this prints a ``Theme <k>:`` header
    (1-based), the top terms joined by single spaces, and a blank line.

    Args:
        model: Object exposing ``components_``, iterated as one weight vector
            per topic; each vector must support ``argsort()``.
        feature_names: Indexable collection mapping a column index of
            ``components_`` to its term.
        no_top_words: Number of terms to print per topic. Values of zero or
            less print no terms (the header and blank line are still printed).
    """
    # Clamp so that a negative count cannot turn the slice below into
    # "almost every word", which is what the original [:-n-1:-1] slice did.
    top_n = max(no_top_words, 0)

    for topic_idx, topic in enumerate(model.components_):
        # argsort() is ascending, so reverse it to rank by descending weight.
        ranked_indices = topic.argsort()[::-1][:top_n]
        print(f"Theme {topic_idx+1}:")
        print(" ".join(feature_names[i] for i in ranked_indices))
        print()
|
|
|
# Print the ten highest-weighted vocabulary terms for each fitted LDA theme.
display_topics(
    model=lda,
    feature_names=vectorizer.get_feature_names_out(),
    no_top_words=10,
)
|
|
|
import matplotlib.pyplot as plt |
|
import seaborn as sns |
|
|
|
# --- Summary dashboard -----------------------------------------------------------
# Switch seaborn to its white-grid theme for the figure below.
sns.set(style="whitegrid")

# One figure holding a 2x2 grid of panels, with a shared headline.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(16, 12))
fig.suptitle('Texas Real Estate Market Insights - 2024', fontsize=20)
|
|
|
# Top-left panel: histogram of listing prices with a density curve.
sns.histplot(df['listPrice'], kde=True, color='skyblue', ax=axes[0, 0])
axes[0, 0].set(
    title='Distribution of Listing Prices',
    xlabel='Listing Price ($)',
    ylabel='Frequency',
)
|
|
|
# Top-right panel: mean listing price for each property type, sorted ascending
# by price, drawn as horizontal bars.
avg_price_by_type = df.groupby('type')['listPrice'].mean().sort_values()
avg_price_by_type.plot.barh(ax=axes[0, 1], color='lightgreen')
axes[0, 1].set(
    title='Average Listing Price by Property Type',
    xlabel='Average Listing Price ($)',
    ylabel='Property Type',
)
|
|
|
# Bottom-left panel: number of listings per construction year, as a line chart.
properties_by_year = df.groupby('year_built').size()
properties_by_year.plot.line(ax=axes[1, 0], color='salmon')
axes[1, 0].set(
    title='Count of Properties by Year Built',
    xlabel='Year Built',
    ylabel='Count',
)
|
|
|
# Only three of the four panels are used, so remove the empty bottom-right
# axes. This is done BEFORE the layout pass (the original did it afterwards)
# so that spacing is computed from the panels that are actually drawn rather
# than also from the tick labels of an axes that is about to disappear.
fig.delaxes(axes[1, 1])

# Fit the panels inside the figure, leaving the top 5% free for the suptitle
# and a 3% margin at the bottom.
fig.tight_layout(rect=[0, 0.03, 1, 0.95])

plt.show()