# File size: 3,603 Bytes
# 8b28677
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Matplotlib 3.6 renamed the bundled seaborn style sheets to
# 'seaborn-v0_8-*' and 3.8 removed the old aliases, so the legacy name
# 'seaborn-darkgrid' raises on current installs. Prefer the new name and
# fall back to the legacy one only when the new one is not shipped.
darkgrid_style = 'seaborn-v0_8-darkgrid'
if darkgrid_style not in plt.style.available:
    darkgrid_style = 'seaborn-darkgrid'
plt.style.use(darkgrid_style)

# Load the listings CSV (Colab-style absolute path).
dataset_path = '/content/real_estate_texas_500_2024.csv'
df = pd.read_csv(dataset_path)
df.head()

# Cleaning:
#   * missing list prices are imputed with the column mean,
#   * the 'baths_full_calc' column is discarded,
#   * rows without a 'text' value are removed (the text column is
#     vectorized further down, which cannot handle missing values).
df['listPrice'] = df['listPrice'].fillna(df['listPrice'].mean())
df.drop(columns=['baths_full_calc'], inplace=True)
df.dropna(subset=['text'], inplace=True)
df.info()
# Histogram of listing prices (30 bins) with a KDE curve overlaid.
plt.figure(figsize=(10, 6))
sns.histplot(data=df, x='listPrice', bins=30, kde=True)
plt.gca().set(
    title='Distribution of Listing Prices',
    xlabel='Listing Price ($)',
    ylabel='Frequency',
)
plt.show()

# Descriptive statistics of the listing price, laid out as a single row.
price_summary = df['listPrice'].describe()
price_summary_df = price_summary.to_frame().T
price_summary_df
# Horizontal bar chart: number of listings per property type.
# NOTE(review): seaborn >= 0.13 warns when `palette` is given without
# `hue`; once the minimum seaborn version is pinned, consider
# `hue='type', legend=False` as the seaborn docs suggest.
plt.figure(figsize=(10, 6))
sns.countplot(y='type', data=df, palette='Set2')
plt.title('Count of Property Types')
plt.xlabel('Count')
plt.ylabel('Property Type')
plt.show()

# The same counts as a two-column table. The column names are set
# explicitly via rename_axis/reset_index(name=...) so the result does not
# depend on how the installed pandas names value_counts() output.
# (This table was previously computed twice in a row; once is enough.)
type_counts = (
    df['type']
    .value_counts()
    .rename_axis('Property Type')
    .reset_index(name='Count')
)
type_counts
# Per construction year: mean listing price and mean square footage,
# rounded to two decimals and ordered by year.
#
# The earlier version assigned the sorted frame to a misspelled name
# ('yearly_summay'), silently discarding the sort, and then repeated the
# whole computation. A single pipeline replaces both copies.
yearly_summary = (
    df.groupby('year_built')
    .agg(
        Average_Listing_Price=('listPrice', 'mean'),
        Average_Square_Footage=('sqft', 'mean'),
    )
    .round(2)          # rounds only the two aggregates; the year is still the index here
    .reset_index()
    .sort_values(by='year_built')
)
yearly_summary
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
# Bag-of-words counts for the listing descriptions. English stop words are
# removed; min_df=2 and max_df=0.95 bound how rare / how common a term may
# be (by document frequency) to stay in the vocabulary.
vectorizer = CountVectorizer(stop_words='english', min_df=2, max_df=0.95)
dtm = vectorizer.fit_transform(df['text'])

# Five-topic LDA fitted on the document-term matrix; the seed is fixed so
# repeated runs give the same topics.
lda = LatentDirichletAllocation(n_components=5, random_state=42).fit(dtm)
def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted words of every topic in a fitted model.

    For each row of ``model.components_`` a ``Theme <n>:`` header is
    printed (numbered from 1), followed by the top words joined by spaces
    and a blank separator line.

    Args:
        model: Object exposing ``components_``, an iterable of per-topic
            weight arrays that support ``argsort()``.
        feature_names: Indexable mapping a weight position to its word.
        no_top_words: How many words to print per topic.
    """
    for topic_idx, topic in enumerate(model.components_):
        # Positions ordered by descending weight, truncated to the top N.
        top_indices = topic.argsort()[::-1][:no_top_words]
        print(f"Theme {topic_idx+1}:")
        print(" ".join(feature_names[i] for i in top_indices))
        print()
# Show the ten strongest words of each fitted LDA topic.
display_topics(
    model=lda,
    feature_names=vectorizer.get_feature_names_out(),
    no_top_words=10,
)
import matplotlib.pyplot as plt
import seaborn as sns
# Summary dashboard: a 2x2 grid of which three panels are used.
sns.set(style="whitegrid")

fig, axes = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle('Texas Real Estate Market Insights - 2024', fontsize=20)

# Top-left: distribution of listing prices with a KDE overlay.
sns.histplot(df['listPrice'], kde=True, ax=axes[0, 0], color='skyblue')
axes[0, 0].set_title('Distribution of Listing Prices')
axes[0, 0].set_xlabel('Listing Price ($)')
axes[0, 0].set_ylabel('Frequency')

# Top-right: mean listing price per property type, sorted ascending so the
# most expensive type ends up at the top of the horizontal bar chart.
avg_price_by_type = df.groupby('type')['listPrice'].mean().sort_values()
avg_price_by_type.plot(kind='barh', ax=axes[0, 1], color='lightgreen')
axes[0, 1].set_title('Average Listing Price by Property Type')
axes[0, 1].set_xlabel('Average Listing Price ($)')
axes[0, 1].set_ylabel('Property Type')

# Bottom-left: number of listings per construction year (line plot).
properties_by_year = df.groupby('year_built').size()
properties_by_year.plot(ax=axes[1, 0], color='salmon')
axes[1, 0].set_title('Count of Properties by Year Built')
axes[1, 0].set_xlabel('Year Built')
axes[1, 0].set_ylabel('Count')

# Remove the unused bottom-right axis BEFORE computing the layout so its
# empty tick labels do not influence the spacing; the rect leaves room at
# the top for the figure-level title.
fig.delaxes(axes[1, 1])
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()