# _594.159.447.252 / 903_159_651_252.py
# antitheft159's picture
# Update 903_159_651_252.py
# 279cf8d verified
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old aliases, so pick whichever name this
# installation actually provides instead of hard-coding the legacy one.
darkgrid_style = 'seaborn-v0_8-darkgrid'
if darkgrid_style not in plt.style.available:
    darkgrid_style = 'seaborn-darkgrid'  # name used before Matplotlib 3.6
plt.style.use(darkgrid_style)

# Load the listings table.
dataset_path = '/content/real_estate_texas_500_2024.csv'
df = pd.read_csv(dataset_path)
df.head()

# Cleaning: fill missing list prices with the column mean, drop the
# 'baths_full_calc' column, and discard rows that have no 'text' value
# (the 'text' column is vectorized further down and must not contain NaN).
df['listPrice'] = df['listPrice'].fillna(df['listPrice'].mean())
df.drop(columns=['baths_full_calc'], inplace=True)
df.dropna(subset=['text'], inplace=True)
df.info()
# Histogram (30 bins, with KDE overlay) of the listing prices.
plt.figure(figsize=(10, 6))
price_ax = sns.histplot(df['listPrice'], bins=30, kde=True)
price_ax.set_title('Distribution of Listing Prices')
price_ax.set_xlabel('Listing Price ($)')
price_ax.set_ylabel('Frequency')
plt.show()

# Descriptive statistics of the listing price, laid out as a single row.
price_summary = df['listPrice'].describe()
price_summary_df = price_summary.to_frame().T
price_summary_df
# Horizontal bar chart: number of listings per property type.
plt.figure(figsize=(10, 6))
sns.countplot(y='type', data=df, palette='Set2')
plt.title('Count of Property Types')
plt.xlabel('Count')
plt.ylabel('Property Type')
plt.show()

# The same counts as a two-column table. Naming the axis and the value column
# explicitly avoids relying on the default column names produced by
# value_counts().reset_index(), which differ between pandas versions.
# (The original computed this table twice in a row; once is enough.)
type_counts = (
    df['type']
    .value_counts()
    .rename_axis('Property Type')
    .reset_index(name='Count')
)
type_counts
# Per construction year: mean listing price and mean square footage, rounded
# to two decimals and ordered by year.
# The original first stored the sorted frame under a misspelled name
# ('yearly_summay'), silently discarding the sort, and then repeated the whole
# computation; a single pipeline assigned to the correct name replaces both.
yearly_summary = (
    df.groupby('year_built')
    .agg(
        Average_Listing_Price=('listPrice', 'mean'),
        Average_Square_Footage=('sqft', 'mean'),
    )
    .round(2)  # applied before reset_index so only the two averages are rounded
    .reset_index()
    .sort_values(by='year_built')
)
yearly_summary
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
# Bag-of-words term counts for the listing descriptions: English stop words
# are removed, as are terms appearing in fewer than 2 documents or in more
# than 95% of them.
vectorizer = CountVectorizer(stop_words='english', min_df=2, max_df=0.95)
dtm = vectorizer.fit_transform(df['text'])

# Five-component LDA with a fixed seed; fit() returns the fitted estimator.
lda = LatentDirichletAllocation(n_components=5, random_state=42).fit(dtm)
def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    For each row of ``model.components_`` a ``Theme <k>:`` header is printed
    (numbered from 1), followed by the ``no_top_words`` terms with the largest
    weights in descending order, separated by spaces, and then a blank line.

    Args:
        model: Object exposing ``components_``, an iterable of per-topic
            weight arrays that support ``argsort()``.
        feature_names: Sequence of term strings, indexable by the positions
            returned by ``argsort()``.
        no_top_words: Number of terms to print per topic.
    """
    for topic_idx, topic in enumerate(model.components_):
        # argsort() is ascending; walk it backwards from the end to take the
        # largest weights first.
        top_indices = topic.argsort()[:-no_top_words - 1:-1]
        print(f"Theme {topic_idx+1}:")
        print(" ".join(feature_names[i] for i in top_indices))
        print()
# Print the ten highest-weighted vocabulary terms of each fitted LDA topic.
display_topics(
    model=lda,
    feature_names=vectorizer.get_feature_names_out(),
    no_top_words=10,
)
import matplotlib.pyplot as plt
import seaborn as sns
# Dashboard on a 2x2 grid; only three panels are used and the fourth is
# removed just before the figure is shown.
sns.set(style="whitegrid")
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle('Texas Real Estate Market Insights - 2024', fontsize=20)

price_ax, type_ax = axes[0]
year_ax, unused_ax = axes[1]

# Top-left: listing price histogram with KDE overlay.
sns.histplot(df['listPrice'], kde=True, ax=price_ax, color='skyblue')
price_ax.set(
    title='Distribution of Listing Prices',
    xlabel='Listing Price ($)',
    ylabel='Frequency',
)

# Top-right: mean listing price per property type, sorted ascending.
avg_price_by_type = df.groupby('type')['listPrice'].mean().sort_values()
avg_price_by_type.plot(kind='barh', ax=type_ax, color='lightgreen')
type_ax.set(
    title='Average Listing Price by Property Type',
    xlabel='Average Listing Price ($)',
    ylabel='Property Type',
)

# Bottom-left: number of listings per construction year.
properties_by_year = df.groupby('year_built').size()
properties_by_year.plot(ax=year_ax, color='salmon')
year_ax.set(
    title='Count of Properties by Year Built',
    xlabel='Year Built',
    ylabel='Count',
)

# The rect leaves headroom at the top for the suptitle.
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
fig.delaxes(unused_ax)
plt.show()