_594.159.447.252 / 903_159_651_252.py

Update 903_159_651_252.py

279cf8d verified about 1 year ago

3.6 kB

	import pandas as pd
	import numpy as np
	import matplotlib.pyplot as plt
	import seaborn as sns

	# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
	# 3.8 removed the old names, so prefer the new name and fall back to the
	# legacy one only when running on an older Matplotlib.
	darkgrid_style = 'seaborn-v0_8-darkgrid'
	if darkgrid_style not in plt.style.available:
	    darkgrid_style = 'seaborn-darkgrid'
	plt.style.use(darkgrid_style)

	# Load the listings CSV into a DataFrame.
	dataset_path = '/content/real_estate_texas_500_2024.csv'
	df = pd.read_csv(dataset_path)

	# Notebook-style preview of the first rows (a no-op when run as a script).
	df.head()

	# Fill missing listing prices with the column mean.
	df['listPrice'] = df['listPrice'].fillna(df['listPrice'].mean())

	# Remove the 'baths_full_calc' column entirely.
	df.drop(columns=['baths_full_calc'], inplace=True)

	# Drop rows with no 'text' value; that column feeds the topic model later.
	df.dropna(subset=['text'], inplace=True)

	# Print column dtypes and non-null counts after cleaning.
	df.info()

	# Histogram of listing prices (30 bins) with a KDE curve overlaid.
	price_fig, price_hist_ax = plt.subplots(figsize=(10, 6))
	sns.histplot(df['listPrice'], bins=30, kde=True, ax=price_hist_ax)
	price_hist_ax.set_title('Distribution of Listing Prices')
	price_hist_ax.set_xlabel('Listing Price ($)')
	price_hist_ax.set_ylabel('Frequency')
	plt.show()

	# Descriptive statistics for listing price, laid out as a single-row table.
	price_summary = df['listPrice'].describe()
	price_summary_df = price_summary.to_frame().T

	# Notebook-style display of the summary table.
	price_summary_df

	# Horizontal bar chart: number of listings for each value of 'type'.
	type_fig, type_count_ax = plt.subplots(figsize=(10, 6))
	sns.countplot(data=df, y='type', palette='Set2', ax=type_count_ax)
	type_count_ax.set_title('Count of Property Types')
	type_count_ax.set_xlabel('Count')
	type_count_ax.set_ylabel('Property Type')
	plt.show()

	# Tabulate how many listings fall under each property type. The original
	# repeated this cell verbatim; a single computation is sufficient.
	type_counts = df['type'].value_counts().reset_index()

	# Give the two columns readable headers regardless of the names pandas
	# assigns after reset_index().
	type_counts.columns = ['Property Type', 'Count']

	# Notebook-style display of the table.
	type_counts

	# Per-construction-year averages of listing price and square footage,
	# rounded to two decimals and ordered by year.
	#
	# The original ran this twice: the first pass assigned the sorted frame to
	# a misspelled name ('yearly_summay'), silently discarding the sort, and
	# the second pass repeated the whole computation with the name corrected.
	yearly_summary = (
	    df.groupby('year_built')
	    .agg(
	        Average_Listing_Price=('listPrice', 'mean'),
	        Average_Square_Footage=('sqft', 'mean'),
	    )
	    # Round while 'year_built' is still the index so only the two
	    # averaged columns are affected.
	    .round(2)
	    .reset_index()
	    .sort_values(by='year_built')
	)

	# Notebook-style display of the table.
	yearly_summary

	from sklearn.feature_extraction.text import CountVectorizer
	from sklearn.decomposition import LatentDirichletAllocation

	# Bag-of-words counts over the listing text, with English stop words
	# removed and the vocabulary bounded by min_df=2 and max_df=0.95.
	vectorizer = CountVectorizer(
	    stop_words='english',
	    min_df=2,
	    max_df=0.95,
	)
	dtm = vectorizer.fit_transform(df['text'])

	# Fit a five-topic LDA model on the document-term matrix; the fixed seed
	# keeps the topics reproducible between runs.
	lda = LatentDirichletAllocation(random_state=42, n_components=5).fit(dtm)

	def display_topics(model, feature_names, no_top_words):
	    """Print the highest-weighted terms of every topic in a fitted model.

	    Args:
	        model: Object exposing ``components_``, an iterable of per-topic
	            weight arrays (one weight per vocabulary term).
	        feature_names: Sequence mapping a term index to its string.
	        no_top_words: Number of top terms to print for each topic.

	    For each topic, prints a ``Theme <n>:`` header (1-based), the top terms
	    separated by spaces in descending weight order, then a blank line.
	    """
	    for topic_idx, topic in enumerate(model.components_):
	        # argsort() is ascending, so take the last `no_top_words` indices
	        # and walk them backwards to get the heaviest terms first.
	        top_indices = topic.argsort()[:-no_top_words - 1:-1]
	        print(f"Theme {topic_idx+1}:")
	        print(" ".join(feature_names[i] for i in top_indices))
	        print()

	# Print the ten highest-weighted vocabulary terms for each fitted LDA topic.
	display_topics(
	    model=lda,
	    feature_names=vectorizer.get_feature_names_out(),
	    no_top_words=10,
	)

	import matplotlib.pyplot as plt
	import seaborn as sns

	# Switch seaborn to its white-grid look for the summary dashboard.
	sns.set(style="whitegrid")

	# 2x2 canvas; three panels are populated and the fourth is removed below.
	fig, axes = plt.subplots(2, 2, figsize=(16, 12))
	fig.suptitle('Texas Real Estate Market Insights - 2024', fontsize=20)
	price_panel, type_panel = axes[0, 0], axes[0, 1]
	year_panel, spare_panel = axes[1, 0], axes[1, 1]

	# Top-left: listing price histogram with a KDE curve.
	sns.histplot(df['listPrice'], kde=True, ax=price_panel, color='skyblue')
	price_panel.set_title('Distribution of Listing Prices')
	price_panel.set_xlabel('Listing Price ($)')
	price_panel.set_ylabel('Frequency')

	# Top-right: mean listing price per property type, sorted ascending.
	avg_price_by_type = df.groupby('type')['listPrice'].mean().sort_values()
	avg_price_by_type.plot(kind='barh', ax=type_panel, color='lightgreen')
	type_panel.set_title('Average Listing Price by Property Type')
	type_panel.set_xlabel('Average Listing Price ($)')
	type_panel.set_ylabel('Property Type')

	# Bottom-left: number of listings per construction year, as a line plot.
	properties_by_year = df.groupby('year_built').size()
	properties_by_year.plot(ax=year_panel, color='salmon')
	year_panel.set_title('Count of Properties by Year Built')
	year_panel.set_xlabel('Year Built')
	year_panel.set_ylabel('Count')

	# Reserve the top 5% of the figure for the suptitle and a small bottom margin.
	plt.tight_layout(rect=[0, 0.03, 1, 0.95])

	# Remove the unused bottom-right axes so it does not render as an empty panel.
	fig.delaxes(spare_panel)

	plt.show()