antitheft159 committed on
Commit
8b28677
·
verified ·
1 Parent(s): e6d9ad9

Upload 903_159_651_252.py

Browse files
Files changed (1) hide show
  1. 903_159_651_252.py +132 -0
903_159_651_252.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """903.159.651.252
3
+
4
+ Automatically generated by Colab.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1UBJL9vF_K8ZO_vRZkvTES3G8LBRGjzGP
8
+ """
9
+
10
+ import pandas as pd
11
+ import numpy as np
12
+ import matplotlib.pyplot as plt
13
+ import seaborn as sns
14
+
15
+ plt.style.use('seaborn-darkgrid')
16
+
17
+ dataset_path = '/content/real_estate_texas_500_2024.csv'
18
+ df = pd.read_csv(dataset_path)
19
+
20
+ df.head()
21
+
22
+ # Fill missing listing prices with the mean of the column.
+ df['listPrice'] = df['listPrice'].fillna(df['listPrice'].mean())
23
+
24
+ # Remove the 'baths_full_calc' column from the frame in place.
+ df.drop(columns=['baths_full_calc'], inplace=True)
25
+
26
+ # Drop rows with no 'text' value; that column is vectorized further down.
+ df.dropna(subset=['text'], inplace=True)
27
+
28
+ df.info()
29
+
30
+ plt.figure(figsize=(10, 6))
31
+ sns.histplot(df['listPrice'], bins=30, kde=True)
32
+ plt.title('Distribution of Listing Prices')
33
+ plt.xlabel('Listing Price ($)')
34
+ plt.ylabel('Frequency')
35
+ plt.show()
36
+
37
+ price_summary = df['listPrice'].describe()
38
+
39
+ price_summary_df = pd.DataFrame(price_summary).transpose()
40
+
41
+ price_summary_df
42
+
43
+ plt.figure(figsize=(10, 6))
44
+ sns.countplot(y = 'type', data=df, palette='Set2')
45
+ plt.title('Count of Property Types')
46
+ plt.xlabel('Count')
47
+ plt.ylabel('Property Type')
48
+ plt.show()
49
+
50
+ type_counts = df['type'].value_counts().reset_index()
51
+
52
+ type_counts.columns = ['Property Type', 'Count']
53
+
54
+ type_counts
55
+
56
+ type_counts = df['type'].value_counts().reset_index()
57
+
58
+ type_counts.columns = ['Property Type', 'Count']
59
+
60
+ type_counts
61
+
62
+ yearly_summary = df.groupby('year_built').agg(
63
+ Average_Listing_Price=('listPrice', 'mean'),
64
+ Average_Square_Footage=('sqft', 'mean')
65
+ ).reset_index()
66
+
67
+ yearly_summary['Average_Listing_Price'] = yearly_summary['Average_Listing_Price'].round(2)
68
+ yearly_summary['Average_Square_Footage'] = yearly_summary['Average_Square_Footage'].round(2)
69
+
70
+ yearly_summay = yearly_summary.sort_values(by='year_built')
71
+
72
+ yearly_summary
73
+
74
+ yearly_summary = df.groupby('year_built').agg(
75
+ Average_Listing_Price=('listPrice', 'mean'),
76
+ Average_Square_Footage=('sqft', 'mean')
77
+ ).reset_index()
78
+
79
+ yearly_summary['Average_Listing_Price'] = yearly_summary['Average_Listing_Price'].round(2)
80
+ yearly_summary['Average_Square_Footage'] = yearly_summary['Average_Square_Footage'].round(2)
81
+
82
+ yearly_summary = yearly_summary.sort_values(by='year_built')
83
+
84
+ yearly_summary
85
+
86
+ from sklearn.feature_extraction.text import CountVectorizer
87
+ from sklearn.decomposition import LatentDirichletAllocation
88
+
89
+ # Bag-of-words counts of the 'text' column: English stop words are removed, and
+ # terms found in more than 95% of rows or in fewer than 2 rows are ignored.
+ vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
90
+ dtm = vectorizer.fit_transform(df['text'])
91
+
92
+ # Fit a 5-topic LDA model on the document-term matrix; random_state is fixed
+ # so repeated runs give the same result.
+ lda = LatentDirichletAllocation(n_components=5, random_state=42)
93
+ lda.fit(dtm)
94
+
95
+ def display_topics(model, feature_names, no_top_words):
96
+ for topic_idx, topic in enumerate(model.components_):
97
+ print(f"Theme {topic_idx+1}:")
98
+ print(" ".join([feature_names[i] for i in topic.argsort()[:-no_top_words -1:-1]]))
99
+ print()
100
+
101
+ display_topics(lda, vectorizer.get_feature_names_out(), 10)
102
+
103
+ import matplotlib.pyplot as plt
104
+ import seaborn as sns
105
+
106
+ sns.set(style="whitegrid")
107
+
108
+ fig, axes = plt.subplots(2, 2, figsize=(16, 12))
109
+ fig.suptitle('Texas Real Estate Market Insights - 2024', fontsize=20)
110
+
111
+ sns.histplot(df['listPrice'], kde=True, ax=axes[0,0], color='skyblue')
112
+ axes[0,0].set_title('Distribution of Listing Prices')
113
+ axes[0, 0].set_xlabel('Listing Price ($)')
114
+ axes[0, 0].set_ylabel('Frequency')
115
+
116
+ avg_price_by_type = df.groupby('type')['listPrice'].mean().sort_values()
117
+ avg_price_by_type.plot(kind='barh', ax=axes[0,1], color='lightgreen')
118
+ axes[0, 1].set_title('Average Listing Price by Property Type')
119
+ axes[0, 1].set_xlabel('Average Listing Price ($)')
120
+ axes[0, 1].set_ylabel('Property Type')
121
+
122
+ properties_by_year = df.groupby('year_built').size()
123
+ properties_by_year.plot(ax=axes[1, 0], color='salmon')
124
+ axes[1, 0].set_title('Count of Properties by Year Built')
125
+ axes[1, 0].set_xlabel('Year Built')
126
+ axes[1, 0].set_ylabel('Count')
127
+
128
+ # Fit the subplots inside the given rectangle (figure fractions); the top edge
+ # at 0.95 presumably leaves room for the suptitle set above.
+ plt.tight_layout(rect=[0, 0.03, 1, 0.95])
129
+
130
+ # Only three of the 2x2 axes are drawn on; remove the unused bottom-right one.
+ fig.delaxes(axes[1,1])
131
+
132
+ plt.show()