import pandas as pd
# Read the housing CSV into a DataFrame and print its column/dtype summary.
CSV_PATH = "E:\\kaggle\\housing.csv"
data = pd.read_csv(CSV_PATH)
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 20640 entries, 0 to 20639 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 longitude 20640 non-null float64 1 latitude 20640 non-null float64 2 housing_median_age 20640 non-null float64 3 total_rooms 20640 non-null float64 4 total_bedrooms 20433 non-null float64 5 population 20640 non-null float64 6 households 20640 non-null float64 7 median_income 20640 non-null float64 8 median_house_value 20640 non-null float64 9 ocean_proximity 20640 non-null object dtypes: float64(9), object(1) memory usage: 1.6+ MB
# Report how many missing values each column contains, under a heading.
print("\nSum of Null Values in Each Column:")
null_counts = data.isnull().sum()
print(null_counts)
Sum of Null Values in Each Column: longitude 0 latitude 0 housing_median_age 0 total_rooms 0 total_bedrooms 207 population 0 households 0 median_income 0 median_house_value 0 ocean_proximity 0 dtype: int64
# Discard every row that has at least one missing value, then print the
# summary again so the reduced row count can be checked.
data = data.dropna(axis="index", how="any")
data.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 20433 entries, 0 to 20639 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 longitude 20433 non-null float64 1 latitude 20433 non-null float64 2 housing_median_age 20433 non-null float64 3 total_rooms 20433 non-null float64 4 total_bedrooms 20433 non-null float64 5 population 20433 non-null float64 6 households 20433 non-null float64 7 median_income 20433 non-null float64 8 median_house_value 20433 non-null float64 9 ocean_proximity 20433 non-null object dtypes: float64(9), object(1) memory usage: 1.7+ MB
# Check for fully duplicated rows and drop them.
# DataFrame.duplicated() and DataFrame.drop_duplicates() both return new
# objects and leave `data` untouched, so the count is printed explicitly and
# the de-duplicated frame is assigned back to `data`.
duplicate_count = data.duplicated().sum()
print(f"Duplicated rows found: {duplicate_count}")
data = data.drop_duplicates()
longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | median_house_value | ocean_proximity | |
---|---|---|---|---|---|---|---|---|---|---|
0 | -122.23 | 37.88 | 41.0 | 880.0 | 129.0 | 322.0 | 126.0 | 8.3252 | 452600.0 | NEAR BAY |
1 | -122.22 | 37.86 | 21.0 | 7099.0 | 1106.0 | 2401.0 | 1138.0 | 8.3014 | 358500.0 | NEAR BAY |
2 | -122.24 | 37.85 | 52.0 | 1467.0 | 190.0 | 496.0 | 177.0 | 7.2574 | 352100.0 | NEAR BAY |
3 | -122.25 | 37.85 | 52.0 | 1274.0 | 235.0 | 558.0 | 219.0 | 5.6431 | 341300.0 | NEAR BAY |
4 | -122.25 | 37.85 | 52.0 | 1627.0 | 280.0 | 565.0 | 259.0 | 3.8462 | 342200.0 | NEAR BAY |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
20635 | -121.09 | 39.48 | 25.0 | 1665.0 | 374.0 | 845.0 | 330.0 | 1.5603 | 78100.0 | INLAND |
20636 | -121.21 | 39.49 | 18.0 | 697.0 | 150.0 | 356.0 | 114.0 | 2.5568 | 77100.0 | INLAND |
20637 | -121.22 | 39.43 | 17.0 | 2254.0 | 485.0 | 1007.0 | 433.0 | 1.7000 | 92300.0 | INLAND |
20638 | -121.32 | 39.43 | 18.0 | 1860.0 | 409.0 | 741.0 | 349.0 | 1.8672 | 84700.0 | INLAND |
20639 | -121.24 | 39.37 | 16.0 | 2785.0 | 616.0 | 1387.0 | 530.0 | 2.3886 | 89400.0 | INLAND |
20433 rows × 10 columns
import plotly.express as px
# One fixed marker color per ocean_proximity category.
colors = {
    '<1H OCEAN': 'blue',
    'INLAND': 'red',
    'NEAR OCEAN': 'green',
    'NEAR BAY': 'orange',
    'ISLAND': 'purple',
}

# Interactive map: one point per row, colored by its ocean_proximity category.
fig = px.scatter_mapbox(
    data,
    lat='latitude',
    lon='longitude',
    color='ocean_proximity',
    color_discrete_map=colors,
    labels={'ocean_proximity': 'Ocean Proximity'},
    size_max=15,
    zoom=9,
    height=600,
)

# Use the carto-positron base map and remove all outer margins.
fig.update_layout(
    mapbox_style="carto-positron",
    margin=dict(r=0, t=0, l=0, b=0),
)

# Render the figure.
fig.show()
# Interactive map: one point per row, colored on a continuous 'plasma' scale
# by median_income.
fig = px.scatter_mapbox(
    data,
    lat='latitude',
    lon='longitude',
    color='median_income',
    color_continuous_scale='plasma',
    labels={'median_income': 'Median Income'},
    size_max=15,
    zoom=9,
    height=600,
)

# Use the carto-positron base map and remove all outer margins.
fig.update_layout(
    mapbox_style="carto-positron",
    margin=dict(r=0, t=0, l=0, b=0),
)

# Render the figure.
fig.show()
import plotly.express as px
# Custom continuous color scale: green at the low end, yellow at 20/50 (= 0.4)
# of the color range, red at the high end.
# NOTE(review): the 20/50 stop presumably targets an age of about 20 on a
# 0-50 range; confirm against the actual housing_median_age range.
age_color_scale = [[0, 'green'], [20/50, 'yellow'], [1, 'red']]

# Interactive map: one point per row, colored by housing_median_age.
fig = px.scatter_mapbox(data, lat='latitude', lon='longitude',
                        color='housing_median_age',
                        color_continuous_scale=age_color_scale,
                        size_max=15, zoom=10)

# Base map style, zoom level and zero margins.
fig.update_layout(
    mapbox_style='carto-positron',
    mapbox_zoom=10,
    margin={'t': 0, 'r': 0, 'l': 0, 'b': 0},
)

# Render the figure.
fig.show()
import matplotlib.pyplot as plt
import seaborn as sns
# Derive the 'population_density' column as population divided by households.
# NOTE(review): this ratio is people per household (average household size),
# not a density per unit area. The column name is kept so any later code that
# reads 'population_density' keeps working; consider renaming it.
data['population_density'] = data['population'] / data['households']

# Pairwise correlations between the derived ratio and the housing-size columns.
heatmap_columns = ['population_density', 'total_rooms', 'total_bedrooms', 'households']
correlation_matrix = data[heatmap_columns].corr()

# Draw the correlation matrix as an annotated heatmap.
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=.5)
# Title spelling fixed: the original read "Characterstics".
plt.title('Correlation Heatmap: Population Density vs Housing Characteristics')
plt.show()
# One marker color per panel, in panel order.
colors = ['blue', 'green', 'orange', 'red']

# 2x2 grid of scatter plots: median income (x) against one housing feature (y).
fig, axs = plt.subplots(2, 2, figsize=(12, 8))

# (column, display label) for each panel. axs.flat walks the grid row by row,
# so the order is top-left, top-right, bottom-left, bottom-right.
panels = [
    ('total_rooms', 'Total Rooms'),
    ('total_bedrooms', 'Total Bedrooms'),
    ('households', 'Households'),
    ('population', 'Population'),
]

for ax, (column, label), color in zip(axs.flat, panels, colors):
    ax.scatter(data['median_income'], data[column], alpha=0.5, color=color)
    ax.set_title(f'{label} vs Median Income')
    ax.set_xlabel('Median Income')
    ax.set_ylabel(label)

plt.tight_layout()
plt.show()