Actividad: Regresión Lineal
El objetivo es aplicar una técnica de regresión lineal al valor medio de un hogar según su ubicación en los distritos de California.
Dataset: https://scikit-learn.org/stable/datasets/real_world.html#california-housing-dataset
Regresión lineal: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html
Como hemos comentario anteriormente, el proceso de aprendizaje automático consta de una serie de pasos.
Preparación de datos: carga y limpieza de datos
Selección de atributos o métricas adecuadas
Selección de la técnica a aplicar: LinearRegression
Ajuste de los hiperparámetros
Evaluación del modelo
[ ]:
# 0. Al inicio, incluímos las librerías pertinentes
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression
import pandas as pd
[ ]:
# 1º. Cargamos los datos
data = fetch_california_housing(as_frame=True)
print(data.feature_names)
print("-"*100)
df = data.frame #obtenemos el dataframe
print(df.head())
[ ]:
df.describe()
Attribute information:
MedInc median income in block group
HouseAge median house age in block group
AveRooms average number of rooms per household
AveBedrms average number of bedrooms per household
Population block group population
AveOccup average number of household members
Latitude block group latitude
Longitude block group longitude
The target variable is the median house value (MedHouseVal) for California districts, expressed in hundreds of thousands of dollars ($100,000).
[ ]:
# Vuestro turno...
Isaac Lera and Gabriel Moya Universitat de les Illes Balears isaac.lera@uib.edu, gabriel.moya@uib.edu
[ ]:
#TODO Anexo - We include a new feature: BeachDistance
# This part of the script needs more libraries since we get the coastline data from a WFS service.
# Required dependencies:
# uv add owslib shapely requests
# pip install owslib shapely requests
## We load the dataset and after, we save the file in a new version of the dataset.
## This process requires a lot of time to run.
## So, we cannot launch it every time we want to run the script.!!!!!!
## The process call to externa services, so maybe we do not obtain the data...
## And it is more simple using other tools like QGIS or ArcGIS!!!
# Note: this is a simplified version of the coastline, we can avoid the external services
california_coastline_points = [
# Northern California
(-124.4, 42.0), (-124.3, 41.8), (-124.2, 41.5), (-124.1, 41.2),
(-124.0, 41.0), (-123.9, 40.8), (-123.8, 40.6), (-123.7, 40.4),
(-123.6, 40.2), (-123.5, 40.0), (-123.4, 39.8), (-123.3, 39.6),
(-123.2, 39.4), (-123.1, 39.2), (-123.0, 39.0), (-122.9, 38.8),
(-122.8, 38.6), (-122.7, 38.4), (-122.6, 38.2), (-122.5, 38.0),
# Central California (San Francisco Bay area)
(-122.4, 37.8), (-122.3, 37.6), (-122.2, 37.4), (-122.1, 37.2),
(-122.0, 37.0), (-121.9, 36.8), (-121.8, 36.6), (-121.7, 36.4),
# Central Coast
(-121.6, 36.2), (-121.5, 36.0), (-121.4, 35.8), (-121.3, 35.6),
(-121.2, 35.4), (-121.1, 35.2), (-121.0, 35.0), (-120.9, 34.8),
(-120.8, 34.6), (-120.7, 34.4), (-120.6, 34.2), (-120.5, 34.0),
# Southern California1
(-120.4, 33.8), (-120.3, 33.6), (-120.2, 33.4), (-120.1, 33.2),
(-120.0, 33.0), (-119.9, 32.8), (-119.8, 32.6), (-119.7, 32.4),
(-119.6, 32.2), (-119.5, 32.0), (-119.4, 31.8), (-119.3, 31.6),
(-119.2, 31.4), (-119.1, 31.2), (-119.0, 31.0), (-118.9, 30.8),
(-118.8, 30.6), (-118.7, 30.4), (-118.6, 30.2), (-118.5, 30.0),
]
geometry_california_coastline = [LineString(california_coastline_points)]
# FIRST: We declare the functions
from owslib.wfs import WebFeatureService
from shapely.geometry import Point, LineString, MultiLineString
from shapely.ops import nearest_points
import requests
import json
from typing import Optional
import sys
import pandas as pd
def fetch_coastline_arcgis(service_url: str) -> Optional[list]:
try:
# Query the ArcGIS service
query_url = f"{service_url}/query"
params = {
'where': '1=1',
'outFields': '*',
'f': 'geojson',
'outSR': '4326'
}
response = requests.get(query_url, params=params, timeout=30)
response.raise_for_status()
geojson_data = response.json()
# Convert to Shapely geometries
geometries = []
for feature in geojson_data.get('features', []):
geom = feature['geometry']
if geom['type'] == 'LineString':
coords = geom['coordinates']
geometries.append(LineString(coords))
elif geom['type'] == 'MultiLineString':
coords = geom['coordinates']
geometries.append(MultiLineString(coords))
elif geom['type'] == 'Polyline':
# ArcGIS Polyline format
paths = geom.get('paths', [])
for path in paths:
geometries.append(LineString(path))
return geometries
except Exception as e:
print(f"Error fetching from ArcGIS service: {e}")
return None
def get_california_coastline() -> list:
arcgis_services = [
'https://gis.water.ca.gov/arcgis/rest/services/Boundaries/i03_WestCoastShoreline/FeatureServer/0',
'https://gis.cnra.ca.gov/arcgis/rest/services/Ocean/CSMW_Coastal_Conditions/MapServer/0',
]
for service_url in arcgis_services:
coastline = fetch_coastline_arcgis(service_url)
if coastline:
print(f"Successfully fetched coastline from: {service_url}")
return coastline
print("Warning: Could not fetch coastline from services. ")
sys.exit(1)
def haversine_distance(lat1: float, lon1: float, lat2: float, lon2: float) -> float:
"""
Calculate the great circle distance between two points on Earth.
Returns distance in kilometers.
Args:
lat1, lon1: Latitude and longitude of first point
lat2, lon2: Latitude and longitude of second point
Returns:
Distance in kilometers
"""
from math import radians, sin, cos, sqrt, atan2
# Earth radius in kilometers
R = 6371.0
# Convert to radians
lat1_rad = radians(lat1)
lon1_rad = radians(lon1)
lat2_rad = radians(lat2)
lon2_rad = radians(lon2)
# Haversine formula
dlat = lat2_rad - lat1_rad
dlon = lon2_rad - lon1_rad
a = sin(dlat / 2)**2 + cos(lat1_rad) * cos(lat2_rad) * sin(dlon / 2)**2
c = 2 * atan2(sqrt(a), sqrt(1 - a))
distance = R * c
return distance
def calculate_distance_to_coastline(lat: float, lon: float, coastline_geoms: list) -> float:
point = Point(lon, lat) # Note: Shapely uses (x, y) = (lon, lat)
min_distance = float('inf')
for geom in coastline_geoms:
# Find the nearest point on the coastline
nearest_geom_point, nearest_coast_point = nearest_points(point, geom)
# Get coordinates of nearest point on coastline
coast_lon, coast_lat = nearest_coast_point.coords[0]
# Calculate distance using Haversine formula for accuracy
distance_km = haversine_distance(lat, lon, coast_lat, coast_lon)
min_distance = min(min_distance, distance_km)
return min_distance
def add_beach_distance_feature(df: pd.DataFrame, coastline_geoms: list) -> pd.DataFrame:
print("Calculating distances to coastline...")
distances = []
for idx, row in df.iterrows():
lat = row['Latitude']
lon = row['Longitude']
distance = calculate_distance_to_coastline(lat, lon, coastline_geoms)
distances.append(distance)
if (idx + 1) % 1000 == 0:
print(f"Processed {idx + 1}/{len(df)} houses...")
df['BeachDistance'] = distances
print(f"Completed! Added 'BeachDistance' feature to {len(df)} houses.")
return df
[ ]:
# SECOND: We load the dataset and call the functions
## Require a lot of time to run...
### DO NOT RUN THIS CELL WITHOUT THINKING about the time it requires to run...
## If we call often to external services, we can get banned by the server...
from sklearn.datasets import fetch_california_housing
data = fetch_california_housing(as_frame=True)
df = data.frame
[ ]:
coastline_geometries = get_california_coastline() #or geometry_california_coastline
[ ]:
df = add_beach_distance_feature(df, coastline_geometries)
[ ]:
print("\nBeach Distance Statistics:")
print(df['BeachDistance'].describe())
print("-" * 100)
print(df[['BeachDistance', 'MedHouseVal']].head(10))
[ ]:
df.to_csv("data/california_housing_with_beach_distance.csv", index=False)