Actividad: Regresión Lineal

El objetivo es aplicar una técnica de regresión lineal al valor medio de un hogar según su ubicación en los distritos de California.

Como hemos comentado anteriormente, el proceso de aprendizaje automático consta de una serie de pasos.

  • Preparación de datos: carga y limpieza de datos

  • Selección de atributos o métricas adecuadas

  • Selección de la técnica a aplicar: LinearRegression

  • Ajuste de los hiperparámetros

  • Evaluación del modelo

[ ]:
# 0. First, we import the required libraries

from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression
import pandas as pd

[ ]:
# Step 1. Load the data

# as_frame=True asks the loader for pandas objects, which is what exposes
# the ready-made DataFrame through the ``frame`` attribute used below.
data = fetch_california_housing(as_frame=True)
df = data.frame  # the dataset as a single DataFrame

# Show the feature names, a separator line, and the first rows of the table.
print(data.feature_names, "-" * 100, sep="\n")
print(df.head())
[ ]:
# Display summary statistics for every column of the DataFrame.
df.describe()

Attribute information:

  • MedInc median income in block group

  • HouseAge median house age in block group

  • AveRooms average number of rooms per household

  • AveBedrms average number of bedrooms per household

  • Population block group population

  • AveOccup average number of household members

  • Latitude block group latitude

  • Longitude block group longitude

The target variable is the median house value (MedHouseVal) for California districts, expressed in hundreds of thousands of dollars ($100,000).

[ ]:
# Vuestro turno...

License: CC BY 4.0 Isaac Lera and Gabriel Moya Universitat de les Illes Balears isaac.lera@uib.edu, gabriel.moya@uib.edu

[ ]:
# TODO Appendix - We include a new feature: BeachDistance

# This part of the script needs more libraries because the coastline data is
# downloaded from external web services.
# Required dependencies:
#   uv add owslib shapely requests
#   pip install owslib shapely requests

## We load the dataset and afterwards save the result as a new version of the dataset.
## This process takes a long time to run,
## so we cannot launch it every time we want to run the script.

## The process calls external services, so we may not always obtain the data...
## It is also simpler to do with other tools such as QGIS or ArcGIS.
# Note: this is a simplified, hard-coded version of the coastline that lets us
# avoid the external services.
# Each point is a (longitude, latitude) pair, listed from north to south.
# NOTE(review): from (-124.0, 41.0) onward every point is exactly +0.1 longitude
# and -0.2 latitude from the previous one, i.e. a straight line ending at
# latitude 30.0 -- presumably only a rough approximation of the real coast;
# confirm before relying on distances computed from it.
california_coastline_points = [
    # Northern California
    (-124.4, 42.0), (-124.3, 41.8), (-124.2, 41.5), (-124.1, 41.2),
    (-124.0, 41.0), (-123.9, 40.8), (-123.8, 40.6), (-123.7, 40.4),
    (-123.6, 40.2), (-123.5, 40.0), (-123.4, 39.8), (-123.3, 39.6),
    (-123.2, 39.4), (-123.1, 39.2), (-123.0, 39.0), (-122.9, 38.8),
    (-122.8, 38.6), (-122.7, 38.4), (-122.6, 38.2), (-122.5, 38.0),
    # Central California (San Francisco Bay area)
    (-122.4, 37.8), (-122.3, 37.6), (-122.2, 37.4), (-122.1, 37.2),
    (-122.0, 37.0), (-121.9, 36.8), (-121.8, 36.6), (-121.7, 36.4),
    # Central Coast
    (-121.6, 36.2), (-121.5, 36.0), (-121.4, 35.8), (-121.3, 35.6),
    (-121.2, 35.4), (-121.1, 35.2), (-121.0, 35.0), (-120.9, 34.8),
    (-120.8, 34.6), (-120.7, 34.4), (-120.6, 34.2), (-120.5, 34.0),
    # Southern California
    (-120.4, 33.8), (-120.3, 33.6), (-120.2, 33.4), (-120.1, 33.2),
    (-120.0, 33.0), (-119.9, 32.8), (-119.8, 32.6), (-119.7, 32.4),
    (-119.6, 32.2), (-119.5, 32.0), (-119.4, 31.8), (-119.3, 31.6),
    (-119.2, 31.4), (-119.1, 31.2), (-119.0, 31.0), (-118.9, 30.8),
    (-118.8, 30.6), (-118.7, 30.4), (-118.6, 30.2), (-118.5, 30.0),
]

# LineString is imported right here because this statement runs before the
# cell's main import block; without it the line raises NameError.
from shapely.geometry import LineString

# Offline alternative to the downloaded coastline: a one-element list so it has
# the same "list of geometries" shape that the distance functions iterate over.
geometry_california_coastline = [LineString(california_coastline_points)]

# FIRST: We declare the functions
from owslib.wfs import WebFeatureService
from shapely.geometry import Point, LineString, MultiLineString
from shapely.ops import nearest_points
import requests
import json
from typing import Optional
import sys
import pandas as pd
def fetch_coastline_arcgis(service_url: str) -> Optional[list]:
    """
    Download the line features of an ArcGIS REST layer as Shapely geometries.

    Sends one ``<service_url>/query`` request asking for every feature
    (``where 1=1``) as GeoJSON with output spatial reference 4326, then
    converts each line feature into a Shapely object.

    Args:
        service_url: URL of the layer, without the trailing ``/query``.

    Returns:
        A list of ``LineString`` / ``MultiLineString`` objects (possibly
        empty), or ``None`` if the request or the conversion failed. The
        error is printed, not raised, so the caller can try another service.
    """
    params = {
        'where': '1=1',
        'outFields': '*',
        'f': 'geojson',
        'outSR': '4326'
    }

    try:
        # Query the ArcGIS service
        query_url = f"{service_url}/query"
        response = requests.get(query_url, params=params, timeout=30)
        response.raise_for_status()

        geojson_data = response.json()

        # Convert to Shapely geometries
        geometries = []
        for feature in geojson_data.get('features', []):
            geom = feature.get('geometry')
            if not geom:
                # A feature may carry a null/missing geometry. Skip it instead
                # of letting one bad feature discard everything parsed so far.
                continue

            geom_type = geom.get('type')
            if geom_type == 'LineString':
                geometries.append(LineString(geom['coordinates']))
            elif geom_type == 'MultiLineString':
                geometries.append(MultiLineString(geom['coordinates']))
            elif geom_type == 'Polyline':
                # ArcGIS Polyline format: one LineString per path
                for path in geom.get('paths', []):
                    geometries.append(LineString(path))

        return geometries
    except Exception as e:
        # Deliberately broad: this is a best-effort fetch and any failure
        # (network, HTTP status, JSON decoding, geometry construction) is
        # reported as "no data from this service".
        print(f"Error fetching from ArcGIS service: {e}")
        return None

def get_california_coastline() -> list:
    """
    Return the coastline geometries from the first ArcGIS service that answers.

    The candidate services are tried in order; the first non-empty result is
    returned. If none of them yields data, a warning is printed and the
    process is terminated through ``sys.exit(1)``.

    Returns:
        The list of geometries produced by ``fetch_coastline_arcgis``.
    """
    candidate_services = (
        'https://gis.water.ca.gov/arcgis/rest/services/Boundaries/i03_WestCoastShoreline/FeatureServer/0',
        'https://gis.cnra.ca.gov/arcgis/rest/services/Ocean/CSMW_Coastal_Conditions/MapServer/0',
    )

    for service_url in candidate_services:
        coastline = fetch_coastline_arcgis(service_url)
        if not coastline:
            # None (error) or an empty list: move on to the next service.
            continue
        print(f"Successfully fetched coastline from: {service_url}")
        return coastline

    # Every service failed.
    print("Warning: Could not fetch coastline from services. ")
    sys.exit(1)

def haversine_distance(lat1: float, lon1: float, lat2: float, lon2: float) -> float:
    """
    Calculate the great circle distance between two points on Earth.

    Uses the haversine formula on a sphere of radius 6371 km.

    Args:
        lat1, lon1: Latitude and longitude of the first point, in degrees.
        lat2, lon2: Latitude and longitude of the second point, in degrees.

    Returns:
        Distance in kilometers.
    """
    from math import radians, sin, cos, sqrt, atan2

    # Earth radius in kilometers
    R = 6371.0

    # Convert to radians
    lat1_rad = radians(lat1)
    lon1_rad = radians(lon1)
    lat2_rad = radians(lat2)
    lon2_rad = radians(lon2)

    # Haversine formula
    dlat = lat2_rad - lat1_rad
    dlon = lon2_rad - lon1_rad

    a = sin(dlat / 2)**2 + cos(lat1_rad) * cos(lat2_rad) * sin(dlon / 2)**2
    # Floating-point rounding can leave ``a`` a hair above 1 for antipodal
    # points, which would make sqrt(1 - a) raise ValueError. The explicit
    # comparison (rather than min()) keeps NaN inputs propagating as NaN.
    if a > 1.0:
        a = 1.0
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    return R * c

def calculate_distance_to_coastline(lat: float, lon: float, coastline_geoms: list) -> float:
    """
    Return the distance in kilometers from a location to the closest coastline.

    For each geometry, the closest point on it is located with Shapely's
    ``nearest_points`` and the distance to that point is measured with
    ``haversine_distance``. The smallest of those distances is returned.

    Args:
        lat: Latitude of the location.
        lon: Longitude of the location.
        coastline_geoms: Shapely geometries describing the coastline.

    Returns:
        The minimum distance in kilometers, or ``inf`` when
        ``coastline_geoms`` is empty.
    """
    # Shapely works in (x, y) order, i.e. (lon, lat).
    location = Point(lon, lat)

    def _distance_to(geom) -> float:
        # nearest_points returns one point per argument; the second one is
        # the point that lies on the coastline geometry.
        closest_on_coast = nearest_points(location, geom)[1]
        coast_lon, coast_lat = closest_on_coast.coords[0]
        return haversine_distance(lat, lon, coast_lat, coast_lon)

    per_geometry = [_distance_to(geom) for geom in coastline_geoms]

    # Seeding with infinity gives the "no geometries" result and keeps the
    # comparison order of a running minimum that starts at infinity.
    return min([float('inf'), *per_geometry])

def add_beach_distance_feature(df: pd.DataFrame, coastline_geoms: list) -> pd.DataFrame:
    """
    Add a 'BeachDistance' column with each row's distance to the coastline.

    The distance is computed row by row with
    ``calculate_distance_to_coastline`` from the 'Latitude' and 'Longitude'
    columns. A progress line is printed every 1000 rows.

    Args:
        df: DataFrame with 'Latitude' and 'Longitude' columns. It is modified
            in place.
        coastline_geoms: Shapely geometries describing the coastline.

    Returns:
        The same DataFrame object, now containing 'BeachDistance'.
    """
    print("Calculating distances to coastline...")
    total = len(df)
    distances = []

    # Iterate over the two needed columns only, and count rows positionally:
    # index labels are not guaranteed to be 0..n-1 (or even numeric), so they
    # cannot be used as a progress counter.
    coordinates = zip(df['Latitude'], df['Longitude'])
    for count, (lat, lon) in enumerate(coordinates, start=1):
        distances.append(calculate_distance_to_coastline(lat, lon, coastline_geoms))

        if count % 1000 == 0:
            print(f"Processed {count}/{total} houses...")

    df['BeachDistance'] = distances
    print(f"Completed! Added 'BeachDistance' feature to {total} houses.")

    return df
[ ]:
# SECOND: We load the dataset and call the functions
## This part requires a lot of time to run...
### DO NOT RUN THIS PART WITHOUT THINKING about the time it requires to run...
## If we call external services too often, we can get banned by the server...
from sklearn.datasets import fetch_california_housing
data = fetch_california_housing(as_frame=True)
df = data.frame


[ ]:
# Download the coastline from the online services. The hard-coded
# geometry_california_coastline can be assigned here instead to stay offline.
coastline_geometries = get_california_coastline()


[ ]:
# Add the 'BeachDistance' column (kilometers to the coastline) to the dataset.
df = add_beach_distance_feature(df, coastline_geometries)
[ ]:

# Quick check of the new feature: summary statistics, then the first rows
# next to the target variable. One statement per line -- several print calls
# on the same line are a syntax error.
print("\nBeach Distance Statistics:")
print(df['BeachDistance'].describe())
print("-" * 100)
print(df[['BeachDistance', 'MedHouseVal']].head(10))
[ ]:
# Save the enriched dataset so the slow distance computation does not have to
# be repeated. The target folder is created first: to_csv fails when the
# directory does not exist, which would lose the computed result.
from pathlib import Path

output_path = Path("data/california_housing_with_beach_distance.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)