{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Actividad: Regresión Lineal\n",
"\n",
"El objetivo es aplicar una técnica de regresión lineal al valor medio de un hogar según su ubicación en los distritos de California.\n",
"- Dataset: https://scikit-learn.org/stable/datasets/real_world.html#california-housing-dataset\n",
"- Regresión lineal: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html\n",
"\n",
"Como hemos comentado anteriormente, el proceso de aprendizaje automático consta de una serie de pasos.\n",
"- Preparación de datos: carga y limpieza de datos \n",
"- Selección de atributos o métricas adecuadas \n",
"- Selección de la técnica a aplicar: LinearRegression\n",
"- Ajuste de los hiperparámetros\n",
"- Evaluación del modelo"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# 0. Al inicio, incluimos las librerías pertinentes\n",
"\n",
"from sklearn.datasets import fetch_california_housing\n",
"from sklearn.linear_model import LinearRegression\n",
"import pandas as pd\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Step 1. Load the data\n",
"# `as_frame=True` makes scikit-learn return pandas objects, so `data.frame` below is a DataFrame.\n",
"\n",
"data = fetch_california_housing(as_frame=True)\n",
"print(data.feature_names)\n",
"print(\"-\"*100)\n",
"\n",
"df = data.frame #get the DataFrame (it also holds the MedHouseVal column used later in this notebook)\n",
"print(df.head())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Summary statistics of every numeric column, rendered as the cell output.\n",
"df.describe()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Attribute information:\n",
"- MedInc median income in block group\n",
"- HouseAge median house age in block group\n",
"- AveRooms average number of rooms per household\n",
"- AveBedrms average number of bedrooms per household\n",
"- Population block group population\n",
"- AveOccup average number of household members\n",
"- Latitude block group latitude\n",
"- Longitude block group longitude\n",
"\n",
"The **target variable** is the median house value (*MedHouseVal*) for California districts, expressed in hundreds of thousands of dollars ($100,000)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Vuestro turno..."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"[License: CC BY 4.0](https://creativecommons.org/licenses/by/4.0/) <br/>\n",
"Isaac Lera and Gabriel Moya <br/>\n",
"Universitat de les Illes Balears <br/>\n",
"isaac.lera@uib.edu, gabriel.moya@uib.edu"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#TODO Annex - We include a new feature: BeachDistance\n",
"\n",
"# This part of the notebook needs more libraries, since the coastline data can be\n",
"# downloaded from an external GIS web service.\n",
"# Required dependencies:\n",
"#   uv add owslib shapely requests\n",
"#   pip install owslib shapely requests\n",
"\n",
"## We load the dataset and afterwards save the result as a new version of the dataset.\n",
"## This process requires a lot of time to run,\n",
"## so we cannot launch it every time we want to run the notebook.\n",
"\n",
"## The process calls external services, so we may not always obtain the data...\n",
"## (it is simpler with other tools like QGIS or ArcGIS).\n",
"\n",
"# Imports come first: the coastline geometry built below already needs `LineString`,\n",
"# so importing after it fails with a NameError on a fresh kernel.\n",
"import json\n",
"import sys\n",
"from typing import Optional\n",
"\n",
"import pandas as pd\n",
"import requests\n",
"from owslib.wfs import WebFeatureService\n",
"from shapely.geometry import Point, LineString, MultiLineString\n",
"from shapely.ops import nearest_points\n",
"\n",
"# Note: this is a simplified version of the coastline; with it we can avoid the external services.\n",
"# Points are (longitude, latitude) pairs, listed from north to south.\n",
"# NOTE(review): the points lie on an almost straight diagonal and keep going down to latitude 30.0;\n",
"# this looks too coarse for the southern part of the state - verify against a real coastline\n",
"# before trusting the distances computed from it.\n",
"california_coastline_points = [\n",
"    # Northern California\n",
"    (-124.4, 42.0), (-124.3, 41.8), (-124.2, 41.5), (-124.1, 41.2),\n",
"    (-124.0, 41.0), (-123.9, 40.8), (-123.8, 40.6), (-123.7, 40.4),\n",
"    (-123.6, 40.2), (-123.5, 40.0), (-123.4, 39.8), (-123.3, 39.6),\n",
"    (-123.2, 39.4), (-123.1, 39.2), (-123.0, 39.0), (-122.9, 38.8),\n",
"    (-122.8, 38.6), (-122.7, 38.4), (-122.6, 38.2), (-122.5, 38.0),\n",
"    # Central California (San Francisco Bay area)\n",
"    (-122.4, 37.8), (-122.3, 37.6), (-122.2, 37.4), (-122.1, 37.2),\n",
"    (-122.0, 37.0), (-121.9, 36.8), (-121.8, 36.6), (-121.7, 36.4),\n",
"    # Central Coast\n",
"    (-121.6, 36.2), (-121.5, 36.0), (-121.4, 35.8), (-121.3, 35.6),\n",
"    (-121.2, 35.4), (-121.1, 35.2), (-121.0, 35.0), (-120.9, 34.8),\n",
"    (-120.8, 34.6), (-120.7, 34.4), (-120.6, 34.2), (-120.5, 34.0),\n",
"    # Southern California\n",
"    (-120.4, 33.8), (-120.3, 33.6), (-120.2, 33.4), (-120.1, 33.2),\n",
"    (-120.0, 33.0), (-119.9, 32.8), (-119.8, 32.6), (-119.7, 32.4),\n",
"    (-119.6, 32.2), (-119.5, 32.0), (-119.4, 31.8), (-119.3, 31.6),\n",
"    (-119.2, 31.4), (-119.1, 31.2), (-119.0, 31.0), (-118.9, 30.8),\n",
"    (-118.8, 30.6), (-118.7, 30.4), (-118.6, 30.2), (-118.5, 30.0),\n",
"]\n",
"\n",
"# Wrapped in a list so it has the same shape as the result of `get_california_coastline()`.\n",
"geometry_california_coastline = [LineString(california_coastline_points)]\n",
"\n",
"# FIRST: We declare the functions\n",
"\n",
"def fetch_coastline_arcgis(service_url: str) -> Optional[list]:\n",
"    \"\"\"Download the line features of an ArcGIS REST layer as Shapely geometries.\n",
"\n",
"    The layer's `/query` endpoint is asked for every feature (`where=1=1`) as GeoJSON\n",
"    in spatial reference 4326.\n",
"\n",
"    Args:\n",
"        service_url: URL of the layer, without the trailing `/query`.\n",
"\n",
"    Returns:\n",
"        A list of `LineString` / `MultiLineString` objects (possibly empty), or\n",
"        `None` when the request or the parsing fails; the error is printed.\n",
"    \"\"\"\n",
"    query_url = f\"{service_url}/query\"\n",
"    params = {\n",
"        'where': '1=1',\n",
"        'outFields': '*',\n",
"        'f': 'geojson',\n",
"        'outSR': '4326'\n",
"    }\n",
"\n",
"    try:\n",
"        response = requests.get(query_url, params=params, timeout=30)\n",
"        response.raise_for_status()\n",
"\n",
"        geojson_data = response.json()\n",
"\n",
"        # Convert to Shapely geometries\n",
"        geometries = []\n",
"        for feature in geojson_data.get('features', []):\n",
"            geom = feature.get('geometry')\n",
"            if not geom:\n",
"                # A feature may carry a null geometry; skip it instead of\n",
"                # letting one bad record discard the whole download.\n",
"                continue\n",
"\n",
"            geom_type = geom.get('type')\n",
"            if geom_type == 'LineString':\n",
"                geometries.append(LineString(geom['coordinates']))\n",
"            elif geom_type == 'MultiLineString':\n",
"                geometries.append(MultiLineString(geom['coordinates']))\n",
"            elif geom_type == 'Polyline':\n",
"                # ArcGIS Polyline format: one LineString per path\n",
"                for path in geom.get('paths', []):\n",
"                    geometries.append(LineString(path))\n",
"            # Any other geometry type is ignored.\n",
"\n",
"        return geometries\n",
"    except Exception as e:\n",
"        # Deliberately best-effort: the caller tries the next service on failure.\n",
"        print(f\"Error fetching from ArcGIS service: {e}\")\n",
"        return None\n",
"\n",
"def get_california_coastline() -> list:\n",
"    \"\"\"Return the coastline geometries from the first ArcGIS service that answers.\n",
"\n",
"    The services are tried in order; the first non-empty result is returned.\n",
"\n",
"    Returns:\n",
"        The list of geometries produced by `fetch_coastline_arcgis`.\n",
"\n",
"    Raises:\n",
"        RuntimeError: if no service returned any geometry. An ordinary exception is\n",
"            raised instead of calling `sys.exit`, so the failure can be caught and\n",
"            handled by the calling cell.\n",
"    \"\"\"\n",
"    arcgis_services = [\n",
"        'https://gis.water.ca.gov/arcgis/rest/services/Boundaries/i03_WestCoastShoreline/FeatureServer/0',\n",
"        'https://gis.cnra.ca.gov/arcgis/rest/services/Ocean/CSMW_Coastal_Conditions/MapServer/0',\n",
"    ]\n",
"\n",
"    for service_url in arcgis_services:\n",
"        coastline = fetch_coastline_arcgis(service_url)\n",
"        # Both `None` (request failed) and `[]` (no usable features) mean: try the next one.\n",
"        if coastline:\n",
"            print(f\"Successfully fetched coastline from: {service_url}\")\n",
"            return coastline\n",
"\n",
"    raise RuntimeError(\"Could not fetch coastline from services.\")\n",
"\n",
"def haversine_distance(lat1: float, lon1: float, lat2: float, lon2: float) -> float:\n",
"    \"\"\"\n",
"    Calculate the great circle distance between two points on Earth.\n",
"    Returns distance in kilometers.\n",
"\n",
"    Args:\n",
"        lat1, lon1: Latitude and longitude of first point, in degrees\n",
"        lat2, lon2: Latitude and longitude of second point, in degrees\n",
"\n",
"    Returns:\n",
"        Distance in kilometers\n",
"    \"\"\"\n",
"    from math import radians, sin, cos, sqrt, atan2\n",
"\n",
"    # Earth radius in kilometers\n",
"    R = 6371.0\n",
"\n",
"    # Convert to radians\n",
"    lat1_rad = radians(lat1)\n",
"    lon1_rad = radians(lon1)\n",
"    lat2_rad = radians(lat2)\n",
"    lon2_rad = radians(lon2)\n",
"\n",
"    # Haversine formula\n",
"    dlat = lat2_rad - lat1_rad\n",
"    dlon = lon2_rad - lon1_rad\n",
"\n",
"    a = sin(dlat / 2)**2 + cos(lat1_rad) * cos(lat2_rad) * sin(dlon / 2)**2\n",
"    # Floating-point rounding can push `a` marginally outside [0, 1] (e.g. for\n",
"    # near-antipodal points), which would make sqrt(1 - a) fail; clamp it.\n",
"    a = min(1.0, max(0.0, a))\n",
"    c = 2 * atan2(sqrt(a), sqrt(1 - a))\n",
"\n",
"    return R * c\n",
"\n",
"def calculate_distance_to_coastline(lat: float, lon: float, coastline_geoms: list) -> float:\n",
"    \"\"\"Return the distance in kilometers from (lat, lon) to the closest coastline geometry.\n",
"\n",
"    For every geometry, `nearest_points` gives the closest point on it, and the\n",
"    Haversine distance to that point is measured; the smallest value wins.\n",
"    With an empty `coastline_geoms` the result is `float('inf')`.\n",
"    \"\"\"\n",
"    house = Point(lon, lat)  # Note: Shapely uses (x, y) = (lon, lat)\n",
"\n",
"    def _km_to(geometry) -> float:\n",
"        # Second element of the pair is the point that lies on `geometry`.\n",
"        _, on_coast = nearest_points(house, geometry)\n",
"        shore_lon, shore_lat = on_coast.coords[0]\n",
"        return haversine_distance(lat, lon, shore_lat, shore_lon)\n",
"\n",
"    # Seeding with infinity keeps the same running-minimum semantics as a manual loop.\n",
"    return min([float('inf'), *(_km_to(g) for g in coastline_geoms)])\n",
"\n",
"def add_beach_distance_feature(df: pd.DataFrame, coastline_geoms: list) -> pd.DataFrame:\n",
"    \"\"\"Add a 'BeachDistance' column with each row's distance to the coastline in kilometers.\n",
"\n",
"    Args:\n",
"        df: DataFrame with 'Latitude' and 'Longitude' columns.\n",
"        coastline_geoms: Shapely geometries describing the coastline.\n",
"\n",
"    Returns:\n",
"        The same DataFrame object that was passed in; the column is added in place.\n",
"    \"\"\"\n",
"    print(\"Calculating distances to coastline...\")\n",
"    total = len(df)\n",
"    distances = []\n",
"\n",
"    # Count rows by position rather than by index label, so the progress report\n",
"    # also works when the index is not the default 0..n-1 range.\n",
"    coordinates = zip(df['Latitude'], df['Longitude'])\n",
"    for position, (lat, lon) in enumerate(coordinates, start=1):\n",
"        distances.append(calculate_distance_to_coastline(lat, lon, coastline_geoms))\n",
"\n",
"        if position % 1000 == 0:\n",
"            print(f\"Processed {position}/{total} houses...\")\n",
"\n",
"    df['BeachDistance'] = distances\n",
"    print(f\"Completed! Added 'BeachDistance' feature to {total} houses.\")\n",
"\n",
"    return df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# SECOND: We load the dataset; the following cells call the functions declared above.\n",
"## Those cells require a lot of time to run...\n",
"### DO NOT RUN THEM WITHOUT THINKING about the time they require to run...\n",
"## If we call external services too often, we can get banned by the server...\n",
"from sklearn.datasets import fetch_california_housing\n",
"data = fetch_california_housing(as_frame=True)\n",
"df = data.frame \n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Fetch the coastline from the external services. The hard-coded simplified coastline\n",
"# (`geometry_california_coastline`) can be assigned instead to avoid the network calls.\n",
"coastline_geometries = get_california_coastline() #or geometry_california_coastline\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Slow step: one distance computation per row. The function adds the column to `df` itself\n",
"# and returns that same object.\n",
"df = add_beach_distance_feature(df, coastline_geometries) "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Quick check of the new feature: summary statistics, then the first rows next to the target.\n",
"\n",
"print(\"\\nBeach Distance Statistics:\")\n",
"print(df['BeachDistance'].describe())\n",
"print(\"-\" * 100)\n",
"\n",
"print(df[['BeachDistance', 'MedHouseVal']].head(10))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from pathlib import Path\n",
"\n",
"# Save the enriched dataset so the slow distance computation does not have to be repeated.\n",
"output_path = Path(\"data/california_housing_with_beach_distance.csv\")\n",
"# `to_csv` does not create missing folders, so make sure `data/` exists first.\n",
"output_path.parent.mkdir(parents=True, exist_ok=True)\n",
"df.to_csv(output_path, index=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.10"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}