We’ll use the California Housing dataset that ships with scikit-learn.

# 0. Imports
import numpy as np, pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures  # OneHotEncoder/PolynomialFeatures are only used if you extend the preprocessing below
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
import joblib

# 1. Load data
data = fetch_california_housing(as_frame=True)
df = data.frame
X = df.drop(columns=["MedHouseVal"])
y = df["MedHouseVal"]
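
# (Optional) Quick sanity check on what we just loaded: the frame has ~20k rows,
# 8 numeric feature columns, and the target MedHouseVal is in units of $100,000.
print(df.shape)
print(df.head())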

# 2. Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 3. Preprocessing pipeline (numeric only here)
numeric_feats = X.select_dtypes(include=["float64", "int"]).columns.tolist()
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_feats)
], remainder="drop")
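
# This dataset is all-numeric, so a single numeric branch is enough. If your own
# data also had categorical columns, you could add a second branch using the
# OneHotEncoder imported above. A sketch: categorical_feats would hold the
# object/category column names (empty for this dataset, so the branch is skipped):
categorical_feats = X.select_dtypes(include=["object", "category"]).columns.tolist()
if categorical_feats:
    categorical_transformer = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ])
    preprocessor = ColumnTransformer([
        ("num", numeric_transformer, numeric_feats),
        ("cat", categorical_transformer, categorical_feats),
    ], remainder="drop")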

# 4. Full pipeline with a model
pipeline = Pipeline([
    ("preproc", preprocessor),
    ("model", RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1))
])

# 5. Train
pipeline.fit(X_train, y_train)

# 6. Evaluate
y_pred = pipeline.predict(X_test)
print("MAE:", mean_absolute_error(y_test, y_pred))
print("R2:", r2_score(y_test, y_pred))

# 7. Persist model
joblib.dump(pipeline, "house_price_pipeline.joblib")
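
# Loading the artifact back (e.g. in a serving process) restores the whole
# pipeline, so raw feature rows can be fed to predict() directly:
loaded = joblib.load("house_price_pipeline.joblib")
print(loaded.predict(X_test.iloc[:5]))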

Hyperparameter tuning (example)

param_grid = {
    "model__n_estimators": [100, 200],
    "model__max_depth": [10, 20, None],
}
search = GridSearchCV(pipeline, param_grid, cv=3, scoring="neg_mean_absolute_error", n_jobs=-1)
search.fit(X_train, y_train)
print("Best params:", search.best_params_)

Deployment notes

  • Save the model and preprocessor together (here a single pipeline, so one joblib artifact covers both).
  • Serve it via a lightweight API (FastAPI / Flask) that loads the joblib artifact and exposes a prediction endpoint; see the sketch below.
  • Add input validation with a schema (e.g. Pydantic) so malformed requests are rejected before they reach the model.
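
Minimal serving sketch (assumes FastAPI and Pydantic are installed; the field names are simply the dataset's eight feature columns, and the endpoint path is illustrative):

from fastapi import FastAPI
from pydantic import BaseModel
import joblib
import pandas as pd

app = FastAPI()
pipeline = joblib.load("house_price_pipeline.joblib")  # load once at startup

class HouseFeatures(BaseModel):
    MedInc: float
    HouseAge: float
    AveRooms: float
    AveBedrms: float
    Population: float
    AveOccup: float
    Latitude: float
    Longitude: float

@app.post("/predict")
def predict(features: HouseFeatures):
    # Rebuild a one-row DataFrame with the same columns the pipeline saw at fit time.
    row = pd.DataFrame([features.dict()])  # use .model_dump() on Pydantic v2
    pred = pipeline.predict(row)[0]
    return {"predicted_median_house_value": float(pred)}

Run it with, for example, uvicorn main:app --reload (assuming the file is saved as main.py).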