Step 3: Create a Predictive Model

Import Packages

Code
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import geopandas as gpd
import hvplot.pandas
import requests
import seaborn as sns
import contextily as ctx

# Models
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

# Model selection
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

# Pipelines
from sklearn.pipeline import make_pipeline

# Preprocessing
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

np.random.seed(42)

pd.options.display.max_columns = 999
%matplotlib inline

Preprocess Data & Test for Correlation

Load merged data.

Code
street = gpd.read_file("Data/street_merge.geojson")
street.head()
R_HUNDRED CLASS STNAME ST_CODE SEG_ID Shape__Length crash_2022 crash_2023 Truck 2022_Paved 2023_Paved 311_2022 311_2023 nearest_lts_score geometry
0 9200 2 BARTRAM AVE 16120 100002 1137.344551 0 0 0.0 0.0 1.0 0 0 3 LINESTRING (-75.25362 39.88134, -75.25383 39.8...
1 9100 2 BARTRAM AVE 16120 100003 371.888030 0 0 0.0 0.0 1.0 0 0 3 LINESTRING (-75.25149 39.88373, -75.25337 39.8...
2 0 2 BARTRAM AVE 16120 100004 48.720227 0 0 0.0 0.0 1.0 0 0 4 LINESTRING (-75.25337 39.88161, -75.25362 39.8...
3 8500 5 EASTWICK PL 30570 100006 292.693856 0 0 0.0 0.0 0.0 0 0 1 LINESTRING (-75.24576 39.89227, -75.24724 39.8...
4 8500 5 HARLEY PL 40570 100007 292.440377 0 0 0.0 0.0 0.0 0 0 1 LINESTRING (-75.24648 39.89267, -75.24798 39.8...
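Before modeling, it's worth checking how imbalanced the eventual target (2023_Paved) is, since that will shape how we read the accuracy scores later. A quick sketch:

Code
# Count street segments that were / were not repaved in 2023
street["2023_Paved"].value_counts(dropna=False)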
Code
cols = [
    "CLASS",
    "Truck",
    "crash_2022",
    "311_2022",
    "crash_2023",
    "2022_Paved",
    "311_2023",
    "nearest_lts_score",
]
sns.heatmap(street[cols].corr(), cmap='coolwarm', annot=True, vmin=-1, vmax=1);
C:\Users\epark\AppData\Local\Temp\ipykernel_21156\1834119373.py:11: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.
  sns.heatmap(street[cols].corr(), cmap='coolwarm', annot=True, vmin=-1, vmax=1);

The correlation plot shows little correlation between any pair of the independent variables.
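To see how each candidate feature relates to the repaving outcome itself, we can also compute the correlations with the target directly (a quick sketch; specifying numeric_only silences the FutureWarning shown above on recent pandas):

Code
# Correlation of each candidate feature with the 2023 repaving outcome
street[cols + ["2023_Paved"]].corr(numeric_only=True)["2023_Paved"].sort_values()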

Now we can pull out the 2022 data to build our training and test sets.

Code
cols_2022 = [
    "R_HUNDRED",
    "CLASS",
    "STNAME",
    "ST_CODE",
    "SEG_ID",
    "Shape__Length",
    "crash_2022",
    "Truck",
    "2022_Paved",
    "2023_Paved",
    "311_2022",
    "nearest_lts_score",
    "geometry",
]

street_2022 = street[cols_2022]

cols_to_drop = ["R_HUNDRED", "STNAME","ST_CODE", "SEG_ID", "Shape__Length", "geometry"]
street_2022_features = street_2022.drop(columns=cols_to_drop).copy()

street_2022_features = street_2022_features.dropna(subset=['2022_Paved'])
len(street_2022_features)
41151
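As a quick sanity check (a sketch, not part of the modeling pipeline), we can see which columns still contain missing values; the imputer in the preprocessing step below fills any that remain in the categorical features.

Code
# Count remaining missing values per column after dropping rows with no 2022_Paved value
street_2022_features.isna().sum()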

Run the models

We’ll try running the following models:

(1) Random Forest - A collection of decision trees, each trained on a random subset of the data. The final prediction is a majority vote of the individual tree predictions.

(2) XGBoost - A collection of decision trees built sequentially, with each new tree correcting the errors of the previous ones.

(3) Logistic Regression - Uses the logistic function to transform a linear combination of the input features into a probability score, which is then thresholded to make binary (or multiclass) predictions.

(4) K-Nearest Neighbors - Classifies each observation based on the majority class among its k nearest neighbors in feature space, where “nearest” is determined by a distance metric.

Preprocess Data

Code
numerical_features = ['crash_2022', '311_2022']
categorical_features = ['CLASS', 'Truck', 'nearest_lts_score','2022_Paved']
target_variable = '2023_Paved'

# Creating feature and target dataframes
X = street_2022_features[numerical_features + categorical_features]
y = street_2022_features[target_variable]

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Creating a preprocessor for categorical features
transformer = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numerical_features),
        ("cat", 
         make_pipeline(SimpleImputer(strategy='constant', fill_value=0),  # Fill missing values with 0
                       OneHotEncoder(handle_unknown="ignore")),  # Apply OneHotEncoder
         categorical_features)
    ]
)
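To confirm the preprocessor does what we expect, we can fit it on the training data and look at the result: two scaled numeric columns plus one one-hot column per categorical level (a quick sketch; with a recent scikit-learn, transformer.get_feature_names_out() also lists the generated column names).

Code
# Fit the preprocessor on the training features and check the transformed shape
X_train_demo = transformer.fit_transform(X_train)
X_train_demo.shape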

Random Forest Model

Code
# Make a random forest pipeline
pipe = make_pipeline(
    transformer, RandomForestClassifier(n_estimators=20, random_state=42)
)

# Fit on the training data
pipe.fit(X_train, y_train);
pipe.score(X_test, y_test)

y_pred = pipe.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy (Random Forest): {accuracy}')

# Confusion Matrix
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_pred))

# Classification Report
print('Classification Report:')
print(classification_report(y_test, y_pred))
Accuracy (Random Forest): 0.9411145310221934
Confusion Matrix:
[[11612     6]
 [  721     7]]
Classification Report:
              precision    recall  f1-score   support

         0.0       0.94      1.00      0.97     11618
         1.0       0.54      0.01      0.02       728

    accuracy                           0.94     12346
   macro avg       0.74      0.50      0.49     12346
weighted avg       0.92      0.94      0.91     12346

That’s not a great model! It is very good at predicting true negatives but bad at finding true positives: only 7 of the 728 repaved segments in the test set were identified (a recall of about 1%). The overall accuracy is high only because most street segments are not repaved in a given year. The chosen features are probably the main limitation, but let’s try a few other models to see whether anything improves.
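Before moving on, one common response to this kind of imbalance is to reweight the classes so the model pays more attention to the rare positive class. A hedged sketch (class_weight="balanced" is a reasonable starting point, not a tuned choice, and results may still be modest with these features):

Code
# Random forest with class reweighting to counter the imbalance in 2023_Paved
pipe_balanced = make_pipeline(
    transformer,
    RandomForestClassifier(n_estimators=20, class_weight="balanced", random_state=42),
)
pipe_balanced.fit(X_train, y_train)
print(classification_report(y_test, pipe_balanced.predict(X_test)))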

XGBoost

Code
X_train_transformed = transformer.fit_transform(X_train)
X_test_transformed = transformer.transform(X_test)

# Create and train the XGBoost model
model = XGBClassifier()
model.fit(X_train_transformed, y_train)

# Make predictions and evaluate the model
y_pred = model.predict(X_test_transformed)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy (XGBoost): {accuracy}')

# Confusion Matrix
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_pred))

# Classification Report
print('Classification Report:')
print(classification_report(y_test, y_pred))
Accuracy (XGBoost): 0.9411145310221934
Confusion Matrix:
[[11618     0]
 [  727     1]]
Classification Report:
              precision    recall  f1-score   support

         0.0       0.94      1.00      0.97     11618
         1.0       1.00      0.00      0.00       728

    accuracy                           0.94     12346
   macro avg       0.97      0.50      0.49     12346
weighted avg       0.94      0.94      0.91     12346

The XGBoost model does even worse than the random forest: only 1 of the 728 repaved segments in the test set was predicted correctly.
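XGBoost has its own knob for imbalance: scale_pos_weight up-weights the positive class. A hedged sketch, using the common heuristic of the negative-to-positive ratio in the training set (an assumption, not a tuned value):

Code
# Up-weight the positive (repaved) class by the negative/positive ratio
pos_weight = (y_train == 0).sum() / (y_train == 1).sum()
model_weighted = XGBClassifier(scale_pos_weight=pos_weight)
model_weighted.fit(X_train_transformed, y_train)
print(classification_report(y_test, model_weighted.predict(X_test_transformed)))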

Logistic Regression Model

Code
# Create a logistic regression model using make_pipeline with preprocessing steps
logistic_model = make_pipeline(transformer, LogisticRegression())

# Train the model
logistic_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_logistic = logistic_model.predict(X_test)

# Evaluate the model
accuracy_logistic = accuracy_score(y_test, y_pred_logistic)

print(f'Accuracy (Logistic Regression): {accuracy_logistic}')
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_pred_logistic))
print('Classification Report:')
print(classification_report(y_test, y_pred_logistic))
Accuracy (Logistic Regression): 0.9410335331281386
Confusion Matrix:
[[11618     0]
 [  728     0]]
Classification Report:
              precision    recall  f1-score   support

         0.0       0.94      1.00      0.97     11618
         1.0       0.00      0.00      0.00       728

    accuracy                           0.94     12346
   macro avg       0.47      0.50      0.48     12346
weighted avg       0.89      0.94      0.91     12346
C:\Users\epark\mambaforge\envs\musa-550-fall-2023\lib\site-packages\sklearn\linear_model\_logistic.py:460: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
C:\Users\epark\mambaforge\envs\musa-550-fall-2023\lib\site-packages\sklearn\metrics\_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
C:\Users\epark\mambaforge\envs\musa-550-fall-2023\lib\site-packages\sklearn\metrics\_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
C:\Users\epark\mambaforge\envs\musa-550-fall-2023\lib\site-packages\sklearn\metrics\_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))

The logistic regression model is worse still: it never predicts the positive class, so none of the 728 repaved segments are identified. Note also the convergence warning above from the solver.
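The warning suggests increasing max_iter; combining that with class reweighting gives a hedged sketch (max_iter=1000 and class_weight="balanced" are assumptions, not tuned settings):

Code
# Logistic regression with more iterations (to converge) and balanced class weights
logistic_balanced = make_pipeline(
    transformer,
    LogisticRegression(max_iter=1000, class_weight="balanced"),
)
logistic_balanced.fit(X_train, y_train)
print(classification_report(y_test, logistic_balanced.predict(X_test)))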

K-Nearest Neighbors

Code
# Transform the data
X_train_transformed = transformer.fit_transform(X_train)
X_test_transformed = transformer.transform(X_test)

# Create a KNN model
knn_model = KNeighborsClassifier()

# Train the model
knn_model.fit(X_train_transformed, y_train)

# Make predictions on the test set
y_pred_knn = knn_model.predict(X_test_transformed)

# Evaluate the model
accuracy_knn = accuracy_score(y_test, y_pred_knn)
print(f'Accuracy (KNN): {accuracy_knn}')
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_pred_knn))
print('Classification Report:')
print(classification_report(y_test, y_pred_knn))
Accuracy (KNN): 0.9407095415519197
Confusion Matrix:
[[11614     4]
 [  728     0]]
Classification Report:
              precision    recall  f1-score   support

         0.0       0.94      1.00      0.97     11618
         1.0       0.00      0.00      0.00       728

    accuracy                           0.94     12346
   macro avg       0.47      0.50      0.48     12346
weighted avg       0.89      0.94      0.91     12346

KNN also identifies none of the repaved segments, so it too is worse than the original random forest. We’ll use the random forest model going forward.
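Before relying on the random forest, it would also be worth cross-validating it and lightly tuning its hyperparameters using the cross_val_score and GridSearchCV imports from above. A hedged sketch (the parameter grid is purely illustrative):

Code
# Cross-validate the random forest pipeline on the training data
scores = cross_val_score(pipe, X_train, y_train, cv=5)
print("Mean CV accuracy:", scores.mean())

# Small, illustrative grid search over two random forest hyperparameters
param_grid = {
    "randomforestclassifier__n_estimators": [20, 100],
    "randomforestclassifier__max_depth": [None, 10],
}
grid = GridSearchCV(pipe, param_grid, cv=5)
grid.fit(X_train, y_train)
print(grid.best_params_, grid.score(X_test, y_test))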