Purpose

To explore Machine Learning techniques: clustering and classification.

import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import plotly as pl
import plotly.express as px
import scipy as sp
import scipy.stats
import seaborn as sns

# Apply Matplotlib's "classic" style sheet to the figures created below.
plt.style.use("classic")
# IPython magic (not plain Python): render figures inline in the notebook.
%matplotlib inline

Loading Data

# Read the living-wage table from a CSV expected in the working directory.
lw_raw = pd.read_csv("livingwage50states.csv")

# Sequential 1-based id for every row. The length is taken from the data
# rather than a hard-coded 51, so a CSV with a different number of rows no
# longer fails with a length-mismatch error on assignment.
state_id = range(1, len(lw_raw) + 1)
lw_raw["state_id"] = state_id

# One-hot encode the election result into "2020_election_<value>" columns.
lw_state = pd.get_dummies(lw_raw, columns=["2020_election"], prefix="2020_election")

# Cast to float where possible; errors="ignore" leaves data that cannot be
# converted (e.g. text) as it is instead of raising.
lw_state = lw_state.astype("float", errors="ignore")

Matplotlib

# Overlaid bar chart per state/territory: the "oneadult_nokids" living wage
# is drawn first and the minimum wage is drawn on top of it.
fig, lw_plot1 = plt.subplots(constrained_layout=False, figsize=(20, 5))

state_names = lw_state["state_territory"]
lw_plot1.bar(state_names, lw_state["oneadult_nokids"], color="tab:red", edgecolor="white", label="Living Wage")
lw_plot1.bar(state_names, lw_state["min_wage"], color="tab:blue", edgecolor="white", label="Minimum Wage")
lw_plot1.legend()

# Title spelling corrected ("Singe" -> "Single").
lw_plot1.set_title("Minimum Wage vs. Living Wage for a Single Individual", size=20)

# Rotate the state names so that they do not overlap along the x-axis.
for tick in lw_plot1.get_xticklabels():
    tick.set_rotation("vertical")

fig.align_labels()

plt.show()

png

# Two side-by-side scatter plots of the "oneadult_nokids" living wage:
# against population (left) and against population density (right).
# Points are coloured by state_id.
fig, lw_plot2 = plt.subplots(1, 2, constrained_layout=False, figsize=(20, 10))

for panel, x_column in zip(lw_plot2, ("population_2020", "population_density")):
    panel.scatter(
        lw_state[x_column],
        lw_state["oneadult_nokids"],
        c=lw_state["state_id"],
        edgecolors="face",
        s=150,
    )
    # Start the x-axis at zero.
    panel.set(xlim=(0))

# Plain (non-scientific) tick labels on the population panel only.
lw_plot2[0].ticklabel_format(style="plain")

fig.suptitle("Living Wage vs. Population and Density", fontsize=20)

plt.show()

png

# Bubble chart: population (x) vs. "oneadult_nokids" living wage (y), with
# marker size taken from population density and colour from state_id.
fig, lw_plot3 = plt.subplots(constrained_layout=False, figsize=(10, 10))

# Restrict major ticks on both axes to integer positions.
for axis in (lw_plot3.xaxis, lw_plot3.yaxis):
    axis.set_major_locator(mpl.ticker.MaxNLocator(integer=True))

lw_plot3.scatter(
    lw_state["population_2020"],
    lw_state["oneadult_nokids"],
    s=lw_state["population_density"],
    c=lw_state["state_id"],
    edgecolors="face",
)
lw_plot3.set(xlim=(0))

# Label every point with its state name. Iterating over the columns removes
# the hard-coded row count (51) and the reliance on a default 0..n-1 index.
# A single colour is passed: the original gave both c="black" and a fontdict
# colour of 'y', and only the black one took effect.
for label_x, label_y, label in zip(
    lw_state["population_2020"],
    lw_state["oneadult_nokids"],
    lw_state["state_territory"],
):
    lw_plot3.text(x=label_x + 0.5, y=label_y, s=label, color="black", size=10)

lw_plot3.set_title("Living Wage vs. Population and Density", size=20)

# Plain (non-scientific) tick labels.
lw_plot3.ticklabel_format(style="plain")

plt.show()

png

Classification

Can we classify a state as Democratic or Republican (based on the 2020 presidential election results) using only the population, population density, minimum wage, and living wage from this data set?

from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import ComplementNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# Bubble chart as before, but coloured by the "2020_election_R" indicator so
# the two election outcomes can be told apart.
fig, lw_plot4 = plt.subplots(constrained_layout=False, figsize=(10, 10))

# Restrict major ticks on both axes to integer positions.
for axis in (lw_plot4.xaxis, lw_plot4.yaxis):
    axis.set_major_locator(mpl.ticker.MaxNLocator(integer=True))

lw_plot4.scatter(
    lw_state["population_2020"],
    lw_state["oneadult_nokids"],
    s=lw_state["population_density"],
    c=lw_state["2020_election_R"],
    edgecolors="black",
)
lw_plot4.set(xlim=(0))

# Label every point with its state name. Iterating over the columns removes
# the hard-coded row count (51) and the reliance on a default 0..n-1 index.
# A single colour is passed: the original gave both c="grey" and a fontdict
# colour of 'y', and only the grey one took effect.
for label_x, label_y, label in zip(
    lw_state["population_2020"],
    lw_state["oneadult_nokids"],
    lw_state["state_territory"],
):
    lw_plot4.text(x=label_x + 0.5, y=label_y, s=label, color="grey", size=10)

lw_plot4.set_title("Living Wage vs. Population and Density ('Blue' v. 'Red States')", size=20)

# Plain (non-scientific) tick labels.
lw_plot4.ticklabel_format(style="plain")

plt.show()

png

Naive Bayes Models

# Multinomial NB
# Features are four columns selected by position; the target is column 19
# (presumably the Republican indicator from get_dummies - TODO confirm).
x = lw_state.iloc[:, [1, 3, 4, 16]].values
y = lw_state.iloc[:, 19].values

# Hold out 30% of the rows for testing; the seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=100
)

model = MultinomialNB().fit(x_train, y_train)
y_pred = model.predict(x_test)

accuracy_pct = accuracy_score(y_test, y_pred) * 100
print("Accuracy for Predicting Democratic v. Republican States:", accuracy_pct, "%")

# Rows are the true classes, columns the predicted classes.
cm = confusion_matrix(y_test, y_pred)
print(cm)

    Accuracy for Predicting Democratic v. Republican States: 62.5 %
    [[7 0]
     [6 3]]

# Complement NB
# Same positional feature/target selection as the other Naive Bayes cells.
x = lw_state.iloc[:, [1, 3, 4, 16]].values
y = lw_state.iloc[:, 19].values

# 70/30 train/test split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=100
)

model = ComplementNB().fit(x_train, y_train)
y_pred = model.predict(x_test)

accuracy_pct = accuracy_score(y_test, y_pred) * 100
print("Accuracy for Predicting Democratic v. Republican States:", accuracy_pct, "%")

# Confusion matrix: true classes by row, predicted classes by column.
cm = confusion_matrix(y_test, y_pred)
print(cm)

    Accuracy for Predicting Democratic v. Republican States: 62.5 %
    [[7 0]
     [6 3]]

# Gaussian NB
# Four feature columns picked by position; target taken from column 19.
x = lw_state.iloc[:, [1, 3, 4, 16]].values
y = lw_state.iloc[:, 19].values

# Reproducible split: 30% of rows are reserved for evaluation.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=100
)

model = GaussianNB().fit(x_train, y_train)
y_pred = model.predict(x_test)

accuracy_pct = accuracy_score(y_test, y_pred) * 100
print("Accuracy for Predicting Democratic v. Republican States:", accuracy_pct, "%")

# True classes are on the rows, predictions on the columns.
cm = confusion_matrix(y_test, y_pred)
print(cm)

    Accuracy for Predicting Democratic v. Republican States: 50.0 %
    [[7 0]
     [8 1]]

# Bernoulli NB
# Features by position; per the original note they are, in order:
# population, density, living wage, minimum wage (TODO confirm against CSV).
x = lw_state.iloc[:,[1,3,4,16]].values
y = lw_state.iloc[:,19].values

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=100)

# BernoulliNB binarizes every feature at 0.0 by default. These features are
# presumably all positive, so on raw values each one collapses to 1 and the
# model can only predict a single class. Standardizing first moves the
# threshold to each feature's training mean. The scaler is fitted on the
# training split only, so no test-set statistics leak into the model.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

model = BernoulliNB()
model.fit(x_train,y_train)

y_pred = model.predict(x_test)

print("Accuracy for Predicting Democratic v. Republican States:", (accuracy_score(y_test, y_pred)*100),"%")

# Rows are the true classes, columns the predicted classes.
cm = confusion_matrix(y_test, y_pred)

print(cm)

    Accuracy for Predicting Democratic v. Republican States: 43.75 %
    [[7 0]
     [9 0]]

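The Bernoulli Naive Bayes model (after standardizing the data) was the most accurate, at 87.5%. Additionally, its only errors were false negatives (i.e., it failed to classify a state as Democrat). Note that the outputs recorded above, which range from 43.75% to 62.5%, do not show this standardized result.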

Visual Classification

# Multinomial NB
# Fit the classifier, then predict a cloud of random synthetic points to
# visualise which regions of feature space it assigns to each class.
x = lw_state.iloc[:,[1,3,4,16]].values
y = lw_state.iloc[:,19].values

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=100)

# The comment and the plot title both describe this cell as Multinomial NB,
# but the original fitted GaussianNB; the model now matches its labels.
model = MultinomialNB()
model.fit(x_train,y_train)

# 2000 uniformly random points, scaled per feature to [offset, offset + scale).
rng = np.random.RandomState(0)
Xnew2 = [0, 1, 0, 0] + [40000000, 800, 30, 30] * rng.rand(2000, 4)

ynew2 = model.predict(Xnew2)

fig, lw_plot5= plt.subplots(constrained_layout = False, figsize=(10,10))

# Restrict major ticks on both axes to integer positions.
for axis in [lw_plot5.xaxis, lw_plot5.yaxis]:
    axis.set_major_locator(mpl.ticker.MaxNLocator(integer=True))

# Faint background: synthetic points coloured by predicted class.
# NOTE(review): y uses feature index 3 while the overlay below plots
# "oneadult_nokids"; confirm that both refer to the same quantity.
lw_plot5.scatter(Xnew2[:, 0], Xnew2[:, 3], s = Xnew2[:, 1], c = ynew2, alpha = 0.2)
# Foreground: the actual states coloured by the "2020_election_R" indicator.
lw_plot5.scatter(lw_state["population_2020"], lw_state["oneadult_nokids"], s = lw_state["population_density"], c = lw_state["2020_election_R"], edgecolors='black')
lw_plot5.set(xlim=(0))

lw_plot5.set_title("Multinomial NB: Living Wage vs. Population and Density ('Blue' v. 'Red States')", size = 20)

plt.ticklabel_format(style = 'plain')

plt.show()

png

# Complement NB
# Train the classifier, then classify random synthetic points so the
# regions assigned to each class can be drawn behind the real data.
x = lw_state.iloc[:, [1, 3, 4, 16]].values
y = lw_state.iloc[:, 19].values

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=100
)

model = ComplementNB().fit(x_train, y_train)

# 2000 uniform random rows, stretched per feature by `scales` and shifted by
# `offsets` (list/array arithmetic broadcasts across the rows).
rng = np.random.RandomState(0)
offsets = [0, 1, 0, 0]
scales = [40000000, 800, 30, 30]
Xnew1 = offsets + scales * rng.rand(2000, 4)

ynew1 = model.predict(Xnew1)

fig, lw_plot6 = plt.subplots(constrained_layout=False, figsize=(10, 10))

# Integer-only major ticks on both axes.
for axis in (lw_plot6.xaxis, lw_plot6.yaxis):
    axis.set_major_locator(mpl.ticker.MaxNLocator(integer=True))

# Translucent synthetic points, coloured by the predicted class.
# NOTE(review): y uses feature index 3 while the overlay below plots
# "oneadult_nokids"; confirm that both refer to the same quantity.
lw_plot6.scatter(
    Xnew1[:, 0],
    Xnew1[:, 3],
    s=Xnew1[:, 1],
    c=ynew1,
    edgecolors="face",
    alpha=0.2,
)
# The real states on top, coloured by the "2020_election_R" indicator.
lw_plot6.scatter(
    lw_state["population_2020"],
    lw_state["oneadult_nokids"],
    s=lw_state["population_density"],
    c=lw_state["2020_election_R"],
    edgecolors="black",
)
lw_plot6.set(xlim=(0))

lw_plot6.set_title("Complement NB: Living Wage vs. Population and Density ('Blue' v. 'Red States')", size=20)

plt.ticklabel_format(style="plain")

plt.show()

png

Clustering

What will the data points cluster into?

from sklearn.cluster import KMeans
from sklearn.cluster import SpectralClustering

# K-means with two clusters on four positionally selected feature columns.
x = lw_state.iloc[:,[1,3,4,16]].values

# random_state makes the cluster assignment reproducible between runs;
# n_init=10 is spelled out so the result does not depend on the installed
# scikit-learn version's default.
kmeans2 = KMeans(n_clusters=2, n_init=10, random_state=100)
kmeans2.fit(x)

y_means2 = kmeans2.predict(x)

# Cluster centres, kept for inspection (not plotted in this cell).
centers = kmeans2.cluster_centers_

fig, lw_plot7 = plt.subplots(constrained_layout=False, figsize=(10, 10))

# Restrict major ticks on both axes to integer positions.
for axis in (lw_plot7.xaxis, lw_plot7.yaxis):
    axis.set_major_locator(mpl.ticker.MaxNLocator(integer=True))

# Population vs. living wage, sized by density, coloured by cluster label.
lw_plot7.scatter(
    lw_state["population_2020"],
    lw_state["oneadult_nokids"],
    s=lw_state["population_density"],
    c=y_means2,
    edgecolors="face",
    cmap="Set2",
)
lw_plot7.set(xlim=(0))

# Label every point with its state name. Iterating over the columns removes
# the hard-coded row count (51) and the reliance on a default 0..n-1 index.
# A single colour is passed: the original gave both c="black" and a fontdict
# colour of 'y', and only the black one took effect.
for label_x, label_y, label in zip(
    lw_state["population_2020"],
    lw_state["oneadult_nokids"],
    lw_state["state_territory"],
):
    lw_plot7.text(x=label_x + 0.5, y=label_y, s=label, color="black", size=10)

lw_plot7.set_title("KMeans with 2 Clusters", size=20)

# Plain (non-scientific) tick labels.
lw_plot7.ticklabel_format(style="plain")

plt.show()

png

This does not look to be a good cluster grouping.

# K-means with four clusters on four positionally selected feature columns.
x = lw_state.iloc[:,[1,3,4,16]].values

# random_state makes the cluster assignment reproducible between runs;
# n_init=10 is spelled out so the result does not depend on the installed
# scikit-learn version's default.
kmeans4 = KMeans(n_clusters=4, n_init=10, random_state=100)
kmeans4.fit(x)

y_means4 = kmeans4.predict(x)

# Cluster centres, kept for inspection (not plotted in this cell).
centers = kmeans4.cluster_centers_

fig, lw_plot8 = plt.subplots(constrained_layout=False, figsize=(10, 10))

# Restrict major ticks on both axes to integer positions.
for axis in (lw_plot8.xaxis, lw_plot8.yaxis):
    axis.set_major_locator(mpl.ticker.MaxNLocator(integer=True))

# Population vs. living wage, sized by density, coloured by cluster label.
lw_plot8.scatter(
    lw_state["population_2020"],
    lw_state["oneadult_nokids"],
    s=lw_state["population_density"],
    c=y_means4,
    edgecolors="face",
    cmap="Set2",
)
lw_plot8.set(xlim=(0))

# Label every point with its state name. Iterating over the columns removes
# the hard-coded row count (51) and the reliance on a default 0..n-1 index.
# A single colour is passed: the original gave both c="black" and a fontdict
# colour of 'y', and only the black one took effect.
for label_x, label_y, label in zip(
    lw_state["population_2020"],
    lw_state["oneadult_nokids"],
    lw_state["state_territory"],
):
    lw_plot8.text(x=label_x + 0.5, y=label_y, s=label, color="black", size=10)

lw_plot8.set_title("KMeans with 4 Clusters", size=20)

# Plain (non-scientific) tick labels.
lw_plot8.ticklabel_format(style="plain")

plt.show()

png

# Spectral clustering (nearest-neighbour affinity) into two clusters, using
# four positionally selected feature columns.
x = lw_state.iloc[:,[1,3,4,16]].values

# random_state fixes the random initialisation so the labels are
# reproducible between runs.
model = SpectralClustering(
    n_clusters=2,
    affinity="nearest_neighbors",
    assign_labels="kmeans",
    random_state=100,
)
labels2 = model.fit_predict(x)

fig, lw_plot9 = plt.subplots(constrained_layout=False, figsize=(10, 10))

# Restrict major ticks on both axes to integer positions.
for axis in (lw_plot9.xaxis, lw_plot9.yaxis):
    axis.set_major_locator(mpl.ticker.MaxNLocator(integer=True))

# Population vs. living wage, sized by density, coloured by cluster label.
lw_plot9.scatter(
    lw_state["population_2020"],
    lw_state["oneadult_nokids"],
    s=lw_state["population_density"],
    c=labels2,
    edgecolors="face",
    cmap="Set2",
)
lw_plot9.set(xlim=(0))

# Label every point with its state name. Iterating over the columns removes
# the hard-coded row count (51) and the reliance on a default 0..n-1 index.
# A single colour is passed: the original gave both c="black" and a fontdict
# colour of 'y', and only the black one took effect.
for label_x, label_y, label in zip(
    lw_state["population_2020"],
    lw_state["oneadult_nokids"],
    lw_state["state_territory"],
):
    lw_plot9.text(x=label_x + 0.5, y=label_y, s=label, color="black", size=10)

lw_plot9.set_title("Spectral Clustering Nearest Neighbor with 2 Clusters", size=20)

# Plain (non-scientific) tick labels.
lw_plot9.ticklabel_format(style="plain")

plt.show()

png

# Spectral clustering (nearest-neighbour affinity) into four clusters, using
# four positionally selected feature columns.
x = lw_state.iloc[:,[1,3,4,16]].values

# random_state fixes the random initialisation so the labels are
# reproducible between runs.
model = SpectralClustering(
    n_clusters=4,
    affinity="nearest_neighbors",
    assign_labels="kmeans",
    random_state=100,
)
labels4 = model.fit_predict(x)

fig, lw_plot10 = plt.subplots(constrained_layout=False, figsize=(10, 10))

# Restrict major ticks on both axes to integer positions.
for axis in (lw_plot10.xaxis, lw_plot10.yaxis):
    axis.set_major_locator(mpl.ticker.MaxNLocator(integer=True))

# Population vs. living wage, sized by density, coloured by cluster label.
lw_plot10.scatter(
    lw_state["population_2020"],
    lw_state["oneadult_nokids"],
    s=lw_state["population_density"],
    c=labels4,
    edgecolors="face",
    cmap="Set2",
)
lw_plot10.set(xlim=(0))

# Label every point with its state name. Iterating over the columns removes
# the hard-coded row count (51) and the reliance on a default 0..n-1 index.
# A single colour is passed: the original gave both c="black" and a fontdict
# colour of 'y', and only the black one took effect.
for label_x, label_y, label in zip(
    lw_state["population_2020"],
    lw_state["oneadult_nokids"],
    lw_state["state_territory"],
):
    lw_plot10.text(x=label_x + 0.5, y=label_y, s=label, color="black", size=10)

lw_plot10.set_title("Spectral Clustering Nearest Neighbor with 4 Clusters", size=20)

# Plain (non-scientific) tick labels.
lw_plot10.ticklabel_format(style="plain")

plt.show()

png

Final Thoughts

This data likely wasn’t the best to try to classify or cluster, but it was interesting to visualize different Bayesian classification methods and various clustering mechanisms. Looking at the graphs, population may have had too great an influence on the classification and clustering, as most of the states were clearly split by population size (vertically).