Wine Data Modeling - Assignment 13 by Kyle Maher

Setup

library(tidyverse)  # Data Manipulation
library(corrplot)  # Correlation Matrix
library(ggridges)  # Ridgeline plot
library(FactoMineR) # PCA
library(factoextra) # PCA Plots
library(pheatmap) # Heatmap
library(RColorBrewer) # Heatmap Cultivar Colors
library(caret) # Stratified Train-Test Split

set.seed(85) # Reproducibility 

Data

# Load the raw wine dataset from CSV
data <- read_csv("wine.data.csv")
Cleaning
# Clean the raw data: remove incomplete rows and repair two apparent
# data-entry errors.
df <- data %>%
  # Drop the three rows that contain multiple missing values
  drop_na() %>%
  mutate(
    # The single negative Magnesium value is likely a sign entry error
    Magnesium = abs(Magnesium),
    # The maximum Alcohol value of 1388 is also likely a data-entry error
    # (missing decimal point), so correct it to 13.88
    Alcohol = if_else(Alcohol == 1388, 13.88, Alcohol)
  )

Data Normalization

# Standardize a numeric vector to zero mean and unit standard deviation
# (z-score).
#
# Args:
#   x: A numeric vector.
#   na.rm: Remove NA values before computing mean/sd? Defaults to FALSE,
#     matching the original behavior (NA propagates through the result).
#
# Returns: A numeric vector the same length as x.
normalize <- function(x, na.rm = FALSE) {
  (x - mean(x, na.rm = na.rm)) / sd(x, na.rm = na.rm)
}

# Z-score every parameter column, leaving the Cultivars label untouched
df_norm <- df %>%
  mutate(across(-Cultivars, normalize))

Principal Component Analysis

PCA on All Parameters

# PCA on the normalized parameters (label column excluded)
pca <- PCA(select(df_norm, -Cultivars), scale.unit = TRUE, graph = FALSE)
# fviz_eig(pca)
# fviz_pca_var(pca)
fviz_pca_biplot(
  pca,

  # Add data point information
  geom.ind = "point",
  pointshape = 21,
  fill.ind = factor(df_norm$Cultivars), # Color points by Cultivar
  col.ind = "black",
  palette = "Set1", # RColorBrewer palette names are case-sensitive
  addEllipses = TRUE, # Add confidence ellipses around groups

  # Add variable vector information
  label = "var",
  col.var = "black",
  repel = TRUE, # Avoids text overlap
  title = "PCA Biplot (Dim1 / Dim2)"
  )

# Credit: Zach

# Credit: Zach

PCA - Parameter Association

Total Phenols, Flavanoids, Proanthocyanins, OD280_OD315, and Hue are associated with positive Dim1, while Nonflavanoid_phenols, Alcalinity_of_ash, and Malic_acid are associated with negative Dim1. Color_intensity, Ash, Alcohol, Magnesium, and Proline are associated with positive Dim2.

Parameter Grouping

# Use 1 - correlation as a dissimilarity between parameters, then
# cluster with complete (maximum-distance) linkage
param_cor <- cor(select(df_norm, -Cultivars))
corr_dist <- as.dist(1 - param_cor)
hc <- hclust(corr_dist, method = "complete")

plot(hc, main = "Hierarchical Clustering (Correlation Distance)")

# Credit: Jordan

Maximum distance clustering creates a natural cut point with three distinct groups.

# Re-cluster the same correlation distances with single
# (minimum-distance) linkage for comparison
hc <- hclust(corr_dist, method = "single")
plot(hc, main = "Hierarchical Clustering (Correlation Distance)")

The minimum distance clustering singles out Magnesium and Malic Acid from the groups they were in previously.

Heatmap
# pheatmap matches annotation rows by rowname; tibbles do not support
# rownames (assignment is deprecated and silently unreliable), so coerce
# to a base data.frame before setting them
df_numeric <- as.data.frame(select(df_norm, -Cultivars))
rownames(df_numeric) <- seq_len(nrow(df_numeric)) # Rownames resolves color issues

# Annotate each row with its Cultivar for the color sidebar
row_annotation <- data.frame(Cultivar = factor(df_norm$Cultivars))
rownames(row_annotation) <- rownames(df_numeric)
cultivar_colors <- setNames(
  brewer.pal(n = length(levels(row_annotation$Cultivar)), name = "Set1"),
  levels(row_annotation$Cultivar)
)

# Heatmap of normalized parameters, rows and columns clustered
pheatmap(
  df_numeric,
  cluster_rows = TRUE,
  cluster_cols = TRUE,
  color = colorRampPalette(c("red", "white", "blue"))(100),
  main = "Heatmap",
  fontsize_row = 10,
  fontsize_col = 10,
  annotation_row = row_annotation,
  annotation_colors = list(Cultivar = cultivar_colors)
)

Cultivar 1 has relatively high Hue, Proanthocyanins, OD280_OD315, Total_phenols, Flavanoids, Alcohol, and Proline. While Cultivar 3 has relatively low Hue, Proanthocyanins, OD280_OD315, Total_phenols, and Flavanoids. Cultivar 2 has relatively low Magnesium, Color_intensity, Alcohol, and Proline.

Classification With All Parameters

Stratified Train Test Split

# Treat the class label as a factor so caret stratifies on it
df_norm <- mutate(df_norm, Cultivars = factor(Cultivars))

# 70/30 split, stratified by Cultivars
train_index <- createDataPartition(df_norm$Cultivars, p = 0.7, list = FALSE)
train <- df_norm[train_index, ]
test <- df_norm[-train_index, ]

K Nearest Neighbors

library(class)

# K-nearest-neighbors (k = 4) on all normalized parameters.
# Select the label column out by name rather than position ([, -1]),
# which silently breaks if the column order ever changes.
pred_knn <- knn(
  select(train, -Cultivars),
  select(test, -Cultivars),
  train$Cultivars,
  k = 4
)
table(pred_knn, test$Cultivars)
        
pred_knn  1  2  3
       1 18  1  0
       2  0 18  0
       3  0  2 14
mean(pred_knn == test$Cultivars)
[1] 0.9433962

KNN performs very well at 94% accuracy.

Decision Tree

library(rpart)

# Fit a CART classification tree predicting Cultivars from all parameters
fit_tree <- rpart(Cultivars ~ ., data = train)

# Class predictions on the held-out test set
pred_tree <- predict(fit_tree, newdata = test, type = "class")
table(pred_tree, test$Cultivars)
         
pred_tree  1  2  3
        1 16  2  0
        2  0 17  1
        3  2  2 13
mean(pred_tree == test$Cultivars)
[1] 0.8679245

The decision tree performed decently well at 87% accuracy.

Random Forest

library(randomForest)

# Random forest on all parameters, default settings
fit_rf <- randomForest(Cultivars ~ ., data = train)

# Predicted classes for the held-out test set
pred_rf <- predict(fit_rf, newdata = test)
table(pred_rf, test$Cultivars)
       
pred_rf  1  2  3
      1 18  0  0
      2  0 20  0
      3  0  1 14
mean(pred_rf == test$Cultivars)
[1] 0.9811321

Random forest correctly classified all but one of the test values giving it an accuracy of 98%.

Classification With Only Color_intensity, Total_phenols, and Hue

Stratified Train Test Split

# Reduce to the three chosen parameters plus the class label
df_selected <- df_norm %>%
  mutate(Cultivars = factor(Cultivars)) %>%
  select(Cultivars, Color_intensity, Total_phenols, Hue)

# 70/30 stratified split on the reduced data
train_index <- createDataPartition(df_selected$Cultivars, p = 0.7, list = FALSE)
train <- df_selected[train_index, ]
test <- df_selected[-train_index, ]

# Inspect pairwise correlations among the selected parameters
df_selected %>%
  select(-Cultivars) %>%
  cor() %>%
  round(2)
                Color_intensity Total_phenols   Hue
Color_intensity            1.00         -0.05 -0.51
Total_phenols             -0.05          1.00  0.45
Hue                       -0.51          0.45  1.00

Verified that the correlations are not extreme.

K Nearest Neighbors

library(class)

# K-nearest-neighbors (k = 4) on the three selected parameters.
# As with the full-parameter model, drop the label column by name
# instead of by position for robustness to column reordering.
pred_knn <- knn(
  select(train, -Cultivars),
  select(test, -Cultivars),
  train$Cultivars,
  k = 4
)
table(pred_knn, test$Cultivars)
        
pred_knn  1  2  3
       1 18  3  0
       2  0 17  1
       3  0  1 13
mean(pred_knn == test$Cultivars)
[1] 0.9056604

KNN performed slightly worse at 90% accuracy instead of 94%.

Decision Tree

library(rpart)

# Classification tree using only the three selected parameters
fit_tree <- rpart(Cultivars ~ ., data = train)
pred_tree <- predict(fit_tree, newdata = test, type = "class")

# Confusion matrix against the true test labels
table(pred_tree, test$Cultivars)
         
pred_tree  1  2  3
        1 18  2  1
        2  0 19  0
        3  0  0 13
mean(pred_tree == test$Cultivars)
[1] 0.9433962

The decision tree performed better at 94% accuracy instead of 87%.

Random Forest

library(randomForest)

# Random forest restricted to the three selected parameters
fit_rf <- randomForest(Cultivars ~ ., data = train)
pred_rf <- predict(fit_rf, newdata = test)

# Confusion matrix against the true test labels
table(pred_rf, test$Cultivars)
       
pred_rf  1  2  3
      1 18  2  0
      2  0 18  1
      3  0  1 13
mean(pred_rf == test$Cultivars)
[1] 0.9245283

Random forest performed somewhat worse at 92% accuracy instead of 98%.

Classification Conclusions

By picking a small set of influential parameters, a similar classification accuracy can be achieved while reducing the noise introduced by correlated parameters.