# Wine Data Modeling - Assignment 13 by Kyle Maher

# Setup ----
library(tidyverse) # Data Manipulation
library(corrplot) # Correlation Matrix
library(ggridges) # Ridgeline plot
library(FactoMineR) # PCA
library(factoextra) # PCA Plots
library(pheatmap) # Heatmap
library(RColorBrewer) # Heatmap Cultivar Colors
library(caret) # Stratified Train-Test Split
set.seed(85) # Reproducibility

# Data ----
data <- read_csv("wine.data.csv")

# Cleaning ----
# Tidy the raw data: drop incomplete rows and repair two apparent
# data-entry errors (a negative Magnesium and an Alcohol of 1388).
df <- data %>%
  # Drop the three rows that contain multiple missing values
  drop_na() %>%
  mutate(
    # The single negative value for Magnesium is likely a mis-entry
    Magnesium = abs(Magnesium),
    # The maximum Alcohol value of 1388 is also likely a mis-entry
    # (dropped decimal point), so restore it to 13.88
    Alcohol = case_when(
      Alcohol == 1388 ~ 13.88,
      TRUE ~ Alcohol
    )
  )

# Data Normalization ----
# Z-score standardization: center a numeric vector at 0 and scale to unit sd.
normalize <- function(x) {
  centered <- x - mean(x)
  centered / sd(x)
}
# Standardize every column except the Cultivars class label.
# NOTE(review): mean/sd are computed on the full sample before the
# train-test split further below — mild leakage; confirm this is
# acceptable for the assignment.
df_norm <- mutate(df, across(-Cultivars, normalize))

# Principal Component Analysis ----
# PCA on All Parameters ----
pca <- PCA(select(df_norm, -Cultivars), scale.unit = TRUE, graph = FALSE)
# fviz_eig(pca)
# fviz_pca_var(pca)
fviz_pca_biplot(
  pca,
  # Add data point information
  geom.ind = "point",
  pointshape = 21,
  fill.ind = factor(df_norm$Cultivars), # Color points by Cultivar
  col.ind = "black",
  palette = "Set1", # Brewer palette names are case-sensitive ("Set1")
  addEllipses = TRUE, # Add confidence ellipses around groups
  # Add variable vector information
  label = "var",
  col.var = "black",
  repel = TRUE, # Avoids text overlap
  title = "PCA Biplot (Dim1 / Dim2)"
)
# Credit: Zach

# PCA - Parameter Association ----
Total Phenols, Flavanoids, Proanthocyanins, OD280_OD315, and Hue are associated with positive Dim1, while Nonflavanoid_phenols, Alcalinity_of_ash, and Malic_acid are associated with negative Dim1. Color_intensity, Ash, Alcohol, Magnesium, and Proline are associated with positive Dim2.
# Parameter Grouping ----
# Cluster the parameters themselves, using 1 - correlation as a distance
corr_dist <- as.dist(1 - cor(select(df_norm, -Cultivars)))
hc <- hclust(corr_dist, method = "complete")
plot(
  hc,
  # Name the linkage so this plot is distinguishable from the
  # single-linkage dendrogram below
  main = "Hierarchical Clustering (Correlation Distance, Complete Linkage)"
)
# Credit: Jordan
# Maximum distance (complete linkage) clustering creates a natural cut
# point with three distinct groups.
# Re-cluster with single (minimum distance) linkage for comparison
hc <- hclust(corr_dist, method = "single")
plot(
  hc,
  # Name the linkage so this plot is distinguishable from the
  # complete-linkage dendrogram above
  main = "Hierarchical Clustering (Correlation Distance, Single Linkage)"
)
# The minimum distance clustering singles out Magnesium and Malic Acid
# from the groups they were in previously.
# Heatmap ----
# pheatmap matches annotations via rownames; tibbles silently drop
# rownames, so coerce to a plain data.frame first
df_numeric <- as.data.frame(select(df_norm, -Cultivars))
rownames(df_numeric) <- seq_len(nrow(df_numeric)) # Rownames resolves color issues
# Color rows by Cultivar
row_annotation <- data.frame(Cultivar = factor(df_norm$Cultivars))
rownames(row_annotation) <- rownames(df_numeric)
# Named vector mapping each cultivar level to a Brewer "Set1" color
cultivar_colors <- setNames(
  brewer.pal(n = length(levels(row_annotation$Cultivar)), name = "Set1"),
  levels(row_annotation$Cultivar)
)
# Heatmap
pheatmap(
  df_numeric,
  cluster_rows = TRUE,
  cluster_cols = TRUE,
  color = colorRampPalette(c("red", "white", "blue"))(100),
  main = "Heatmap",
  fontsize_row = 10,
  fontsize_col = 10,
  annotation_row = row_annotation,
  annotation_colors = list(Cultivar = cultivar_colors)
)
# Cultivar 1 has relatively high Hue, Proanthocyanins, OD280_OD315,
# Total_phenols, Flavanoids, Alcohol, and Proline, while Cultivar 3 has
# relatively low Hue, Proanthocyanins, OD280_OD315, Total_phenols, and
# Flavanoids. Cultivar 2 has relatively low Magnesium, Color_intensity,
# Alcohol, and Proline.
# Classification With All Parameters ----

# Stratified Train Test Split ----
df_norm <- df_norm %>%
  mutate(Cultivars = factor(Cultivars))
# createDataPartition samples within each class, preserving class
# proportions in both partitions
train_index <- createDataPartition(df_norm$Cultivars, p = 0.7, list = FALSE)
train <- df_norm[train_index, ]
test <- df_norm[-train_index, ]

# K Nearest Neighbors ----
library(class)
# Column 1 is Cultivars, so drop it from the predictor matrices
pred_knn <- knn(train[, -1], test[, -1], train$Cultivars, k = 4)
table(pred_knn, test$Cultivars)
#> pred_knn  1  2  3
#>        1 18  1  0
#>        2  0 18  0
#>        3  0  2 14
mean(pred_knn == test$Cultivars)
#> [1] 0.9433962
KNN performs very well at 94% accuracy.
Decision Tree
library(rpart)
# Fit a classification tree on all predictors
fit_tree <- rpart(Cultivars ~ ., data = train)
pred_tree <- predict(fit_tree, test, type = "class")
table(pred_tree, test$Cultivars)
#> pred_tree  1  2  3
#>         1 16  2  0
#>         2  0 17  1
#>         3  2  2 13
mean(pred_tree == test$Cultivars)
#> [1] 0.8679245
The decision tree performed decently well at 87% accuracy.
Random Forest
library(randomForest)
# Fit a random forest on all predictors (default 500 trees)
fit_rf <- randomForest(Cultivars ~ ., data = train)
pred_rf <- predict(fit_rf, test)
table(pred_rf, test$Cultivars)
#> pred_rf  1  2  3
#>       1 18  0  0
#>       2  0 20  0
#>       3  0  1 14
mean(pred_rf == test$Cultivars)
#> [1] 0.9811321
Random forest correctly classified all but one of the test values giving it an accuracy of 98%.
Classification With Only Color_intensity, Total_phenols, and Hue
Stratified Train Test Split
# Keep only the three selected predictors plus the class label
df_selected <- df_norm %>%
  mutate(Cultivars = factor(Cultivars)) %>% # already a factor above; harmless no-op
  select(
    Cultivars,
    Color_intensity,
    Total_phenols,
    Hue
  )
train_index <- createDataPartition(df_selected$Cultivars, p = 0.7, list = FALSE)
train <- df_selected[train_index, ]
test <- df_selected[-train_index, ]
# Check pairwise correlations among the selected predictors
round(cor(df_selected %>% select(-Cultivars)), 2)
#>                 Color_intensity Total_phenols   Hue
#> Color_intensity            1.00         -0.05 -0.51
#> Total_phenols             -0.05          1.00  0.45
#> Hue                       -0.51          0.45  1.00
Verified that the correlations are not extreme.
K Nearest Neighbors
library(class)
# Column 1 is Cultivars, so drop it from the predictor matrices
pred_knn <- knn(train[, -1], test[, -1], train$Cultivars, k = 4)
table(pred_knn, test$Cultivars)
#> pred_knn  1  2  3
#>        1 18  3  0
#>        2  0 17  1
#>        3  0  1 13
mean(pred_knn == test$Cultivars)
#> [1] 0.9056604
KNN performed slightly worse at 90% accuracy instead of 94%.
Decision Tree
library(rpart)
# Classification tree on the three selected predictors only
fit_tree <- rpart(Cultivars ~ ., data = train)
pred_tree <- predict(fit_tree, test, type = "class")
table(pred_tree, test$Cultivars)
#> pred_tree  1  2  3
#>         1 18  2  1
#>         2  0 19  0
#>         3  0  0 13
mean(pred_tree == test$Cultivars)
#> [1] 0.9433962
The decision tree performed better at 94% accuracy instead of 87%.
Random Forest
library(randomForest)
# Random forest on the three selected predictors only
fit_rf <- randomForest(Cultivars ~ ., data = train)
pred_rf <- predict(fit_rf, test)
table(pred_rf, test$Cultivars)
#> pred_rf  1  2  3
#>       1 18  2  0
#>       2  0 18  1
#>       3  0  1 13
mean(pred_rf == test$Cultivars)
#> [1] 0.9245283
Random forest performed somewhat worse at 92% accuracy instead of 98%.
Classification Conclusions
By picking a few influential parameters, a similar classification accuracy can be achieved while reducing the noise introduced by correlated parameters.