Wine EDA - Assignment 12 by Kyle Maher

Setup

library(tidyverse)  # Data Manipulation
library(corrplot)  # Correlation Matrix
library(ggridges)  # Ridgeline plot
library(FactoMineR) # PCA
library(factoextra) # PCA Plots

Data

# Load the raw wine measurements from the CSV file (relative path).
data <- read_csv(file = "wine.data.csv")

Raw Data Summary Statistics

# Per-column summary of the raw data, including NA counts.
data %>%
  summary()
   Cultivars        Alcohol          Malic_acid         Ash       
 Min.   :1.000   Min.   :  11.03   Min.   :0.740   Min.   :1.360  
 1st Qu.:1.000   1st Qu.:  12.37   1st Qu.:1.575   1st Qu.:2.210  
 Median :2.000   Median :  13.05   Median :1.845   Median :2.360  
 Mean   :1.934   Mean   :  20.52   Mean   :2.333   Mean   :2.363  
 3rd Qu.:3.000   3rd Qu.:  13.70   3rd Qu.:3.083   3rd Qu.:2.555  
 Max.   :3.000   Max.   :1388.00   Max.   :5.800   Max.   :3.230  
 NA's   :1                         NA's   :1                      
 Alcalinity_of_ash   Magnesium       Total_phenols     Flavanoids   
 Min.   :10.60     Min.   :-101.00   Min.   :0.980   Min.   :0.340  
 1st Qu.:17.20     1st Qu.:  88.00   1st Qu.:1.740   1st Qu.:1.210  
 Median :19.50     Median :  98.00   Median :2.360   Median :2.140  
 Mean   :19.49     Mean   :  98.41   Mean   :2.292   Mean   :2.031  
 3rd Qu.:21.50     3rd Qu.: 107.00   3rd Qu.:2.800   3rd Qu.:2.885  
 Max.   :30.00     Max.   : 162.00   Max.   :3.880   Max.   :5.080  
 NA's   :1                           NA's   :2                      
 Nonflavanoid_phenols Proanthocyanins Color_intensity       Hue        
 Min.   :0.1300       Min.   :0.410   Min.   : 1.280   Min.   :0.4800  
 1st Qu.:0.2700       1st Qu.:1.250   1st Qu.: 3.220   1st Qu.:0.7850  
 Median :0.3400       Median :1.545   Median : 4.750   Median :0.9800  
 Mean   :0.3608       Mean   :1.588   Mean   : 5.046   Mean   :0.9599  
 3rd Qu.:0.4300       3rd Qu.:1.950   3rd Qu.: 6.183   3rd Qu.:1.1200  
 Max.   :0.6600       Max.   :3.580   Max.   :13.000   Max.   :1.7100  
 NA's   :1            NA's   :1       NA's   :1                        
  OD280_OD315       Proline      
 Min.   :1.270   Min.   : 278.0  
 1st Qu.:1.930   1st Qu.: 500.0  
 Median :2.780   Median : 672.0  
 Mean   :2.608   Mean   : 746.2  
 3rd Qu.:3.170   3rd Qu.: 987.5  
 Max.   :4.000   Max.   :1680.0  
 NA's   :2                       

More Cleaning Investigation

# Show every row that has a missing value in at least one column.
data %>%
  filter(if_any(everything(), is.na))
# A tibble: 3 × 14
  Cultivars Alcohol Malic_acid   Ash Alcalinity_of_ash Magnesium Total_phenols
      <dbl>   <dbl>      <dbl> <dbl>             <dbl>     <dbl>         <dbl>
1         1    14.4      NA     2.28                16       102         NA   
2        NA    12         0.92  2                   19        86          2.42
3         2    12.6       1.34  1.9                 NA        88         NA   
# ℹ 7 more variables: Flavanoids <dbl>, Nonflavanoid_phenols <dbl>,
#   Proanthocyanins <dbl>, Color_intensity <dbl>, Hue <dbl>, OD280_OD315 <dbl>,
#   Proline <dbl>

Cleaning Choices

  • Since there are only three rows with missing data, we’ll drop those three rows from future analysis.
  • The single negative value for Magnesium is likely a mis-entry, so the -101 will be switched to positive 101.
  • The maximum Alcohol value of 1388 is also likely a mis-entry, and will be replaced with 13.88.
# Build the cleaned data set:
#   1. drop every row that contains a missing value,
#   2. take the absolute value of Magnesium (turns the -101 entry into 101),
#   3. replace the Alcohol entry of 1388 with 13.88; other values are kept.
df <- mutate(
  drop_na(data),
  Magnesium = abs(Magnesium),
  Alcohol = if_else(Alcohol == 1388, 13.88, Alcohol)
)

Cleaned Data Summary Statistics

# Per-column summary of the cleaned data.
df %>%
  summary()
   Cultivars        Alcohol        Malic_acid         Ash       
 Min.   :1.000   Min.   :11.03   Min.   :0.740   Min.   :1.360  
 1st Qu.:1.000   1st Qu.:12.37   1st Qu.:1.597   1st Qu.:2.210  
 Median :2.000   Median :13.05   Median :1.865   Median :2.360  
 Mean   :1.939   Mean   :13.01   Mean   :2.347   Mean   :2.368  
 3rd Qu.:3.000   3rd Qu.:13.70   3rd Qu.:3.105   3rd Qu.:2.560  
 Max.   :3.000   Max.   :14.83   Max.   :5.800   Max.   :3.230  
 Alcalinity_of_ash   Magnesium      Total_phenols     Flavanoids   
 Min.   :10.60     Min.   : 70.00   Min.   :0.980   Min.   :0.340  
 1st Qu.:17.20     1st Qu.: 88.00   1st Qu.:1.735   1st Qu.:1.175  
 Median :19.50     Median : 98.00   Median :2.355   Median :2.135  
 Mean   :19.51     Mean   : 99.64   Mean   :2.292   Mean   :2.027  
 3rd Qu.:21.50     3rd Qu.:107.00   3rd Qu.:2.800   3rd Qu.:2.882  
 Max.   :30.00     Max.   :162.00   Max.   :3.880   Max.   :5.080  
 Nonflavanoid_phenols Proanthocyanins Color_intensity       Hue        
 Min.   :0.1300       Min.   :0.410   Min.   : 1.280   Min.   :0.4800  
 1st Qu.:0.2700       1st Qu.:1.248   1st Qu.: 3.240   1st Qu.:0.7800  
 Median :0.3400       Median :1.555   Median : 4.750   Median :0.9650  
 Mean   :0.3617       Mean   :1.590   Mean   : 5.061   Mean   :0.9566  
 3rd Qu.:0.4325       3rd Qu.:1.952   3rd Qu.: 6.200   3rd Qu.:1.1200  
 Max.   :0.6600       Max.   :3.580   Max.   :13.000   Max.   :1.7100  
  OD280_OD315       Proline      
 Min.   :1.270   Min.   : 278.0  
 1st Qu.:1.927   1st Qu.: 500.0  
 Median :2.780   Median : 673.5  
 Mean   :2.605   Mean   : 748.0  
 3rd Qu.:3.170   3rd Qu.: 986.2  
 Max.   :4.000   Max.   :1680.0  

Distributions

# Histogram of every column (Cultivars included): one panel per column, each
# with its own axis ranges, and no legend because the panel title names it.
ggplot(
  data = pivot_longer(df, cols = everything()),
  mapping = aes(x = value, fill = name)
) +
  geom_histogram(bins = 25) +
  facet_wrap(vars(name), scales = "free", ncol = 2) +
  theme(legend.position = "none") +
  labs(x = NULL, y = NULL)

Bivariate Relationships

# Correlation matrix of all columns (Cultivars is treated as numeric here),
# rounded to two decimals, drawn as an upper-triangle correlation plot.
correlations <- df %>%
  cor() %>%
  round(digits = 2)
corrplot(corr = correlations, type = "upper")

Total_phenols, Flavanoids, Proanthocyanins, Hue, OD280_OD315, and Proline are all negatively correlated with Cultivars and positively correlated with each other. Nonflavanoid_phenols, Malic_acid, and Alcalinity_of_ash are positively correlated with Cultivars and negatively correlated with the previously mentioned parameters. The same is true for Magnesium, but with a smaller magnitude of correlation. Alcohol and Color_intensity are positively correlated with each other and with Proline. Interestingly, Proline is somewhat correlated with every other parameter. Finally, Ash is not particularly correlated with Cultivars, which may make it a poor parameter to use within a clustering model.

# Boxplots of every measurement split by cultivar: one free-scaled panel per
# variable. The legend is hidden since the x axis already shows the cultivar.
ggplot(
  data = pivot_longer(df, cols = -Cultivars),
  mapping = aes(
    x = factor(Cultivars),
    y = value,
    fill = factor(Cultivars)
  )
) +
  geom_boxplot() +
  facet_wrap(vars(name), scales = "free", ncol = 3) +
  theme(legend.position = "none") +
  labs(x = NULL, y = NULL)

Total_phenols and Alcohol are the most promising parameters for predicting Cultivars when examining the boxplots.

# Ridgeline densities per cultivar for four selected variables, one
# free-scaled panel per variable.
ridge_vars <- c("Alcohol", "Total_phenols", "Flavanoids", "OD280_OD315")
ggplot(
  data = pivot_longer(df, cols = all_of(ridge_vars)),
  mapping = aes(
    x = value,
    y = factor(Cultivars),
    fill = factor(Cultivars)
  )
) +
  geom_density_ridges() +
  facet_wrap(vars(name), scales = "free") +
  labs(x = NULL, y = NULL, fill = "Cultivars")

Data Normalization

#' Standardize a numeric vector to z-scores
#'
#' Subtracts the mean of `x` and divides by its sample standard deviation
#' (`sd()`). A constant vector has a standard deviation of zero, so its
#' result is `NaN`.
#'
#' @param x A numeric vector.
#' @param na.rm Should missing values be ignored when computing the mean and
#'   standard deviation? Defaults to `FALSE`, in which case any `NA` in `x`
#'   makes the whole result `NA`. Missing entries of `x` stay `NA` in the
#'   output either way.
#' @return A numeric vector the same length as `x`.
normalize <- function(x, na.rm = FALSE) {
  (x - mean(x, na.rm = na.rm)) / sd(x, na.rm = na.rm)
}

# Standardize every column except the Cultivars label.
df_norm <- df %>%
  mutate(across(.cols = -Cultivars, .fns = normalize))

Principal Component Analysis

Main Group of Correlated Parameters:

  • Flavanoids
  • Total_phenols
  • OD280_OD315
  • Hue
  • Proline
  • Proanthocyanins
# Pairwise scatter plots of three of the correlated variables (standardized
# values), coloured by cultivar.
ggplot(
  data = df_norm,
  mapping = aes(x = Total_phenols, y = Flavanoids, colour = factor(Cultivars))
) +
  geom_point()

ggplot(
  data = df_norm,
  mapping = aes(x = Total_phenols, y = OD280_OD315, colour = factor(Cultivars))
) +
  geom_point()

ggplot(
  data = df_norm,
  mapping = aes(x = Flavanoids, y = OD280_OD315, colour = factor(Cultivars))
) +
  geom_point()

PCA on Selected Parameters

# PCA restricted to the main group of correlated parameters. The input is the
# already-standardized data; graph = FALSE suppresses PCA()'s own plots.
pca <- df_norm %>%
  select(
    Flavanoids,
    Total_phenols,
    OD280_OD315,
    Hue,
    Proline,
    Proanthocyanins
  ) %>%
  PCA(scale.unit = TRUE, graph = FALSE)

# Scree plot of the fitted PCA.
fviz_eig(X = pca)

# Variable plot of the fitted PCA.
fviz_pca_var(X = pca)

# Individual coordinates on the principal components, with the cultivar label
# attached so the points can be coloured by it.
scores <- cbind(
  as.data.frame(pca$ind$coord),
  Cultivars = df_norm$Cultivars
)
scores %>%
  ggplot(mapping = aes(x = Dim.1, y = Dim.2, colour = factor(Cultivars))) +
  geom_point()

PCA on All Parameters

# Repeat the PCA on every measured variable (only the Cultivars label is
# excluded). This overwrites the `pca` object fitted on the selected
# parameters.
pca <- df_norm %>%
  select(-Cultivars) %>%
  PCA(scale.unit = TRUE, graph = FALSE)
fviz_eig(X = pca)

fviz_pca_var(X = pca)

Main Segmentations:
Total_phenols, Flavanoids, Proanthocyanins, OD280_OD315, and Hue are associated with positive Dim1, while Nonflavanoid_phenols, Alcalinity_of_ash, and Malic_acid are associated with negative Dim1. Color_intensity, Ash, Alcohol, Magnesium, and Proline are associated with positive Dim2.

# Individual coordinates from the current `pca` fit, with the cultivar label
# attached; plot the first two dimensions coloured by cultivar.
scores <- cbind(
  as.data.frame(pca$ind$coord),
  Cultivars = df_norm$Cultivars
)
scores %>%
  ggplot(mapping = aes(x = Dim.1, y = Dim.2, colour = factor(Cultivars))) +
  geom_point()

The first two dimensions from PCA are able to distinguish the cultivar types fairly well. However, the scatterplot shows that Dim2 doesn’t separate Cultivars 1 and 3 very well.