library(tidyverse) # Data Manipulation
library(corrplot) # Correlation Matrix
library(ggridges) # Ridgeline plot
library(FactoMineR) # PCA
library(factoextra) # PCA PlotsWine EDA - Assignment 12 by Kyle Maher
Setup
Data
data <- read_csv("wine.data.csv")Raw Data Summary Statistics
summary(data) Cultivars Alcohol Malic_acid Ash
Min. :1.000 Min. : 11.03 Min. :0.740 Min. :1.360
1st Qu.:1.000 1st Qu.: 12.37 1st Qu.:1.575 1st Qu.:2.210
Median :2.000 Median : 13.05 Median :1.845 Median :2.360
Mean :1.934 Mean : 20.52 Mean :2.333 Mean :2.363
3rd Qu.:3.000 3rd Qu.: 13.70 3rd Qu.:3.083 3rd Qu.:2.555
Max. :3.000 Max. :1388.00 Max. :5.800 Max. :3.230
NA's :1 NA's :1
Alcalinity_of_ash Magnesium Total_phenols Flavanoids
Min. :10.60 Min. :-101.00 Min. :0.980 Min. :0.340
1st Qu.:17.20 1st Qu.: 88.00 1st Qu.:1.740 1st Qu.:1.210
Median :19.50 Median : 98.00 Median :2.360 Median :2.140
Mean :19.49 Mean : 98.41 Mean :2.292 Mean :2.031
3rd Qu.:21.50 3rd Qu.: 107.00 3rd Qu.:2.800 3rd Qu.:2.885
Max. :30.00 Max. : 162.00 Max. :3.880 Max. :5.080
NA's :1 NA's :2
Nonflavanoid_phenols Proanthocyanins Color_intensity Hue
Min. :0.1300 Min. :0.410 Min. : 1.280 Min. :0.4800
1st Qu.:0.2700 1st Qu.:1.250 1st Qu.: 3.220 1st Qu.:0.7850
Median :0.3400 Median :1.545 Median : 4.750 Median :0.9800
Mean :0.3608 Mean :1.588 Mean : 5.046 Mean :0.9599
3rd Qu.:0.4300 3rd Qu.:1.950 3rd Qu.: 6.183 3rd Qu.:1.1200
Max. :0.6600 Max. :3.580 Max. :13.000 Max. :1.7100
NA's :1 NA's :1 NA's :1
OD280_OD315 Proline
Min. :1.270 Min. : 278.0
1st Qu.:1.930 1st Qu.: 500.0
Median :2.780 Median : 672.0
Mean :2.608 Mean : 746.2
3rd Qu.:3.170 3rd Qu.: 987.5
Max. :4.000 Max. :1680.0
NA's :2
More Cleaning Investigation
filter(data, if_any(everything(), is.na))# A tibble: 3 × 14
Cultivars Alcohol Malic_acid Ash Alcalinity_of_ash Magnesium Total_phenols
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 14.4 NA 2.28 16 102 NA
2 NA 12 0.92 2 19 86 2.42
3 2 12.6 1.34 1.9 NA 88 NA
# ℹ 7 more variables: Flavanoids <dbl>, Nonflavanoid_phenols <dbl>,
# Proanthocyanins <dbl>, Color_intensity <dbl>, Hue <dbl>, OD280_OD315 <dbl>,
# Proline <dbl>
Cleaning Choices
- Since there are only three rows with missing data, we’ll drop those three rows from future analysis.
- The single negative value for Magnesium is likely a data-entry error, so the -101 will be switched to positive 101.
- The maximum Alcohol value of 1388 is also likely a data-entry error, and will be replaced with 13.88.
# Apply the cleaning choices: rows containing any NA are removed first, then
# Magnesium is replaced by its absolute value and an Alcohol reading of
# exactly 1388 is replaced by 13.88. All other Alcohol values are kept.
df <- mutate(
  drop_na(data),
  Magnesium = abs(Magnesium),
  Alcohol = if_else(Alcohol == 1388, 13.88, Alcohol)
)
Cleaned Data Summary Statistics
summary(df) Cultivars Alcohol Malic_acid Ash
Min. :1.000 Min. :11.03 Min. :0.740 Min. :1.360
1st Qu.:1.000 1st Qu.:12.37 1st Qu.:1.597 1st Qu.:2.210
Median :2.000 Median :13.05 Median :1.865 Median :2.360
Mean :1.939 Mean :13.01 Mean :2.347 Mean :2.368
3rd Qu.:3.000 3rd Qu.:13.70 3rd Qu.:3.105 3rd Qu.:2.560
Max. :3.000 Max. :14.83 Max. :5.800 Max. :3.230
Alcalinity_of_ash Magnesium Total_phenols Flavanoids
Min. :10.60 Min. : 70.00 Min. :0.980 Min. :0.340
1st Qu.:17.20 1st Qu.: 88.00 1st Qu.:1.735 1st Qu.:1.175
Median :19.50 Median : 98.00 Median :2.355 Median :2.135
Mean :19.51 Mean : 99.64 Mean :2.292 Mean :2.027
3rd Qu.:21.50 3rd Qu.:107.00 3rd Qu.:2.800 3rd Qu.:2.882
Max. :30.00 Max. :162.00 Max. :3.880 Max. :5.080
Nonflavanoid_phenols Proanthocyanins Color_intensity Hue
Min. :0.1300 Min. :0.410 Min. : 1.280 Min. :0.4800
1st Qu.:0.2700 1st Qu.:1.248 1st Qu.: 3.240 1st Qu.:0.7800
Median :0.3400 Median :1.555 Median : 4.750 Median :0.9650
Mean :0.3617 Mean :1.590 Mean : 5.061 Mean :0.9566
3rd Qu.:0.4325 3rd Qu.:1.952 3rd Qu.: 6.200 3rd Qu.:1.1200
Max. :0.6600 Max. :3.580 Max. :13.000 Max. :1.7100
OD280_OD315 Proline
Min. :1.270 Min. : 278.0
1st Qu.:1.927 1st Qu.: 500.0
Median :2.780 Median : 673.5
Mean :2.605 Mean : 748.0
3rd Qu.:3.170 3rd Qu.: 986.2
Max. :4.000 Max. :1680.0
Distributions
# Histogram (25 bins) of every column of df, one panel per variable with
# independent axes, laid out in two columns. The fill legend is hidden
# because the panel titles already name each variable.
ggplot(
  pivot_longer(df, everything()),
  aes(x = value, fill = name)
) +
  geom_histogram(bins = 25) +
  facet_wrap(vars(name), scales = "free", ncol = 2) +
  theme(legend.position = "none") +
  labs(x = NULL, y = NULL)
Bivariate Relationships
# Pairwise correlation matrix of all columns in df, rounded to two decimals.
correlations <- df %>%
  cor() %>%
  round(digits = 2)
corrplot(correlations, type = "upper")
Total_phenols, Flavanoids, Proanthocyanins, Hue, OD280_OD315, and Proline are all negatively correlated with Cultivars and positively correlated with each other. Nonflavanoid_phenols, Malic_acid, and Alcalinity_of_ash are positively correlated with Cultivars and negatively correlated with the previously mentioned parameters. The same is true for Magnesium, but with a smaller magnitude of correlation. Alcohol and Color_intensity are positively correlated with each other and with Proline. Interestingly, Proline is somewhat correlated with every other parameter. Finally, Ash is not particularly correlated with Cultivars, which may make it a poor parameter to use within a clustering model.
# Box plots of every measurement split by cultivar: one panel per variable
# with independent axes, three panels per row. Cultivars is converted to a
# factor so each cultivar gets its own box and fill colour.
ggplot(
  pivot_longer(df, -Cultivars),
  aes(x = factor(Cultivars), y = value, fill = factor(Cultivars))
) +
  geom_boxplot() +
  facet_wrap(vars(name), scales = "free", ncol = 3) +
  theme(legend.position = "none") +
  labs(x = NULL, y = NULL)
Total_phenols and Alcohol are the most promising parameters for predicting Cultivars when examining the boxplots.
# Ridgeline densities of four selected measurements, one ridge per cultivar
# and one free-scaled panel per measurement.
ggplot(
  pivot_longer(
    df,
    all_of(c("Alcohol", "Total_phenols", "Flavanoids", "OD280_OD315"))
  ),
  aes(x = value, y = factor(Cultivars), fill = factor(Cultivars))
) +
  geom_density_ridges() +
  facet_wrap(vars(name), scales = "free") +
  labs(x = NULL, y = NULL, fill = "Cultivars")
Data Normalization
#' Standardize a vector to z-scores
#'
#' Subtracts the mean of `x` from every element and divides the result by the
#' sample standard deviation of `x`.
#'
#' @param x A numeric vector.
#' @param na.rm Should missing values be ignored when computing the mean and
#'   standard deviation? Defaults to `FALSE`, in which case a single `NA` in
#'   `x` makes every element of the result `NA`. Elements that are missing
#'   in `x` stay missing in the result either way.
#' @return A vector the same length as `x`. A constant `x` has a standard
#'   deviation of zero, so every element of the result is `NaN`.
normalize <- function(x, na.rm = FALSE) {
  (x - mean(x, na.rm = na.rm)) / sd(x, na.rm = na.rm)
}
# Standardize every measurement column with normalize(); the Cultivars label
# column is excluded and therefore left unchanged.
df_norm <- df %>%
  mutate(across(-Cultivars, normalize))
Principal Component Analysis
PCA on Selected Parameters
# PCA restricted to six hand-picked measurements. graph = FALSE suppresses
# PCA()'s own plots because the plots are drawn explicitly below.
pca <- df_norm %>%
  select(
    Flavanoids,
    Total_phenols,
    OD280_OD315,
    Hue,
    Proline,
    Proanthocyanins
  ) %>%
  PCA(scale.unit = TRUE, graph = FALSE)

# Each plotting call is its own statement: fused on a single line they do not
# parse as R, so neither plot would be produced.
fviz_eig(pca)
fviz_pca_var(pca)

# Coordinates of each observation on the principal components, with the
# cultivar label attached so points can be coloured by cultivar. Rows of
# scores line up with rows of df_norm because PCA() was run on df_norm.
scores <- as.data.frame(pca$ind$coord)
scores$Cultivars <- df_norm$Cultivars
ggplot(scores, aes(Dim.1, Dim.2, color = factor(Cultivars))) +
  geom_point()
PCA on All Parameters
# PCA on every measurement column; only the Cultivars label is excluded.
# This overwrites the pca object from the selected-parameter analysis.
pca <- df_norm %>%
  select(-Cultivars) %>%
  PCA(scale.unit = TRUE, graph = FALSE)

# Separate statements: written back-to-back on one line they do not parse.
fviz_eig(pca)
fviz_pca_var(pca)
Main Segmentations:
Total_phenols, Flavanoids, Proanthocyanins, OD280_OD315, and Hue are associated with positive Dim1, while Nonflavanoid_phenols, Alcalinity_of_ash, and Malic_acid are associated with negative Dim1. Color_intensity, Ash, Alcohol, Magnesium, and Proline are associated with positive Dim2.
# Observation coordinates from the most recent PCA fit, combined with the
# cultivar label, then plotted on the first two dimensions coloured by
# cultivar.
scores <- cbind(
  as.data.frame(pca$ind$coord),
  Cultivars = df_norm$Cultivars
)
ggplot(
  data = scores,
  mapping = aes(x = Dim.1, y = Dim.2, color = factor(Cultivars))
) +
  geom_point()
The first two dimensions from PCA are able to explain the cultivar types fairly well. The scatterplot shows that Dim2 doesn’t separate Cultivar 1 and 3 very well.