Diamond Dataset EDA - Week 2 by Kyle Maher

Setup

library(tidymodels)
library(scales)    # Axis Formatting
library(gt)    # Table Formatting

set.seed(28)

df <- ggplot2::diamonds

Cleaning

# Remove 20 rows with at least one zero dimension
df <- filter(df, x > 0, y > 0, z > 0)

Split Into Train & Test Data

df_split <- initial_split(df)
df_train <- training(df_split)
df_test <- testing(df_split)

df_split
<Training/Testing/Total>
<40440/13480/53920>

Functions

Show Coefficients

show_coefficients <- function(model) {
  tidy(model) %>% 
  gt() %>% 
  tab_options(table.align = "left")
}

Show Summary

show_summary <- function(model) {
  glance(model) %>% 
  gt() %>% 
  tab_options(table.align = "left")
}

Show Plots

show_plots <- function(model) {
  par(mfrow = c(2, 2)) # 2x2 grid to show all four diagnostic plots
  plot(model)
  par(mfrow = c(1, 1)) # reset
}

Show RMSE

show_rmse <- function(rmse) {
  tibble(Root_Mean_Squared_Error = rmse) %>% 
  gt() %>% 
  tab_options(table.align = "left")
}
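
For reference, the RMSE computed after each model below is always the same base-R one-liner. A minimal helper sketch (shown here for clarity only; the inline form is kept in the code below):

```r
# Root mean squared error between actual and predicted values
calc_rmse <- function(actual, predicted) {
  sqrt(mean((actual - predicted)^2))
}

calc_rmse(c(1, 2, 3), c(1, 2, 5))  # sqrt(4/3), about 1.155
```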

Model Version #1 (Basic)

model <- lm(price ~ carat, data = df_train)
df_test$pred <- predict(model, newdata = df_test)

df_test %>%
  ggplot(aes(x = carat, y = price)) +
  geom_point(color = "#456882", alpha = .3) +
  geom_line(aes(y = pred), color = "#25b9c7", linewidth = 1.2) +
  labs(
    x = "Carat", 
    y = "Price ($)", 
    title = "Basic Linear Model Using Carat to Predict the Price")

This is obviously not the best fit. However, let's quantify exactly how poor it is.

show_coefficients(model)
term estimate std.error statistic p.value
(Intercept) -2254.987 15.03722 -149.9604 0
carat 7749.773 16.25453 476.7761 0
show_summary(model)
r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC deviance df.residual nobs
0.848973 0.8489693 1546.164 227315.4 0 1 -354353.3 708712.6 708738.5 9.6672e+10 40438 40440
show_plots(model)

rmse <- sqrt(mean((df_test$price - df_test$pred)^2))
show_rmse(rmse)
Root_Mean_Squared_Error
1552.396

Well, it looks like there is a lot wrong here. Most of the red trend lines in the diagnostic plots are far from horizontal, and the Q-Q plot has extreme tails. Not to mention the large RMSE.

Model Version #2 (Log Scale)

model <- lm(log(price) ~ log(carat), data = df_train)

# See (1) for log backwards conversion formula
df_test$pred <- exp(predict(model, df_test) + summary(model)$sigma^2 / 2)

df_test %>%
  ggplot(aes(x = carat, y = price)) +
  geom_point(color = "#456882", alpha = .3) +
  geom_line(aes(y = pred), color = "#25b9c7", linewidth = 1.2) +
  labs(
    x = "Carat", 
    y = "Price ($)", 
    title = "Linear Model Using the Natural Log of Carat to Predict the Natural Log of Price")

Fitting the linear model on the log scale yields a curved fit once the transformation is reversed, which lets it follow the distribution much more closely.
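
The sigma^2 / 2 correction used in the back-transformation above (see (1)) can be sanity-checked with a quick simulation: if log(Y) is normal, the naive exp(mu) underestimates the mean of Y. The values of mu and sigma here are made up for illustration.

```r
# If log(Y) ~ N(mu, sigma^2), then E(Y) = exp(mu + sigma^2 / 2), not exp(mu)
set.seed(28)
mu <- 8
sigma <- 0.25
y <- exp(rnorm(1e6, mean = mu, sd = sigma))

mean(y)               # empirical mean of Y
exp(mu)               # naive back-transform: biased low
exp(mu + sigma^2 / 2) # corrected back-transform: matches the empirical mean
```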

show_coefficients(model)
term estimate std.error statistic p.value
(Intercept) 8.447025 0.001580803 5343.5028 0
log(carat) 1.674577 0.002231835 750.3139 0
show_summary(model)
r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC deviance df.residual nobs
0.9329841 0.9329824 0.2626561 562971 0 1 -3316.251 6638.502 6664.325 2789.747 40438 40440
show_plots(model)

rmse <- sqrt(mean((df_test$price - df_test$pred)^2))
show_rmse(rmse)
Root_Mean_Squared_Error
1721.216

While the diagnostic plots look significantly better, the improved R Squared is now measured on the log scale and is not directly comparable, and the test RMSE actually got worse ($1552 to $1721). There are also several points with significant leverage. It may make sense to break the model up by carat to address that.

Splitting Up The Model

Where To Split

df %>%
  ggplot(aes(x = carat, y = price, color = clarity)) +
  geom_point() +
  scale_x_log10(breaks = scales::log_breaks(n = 10)) +
  scale_y_log10(breaks = scales::log_breaks(n = 10)) + 
  geom_vline(xintercept = c(.5, 2), color = "red", linetype = "dashed") +
  labs(
    x = "Carat",
    y = "Price ($)",
    title = "Carat and Price by Clarity at a Log Scale"
  )

The split points of .5 and 2 carats were chosen visually from the plot above, where the shape of the point cloud changes.

New Train & Test Data Splits

df_small <- filter(df, carat < .5)
df_medium <- filter(df, carat >= .5 & carat < 2)
df_large <- filter(df, carat >= 2)
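
The three filters above can also be expressed with cut(), which guarantees the bins are exhaustive and non-overlapping. A minimal sketch on made-up carat values:

```r
# Same three size bins as the filters above, via cut()
# right = FALSE makes the intervals [0, .5), [.5, 2), [2, Inf)
size <- cut(c(0.3, 0.7, 1.5, 2.1),
            breaks = c(0, 0.5, 2, Inf),
            labels = c("small", "medium", "large"),
            right = FALSE)
as.character(size)  # "small" "medium" "medium" "large"
```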

df_small_split <- initial_split(df_small)
df_small_train <- training(df_small_split)
df_small_test <- testing(df_small_split)

df_medium_split <- initial_split(df_medium)
df_medium_train <- training(df_medium_split)
df_medium_test <- testing(df_medium_split)

df_large_split <- initial_split(df_large)
df_large_train <- training(df_large_split)
df_large_test <- testing(df_large_split)

Model Version #3 (Split & Log Scale)

Small Diamonds

model <- lm(log(price) ~ log(carat), data = df_small_train)

df_small_test$pred <- exp(
  predict(model, df_small_test) + summary(model)$sigma^2 / 2)

df_small_test %>%
  ggplot(aes(x = carat, y = price)) +
  geom_point(color = "#456882", alpha = .3) +
  geom_line(aes(y = pred), color = "#25b9c7", linewidth = 1.2) +
  labs(
    x = "Carat", 
    y = "Price ($)", 
    title = "Using Carat to Predict Price for Small Diamonds")

Under .5 carats, carat almost acts as a categorical variable due to its very distinct levels. In theory, converting it to a factor might improve the results, but let's keep moving.
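
A quick sketch of what treating carat as a factor would mean, on made-up data: the model then simply fits one mean price per distinct carat level.

```r
# Toy data with three distinct carat levels
toy <- data.frame(
  carat = rep(c(0.30, 0.40, 0.49), each = 3),
  price = c(400, 410, 420, 700, 710, 720, 1100, 1120, 1140)
)

m <- lm(price ~ factor(carat), data = toy)

# The fitted values are exactly the per-level mean prices
unique(fitted(m))  # 410, 710, 1120
```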

show_coefficients(model)
term estimate std.error statistic p.value
(Intercept) 7.913997 0.01427789 554.28321 0
log(carat) 1.191059 0.01311144 90.84119 0
show_summary(model)
r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC deviance df.residual nobs
0.3837282 0.3836817 0.2373798 8252.122 0 1 254.9053 -503.8105 -481.3342 746.7954 13253 13255
show_plots(model)

rmse <- sqrt(mean((df_small_test$price - df_small_test$pred)^2))
show_rmse(rmse)
Root_Mean_Squared_Error
193.9803

Even with carat's almost discrete nature, this model predicts the price of diamonds under .5 carats to within about $194 on average, using carat alone.

Medium Diamonds

model <- lm(log(price) ~ log(carat), data = df_medium_train)
df_medium_test$pred <- exp(
  predict(model, df_medium_test) + summary(model)$sigma^2 / 2)

df_medium_test %>%
  ggplot(aes(x = carat, y = price)) +
  geom_point(color = "#456882", alpha = .3) +
  geom_line(aes(y = pred), color = "#25b9c7", linewidth = 1.2) +
  labs(
    x = "Carat", 
    y = "Price ($)", 
    title = "Linear Model Using the Natural Log of Carat to Predict the Natural Log of Price for Medium Diamonds")

This line looks pretty good; however, there is significant variation in price at the same carat, which may hinder model performance.

show_coefficients(model)
term estimate std.error statistic p.value
(Intercept) 8.470972 0.001757063 4821.0966 0
log(carat) 1.737863 0.004750738 365.8091 0
show_summary(model)
r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC deviance df.residual nobs
0.8395669 0.8395606 0.2672787 133816.3 0 1 -2542.875 5091.75 5116.197 1826.738 25571 25573
show_plots(model)

rmse <- sqrt(mean((df_medium_test$price - df_medium_test$pred)^2))
show_rmse(rmse)
Root_Mean_Squared_Error
1643.344

That variation results in somewhat poor predictions for medium-sized diamonds. Since diamonds at the same carat vary widely in price, adding the categorical variables to the model may improve it significantly.

Large Diamonds

model <- lm(log(price) ~ log(carat), data = df_large_train)

df_large_test$pred <- exp(
  predict(model, df_large_test) + summary(model)$sigma^2 / 2)

df_large_test %>%
  ggplot(aes(x = carat, y = price)) +
  geom_point(color = "#456882", alpha = .3) +
  geom_line(aes(y = pred), color = "#25b9c7", linewidth = 1.2) +
  labs(
    x = "Carat", 
    y = "Price ($)", 
    title = "Linear Model Using the Natural Log of Carat to Predict the Natural Log of Price for Large Diamonds")

Whoa there. This does not look like a good case for a linear model.

show_coefficients(model)
term estimate std.error statistic p.value
(Intercept) 9.4739098 0.04354947 217.543613 0.00000000
log(carat) 0.1474736 0.05720113 2.578158 0.01002106
show_summary(model)
r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC deviance df.residual nobs
0.00411408 0.003495133 0.2166625 6.646901 0.01002106 1 178.977 -351.954 -335.8002 75.53073 1609 1611
show_plots(model)

rmse <- sqrt(mean((df_large_test$price - df_large_test$pred)^2))
show_rmse(rmse)
Root_Mean_Squared_Error
2715.613

The diagnostic plots confirm that the smaller sample and the lack of a visible linear relationship make this model quite poor. Let's focus on the small and medium diamonds by limiting the size to under 2 carats.

Model Version #4 (Split & Log Scale & Categorical Parameters)

Small Diamonds

model <- lm(log(price) ~ log(carat) + clarity + color + cut, data = df_small_train)

df_small_test$pred <- exp(
  predict(model, df_small_test) + summary(model)$sigma^2 / 2)

df_small_test %>%
  ggplot(aes(x = carat, y = price)) +
  geom_point(color = "#456882", alpha = .3) +
  geom_point(aes(y = pred), color = "#25b9c7", alpha = .1) +
  labs(
    x = "Carat", 
    y = "Price ($)", 
    title = "Four C's to Predict Price for Small Diamonds")

Adding the categorical variables turns this into a multiple regression rather than a simple one-predictor model. As a result, we would expect the fit to improve drastically.
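
A side note on the coefficient names below: cut, color, and clarity are ordered factors in the diamonds data, so lm() encodes them with polynomial contrasts by default. That is why the terms are named clarity.L (linear), clarity.Q (quadratic), clarity.C (cubic), clarity^4, and so on, rather than one dummy per level:

```r
# Ordered factors get polynomial contrasts (contr.poly) by default in lm()
f <- factor(c("Fair", "Good", "Very Good", "Premium", "Ideal"),
            levels = c("Fair", "Good", "Very Good", "Premium", "Ideal"),
            ordered = TRUE)
colnames(contrasts(f))  # ".L" ".Q" ".C" "^4"
```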

show_coefficients(model)
term estimate std.error statistic p.value
(Intercept) 7.979398215 0.009026044 884.0415464 0.000000e+00
log(carat) 1.378418878 0.007305658 188.6782550 0.000000e+00
clarity.L 0.724174284 0.015904260 45.5333523 0.000000e+00
clarity.Q -0.074806246 0.015752487 -4.7488530 2.067189e-06
clarity.C 0.014669319 0.012733646 1.1520124 2.493368e-01
clarity^4 0.027766656 0.008784352 3.1609225 1.576249e-03
clarity^5 -0.031916212 0.005434514 -5.8728738 4.385836e-09
clarity^6 0.003396417 0.003456065 0.9827409 3.257530e-01
clarity^7 0.022803912 0.002683709 8.4971615 2.148901e-17
color.L -0.429337732 0.004792249 -89.5900371 0.000000e+00
color.Q -0.084722030 0.004508298 -18.7924647 8.926012e-78
color.C -0.011096431 0.004033515 -2.7510577 5.948394e-03
color^4 0.012091840 0.003474594 3.4800729 5.028914e-04
color^5 0.002504757 0.003154540 0.7940166 4.272000e-01
color^6 0.002312288 0.002724583 0.8486758 3.960771e-01
cut.L 0.042693132 0.009114784 4.6839436 2.842060e-06
cut.Q 0.055716298 0.007823117 7.1220077 1.118607e-12
cut.C -0.086640086 0.005458199 -15.8733834 3.195591e-56
cut^4 -0.038436130 0.003415515 -11.2533924 3.025503e-29
show_summary(model)
r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC deviance df.residual nobs
0.8204785 0.8202344 0.1282019 3360.741 0 18 8429.219 -16818.44 -16668.6 217.5433 13236 13255
show_plots(model)

rmse <- sqrt(mean((df_small_test$price - df_small_test$pred)^2))
show_rmse(rmse)
Root_Mean_Squared_Error
105.8284

Indeed, we are now able to predict the price of small diamonds to within about $106 on average.

Medium Diamonds

model <- lm(log(price) ~ log(carat) + clarity + color + cut, data = df_medium_train)
df_medium_test$pred <- exp(
  predict(model, df_medium_test) + summary(model)$sigma^2 / 2)

df_medium_test %>%
  ggplot(aes(x = carat, y = price)) +
  geom_point(color = "#456882", alpha = .3) +
  geom_point(aes(y = pred), color = "#25b9c7", alpha = .1) +
  labs(
    x = "Carat", 
    y = "Price ($)", 
    title = "Four C's to Predict Price for Medium Diamonds")

We would expect a better fit here as well.

show_coefficients(model)
term estimate std.error statistic p.value
(Intercept) 8.472276362 0.001461528 5796.863213 0.000000e+00
log(carat) 1.946553488 0.002237492 869.970969 0.000000e+00
clarity.L 0.910368088 0.004766595 190.989167 0.000000e+00
clarity.Q -0.224320276 0.004510679 -49.730933 0.000000e+00
clarity.C 0.109049382 0.003951640 27.595978 3.289869e-165
clarity^4 -0.060553703 0.003292588 -18.390912 4.741829e-75
clarity^5 0.027540132 0.002751180 10.010298 1.517718e-23
clarity^6 0.009609196 0.002326057 4.131110 3.621647e-05
clarity^7 0.031858018 0.001917784 16.611889 1.204182e-61
color.L -0.455389254 0.002529034 -180.064537 0.000000e+00
color.Q -0.096608578 0.002326115 -41.532154 0.000000e+00
color.C -0.009809832 0.002199902 -4.459213 8.261142e-06
color^4 0.017047308 0.002046355 8.330571 8.444168e-17
color^5 -0.004226729 0.001923012 -2.197973 2.795997e-02
color^6 0.002099278 0.001750096 1.199522 2.303362e-01
cut.L 0.121306883 0.002678300 45.292489 0.000000e+00
cut.Q -0.030763807 0.002393821 -12.851338 1.106191e-37
cut.C 0.035591181 0.002135963 16.662823 5.191683e-62
cut^4 0.016480758 0.001762875 9.348794 9.567782e-21
show_summary(model)
r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC deviance df.residual nobs
0.9687576 0.9687356 0.1179869 44020.75 0 18 18377.14 -36714.28 -36551.3 355.7348 25554 25573
show_plots(model)

rmse <- sqrt(mean((df_medium_test$price - df_medium_test$pred)^2))
show_rmse(rmse)
Root_Mean_Squared_Error
715.2202

RMSE for medium diamonds improved from $1643 to $715 by adding the categorical variables.

Back To One Model

Dropping Large Values

cap <- 2
df_capped <- filter(df, carat <= cap)

df_split <- initial_split(df_capped)
df_train <- training(df_split)
df_test <- testing(df_split)

Model Version #5 (Log Scale & Categorical Parameters & Capped Carat Size)

model <- lm(log(price) ~ log(carat) + 
  color + cut + clarity, data = df_train)

df_test$pred <- exp(
  predict(model, df_test) + summary(model)$sigma^2 / 2)

df_test %>%
  ggplot(aes(x = carat, y = price)) +
  geom_point(color = "#456882", alpha = .3) +
  geom_point(aes(y = pred), color = "#25b9c7", alpha = .1) +
  labs(
    x = "Carat", 
    y = "Price ($)", 
    title = paste("Four C's to Predict Price for Diamonds Under", cap, "Carats"))

These predictions look quite good, though the model may be subject to overfitting.

show_coefficients(model)
term estimate std.error statistic p.value
(Intercept) 8.4669821380 0.001400897 6043.97383408 0.000000e+00
log(carat) 1.8908563404 0.001348136 1402.57072135 0.000000e+00
color.L -0.4384929651 0.002383146 -183.99754730 0.000000e+00
color.Q -0.0943794959 0.002212800 -42.65161999 0.000000e+00
color.C -0.0132793291 0.002062571 -6.43824235 1.222665e-10
color^4 0.0137867074 0.001886154 7.30942952 2.734131e-13
color^5 -0.0006587587 0.001769239 -0.37234021 7.096416e-01
color^6 0.0017391644 0.001593453 1.09144393 2.750843e-01
cut.L 0.1204918074 0.002729975 44.13659667 0.000000e+00
cut.Q -0.0311158935 0.002401581 -12.95641942 2.592975e-38
cut.C 0.0103211965 0.002082851 4.95532180 7.251261e-07
cut^4 -0.0003912555 0.001668611 -0.23447979 8.146138e-01
clarity.L 0.8949920616 0.004225872 211.78875378 0.000000e+00
clarity.Q -0.2179366281 0.003966771 -54.94055714 0.000000e+00
clarity.C 0.1109633901 0.003384419 32.78654112 1.336951e-232
clarity^4 -0.0528129209 0.002688557 -19.64359180 1.700975e-85
clarity^5 0.0189473858 0.002166131 8.74710819 2.274685e-18
clarity^6 -0.0001205286 0.001863563 -0.06467643 9.484320e-01
clarity^7 0.0332776142 0.001643420 20.24900391 1.062312e-90
show_summary(model)
r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC deviance df.residual nobs
0.9816212 0.9816127 0.1317789 115746.3 0 18 23725.86 -47411.71 -47240.27 677.4008 39008 39027
show_plots(model)

rmse <- sqrt(mean((df_test$price - df_test$pred)^2))
show_rmse(rmse)
Root_Mean_Squared_Error
610.6686
# Training RMSE to test for overfitting
df_train$pred <- exp(
  predict(model, df_train) + summary(model)$sigma^2 / 2)

rmse <- sqrt(mean((df_train$price - df_train$pred)^2))
show_rmse(rmse)
Root_Mean_Squared_Error
622.7129

Since the RMSE values for the training and testing data are similar, it is unlikely that the model is overfit. This model can likely predict the diamond price to within around $610 on average.
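
The same train-versus-test comparison on simulated data shows the pattern we want to see: when a model is not overfit, the two RMSE values land close together. A toy example (made-up data, not the diamonds):

```r
# Simulated check: a well-specified lm() gives similar train and test RMSE
set.seed(28)
n <- 2000
sim <- data.frame(x = runif(n))
sim$y <- 2 + 3 * sim$x + rnorm(n, sd = 0.5)

idx <- sample(n, size = 0.75 * n)
fit <- lm(y ~ x, data = sim[idx, ])

rmse <- function(actual, predicted) sqrt(mean((actual - predicted)^2))
rmse_train <- rmse(sim$y[idx], predict(fit, sim[idx, ]))
rmse_test  <- rmse(sim$y[-idx], predict(fit, sim[-idx, ]))

c(train = rmse_train, test = rmse_test)  # both close to the noise sd of 0.5
```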

Conclusions

For diamonds under 2 carats, a model with an R Squared of .98 was reached that predicts the diamond price from Carat, Clarity, Color, and Cut with an RMSE of $610.

For diamonds smaller than .5 carats, model version 4 was able to predict the price with an RMSE of $106.

Unanswered Questions:

There appears to be a price range around $1500 in which almost no diamonds are listed for sale.

References

(1) Reverse Transformation For The Mean For Log Transformed Data

https://stats.stackexchange.com/questions/359088/correcting-log-transformation-bias-in-a-linear-model

E(Y) = e^(mu + sigma^2 / 2)