Rookie All-Star Modeling

The goal of this project is twofold: 1) predict if a player will become an all-star based on their rookie offensive stats and 2) determine clusters of rookies.

At the end of the day, this is a fairly limited analysis, though we can see some interesting patterns in the data.

options(warn=-1)
setwd("C:/Users/Micah/Desktop/applied_data_mining")
set.seed(19)

library(ggplot2)
library(lattice)
library(caret)
library(pROC)
library(plyr)
library(rpart)
library(rattle)
library(cluster)
library(data.table)
library(MASS)
library(colorRamps)
library(nFactors)
library(gplots)
library(RColorBrewer)
library(semPlot)
library(waffle)
library(extrafont)

font_import()
## Importing fonts may take a few minutes, depending on the number of fonts and the speed of the system.
## Continue? [y/n]

Cleaning Functions

subset_to_rookie_year <- function(df){
  # Define rookie season as first yearID in which player had more than 100 ABs.
  df[, 'yearID'] <- sapply(df[, 'yearID'], as.numeric)
  eligible_df <- df[ which(df$AB > 100), ]
  rookie_df <- aggregate(eligible_df$yearID, by=list(eligible_df$playerID), min)
  colnames(rookie_df) <- c('playerID', 'yearID')
  df <- merge(df, rookie_df, by=c('playerID', 'yearID'))
  df <- df[!duplicated(df$playerID),]
  # For simplicity, remove small number of players with 100+ ABs for two teams 
  # in their rookie season. 
  df <- df[ which(df$AB > 100), ]
  return(df)
}


# Only use the last few decades of players.
# Do not use players who are too recent - they may still become all-stars. 
subset_to_between_1970_and_2010 <- function(df){
  df <- df[ which(df$yearID >= 1970 & df$yearID <= 2010), ]
  return(df)
}


count_all_star_appearances <- function(all_stars, batting){
  all_star_temp <- all_stars[,c('playerID', 'yearID')]
  all_star_temp$rookie_all_star_appearance <- 'yes'
  batting <- merge(batting, all_star_temp, by=c('playerID', 'yearID'), all.x=TRUE)
  batting$rookie_all_star_appearance[is.na(batting$rookie_all_star_appearance)] <- 'no'
  
  batting$rookie_id <- 'yes'
  all_stars_non_rookie <- merge(all_stars, batting, by=c('playerID', 'yearID'), all.x=TRUE)
  all_stars_non_rookie$rookie_id[is.na(all_stars_non_rookie$rookie_id)] <- 'no'
  all_stars_non_rookie <- all_stars_non_rookie[ which(all_stars_non_rookie$rookie_id == 'no'), ]
  
  all_stars_non_rookie <- all_stars_non_rookie[c('playerID')]
  all_stars_non_rookie <- as.data.frame(table(all_stars_non_rookie))
  colnames(all_stars_non_rookie) <- c('playerID', 'all_star')
  
  merged_df <- merge(batting, all_stars_non_rookie, by='playerID', all.x=TRUE)
  merged_df$all_star[merged_df$all_star > 0] <- "yes"
  merged_df$all_star[merged_df$all_star != 'yes'] <- "no"
  merged_df$all_star[is.na(merged_df$all_star)] <- 'no'
  return(merged_df)
}


create_name_to_id_mapping <- function(df){
  df$playerName <- paste(df$nameFirst, ' ', df$nameLast)
  df <- df[c('playerID', 'playerName')]
  return(df)
}


calculate_slg_obp_obp_and_avg <- function(df){
  df[is.na(df)] <- 0
  df$avg <- df$H / df$AB
  df$obp <- (df$H + df$BB + df$HBP) / (df$AB + df$BB + df$HBP + df$SF)
  df$slg <- (df$H + (df$X2B + df$X3B + df$HR) + (df$X2B * 2) + 
              (df$X3B * 3 + df$HR * 4)) / df$AB
  df[is.na(df)] <- 0
  return(df)
}


select_columns_for_modeling <- function(df){
  df <- subset(df, select=c(G, AB, R, H, X2B, X3B, HR, RBI, SB, BB, SO, avg, obp, slg, 
                            all_star, playerID))
  return(df)
}


drop_player_id <- function(df){
  drop <- c('playerID')
  df <- df[ , !(names(df) %in% drop)]
  return(df)
}

Exploration Functions

count_factor_occurrences_by_target <- function(df, feature, target, title){
  print(ggplot(df, aes_string(feature, fill = target)) +
          geom_bar() + ggtitle(title))
}


make_histogram_by_target <- function(df, feature, target, title, bins){
  print(ggplot(df, aes_string(feature, fill = target)) +
          geom_histogram(binwidth = bins) + ggtitle(title))
}


make_parallel_coordinates <- function(df, feature, cuts){
  c <- blue2red(cuts)
  r <- cut(feature, cuts)
  parcoord(df, col=c[as.numeric(r)])
}

Factor Analysis Functions

make_scree_table_for_factor_analysis <- function(df){
  nScree(df)
}


get_eigenvalues <- function(df){
  eigen(cor(df))
}


build_factor_analysis_model <- function(df, n_factors){
  fa <- factanal(df, factors = n_factors, lower = 0.01)
  print(fa)
  return(fa)
}


make_factor_analysis_heatmp <- function(fa){
  heatmap.2(fa$loadings, col = brewer.pal(9, "Greens"), trace = "none",
            key = FALSE, dend = 'none', Colv = FALSE, cexCol = 1.2,
            main = "Factor Loadings")
}


make_factor_analysis_sem_plot <- function(fa){
  semPaths(fa, what = "est", residuals = FALSE, cut = 0.4,
           posCol = c("white", "darkgreen"), 
           negCol = c("white", "red"),
           edge.label.cex = 0.60, nCharNodes = 7)
}

Supervised Machine Learning Functions

train_random_forest <- function(train_df, target){
  control <- trainControl(method="repeatedcv", number=3, repeats=3, classProbs=TRUE)
  mtry <- c(sqrt(ncol(train_df)), log2(ncol(train_df)))
  grid <- expand.grid(.mtry=mtry)
  formula <- as.formula(paste(target, "~ ."))
  
  model <- train(formula, 
                 data=train_df, 
                 preProcess=c("center", "scale"),
                 method="rf", 
                 metric="ROC",
                 trControl=control, 
                 tuneGrid=grid,
                 allowParallel=TRUE,
                 num.threads=4)
  return(model)
}


train_log_reg <- function(train_df, target){
  control <- trainControl(method="repeatedcv", number=10, repeats=3, classProbs=TRUE)
  grid <- expand.grid(parameter=c(0.001, 0.01, 0.1, 1,10, 100))
  formula <- as.formula(paste(target, "~ ."))
  
  model <- train(formula, 
                 data=train_df, 
                 preProcess=c("center", "scale"),
                 method="glm", 
                 family="binomial", 
                 metric="ROC",
                 trControl=control, 
                 tuneGrid=grid)
  
  return(model)
}


train_decision_tree <- function(train_df, target){
  control <- trainControl(method="repeatedcv", number=10, repeats=3, classProbs=TRUE)
  grid <- expand.grid(.maxdepth=c(3, 5, 7, 10))
  formula <- as.formula(paste(target, "~ ."))
  
  model <- train(formula, 
                 data=train_df, 
                 preProcess=c("center", "scale"),
                 method="rpart2", 
                 metric="ROC",
                 trControl=control, 
                 tuneGrid=grid)
  return(model)
}


train_gradient_boosting <- function(train_df, target){
  control <- trainControl(method="repeatedcv", number=10, repeats=3, classProbs=TRUE)
  grid <- expand.grid(interaction.depth = c(1, 3, 5), 
              n.trees = c(50, 100, 150), 
              shrinkage = 0.1,
              n.minobsinnode = 20)
  
  formula <- as.formula(paste(target, "~ ."))
  
  model <- train(formula, 
                 data=train_df, 
                 preProcess=c("center", "scale"),
                 method="gbm", 
                 metric="ROC",
                 verbose=F,
                 trControl=control, 
                 tuneGrid=grid)
  return(model)
}


plot_decision_tree <- function(df, target, depth){
  formula <- as.formula(paste(target, "~ ."))
  set.seed(19)
  tree <- rpart(formula, method="class", maxdepth=depth, data=df)
  printcp(tree)
  print(tree)
  fancyRpartPlot(tree)
}


plot_model <- function(model){
  plot(model)
}


print_grid_search_results <- function(model){
  model$bestTune
  results <- model$results
  results 
}


print_confusion_matrix <- function(model, df, target){
  predictions <- predict(model, df)
  con_matrix <- confusionMatrix(predictions, target, positive = 'yes')   
  con_matrix
}


get_roc_auc <- function(model, df, target){
  probabilities <- predict(model, df, type="prob")
  
  ROC <- roc(predictor=probabilities$yes,
             response=target)
  print(ROC$auc)
  plot(ROC, main="ROC")
  return(ROC)
}


get_variable_importances <- function(model){
  varImp(model)
}

Unsupervised Machine Learning Functions

scale_dataframe <- function(df){
  df[, -c(3)] <- scale(df[, -c(3)])
  df <- data.frame(df)
  return(df)
}


plot_within_cluster_sum_of_squares <- function(df, title){
  wss <- (nrow(df)-1) * sum(apply(df, 2, var))
  for (i in 2:15) wss[i] <- sum(kmeans(df, centers=i)$withinss)
  plot(1:15, wss, type="b", xlab="Number of Clusters",
       ylab="Within groups sum of squares", main = paste(title,' elbow plot'))
}


train_k_means_model <- function(df, k){
  set.seed(19)
  model <- kmeans(df, k, nstart=25)
  return(model)
}


plot_k_means_model <- function(model, df, title){
  clusplot(df, model$cluster, color=TRUE, shade=TRUE,
           labels=2, lines=0, main = paste(title,' PCA Plot of K-Means'))
}


create_hclust_and_plot <- function(df){
  set.seed(19)
  dm = dist(df,method="euclidean")
  hclust_model <- hclust(dm, method="complete")
  plot(hclust_model)
  return(hclust_model)
}


summarize_clusters <- function(df){
  cluster1 <- df[which(df$k_means.cluster=='1'),]
  cluster2 <- df[which(df$k_means.cluster=='2'),]
  cluster3 <- df[which(df$k_means.cluster=='3'),]
  
  print('cluster 1 summary')
  print(summary(cluster1))
  print('cluster 2 summary')
  print(summary(cluster2))
  print('cluster 3 summary')
  print(summary(cluster3))
}

Execution

Read in data

all_star_df <- read.csv('data/AllstarFull.csv')
batting_df <- read.csv('data/Batting.csv')
people_df <- read.csv('data/People.csv')

Data cleaning

batting_df <- subset_to_rookie_year(batting_df)
batting_df <- subset_to_between_1970_and_2010(batting_df)
batting_df <- count_all_star_appearances(all_star_df, batting_df)
batting_df <- calculate_slg_obp_obp_and_avg(batting_df)
batting_df <- select_columns_for_modeling(batting_df)
batting_df_copy <- batting_df
batting_df <- drop_player_id(batting_df)

Data Exploration

agg_cols_for_hist <- c('G', 'H', 'X2B', 'HR', 'RBI', 'SB')
for (column in agg_cols_for_hist){
  make_histogram_by_target(batting_df, column, 'all_star', 
                           paste(column,' histogram by all star'), 10)
}

rate_cols_for_hist <- c('avg', 'obp', 'slg')
for (column in rate_cols_for_hist){
  make_histogram_by_target(batting_df, column, 'all_star', 
                           paste(column,' histogram by all star'), .1)
}

# home runs paralell coordinates
make_parallel_coordinates(batting_df[1:14], batting_df$HR, 20)

# obp paralell coordinates
make_parallel_coordinates(batting_df[1:14], batting_df$obp, 20)

# hits paralell coordinates
make_parallel_coordinates(batting_df[1:14], batting_df$H, 20)

# slg paralell coordinates
make_parallel_coordinates(batting_df[1:14], batting_df$slg, 20)

Factor Analysis

make_scree_table_for_factor_analysis(batting_df[1:14])
##   noc naf nparallel nkaiser
## 1   3   1         3       3
get_eigenvalues(batting_df[1:14])
## $values
##  [1] 8.249808403 1.854338281 1.395355460 0.685687678 0.554203569
##  [6] 0.437636506 0.234931186 0.224893153 0.140682786 0.085785935
## [11] 0.054002627 0.043706253 0.034306568 0.004661595
## 
## $vectors
##             [,1]        [,2]         [,3]         [,4]        [,5]
##  [1,] -0.3004331  0.22552342  0.002991965 -0.214291356 -0.20908291
##  [2,] -0.3251658  0.20256023  0.008520668 -0.087614482 -0.21942073
##  [3,] -0.3322291  0.09149677  0.072664606  0.014272143  0.02906008
##  [4,] -0.3320129  0.09386681  0.082728669 -0.077402592 -0.26707296
##  [5,] -0.3090893 -0.01617361 -0.089239947 -0.004405929 -0.30455914
##  [6,] -0.2141106  0.12724805  0.348745639  0.566515090 -0.07910700
##  [7,] -0.2532477 -0.16026550 -0.460780197  0.184045072  0.23455915
##  [8,] -0.3157035 -0.05599209 -0.248637445 -0.002842869 -0.09575287
##  [9,] -0.1645273  0.25745151  0.496860370  0.250937159  0.47108337
## [10,] -0.2834618  0.07026278 -0.017809804 -0.433516479  0.44767756
## [11,] -0.2794128  0.15070538 -0.265222056  0.115119859  0.26886264
## [12,] -0.1741356 -0.48821838  0.374024681 -0.051084340 -0.30215863
## [13,] -0.1802661 -0.48063767  0.308843148 -0.371532047  0.27393383
## [14,] -0.1729315 -0.53136815 -0.176803007  0.424928679  0.10864709
##              [,6]         [,7]         [,8]        [,9]        [,10]
##  [1,] -0.02260710 -0.132918557  0.073247444  0.82866115  0.117395503
##  [2,]  0.04654594 -0.049925842  0.008121485 -0.06269410 -0.115759642
##  [3,]  0.08216492 -0.005373923  0.141127312 -0.14317407 -0.661216214
##  [4,]  0.08859406 -0.083719295  0.025905815 -0.16922713 -0.127925992
##  [5,]  0.14337766  0.721473104 -0.123363058 -0.15480659  0.130758741
##  [6,] -0.66601937 -0.056150991  0.079160662 -0.05721246  0.088637853
##  [7,]  0.12030128 -0.345758070  0.306311390 -0.04117933 -0.146299231
##  [8,]  0.05358724 -0.209325916  0.228804563 -0.28719137  0.643289987
##  [9,]  0.55814528  0.055640613  0.084156029  0.04418578  0.216009589
## [10,] -0.34447179  0.225734361  0.188871195 -0.08344856  0.005853165
## [11,] -0.05046551 -0.149869540 -0.843018472 -0.00437432  0.017853242
## [12,]  0.18113704 -0.322380880 -0.199041868 -0.07256980 -0.018510412
## [13,] -0.18403031 -0.015351012 -0.095160330  0.01798346  0.075267759
## [14,]  0.04275459  0.324163645  0.064883823  0.36649091 -0.077749979
##              [,11]       [,12]        [,13]         [,14]
##  [1,]  0.156817419  0.01008885  0.134504983 -0.0494372667
##  [2,] -0.344207453 -0.14592765 -0.451773086  0.6568209315
##  [3,]  0.361156603  0.48923254  0.128071769  0.0658505782
##  [4,] -0.240386079 -0.11481465 -0.349081662 -0.7371808459
##  [5,]  0.273125021 -0.28783110  0.217895508  0.0343269306
##  [6,]  0.082353918 -0.11969163  0.061609080  0.0104300563
##  [7,]  0.150319319 -0.56298225  0.135453728  0.0114920150
##  [8,]  0.116276488  0.46362841 -0.029821344  0.0386872680
##  [9,] -0.023818168 -0.06321212 -0.003121233  0.0008024401
## [10,] -0.448366476  0.02153507  0.332770603 -0.0390386657
## [11,]  0.001338981  0.07298571  0.023370218 -0.0378598948
## [12,] -0.253233594 -0.04068682  0.489347379  0.1015157019
## [13,]  0.429695122 -0.13033954 -0.419767278  0.0324633607
## [14,] -0.317565203  0.26185694 -0.205134930 -0.0337207140
batting_factor_analysis <- build_factor_analysis_model(batting_df[1:14], 3)
## 
## Call:
## factanal(x = df, factors = n_factors, lower = 0.01)
## 
## Uniquenesses:
##     G    AB     R     H   X2B   X3B    HR   RBI    SB    BB    SO   avg 
## 0.162 0.010 0.110 0.010 0.218 0.612 0.059 0.089 0.675 0.400 0.282 0.010 
##   obp   slg 
## 0.338 0.232 
## 
## Loadings:
##     Factor1 Factor2 Factor3
## G    0.860   0.312         
## AB   0.925   0.356         
## R    0.813   0.426   0.220 
## H    0.898   0.337   0.269 
## X2B  0.714   0.461   0.245 
## X3B  0.593           0.178 
## HR   0.258   0.923   0.151 
## RBI  0.605   0.701   0.231 
## SB   0.556                 
## BB   0.642   0.428         
## SO   0.588   0.609         
## avg  0.245           0.963 
## obp  0.177   0.161   0.777 
## slg          0.647   0.588 
## 
##                Factor1 Factor2 Factor3
## SS loadings      5.518   3.088   2.192
## Proportion Var   0.394   0.221   0.157
## Cumulative Var   0.394   0.615   0.771
## 
## Test of the hypothesis that 3 factors are sufficient.
## The chi square statistic is 6744.37 on 52 degrees of freedom.
## The p-value is 0
make_factor_analysis_heatmp(batting_factor_analysis)

make_factor_analysis_sem_plot(batting_factor_analysis)

Classification Models

Class Imbalance

# This is not perfectly to scale but close enough to be useful. 
waffle(c(all_star = 52, non_all_star = 233), rows = 19, 
       title = "Target Distribution")

Train-Test Splits

partition <- createDataPartition(batting_df$all_star, p = 0.7, list=FALSE)
train_df <- batting_df[partition, ]
test_df <- batting_df[-partition, ]

Decision Tree

decision_tree <- train_decision_tree(train_df, 'all_star')
plot_model(decision_tree)

print_grid_search_results(decision_tree)
##   maxdepth  Accuracy     Kappa AccuracySD    KappaSD
## 1        3 0.7775736 0.1498455 0.02189015 0.07485220
## 2        5 0.7755173 0.1535988 0.01721050 0.06187979
## 3        7 0.7724271 0.1520060 0.01902593 0.07211831
## 4       10 0.7742789 0.1618712 0.01673774 0.06819085
print_confusion_matrix(decision_tree, test_df, test_df$all_star)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  no yes
##        no  526 141
##        yes  10  16
##                                           
##                Accuracy : 0.7821          
##                  95% CI : (0.7495, 0.8123)
##     No Information Rate : 0.7734          
##     P-Value [Acc > NIR] : 0.3111          
##                                           
##                   Kappa : 0.1181          
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.10191         
##             Specificity : 0.98134         
##          Pos Pred Value : 0.61538         
##          Neg Pred Value : 0.78861         
##              Prevalence : 0.22655         
##          Detection Rate : 0.02309         
##    Detection Prevalence : 0.03752         
##       Balanced Accuracy : 0.54163         
##                                           
##        'Positive' Class : yes             
## 
tree_roc <- get_roc_auc(decision_tree, test_df, test_df$all_star)
## Area under the curve: 0.5845

plot_decision_tree(train_df, 'all_star', 3)
## 
## Classification tree:
## rpart(formula = formula, data = df, method = "class", maxdepth = depth)
## 
## Variables actually used in tree construction:
## [1] avg G   H  
## 
## Root node error: 367/1620 = 0.22654
## 
## n= 1620 
## 
##         CP nsplit rel error  xerror     xstd
## 1 0.031789      0   1.00000 1.00000 0.045908
## 2 0.010000      3   0.90463 0.93188 0.044756
## n= 1620 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 1620 367 no (0.7734568 0.2265432)  
##    2) H< 116.5 1396 256 no (0.8166189 0.1833811) *
##    3) H>=116.5 224 111 no (0.5044643 0.4955357)  
##      6) G< 156.5 204  93 no (0.5441176 0.4558824)  
##       12) avg< 0.2941119 155  59 no (0.6193548 0.3806452) *
##       13) avg>=0.2941119 49  15 yes (0.3061224 0.6938776) *
##      7) G>=156.5 20   2 yes (0.1000000 0.9000000) *

Random Forest

random_forest <- train_random_forest(train_df, 'all_star')
plot_model(random_forest)

print_grid_search_results(random_forest)
##       mtry  Accuracy     Kappa AccuracySD    KappaSD
## 1 3.872983 0.7781886 0.1775086 0.01084145 0.04207982
## 2 3.906891 0.7792178 0.1781382 0.01009534 0.03001527
print_confusion_matrix(random_forest, test_df, test_df$all_star)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  no yes
##        no  518 133
##        yes  18  24
##                                           
##                Accuracy : 0.7821          
##                  95% CI : (0.7495, 0.8123)
##     No Information Rate : 0.7734          
##     P-Value [Acc > NIR] : 0.3111          
##                                           
##                   Kappa : 0.161           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.15287         
##             Specificity : 0.96642         
##          Pos Pred Value : 0.57143         
##          Neg Pred Value : 0.79570         
##              Prevalence : 0.22655         
##          Detection Rate : 0.03463         
##    Detection Prevalence : 0.06061         
##       Balanced Accuracy : 0.55964         
##                                           
##        'Positive' Class : yes             
## 
forest_roc <- get_roc_auc(random_forest, test_df, test_df$all_star)
## Area under the curve: 0.6496

get_variable_importances(random_forest)
## rf variable importance
## 
##     Overall
## G    100.00
## slg   97.65
## H     88.13
## AB    86.04
## RBI   80.95
## R     80.64
## obp   80.56
## avg   79.19
## SO    55.91
## BB    49.14
## X2B   37.37
## SB    28.71
## HR    23.32
## X3B    0.00

Gradient Boosting

gradient_boosting <- train_gradient_boosting(train_df, 'all_star')
plot_model(gradient_boosting)

print_grid_search_results(gradient_boosting)
##   shrinkage interaction.depth n.minobsinnode n.trees  Accuracy     Kappa
## 1       0.1                 1             20      50 0.7866273 0.1608468
## 4       0.1                 3             20      50 0.7847653 0.1833904
## 7       0.1                 5             20      50 0.7816929 0.1882807
## 2       0.1                 1             20     100 0.7862145 0.1773515
## 5       0.1                 3             20     100 0.7835219 0.1997769
## 8       0.1                 5             20     100 0.7738649 0.1891862
## 3       0.1                 1             20     150 0.7866260 0.1842003
## 6       0.1                 3             20     150 0.7773464 0.2004193
## 9       0.1                 5             20     150 0.7693330 0.1961379
##   AccuracySD    KappaSD
## 1 0.01357915 0.06110472
## 4 0.01774190 0.06735603
## 7 0.01927243 0.07705919
## 2 0.01423996 0.06377382
## 5 0.02021889 0.07906791
## 8 0.02398342 0.09344342
## 3 0.01591576 0.06131095
## 6 0.02132703 0.07398816
## 9 0.02550751 0.08945573
print_confusion_matrix(gradient_boosting, test_df, test_df$all_star)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  no yes
##        no  522 139
##        yes  14  18
##                                           
##                Accuracy : 0.7792          
##                  95% CI : (0.7465, 0.8096)
##     No Information Rate : 0.7734          
##     P-Value [Acc > NIR] : 0.3782          
##                                           
##                   Kappa : 0.1232          
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.11465         
##             Specificity : 0.97388         
##          Pos Pred Value : 0.56250         
##          Neg Pred Value : 0.78971         
##              Prevalence : 0.22655         
##          Detection Rate : 0.02597         
##    Detection Prevalence : 0.04618         
##       Balanced Accuracy : 0.54427         
##                                           
##        'Positive' Class : yes             
## 
gb_roc <- get_roc_auc(gradient_boosting, test_df, test_df$all_star)
## Area under the curve: 0.63

get_variable_importances(gradient_boosting)
## gbm variable importance
## 
##     Overall
## R    100.00
## slg   89.54
## RBI   84.23
## H     74.38
## AB    45.15
## G     38.65
## HR    30.22
## avg   25.09
## obp   20.03
## X2B   11.64
## SO    10.93
## SB    10.63
## X3B    0.00
## BB     0.00

Logistic Regression

log_reg <- train_log_reg(train_df, 'all_star')
print_grid_search_results(log_reg)
##   parameter  Accuracy    Kappa AccuracySD    KappaSD
## 1     1e-03 0.7753127 0.154482 0.01962346 0.06788572
## 2     1e-02 0.7753127 0.154482 0.01962346 0.06788572
## 3     1e-01 0.7753127 0.154482 0.01962346 0.06788572
## 4     1e+00 0.7753127 0.154482 0.01962346 0.06788572
## 5     1e+01 0.7753127 0.154482 0.01962346 0.06788572
## 6     1e+02 0.7753127 0.154482 0.01962346 0.06788572
print_confusion_matrix(log_reg, test_df, test_df$all_star)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  no yes
##        no  517 132
##        yes  19  25
##                                           
##                Accuracy : 0.7821          
##                  95% CI : (0.7495, 0.8123)
##     No Information Rate : 0.7734          
##     P-Value [Acc > NIR] : 0.3111          
##                                           
##                   Kappa : 0.166           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.15924         
##             Specificity : 0.96455         
##          Pos Pred Value : 0.56818         
##          Neg Pred Value : 0.79661         
##              Prevalence : 0.22655         
##          Detection Rate : 0.03608         
##    Detection Prevalence : 0.06349         
##       Balanced Accuracy : 0.56189         
##                                           
##        'Positive' Class : yes             
## 
lr_roc <- get_roc_auc(log_reg, test_df, test_df$all_star)
## Area under the curve: 0.6762

Clustering

Data Preparation

rownames(batting_df_copy) <- batting_df_copy$playerID
batting_df_copy <- subset(batting_df_copy, select=-c(playerID, all_star))
batting_df_scaled <- scale_dataframe(batting_df_copy)

K-Means Clustering

plot_within_cluster_sum_of_squares(batting_df_scaled, 'rookie batting data')

k_means <- train_k_means_model(batting_df_scaled, 3)
plot_k_means_model(k_means, batting_df_scaled, 'rookie batting')

batting_df_copy <- data.frame(batting_df_copy, k_means$cluster)
summarize_clusters(batting_df_copy)
## [1] "cluster 1 summary"
##        G               AB              R                H        
##  Min.   : 86.0   Min.   :280.0   Min.   : 57.00   Min.   : 73.0  
##  1st Qu.:124.0   1st Qu.:427.8   1st Qu.: 62.00   1st Qu.:116.0  
##  Median :138.0   Median :485.0   Median : 69.00   Median :134.0  
##  Mean   :135.2   Mean   :486.7   Mean   : 71.96   Mean   :134.6  
##  3rd Qu.:150.0   3rd Qu.:545.8   3rd Qu.: 78.75   3rd Qu.:150.0  
##  Max.   :163.0   Max.   :701.0   Max.   :127.00   Max.   :242.0  
##       X2B             X3B               HR             RBI        
##  Min.   : 8.00   Min.   : 0.000   Min.   : 0.00   Min.   : 12.00  
##  1st Qu.:18.00   1st Qu.: 2.000   1st Qu.: 6.00   1st Qu.: 45.00  
##  Median :24.00   Median : 4.000   Median :13.00   Median : 57.00  
##  Mean   :23.94   Mean   : 4.341   Mean   :13.35   Mean   : 59.44  
##  3rd Qu.:29.00   3rd Qu.: 6.000   3rd Qu.:19.00   3rd Qu.: 73.00  
##  Max.   :47.00   Max.   :19.000   Max.   :49.00   Max.   :130.00  
##        SB               BB               SO              avg        
##  Min.   :  0.00   Min.   : 10.00   Min.   : 21.00   Min.   :0.2186  
##  1st Qu.:  4.00   1st Qu.: 34.00   1st Qu.: 65.00   1st Qu.:0.2596  
##  Median : 11.00   Median : 44.50   Median : 83.00   Median :0.2772  
##  Mean   : 14.87   Mean   : 46.14   Mean   : 85.72   Mean   :0.2766  
##  3rd Qu.: 21.00   3rd Qu.: 57.00   3rd Qu.:105.00   3rd Qu.:0.2914  
##  Max.   :110.00   Max.   :105.00   Max.   :185.00   Max.   :0.3509  
##       obp              slg         k_means.cluster
##  Min.   :0.2684   Min.   :0.2933   Min.   :1      
##  1st Qu.:0.3235   1st Qu.:0.5288   1st Qu.:1      
##  Median :0.3417   Median :0.6003   Median :1      
##  Mean   :0.3426   Mean   :0.5997   Mean   :1      
##  3rd Qu.:0.3603   3rd Qu.:0.6716   3rd Qu.:1      
##  Max.   :0.4101   Max.   :0.9268   Max.   :1      
## [1] "cluster 2 summary"
##        G               AB              R               H         
##  Min.   : 40.0   Min.   :134.0   Min.   :30.00   Min.   : 36.00  
##  1st Qu.: 89.0   1st Qu.:260.0   1st Qu.:34.00   1st Qu.: 67.00  
##  Median :105.0   Median :317.5   Median :41.00   Median : 84.00  
##  Mean   :104.5   Mean   :320.6   Mean   :41.01   Mean   : 83.87  
##  3rd Qu.:122.0   3rd Qu.:377.2   3rd Qu.:48.00   3rd Qu.: 99.25  
##  Max.   :160.0   Max.   :547.0   Max.   :56.00   Max.   :151.00  
##       X2B            X3B               HR              RBI       
##  Min.   : 2.0   Min.   : 0.000   Min.   : 0.000   Min.   : 8.00  
##  1st Qu.:11.0   1st Qu.: 1.000   1st Qu.: 3.000   1st Qu.:26.00  
##  Median :15.0   Median : 2.000   Median : 6.000   Median :34.00  
##  Mean   :14.8   Mean   : 2.531   Mean   : 6.701   Mean   :35.26  
##  3rd Qu.:19.0   3rd Qu.: 4.000   3rd Qu.:10.000   3rd Qu.:43.00  
##  Max.   :37.0   Max.   :12.000   Max.   :23.000   Max.   :80.00  
##        SB               BB              SO             avg        
##  Min.   : 0.000   Min.   : 5.00   Min.   : 12.0   Min.   :0.1866  
##  1st Qu.: 2.000   1st Qu.:19.00   1st Qu.: 39.0   1st Qu.:0.2436  
##  Median : 4.000   Median :26.00   Median : 54.0   Median :0.2606  
##  Mean   : 6.681   Mean   :27.25   Mean   : 56.6   Mean   :0.2626  
##  3rd Qu.: 9.000   3rd Qu.:34.00   3rd Qu.: 72.0   3rd Qu.:0.2816  
##  Max.   :50.000   Max.   :94.00   Max.   :158.0   Max.   :0.3491  
##       obp              slg         k_means.cluster
##  Min.   :0.2176   Min.   :0.2946   Min.   :2      
##  1st Qu.:0.2999   1st Qu.:0.4665   1st Qu.:2      
##  Median :0.3242   Median :0.5365   Median :2      
##  Mean   :0.3239   Mean   :0.5453   Mean   :2      
##  3rd Qu.:0.3454   3rd Qu.:0.6178   3rd Qu.:2      
##  Max.   :0.4542   Max.   :0.9486   Max.   :2      
## [1] "cluster 3 summary"
##        G                AB              R               H        
##  Min.   : 26.00   Min.   :101.0   Min.   : 2.00   Min.   : 6.00  
##  1st Qu.: 47.00   1st Qu.:121.0   1st Qu.:13.00   1st Qu.:29.00  
##  Median : 59.00   Median :150.0   Median :17.00   Median :36.00  
##  Mean   : 62.82   Mean   :163.4   Mean   :17.79   Mean   :39.48  
##  3rd Qu.: 77.00   3rd Qu.:195.0   3rd Qu.:23.00   3rd Qu.:48.00  
##  Max.   :141.00   Max.   :410.0   Max.   :29.00   Max.   :97.00  
##       X2B              X3B               HR              RBI       
##  Min.   : 0.000   Min.   :0.0000   Min.   : 0.000   Min.   : 2.00  
##  1st Qu.: 4.000   1st Qu.:0.0000   1st Qu.: 1.000   1st Qu.:11.00  
##  Median : 7.000   Median :1.0000   Median : 2.000   Median :15.00  
##  Mean   : 6.939   Mean   :0.9702   Mean   : 3.036   Mean   :16.88  
##  3rd Qu.: 9.000   3rd Qu.:1.0000   3rd Qu.: 4.000   3rd Qu.:22.00  
##  Max.   :21.000   Max.   :7.0000   Max.   :18.000   Max.   :59.00  
##        SB               BB              SO             avg         
##  Min.   : 0.000   Min.   : 0.00   Min.   : 5.00   Min.   :0.05825  
##  1st Qu.: 0.000   1st Qu.: 8.00   1st Qu.:21.00   1st Qu.:0.21739  
##  Median : 1.000   Median :12.00   Median :29.00   Median :0.23973  
##  Mean   : 2.385   Mean   :13.39   Mean   :31.22   Mean   :0.24055  
##  3rd Qu.: 3.000   3rd Qu.:17.00   3rd Qu.:39.00   3rd Qu.:0.26431  
##  Max.   :27.000   Max.   :44.00   Max.   :99.00   Max.   :0.36216  
##       obp              slg          k_means.cluster
##  Min.   :0.1019   Min.   :0.05825   Min.   :3      
##  1st Qu.:0.2750   1st Qu.:0.40187   1st Qu.:3      
##  Median :0.3007   Median :0.47964   Median :3      
##  Mean   :0.3005   Mean   :0.48697   Mean   :3      
##  3rd Qu.:0.3263   3rd Qu.:0.55927   3rd Qu.:3      
##  Max.   :0.4348   Max.   :0.94215   Max.   :3
batting_df_copy$k_means.cluster <- as.factor(batting_df_copy$k_means.cluster)

agg_summary_cols <- c('G', 'AB', 'R', 'H', 'X2B', 'X3B', 'HR', 'RBI', 'SB')
for (column in agg_summary_cols){
  make_histogram_by_target(batting_df_copy, column, 'k_means.cluster', 
                           paste(column,' histogram by cluster'), 10)
}

rate_summary_cols <- c('avg', 'obp', 'slg')
for (column in rate_summary_cols){
  make_histogram_by_target(batting_df_copy, column, 'k_means.cluster', 
                           paste(column,' histogram by cluster'), .1)
}

Heirarchical Clustering

hclust_model <- create_hclust_and_plot(batting_df_scaled)

plot(cut(as.dendrogram(hclust_model), h=8)$lower[[4]])

plot(cut(as.dendrogram(hclust_model), h=6)$lower[[15]])