Cleaning Functions

subset_to_rookie_year <- function(df){
  # Define rookie season as first yearID in which player had more than 100 ABs.
  df[, 'yearID'] <- sapply(df[, 'yearID'], as.numeric)
  eligible_df <- df[ which(df$AB > 100), ]
  rookie_df <- aggregate(eligible_df$yearID, by=list(eligible_df$playerID), min)
  colnames(rookie_df) <- c('playerID', 'yearID')
  df <- merge(df, rookie_df, by=c('playerID', 'yearID'))
  df <- df[!duplicated(df$playerID),]
  # For simplicity, remove small number of players with 100+ ABs for two teams 
  # in their rookie season. 
  df <- df[ which(df$AB > 100), ]
  return(df)
}


# Only use the last few decades of players.
# Do not use players who are too recent - they may still become all-stars. 
subset_to_between_1970_and_2010 <- function(df){
  df <- df[ which(df$yearID >= 1970 & df$yearID <= 2010), ]
  return(df)
}


count_all_star_appearances <- function(all_stars, batting){
  all_star_temp <- all_stars[,c('playerID', 'yearID')]
  all_star_temp$rookie_all_star_appearance <- 'yes'
  batting <- merge(batting, all_star_temp, by=c('playerID', 'yearID'), all.x=TRUE)
  batting$rookie_all_star_appearance[is.na(batting$rookie_all_star_appearance)] <- 'no'
  
  batting$rookie_id <- 'yes'
  all_stars_non_rookie <- merge(all_stars, batting, by=c('playerID', 'yearID'), all.x=TRUE)
  all_stars_non_rookie$rookie_id[is.na(all_stars_non_rookie$rookie_id)] <- 'no'
  all_stars_non_rookie <- all_stars_non_rookie[ which(all_stars_non_rookie$rookie_id == 'no'), ]
  
  all_stars_non_rookie <- all_stars_non_rookie[c('playerID')]
  all_stars_non_rookie <- as.data.frame(table(all_stars_non_rookie))
  colnames(all_stars_non_rookie) <- c('playerID', 'all_star')
  
  merged_df <- merge(batting, all_stars_non_rookie, by='playerID', all.x=TRUE)
  merged_df$all_star[merged_df$all_star > 0] <- "yes"
  merged_df$all_star[merged_df$all_star != 'yes'] <- "no"
  merged_df$all_star[is.na(merged_df$all_star)] <- 'no'
  return(merged_df)
}


create_name_to_id_mapping <- function(df){
  df$playerName <- paste(df$nameFirst, ' ', df$nameLast)
  df <- df[c('playerID', 'playerName')]
  return(df)
}


calculate_slg_obp_obp_and_avg <- function(df){
  df[is.na(df)] <- 0
  df$avg <- df$H / df$AB
  df$obp <- (df$H + df$BB + df$HBP) / (df$AB + df$BB + df$HBP + df$SF)
  df$slg <- (df$H + (df$X2B + df$X3B + df$HR) + (df$X2B * 2) + 
              (df$X3B * 3 + df$HR * 4)) / df$AB
  df[is.na(df)] <- 0
  return(df)
}


select_columns_for_modeling <- function(df){
  df <- subset(df, select=c(G, AB, R, H, X2B, X3B, HR, RBI, SB, BB, SO, avg, obp, slg, 
                            all_star, playerID))
  return(df)
}


drop_player_id <- function(df){
  drop <- c('playerID')
  df <- df[ , !(names(df) %in% drop)]
  return(df)
}

Exploration Functions

count_factor_occurrences_by_target <- function(df, feature, target, title){
  print(ggplot(df, aes_string(feature, fill = target)) +
          geom_bar() + ggtitle(title))
}


make_histogram_by_target <- function(df, feature, target, title, bins){
  print(ggplot(df, aes_string(feature, fill = target)) +
          geom_histogram(binwidth = bins) + ggtitle(title))
}


make_parallel_coordinates <- function(df, feature, cuts){
  c <- blue2red(cuts)
  r <- cut(feature, cuts)
  parcoord(df, col=c[as.numeric(r)])
}

Factor Analysis Functions

make_scree_table_for_factor_analysis <- function(df){
  nScree(df)
}


get_eigenvalues <- function(df){
  eigen(cor(df))
}


build_factor_analysis_model <- function(df, n_factors){
  fa <- factanal(df, factors = n_factors, lower = 0.01)
  print(fa)
  return(fa)
}


make_factor_analysis_heatmp <- function(fa){
  heatmap.2(fa$loadings, col = brewer.pal(9, "Greens"), trace = "none",
            key = FALSE, dend = 'none', Colv = FALSE, cexCol = 1.2,
            main = "Factor Loadings")
}


make_factor_analysis_sem_plot <- function(fa){
  semPaths(fa, what = "est", residuals = FALSE, cut = 0.4,
           posCol = c("white", "darkgreen"), 
           negCol = c("white", "red"),
           edge.label.cex = 0.60, nCharNodes = 7)
}

Supervised Machine Learning Functions

train_random_forest <- function(train_df, target){
  control <- trainControl(method="repeatedcv", number=3, repeats=3, classProbs=TRUE)
  mtry <- c(sqrt(ncol(train_df)), log2(ncol(train_df)))
  grid <- expand.grid(.mtry=mtry)
  formula <- as.formula(paste(target, "~ ."))
  
  model <- train(formula, 
                 data=train_df, 
                 preProcess=c("center", "scale"),
                 method="rf", 
                 metric="ROC",
                 trControl=control, 
                 tuneGrid=grid,
                 allowParallel=TRUE,
                 num.threads=4)
  return(model)
}


train_log_reg <- function(train_df, target){
  control <- trainControl(method="repeatedcv", number=10, repeats=3, classProbs=TRUE)
  grid <- expand.grid(parameter=c(0.001, 0.01, 0.1, 1,10, 100))
  formula <- as.formula(paste(target, "~ ."))
  
  model <- train(formula, 
                 data=train_df, 
                 preProcess=c("center", "scale"),
                 method="glm", 
                 family="binomial", 
                 metric="ROC",
                 trControl=control, 
                 tuneGrid=grid)
  
  return(model)
}


train_decision_tree <- function(train_df, target){
  control <- trainControl(method="repeatedcv", number=10, repeats=3, classProbs=TRUE)
  grid <- expand.grid(.maxdepth=c(3, 5, 7, 10))
  formula <- as.formula(paste(target, "~ ."))
  
  model <- train(formula, 
                 data=train_df, 
                 preProcess=c("center", "scale"),
                 method="rpart2", 
                 metric="ROC",
                 trControl=control, 
                 tuneGrid=grid)
  return(model)
}


train_gradient_boosting <- function(train_df, target){
  control <- trainControl(method="repeatedcv", number=10, repeats=3, classProbs=TRUE)
  grid <- expand.grid(interaction.depth = c(1, 3, 5), 
              n.trees = c(50, 100, 150), 
              shrinkage = 0.1,
              n.minobsinnode = 20)
  
  formula <- as.formula(paste(target, "~ ."))
  
  model <- train(formula, 
                 data=train_df, 
                 preProcess=c("center", "scale"),
                 method="gbm", 
                 metric="ROC",
                 verbose=F,
                 trControl=control, 
                 tuneGrid=grid)
  return(model)
}


plot_decision_tree <- function(df, target, depth){
  formula <- as.formula(paste(target, "~ ."))
  set.seed(19)
  tree <- rpart(formula, method="class", maxdepth=depth, data=df)
  printcp(tree)
  print(tree)
  fancyRpartPlot(tree)
}


plot_model <- function(model){
  plot(model)
}


print_grid_search_results <- function(model){
  model$bestTune
  results <- model$results
  results 
}


print_confusion_matrix <- function(model, df, target){
  predictions <- predict(model, df)
  con_matrix <- confusionMatrix(predictions, target, positive = 'yes')   
  con_matrix
}


get_roc_auc <- function(model, df, target){
  probabilities <- predict(model, df, type="prob")
  
  ROC <- roc(predictor=probabilities$yes,
             response=target)
  print(ROC$auc)
  plot(ROC, main="ROC")
  return(ROC)
}


get_variable_importances <- function(model){
  varImp(model)
}

Unsupervised Machine Learning Functions

scale_dataframe <- function(df){
  df[, -c(3)] <- scale(df[, -c(3)])
  df <- data.frame(df)
  return(df)
}


plot_within_cluster_sum_of_squares <- function(df, title){
  wss <- (nrow(df)-1) * sum(apply(df, 2, var))
  for (i in 2:15) wss[i] <- sum(kmeans(df, centers=i)$withinss)
  plot(1:15, wss, type="b", xlab="Number of Clusters",
       ylab="Within groups sum of squares", main = paste(title,' elbow plot'))
}


train_k_means_model <- function(df, k){
  set.seed(19)
  model <- kmeans(df, k, nstart=25)
  return(model)
}


plot_k_means_model <- function(model, df, title){
  clusplot(df, model$cluster, color=TRUE, shade=TRUE,
           labels=2, lines=0, main = paste(title,' PCA Plot of K-Means'))
}


create_hclust_and_plot <- function(df){
  set.seed(19)
  dm = dist(df,method="euclidean")
  hclust_model <- hclust(dm, method="complete")
  plot(hclust_model)
  return(hclust_model)
}


summarize_clusters <- function(df){
  cluster1 <- df[which(df$k_means.cluster=='1'),]
  cluster2 <- df[which(df$k_means.cluster=='2'),]
  cluster3 <- df[which(df$k_means.cluster=='3'),]
  
  print('cluster 1 summary')
  print(summary(cluster1))
  print('cluster 2 summary')
  print(summary(cluster2))
  print('cluster 3 summary')
  print(summary(cluster3))
}

Execution

Read in data

all_star_df <- read.csv('data/AllstarFull.csv')
batting_df <- read.csv('data/Batting.csv')
people_df <- read.csv('data/People.csv')

Data cleaning

batting_df <- subset_to_rookie_year(batting_df)
batting_df <- subset_to_between_1970_and_2010(batting_df)
batting_df <- count_all_star_appearances(all_star_df, batting_df)
batting_df <- calculate_slg_obp_obp_and_avg(batting_df)
batting_df <- select_columns_for_modeling(batting_df)
batting_df_copy <- batting_df
batting_df <- drop_player_id(batting_df)

Data Exploration

agg_cols_for_hist <- c('G', 'H', 'X2B', 'HR', 'RBI', 'SB')
for (column in agg_cols_for_hist){
  make_histogram_by_target(batting_df, column, 'all_star', 
                           paste(column,' histogram by all star'), 10)
}

rate_cols_for_hist <- c('avg', 'obp', 'slg')
for (column in rate_cols_for_hist){
  make_histogram_by_target(batting_df, column, 'all_star', 
                           paste(column,' histogram by all star'), .1)
}

# home runs paralell coordinates
make_parallel_coordinates(batting_df[1:14], batting_df$HR, 20)

# obp paralell coordinates
make_parallel_coordinates(batting_df[1:14], batting_df$obp, 20)

# hits paralell coordinates
make_parallel_coordinates(batting_df[1:14], batting_df$H, 20)

# slg paralell coordinates
make_parallel_coordinates(batting_df[1:14], batting_df$slg, 20)

Factor Analysis

make_scree_table_for_factor_analysis(batting_df[1:14])

##   noc naf nparallel nkaiser
## 1   3   1         3       3

get_eigenvalues(batting_df[1:14])

## $values
##  [1] 8.249808403 1.854338281 1.395355460 0.685687678 0.554203569
##  [6] 0.437636506 0.234931186 0.224893153 0.140682786 0.085785935
## [11] 0.054002627 0.043706253 0.034306568 0.004661595
## 
## $vectors
##             [,1]        [,2]         [,3]         [,4]        [,5]
##  [1,] -0.3004331  0.22552342  0.002991965 -0.214291356 -0.20908291
##  [2,] -0.3251658  0.20256023  0.008520668 -0.087614482 -0.21942073
##  [3,] -0.3322291  0.09149677  0.072664606  0.014272143  0.02906008
##  [4,] -0.3320129  0.09386681  0.082728669 -0.077402592 -0.26707296
##  [5,] -0.3090893 -0.01617361 -0.089239947 -0.004405929 -0.30455914
##  [6,] -0.2141106  0.12724805  0.348745639  0.566515090 -0.07910700
##  [7,] -0.2532477 -0.16026550 -0.460780197  0.184045072  0.23455915
##  [8,] -0.3157035 -0.05599209 -0.248637445 -0.002842869 -0.09575287
##  [9,] -0.1645273  0.25745151  0.496860370  0.250937159  0.47108337
## [10,] -0.2834618  0.07026278 -0.017809804 -0.433516479  0.44767756
## [11,] -0.2794128  0.15070538 -0.265222056  0.115119859  0.26886264
## [12,] -0.1741356 -0.48821838  0.374024681 -0.051084340 -0.30215863
## [13,] -0.1802661 -0.48063767  0.308843148 -0.371532047  0.27393383
## [14,] -0.1729315 -0.53136815 -0.176803007  0.424928679  0.10864709
##              [,6]         [,7]         [,8]        [,9]        [,10]
##  [1,] -0.02260710 -0.132918557  0.073247444  0.82866115  0.117395503
##  [2,]  0.04654594 -0.049925842  0.008121485 -0.06269410 -0.115759642
##  [3,]  0.08216492 -0.005373923  0.141127312 -0.14317407 -0.661216214
##  [4,]  0.08859406 -0.083719295  0.025905815 -0.16922713 -0.127925992
##  [5,]  0.14337766  0.721473104 -0.123363058 -0.15480659  0.130758741
##  [6,] -0.66601937 -0.056150991  0.079160662 -0.05721246  0.088637853
##  [7,]  0.12030128 -0.345758070  0.306311390 -0.04117933 -0.146299231
##  [8,]  0.05358724 -0.209325916  0.228804563 -0.28719137  0.643289987
##  [9,]  0.55814528  0.055640613  0.084156029  0.04418578  0.216009589
## [10,] -0.34447179  0.225734361  0.188871195 -0.08344856  0.005853165
## [11,] -0.05046551 -0.149869540 -0.843018472 -0.00437432  0.017853242
## [12,]  0.18113704 -0.322380880 -0.199041868 -0.07256980 -0.018510412
## [13,] -0.18403031 -0.015351012 -0.095160330  0.01798346  0.075267759
## [14,]  0.04275459  0.324163645  0.064883823  0.36649091 -0.077749979
##              [,11]       [,12]        [,13]         [,14]
##  [1,]  0.156817419  0.01008885  0.134504983 -0.0494372667
##  [2,] -0.344207453 -0.14592765 -0.451773086  0.6568209315
##  [3,]  0.361156603  0.48923254  0.128071769  0.0658505782
##  [4,] -0.240386079 -0.11481465 -0.349081662 -0.7371808459
##  [5,]  0.273125021 -0.28783110  0.217895508  0.0343269306
##  [6,]  0.082353918 -0.11969163  0.061609080  0.0104300563
##  [7,]  0.150319319 -0.56298225  0.135453728  0.0114920150
##  [8,]  0.116276488  0.46362841 -0.029821344  0.0386872680
##  [9,] -0.023818168 -0.06321212 -0.003121233  0.0008024401
## [10,] -0.448366476  0.02153507  0.332770603 -0.0390386657
## [11,]  0.001338981  0.07298571  0.023370218 -0.0378598948
## [12,] -0.253233594 -0.04068682  0.489347379  0.1015157019
## [13,]  0.429695122 -0.13033954 -0.419767278  0.0324633607
## [14,] -0.317565203  0.26185694 -0.205134930 -0.0337207140

batting_factor_analysis <- build_factor_analysis_model(batting_df[1:14], 3)

## 
## Call:
## factanal(x = df, factors = n_factors, lower = 0.01)
## 
## Uniquenesses:
##     G    AB     R     H   X2B   X3B    HR   RBI    SB    BB    SO   avg 
## 0.162 0.010 0.110 0.010 0.218 0.612 0.059 0.089 0.675 0.400 0.282 0.010 
##   obp   slg 
## 0.338 0.232 
## 
## Loadings:
##     Factor1 Factor2 Factor3
## G    0.860   0.312         
## AB   0.925   0.356         
## R    0.813   0.426   0.220 
## H    0.898   0.337   0.269 
## X2B  0.714   0.461   0.245 
## X3B  0.593           0.178 
## HR   0.258   0.923   0.151 
## RBI  0.605   0.701   0.231 
## SB   0.556                 
## BB   0.642   0.428         
## SO   0.588   0.609         
## avg  0.245           0.963 
## obp  0.177   0.161   0.777 
## slg          0.647   0.588 
## 
##                Factor1 Factor2 Factor3
## SS loadings      5.518   3.088   2.192
## Proportion Var   0.394   0.221   0.157
## Cumulative Var   0.394   0.615   0.771
## 
## Test of the hypothesis that 3 factors are sufficient.
## The chi square statistic is 6744.37 on 52 degrees of freedom.
## The p-value is 0

make_factor_analysis_heatmp(batting_factor_analysis)

make_factor_analysis_sem_plot(batting_factor_analysis)

Classification Models

Class Imbalance

# This is not perfectly to scale but close enough to be useful. 
waffle(c(all_star = 52, non_all_star = 233), rows = 19, 
       title = "Target Distribution")

Train-Test Splits

partition <- createDataPartition(batting_df$all_star, p = 0.7, list=FALSE)
train_df <- batting_df[partition, ]
test_df <- batting_df[-partition, ]

Decision Tree

decision_tree <- train_decision_tree(train_df, 'all_star')
plot_model(decision_tree)

print_grid_search_results(decision_tree)

##   maxdepth  Accuracy     Kappa AccuracySD    KappaSD
## 1        3 0.7775736 0.1498455 0.02189015 0.07485220
## 2        5 0.7755173 0.1535988 0.01721050 0.06187979
## 3        7 0.7724271 0.1520060 0.01902593 0.07211831
## 4       10 0.7742789 0.1618712 0.01673774 0.06819085

print_confusion_matrix(decision_tree, test_df, test_df$all_star)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  no yes
##        no  526 141
##        yes  10  16
##                                           
##                Accuracy : 0.7821          
##                  95% CI : (0.7495, 0.8123)
##     No Information Rate : 0.7734          
##     P-Value [Acc > NIR] : 0.3111          
##                                           
##                   Kappa : 0.1181          
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.10191         
##             Specificity : 0.98134         
##          Pos Pred Value : 0.61538         
##          Neg Pred Value : 0.78861         
##              Prevalence : 0.22655         
##          Detection Rate : 0.02309         
##    Detection Prevalence : 0.03752         
##       Balanced Accuracy : 0.54163         
##                                           
##        'Positive' Class : yes             
##

tree_roc <- get_roc_auc(decision_tree, test_df, test_df$all_star)

## Area under the curve: 0.5845

plot_decision_tree(train_df, 'all_star', 3)

## 
## Classification tree:
## rpart(formula = formula, data = df, method = "class", maxdepth = depth)
## 
## Variables actually used in tree construction:
## [1] avg G   H  
## 
## Root node error: 367/1620 = 0.22654
## 
## n= 1620 
## 
##         CP nsplit rel error  xerror     xstd
## 1 0.031789      0   1.00000 1.00000 0.045908
## 2 0.010000      3   0.90463 0.93188 0.044756
## n= 1620 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 1620 367 no (0.7734568 0.2265432)  
##    2) H< 116.5 1396 256 no (0.8166189 0.1833811) *
##    3) H>=116.5 224 111 no (0.5044643 0.4955357)  
##      6) G< 156.5 204  93 no (0.5441176 0.4558824)  
##       12) avg< 0.2941119 155  59 no (0.6193548 0.3806452) *
##       13) avg>=0.2941119 49  15 yes (0.3061224 0.6938776) *
##      7) G>=156.5 20   2 yes (0.1000000 0.9000000) *

Random Forest

random_forest <- train_random_forest(train_df, 'all_star')
plot_model(random_forest)

print_grid_search_results(random_forest)

##       mtry  Accuracy     Kappa AccuracySD    KappaSD
## 1 3.872983 0.7781886 0.1775086 0.01084145 0.04207982
## 2 3.906891 0.7792178 0.1781382 0.01009534 0.03001527

print_confusion_matrix(random_forest, test_df, test_df$all_star)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  no yes
##        no  518 133
##        yes  18  24
##                                           
##                Accuracy : 0.7821          
##                  95% CI : (0.7495, 0.8123)
##     No Information Rate : 0.7734          
##     P-Value [Acc > NIR] : 0.3111          
##                                           
##                   Kappa : 0.161           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.15287         
##             Specificity : 0.96642         
##          Pos Pred Value : 0.57143         
##          Neg Pred Value : 0.79570         
##              Prevalence : 0.22655         
##          Detection Rate : 0.03463         
##    Detection Prevalence : 0.06061         
##       Balanced Accuracy : 0.55964         
##                                           
##        'Positive' Class : yes             
##

forest_roc <- get_roc_auc(random_forest, test_df, test_df$all_star)

## Area under the curve: 0.6496

get_variable_importances(random_forest)

## rf variable importance
## 
##     Overall
## G    100.00
## slg   97.65
## H     88.13
## AB    86.04
## RBI   80.95
## R     80.64
## obp   80.56
## avg   79.19
## SO    55.91
## BB    49.14
## X2B   37.37
## SB    28.71
## HR    23.32
## X3B    0.00

Gradient Boosting

gradient_boosting <- train_gradient_boosting(train_df, 'all_star')
plot_model(gradient_boosting)

print_grid_search_results(gradient_boosting)

##   shrinkage interaction.depth n.minobsinnode n.trees  Accuracy     Kappa
## 1       0.1                 1             20      50 0.7866273 0.1608468
## 4       0.1                 3             20      50 0.7847653 0.1833904
## 7       0.1                 5             20      50 0.7816929 0.1882807
## 2       0.1                 1             20     100 0.7862145 0.1773515
## 5       0.1                 3             20     100 0.7835219 0.1997769
## 8       0.1                 5             20     100 0.7738649 0.1891862
## 3       0.1                 1             20     150 0.7866260 0.1842003
## 6       0.1                 3             20     150 0.7773464 0.2004193
## 9       0.1                 5             20     150 0.7693330 0.1961379
##   AccuracySD    KappaSD
## 1 0.01357915 0.06110472
## 4 0.01774190 0.06735603
## 7 0.01927243 0.07705919
## 2 0.01423996 0.06377382
## 5 0.02021889 0.07906791
## 8 0.02398342 0.09344342
## 3 0.01591576 0.06131095
## 6 0.02132703 0.07398816
## 9 0.02550751 0.08945573

print_confusion_matrix(gradient_boosting, test_df, test_df$all_star)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  no yes
##        no  522 139
##        yes  14  18
##                                           
##                Accuracy : 0.7792          
##                  95% CI : (0.7465, 0.8096)
##     No Information Rate : 0.7734          
##     P-Value [Acc > NIR] : 0.3782          
##                                           
##                   Kappa : 0.1232          
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.11465         
##             Specificity : 0.97388         
##          Pos Pred Value : 0.56250         
##          Neg Pred Value : 0.78971         
##              Prevalence : 0.22655         
##          Detection Rate : 0.02597         
##    Detection Prevalence : 0.04618         
##       Balanced Accuracy : 0.54427         
##                                           
##        'Positive' Class : yes             
##

gb_roc <- get_roc_auc(gradient_boosting, test_df, test_df$all_star)

## Area under the curve: 0.63

get_variable_importances(gradient_boosting)

## gbm variable importance
## 
##     Overall
## R    100.00
## slg   89.54
## RBI   84.23
## H     74.38
## AB    45.15
## G     38.65
## HR    30.22
## avg   25.09
## obp   20.03
## X2B   11.64
## SO    10.93
## SB    10.63
## X3B    0.00
## BB     0.00

Logistic Regression

log_reg <- train_log_reg(train_df, 'all_star')
print_grid_search_results(log_reg)

##   parameter  Accuracy    Kappa AccuracySD    KappaSD
## 1     1e-03 0.7753127 0.154482 0.01962346 0.06788572
## 2     1e-02 0.7753127 0.154482 0.01962346 0.06788572
## 3     1e-01 0.7753127 0.154482 0.01962346 0.06788572
## 4     1e+00 0.7753127 0.154482 0.01962346 0.06788572
## 5     1e+01 0.7753127 0.154482 0.01962346 0.06788572
## 6     1e+02 0.7753127 0.154482 0.01962346 0.06788572

print_confusion_matrix(log_reg, test_df, test_df$all_star)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  no yes
##        no  517 132
##        yes  19  25
##                                           
##                Accuracy : 0.7821          
##                  95% CI : (0.7495, 0.8123)
##     No Information Rate : 0.7734          
##     P-Value [Acc > NIR] : 0.3111          
##                                           
##                   Kappa : 0.166           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.15924         
##             Specificity : 0.96455         
##          Pos Pred Value : 0.56818         
##          Neg Pred Value : 0.79661         
##              Prevalence : 0.22655         
##          Detection Rate : 0.03608         
##    Detection Prevalence : 0.06349         
##       Balanced Accuracy : 0.56189         
##                                           
##        'Positive' Class : yes             
##

lr_roc <- get_roc_auc(log_reg, test_df, test_df$all_star)

## Area under the curve: 0.6762

Clustering

Data Preparation

rownames(batting_df_copy) <- batting_df_copy$playerID
batting_df_copy <- subset(batting_df_copy, select=-c(playerID, all_star))
batting_df_scaled <- scale_dataframe(batting_df_copy)

K-Means Clustering

plot_within_cluster_sum_of_squares(batting_df_scaled, 'rookie batting data')

k_means <- train_k_means_model(batting_df_scaled, 3)
plot_k_means_model(k_means, batting_df_scaled, 'rookie batting')

batting_df_copy <- data.frame(batting_df_copy, k_means$cluster)
summarize_clusters(batting_df_copy)

## [1] "cluster 1 summary"
##        G               AB              R                H        
##  Min.   : 86.0   Min.   :280.0   Min.   : 57.00   Min.   : 73.0  
##  1st Qu.:124.0   1st Qu.:427.8   1st Qu.: 62.00   1st Qu.:116.0  
##  Median :138.0   Median :485.0   Median : 69.00   Median :134.0  
##  Mean   :135.2   Mean   :486.7   Mean   : 71.96   Mean   :134.6  
##  3rd Qu.:150.0   3rd Qu.:545.8   3rd Qu.: 78.75   3rd Qu.:150.0  
##  Max.   :163.0   Max.   :701.0   Max.   :127.00   Max.   :242.0  
##       X2B             X3B               HR             RBI        
##  Min.   : 8.00   Min.   : 0.000   Min.   : 0.00   Min.   : 12.00  
##  1st Qu.:18.00   1st Qu.: 2.000   1st Qu.: 6.00   1st Qu.: 45.00  
##  Median :24.00   Median : 4.000   Median :13.00   Median : 57.00  
##  Mean   :23.94   Mean   : 4.341   Mean   :13.35   Mean   : 59.44  
##  3rd Qu.:29.00   3rd Qu.: 6.000   3rd Qu.:19.00   3rd Qu.: 73.00  
##  Max.   :47.00   Max.   :19.000   Max.   :49.00   Max.   :130.00  
##        SB               BB               SO              avg        
##  Min.   :  0.00   Min.   : 10.00   Min.   : 21.00   Min.   :0.2186  
##  1st Qu.:  4.00   1st Qu.: 34.00   1st Qu.: 65.00   1st Qu.:0.2596  
##  Median : 11.00   Median : 44.50   Median : 83.00   Median :0.2772  
##  Mean   : 14.87   Mean   : 46.14   Mean   : 85.72   Mean   :0.2766  
##  3rd Qu.: 21.00   3rd Qu.: 57.00   3rd Qu.:105.00   3rd Qu.:0.2914  
##  Max.   :110.00   Max.   :105.00   Max.   :185.00   Max.   :0.3509  
##       obp              slg         k_means.cluster
##  Min.   :0.2684   Min.   :0.2933   Min.   :1      
##  1st Qu.:0.3235   1st Qu.:0.5288   1st Qu.:1      
##  Median :0.3417   Median :0.6003   Median :1      
##  Mean   :0.3426   Mean   :0.5997   Mean   :1      
##  3rd Qu.:0.3603   3rd Qu.:0.6716   3rd Qu.:1      
##  Max.   :0.4101   Max.   :0.9268   Max.   :1      
## [1] "cluster 2 summary"
##        G               AB              R               H         
##  Min.   : 40.0   Min.   :134.0   Min.   :30.00   Min.   : 36.00  
##  1st Qu.: 89.0   1st Qu.:260.0   1st Qu.:34.00   1st Qu.: 67.00  
##  Median :105.0   Median :317.5   Median :41.00   Median : 84.00  
##  Mean   :104.5   Mean   :320.6   Mean   :41.01   Mean   : 83.87  
##  3rd Qu.:122.0   3rd Qu.:377.2   3rd Qu.:48.00   3rd Qu.: 99.25  
##  Max.   :160.0   Max.   :547.0   Max.   :56.00   Max.   :151.00  
##       X2B            X3B               HR              RBI       
##  Min.   : 2.0   Min.   : 0.000   Min.   : 0.000   Min.   : 8.00  
##  1st Qu.:11.0   1st Qu.: 1.000   1st Qu.: 3.000   1st Qu.:26.00  
##  Median :15.0   Median : 2.000   Median : 6.000   Median :34.00  
##  Mean   :14.8   Mean   : 2.531   Mean   : 6.701   Mean   :35.26  
##  3rd Qu.:19.0   3rd Qu.: 4.000   3rd Qu.:10.000   3rd Qu.:43.00  
##  Max.   :37.0   Max.   :12.000   Max.   :23.000   Max.   :80.00  
##        SB               BB              SO             avg        
##  Min.   : 0.000   Min.   : 5.00   Min.   : 12.0   Min.   :0.1866  
##  1st Qu.: 2.000   1st Qu.:19.00   1st Qu.: 39.0   1st Qu.:0.2436  
##  Median : 4.000   Median :26.00   Median : 54.0   Median :0.2606  
##  Mean   : 6.681   Mean   :27.25   Mean   : 56.6   Mean   :0.2626  
##  3rd Qu.: 9.000   3rd Qu.:34.00   3rd Qu.: 72.0   3rd Qu.:0.2816  
##  Max.   :50.000   Max.   :94.00   Max.   :158.0   Max.   :0.3491  
##       obp              slg         k_means.cluster
##  Min.   :0.2176   Min.   :0.2946   Min.   :2      
##  1st Qu.:0.2999   1st Qu.:0.4665   1st Qu.:2      
##  Median :0.3242   Median :0.5365   Median :2      
##  Mean   :0.3239   Mean   :0.5453   Mean   :2      
##  3rd Qu.:0.3454   3rd Qu.:0.6178   3rd Qu.:2      
##  Max.   :0.4542   Max.   :0.9486   Max.   :2      
## [1] "cluster 3 summary"
##        G                AB              R               H        
##  Min.   : 26.00   Min.   :101.0   Min.   : 2.00   Min.   : 6.00  
##  1st Qu.: 47.00   1st Qu.:121.0   1st Qu.:13.00   1st Qu.:29.00  
##  Median : 59.00   Median :150.0   Median :17.00   Median :36.00  
##  Mean   : 62.82   Mean   :163.4   Mean   :17.79   Mean   :39.48  
##  3rd Qu.: 77.00   3rd Qu.:195.0   3rd Qu.:23.00   3rd Qu.:48.00  
##  Max.   :141.00   Max.   :410.0   Max.   :29.00   Max.   :97.00  
##       X2B              X3B               HR              RBI       
##  Min.   : 0.000   Min.   :0.0000   Min.   : 0.000   Min.   : 2.00  
##  1st Qu.: 4.000   1st Qu.:0.0000   1st Qu.: 1.000   1st Qu.:11.00  
##  Median : 7.000   Median :1.0000   Median : 2.000   Median :15.00  
##  Mean   : 6.939   Mean   :0.9702   Mean   : 3.036   Mean   :16.88  
##  3rd Qu.: 9.000   3rd Qu.:1.0000   3rd Qu.: 4.000   3rd Qu.:22.00  
##  Max.   :21.000   Max.   :7.0000   Max.   :18.000   Max.   :59.00  
##        SB               BB              SO             avg         
##  Min.   : 0.000   Min.   : 0.00   Min.   : 5.00   Min.   :0.05825  
##  1st Qu.: 0.000   1st Qu.: 8.00   1st Qu.:21.00   1st Qu.:0.21739  
##  Median : 1.000   Median :12.00   Median :29.00   Median :0.23973  
##  Mean   : 2.385   Mean   :13.39   Mean   :31.22   Mean   :0.24055  
##  3rd Qu.: 3.000   3rd Qu.:17.00   3rd Qu.:39.00   3rd Qu.:0.26431  
##  Max.   :27.000   Max.   :44.00   Max.   :99.00   Max.   :0.36216  
##       obp              slg          k_means.cluster
##  Min.   :0.1019   Min.   :0.05825   Min.   :3      
##  1st Qu.:0.2750   1st Qu.:0.40187   1st Qu.:3      
##  Median :0.3007   Median :0.47964   Median :3      
##  Mean   :0.3005   Mean   :0.48697   Mean   :3      
##  3rd Qu.:0.3263   3rd Qu.:0.55927   3rd Qu.:3      
##  Max.   :0.4348   Max.   :0.94215   Max.   :3

batting_df_copy$k_means.cluster <- as.factor(batting_df_copy$k_means.cluster)

agg_summary_cols <- c('G', 'AB', 'R', 'H', 'X2B', 'X3B', 'HR', 'RBI', 'SB')
for (column in agg_summary_cols){
  make_histogram_by_target(batting_df_copy, column, 'k_means.cluster', 
                           paste(column,' histogram by cluster'), 10)
}

rate_summary_cols <- c('avg', 'obp', 'slg')
for (column in rate_summary_cols){
  make_histogram_by_target(batting_df_copy, column, 'k_means.cluster', 
                           paste(column,' histogram by cluster'), .1)
}

Heirarchical Clustering

hclust_model <- create_hclust_and_plot(batting_df_scaled)

plot(cut(as.dendrogram(hclust_model), h=8)$lower[[4]])

plot(cut(as.dendrogram(hclust_model), h=6)$lower[[15]])

Rookie All-Star Modeling

Cleaning Functions

Exploration Functions

Factor Analysis Functions

Supervised Machine Learning Functions

Unsupervised Machine Learning Functions

Execution

Read in data

Data cleaning

Data Exploration

Factor Analysis

Classification Models

Class Imbalance

Train-Test Splits

Decision Tree

Random Forest

Gradient Boosting

Logistic Regression

Clustering

Data Preparation

K-Means Clustering

Heirarchical Clustering