Introduction
Our project examines NCAA Division 1 football teams who made head
coaching changes in the last ten seasons. Our goal is to measure the
impact that first year head coaches have during their first season with
a new team. We plan to do this by comparing various measures of success
such as win percentage, touchdown differential, turnover differential,
and offensive/defensive rank to the season before under the previous
head coach. Our project is inspired by the recent success Duke’s
football team has seen after the hiring of head coach Mike Elko after
numerous sub-par seasons.
Data Gathering and Cleaning
We started by gathering data from https://www.sports-reference.com/cfb/ which has college
football data dating back to 2013 on both teams and coaches. We
downloaded the csv’s of every college football season available as well
as all of the head coach data that was available. After cleaning our
datasets to eliminate potential conflicts with joining the data, we were
able to combine our datasets of different seasons into two main datasets
with all of the team data and all of the coach data. Then, we were able
to join the two new datasets (coach_data
and
cfb_data
) into one big dataset (big_data
) with
all of the team data and coach’s record from each season dating back to
2013.
#Load packages
library(tidyverse)
library(readr)
library(dplyr)
library(knitr)
library(stringr)
library(data.table)
library(broom)
library(gghighlight)
library(gapminder)
library(patchwork)
#Load CSVs and Initial Data Cleaning
coach13 <- read_csv("Data/2013_Coaches.csv")
coach14 <- read_csv("Data/2014_Coaches.csv")
coach15 <- read_csv("Data/2015_Coaches.csv")
coach16 <- read_csv("Data/2016_Coaches.csv") %>%
select(-Year) %>%
mutate(Year = 2016)
coach17 <- read_csv("Data/2017_Coaches.csv")
coach18 <- read_csv("Data/2018_Coaches.csv")
coach19 <- read_csv("Data/2019_Coaches.csv")
coach20 <- read_csv("Data/2020_Coaches.csv")
coach21 <- read_csv("Data/2021_Coaches.csv")
cfb13 <- read_csv("Data/cfb13.csv") %>%
mutate(Year = 2013) %>%
mutate(Team = case_when(
Team == "Miami (FL) (ACC)" ~ "Miami_FL (ACC)",
Team == "Miami (OH) (MAC)" ~ "Miami_OH (MAC)",
TRUE ~ Team
)) %>% separate(col = "Team", into = "Team", sep = " [(]")
cfb14 <- read_csv("Data/cfb14.csv") %>%
mutate(Year = 2014) %>%
mutate(Team = case_when(
Team == "Miami (FL) (ACC)" ~ "Miami_FL (ACC)",
Team == "Miami (OH) (MAC)" ~ "Miami_OH (MAC)",
TRUE ~ Team
)) %>% separate(col = "Team", into = "Team", sep = " [(]")
cfb15 <- read_csv("Data/cfb15.csv") %>%
mutate(Year = 2015) %>%
mutate(Team = case_when(
Team == "Miami (FL) (ACC)" ~ "Miami_FL (ACC)",
Team == "Miami (OH) (MAC)" ~ "Miami_OH (MAC)",
TRUE ~ Team
)) %>% separate(col = "Team", into = "Team", sep = " [(]")
cfb16 <- read_csv("Data/cfb16.csv") %>%
mutate(Year = 2016) %>%
mutate(Team = case_when(
Team == "Miami (FL) (ACC)" ~ "Miami_FL (ACC)",
Team == "Miami (OH) (MAC)" ~ "Miami_OH (MAC)",
TRUE ~ Team
)) %>% separate(col = "Team", into = "Team", sep = " [(]")
cfb17 <- read_csv("Data/cfb17.csv") %>%
mutate(Year = 2017) %>%
mutate(Team = case_when(
Team == "Miami (FL) (ACC)" ~ "Miami_FL (ACC)",
Team == "Miami (OH) (MAC)" ~ "Miami_OH (MAC)",
TRUE ~ Team
)) %>% separate(col = "Team", into = "Team", sep = " [(]")
cfb18 <- read_csv("Data/cfb18.csv") %>%
mutate(Year = 2018) %>%
mutate(Team = case_when(
Team == "Miami (FL) (ACC)" ~ "Miami_FL (ACC)",
Team == "Miami (OH) (MAC)" ~ "Miami_OH (MAC)",
TRUE ~ Team
)) %>% separate(col = "Team", into = "Team", sep = " [(]")
cfb19 <- read_csv("Data/cfb19.csv") %>%
mutate(Year = 2019) %>%
mutate(Team = case_when(
Team == "Miami (FL) (ACC)" ~ "Miami_FL (ACC)",
Team == "Miami (OH) (MAC)" ~ "Miami_OH (MAC)",
TRUE ~ Team
)) %>% separate(col = "Team", into = "Team", sep = " [(]")
cfb20 <- read_csv("Data/cfb20.csv") %>%
mutate(Year = 2020) %>%
mutate(Team = case_when(
Team == "Miami (FL) (ACC)" ~ "Miami_FL (ACC)",
Team == "Miami (OH) (MAC)" ~ "Miami_OH (MAC)",
TRUE ~ Team
)) %>% separate(col = "Team", into = "Team", sep = " [(]")
cfb21 <- read_csv("Data/cfb21.csv") %>%
mutate(Year = 2021) %>%
select(-"...1") %>%
separate(col = "Team", into = "Team", sep = " [(]")
#Combine datasets to make one big dataset
names(cfb21) <- make.names(names(cfb21), unique=TRUE)
cfb_data <- bind_rows(cfb13, cfb14, cfb15, cfb16, cfb17, cfb18, cfb19, cfb20,
cfb21)
coach_data <- bind_rows(coach13, coach14, coach15, coach16, coach17, coach18,
coach19, coach20, coach21)
names(coach_data)[3] <- "Team"
cfb_data <- cfb_data %>%
select(-c("...41", "Kickoff.Return.Def.Rank", "Opp.Kickoff.Returns",
"Kickoff.Touchbacks", "Opponent.Kickoff.Return.Yards",
"Opp.Kickoff.Return.Touchdowns.Allowed",
"Avg.Yards.per.Kickoff.Return.Allowed", "Win.Loss", "NA.",
"Interceptions.Thrown.y"))
coach_data <- coach_data %>%
filter(W + L > 7)
cfb_data[cfb_data == "App State"] <- "Appalachian State"
cfb_data[cfb_data == "Appalachian St."] <- "Appalachian State"
cfb_data[cfb_data == "Arizona St."] <- "Arizona State"
cfb_data[cfb_data == "Arkansas St."] <- "Arkansas State"
cfb_data[cfb_data == "Army West Point"] <- "Army"
cfb_data[cfb_data == "Ball St."] <- "Ball State"
cfb_data[cfb_data == "Boise St."] <- "Ball State"
cfb_data[cfb_data == "Central Mich."] <- "Central Michigan"
cfb_data[cfb_data == "Coastal Caro."] <- "Coastal Carolina"
cfb_data[cfb_data == "Colorado St."] <- "Colorado State"
cfb_data[cfb_data == "UConn"] <- "Connecticut"
cfb_data[cfb_data == "Eastern Mich."] <- "Eastern Michigan"
cfb_data[cfb_data == "Fla. Atlantic"] <- "Florida Atlantic"
cfb_data[cfb_data == "FIU"] <- "Florida International"
cfb_data[cfb_data == "Florida St."] <- "Florida State"
cfb_data[cfb_data == "Fresno St."] <- "Fresno State"
cfb_data[cfb_data == "Ga. Southern"] <- "Georgia Southern"
cfb_data[cfb_data == "Georgia St."] <- "Georgia State"
cfb_data[cfb_data == "Iowa St."] <- "Iowa State"
cfb_data[cfb_data == "Kansas St."] <- "Kansas State"
cfb_data[cfb_data == "Kent St."] <- "Kent State"
cfb_data[cfb_data == "La.-Monroe"] <- "Louisiana-Monroe"
cfb_data[cfb_data == "ULM"] <- "Louisiana-Monroe"
cfb_data[cfb_data == "Michigan St."] <- "Michigan State"
cfb_data[cfb_data == "Middle Tenn."] <- "Middle Tennessee State"
cfb_data[cfb_data == "Mississippi St."] <- "Mississippi State"
cfb_data[cfb_data == "New Mexico St."] <- "New Mexico State"
cfb_data[cfb_data == "NIU"] <- "Norhtern Illinois"
cfb_data[cfb_data == "Northern Ill."] <- "Norhtern Illinois"
cfb_data[cfb_data == "Ohio St."] <- "Ohio State"
cfb_data[cfb_data == "Oklahoma St."] <- "Oklahoma State"
cfb_data[cfb_data == "Oregon St."] <- "Oregon State"
cfb_data[cfb_data == "Penn St."] <- "Penn State"
cfb_data[cfb_data == "San Diego St."] <- "San Diego State"
cfb_data[cfb_data == "San Jose St."] <- "San Jose State"
cfb_data[cfb_data == "South Fla."] <- "South Florida"
cfb_data[cfb_data == "Southern Miss."] <- "Southern Mississippi"
cfb_data[cfb_data == "Southern California"] <- "USC"
cfb_data[cfb_data == "Western Mich."] <- "Western Michigan"
cfb_data[cfb_data == "Western Ky."] <- "Western Kentucky"
cfb_data[cfb_data == "Washington St."] <- "Washington State"
cfb_data[cfb_data == "Utah St."] <- "Utah State"
cfb_data[cfb_data == "Texas St."] <- "Texas State"
coach_data[coach_data == "Bowling Green State"] <- "Bowling Green"
coach_data[coach_data == "Brigham Young"] <- "BYU"
coach_data[coach_data == "North Carolina State"] <- "NC State"
coach_data[coach_data == "Pitt"] <- "Pittsburgh"
coach_data[coach_data == "Miami (FL)"] <- "Miami_FL"
coach_data[coach_data == "Miami (OH)"] <- "Miami_OH"
big_data <- inner_join(cfb_data, coach_data, by = c("Team", "Year")) %>%
select(c(Team, Coach, W, L, Pct, Year, Conf, Total.Points, Points.Allowed,
Off.Rank, Off.Yards.Play, Turnover.Margin, Def.Rank, Touchdowns,
Penalties, Penalty.Yards.Per.Game, Yards.Play.Allowed,
Off.TDs.Allowed, Scoring.Off.Rank, Avg.Points.per.Game.Allowed,
Touchdowns.Allowed))
Modeling and Data Analysis
To examine the impact of first year coaches, we first need to
identify seasons in which the head coach is in his first year. To do
this, we created a for loop that checks if the previous observation is
of the same team and a different coach. Because the observations are
ordered by school name and then year, we know that two observations of
the same school indicate back-to-back seasons. If the previous row was
the same team and a different head coach, then we assign the indicator
variable new_coach
to be 1. If the coach is the same as the
previous year or if the observation above is a different team (there is
no older data for the team being examined), then new_coach
is assigned a value of 0. After we identified rows that were
back-to-back seasons, we created lag variables for certain statistics
from the season before including touchdowns, turnover differential, win
percentage, touchdowns allowed, and offensive and defensive rank. Then,
each observation that also had data from the previous season contained
the statistics of the first season under the new head coach as well as
the statistics from the final season of the old head coach. For the
statistics we were considering, we created delta
variables
that measure the change in the statistic from previous season. For
example, delta_pct
is the winning percentage of the first
season under the new head coach minus the winning percentage of the last
season under the old coach. We then created linear regression models to
test if new_coach
is a significant predictor on any of the
delta
variables.
#Determining new coaches
new_coach = c()
big_data <- big_data %>%
arrange(Team, Year)
for (i in 2:nrow(big_data)){
if ((big_data$Team[i] == big_data$Team[i-1]) &
(big_data$Coach[i] != big_data$Coach[i-1])){
new_coach = append(new_coach, 1)
}
else{
new_coach = append(new_coach, 0)
}
}
#Making new_coach a factor variable
big_data <- big_data[-1, ] %>%
cbind(new_coach)
big_data <- big_data %>%
mutate(new_coach = as.factor(new_coach))
old_data <- big_data %>% select(c("Turnover.Margin", "Touchdowns",
"Scoring.Off.Rank"))
new_data <- paste("lag", old_data, sep = ".")
#Create lag variables from last year stats, delta variables
big_data <-
big_data %>%
group_by(Team) %>%
mutate(lastyr_TD = lag(Touchdowns, n = 1, default = NA),
lastyr_TOmargin = lag(Turnover.Margin, n = 1, default = NA),
lastyr_pct = lag(Pct, n = 1, default = NA),
lastyr_TD.allowed = lag(Off.TDs.Allowed, n = 1, default = NA),
lastyr_Offrank = lag(Off.Rank, n = 1, default = NA),
lastyr_Defrank = lag(Def.Rank, n = 1, default = NA),
delta_TOmargin = Turnover.Margin - lastyr_TOmargin,
delta_TD = Touchdowns - lastyr_TD,
delta_pct = Pct - lastyr_pct,
delta_TD.allowed = Off.TDs.Allowed - lastyr_TD.allowed,
delta_Offrank = Off.Rank - lastyr_Offrank,
delta_Defrank = Def.Rank - lastyr_Defrank) %>%
filter(Year != 2013)
#Linear models to test significance of new_coach on different stats
lm.1 <- lm(delta_pct ~ new_coach, data = big_data)
lm.2 <- lm(delta_TD.allowed ~ new_coach, data = big_data)
lm.3 <- lm(delta_TD ~ new_coach, data = big_data)
lm.4 <- lm(delta_TOmargin ~ new_coach, data = big_data)
lm.5 <- lm(delta_Offrank ~ new_coach, data = big_data)
lm.6 <- lm(delta_Defrank ~ new_coach, data = big_data)
tidy(lm.1) %>%
kable(digits = 3)
(Intercept) |
0.007 |
0.008 |
0.928 |
0.354 |
new_coach1 |
-0.031 |
0.018 |
-1.741 |
0.082 |
tidy(lm.2) %>%
kable(digits = 3)
(Intercept) |
-0.208 |
0.422 |
-0.492 |
0.623 |
new_coach1 |
0.553 |
0.970 |
0.570 |
0.569 |
tidy(lm.3) %>%
kable(digits = 3)
(Intercept) |
0.462 |
0.559 |
0.827 |
0.409 |
new_coach1 |
-3.281 |
1.284 |
-2.555 |
0.011 |
tidy(lm.4) %>%
kable(digits = 3)
(Intercept) |
0.205 |
0.348 |
0.590 |
0.556 |
new_coach1 |
-1.147 |
0.799 |
-1.435 |
0.152 |
tidy(lm.5) %>%
kable(digits = 3)
(Intercept) |
-0.871 |
1.349 |
-0.646 |
0.519 |
new_coach1 |
4.731 |
3.099 |
1.527 |
0.127 |
tidy(lm.6) %>%
kable(digits = 3)
(Intercept) |
-0.047 |
1.35 |
-0.034 |
0.973 |
new_coach1 |
3.701 |
3.10 |
1.194 |
0.233 |
Our model outputs above do not show much about
new_coach
, but it does show that new_coach
is
significant in predicting the change in touchdowns scored using an \(\alpha\)-level of 0.05. The coefficient is
negative, which means that teams with new coaches are statistically more
likely to see a regression in touchdowns from the year before than teams
with returning coaches. However, there are many outside lurking
variables as well as numerous different types of head coaching changes
as we mention in the discussion. For this reason, it is very hard to
group every new coaching observation together and even harder to make
generalized predictions about teams using only whether or not they have
a new head coach. Still, it makes sense that teams with a new head coach
would score less on average than teams without a new head coach.
Visualizations
annotations <- data.frame(
xpos = c(-Inf,-Inf,Inf,Inf),
ypos = c(-Inf, Inf,-Inf,Inf),
annotateText = c("Good Offense, Good Defense",
"Good Offense, Bad Defense",
"Bad Offense, Good Defense",
"Bad Offense, Bad Defense"),
hjustvar = c(0,0,1,1) ,
vjustvar = c(0,1,0,1))
big_data %>%
ggplot(aes(x=Off.Rank, y=Def.Rank)) +
geom_point(color = "red") +
gghighlight::gghighlight(new_coach == 1) +
scale_x_continuous(breaks = seq(0, 150, 10)) +
scale_y_continuous(breaks = seq(0, 150, 10)) +
geom_vline(xintercept = 65, linetype ="dashed") +
geom_hline(yintercept = 65, linetype ="dashed") +
labs(title = "Offensive and Defensive Rank",
subtitle = "Highlighted by New Head Coach",
x = "Offensive Rank", y = "Defensive Rank") +
geom_text(data=annotations,
aes(x=xpos,y=ypos,hjust=hjustvar,vjust=vjustvar,label=annotateText))