Visualization Exercise: On-duty police officer deaths in the US
The below plot from FiveThirtyEight is what I’m trying to replicate. The data for on-duty police officer deaths comes from the Officer Down Memorial Page, started in 1996 by a college student who is now a police officer and who continues to maintain the database. It extends much further back than the Federal Bureau of Investigation’s data — all the way back to 1791, when two officers were killed, both in New York state. (It hadn’t yet been updated with the Dallas shootings as of Friday afternoon.) National data on the number of police officers isn’t available that far back, but the rate of police officers per the overall population has declined significantly over the years.
Dallas shooting cleaning
Loading required libraries
Data Loading
# path to data
# note the use of the here() package and not absolute paths
<- here::here("Data","all_data_police_deaths.csv")
data_location<- readr::read_csv(data_location) rawdata
Cleaning the fields: extracting cause of death, year
<- rawdata %>%
clean_data mutate(
cause_short = gsub('Cause of Death: ', '', cause),
date = mdy(gsub('EOW: ', '', eow)),
year = year(date),
canine = ifelse(substr(person, 0, 2) == 'K9' & substr(person, 0, 10) != 'K9 Officer', TRUE, FALSE)
)
Glimpse of the data
glimpse(clean_data)
Rows: 22,800
Columns: 8
$ person <chr> "Constable Darius Quimby", "Sheriff Cornelius Hogeboom", "…
$ dept <chr> "Albany County Constable's Office, NY", "Columbia County S…
$ eow <chr> "EOW: Monday, January 3, 1791", "EOW: Saturday, October 22…
$ cause <chr> "Cause of Death: Gunfire", "Cause of Death: Gunfire", "Cau…
$ cause_short <chr> "Gunfire", "Gunfire", "Gunfire", "Gunfire", "Gunfire", "Gu…
$ date <date> 1791-01-03, 1791-10-22, 1792-05-17, 1794-01-11, 1797-11-1…
$ year <dbl> 1791, 1791, 1792, 1794, 1797, 1804, 1806, 1807, 1808, 1808…
$ canine <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FA…
Split department and state
<- strsplit(clean_data$dept, ",(?=[^,]+$)", perl=TRUE)
dept_state
<- data.frame(matrix(unlist(dept_state), nrow=length(dept_state), byrow=T),stringsAsFactors=FALSE) %>%
dept_state_df rename(dept_name=X1, state=X2)
<- clean_data %>%
clean_data bind_cols(dept_state_df)
Glimpse of this dataframe
glimpse(dept_state_df)
Rows: 22,800
Columns: 2
$ dept_name <chr> "Albany County Constable's Office", "Columbia County Sheriff…
$ state <chr> " NY", " NY", " NY", " US", " SC", " NC", " NY", " KY", " ME…
Remove canines
<- clean_data %>%
persons_data filter(canine == FALSE)
Filter for major categories where count of cause of death is at least 20
<- persons_data %>%
large_categories group_by(year, cause_short) %>%
summarize(count = n()) %>%
data.frame() %>%
filter(count >= 20) %>%
select(cause_short) %>%
unique()
Glimpse of data
glimpse(large_categories)
Rows: 8
Columns: 1
$ cause_short <chr> "Gunfire", "Motorcycle accident", "Automobile accident", "…
Categories of cause of death to plot
<- c(large_categories$cause_short, "Gunfire (Accidental)") cat_to_plot
?arrange
Arranging count of cause of death in descending order
<- persons_data %>%
plot_order mutate(cat = ifelse(cause_short %in% cat_to_plot, cause_short, 'other')) %>%
group_by(cat) %>%
summarize(count = n()) %>%
data.frame() %>%
arrange(desc(count)) %>%
extract2(1)
Move cause of death ‘other’ to the end
<- c(plot_order[! (plot_order == 'other')], 'other') plot_order
Glimpse of data
glimpse(plot_order)
chr [1:10] "Gunfire" "Automobile accident" "Motorcycle accident" ...
Create data for visualization
<- persons_data %>%
data_for_plot mutate(category = ifelse(cause_short %in% cat_to_plot, cause_short, 'other')) %>%
group_by(year, category) %>%
summarize(count = n()) %>%
data.frame() %>%
spread(category, count)
Set missing to zero
is.na(data_for_plot)] <- 0 data_for_plot[
Glimpse of data
glimpse(data_for_plot)
Rows: 202
Columns: 11
$ year <dbl> 1791, 1792, 1794, 1797, 1804, 1806, 1807, 1808,…
$ `Aircraft accident` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ `Automobile accident` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ Gunfire <dbl> 2, 1, 1, 1, 1, 0, 1, 2, 1, 1, 1, 1, 0, 0, 0, 1,…
$ `Gunfire (Accidental)` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ `Heart attack` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ `Motorcycle accident` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ other <dbl> 0, 0, 0, 0, 0, 1, 0, 2, 0, 0, 0, 1, 1, 2, 1, 1,…
$ `Struck by vehicle` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ `Terrorist attack` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ `Vehicular assault` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
Selecting variables yer, category of death, and count. Also arranging the category
<- data_for_plot %>%
data_for_plot gather(category, count, -year) %>%
mutate(category = factor(category, levels=plot_order)) %>%
arrange(category)
Plot of Deaths by cause
<- ggplot(data_for_plot, aes(x=year, y=count, group=category, order=category)) +
p_area geom_area(aes(fill=category), position='stack')
+ ggtitle("On-duty police officer deaths in the US", "By cause since 1791") +
p_area theme(legend.position = c(0.2,0.6))
Reproducing this graph was an amazing experience. It was pretty fun to go through every possible detail of the original plot and try to perfectly reproduce it. Going through that process of “trying to be perfect” actually helps me learned other cool stuff that ggplot has to offer. One is how to manipulate the legend position in the graph.