Work on questions in R, make sure to keep a copy of your R code - you will be asked to submit this script at the end.
Data on all flights in and out of Des Moines (DSM) for October 2008 are available at http://www.hofroe.net/data/dsm-flights.csv.
See http://stat-computing.org/dataexpo/2009/the-data.html for a description of the variables.
flights <- read.csv("http://www.hofroe.net/data/dsm-flights.csv")
which.max(flights$ArrDelay)
## [1] 1516
flights[which.max(flights$ArrDelay), c("Origin", "DepDelay")]
## Origin DepDelay
## 1516 DSM 614
summary(flights$Day)
## Friday Monday Saturday Sunday Thursday Tuesday Wednesday
## 456 368 248 336 460 360 459
days <- levels(flights$Day)
flights$Day <- factor(flights$Day, levels=days[c(2,6,7,5,1,3,4)])
summary(flights$Day)
## Monday Tuesday Wednesday Thursday Friday Saturday Sunday
## 368 360 459 460 456 248 336
# create new variable Weekend
flights$Weekend <- flights$Day %in% c("Saturday", "Sunday")
summary(flights$Weekend)
## Mode FALSE TRUE
## logical 2103 584
# idea 1:
table(subset(flights, Dest=="DSM")$Day) # overall number of flights by day of week
##
## Monday Tuesday Wednesday Thursday Friday Saturday Sunday
## 184 180 230 230 228 124 168
# problem: how many Mondays, Tuesdays, are there in October 2008?
require(lubridate)
octs <- data.frame( date = ymd(paste("2008/10/",1:31, sep="")))
octs$day = wday(octs$date, label=TRUE)
table(octs$day)
##
## Sun Mon Tue Wed Thu Fri Sat
## 4 4 4 5 5 5 4
table(subset(flights, Dest=="DSM")$Day)/c(4,4,5,5,5,4,4)
##
## Monday Tuesday Wednesday Thursday Friday Saturday Sunday
## 46.0 45.0 46.0 46.0 45.6 31.0 42.0
# idea 2:
require(dplyr)
require(lubridate)
flights %>% filter(Dest == "DSM") %>% group_by(DayofMonth) %>% summarise(
day = Day[1],
n = n()
) %>% group_by(day) %>% summarize(avg = mean(n), n = n())
## # A tibble: 7 x 3
## day avg n
## <fct> <dbl> <int>
## 1 Monday 46 4
## 2 Tuesday 45 4
## 3 Wednesday 46 5
## 4 Thursday 46 5
## 5 Friday 45.6 5
## 6 Saturday 31 4
## 7 Sunday 42 4
nrow(subset(flights, Dest=='DEN'))
## [1] 145
nrow(subset(flights, Dest=='DEN'))/nrow(subset(flights, Dest != 'DSM'))*100
## [1] 10.79672
flights %>%
filter(Origin != "DSM") %>%
group_by(Origin) %>%
summarise(n = n()) %>%
arrange(desc(n))
## # A tibble: 13 x 2
## Origin n
## <fct> <int>
## 1 ORD 379
## 2 DFW 178
## 3 DEN 119
## 4 ATL 89
## 5 MSP 89
## 6 MKE 85
## 7 CVG 80
## 8 MEM 62
## 9 DTW 58
## 10 IAH 58
## 11 LGA 58
## 12 PHX 58
## 13 DCA 31
sort(table(flights$Origin), decreasing=T)[2]
## ORD
## 379
library(ggplot2)
flights %>% filter(Dest =="DSM") %>%
ggplot( aes(x = reorder(factor(Origin), ArrDelay, na.rm=T), y = ArrDelay)) + geom_boxplot()
Draw a scatterplot of average departure delay by scheduled hour of departure. Color points by top destination, adjust point size to reflect the number of flights for each hour.
dep.summary <- flights %>%
filter(Origin == 'DSM') %>%
mutate(hour = CRSDepTime%/%100) %>%
group_by(hour, Dest) %>%
mutate(test = n()) %>%
group_by(hour) %>%
mutate(rank = dense_rank(desc(test))) %>%
summarise(
count = n(),
pct.delayed = sum(DepDelay>15, na.rm=TRUE)/n()*100,
avg.delay = mean(DepDelay, na.rm=T),
top.Dest.1 = Dest[which.max(test)],
top.Dest.2 = first(Dest[which(rank == 2)]),
top.Dest.3=first(Dest[which(rank == 3)]))
dep.summary %>%
ggplot(aes(x = hour, avg.delay, colour = top.Dest.1, size = count)) + geom_point()