YOUR TURN #1

Q: Create date objects for today’s date by typing the date in text format and converting it with one of the lubridate converter functions.

library(lubridate)
# ymd() reads the text as year-month-day.
ymd("2020-03-26")
## [1] "2020-03-26"
# mdy() reads the same day written as month-day-year; the printed result
# is the same date.
mdy("03-26-2020")
## [1] "2020-03-26"

Q: Try different formats of writing the date and compare the end results.

# Same text as before, but with tz supplied: the printed value now carries
# the "UTC" time zone label.
ymd("2020-03-26", tz = "UTC")
## [1] "2020-03-26 UTC"

Q: Use the appropriate lubridate function to parse each of the following dates:

# Inputs supplied by the exercise; each one uses a different layout.
d1 <- "January 1, 2010"
d2 <- "2015-Mar-07"
d3 <- "06-Jun-2017"
d4 <- c("August 19 (2015)", "July 1 (2015)")
d5 <- "12/30/14" # Dec 30, 2014
# Pick the parser whose letters match the order of the components in the text.
# d1: month name, day, year.
mdy(d1)
## [1] "2010-01-01"
# d2: year, abbreviated month name, day.
ymd(d2)
## [1] "2015-03-07"
# d3: day, abbreviated month name, year.
dmy(d3)
## [1] "2017-06-06"
# d4: month name, day, year in parentheses; the whole vector is parsed at once.
mdy(d4)
## [1] "2015-08-19" "2015-07-01"
# d5: month/day/two-digit year; "14" comes back as 2014.
mdy(d5)
## [1] "2014-12-30"

YOUR TURN #2

For this question use the flights data from the nycflights13 package.

library(nycflights13)
library(tidyverse)

Q: Use make_datetime() to create a date-time variable for dep_time and arr_time.

Hint: use integer division %/% to extract the hour and the modulo operator %% to extract the minute.

# Build flights_dt: drop flights with a missing departure or arrival time,
# split the HHMM-style clock values into hour and minute parts, then replace
# dep_time and arr_time with date-times on the flight's year/month/day.
# NOTE(review): arr_time reuses the same calendar day as the departure, so an
# arrival after midnight would be dated one day early -- confirm whether this
# matters for later use.
flights_dt <- flights %>%
  filter(!(is.na(dep_time) | is.na(arr_time))) %>%
  mutate(
    # Clock parts are taken before the original integer columns are replaced.
    dep_hour = dep_time %/% 100,
    dep_min = dep_time %% 100,
    arr_hour = arr_time %/% 100,
    arr_min = arr_time %% 100
  ) %>%
  mutate(
    dep_time = make_datetime(year, month, day, dep_hour, dep_min),
    arr_time = make_datetime(year, month, day, arr_hour, arr_min)
  )
flights_dt
## # A tibble: 328,063 x 23
##     year month   day dep_time            sched_dep_time dep_delay
##    <int> <int> <int> <dttm>                       <int>     <dbl>
##  1  2013     1     1 2013-01-01 05:17:00            515         2
##  2  2013     1     1 2013-01-01 05:33:00            529         4
##  3  2013     1     1 2013-01-01 05:42:00            540         2
##  4  2013     1     1 2013-01-01 05:44:00            545        -1
##  5  2013     1     1 2013-01-01 05:54:00            600        -6
##  6  2013     1     1 2013-01-01 05:54:00            558        -4
##  7  2013     1     1 2013-01-01 05:55:00            600        -5
##  8  2013     1     1 2013-01-01 05:57:00            600        -3
##  9  2013     1     1 2013-01-01 05:57:00            600        -3
## 10  2013     1     1 2013-01-01 05:58:00            600        -2
## # … with 328,053 more rows, and 17 more variables: arr_time <dttm>,
## #   sched_arr_time <int>, arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>, dep_hour <dbl>, dep_min <dbl>,
## #   arr_hour <dbl>, arr_min <dbl>

YOUR TURN #3

For this question, use the flights_dt data created in the previous Your Turn.

Q: Use an accessor function to calculate the average departure delay by minute within the hour. Use ggplot2 to plot your results.

# Average departure delay for each minute within the hour of the actual
# departure time, drawn as a line plot.
flights_dt %>%
  # minute() is the accessor that pulls the minute component out of dep_time;
  # this overwrites the existing minute column.
  mutate(minute = minute(dep_time)) %>%
  group_by(minute) %>%
  summarise(
    # The question is about departure delays, so average dep_delay
    # (not arr_delay); missing delays are ignored.
    avg_delay = mean(dep_delay, na.rm = TRUE),
    # Number of flights behind each average.
    n = n()
  ) %>%
  ggplot(aes(minute, avg_delay)) +
  geom_line()

YOUR TURN #4

Q: Inspect the budget data set from the classdata package

library(classdata)
# Compact overview of budget: one line per column with its type and first values.
glimpse(budget)
## Observations: 5,855
## Variables: 5
## $ ReleaseDate      <date> 2009-12-17, 2011-05-20, 2019-04-23, 2015-04-22, 201…
## $ Movie            <chr> "Avatar", "Pirates of the Caribbean: On Stranger Tid…
## $ ProductionBudget <dbl> 425000000, 410600000, 400000000, 330600000, 31700000…
## $ DomesticGross    <dbl> 760507625, 241063875, 858373000, 459005868, 62018138…
## $ WorldwideGross   <dbl> 2789705275, 1045663875, 2795473000, 1403013963, 1316…

Q: Make sure the variable ReleaseDate is stored in a date format.

# Summarise the ReleaseDate column; the result reports date quantiles and the
# number of missing values.
summary(budget[["ReleaseDate"]])
##         Min.      1st Qu.       Median         Mean      3rd Qu.         Max. 
## "1915-02-08" "2000-04-21" "2007-02-11" "2004-07-14" "2012-12-25" "2022-12-16" 
##         NA's 
##        "163"
# Show the six rows with the most recent release dates to check the upper
# end of the range.
head(arrange(budget, desc(ReleaseDate)), n = 6)
##   ReleaseDate              Movie ProductionBudget DomesticGross WorldwideGross
## 1  2022-12-16    Heaven and Hell         4.00e+07             0              0
## 2  2021-01-15                355         7.50e+07             0              0
## 3  2020-08-28           Unhinged         2.90e+07             0              0
## 4  2020-07-17              Tenet         2.24e+08             0              0
## 5  2020-02-21   Call of the Wild         8.20e+07             0              0
## 6  2020-02-14 Sonic The Hedgehog         9.00e+07             0              0

Q: Plot a histogram of the variable

# Histogram of release dates. binwidth is given in the units of the x scale;
# for a Date axis that is days, so 365 gives roughly one bar per year.
ggplot(budget, aes(x = ReleaseDate)) +
  geom_histogram(binwidth = 365)

Q: Merge (join) budget and box office data (by movie name)

# Keep every box-office row and attach the budget columns of the movie with
# the same title. Titles are not unique in budget (e.g. "The Addams Family"
# in the printed rows), so a box-office row can be repeated after the join.
box_budget <- left_join(box, budget, by = "Movie")
head(box_budget, n = 6)
##   Rank Rank.Last.Week             Movie        Distributor    Gross Change
## 1    1              1             Joker       Warner Bros. 55861403    -42
## 2    2             NA The Addams Family     United Artists 30300007     NA
## 3    2             NA The Addams Family     United Artists 30300007     NA
## 4    3             NA        Gemini Man Paramount Pictures 20552372     NA
## 5    4              2        Abominable          Universal  6072235    -49
## 6    5              3     Downton Abbey     Focus Features  4881075    -39
##   Thtrs. Per.Thtr. Total.Gross Week       Date ReleaseDate ProductionBudget
## 1   4374     12771   193590190    2 2019-10-11  2019-10-02          5.5e+07
## 2   4007      7562    30300007    1 2019-10-11  1991-11-22          3.0e+07
## 3   4007      7562    30300007    1 2019-10-11  2019-10-11          2.4e+07
## 4   3642      5643    20552372    1 2019-10-11        <NA>               NA
## 5   3496      1737    47873585    3 2019-10-11        <NA>               NA
## 6   3019      1617    82668665    4 2019-10-11        <NA>               NA
##   DomesticGross WorldwideGross
## 1     208909478      563609478
## 2     113502246      191502246
## 3      37784650       37784650
## 4            NA             NA
## 5            NA             NA
## 6            NA             NA

Q: Is the time between the release of a movie and the date equal to the number of weeks in theaters?

# First attempt: count the whole weeks elapsed between ReleaseDate and Date
# (integer division of the interval by a one-week duration) and show the
# result next to the reported Week.
box_budget %>%
  filter(!is.na(ReleaseDate)) %>%
  mutate(test_week = interval(ReleaseDate, Date) %/% dweeks(1)) %>%
  select(Movie, ReleaseDate, Date, Week, test_week) %>%
  head(n = 6)
##               Movie ReleaseDate       Date Week test_week
## 1             Joker  2019-10-02 2019-10-11    2         1
## 2 The Addams Family  1991-11-22 2019-10-11    1      1455
## 3 The Addams Family  2019-10-11 2019-10-11    1         0
## 4          Hustlers  2019-09-12 2019-10-11    5         4
## 5   It: Chapter Two  2019-09-04 2019-10-11    6         5
## 6          Ad Astra  2019-09-19 2019-10-11    4         3
# Second attempt: divide the interval by a one-week duration and round up, so
# a partially completed week counts as a full one.
# NOTE(review): a release on the same day as Date still yields 0 while Week
# is 1 (third printed row) -- the two measures do not agree in every case.
box_budget %>%
  filter(!is.na(ReleaseDate)) %>%
  mutate(test_week = ceiling(interval(ReleaseDate, Date) / dweeks(1))) %>%
  select(Movie, ReleaseDate, Date, Week, test_week) %>%
  head(n = 6)
##               Movie ReleaseDate       Date Week test_week
## 1             Joker  2019-10-02 2019-10-11    2         2
## 2 The Addams Family  1991-11-22 2019-10-11    1      1455
## 3 The Addams Family  2019-10-11 2019-10-11    1         0
## 4          Hustlers  2019-09-12 2019-10-11    5         5
## 5   It: Chapter Two  2019-09-04 2019-10-11    6         6
## 6          Ad Astra  2019-09-19 2019-10-11    4         4