library(lubridate)
# ymd() reads a string whose components appear in year-month-day order.
ymd("2020-03-26")
## [1] "2020-03-26"
# mdy() reads the same date written in month-day-year order.
mdy("03-26-2020")
## [1] "2020-03-26"
# Passing tz makes the result carry a time zone, as the UTC suffix in the
# output shows.
ymd("2020-03-26", tz = "UTC")
## [1] "2020-03-26 UTC"
# Each string writes its date components in a different order and format; the
# parser is picked by component order (m = month, d = day, y = year).

# Full month name, then day, then year.
d1 <- "January 1, 2010"
mdy(d1)
## [1] "2010-01-01"

# Year first, with an abbreviated month name.
d2 <- "2015-Mar-07"
ymd(d2)
## [1] "2015-03-07"

# Day first, with an abbreviated month name.
d3 <- "06-Jun-2017"
dmy(d3)
## [1] "2017-06-06"

# A character vector is parsed element-wise; the parentheses are ignored.
d4 <- c("August 19 (2015)", "July 1 (2015)")
mdy(d4)
## [1] "2015-08-19" "2015-07-01"

# Two-digit year.
d5 <- "12/30/14" # Dec 30, 2014
mdy(d5)
## [1] "2014-12-30"
For this question, use the flights data from the nycflights13 package.
library(nycflights13)
library(tidyverse)
Use make_datetime() to create a date-time variable for dep_time and arr_time.
Hint: use integer division (%/%) for the hour and the modulo operator (%%) for the minute.
# Build real date-times for the actual departure and arrival of each flight.
# dep_time and arr_time are integers in HHMM form (e.g. 517 means 05:17), so
# %/% 100 extracts the hour and %% 100 extracts the minute.
flights_dt <- flights %>%
  # Rows with a missing departure or arrival time cannot be converted.
  filter(!is.na(dep_time), !is.na(arr_time)) %>%
  mutate(
    dep_hour = dep_time %/% 100,
    dep_min = dep_time %% 100,
    dep_time = make_datetime(year, month, day, dep_hour, dep_min),
    arr_hour = arr_time %/% 100,
    arr_min = arr_time %% 100,
    arr_time = make_datetime(year, month, day, arr_hour, arr_min),
    # year/month/day describe the departure date, so a flight that lands after
    # midnight would otherwise appear to arrive before it departed; move those
    # arrivals to the following day.
    arr_time = arr_time + days(as.integer(arr_time < dep_time))
  )
flights_dt
## # A tibble: 328,063 x 23
## year month day dep_time sched_dep_time dep_delay
## <int> <int> <int> <dttm> <int> <dbl>
## 1 2013 1 1 2013-01-01 05:17:00 515 2
## 2 2013 1 1 2013-01-01 05:33:00 529 4
## 3 2013 1 1 2013-01-01 05:42:00 540 2
## 4 2013 1 1 2013-01-01 05:44:00 545 -1
## 5 2013 1 1 2013-01-01 05:54:00 600 -6
## 6 2013 1 1 2013-01-01 05:54:00 558 -4
## 7 2013 1 1 2013-01-01 05:55:00 600 -5
## 8 2013 1 1 2013-01-01 05:57:00 600 -3
## 9 2013 1 1 2013-01-01 05:57:00 600 -3
## 10 2013 1 1 2013-01-01 05:58:00 600 -2
## # … with 328,053 more rows, and 17 more variables: arr_time <dttm>,
## # sched_arr_time <int>, arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>, dep_hour <dbl>, dep_min <dbl>,
## # arr_hour <dbl>, arr_min <dbl>
For this question, use the flights_dt data created in the previous "Your Turn" exercise.
# Average arrival delay for each minute-of-the-hour at which flights departed,
# drawn as a line. Grouping on a freshly computed `minute` replaces the
# existing `minute` column before the groups are formed.
flights_dt %>%
  group_by(minute = minute(dep_time)) %>%
  summarise(
    avg_delay = mean(arr_delay, na.rm = TRUE),
    n = n()
  ) %>%
  ggplot(aes(x = minute, y = avg_delay)) +
  geom_line()
The budget data set comes from the classdata package.
library(classdata)
# Compact overview of every column in budget: name, type and first values.
glimpse(budget)
## Observations: 5,855
## Variables: 5
## $ ReleaseDate <date> 2009-12-17, 2011-05-20, 2019-04-23, 2015-04-22, 201…
## $ Movie <chr> "Avatar", "Pirates of the Caribbean: On Stranger Tid…
## $ ProductionBudget <dbl> 425000000, 410600000, 400000000, 330600000, 31700000…
## $ DomesticGross <dbl> 760507625, 241063875, 858373000, 459005868, 62018138…
## $ WorldwideGross <dbl> 2789705275, 1045663875, 2795473000, 1403013963, 1316…
ReleaseDate is stored as a date.
summary(budget$ReleaseDate)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## "1915-02-08" "2000-04-21" "2007-02-11" "2004-07-14" "2012-12-25" "2022-12-16"
## NA's
## "163"
# Show the six rows with the most recent release dates, newest first.
budget %>%
  arrange(desc(ReleaseDate)) %>%
  head(n = 6)
## ReleaseDate Movie ProductionBudget DomesticGross WorldwideGross
## 1 2022-12-16 Heaven and Hell 4.00e+07 0 0
## 2 2021-01-15 355 7.50e+07 0 0
## 3 2020-08-28 Unhinged 2.90e+07 0 0
## 4 2020-07-17 Tenet 2.24e+08 0 0
## 5 2020-02-21 Call of the Wild 8.20e+07 0 0
## 6 2020-02-14 Sonic The Hedgehog 9.00e+07 0 0
# Histogram of release dates. The x axis is a Date, so a binwidth of 365 makes
# each bar cover 365 days.
ggplot(budget, aes(x = ReleaseDate)) +
  geom_histogram(binwidth = 365)
Join the budget and box office data (by movie name).
box_budget <- box %>% left_join(budget, by = "Movie")
# Preview the first rows of the joined data. A title shared by more than one
# film matches several budget records, so its box office row is repeated (see
# the two "The Addams Family" rows in the output below).
head(box_budget)
## Rank Rank.Last.Week Movie Distributor Gross Change
## 1 1 1 Joker Warner Bros. 55861403 -42
## 2 2 NA The Addams Family United Artists 30300007 NA
## 3 2 NA The Addams Family United Artists 30300007 NA
## 4 3 NA Gemini Man Paramount Pictures 20552372 NA
## 5 4 2 Abominable Universal 6072235 -49
## 6 5 3 Downton Abbey Focus Features 4881075 -39
## Thtrs. Per.Thtr. Total.Gross Week Date ReleaseDate ProductionBudget
## 1 4374 12771 193590190 2 2019-10-11 2019-10-02 5.5e+07
## 2 4007 7562 30300007 1 2019-10-11 1991-11-22 3.0e+07
## 3 4007 7562 30300007 1 2019-10-11 2019-10-11 2.4e+07
## 4 3642 5643 20552372 1 2019-10-11 <NA> NA
## 5 3496 1737 47873585 3 2019-10-11 <NA> NA
## 6 3019 1617 82668665 4 2019-10-11 <NA> NA
## DomesticGross WorldwideGross
## 1 208909478 563609478
## 2 113502246 191502246
## 3 37784650 37784650
## 4 NA NA
## 5 NA NA
## 6 NA NA
# First attempt at reproducing Week: count the whole weeks that have elapsed
# between the release date and the box office date (zero-based).
box_budget %>%
  filter(!is.na(ReleaseDate)) %>%
  mutate(test_week = interval(ReleaseDate, Date) %/% dweeks(1)) %>%
  select(Movie, ReleaseDate, Date, Week, test_week) %>%
  head(n = 6)
## Movie ReleaseDate Date Week test_week
## 1 Joker 2019-10-02 2019-10-11 2 1
## 2 The Addams Family 1991-11-22 2019-10-11 1 1455
## 3 The Addams Family 2019-10-11 2019-10-11 1 0
## 4 Hustlers 2019-09-12 2019-10-11 5 4
## 5 It: Chapter Two 2019-09-04 2019-10-11 6 5
## 6 Ad Astra 2019-09-19 2019-10-11 4 3
# Reproduce the one-based Week column: the number of completed weeks since
# release, plus one for the week currently in progress. Rounding the fractional
# week count up with ceiling() is one too low whenever the gap is an exact
# multiple of seven days (a film reported on its release day would get week 0
# instead of week 1), so floor-and-add-one is used instead.
box_budget %>%
  filter(!is.na(ReleaseDate)) %>%
  mutate(test_week = (ReleaseDate %--% Date) %/% dweeks(1) + 1) %>%
  select(Movie, ReleaseDate, Date, Week, test_week) %>%
  head()
## Movie ReleaseDate Date Week test_week
## 1 Joker 2019-10-02 2019-10-11 2 2
## 2 The Addams Family 1991-11-22 2019-10-11 1 1456
## 3 The Addams Family 2019-10-11 2019-10-11 1 1
## 4 Hustlers 2019-09-12 2019-10-11 5 5
## 5 It: Chapter Two 2019-09-04 2019-10-11 6 6
## 6 Ad Astra 2019-09-19 2019-10-11 4 4