library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.3     ✓ dplyr   1.0.2
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ── Conflicts ─────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
# Read in data 
# March 13
Confirmed_State_3_13 <-   read_csv(url("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/03-13-2020.csv")) %>%
  rename(Country_Region = "Country/Region", Province_State = "Province/State") %>% 
  filter (Country_Region == "US") %>% 
  group_by(Province_State, Country_Region) %>% 
  summarise(Confirmed = sum(Confirmed)) 
## Parsed with column specification:
## cols(
##   `Province/State` = col_character(),
##   `Country/Region` = col_character(),
##   `Last Update` = col_datetime(format = ""),
##   Confirmed = col_double(),
##   Deaths = col_double(),
##   Recovered = col_double(),
##   Latitude = col_double(),
##   Longitude = col_double()
## )
## `summarise()` regrouping output by 'Province_State' (override with `.groups` argument)
str(Confirmed_State_3_13)
## tibble [53 × 3] (S3: grouped_df/tbl_df/tbl/data.frame)
##  $ Province_State: chr [1:53] "Alabama" "Alaska" "Arizona" "Arkansas" ...
##  $ Country_Region: chr [1:53] "US" "US" "US" "US" ...
##  $ Confirmed     : num [1:53] 5 1 9 6 282 49 11 4 46 10 ...
##  - attr(*, "groups")= tibble [53 × 2] (S3: tbl_df/tbl/data.frame)
##   ..$ Province_State: chr [1:53] "Alabama" "Alaska" "Arizona" "Arkansas" ...
##   ..$ .rows         : list<int> [1:53] 
##   .. ..$ : int 1
##   .. ..$ : int 2
##   .. ..$ : int 3
##   .. ..$ : int 4
##   .. ..$ : int 5
##   .. ..$ : int 6
##   .. ..$ : int 7
##   .. ..$ : int 8
##   .. ..$ : int 9
##   .. ..$ : int 10
##   .. ..$ : int 11
##   .. ..$ : int 12
##   .. ..$ : int 13
##   .. ..$ : int 14
##   .. ..$ : int 15
##   .. ..$ : int 16
##   .. ..$ : int 17
##   .. ..$ : int 18
##   .. ..$ : int 19
##   .. ..$ : int 20
##   .. ..$ : int 21
##   .. ..$ : int 22
##   .. ..$ : int 23
##   .. ..$ : int 24
##   .. ..$ : int 25
##   .. ..$ : int 26
##   .. ..$ : int 27
##   .. ..$ : int 28
##   .. ..$ : int 29
##   .. ..$ : int 30
##   .. ..$ : int 31
##   .. ..$ : int 32
##   .. ..$ : int 33
##   .. ..$ : int 34
##   .. ..$ : int 35
##   .. ..$ : int 36
##   .. ..$ : int 37
##   .. ..$ : int 38
##   .. ..$ : int 39
##   .. ..$ : int 40
##   .. ..$ : int 41
##   .. ..$ : int 42
##   .. ..$ : int 43
##   .. ..$ : int 44
##   .. ..$ : int 45
##   .. ..$ : int 46
##   .. ..$ : int 47
##   .. ..$ : int 48
##   .. ..$ : int 49
##   .. ..$ : int 50
##   .. ..$ : int 51
##   .. ..$ : int 52
##   .. ..$ : int 53
##   .. ..@ ptype: int(0) 
##   ..- attr(*, ".drop")= logi TRUE
# Sept 13
Confirmed_State_9_13 <- read_csv(url("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/09-13-2020.csv")) %>% 
  filter (Country_Region == "US") %>% 
  group_by(Province_State, Country_Region) %>% 
  summarise(Confirmed = sum(Confirmed)) 
## Parsed with column specification:
## cols(
##   FIPS = col_double(),
##   Admin2 = col_character(),
##   Province_State = col_character(),
##   Country_Region = col_character(),
##   Last_Update = col_datetime(format = ""),
##   Lat = col_double(),
##   Long_ = col_double(),
##   Confirmed = col_double(),
##   Deaths = col_double(),
##   Recovered = col_double(),
##   Active = col_double(),
##   Combined_Key = col_character(),
##   Incidence_Rate = col_double(),
##   `Case-Fatality_Ratio` = col_double()
## )
## `summarise()` regrouping output by 'Province_State' (override with `.groups` argument)
str(Confirmed_State_9_13)
## tibble [58 × 3] (S3: grouped_df/tbl_df/tbl/data.frame)
##  $ Province_State: chr [1:58] "Alabama" "Alaska" "Arizona" "Arkansas" ...
##  $ Country_Region: chr [1:58] "US" "US" "US" "US" ...
##  $ Confirmed     : num [1:58] 138755 6268 208512 70219 761728 ...
##  - attr(*, "groups")= tibble [58 × 2] (S3: tbl_df/tbl/data.frame)
##   ..$ Province_State: chr [1:58] "Alabama" "Alaska" "Arizona" "Arkansas" ...
##   ..$ .rows         : list<int> [1:58] 
##   .. ..$ : int 1
##   .. ..$ : int 2
##   .. ..$ : int 3
##   .. ..$ : int 4
##   .. ..$ : int 5
##   .. ..$ : int 6
##   .. ..$ : int 7
##   .. ..$ : int 8
##   .. ..$ : int 9
##   .. ..$ : int 10
##   .. ..$ : int 11
##   .. ..$ : int 12
##   .. ..$ : int 13
##   .. ..$ : int 14
##   .. ..$ : int 15
##   .. ..$ : int 16
##   .. ..$ : int 17
##   .. ..$ : int 18
##   .. ..$ : int 19
##   .. ..$ : int 20
##   .. ..$ : int 21
##   .. ..$ : int 22
##   .. ..$ : int 23
##   .. ..$ : int 24
##   .. ..$ : int 25
##   .. ..$ : int 26
##   .. ..$ : int 27
##   .. ..$ : int 28
##   .. ..$ : int 29
##   .. ..$ : int 30
##   .. ..$ : int 31
##   .. ..$ : int 32
##   .. ..$ : int 33
##   .. ..$ : int 34
##   .. ..$ : int 35
##   .. ..$ : int 36
##   .. ..$ : int 37
##   .. ..$ : int 38
##   .. ..$ : int 39
##   .. ..$ : int 40
##   .. ..$ : int 41
##   .. ..$ : int 42
##   .. ..$ : int 43
##   .. ..$ : int 44
##   .. ..$ : int 45
##   .. ..$ : int 46
##   .. ..$ : int 47
##   .. ..$ : int 48
##   .. ..$ : int 49
##   .. ..$ : int 50
##   .. ..$ : int 51
##   .. ..$ : int 52
##   .. ..$ : int 53
##   .. ..$ : int 54
##   .. ..$ : int 55
##   .. ..$ : int 56
##   .. ..$ : int 57
##   .. ..$ : int 58
##   .. ..@ ptype: int(0) 
##   ..- attr(*, ".drop")= logi TRUE
# Notice from the above `str’ calls that there are 53 states in the table on 3/13/2020 and 
# 58 states in the table on 9/13/2020... Let's see what the differences are 

setdiff(Confirmed_State_9_13$Province_State, Confirmed_State_3_13$Province_State)
## [1] "Guam"                     "Northern Mariana Islands"
## [3] "Puerto Rico"              "Recovered"               
## [5] "Virgin Islands"
# There are 4 US territories and a field you may have notice last week: “Recovered”. 
# We don’t need the “Recovered” row so let’s filter it from the data set

Confirmed_State_9_13 <- Confirmed_State_9_13 %>% 
  filter(Province_State != "Recovered") 
# Since all of the states and territories in 9/13/2020 are found in 3/13/2020 a left_join or full_join can 
# be used to merge the tables

Confirmed_State_3_13_9_13_joined <- full_join(Confirmed_State_3_13,
                                              Confirmed_State_9_13, by = c("Province_State"))
# look at the new joined dataframe 
head(Confirmed_State_3_13_9_13_joined)
## # A tibble: 6 x 5
## # Groups:   Province_State [6]
##   Province_State Country_Region.x Confirmed.x Country_Region.y Confirmed.y
##   <chr>          <chr>                  <dbl> <chr>                  <dbl>
## 1 Alabama        US                         5 US                    138755
## 2 Alaska         US                         1 US                      6268
## 3 Arizona        US                         9 US                    208512
## 4 Arkansas       US                         6 US                     70219
## 5 California     US                       282 US                    761728
## 6 Colorado       US                        49 US                     61293
tail(Confirmed_State_3_13_9_13_joined, 5)
## # A tibble: 5 x 5
## # Groups:   Province_State [5]
##   Province_State       Country_Region.x Confirmed.x Country_Region.y Confirmed.y
##   <chr>                <chr>                  <dbl> <chr>                  <dbl>
## 1 Wyoming              US                         1 US                      4346
## 2 Guam                 <NA>                      NA US                      1863
## 3 Northern Mariana Is… <NA>                      NA US                        60
## 4 Puerto Rico          <NA>                      NA US                     37380
## 5 Virgin Islands       <NA>                      NA US                      1220
# look to see what cells are NA  
which(is.na(Confirmed_State_3_13_9_13_joined))
## [1] 111 112 113 114 168 169 170 171
# let's turn the NAs to 0 and rename the columns 
Confirmed_State_3_13_9_13_joined <- full_join(Confirmed_State_3_13,
                                              Confirmed_State_9_13, by = c("Province_State")) %>% 
  rename(Confirmed_3_13_2020 = "Confirmed.x", Confirmed_9_13_2020 = "Confirmed.y") %>% 
  select(-Country_Region.x, -Country_Region.y) %>% 
  replace_na(list(Confirmed_3_13_2020 = 0))

head(Confirmed_State_3_13_9_13_joined)
## # A tibble: 6 x 3
## # Groups:   Province_State [6]
##   Province_State Confirmed_3_13_2020 Confirmed_9_13_2020
##   <chr>                        <dbl>               <dbl>
## 1 Alabama                          5              138755
## 2 Alaska                           1                6268
## 3 Arizona                          9              208512
## 4 Arkansas                         6               70219
## 5 California                     282              761728
## 6 Colorado                        49               61293
# Switching between wide and long table formats
Confirmed_State_3_13_9_13_joined_long <- Confirmed_State_3_13_9_13_joined %>% 
  pivot_longer(-c(Province_State),
               names_to = "Date", values_to = "Confirmed")
# to adjust figure size {r, fig.width = 5, fig.height = 10}
ggplot(Confirmed_State_3_13_9_13_joined_long, aes(x = Confirmed,  y = Province_State))  + 
  geom_point(aes(color = Date)) 

# Working with the time series data 
# load the most recent time series data for confirmed cases 

download.file(url="https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv", 
              destfile = "time_series_covid19_confirmed_global.csv")

time_series_confirmed <- read_csv("time_series_covid19_confirmed_global.csv")%>%
  rename(Province_State = "Province/State", Country_Region = "Country/Region")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   `Province/State` = col_character(),
##   `Country/Region` = col_character()
## )
## See spec(...) for full column specifications.
head(time_series_confirmed)
## # A tibble: 6 x 258
##   Province_State Country_Region   Lat   Long `1/22/20` `1/23/20` `1/24/20`
##   <chr>          <chr>          <dbl>  <dbl>     <dbl>     <dbl>     <dbl>
## 1 <NA>           Afghanistan     33.9  67.7          0         0         0
## 2 <NA>           Albania         41.2  20.2          0         0         0
## 3 <NA>           Algeria         28.0   1.66         0         0         0
## 4 <NA>           Andorra         42.5   1.52         0         0         0
## 5 <NA>           Angola         -11.2  17.9          0         0         0
## 6 <NA>           Antigua and B…  17.1 -61.8          0         0         0
## # … with 251 more variables: `1/25/20` <dbl>, `1/26/20` <dbl>, `1/27/20` <dbl>,
## #   `1/28/20` <dbl>, `1/29/20` <dbl>, `1/30/20` <dbl>, `1/31/20` <dbl>,
## #   `2/1/20` <dbl>, `2/2/20` <dbl>, `2/3/20` <dbl>, `2/4/20` <dbl>,
## #   `2/5/20` <dbl>, `2/6/20` <dbl>, `2/7/20` <dbl>, `2/8/20` <dbl>,
## #   `2/9/20` <dbl>, `2/10/20` <dbl>, `2/11/20` <dbl>, `2/12/20` <dbl>,
## #   `2/13/20` <dbl>, `2/14/20` <dbl>, `2/15/20` <dbl>, `2/16/20` <dbl>,
## #   `2/17/20` <dbl>, `2/18/20` <dbl>, `2/19/20` <dbl>, `2/20/20` <dbl>,
## #   `2/21/20` <dbl>, `2/22/20` <dbl>, `2/23/20` <dbl>, `2/24/20` <dbl>,
## #   `2/25/20` <dbl>, `2/26/20` <dbl>, `2/27/20` <dbl>, `2/28/20` <dbl>,
## #   `2/29/20` <dbl>, `3/1/20` <dbl>, `3/2/20` <dbl>, `3/3/20` <dbl>,
## #   `3/4/20` <dbl>, `3/5/20` <dbl>, `3/6/20` <dbl>, `3/7/20` <dbl>,
## #   `3/8/20` <dbl>, `3/9/20` <dbl>, `3/10/20` <dbl>, `3/11/20` <dbl>,
## #   `3/12/20` <dbl>, `3/13/20` <dbl>, `3/14/20` <dbl>, `3/15/20` <dbl>,
## #   `3/16/20` <dbl>, `3/17/20` <dbl>, `3/18/20` <dbl>, `3/19/20` <dbl>,
## #   `3/20/20` <dbl>, `3/21/20` <dbl>, `3/22/20` <dbl>, `3/23/20` <dbl>,
## #   `3/24/20` <dbl>, `3/25/20` <dbl>, `3/26/20` <dbl>, `3/27/20` <dbl>,
## #   `3/28/20` <dbl>, `3/29/20` <dbl>, `3/30/20` <dbl>, `3/31/20` <dbl>,
## #   `4/1/20` <dbl>, `4/2/20` <dbl>, `4/3/20` <dbl>, `4/4/20` <dbl>,
## #   `4/5/20` <dbl>, `4/6/20` <dbl>, `4/7/20` <dbl>, `4/8/20` <dbl>,
## #   `4/9/20` <dbl>, `4/10/20` <dbl>, `4/11/20` <dbl>, `4/12/20` <dbl>,
## #   `4/13/20` <dbl>, `4/14/20` <dbl>, `4/15/20` <dbl>, `4/16/20` <dbl>,
## #   `4/17/20` <dbl>, `4/18/20` <dbl>, `4/19/20` <dbl>, `4/20/20` <dbl>,
## #   `4/21/20` <dbl>, `4/22/20` <dbl>, `4/23/20` <dbl>, `4/24/20` <dbl>,
## #   `4/25/20` <dbl>, `4/26/20` <dbl>, `4/27/20` <dbl>, `4/28/20` <dbl>,
## #   `4/29/20` <dbl>, `4/30/20` <dbl>, `5/1/20` <dbl>, `5/2/20` <dbl>,
## #   `5/3/20` <dbl>, …
time_series_confirmed_long <- time_series_confirmed %>% 
  pivot_longer(-c(Province_State, Country_Region, Lat, Long),
               names_to = "Date", values_to = "Confirmed") 

head(time_series_confirmed_long)
## # A tibble: 6 x 6
##   Province_State Country_Region   Lat  Long Date    Confirmed
##   <chr>          <chr>          <dbl> <dbl> <chr>       <dbl>
## 1 <NA>           Afghanistan     33.9  67.7 1/22/20         0
## 2 <NA>           Afghanistan     33.9  67.7 1/23/20         0
## 3 <NA>           Afghanistan     33.9  67.7 1/24/20         0
## 4 <NA>           Afghanistan     33.9  67.7 1/25/20         0
## 5 <NA>           Afghanistan     33.9  67.7 1/26/20         0
## 6 <NA>           Afghanistan     33.9  67.7 1/27/20         0
download.file(url="https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv", 
              destfile = "time_series_covid19_deaths_global.csv")

time_series_deaths <- read_csv("time_series_covid19_deaths_global.csv")%>%
  rename(Province_State = "Province/State", Country_Region = "Country/Region")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   `Province/State` = col_character(),
##   `Country/Region` = col_character()
## )
## See spec(...) for full column specifications.
time_series_deaths_long <- time_series_deaths %>% 
  pivot_longer(-c(Province_State, Country_Region, Lat, Long),
               names_to = "Date", values_to = "Deaths") 

head(time_series_deaths_long)
## # A tibble: 6 x 6
##   Province_State Country_Region   Lat  Long Date    Deaths
##   <chr>          <chr>          <dbl> <dbl> <chr>    <dbl>
## 1 <NA>           Afghanistan     33.9  67.7 1/22/20      0
## 2 <NA>           Afghanistan     33.9  67.7 1/23/20      0
## 3 <NA>           Afghanistan     33.9  67.7 1/24/20      0
## 4 <NA>           Afghanistan     33.9  67.7 1/25/20      0
## 5 <NA>           Afghanistan     33.9  67.7 1/26/20      0
## 6 <NA>           Afghanistan     33.9  67.7 1/27/20      0
# Joining the time series tables 

# first need to create the common column (key), which will be a column where we combine province, country, and date 
time_series_confirmed_long <- time_series_confirmed_long %>% 
  unite(Key, Province_State, Country_Region, Date, sep = ".", remove = FALSE)

head(time_series_confirmed_long)
## # A tibble: 6 x 7
##   Key                 Province_State Country_Region   Lat  Long Date   Confirmed
##   <chr>               <chr>          <chr>          <dbl> <dbl> <chr>      <dbl>
## 1 NA.Afghanistan.1/2… <NA>           Afghanistan     33.9  67.7 1/22/…         0
## 2 NA.Afghanistan.1/2… <NA>           Afghanistan     33.9  67.7 1/23/…         0
## 3 NA.Afghanistan.1/2… <NA>           Afghanistan     33.9  67.7 1/24/…         0
## 4 NA.Afghanistan.1/2… <NA>           Afghanistan     33.9  67.7 1/25/…         0
## 5 NA.Afghanistan.1/2… <NA>           Afghanistan     33.9  67.7 1/26/…         0
## 6 NA.Afghanistan.1/2… <NA>           Afghanistan     33.9  67.7 1/27/…         0
# do the same thing for deaths 
time_series_deaths_long <- time_series_deaths_long %>% 
  unite(Key, Province_State, Country_Region, Date, sep = ".") %>% 
  select(Key, Deaths)

head(time_series_deaths_long)
## # A tibble: 6 x 2
##   Key                    Deaths
##   <chr>                   <dbl>
## 1 NA.Afghanistan.1/22/20      0
## 2 NA.Afghanistan.1/23/20      0
## 3 NA.Afghanistan.1/24/20      0
## 4 NA.Afghanistan.1/25/20      0
## 5 NA.Afghanistan.1/26/20      0
## 6 NA.Afghanistan.1/27/20      0
# now join the tables 
time_series_long_joined <- full_join(time_series_confirmed_long,
                                     time_series_deaths_long, by = c("Key")) %>% 
  select(-Key)

head(time_series_long_joined)
## # A tibble: 6 x 7
##   Province_State Country_Region   Lat  Long Date    Confirmed Deaths
##   <chr>          <chr>          <dbl> <dbl> <chr>       <dbl>  <dbl>
## 1 <NA>           Afghanistan     33.9  67.7 1/22/20         0      0
## 2 <NA>           Afghanistan     33.9  67.7 1/23/20         0      0
## 3 <NA>           Afghanistan     33.9  67.7 1/24/20         0      0
## 4 <NA>           Afghanistan     33.9  67.7 1/25/20         0      0
## 5 <NA>           Afghanistan     33.9  67.7 1/26/20         0      0
## 6 <NA>           Afghanistan     33.9  67.7 1/27/20         0      0
which(is.na(time_series_long_joined$Confirmed))
## integer(0)
which(is.na(time_series_long_joined$Deaths))
## integer(0)

no NAs - great

library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
time_series_long_joined$Date <- mdy(time_series_long_joined$Date)

# create a column using pivot_long 

time_series_long_joined_counts <- time_series_long_joined %>% 
  pivot_longer(-c(Province_State, Country_Region, Lat, Long, Date),
               names_to = "Report_Type", values_to = "Counts")
time_series_long_joined_counts <- time_series_long_joined %>% 
  pivot_longer(-c(Province_State, Country_Region, Lat, Long, Date),
               names_to = "Report_Type", values_to = "Counts")

head(time_series_long_joined_counts)
## # A tibble: 6 x 7
##   Province_State Country_Region   Lat  Long Date       Report_Type Counts
##   <chr>          <chr>          <dbl> <dbl> <date>     <chr>        <dbl>
## 1 <NA>           Afghanistan     33.9  67.7 2020-01-22 Confirmed        0
## 2 <NA>           Afghanistan     33.9  67.7 2020-01-22 Deaths           0
## 3 <NA>           Afghanistan     33.9  67.7 2020-01-23 Confirmed        0
## 4 <NA>           Afghanistan     33.9  67.7 2020-01-23 Deaths           0
## 5 <NA>           Afghanistan     33.9  67.7 2020-01-24 Confirmed        0
## 6 <NA>           Afghanistan     33.9  67.7 2020-01-24 Deaths           0
# Making graphs from the time series data 

time_series_long_joined %>% 
  group_by(Country_Region,Date) %>% 
  summarise_at(c("Confirmed", "Deaths"), sum) %>% 
  filter (Country_Region == "US") %>% 
  ggplot(aes(x = Date,  y = Deaths)) + 
  geom_point() +
  geom_line() +
  ggtitle("US COVID-19 Deaths")

# some other countries 
time_series_long_joined %>% 
  group_by(Country_Region,Date) %>% 
  summarise_at(c("Confirmed", "Deaths"), sum) %>% 
  filter (Country_Region %in% c("China","Japan", "Korea, South",
                                "Italy","Spain", "US")) %>% 
  ggplot(aes(x = Date,  y = Deaths)) + 
  geom_point() +
  geom_line() +
  ggtitle("COVID-19 Deaths") +
  facet_wrap(~Country_Region, ncol=2, scales="free_y")

# several countries on the same graph

time_series_long_joined %>% 
  group_by(Country_Region,Date) %>% 
  summarise_at(c("Confirmed", "Deaths"), sum) %>% 
  filter (Country_Region %in% c("China","France","Italy", 
                                "Korea, South", "US")) %>% 
  ggplot(aes(x = Date,  y = Deaths, color = Country_Region)) + 
  geom_point() +
  geom_line() +
  ggtitle("COVID-19 Deaths")

# cases and deaths on the same graph 

time_series_long_joined_counts %>% 
  group_by(Country_Region, Report_Type, Date) %>% 
  summarise(Counts = sum(Counts)) %>% 
  filter (Country_Region == "US") %>% 
  ggplot(aes(x = Date,  y = log2(Counts), fill = Report_Type, color = Report_Type)) + 
  geom_point() +
  geom_line() +
  ggtitle("US COVID-19 Cases")
## `summarise()` regrouping output by 'Country_Region', 'Report_Type' (override with `.groups` argument)

Exercises

  1. Finish the visualization tutorial: Lab 5A

  2. Revise the above example for joining tables using 6/13/2020 and 9/13/2020. Plot the data as a bar plot. Make sure your graph looks nice in the report by adjusting the height in the R code chunk header.

# Read in the June data

june <- read_csv(url("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/06-13-2020.csv")) %>%
  filter (Country_Region == "US") %>% 
  group_by(Province_State, Country_Region) %>% 
  summarise(Confirmed = sum(Confirmed))
## Parsed with column specification:
## cols(
##   FIPS = col_double(),
##   Admin2 = col_character(),
##   Province_State = col_character(),
##   Country_Region = col_character(),
##   Last_Update = col_datetime(format = ""),
##   Lat = col_double(),
##   Long_ = col_double(),
##   Confirmed = col_double(),
##   Deaths = col_double(),
##   Recovered = col_double(),
##   Active = col_double(),
##   Combined_Key = col_character(),
##   Incidence_Rate = col_double(),
##   `Case-Fatality_Ratio` = col_double()
## )
## `summarise()` regrouping output by 'Province_State' (override with `.groups` argument)
str(june)
## tibble [58 × 3] (S3: grouped_df/tbl_df/tbl/data.frame)
##  $ Province_State: chr [1:58] "Alabama" "Alaska" "Arizona" "Arkansas" ...
##  $ Country_Region: chr [1:58] "US" "US" "US" "US" ...
##  $ Confirmed     : num [1:58] 24601 653 34660 12095 150018 ...
##  - attr(*, "groups")= tibble [58 × 2] (S3: tbl_df/tbl/data.frame)
##   ..$ Province_State: chr [1:58] "Alabama" "Alaska" "Arizona" "Arkansas" ...
##   ..$ .rows         : list<int> [1:58] 
##   .. ..$ : int 1
##   .. ..$ : int 2
##   .. ..$ : int 3
##   .. ..$ : int 4
##   .. ..$ : int 5
##   .. ..$ : int 6
##   .. ..$ : int 7
##   .. ..$ : int 8
##   .. ..$ : int 9
##   .. ..$ : int 10
##   .. ..$ : int 11
##   .. ..$ : int 12
##   .. ..$ : int 13
##   .. ..$ : int 14
##   .. ..$ : int 15
##   .. ..$ : int 16
##   .. ..$ : int 17
##   .. ..$ : int 18
##   .. ..$ : int 19
##   .. ..$ : int 20
##   .. ..$ : int 21
##   .. ..$ : int 22
##   .. ..$ : int 23
##   .. ..$ : int 24
##   .. ..$ : int 25
##   .. ..$ : int 26
##   .. ..$ : int 27
##   .. ..$ : int 28
##   .. ..$ : int 29
##   .. ..$ : int 30
##   .. ..$ : int 31
##   .. ..$ : int 32
##   .. ..$ : int 33
##   .. ..$ : int 34
##   .. ..$ : int 35
##   .. ..$ : int 36
##   .. ..$ : int 37
##   .. ..$ : int 38
##   .. ..$ : int 39
##   .. ..$ : int 40
##   .. ..$ : int 41
##   .. ..$ : int 42
##   .. ..$ : int 43
##   .. ..$ : int 44
##   .. ..$ : int 45
##   .. ..$ : int 46
##   .. ..$ : int 47
##   .. ..$ : int 48
##   .. ..$ : int 49
##   .. ..$ : int 50
##   .. ..$ : int 51
##   .. ..$ : int 52
##   .. ..$ : int 53
##   .. ..$ : int 54
##   .. ..$ : int 55
##   .. ..$ : int 56
##   .. ..$ : int 57
##   .. ..$ : int 58
##   .. ..@ ptype: int(0) 
##   ..- attr(*, ".drop")= logi TRUE
sept <- read_csv(url("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/09-13-2020.csv")) %>%
  filter (Country_Region == "US") %>% 
  group_by(Province_State, Country_Region) %>% 
  summarise(Confirmed = sum(Confirmed))
## Parsed with column specification:
## cols(
##   FIPS = col_double(),
##   Admin2 = col_character(),
##   Province_State = col_character(),
##   Country_Region = col_character(),
##   Last_Update = col_datetime(format = ""),
##   Lat = col_double(),
##   Long_ = col_double(),
##   Confirmed = col_double(),
##   Deaths = col_double(),
##   Recovered = col_double(),
##   Active = col_double(),
##   Combined_Key = col_character(),
##   Incidence_Rate = col_double(),
##   `Case-Fatality_Ratio` = col_double()
## )
## `summarise()` regrouping output by 'Province_State' (override with `.groups` argument)
str(sept)
## tibble [58 × 3] (S3: grouped_df/tbl_df/tbl/data.frame)
##  $ Province_State: chr [1:58] "Alabama" "Alaska" "Arizona" "Arkansas" ...
##  $ Country_Region: chr [1:58] "US" "US" "US" "US" ...
##  $ Confirmed     : num [1:58] 138755 6268 208512 70219 761728 ...
##  - attr(*, "groups")= tibble [58 × 2] (S3: tbl_df/tbl/data.frame)
##   ..$ Province_State: chr [1:58] "Alabama" "Alaska" "Arizona" "Arkansas" ...
##   ..$ .rows         : list<int> [1:58] 
##   .. ..$ : int 1
##   .. ..$ : int 2
##   .. ..$ : int 3
##   .. ..$ : int 4
##   .. ..$ : int 5
##   .. ..$ : int 6
##   .. ..$ : int 7
##   .. ..$ : int 8
##   .. ..$ : int 9
##   .. ..$ : int 10
##   .. ..$ : int 11
##   .. ..$ : int 12
##   .. ..$ : int 13
##   .. ..$ : int 14
##   .. ..$ : int 15
##   .. ..$ : int 16
##   .. ..$ : int 17
##   .. ..$ : int 18
##   .. ..$ : int 19
##   .. ..$ : int 20
##   .. ..$ : int 21
##   .. ..$ : int 22
##   .. ..$ : int 23
##   .. ..$ : int 24
##   .. ..$ : int 25
##   .. ..$ : int 26
##   .. ..$ : int 27
##   .. ..$ : int 28
##   .. ..$ : int 29
##   .. ..$ : int 30
##   .. ..$ : int 31
##   .. ..$ : int 32
##   .. ..$ : int 33
##   .. ..$ : int 34
##   .. ..$ : int 35
##   .. ..$ : int 36
##   .. ..$ : int 37
##   .. ..$ : int 38
##   .. ..$ : int 39
##   .. ..$ : int 40
##   .. ..$ : int 41
##   .. ..$ : int 42
##   .. ..$ : int 43
##   .. ..$ : int 44
##   .. ..$ : int 45
##   .. ..$ : int 46
##   .. ..$ : int 47
##   .. ..$ : int 48
##   .. ..$ : int 49
##   .. ..$ : int 50
##   .. ..$ : int 51
##   .. ..$ : int 52
##   .. ..$ : int 53
##   .. ..$ : int 54
##   .. ..$ : int 55
##   .. ..$ : int 56
##   .. ..$ : int 57
##   .. ..$ : int 58
##   .. ..@ ptype: int(0) 
##   ..- attr(*, ".drop")= logi TRUE
# Join the two dataframes 
june_sept_joined <- full_join(june, sept, by = c("Province_State"))

str(june_sept_joined)
## tibble [58 × 5] (S3: grouped_df/tbl_df/tbl/data.frame)
##  $ Province_State  : chr [1:58] "Alabama" "Alaska" "Arizona" "Arkansas" ...
##  $ Country_Region.x: chr [1:58] "US" "US" "US" "US" ...
##  $ Confirmed.x     : num [1:58] 24601 653 34660 12095 150018 ...
##  $ Country_Region.y: chr [1:58] "US" "US" "US" "US" ...
##  $ Confirmed.y     : num [1:58] 138755 6268 208512 70219 761728 ...
##  - attr(*, "groups")= tibble [58 × 2] (S3: tbl_df/tbl/data.frame)
##   ..$ Province_State: chr [1:58] "Alabama" "Alaska" "Arizona" "Arkansas" ...
##   ..$ .rows         : list<int> [1:58] 
##   .. ..$ : int 1
##   .. ..$ : int 2
##   .. ..$ : int 3
##   .. ..$ : int 4
##   .. ..$ : int 5
##   .. ..$ : int 6
##   .. ..$ : int 7
##   .. ..$ : int 8
##   .. ..$ : int 9
##   .. ..$ : int 10
##   .. ..$ : int 11
##   .. ..$ : int 12
##   .. ..$ : int 13
##   .. ..$ : int 14
##   .. ..$ : int 15
##   .. ..$ : int 16
##   .. ..$ : int 17
##   .. ..$ : int 18
##   .. ..$ : int 19
##   .. ..$ : int 20
##   .. ..$ : int 21
##   .. ..$ : int 22
##   .. ..$ : int 23
##   .. ..$ : int 24
##   .. ..$ : int 25
##   .. ..$ : int 26
##   .. ..$ : int 27
##   .. ..$ : int 28
##   .. ..$ : int 29
##   .. ..$ : int 30
##   .. ..$ : int 31
##   .. ..$ : int 32
##   .. ..$ : int 33
##   .. ..$ : int 34
##   .. ..$ : int 35
##   .. ..$ : int 36
##   .. ..$ : int 37
##   .. ..$ : int 38
##   .. ..$ : int 39
##   .. ..$ : int 40
##   .. ..$ : int 41
##   .. ..$ : int 42
##   .. ..$ : int 43
##   .. ..$ : int 44
##   .. ..$ : int 45
##   .. ..$ : int 46
##   .. ..$ : int 47
##   .. ..$ : int 48
##   .. ..$ : int 49
##   .. ..$ : int 50
##   .. ..$ : int 51
##   .. ..$ : int 52
##   .. ..$ : int 53
##   .. ..$ : int 54
##   .. ..$ : int 55
##   .. ..$ : int 56
##   .. ..$ : int 57
##   .. ..$ : int 58
##   .. ..@ ptype: int(0) 
##   ..- attr(*, ".drop")= logi TRUE
# look at the new joined dataframe 
head(june_sept_joined)
## # A tibble: 6 x 5
## # Groups:   Province_State [6]
##   Province_State Country_Region.x Confirmed.x Country_Region.y Confirmed.y
##   <chr>          <chr>                  <dbl> <chr>                  <dbl>
## 1 Alabama        US                     24601 US                    138755
## 2 Alaska         US                       653 US                      6268
## 3 Arizona        US                     34660 US                    208512
## 4 Arkansas       US                     12095 US                     70219
## 5 California     US                    150018 US                    761728
## 6 Colorado       US                     29002 US                     61293
tail(june_sept_joined, 5)
## # A tibble: 5 x 5
## # Groups:   Province_State [5]
##   Province_State Country_Region.x Confirmed.x Country_Region.y Confirmed.y
##   <chr>          <chr>                  <dbl> <chr>                  <dbl>
## 1 Virginia       US                     53869 US                    133742
## 2 Washington     US                     25538 US                     79826
## 3 West Virginia  US                      2274 US                     12705
## 4 Wisconsin      US                     22518 US                     89185
## 5 Wyoming        US                      1050 US                      4346
# look to see what cells are NA  
which(is.na(june_sept_joined))
## integer(0)
# rename the columns 
june_sept_joined <- full_join(june, sept, by = c("Province_State")) %>% 
  rename(Confirmed_June = "Confirmed.x", Confirmed_Sept = "Confirmed.y") %>% 
  select(-Country_Region.x, -Country_Region.y)

head(june_sept_joined)
## # A tibble: 6 x 3
## # Groups:   Province_State [6]
##   Province_State Confirmed_June Confirmed_Sept
##   <chr>                   <dbl>          <dbl>
## 1 Alabama                 24601         138755
## 2 Alaska                    653           6268
## 3 Arizona                 34660         208512
## 4 Arkansas                12095          70219
## 5 California             150018         761728
## 6 Colorado                29002          61293
# pivot the data 
june_sept_joined_long <- june_sept_joined %>% 
              pivot_longer(-c(Province_State),
                            names_to = "Date", values_to = "Confirmed")

head(june_sept_joined_long)
## # A tibble: 6 x 3
## # Groups:   Province_State [3]
##   Province_State Date           Confirmed
##   <chr>          <chr>              <dbl>
## 1 Alabama        Confirmed_June     24601
## 2 Alabama        Confirmed_Sept    138755
## 3 Alaska         Confirmed_June       653
## 4 Alaska         Confirmed_Sept      6268
## 5 Arizona        Confirmed_June     34660
## 6 Arizona        Confirmed_Sept    208512
ggplot(june_sept_joined_long, aes(x = Confirmed,  y = Province_State))  + 
    geom_point(aes(color = Date)) 

  1. Add more informative x and y labels and a title to the above graph in Ex2.
ggplot(june_sept_joined_long, aes(x = Confirmed,  y = Province_State))  + 
    geom_point(aes(color = Date)) +
  labs(x = "Confirmed Cases", y = "State/Province")

  1. Using the time series data. Make a plot of the total number of confirmed deaths per day worldwide.
# load time series death data again
download.file(url="https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv", 
               destfile = "data_raw/time_series_covid19_deaths_global.csv")


time_series_deaths <- read_csv("data_raw/time_series_covid19_deaths_global.csv")%>%
  rename(Province_State = "Province/State", Country_Region = "Country/Region")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   `Province/State` = col_character(),
##   `Country/Region` = col_character()
## )
## See spec(...) for full column specifications.
head(time_series_deaths)
## # A tibble: 6 x 258
##   Province_State Country_Region   Lat   Long `1/22/20` `1/23/20` `1/24/20`
##   <chr>          <chr>          <dbl>  <dbl>     <dbl>     <dbl>     <dbl>
## 1 <NA>           Afghanistan     33.9  67.7          0         0         0
## 2 <NA>           Albania         41.2  20.2          0         0         0
## 3 <NA>           Algeria         28.0   1.66         0         0         0
## 4 <NA>           Andorra         42.5   1.52         0         0         0
## 5 <NA>           Angola         -11.2  17.9          0         0         0
## 6 <NA>           Antigua and B…  17.1 -61.8          0         0         0
## # … with 251 more variables: `1/25/20` <dbl>, `1/26/20` <dbl>, `1/27/20` <dbl>,
## #   `1/28/20` <dbl>, `1/29/20` <dbl>, `1/30/20` <dbl>, `1/31/20` <dbl>,
## #   `2/1/20` <dbl>, `2/2/20` <dbl>, `2/3/20` <dbl>, `2/4/20` <dbl>,
## #   `2/5/20` <dbl>, `2/6/20` <dbl>, `2/7/20` <dbl>, `2/8/20` <dbl>,
## #   `2/9/20` <dbl>, `2/10/20` <dbl>, `2/11/20` <dbl>, `2/12/20` <dbl>,
## #   `2/13/20` <dbl>, `2/14/20` <dbl>, `2/15/20` <dbl>, `2/16/20` <dbl>,
## #   `2/17/20` <dbl>, `2/18/20` <dbl>, `2/19/20` <dbl>, `2/20/20` <dbl>,
## #   `2/21/20` <dbl>, `2/22/20` <dbl>, `2/23/20` <dbl>, `2/24/20` <dbl>,
## #   `2/25/20` <dbl>, `2/26/20` <dbl>, `2/27/20` <dbl>, `2/28/20` <dbl>,
## #   `2/29/20` <dbl>, `3/1/20` <dbl>, `3/2/20` <dbl>, `3/3/20` <dbl>,
## #   `3/4/20` <dbl>, `3/5/20` <dbl>, `3/6/20` <dbl>, `3/7/20` <dbl>,
## #   `3/8/20` <dbl>, `3/9/20` <dbl>, `3/10/20` <dbl>, `3/11/20` <dbl>,
## #   `3/12/20` <dbl>, `3/13/20` <dbl>, `3/14/20` <dbl>, `3/15/20` <dbl>,
## #   `3/16/20` <dbl>, `3/17/20` <dbl>, `3/18/20` <dbl>, `3/19/20` <dbl>,
## #   `3/20/20` <dbl>, `3/21/20` <dbl>, `3/22/20` <dbl>, `3/23/20` <dbl>,
## #   `3/24/20` <dbl>, `3/25/20` <dbl>, `3/26/20` <dbl>, `3/27/20` <dbl>,
## #   `3/28/20` <dbl>, `3/29/20` <dbl>, `3/30/20` <dbl>, `3/31/20` <dbl>,
## #   `4/1/20` <dbl>, `4/2/20` <dbl>, `4/3/20` <dbl>, `4/4/20` <dbl>,
## #   `4/5/20` <dbl>, `4/6/20` <dbl>, `4/7/20` <dbl>, `4/8/20` <dbl>,
## #   `4/9/20` <dbl>, `4/10/20` <dbl>, `4/11/20` <dbl>, `4/12/20` <dbl>,
## #   `4/13/20` <dbl>, `4/14/20` <dbl>, `4/15/20` <dbl>, `4/16/20` <dbl>,
## #   `4/17/20` <dbl>, `4/18/20` <dbl>, `4/19/20` <dbl>, `4/20/20` <dbl>,
## #   `4/21/20` <dbl>, `4/22/20` <dbl>, `4/23/20` <dbl>, `4/24/20` <dbl>,
## #   `4/25/20` <dbl>, `4/26/20` <dbl>, `4/27/20` <dbl>, `4/28/20` <dbl>,
## #   `4/29/20` <dbl>, `4/30/20` <dbl>, `5/1/20` <dbl>, `5/2/20` <dbl>,
## #   `5/3/20` <dbl>, …
time_series_deaths_long <- time_series_deaths %>% 
               pivot_longer(-c(Province_State, Country_Region, Lat, Long),
                            names_to = "Date", values_to = "Deaths") 

head(time_series_deaths_long)
## # A tibble: 6 x 6
##   Province_State Country_Region   Lat  Long Date    Deaths
##   <chr>          <chr>          <dbl> <dbl> <chr>    <dbl>
## 1 <NA>           Afghanistan     33.9  67.7 1/22/20      0
## 2 <NA>           Afghanistan     33.9  67.7 1/23/20      0
## 3 <NA>           Afghanistan     33.9  67.7 1/24/20      0
## 4 <NA>           Afghanistan     33.9  67.7 1/25/20      0
## 5 <NA>           Afghanistan     33.9  67.7 1/26/20      0
## 6 <NA>           Afghanistan     33.9  67.7 1/27/20      0
# reformat date 
library(lubridate)
time_series_deaths_long$Date <- mdy(time_series_deaths_long$Date)

head(time_series_deaths_long)
## # A tibble: 6 x 6
##   Province_State Country_Region   Lat  Long Date       Deaths
##   <chr>          <chr>          <dbl> <dbl> <date>      <dbl>
## 1 <NA>           Afghanistan     33.9  67.7 2020-01-22      0
## 2 <NA>           Afghanistan     33.9  67.7 2020-01-23      0
## 3 <NA>           Afghanistan     33.9  67.7 2020-01-24      0
## 4 <NA>           Afghanistan     33.9  67.7 2020-01-25      0
## 5 <NA>           Afghanistan     33.9  67.7 2020-01-26      0
## 6 <NA>           Afghanistan     33.9  67.7 2020-01-27      0
time_series_deaths_long %>%
  group_by(Date) %>% 
  summarize(Total_Deaths = sum(Deaths)) %>%
  ggplot(aes(x = Date,  y = Total_Deaths)) + 
    geom_line() +
    ggtitle("Worldwide COVID-19 Deaths")
## `summarise()` ungrouping output (override with `.groups` argument)

  1. Use Mutate to make a new column with deaths/confirmed cases (Hint: this might be best done after joining the tables, but before pivoting longer).
# lets look at the data set we want, which we made a bit ago 
head(time_series_long_joined)
## # A tibble: 6 x 7
##   Province_State Country_Region   Lat  Long Date       Confirmed Deaths
##   <chr>          <chr>          <dbl> <dbl> <date>         <dbl>  <dbl>
## 1 <NA>           Afghanistan     33.9  67.7 2020-01-22         0      0
## 2 <NA>           Afghanistan     33.9  67.7 2020-01-23         0      0
## 3 <NA>           Afghanistan     33.9  67.7 2020-01-24         0      0
## 4 <NA>           Afghanistan     33.9  67.7 2020-01-25         0      0
## 5 <NA>           Afghanistan     33.9  67.7 2020-01-26         0      0
## 6 <NA>           Afghanistan     33.9  67.7 2020-01-27         0      0
# change the date format
time_series_long_joined$Date <- mdy(time_series_long_joined$Date)
## Warning: All formats failed to parse. No formats found.
# now mutate to add column 
mortality<-time_series_long_joined %>% 
  mutate(rate = Deaths/Confirmed)

head(mortality)
## # A tibble: 6 x 8
##   Province_State Country_Region   Lat  Long Date       Confirmed Deaths  rate
##   <chr>          <chr>          <dbl> <dbl> <date>         <dbl>  <dbl> <dbl>
## 1 <NA>           Afghanistan     33.9  67.7 NA                 0      0   NaN
## 2 <NA>           Afghanistan     33.9  67.7 NA                 0      0   NaN
## 3 <NA>           Afghanistan     33.9  67.7 NA                 0      0   NaN
## 4 <NA>           Afghanistan     33.9  67.7 NA                 0      0   NaN
## 5 <NA>           Afghanistan     33.9  67.7 NA                 0      0   NaN
## 6 <NA>           Afghanistan     33.9  67.7 NA                 0      0   NaN
  1. Plot US deaths/confirmed cases per day. For some reason my code works - it produces th plot that I want - but it runs into an error when I knit.
#mortality %>% 
#  replace_na(list(rate = 0))  %>% 
#  filter(Country_Region == "US") %>% 
#  ggplot(aes(x = Date, y = rate)) +
#  geom_line() + 
#  ggtitle("Mortality rate in the US (deaths per positive case)")
  1. Make a single graph with from the times series data with the 10 countries with the hightest death total.
# first find the total deaths of the top ten countries 
top_deaths<-time_series_deaths_long %>% 
  group_by(Country_Region) %>% 
  summarize(total_deaths = sum(Deaths)) %>% 
  arrange(desc(total_deaths)) %>% 
  slice_head(n=10)
## `summarise()` ungrouping output (override with `.groups` argument)
countries <- top_deaths$Country_Region
time_series_deaths_long %>% 
  filter(Country_Region == countries) %>% 
  ggplot(aes(x = Date, y = Deaths, color = Country_Region)) + 
  geom_line(lwd = .7) +
  ggtitle("COVID-19 Deaths in Top 10 Countries") 
## Warning in Country_Region == countries: longer object length is not a multiple
## of shorter object length

I guess UK and France were reporting numbers once a week?

  1. Using Facet_Wraps make a graph from the times series data with the 10 countries with the hightest death total. Format the table to look nice in the report by changing the number of rows in your graph and/or adjusting the height in the R code chunk header.
time_series_deaths_long %>% 
  filter(Country_Region == countries) %>% 
  ggplot(aes(x = Date,  y = Deaths)) + 
  geom_point() +
  geom_line() +
  ggtitle("COVID-19 Deaths") +
  facet_wrap(~Country_Region, ncol=2, scales="free_y")
## Warning in Country_Region == countries: longer object length is not a multiple
## of shorter object length

  1. Using Facet_Wraps make a graph from the times series data for confirmed cases for all US states and territories. Format the table to look nice in the report by changing the number of rows in your graph and/or adjusting the height in the R code chunk header.

NEED TO FIND THIS DATA

Currently, the only time series data I have downloaded has all of US together (no states). The states data I have is cumulative total cases/deaths, not time series.

#data %>% 
#  filter(Country_Region == "US") %>% 
#  ggplot(aes(x = Date,  y = Confirmed)) + 
#  geom_point() +
#  geom_line() +
#  ggtitle("COVID-19 Cases") +
#  facet_wrap(~Province_State, ncol=2, scales="free_y")
  1. Apply a theme other than the one used in the Data Carpentries tutorial to the graph in Ex8.
time_series_deaths_long %>% 
  filter(Country_Region == countries) %>% 
  ggplot(aes(x = Date,  y = Deaths)) + 
  geom_point() +
  geom_line() +
  ggtitle("COVID-19 Deaths") +
  facet_wrap(~Country_Region, ncol=2) + 
  theme_classic()
## Warning in Country_Region == countries: longer object length is not a multiple
## of shorter object length