library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.3     ✓ dplyr   1.0.2
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
# Read in the data from JHU github
report_03_11_2020 <-   read_csv(url("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/03-13-2020.csv")) %>%
  rename(Country_Region = "Country/Region", Province_State = "Province/State")
## Parsed with column specification:
## cols(
##   `Province/State` = col_character(),
##   `Country/Region` = col_character(),
##   `Last Update` = col_datetime(format = ""),
##   Confirmed = col_double(),
##   Deaths = col_double(),
##   Recovered = col_double(),
##   Latitude = col_double(),
##   Longitude = col_double()
## )
str(report_03_11_2020)
## tibble [230 × 8] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Province_State: chr [1:230] "Hubei" "Guangdong" "Henan" "Zhejiang" ...
##  $ Country_Region: chr [1:230] "China" "China" "China" "China" ...
##  $ Last Update   : POSIXct[1:230], format: "2020-03-13 11:09:03" "2020-03-13 11:09:03" ...
##  $ Confirmed     : num [1:230] 67786 1356 1273 1215 1018 ...
##  $ Deaths        : num [1:230] 3062 8 22 1 4 ...
##  $ Recovered     : num [1:230] 51553 1296 1249 1197 1005 ...
##  $ Latitude      : num [1:230] 31 23.3 33.9 29.2 27.6 ...
##  $ Longitude     : num [1:230] 112 113 114 120 112 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   `Province/State` = col_character(),
##   ..   `Country/Region` = col_character(),
##   ..   `Last Update` = col_datetime(format = ""),
##   ..   Confirmed = col_double(),
##   ..   Deaths = col_double(),
##   ..   Recovered = col_double(),
##   ..   Latitude = col_double(),
##   ..   Longitude = col_double()
##   .. )

Interactive data tables:

library(DT)
datatable(report_03_11_2020)

Exercises Part 1

  1. The format of the daily reports has changed over time. What are the differences between 3/12/2020, the Friday before spring break 2020 and yesterday 6 months later on 9/12/2020? Load the files into R and use str() and View() (or in the top right corner click on Environment and then the data frame.). Note the in the more recent dates they have changed the column headers so that it is no longer necessary to rename Province_State and Country_Region.
report_03_12_2020 <-   read_csv(url("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/03-12-2020.csv")) %>%
  rename(Country_Region = "Country/Region", Province_State = "Province/State")
## Parsed with column specification:
## cols(
##   `Province/State` = col_character(),
##   `Country/Region` = col_character(),
##   `Last Update` = col_datetime(format = ""),
##   Confirmed = col_double(),
##   Deaths = col_double(),
##   Recovered = col_double(),
##   Latitude = col_double(),
##   Longitude = col_double()
## )
report_09_12_2020 <-   read_csv(url("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/09-12-2020.csv"))
## Parsed with column specification:
## cols(
##   FIPS = col_double(),
##   Admin2 = col_character(),
##   Province_State = col_character(),
##   Country_Region = col_character(),
##   Last_Update = col_datetime(format = ""),
##   Lat = col_double(),
##   Long_ = col_double(),
##   Confirmed = col_double(),
##   Deaths = col_double(),
##   Recovered = col_double(),
##   Active = col_double(),
##   Combined_Key = col_character(),
##   Incidence_Rate = col_double(),
##   `Case-Fatality_Ratio` = col_double()
## )
str(report_03_12_2020)
## tibble [218 × 8] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Province_State: chr [1:218] "Hubei" NA NA NA ...
##  $ Country_Region: chr [1:218] "China" "Italy" "Iran" "Korea, South" ...
##  $ Last Update   : POSIXct[1:218], format: "2020-03-12 09:53:06" "2020-03-11 21:33:02" ...
##  $ Confirmed     : num [1:218] 67781 12462 10075 7869 2281 ...
##  $ Deaths        : num [1:218] 3056 827 429 66 48 ...
##  $ Recovered     : num [1:218] 50318 1045 2959 333 12 ...
##  $ Latitude      : num [1:218] 31 43 32 36 46.2 ...
##  $ Longitude     : num [1:218] 112.27 12 53 128 2.21 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   `Province/State` = col_character(),
##   ..   `Country/Region` = col_character(),
##   ..   `Last Update` = col_datetime(format = ""),
##   ..   Confirmed = col_double(),
##   ..   Deaths = col_double(),
##   ..   Recovered = col_double(),
##   ..   Latitude = col_double(),
##   ..   Longitude = col_double()
##   .. )
str(report_09_12_2020)
## tibble [3,954 × 14] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ FIPS               : num [1:3954] NA NA NA NA NA NA NA NA NA NA ...
##  $ Admin2             : chr [1:3954] NA NA NA NA ...
##  $ Province_State     : chr [1:3954] NA NA NA NA ...
##  $ Country_Region     : chr [1:3954] "Afghanistan" "Albania" "Algeria" "Andorra" ...
##  $ Last_Update        : POSIXct[1:3954], format: "2020-09-13 04:30:52" "2020-09-13 04:30:52" ...
##  $ Lat                : num [1:3954] 33.9 41.2 28 42.5 -11.2 ...
##  $ Long_              : num [1:3954] 67.71 20.17 1.66 1.52 17.87 ...
##  $ Confirmed          : num [1:3954] 38641 11185 48007 1344 3335 ...
##  $ Deaths             : num [1:3954] 1420 330 1605 53 132 ...
##  $ Recovered          : num [1:3954] 31234 6494 33875 943 1289 ...
##  $ Active             : num [1:3954] 5987 4361 12527 348 1914 ...
##  $ Combined_Key       : chr [1:3954] "Afghanistan" "Albania" "Algeria" "Andorra" ...
##  $ Incidence_Rate     : num [1:3954] 99.3 388.7 109.5 1739.5 10.1 ...
##  $ Case-Fatality_Ratio: num [1:3954] 3.67 2.95 3.34 3.94 3.96 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   FIPS = col_double(),
##   ..   Admin2 = col_character(),
##   ..   Province_State = col_character(),
##   ..   Country_Region = col_character(),
##   ..   Last_Update = col_datetime(format = ""),
##   ..   Lat = col_double(),
##   ..   Long_ = col_double(),
##   ..   Confirmed = col_double(),
##   ..   Deaths = col_double(),
##   ..   Recovered = col_double(),
##   ..   Active = col_double(),
##   ..   Combined_Key = col_character(),
##   ..   Incidence_Rate = col_double(),
##   ..   `Case-Fatality_Ratio` = col_double()
##   .. )
  1. Many countries have multiple entries representing different regions (e.g. counties and states). To get the counts for a country we need to summarise the data. Use group_by() and summarize() to make a data frame (tibble) with total counts for each country.
country_confirmed_cases <- report_09_12_2020 %>% 
  group_by(Country_Region) %>%
  summarize(total_confirmed = sum(Confirmed))
## `summarise()` ungrouping output (override with `.groups` argument)
head(country_confirmed_cases)
## # A tibble: 6 x 2
##   Country_Region      total_confirmed
##   <chr>                         <dbl>
## 1 Afghanistan                   38641
## 2 Albania                       11185
## 3 Algeria                       48007
## 4 Andorra                        1344
## 5 Angola                         3335
## 6 Antigua and Barbuda              95
  1. To get counts for confirmed cases per U.S state use filter() to select US and group_by and summarize as above.
states_cases <- report_09_12_2020 %>% 
  filter(Country_Region == "US") %>% 
  group_by(Province_State) %>% 
  summarize(total_confirmed = sum(Confirmed))
## `summarise()` ungrouping output (override with `.groups` argument)
head(states_cases)
## # A tibble: 6 x 2
##   Province_State total_confirmed
##   <chr>                    <dbl>
## 1 Alabama                 137646
## 2 Alaska                    6207
## 3 Arizona                 208128
## 4 Arkansas                 69710
## 5 California              758933
## 6 Colorado                 60876
  1. It will be difficult to make simple graphs with the data from all countries in the world or even the US states. Use arrange_by() and slice() to get just the top 10 countries for Deaths.
sept_top10_countries_deaths <- report_09_12_2020 %>% 
  group_by(Country_Region) %>% 
  summarize(total_deaths = sum(Deaths)) %>% 
  arrange(desc(total_deaths)) %>% 
  slice_head(n=10) 
## `summarise()` ungrouping output (override with `.groups` argument)
# could also do slice(n=1:10)

datatable(sept_top10_countries_deaths)

Exercises Part 2

  1. Make plots using geom_point() of the 10 countries with the confirmed cases and deaths (two separate graphs) on 3/12/2020 and 9/12/2020.
# Make cases dataframe for September
sept_top10_countries_cases <- report_09_12_2020 %>% 
  group_by(Country_Region) %>% 
  summarize(total_cases = sum(Confirmed)) %>% 
  arrange(desc(total_cases)) %>% 
  slice_head(n=10)
## `summarise()` ungrouping output (override with `.groups` argument)
# Make the March dataframe for number of cases 
march_top10_countries_cases <- report_03_12_2020 %>% 
  group_by(Country_Region) %>% 
  summarize(total_cases = sum(Confirmed)) %>% 
  arrange(desc(total_cases)) %>% 
  slice_head(n=10)
## `summarise()` ungrouping output (override with `.groups` argument)
# plot for cases
cases<-ggplot(data = sept_top10_countries_cases,
mapping = aes(x = total_cases, y = Country_Region)) + 
  geom_point(color = "red")+
  geom_point(data = march_top10_countries_cases, 
             color = "blue") + 
  ggtitle("Countries with Most COVID-19 Cases in March and Sept 2020") + 
  ylab("Country") +
  xlab("Cumulative Cases as of March 12 (blue) and September 12 (red)")

# not sure how to add a legend when the two dates are not in the same dataframe 

# Repeat for number of deaths 

march_top10_countries_deaths <- report_03_12_2020 %>% 
  group_by(Country_Region) %>% 
  summarize(total_deaths = sum(Deaths)) %>% 
  arrange(desc(total_deaths)) %>% 
  slice_head(n=10)
## `summarise()` ungrouping output (override with `.groups` argument)
# plot for deaths
deaths <- ggplot(data = sept_top10_countries_deaths,
       mapping = aes(x = total_deaths, y = Country_Region)) + 
  geom_point(color = "orange")+
  geom_point(data = march_top10_countries_deaths, 
             color = "forestgreen") + 
  ggtitle("Countries with Most COVID-19-related Deaths in March and Sept 2020") + 
  ylab("Country") +
  xlab("Cumulative Deaths as of March 12 (green) and September 12 (orange)")

library('gridExtra')
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
grid.arrange(cases, deaths, nrow = 2)

  1. Make using geom_bar of the 10 states with the most confirmed cases and deaths (two separate graphs) on 3/12/2020 and 9/12/2020.
# cases 
bar_cases <- ggplot(data = sept_top10_countries_cases,
mapping = aes(x = total_cases,  y = Country_Region)) + 
  geom_bar(stat = 'identity', color = "red") + 
  geom_bar(data = march_top10_countries_cases, stat = 'identity', color = "blue") + 
  ggtitle("Countries with Most COVID-19 Cases in March and Sept 2020") + 
  ylab("Country") +
  xlab("Cumulative Cases as of March 12 (blue) and September 12 (red)")

# deaths 
bar_deaths <- ggplot(data = sept_top10_countries_deaths,
       mapping = aes(x = total_deaths,  y = Country_Region)) + 
  geom_bar(stat = 'identity', color = "orange") + 
  geom_bar(data = march_top10_countries_deaths, stat = 'identity', color = "forestgreen") + 
  ggtitle("Countries with Most COVID-19-related Deaths in March and Sept 2020") + 
  ylab("Country") +
  xlab("Cumulative Deaths as of March 12 (green) and September 12 (orange)")

grid.arrange(bar_cases, bar_deaths, nrow = 2 )