library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
## ✓ tibble 3.0.3 ✓ dplyr 1.0.2
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Read in the data from JHU github
report_03_11_2020 <- read_csv(url("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/03-13-2020.csv")) %>%
rename(Country_Region = "Country/Region", Province_State = "Province/State")
## Parsed with column specification:
## cols(
## `Province/State` = col_character(),
## `Country/Region` = col_character(),
## `Last Update` = col_datetime(format = ""),
## Confirmed = col_double(),
## Deaths = col_double(),
## Recovered = col_double(),
## Latitude = col_double(),
## Longitude = col_double()
## )
str(report_03_11_2020)
## tibble [230 × 8] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Province_State: chr [1:230] "Hubei" "Guangdong" "Henan" "Zhejiang" ...
## $ Country_Region: chr [1:230] "China" "China" "China" "China" ...
## $ Last Update : POSIXct[1:230], format: "2020-03-13 11:09:03" "2020-03-13 11:09:03" ...
## $ Confirmed : num [1:230] 67786 1356 1273 1215 1018 ...
## $ Deaths : num [1:230] 3062 8 22 1 4 ...
## $ Recovered : num [1:230] 51553 1296 1249 1197 1005 ...
## $ Latitude : num [1:230] 31 23.3 33.9 29.2 27.6 ...
## $ Longitude : num [1:230] 112 113 114 120 112 ...
## - attr(*, "spec")=
## .. cols(
## .. `Province/State` = col_character(),
## .. `Country/Region` = col_character(),
## .. `Last Update` = col_datetime(format = ""),
## .. Confirmed = col_double(),
## .. Deaths = col_double(),
## .. Recovered = col_double(),
## .. Latitude = col_double(),
## .. Longitude = col_double()
## .. )
Interactive data tables:
library(DT)
datatable(report_03_11_2020)
report_03_12_2020 <- read_csv(url("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/03-12-2020.csv")) %>%
rename(Country_Region = "Country/Region", Province_State = "Province/State")
## Parsed with column specification:
## cols(
## `Province/State` = col_character(),
## `Country/Region` = col_character(),
## `Last Update` = col_datetime(format = ""),
## Confirmed = col_double(),
## Deaths = col_double(),
## Recovered = col_double(),
## Latitude = col_double(),
## Longitude = col_double()
## )
report_09_12_2020 <- read_csv(url("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/09-12-2020.csv"))
## Parsed with column specification:
## cols(
## FIPS = col_double(),
## Admin2 = col_character(),
## Province_State = col_character(),
## Country_Region = col_character(),
## Last_Update = col_datetime(format = ""),
## Lat = col_double(),
## Long_ = col_double(),
## Confirmed = col_double(),
## Deaths = col_double(),
## Recovered = col_double(),
## Active = col_double(),
## Combined_Key = col_character(),
## Incidence_Rate = col_double(),
## `Case-Fatality_Ratio` = col_double()
## )
str(report_03_12_2020)
## tibble [218 × 8] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Province_State: chr [1:218] "Hubei" NA NA NA ...
## $ Country_Region: chr [1:218] "China" "Italy" "Iran" "Korea, South" ...
## $ Last Update : POSIXct[1:218], format: "2020-03-12 09:53:06" "2020-03-11 21:33:02" ...
## $ Confirmed : num [1:218] 67781 12462 10075 7869 2281 ...
## $ Deaths : num [1:218] 3056 827 429 66 48 ...
## $ Recovered : num [1:218] 50318 1045 2959 333 12 ...
## $ Latitude : num [1:218] 31 43 32 36 46.2 ...
## $ Longitude : num [1:218] 112.27 12 53 128 2.21 ...
## - attr(*, "spec")=
## .. cols(
## .. `Province/State` = col_character(),
## .. `Country/Region` = col_character(),
## .. `Last Update` = col_datetime(format = ""),
## .. Confirmed = col_double(),
## .. Deaths = col_double(),
## .. Recovered = col_double(),
## .. Latitude = col_double(),
## .. Longitude = col_double()
## .. )
str(report_09_12_2020)
## tibble [3,954 × 14] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ FIPS : num [1:3954] NA NA NA NA NA NA NA NA NA NA ...
## $ Admin2 : chr [1:3954] NA NA NA NA ...
## $ Province_State : chr [1:3954] NA NA NA NA ...
## $ Country_Region : chr [1:3954] "Afghanistan" "Albania" "Algeria" "Andorra" ...
## $ Last_Update : POSIXct[1:3954], format: "2020-09-13 04:30:52" "2020-09-13 04:30:52" ...
## $ Lat : num [1:3954] 33.9 41.2 28 42.5 -11.2 ...
## $ Long_ : num [1:3954] 67.71 20.17 1.66 1.52 17.87 ...
## $ Confirmed : num [1:3954] 38641 11185 48007 1344 3335 ...
## $ Deaths : num [1:3954] 1420 330 1605 53 132 ...
## $ Recovered : num [1:3954] 31234 6494 33875 943 1289 ...
## $ Active : num [1:3954] 5987 4361 12527 348 1914 ...
## $ Combined_Key : chr [1:3954] "Afghanistan" "Albania" "Algeria" "Andorra" ...
## $ Incidence_Rate : num [1:3954] 99.3 388.7 109.5 1739.5 10.1 ...
## $ Case-Fatality_Ratio: num [1:3954] 3.67 2.95 3.34 3.94 3.96 ...
## - attr(*, "spec")=
## .. cols(
## .. FIPS = col_double(),
## .. Admin2 = col_character(),
## .. Province_State = col_character(),
## .. Country_Region = col_character(),
## .. Last_Update = col_datetime(format = ""),
## .. Lat = col_double(),
## .. Long_ = col_double(),
## .. Confirmed = col_double(),
## .. Deaths = col_double(),
## .. Recovered = col_double(),
## .. Active = col_double(),
## .. Combined_Key = col_character(),
## .. Incidence_Rate = col_double(),
## .. `Case-Fatality_Ratio` = col_double()
## .. )
country_confirmed_cases <- report_09_12_2020 %>%
group_by(Country_Region) %>%
summarize(total_confirmed = sum(Confirmed))
## `summarise()` ungrouping output (override with `.groups` argument)
head(country_confirmed_cases)
## # A tibble: 6 x 2
## Country_Region total_confirmed
## <chr> <dbl>
## 1 Afghanistan 38641
## 2 Albania 11185
## 3 Algeria 48007
## 4 Andorra 1344
## 5 Angola 3335
## 6 Antigua and Barbuda 95
states_cases <- report_09_12_2020 %>%
filter(Country_Region == "US") %>%
group_by(Province_State) %>%
summarize(total_confirmed = sum(Confirmed))
## `summarise()` ungrouping output (override with `.groups` argument)
head(states_cases)
## # A tibble: 6 x 2
## Province_State total_confirmed
## <chr> <dbl>
## 1 Alabama 137646
## 2 Alaska 6207
## 3 Arizona 208128
## 4 Arkansas 69710
## 5 California 758933
## 6 Colorado 60876
sept_top10_countries_deaths <- report_09_12_2020 %>%
group_by(Country_Region) %>%
summarize(total_deaths = sum(Deaths)) %>%
arrange(desc(total_deaths)) %>%
slice_head(n=10)
## `summarise()` ungrouping output (override with `.groups` argument)
# could also do slice(n=1:10)
datatable(sept_top10_countries_deaths)
# Make cases dataframe for September
sept_top10_countries_cases <- report_09_12_2020 %>%
group_by(Country_Region) %>%
summarize(total_cases = sum(Confirmed)) %>%
arrange(desc(total_cases)) %>%
slice_head(n=10)
## `summarise()` ungrouping output (override with `.groups` argument)
# Make the March dataframe for number of cases
march_top10_countries_cases <- report_03_12_2020 %>%
group_by(Country_Region) %>%
summarize(total_cases = sum(Confirmed)) %>%
arrange(desc(total_cases)) %>%
slice_head(n=10)
## `summarise()` ungrouping output (override with `.groups` argument)
# plot for cases
cases<-ggplot(data = sept_top10_countries_cases,
mapping = aes(x = total_cases, y = Country_Region)) +
geom_point(color = "red")+
geom_point(data = march_top10_countries_cases,
color = "blue") +
ggtitle("Countries with Most COVID-19 Cases in March and Sept 2020") +
ylab("Country") +
xlab("Cumulative Cases as of March 12 (blue) and September 12 (red)")
# not sure how to add a legend when the two dates are not in the same dataframe
# Repeat for number of deaths
march_top10_countries_deaths <- report_03_12_2020 %>%
group_by(Country_Region) %>%
summarize(total_deaths = sum(Deaths)) %>%
arrange(desc(total_deaths)) %>%
slice_head(n=10)
## `summarise()` ungrouping output (override with `.groups` argument)
# plot for deaths
deaths <- ggplot(data = sept_top10_countries_deaths,
mapping = aes(x = total_deaths, y = Country_Region)) +
geom_point(color = "orange")+
geom_point(data = march_top10_countries_deaths,
color = "forestgreen") +
ggtitle("Countries with Most COVID-19-related Deaths in March and Sept 2020") +
ylab("Country") +
xlab("Cumulative Deaths as of March 12 (green) and September 12 (orange)")
library('gridExtra')
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
grid.arrange(cases, deaths, nrow = 2)
# cases
bar_cases <- ggplot(data = sept_top10_countries_cases,
mapping = aes(x = total_cases, y = Country_Region)) +
geom_bar(stat = 'identity', color = "red") +
geom_bar(data = march_top10_countries_cases, stat = 'identity', color = "blue") +
ggtitle("Countries with Most COVID-19 Cases in March and Sept 2020") +
ylab("Country") +
xlab("Cumulative Cases as of March 12 (blue) and September 12 (red)")
# deaths
bar_deaths <- ggplot(data = sept_top10_countries_deaths,
mapping = aes(x = total_deaths, y = Country_Region)) +
geom_bar(stat = 'identity', color = "orange") +
geom_bar(data = march_top10_countries_deaths, stat = 'identity', color = "forestgreen") +
ggtitle("Countries with Most COVID-19-related Deaths in March and Sept 2020") +
ylab("Country") +
xlab("Cumulative Deaths as of March 12 (green) and September 12 (orange)")
grid.arrange(bar_cases, bar_deaths, nrow = 2 )