library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
## ✓ tibble 3.0.3 ✓ dplyr 1.0.2
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ─────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Read in data
# March 13
Confirmed_State_3_13 <- read_csv(url("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/03-13-2020.csv")) %>%
rename(Country_Region = "Country/Region", Province_State = "Province/State") %>%
filter (Country_Region == "US") %>%
group_by(Province_State, Country_Region) %>%
summarise(Confirmed = sum(Confirmed))
## Parsed with column specification:
## cols(
## `Province/State` = col_character(),
## `Country/Region` = col_character(),
## `Last Update` = col_datetime(format = ""),
## Confirmed = col_double(),
## Deaths = col_double(),
## Recovered = col_double(),
## Latitude = col_double(),
## Longitude = col_double()
## )
## `summarise()` regrouping output by 'Province_State' (override with `.groups` argument)
str(Confirmed_State_3_13)
## tibble [53 × 3] (S3: grouped_df/tbl_df/tbl/data.frame)
## $ Province_State: chr [1:53] "Alabama" "Alaska" "Arizona" "Arkansas" ...
## $ Country_Region: chr [1:53] "US" "US" "US" "US" ...
## $ Confirmed : num [1:53] 5 1 9 6 282 49 11 4 46 10 ...
## - attr(*, "groups")= tibble [53 × 2] (S3: tbl_df/tbl/data.frame)
## ..$ Province_State: chr [1:53] "Alabama" "Alaska" "Arizona" "Arkansas" ...
## ..$ .rows : list<int> [1:53]
## .. ..$ : int 1
## .. ..$ : int 2
## .. ..$ : int 3
## .. ..$ : int 4
## .. ..$ : int 5
## .. ..$ : int 6
## .. ..$ : int 7
## .. ..$ : int 8
## .. ..$ : int 9
## .. ..$ : int 10
## .. ..$ : int 11
## .. ..$ : int 12
## .. ..$ : int 13
## .. ..$ : int 14
## .. ..$ : int 15
## .. ..$ : int 16
## .. ..$ : int 17
## .. ..$ : int 18
## .. ..$ : int 19
## .. ..$ : int 20
## .. ..$ : int 21
## .. ..$ : int 22
## .. ..$ : int 23
## .. ..$ : int 24
## .. ..$ : int 25
## .. ..$ : int 26
## .. ..$ : int 27
## .. ..$ : int 28
## .. ..$ : int 29
## .. ..$ : int 30
## .. ..$ : int 31
## .. ..$ : int 32
## .. ..$ : int 33
## .. ..$ : int 34
## .. ..$ : int 35
## .. ..$ : int 36
## .. ..$ : int 37
## .. ..$ : int 38
## .. ..$ : int 39
## .. ..$ : int 40
## .. ..$ : int 41
## .. ..$ : int 42
## .. ..$ : int 43
## .. ..$ : int 44
## .. ..$ : int 45
## .. ..$ : int 46
## .. ..$ : int 47
## .. ..$ : int 48
## .. ..$ : int 49
## .. ..$ : int 50
## .. ..$ : int 51
## .. ..$ : int 52
## .. ..$ : int 53
## .. ..@ ptype: int(0)
## ..- attr(*, ".drop")= logi TRUE
# Sept 13
Confirmed_State_9_13 <- read_csv(url("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/09-13-2020.csv")) %>%
filter (Country_Region == "US") %>%
group_by(Province_State, Country_Region) %>%
summarise(Confirmed = sum(Confirmed))
## Parsed with column specification:
## cols(
## FIPS = col_double(),
## Admin2 = col_character(),
## Province_State = col_character(),
## Country_Region = col_character(),
## Last_Update = col_datetime(format = ""),
## Lat = col_double(),
## Long_ = col_double(),
## Confirmed = col_double(),
## Deaths = col_double(),
## Recovered = col_double(),
## Active = col_double(),
## Combined_Key = col_character(),
## Incidence_Rate = col_double(),
## `Case-Fatality_Ratio` = col_double()
## )
## `summarise()` regrouping output by 'Province_State' (override with `.groups` argument)
str(Confirmed_State_9_13)
## tibble [58 × 3] (S3: grouped_df/tbl_df/tbl/data.frame)
## $ Province_State: chr [1:58] "Alabama" "Alaska" "Arizona" "Arkansas" ...
## $ Country_Region: chr [1:58] "US" "US" "US" "US" ...
## $ Confirmed : num [1:58] 138755 6268 208512 70219 761728 ...
## - attr(*, "groups")= tibble [58 × 2] (S3: tbl_df/tbl/data.frame)
## ..$ Province_State: chr [1:58] "Alabama" "Alaska" "Arizona" "Arkansas" ...
## ..$ .rows : list<int> [1:58]
## .. ..$ : int 1
## .. ..$ : int 2
## .. ..$ : int 3
## .. ..$ : int 4
## .. ..$ : int 5
## .. ..$ : int 6
## .. ..$ : int 7
## .. ..$ : int 8
## .. ..$ : int 9
## .. ..$ : int 10
## .. ..$ : int 11
## .. ..$ : int 12
## .. ..$ : int 13
## .. ..$ : int 14
## .. ..$ : int 15
## .. ..$ : int 16
## .. ..$ : int 17
## .. ..$ : int 18
## .. ..$ : int 19
## .. ..$ : int 20
## .. ..$ : int 21
## .. ..$ : int 22
## .. ..$ : int 23
## .. ..$ : int 24
## .. ..$ : int 25
## .. ..$ : int 26
## .. ..$ : int 27
## .. ..$ : int 28
## .. ..$ : int 29
## .. ..$ : int 30
## .. ..$ : int 31
## .. ..$ : int 32
## .. ..$ : int 33
## .. ..$ : int 34
## .. ..$ : int 35
## .. ..$ : int 36
## .. ..$ : int 37
## .. ..$ : int 38
## .. ..$ : int 39
## .. ..$ : int 40
## .. ..$ : int 41
## .. ..$ : int 42
## .. ..$ : int 43
## .. ..$ : int 44
## .. ..$ : int 45
## .. ..$ : int 46
## .. ..$ : int 47
## .. ..$ : int 48
## .. ..$ : int 49
## .. ..$ : int 50
## .. ..$ : int 51
## .. ..$ : int 52
## .. ..$ : int 53
## .. ..$ : int 54
## .. ..$ : int 55
## .. ..$ : int 56
## .. ..$ : int 57
## .. ..$ : int 58
## .. ..@ ptype: int(0)
## ..- attr(*, ".drop")= logi TRUE
# Notice from the above `str’ calls that there are 53 states in the table on 3/13/2020 and
# 58 states in the table on 9/13/2020... Let's see what the differences are
setdiff(Confirmed_State_9_13$Province_State, Confirmed_State_3_13$Province_State)
## [1] "Guam" "Northern Mariana Islands"
## [3] "Puerto Rico" "Recovered"
## [5] "Virgin Islands"
# There are 4 US territories and a field you may have notice last week: “Recovered”.
# We don’t need the “Recovered” row so let’s filter it from the data set
Confirmed_State_9_13 <- Confirmed_State_9_13 %>%
filter(Province_State != "Recovered")
# Since all of the states and territories in 9/13/2020 are found in 3/13/2020 a left_join or full_join can
# be used to merge the tables
Confirmed_State_3_13_9_13_joined <- full_join(Confirmed_State_3_13,
Confirmed_State_9_13, by = c("Province_State"))
# look at the new joined dataframe
head(Confirmed_State_3_13_9_13_joined)
## # A tibble: 6 x 5
## # Groups: Province_State [6]
## Province_State Country_Region.x Confirmed.x Country_Region.y Confirmed.y
## <chr> <chr> <dbl> <chr> <dbl>
## 1 Alabama US 5 US 138755
## 2 Alaska US 1 US 6268
## 3 Arizona US 9 US 208512
## 4 Arkansas US 6 US 70219
## 5 California US 282 US 761728
## 6 Colorado US 49 US 61293
tail(Confirmed_State_3_13_9_13_joined, 5)
## # A tibble: 5 x 5
## # Groups: Province_State [5]
## Province_State Country_Region.x Confirmed.x Country_Region.y Confirmed.y
## <chr> <chr> <dbl> <chr> <dbl>
## 1 Wyoming US 1 US 4346
## 2 Guam <NA> NA US 1863
## 3 Northern Mariana Is… <NA> NA US 60
## 4 Puerto Rico <NA> NA US 37380
## 5 Virgin Islands <NA> NA US 1220
# look to see what cells are NA
which(is.na(Confirmed_State_3_13_9_13_joined))
## [1] 111 112 113 114 168 169 170 171
# let's turn the NAs to 0 and rename the columns
Confirmed_State_3_13_9_13_joined <- full_join(Confirmed_State_3_13,
Confirmed_State_9_13, by = c("Province_State")) %>%
rename(Confirmed_3_13_2020 = "Confirmed.x", Confirmed_9_13_2020 = "Confirmed.y") %>%
select(-Country_Region.x, -Country_Region.y) %>%
replace_na(list(Confirmed_3_13_2020 = 0))
head(Confirmed_State_3_13_9_13_joined)
## # A tibble: 6 x 3
## # Groups: Province_State [6]
## Province_State Confirmed_3_13_2020 Confirmed_9_13_2020
## <chr> <dbl> <dbl>
## 1 Alabama 5 138755
## 2 Alaska 1 6268
## 3 Arizona 9 208512
## 4 Arkansas 6 70219
## 5 California 282 761728
## 6 Colorado 49 61293
# Switching between wide and long table formats
Confirmed_State_3_13_9_13_joined_long <- Confirmed_State_3_13_9_13_joined %>%
pivot_longer(-c(Province_State),
names_to = "Date", values_to = "Confirmed")
# to adjust figure size {r, fig.width = 5, fig.height = 10}
ggplot(Confirmed_State_3_13_9_13_joined_long, aes(x = Confirmed, y = Province_State)) +
geom_point(aes(color = Date))
# Working with the time series data
# load the most recent time series data for confirmed cases
download.file(url="https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv",
destfile = "time_series_covid19_confirmed_global.csv")
time_series_confirmed <- read_csv("time_series_covid19_confirmed_global.csv")%>%
rename(Province_State = "Province/State", Country_Region = "Country/Region")
## Parsed with column specification:
## cols(
## .default = col_double(),
## `Province/State` = col_character(),
## `Country/Region` = col_character()
## )
## See spec(...) for full column specifications.
head(time_series_confirmed)
## # A tibble: 6 x 258
## Province_State Country_Region Lat Long `1/22/20` `1/23/20` `1/24/20`
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 <NA> Afghanistan 33.9 67.7 0 0 0
## 2 <NA> Albania 41.2 20.2 0 0 0
## 3 <NA> Algeria 28.0 1.66 0 0 0
## 4 <NA> Andorra 42.5 1.52 0 0 0
## 5 <NA> Angola -11.2 17.9 0 0 0
## 6 <NA> Antigua and B… 17.1 -61.8 0 0 0
## # … with 251 more variables: `1/25/20` <dbl>, `1/26/20` <dbl>, `1/27/20` <dbl>,
## # `1/28/20` <dbl>, `1/29/20` <dbl>, `1/30/20` <dbl>, `1/31/20` <dbl>,
## # `2/1/20` <dbl>, `2/2/20` <dbl>, `2/3/20` <dbl>, `2/4/20` <dbl>,
## # `2/5/20` <dbl>, `2/6/20` <dbl>, `2/7/20` <dbl>, `2/8/20` <dbl>,
## # `2/9/20` <dbl>, `2/10/20` <dbl>, `2/11/20` <dbl>, `2/12/20` <dbl>,
## # `2/13/20` <dbl>, `2/14/20` <dbl>, `2/15/20` <dbl>, `2/16/20` <dbl>,
## # `2/17/20` <dbl>, `2/18/20` <dbl>, `2/19/20` <dbl>, `2/20/20` <dbl>,
## # `2/21/20` <dbl>, `2/22/20` <dbl>, `2/23/20` <dbl>, `2/24/20` <dbl>,
## # `2/25/20` <dbl>, `2/26/20` <dbl>, `2/27/20` <dbl>, `2/28/20` <dbl>,
## # `2/29/20` <dbl>, `3/1/20` <dbl>, `3/2/20` <dbl>, `3/3/20` <dbl>,
## # `3/4/20` <dbl>, `3/5/20` <dbl>, `3/6/20` <dbl>, `3/7/20` <dbl>,
## # `3/8/20` <dbl>, `3/9/20` <dbl>, `3/10/20` <dbl>, `3/11/20` <dbl>,
## # `3/12/20` <dbl>, `3/13/20` <dbl>, `3/14/20` <dbl>, `3/15/20` <dbl>,
## # `3/16/20` <dbl>, `3/17/20` <dbl>, `3/18/20` <dbl>, `3/19/20` <dbl>,
## # `3/20/20` <dbl>, `3/21/20` <dbl>, `3/22/20` <dbl>, `3/23/20` <dbl>,
## # `3/24/20` <dbl>, `3/25/20` <dbl>, `3/26/20` <dbl>, `3/27/20` <dbl>,
## # `3/28/20` <dbl>, `3/29/20` <dbl>, `3/30/20` <dbl>, `3/31/20` <dbl>,
## # `4/1/20` <dbl>, `4/2/20` <dbl>, `4/3/20` <dbl>, `4/4/20` <dbl>,
## # `4/5/20` <dbl>, `4/6/20` <dbl>, `4/7/20` <dbl>, `4/8/20` <dbl>,
## # `4/9/20` <dbl>, `4/10/20` <dbl>, `4/11/20` <dbl>, `4/12/20` <dbl>,
## # `4/13/20` <dbl>, `4/14/20` <dbl>, `4/15/20` <dbl>, `4/16/20` <dbl>,
## # `4/17/20` <dbl>, `4/18/20` <dbl>, `4/19/20` <dbl>, `4/20/20` <dbl>,
## # `4/21/20` <dbl>, `4/22/20` <dbl>, `4/23/20` <dbl>, `4/24/20` <dbl>,
## # `4/25/20` <dbl>, `4/26/20` <dbl>, `4/27/20` <dbl>, `4/28/20` <dbl>,
## # `4/29/20` <dbl>, `4/30/20` <dbl>, `5/1/20` <dbl>, `5/2/20` <dbl>,
## # `5/3/20` <dbl>, …
time_series_confirmed_long <- time_series_confirmed %>%
pivot_longer(-c(Province_State, Country_Region, Lat, Long),
names_to = "Date", values_to = "Confirmed")
head(time_series_confirmed_long)
## # A tibble: 6 x 6
## Province_State Country_Region Lat Long Date Confirmed
## <chr> <chr> <dbl> <dbl> <chr> <dbl>
## 1 <NA> Afghanistan 33.9 67.7 1/22/20 0
## 2 <NA> Afghanistan 33.9 67.7 1/23/20 0
## 3 <NA> Afghanistan 33.9 67.7 1/24/20 0
## 4 <NA> Afghanistan 33.9 67.7 1/25/20 0
## 5 <NA> Afghanistan 33.9 67.7 1/26/20 0
## 6 <NA> Afghanistan 33.9 67.7 1/27/20 0
download.file(url="https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv",
destfile = "time_series_covid19_deaths_global.csv")
time_series_deaths <- read_csv("time_series_covid19_deaths_global.csv")%>%
rename(Province_State = "Province/State", Country_Region = "Country/Region")
## Parsed with column specification:
## cols(
## .default = col_double(),
## `Province/State` = col_character(),
## `Country/Region` = col_character()
## )
## See spec(...) for full column specifications.
time_series_deaths_long <- time_series_deaths %>%
pivot_longer(-c(Province_State, Country_Region, Lat, Long),
names_to = "Date", values_to = "Deaths")
head(time_series_deaths_long)
## # A tibble: 6 x 6
## Province_State Country_Region Lat Long Date Deaths
## <chr> <chr> <dbl> <dbl> <chr> <dbl>
## 1 <NA> Afghanistan 33.9 67.7 1/22/20 0
## 2 <NA> Afghanistan 33.9 67.7 1/23/20 0
## 3 <NA> Afghanistan 33.9 67.7 1/24/20 0
## 4 <NA> Afghanistan 33.9 67.7 1/25/20 0
## 5 <NA> Afghanistan 33.9 67.7 1/26/20 0
## 6 <NA> Afghanistan 33.9 67.7 1/27/20 0
# Joining the time series tables
# first need to create the common column (key), which will be a column where we combine province, country, and date
time_series_confirmed_long <- time_series_confirmed_long %>%
unite(Key, Province_State, Country_Region, Date, sep = ".", remove = FALSE)
head(time_series_confirmed_long)
## # A tibble: 6 x 7
## Key Province_State Country_Region Lat Long Date Confirmed
## <chr> <chr> <chr> <dbl> <dbl> <chr> <dbl>
## 1 NA.Afghanistan.1/2… <NA> Afghanistan 33.9 67.7 1/22/… 0
## 2 NA.Afghanistan.1/2… <NA> Afghanistan 33.9 67.7 1/23/… 0
## 3 NA.Afghanistan.1/2… <NA> Afghanistan 33.9 67.7 1/24/… 0
## 4 NA.Afghanistan.1/2… <NA> Afghanistan 33.9 67.7 1/25/… 0
## 5 NA.Afghanistan.1/2… <NA> Afghanistan 33.9 67.7 1/26/… 0
## 6 NA.Afghanistan.1/2… <NA> Afghanistan 33.9 67.7 1/27/… 0
# do the same thing for deaths
time_series_deaths_long <- time_series_deaths_long %>%
unite(Key, Province_State, Country_Region, Date, sep = ".") %>%
select(Key, Deaths)
head(time_series_deaths_long)
## # A tibble: 6 x 2
## Key Deaths
## <chr> <dbl>
## 1 NA.Afghanistan.1/22/20 0
## 2 NA.Afghanistan.1/23/20 0
## 3 NA.Afghanistan.1/24/20 0
## 4 NA.Afghanistan.1/25/20 0
## 5 NA.Afghanistan.1/26/20 0
## 6 NA.Afghanistan.1/27/20 0
# now join the tables
time_series_long_joined <- full_join(time_series_confirmed_long,
time_series_deaths_long, by = c("Key")) %>%
select(-Key)
head(time_series_long_joined)
## # A tibble: 6 x 7
## Province_State Country_Region Lat Long Date Confirmed Deaths
## <chr> <chr> <dbl> <dbl> <chr> <dbl> <dbl>
## 1 <NA> Afghanistan 33.9 67.7 1/22/20 0 0
## 2 <NA> Afghanistan 33.9 67.7 1/23/20 0 0
## 3 <NA> Afghanistan 33.9 67.7 1/24/20 0 0
## 4 <NA> Afghanistan 33.9 67.7 1/25/20 0 0
## 5 <NA> Afghanistan 33.9 67.7 1/26/20 0 0
## 6 <NA> Afghanistan 33.9 67.7 1/27/20 0 0
which(is.na(time_series_long_joined$Confirmed))
## integer(0)
which(is.na(time_series_long_joined$Deaths))
## integer(0)
no NAs - great
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
time_series_long_joined$Date <- mdy(time_series_long_joined$Date)
# create a column using pivot_long
time_series_long_joined_counts <- time_series_long_joined %>%
pivot_longer(-c(Province_State, Country_Region, Lat, Long, Date),
names_to = "Report_Type", values_to = "Counts")
time_series_long_joined_counts <- time_series_long_joined %>%
pivot_longer(-c(Province_State, Country_Region, Lat, Long, Date),
names_to = "Report_Type", values_to = "Counts")
head(time_series_long_joined_counts)
## # A tibble: 6 x 7
## Province_State Country_Region Lat Long Date Report_Type Counts
## <chr> <chr> <dbl> <dbl> <date> <chr> <dbl>
## 1 <NA> Afghanistan 33.9 67.7 2020-01-22 Confirmed 0
## 2 <NA> Afghanistan 33.9 67.7 2020-01-22 Deaths 0
## 3 <NA> Afghanistan 33.9 67.7 2020-01-23 Confirmed 0
## 4 <NA> Afghanistan 33.9 67.7 2020-01-23 Deaths 0
## 5 <NA> Afghanistan 33.9 67.7 2020-01-24 Confirmed 0
## 6 <NA> Afghanistan 33.9 67.7 2020-01-24 Deaths 0
# Making graphs from the time series data
time_series_long_joined %>%
group_by(Country_Region,Date) %>%
summarise_at(c("Confirmed", "Deaths"), sum) %>%
filter (Country_Region == "US") %>%
ggplot(aes(x = Date, y = Deaths)) +
geom_point() +
geom_line() +
ggtitle("US COVID-19 Deaths")
# some other countries
time_series_long_joined %>%
group_by(Country_Region,Date) %>%
summarise_at(c("Confirmed", "Deaths"), sum) %>%
filter (Country_Region %in% c("China","Japan", "Korea, South",
"Italy","Spain", "US")) %>%
ggplot(aes(x = Date, y = Deaths)) +
geom_point() +
geom_line() +
ggtitle("COVID-19 Deaths") +
facet_wrap(~Country_Region, ncol=2, scales="free_y")
# several countries on the same graph
time_series_long_joined %>%
group_by(Country_Region,Date) %>%
summarise_at(c("Confirmed", "Deaths"), sum) %>%
filter (Country_Region %in% c("China","France","Italy",
"Korea, South", "US")) %>%
ggplot(aes(x = Date, y = Deaths, color = Country_Region)) +
geom_point() +
geom_line() +
ggtitle("COVID-19 Deaths")
# cases and deaths on the same graph
time_series_long_joined_counts %>%
group_by(Country_Region, Report_Type, Date) %>%
summarise(Counts = sum(Counts)) %>%
filter (Country_Region == "US") %>%
ggplot(aes(x = Date, y = log2(Counts), fill = Report_Type, color = Report_Type)) +
geom_point() +
geom_line() +
ggtitle("US COVID-19 Cases")
## `summarise()` regrouping output by 'Country_Region', 'Report_Type' (override with `.groups` argument)
Finish the visualization tutorial: Lab 5A
Revise the above example for joining tables using 6/13/2020 and 9/13/2020. Plot the data as a bar plot. Make sure your graph looks nice in the report by adjusting the height in the R code chunk header.
# Read in the June data
june <- read_csv(url("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/06-13-2020.csv")) %>%
filter (Country_Region == "US") %>%
group_by(Province_State, Country_Region) %>%
summarise(Confirmed = sum(Confirmed))
## Parsed with column specification:
## cols(
## FIPS = col_double(),
## Admin2 = col_character(),
## Province_State = col_character(),
## Country_Region = col_character(),
## Last_Update = col_datetime(format = ""),
## Lat = col_double(),
## Long_ = col_double(),
## Confirmed = col_double(),
## Deaths = col_double(),
## Recovered = col_double(),
## Active = col_double(),
## Combined_Key = col_character(),
## Incidence_Rate = col_double(),
## `Case-Fatality_Ratio` = col_double()
## )
## `summarise()` regrouping output by 'Province_State' (override with `.groups` argument)
str(june)
## tibble [58 × 3] (S3: grouped_df/tbl_df/tbl/data.frame)
## $ Province_State: chr [1:58] "Alabama" "Alaska" "Arizona" "Arkansas" ...
## $ Country_Region: chr [1:58] "US" "US" "US" "US" ...
## $ Confirmed : num [1:58] 24601 653 34660 12095 150018 ...
## - attr(*, "groups")= tibble [58 × 2] (S3: tbl_df/tbl/data.frame)
## ..$ Province_State: chr [1:58] "Alabama" "Alaska" "Arizona" "Arkansas" ...
## ..$ .rows : list<int> [1:58]
## .. ..$ : int 1
## .. ..$ : int 2
## .. ..$ : int 3
## .. ..$ : int 4
## .. ..$ : int 5
## .. ..$ : int 6
## .. ..$ : int 7
## .. ..$ : int 8
## .. ..$ : int 9
## .. ..$ : int 10
## .. ..$ : int 11
## .. ..$ : int 12
## .. ..$ : int 13
## .. ..$ : int 14
## .. ..$ : int 15
## .. ..$ : int 16
## .. ..$ : int 17
## .. ..$ : int 18
## .. ..$ : int 19
## .. ..$ : int 20
## .. ..$ : int 21
## .. ..$ : int 22
## .. ..$ : int 23
## .. ..$ : int 24
## .. ..$ : int 25
## .. ..$ : int 26
## .. ..$ : int 27
## .. ..$ : int 28
## .. ..$ : int 29
## .. ..$ : int 30
## .. ..$ : int 31
## .. ..$ : int 32
## .. ..$ : int 33
## .. ..$ : int 34
## .. ..$ : int 35
## .. ..$ : int 36
## .. ..$ : int 37
## .. ..$ : int 38
## .. ..$ : int 39
## .. ..$ : int 40
## .. ..$ : int 41
## .. ..$ : int 42
## .. ..$ : int 43
## .. ..$ : int 44
## .. ..$ : int 45
## .. ..$ : int 46
## .. ..$ : int 47
## .. ..$ : int 48
## .. ..$ : int 49
## .. ..$ : int 50
## .. ..$ : int 51
## .. ..$ : int 52
## .. ..$ : int 53
## .. ..$ : int 54
## .. ..$ : int 55
## .. ..$ : int 56
## .. ..$ : int 57
## .. ..$ : int 58
## .. ..@ ptype: int(0)
## ..- attr(*, ".drop")= logi TRUE
sept <- read_csv(url("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/09-13-2020.csv")) %>%
filter (Country_Region == "US") %>%
group_by(Province_State, Country_Region) %>%
summarise(Confirmed = sum(Confirmed))
## Parsed with column specification:
## cols(
## FIPS = col_double(),
## Admin2 = col_character(),
## Province_State = col_character(),
## Country_Region = col_character(),
## Last_Update = col_datetime(format = ""),
## Lat = col_double(),
## Long_ = col_double(),
## Confirmed = col_double(),
## Deaths = col_double(),
## Recovered = col_double(),
## Active = col_double(),
## Combined_Key = col_character(),
## Incidence_Rate = col_double(),
## `Case-Fatality_Ratio` = col_double()
## )
## `summarise()` regrouping output by 'Province_State' (override with `.groups` argument)
str(sept)
## tibble [58 × 3] (S3: grouped_df/tbl_df/tbl/data.frame)
## $ Province_State: chr [1:58] "Alabama" "Alaska" "Arizona" "Arkansas" ...
## $ Country_Region: chr [1:58] "US" "US" "US" "US" ...
## $ Confirmed : num [1:58] 138755 6268 208512 70219 761728 ...
## - attr(*, "groups")= tibble [58 × 2] (S3: tbl_df/tbl/data.frame)
## ..$ Province_State: chr [1:58] "Alabama" "Alaska" "Arizona" "Arkansas" ...
## ..$ .rows : list<int> [1:58]
## .. ..$ : int 1
## .. ..$ : int 2
## .. ..$ : int 3
## .. ..$ : int 4
## .. ..$ : int 5
## .. ..$ : int 6
## .. ..$ : int 7
## .. ..$ : int 8
## .. ..$ : int 9
## .. ..$ : int 10
## .. ..$ : int 11
## .. ..$ : int 12
## .. ..$ : int 13
## .. ..$ : int 14
## .. ..$ : int 15
## .. ..$ : int 16
## .. ..$ : int 17
## .. ..$ : int 18
## .. ..$ : int 19
## .. ..$ : int 20
## .. ..$ : int 21
## .. ..$ : int 22
## .. ..$ : int 23
## .. ..$ : int 24
## .. ..$ : int 25
## .. ..$ : int 26
## .. ..$ : int 27
## .. ..$ : int 28
## .. ..$ : int 29
## .. ..$ : int 30
## .. ..$ : int 31
## .. ..$ : int 32
## .. ..$ : int 33
## .. ..$ : int 34
## .. ..$ : int 35
## .. ..$ : int 36
## .. ..$ : int 37
## .. ..$ : int 38
## .. ..$ : int 39
## .. ..$ : int 40
## .. ..$ : int 41
## .. ..$ : int 42
## .. ..$ : int 43
## .. ..$ : int 44
## .. ..$ : int 45
## .. ..$ : int 46
## .. ..$ : int 47
## .. ..$ : int 48
## .. ..$ : int 49
## .. ..$ : int 50
## .. ..$ : int 51
## .. ..$ : int 52
## .. ..$ : int 53
## .. ..$ : int 54
## .. ..$ : int 55
## .. ..$ : int 56
## .. ..$ : int 57
## .. ..$ : int 58
## .. ..@ ptype: int(0)
## ..- attr(*, ".drop")= logi TRUE
# Join the two dataframes
june_sept_joined <- full_join(june, sept, by = c("Province_State"))
str(june_sept_joined)
## tibble [58 × 5] (S3: grouped_df/tbl_df/tbl/data.frame)
## $ Province_State : chr [1:58] "Alabama" "Alaska" "Arizona" "Arkansas" ...
## $ Country_Region.x: chr [1:58] "US" "US" "US" "US" ...
## $ Confirmed.x : num [1:58] 24601 653 34660 12095 150018 ...
## $ Country_Region.y: chr [1:58] "US" "US" "US" "US" ...
## $ Confirmed.y : num [1:58] 138755 6268 208512 70219 761728 ...
## - attr(*, "groups")= tibble [58 × 2] (S3: tbl_df/tbl/data.frame)
## ..$ Province_State: chr [1:58] "Alabama" "Alaska" "Arizona" "Arkansas" ...
## ..$ .rows : list<int> [1:58]
## .. ..$ : int 1
## .. ..$ : int 2
## .. ..$ : int 3
## .. ..$ : int 4
## .. ..$ : int 5
## .. ..$ : int 6
## .. ..$ : int 7
## .. ..$ : int 8
## .. ..$ : int 9
## .. ..$ : int 10
## .. ..$ : int 11
## .. ..$ : int 12
## .. ..$ : int 13
## .. ..$ : int 14
## .. ..$ : int 15
## .. ..$ : int 16
## .. ..$ : int 17
## .. ..$ : int 18
## .. ..$ : int 19
## .. ..$ : int 20
## .. ..$ : int 21
## .. ..$ : int 22
## .. ..$ : int 23
## .. ..$ : int 24
## .. ..$ : int 25
## .. ..$ : int 26
## .. ..$ : int 27
## .. ..$ : int 28
## .. ..$ : int 29
## .. ..$ : int 30
## .. ..$ : int 31
## .. ..$ : int 32
## .. ..$ : int 33
## .. ..$ : int 34
## .. ..$ : int 35
## .. ..$ : int 36
## .. ..$ : int 37
## .. ..$ : int 38
## .. ..$ : int 39
## .. ..$ : int 40
## .. ..$ : int 41
## .. ..$ : int 42
## .. ..$ : int 43
## .. ..$ : int 44
## .. ..$ : int 45
## .. ..$ : int 46
## .. ..$ : int 47
## .. ..$ : int 48
## .. ..$ : int 49
## .. ..$ : int 50
## .. ..$ : int 51
## .. ..$ : int 52
## .. ..$ : int 53
## .. ..$ : int 54
## .. ..$ : int 55
## .. ..$ : int 56
## .. ..$ : int 57
## .. ..$ : int 58
## .. ..@ ptype: int(0)
## ..- attr(*, ".drop")= logi TRUE
# look at the new joined dataframe
head(june_sept_joined)
## # A tibble: 6 x 5
## # Groups: Province_State [6]
## Province_State Country_Region.x Confirmed.x Country_Region.y Confirmed.y
## <chr> <chr> <dbl> <chr> <dbl>
## 1 Alabama US 24601 US 138755
## 2 Alaska US 653 US 6268
## 3 Arizona US 34660 US 208512
## 4 Arkansas US 12095 US 70219
## 5 California US 150018 US 761728
## 6 Colorado US 29002 US 61293
tail(june_sept_joined, 5)
## # A tibble: 5 x 5
## # Groups: Province_State [5]
## Province_State Country_Region.x Confirmed.x Country_Region.y Confirmed.y
## <chr> <chr> <dbl> <chr> <dbl>
## 1 Virginia US 53869 US 133742
## 2 Washington US 25538 US 79826
## 3 West Virginia US 2274 US 12705
## 4 Wisconsin US 22518 US 89185
## 5 Wyoming US 1050 US 4346
# look to see what cells are NA
which(is.na(june_sept_joined))
## integer(0)
# rename the columns
june_sept_joined <- full_join(june, sept, by = c("Province_State")) %>%
rename(Confirmed_June = "Confirmed.x", Confirmed_Sept = "Confirmed.y") %>%
select(-Country_Region.x, -Country_Region.y)
head(june_sept_joined)
## # A tibble: 6 x 3
## # Groups: Province_State [6]
## Province_State Confirmed_June Confirmed_Sept
## <chr> <dbl> <dbl>
## 1 Alabama 24601 138755
## 2 Alaska 653 6268
## 3 Arizona 34660 208512
## 4 Arkansas 12095 70219
## 5 California 150018 761728
## 6 Colorado 29002 61293
# pivot the data
june_sept_joined_long <- june_sept_joined %>%
pivot_longer(-c(Province_State),
names_to = "Date", values_to = "Confirmed")
head(june_sept_joined_long)
## # A tibble: 6 x 3
## # Groups: Province_State [3]
## Province_State Date Confirmed
## <chr> <chr> <dbl>
## 1 Alabama Confirmed_June 24601
## 2 Alabama Confirmed_Sept 138755
## 3 Alaska Confirmed_June 653
## 4 Alaska Confirmed_Sept 6268
## 5 Arizona Confirmed_June 34660
## 6 Arizona Confirmed_Sept 208512
ggplot(june_sept_joined_long, aes(x = Confirmed, y = Province_State)) +
geom_point(aes(color = Date))
ggplot(june_sept_joined_long, aes(x = Confirmed, y = Province_State)) +
geom_point(aes(color = Date)) +
labs(x = "Confirmed Cases", y = "State/Province")
# load time series death data again
download.file(url="https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv",
destfile = "data_raw/time_series_covid19_deaths_global.csv")
time_series_deaths <- read_csv("data_raw/time_series_covid19_deaths_global.csv")%>%
rename(Province_State = "Province/State", Country_Region = "Country/Region")
## Parsed with column specification:
## cols(
## .default = col_double(),
## `Province/State` = col_character(),
## `Country/Region` = col_character()
## )
## See spec(...) for full column specifications.
head(time_series_deaths)
## # A tibble: 6 x 258
## Province_State Country_Region Lat Long `1/22/20` `1/23/20` `1/24/20`
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 <NA> Afghanistan 33.9 67.7 0 0 0
## 2 <NA> Albania 41.2 20.2 0 0 0
## 3 <NA> Algeria 28.0 1.66 0 0 0
## 4 <NA> Andorra 42.5 1.52 0 0 0
## 5 <NA> Angola -11.2 17.9 0 0 0
## 6 <NA> Antigua and B… 17.1 -61.8 0 0 0
## # … with 251 more variables: `1/25/20` <dbl>, `1/26/20` <dbl>, `1/27/20` <dbl>,
## # `1/28/20` <dbl>, `1/29/20` <dbl>, `1/30/20` <dbl>, `1/31/20` <dbl>,
## # `2/1/20` <dbl>, `2/2/20` <dbl>, `2/3/20` <dbl>, `2/4/20` <dbl>,
## # `2/5/20` <dbl>, `2/6/20` <dbl>, `2/7/20` <dbl>, `2/8/20` <dbl>,
## # `2/9/20` <dbl>, `2/10/20` <dbl>, `2/11/20` <dbl>, `2/12/20` <dbl>,
## # `2/13/20` <dbl>, `2/14/20` <dbl>, `2/15/20` <dbl>, `2/16/20` <dbl>,
## # `2/17/20` <dbl>, `2/18/20` <dbl>, `2/19/20` <dbl>, `2/20/20` <dbl>,
## # `2/21/20` <dbl>, `2/22/20` <dbl>, `2/23/20` <dbl>, `2/24/20` <dbl>,
## # `2/25/20` <dbl>, `2/26/20` <dbl>, `2/27/20` <dbl>, `2/28/20` <dbl>,
## # `2/29/20` <dbl>, `3/1/20` <dbl>, `3/2/20` <dbl>, `3/3/20` <dbl>,
## # `3/4/20` <dbl>, `3/5/20` <dbl>, `3/6/20` <dbl>, `3/7/20` <dbl>,
## # `3/8/20` <dbl>, `3/9/20` <dbl>, `3/10/20` <dbl>, `3/11/20` <dbl>,
## # `3/12/20` <dbl>, `3/13/20` <dbl>, `3/14/20` <dbl>, `3/15/20` <dbl>,
## # `3/16/20` <dbl>, `3/17/20` <dbl>, `3/18/20` <dbl>, `3/19/20` <dbl>,
## # `3/20/20` <dbl>, `3/21/20` <dbl>, `3/22/20` <dbl>, `3/23/20` <dbl>,
## # `3/24/20` <dbl>, `3/25/20` <dbl>, `3/26/20` <dbl>, `3/27/20` <dbl>,
## # `3/28/20` <dbl>, `3/29/20` <dbl>, `3/30/20` <dbl>, `3/31/20` <dbl>,
## # `4/1/20` <dbl>, `4/2/20` <dbl>, `4/3/20` <dbl>, `4/4/20` <dbl>,
## # `4/5/20` <dbl>, `4/6/20` <dbl>, `4/7/20` <dbl>, `4/8/20` <dbl>,
## # `4/9/20` <dbl>, `4/10/20` <dbl>, `4/11/20` <dbl>, `4/12/20` <dbl>,
## # `4/13/20` <dbl>, `4/14/20` <dbl>, `4/15/20` <dbl>, `4/16/20` <dbl>,
## # `4/17/20` <dbl>, `4/18/20` <dbl>, `4/19/20` <dbl>, `4/20/20` <dbl>,
## # `4/21/20` <dbl>, `4/22/20` <dbl>, `4/23/20` <dbl>, `4/24/20` <dbl>,
## # `4/25/20` <dbl>, `4/26/20` <dbl>, `4/27/20` <dbl>, `4/28/20` <dbl>,
## # `4/29/20` <dbl>, `4/30/20` <dbl>, `5/1/20` <dbl>, `5/2/20` <dbl>,
## # `5/3/20` <dbl>, …
time_series_deaths_long <- time_series_deaths %>%
pivot_longer(-c(Province_State, Country_Region, Lat, Long),
names_to = "Date", values_to = "Deaths")
head(time_series_deaths_long)
## # A tibble: 6 x 6
## Province_State Country_Region Lat Long Date Deaths
## <chr> <chr> <dbl> <dbl> <chr> <dbl>
## 1 <NA> Afghanistan 33.9 67.7 1/22/20 0
## 2 <NA> Afghanistan 33.9 67.7 1/23/20 0
## 3 <NA> Afghanistan 33.9 67.7 1/24/20 0
## 4 <NA> Afghanistan 33.9 67.7 1/25/20 0
## 5 <NA> Afghanistan 33.9 67.7 1/26/20 0
## 6 <NA> Afghanistan 33.9 67.7 1/27/20 0
# reformat date
library(lubridate)
time_series_deaths_long$Date <- mdy(time_series_deaths_long$Date)
head(time_series_deaths_long)
## # A tibble: 6 x 6
## Province_State Country_Region Lat Long Date Deaths
## <chr> <chr> <dbl> <dbl> <date> <dbl>
## 1 <NA> Afghanistan 33.9 67.7 2020-01-22 0
## 2 <NA> Afghanistan 33.9 67.7 2020-01-23 0
## 3 <NA> Afghanistan 33.9 67.7 2020-01-24 0
## 4 <NA> Afghanistan 33.9 67.7 2020-01-25 0
## 5 <NA> Afghanistan 33.9 67.7 2020-01-26 0
## 6 <NA> Afghanistan 33.9 67.7 2020-01-27 0
time_series_deaths_long %>%
group_by(Date) %>%
summarize(Total_Deaths = sum(Deaths)) %>%
ggplot(aes(x = Date, y = Total_Deaths)) +
geom_line() +
ggtitle("Worldwide COVID-19 Deaths")
## `summarise()` ungrouping output (override with `.groups` argument)
# lets look at the data set we want, which we made a bit ago
head(time_series_long_joined)
## # A tibble: 6 x 7
## Province_State Country_Region Lat Long Date Confirmed Deaths
## <chr> <chr> <dbl> <dbl> <date> <dbl> <dbl>
## 1 <NA> Afghanistan 33.9 67.7 2020-01-22 0 0
## 2 <NA> Afghanistan 33.9 67.7 2020-01-23 0 0
## 3 <NA> Afghanistan 33.9 67.7 2020-01-24 0 0
## 4 <NA> Afghanistan 33.9 67.7 2020-01-25 0 0
## 5 <NA> Afghanistan 33.9 67.7 2020-01-26 0 0
## 6 <NA> Afghanistan 33.9 67.7 2020-01-27 0 0
# change the date format
time_series_long_joined$Date <- mdy(time_series_long_joined$Date)
## Warning: All formats failed to parse. No formats found.
# now mutate to add column
mortality<-time_series_long_joined %>%
mutate(rate = Deaths/Confirmed)
head(mortality)
## # A tibble: 6 x 8
## Province_State Country_Region Lat Long Date Confirmed Deaths rate
## <chr> <chr> <dbl> <dbl> <date> <dbl> <dbl> <dbl>
## 1 <NA> Afghanistan 33.9 67.7 NA 0 0 NaN
## 2 <NA> Afghanistan 33.9 67.7 NA 0 0 NaN
## 3 <NA> Afghanistan 33.9 67.7 NA 0 0 NaN
## 4 <NA> Afghanistan 33.9 67.7 NA 0 0 NaN
## 5 <NA> Afghanistan 33.9 67.7 NA 0 0 NaN
## 6 <NA> Afghanistan 33.9 67.7 NA 0 0 NaN
#mortality %>%
# replace_na(list(rate = 0)) %>%
# filter(Country_Region == "US") %>%
# ggplot(aes(x = Date, y = rate)) +
# geom_line() +
# ggtitle("Mortality rate in the US (deaths per positive case)")
# first find the total deaths of the top ten countries
top_deaths<-time_series_deaths_long %>%
group_by(Country_Region) %>%
summarize(total_deaths = sum(Deaths)) %>%
arrange(desc(total_deaths)) %>%
slice_head(n=10)
## `summarise()` ungrouping output (override with `.groups` argument)
countries <- top_deaths$Country_Region
time_series_deaths_long %>%
filter(Country_Region == countries) %>%
ggplot(aes(x = Date, y = Deaths, color = Country_Region)) +
geom_line(lwd = .7) +
ggtitle("COVID-19 Deaths in Top 10 Countries")
## Warning in Country_Region == countries: longer object length is not a multiple
## of shorter object length
I guess UK and France were reporting numbers once a week?
time_series_deaths_long %>%
filter(Country_Region == countries) %>%
ggplot(aes(x = Date, y = Deaths)) +
geom_point() +
geom_line() +
ggtitle("COVID-19 Deaths") +
facet_wrap(~Country_Region, ncol=2, scales="free_y")
## Warning in Country_Region == countries: longer object length is not a multiple
## of shorter object length
NEED TO FIND THIS DATA
Currently, the only time series data I have downloaded has all of US together (no states). The states data I have is cumulative total cases/deaths, not time series.
#data %>%
# filter(Country_Region == "US") %>%
# ggplot(aes(x = Date, y = Confirmed)) +
# geom_point() +
# geom_line() +
# ggtitle("COVID-19 Cases") +
# facet_wrap(~Province_State, ncol=2, scales="free_y")
time_series_deaths_long %>%
filter(Country_Region == countries) %>%
ggplot(aes(x = Date, y = Deaths)) +
geom_point() +
geom_line() +
ggtitle("COVID-19 Deaths") +
facet_wrap(~Country_Region, ncol=2) +
theme_classic()
## Warning in Country_Region == countries: longer object length is not a multiple
## of shorter object length