The World Health Organization provides detailed data on various population health indicators in an Open Data environment. The R package WHO
allows you to access that data directly from R. Install this package using install.packages('WHO')
.
The package has a nice vignette, which can be accessed using
library(WHO); vignette('who_vignette')
We will be using this package to download data on infant and under-5 mortality and exploring the merging of data sets and the split-apply-combine paradigm
You can download these data sets using the following code:
library(WHO)
infant <- get_data('EQ_INFANTMORT')
child <- get_data('EQ_U5MORT')
str
, and find common variablesSolution
str(infant)
## Classes 'tbl_df', 'tbl' and 'data.frame': 1978 obs. of 9 variables:
## $ datasource : chr "MICS" "MICS" "DHS" "DHS" ...
## $ country : chr "Albania" "Albania" "Armenia" "Armenia" ...
## $ year : num 2005 2005 2000 2000 2010 ...
## $ region : chr "Europe" "Europe" "Europe" "Europe" ...
## $ gho : chr "Infant mortality per 1 000 live births" "Infant mortality per 1 000 live births" "Infant mortality per 1 000 live births" "Infant mortality per 1 000 live births" ...
## $ publishstate : chr "Published" "Published" "Published" "Published" ...
## $ value : num 16.7 17 39 26 16 13 118 99 72 62 ...
## $ residenceareatype: chr NA "Urban" NA NA ...
## $ wealthquintile : chr NA NA "Q1 (Poorest)" "Q2" ...
str(child)
## Classes 'tbl_df', 'tbl' and 'data.frame': 1978 obs. of 9 variables:
## $ residenceareatype: chr "Urban" NA NA NA ...
## $ country : chr "Afghanistan" "Afghanistan" "Armenia" "Armenia" ...
## $ region : chr "Eastern Mediterranean" "Eastern Mediterranean" "Europe" "Europe" ...
## $ year : num 2010 2010 2000 2000 2005 ...
## $ datasource : chr "MICS" "MICS" "DHS" "DHS" ...
## $ gho : chr "Under-five mortality per 1 000 live births" "Under-five mortality per 1 000 live births" "Under-five mortality per 1 000 live births" "Under-five mortality per 1 000 live births" ...
## $ publishstate : chr "Published" "Published" "Published" "Published" ...
## $ value : num 86 98 41 56 23 28 126 46 150 57 ...
## $ wealthquintile : chr NA "Q1 (Poorest)" "Q1 (Poorest)" "Q3" ...
residenceareatype
and wealthquintile
having missing values or NA
Solution
infant <- infant %>% filter(is.na(residenceareatype), is.na(wealthquintile))
child <- child %>% filter(is.na(residenceareatype), is.na(wealthquintile))
gho
, datasource
or publishstate
columns, so omit them. Also omit the two variables you filtered on aboveSolution
infant <- infant %>% select(-gho, -publishstate, -datasource, -residenceareatype, -wealthquintile)
child <- child %>% select(-gho, -publishstate, -datasource, -residenceareatype, -wealthquintile)
Solution
mortality <- infant %>% inner_join(child, by = c('region','country','year'))
Solution
mortality <- mortality %>%
dplyr::rename(infant = value.x, child = value.y)
which.max
and which.min
)Solution
mortality_2000 <- mortality %>% filter(year == 2000)
mortality_2000 %>% group_by(region) %>% summarize(infant_worst = country[which.max(infant)],
child_worst = country[which.max(child)])
## # A tibble: 5 x 3
## region infant_worst child_worst
## <chr> <chr> <chr>
## 1 Africa Rwanda Rwanda
## 2 Americas Haiti Haiti
## 3 Eastern Mediterranean Egypt Egypt
## 4 Europe Armenia Armenia
## 5 Western Pacific Cambodia Cambodia
by
argument to get the best dataset).Solution
mortality1 <- filter(mortality, year >=1990 & year <= 1995)
mortality2 <- filter(mortality, year >= 2000 & year <= 2005)
infant1 <- mortality1 %>% group_by(country) %>% summarize(infant = mean(infant, na.rm=T))
child1 <- mortality1 %>% group_by(country) %>% summarize(child = mean(child, na.rm=T))
merged1 <- inner_join(infant1, child1, by='country')
Solution
mortality_yr_region <- mortality %>% group_by(year, region) %>%
summarize(infant = mean(infant, na.rm=T),
child = mean(child, na.rm=T))
Solution
mortality_yr_region_reshaped <- mortality_yr_region %>% gather(variable, value, infant, child)
library(ggplot2)
ggplot(mortality_yr_region_reshaped, aes(x=year, y=value, group=region, color=region))+geom_point()+geom_line()+facet_wrap(~variable, nrow=2,scales='free_y')
Solution
ggplot(mortality_yr_region_reshaped, aes(x=year, y=value, group=variable, color=variable))+geom_point()+geom_line()+facet_wrap(~region, nrow=2)