# Date[1:41], format: "2018-08-30" "2018-08-31" "2018-09-01" "2018-09-02" "2018-09-03" ...
We’re making the decision to use ggplot2 for graphics.
# install.packages('tidyverse')
library(ggplot2)
The ggplot2 package is a very flexible and (to me) intuitive way of visualizing data. It is based on the concept of layering elements on a canvas.
This idea of layering graphics on a canvas is, to me, a nice way of building graphs.
You need:
A data.frame object
ggplot used pipes before pipes were a thing.
However, it uses the + symbol for piping rather than the %>% operator, since it pre-dates the tidyverse.
library(ggplot2)
# Scatter plot of mpg against wt from the mtcars data frame.
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg)) +
  geom_point()
data.frame object: mtcars
library(ggplot2)
# Same scatter plot of mpg against wt, with a smoothing layer added on top
# of the points.
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg)) +
  geom_point() +
  geom_smooth()
data.frame object: mtcars
We will use the following two data sets:
# Download the two course data sets.
# Each URL is assembled with paste0() so that the long address can be wrapped
# across source lines without a line break ending up inside the string.
# Text columns are kept as character vectors rather than converted to factors.
data_spine <- read.csv(
  paste0(
    "http://www.araastat.com/BIOF339_PracticalR/",
    "Lectures/lecturedataframe_data/Dataset_spine.csv"
  ),
  stringsAsFactors = FALSE
)
data_brca <- read.csv(
  paste0(
    "http://www.araastat.com/BIOF339_PracticalR/",
    "Lectures/lecturedataframe_data/",
    "clinical_data_breast_cancer_modified.csv"
  ),
  stringsAsFactors = FALSE
)
# Histogram of Age.at.Initial.Pathologic.Diagnosis with the default binning.
ggplot(
  data = data_brca,
  mapping = aes(x = Age.at.Initial.Pathologic.Diagnosis)
) +
  geom_histogram()

# Same histogram with an explicit bin width of 4.
ggplot(
  data = data_brca,
  mapping = aes(x = Age.at.Initial.Pathologic.Diagnosis)
) +
  geom_histogram(binwidth = 4)

# Density curve of the same variable.
ggplot(
  data = data_brca,
  mapping = aes(x = Age.at.Initial.Pathologic.Diagnosis)
) +
  geom_density()

# Bar chart of counts per Tumor value.
ggplot(data = data_brca, mapping = aes(x = Tumor)) +
  geom_bar()
Using the mtcars dataset in R, create:
A histogram of the miles per gallon (mpg) in the data set
A bar plot of the number of cylinders (cyl) in the car
ggplot(mtcars, aes(x = mpg)) + geom_histogram(binwidth=3)
# Alternative spelling of the mpg histogram, with the aesthetic mapping
# supplied to the layer instead of to ggplot():
# ggplot(mtcars) + geom_histogram(aes(x = mpg), binwidth = 3)

# Bar chart of counts per cyl value; factor() makes cyl discrete.
ggplot(data = mtcars, mapping = aes(x = factor(cyl))) +
  geom_bar()
# Scatter plot of Sacral.slope against Lumbar.lordosis.angle.
ggplot(
  data = data_spine,
  mapping = aes(x = Lumbar.lordosis.angle, y = Sacral.slope)
) +
  geom_point()

# Same scatter plot with the default smoother drawn over the points.
ggplot(
  data = data_spine,
  mapping = aes(x = Lumbar.lordosis.angle, y = Sacral.slope)
) +
  geom_point() +
  geom_smooth()

# Same scatter plot with a linear-model ("lm") fit instead of the default.
ggplot(
  data = data_spine,
  mapping = aes(x = Lumbar.lordosis.angle, y = Sacral.slope)
) +
  geom_point() +
  geom_smooth(method = "lm")
library(forecast)

# Australian monthly gas production.
# x is the observation index; seq_along() is used instead of 1:length() so
# the index is well defined even for a zero-length series.
d <- data.frame(x = seq_along(gas), y = gas)

# Line plot of the series against its observation index.
ggplot(d, aes(x, y)) +
  geom_line()
Create a scatter plot of sepal length against sepal width from the iris dataset, and add a smooth line through it
ggplot(iris, aes(Sepal.Length, Sepal.Width)) + geom_point() + geom_smooth()
# Box plot of Sacral.slope for each Class.attribute value.
ggplot(
  data = data_spine,
  mapping = aes(x = Class.attribute, y = Sacral.slope)
) +
  geom_boxplot()

# Factor/discrete variable is always x
# Violin plot of the same comparison.
ggplot(
  data = data_spine,
  mapping = aes(x = Class.attribute, y = Sacral.slope)
) +
  geom_violin()
iris dataset
ggplot(iris, aes(x = Species, y = Petal.Length))+geom_boxplot()
# Bar chart of counts per Tumor value.
ggplot(data = data_brca, mapping = aes(x = Tumor)) +
  geom_bar()

# Same bar chart with the axes swapped, so the bars run horizontally.
ggplot(data = data_brca, mapping = aes(x = Tumor)) +
  geom_bar() +
  coord_flip()
The dplyr package
dplyr is the most lucid package for manipulating and analyzing data organized in a data frame.
The group_by function creates a grouped data frame:
library(dplyr)
# Group the spine data by Class.attribute; the summaries that follow are
# computed once per group.
grouped_data_spine <- group_by(data_spine, Class.attribute)
Note that you have to group using a discrete valued variable (factor, character, integer)
# Per-group mean, standard deviation, minimum and maximum of
# Pelvic.incidence. The summary expressions are left unnamed, so the output
# columns are labelled with the expressions themselves.
summarize(
  grouped_data_spine,
  mean(Pelvic.incidence),
  sd(Pelvic.incidence),
  min(Pelvic.incidence),
  max(Pelvic.incidence)
)
Class.attribute | mean(Pelvic.incidence) | sd(Pelvic.incidence) | min(Pelvic.incidence) | max(Pelvic.incidence) |
---|---|---|---|---|
Abnormal | 64.69 | 17.66 | 26.15 | 129.83 |
Normal | 51.69 | 12.37 | 30.74 | 89.83 |
# Same per-group summary of Pelvic.incidence, this time with explicit
# column names for each statistic.
summarize(
  grouped_data_spine,
  Mean = mean(Pelvic.incidence),
  SD = sd(Pelvic.incidence),
  Min = min(Pelvic.incidence),
  Max = max(Pelvic.incidence)
)
Class.attribute | Mean | SD | Min | Max |
---|---|---|---|---|
Abnormal | 64.69 | 17.66 | 26.15 | 129.83 |
Normal | 51.69 | 12.37 | 30.74 | 89.83 |
# Mean of every non-grouping column, computed separately for each group.
summarize_all(grouped_data_spine, mean)
# # A tibble: 2 x 13
# Class.attribute Pelvic.incidence Pelvic.tilt Lumbar.lordosis…
# <fct> <dbl> <dbl> <dbl>
# 1 Abnormal 64.7 19.8 55.9
# 2 Normal 51.7 12.8 43.5
# # ... with 9 more variables: Sacral.slope <dbl>, Pelvic.radius <dbl>,
# # Degree.spondylolisthesis <dbl>, Pelvic.slope <dbl>, Direct.tilt <dbl>,
# # Thoracic.slope <dbl>, Cervical.tilt <dbl>, Sacrum.angle <dbl>,
# # Scoliosis.slope <dbl>
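The result of these summaries is a tibble, a modern take on the data.frame.
To convert it to a plain data.frame object, use as.data.frame.
A tibble is also a data.frame, so all operations on data.frame’s will work.
To print all the columns of a tibble, set options(dplyr.width=Inf).
Differences between a tibble and a data.frame: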
# Plot, for every measurement and each Class.attribute group, the group mean
# with an interval of mean +/- 2 standard errors, one facet per measurement.
data_spine %>%
  group_by(Class.attribute) %>%
  # One Mean and one SEM column per variable, named <variable>_Mean and
  # <variable>_SEM. A named list of lambdas replaces the deprecated funs().
  summarize_all(list(
    Mean = ~ mean(., na.rm = TRUE),
    SEM = ~ sd(., na.rm = TRUE) / sqrt(n())
  )) %>%
  # Reshape to one row per (class, variable) with Mean and SEM columns.
  # The column names are split at "_": the first part becomes Variable and
  # the second part (".value") names the output column. tidyr is referenced
  # explicitly because it is not attached by the visible library() calls.
  tidyr::pivot_longer(
    -Class.attribute,
    names_to = c("Variable", ".value"),
    names_sep = "_"
  ) %>%
  # Lower and upper bounds of the interval.
  mutate(lcb = Mean - 2 * SEM, ucb = Mean + 2 * SEM) %>%
  ggplot(aes(x = Class.attribute, y = Mean, ymin = lcb, ymax = ucb)) +
  geom_pointrange() +
  # Each variable gets its own y scale.
  facet_wrap(~Variable, scales = "free_y") +
  labs(x = "Class", y = "") +
  ggtitle("Confidence intervals of the mean")
Work through the pipeline yourself to understand what each step does, just like last week
# Density of Sacral.slope, one coloured curve per Class.attribute value.
ggplot(
  data = data_spine,
  mapping = aes(
    x = Sacral.slope,
    group = Class.attribute,
    color = Class.attribute
  )
) +
  geom_density()

# Scatter plot with points coloured by Class.attribute.
ggplot(
  data = data_spine,
  mapping = aes(
    x = Lumbar.lordosis.angle,
    y = Sacral.slope,
    group = Class.attribute,
    color = Class.attribute
  )
) +
  geom_point()

# Scatter plot with the point shape determined by Class.attribute.
ggplot(
  data = data_spine,
  mapping = aes(
    x = Lumbar.lordosis.angle,
    y = Sacral.slope,
    group = Class.attribute,
    shape = Class.attribute
  )
) +
  geom_point()

# Scatter plot with the point size mapped to Pelvic.slope.
ggplot(
  data = data_spine,
  mapping = aes(x = Lumbar.lordosis.angle, y = Sacral.slope)
) +
  geom_point(mapping = aes(size = Pelvic.slope))

# Colour by Class.attribute and size by Pelvic.slope in the same plot.
ggplot(
  data = data_spine,
  mapping = aes(
    x = Lumbar.lordosis.angle,
    y = Sacral.slope,
    group = Class.attribute,
    color = Class.attribute
  )
) +
  geom_point(mapping = aes(size = Pelvic.slope))

# Colour set in ggplot() applies to every layer: coloured points and one
# coloured linear fit per Class.attribute value.
ggplot(
  data = data_spine,
  mapping = aes(
    x = Lumbar.lordosis.angle,
    y = Sacral.slope,
    group = Class.attribute,
    color = Class.attribute
  )
) +
  geom_point() +
  geom_smooth(method = "lm")

# Colour set only on the smoothing layer: the points are left uncoloured
# while the linear fits are coloured by Class.attribute.
ggplot(
  data = data_spine,
  mapping = aes(x = Lumbar.lordosis.angle, y = Sacral.slope)
) +
  geom_point() +
  geom_smooth(mapping = aes(color = Class.attribute), method = "lm")
Facetted graphs are a panel of graphs, each of which corresponds to a particular subgroup of the data.
# One scatter-plot panel per Class.attribute value, laid out in a single row.
ggplot(
  data = data_spine,
  mapping = aes(x = Lumbar.lordosis.angle, y = Sacral.slope)
) +
  geom_point() +
  facet_wrap(~Class.attribute, nrow = 1)

# Same panels, each with its own linear fit.
ggplot(
  data = data_spine,
  mapping = aes(x = Lumbar.lordosis.angle, y = Sacral.slope)
) +
  geom_point() +
  geom_smooth(method = "lm") +
  facet_wrap(~Class.attribute, nrow = 1)
library(qqman)

# Load the gwasResults example data and show its first rows.
data(gwasResults)
head(gwasResults)
#   SNP CHR BP         P
# 1 rs1   1  1 0.9148060
# 2 rs2   1  2 0.9370754
# 3 rs3   1  3 0.2861395
# 4 rs4   1  4 0.8304476
# 5 rs5   1  5 0.6417455
# 6 rs6   1  6 0.5190959

# Add a running row index to use as the x position in the plots.
# row_number() is used instead of 1:n() so the index is also correct for a
# data frame with zero rows.
gwasResults <- gwasResults %>%
  mutate(x_position = row_number())
# -log10 of the P value plotted against the running row index.
ggplot(
  data = gwasResults,
  mapping = aes(x = x_position, y = -log(P, base = 10))
) +
  geom_point(size = 0.2)

# Colour by CHR used as-is (not converted to a factor).
ggplot(
  data = gwasResults,
  mapping = aes(
    x = x_position,
    y = -log(P, base = 10),
    group = CHR,
    color = CHR
  )
) +
  geom_point(size = 0.2)

# Colour by CHR converted to a factor, i.e. treated as discrete.
ggplot(
  data = gwasResults,
  mapping = aes(
    x = x_position,
    y = -log(P, base = 10),
    group = factor(CHR),
    color = factor(CHR)
  )
) +
  geom_point(size = 0.2)

# Same plot with a dashed red horizontal reference line at y = 8.
ggplot(
  data = gwasResults,
  mapping = aes(
    x = x_position,
    y = -log(P, base = 10),
    group = factor(CHR),
    color = factor(CHR)
  )
) +
  geom_point(size = 0.2) +
  geom_hline(yintercept = 8, color = "red", linetype = 2)

# One panel per CHR value (four rows of panels), with BP on the x axis and
# the same reference line in every panel.
ggplot(
  data = gwasResults,
  mapping = aes(x = BP, y = -log(P, base = 10))
) +
  geom_point(size = 0.2) +
  facet_wrap(~CHR, nrow = 4) +
  geom_hline(yintercept = 8, color = "red", linetype = 2)