First modify the setup chunk to activate the packages you will need for this HW. Ensure you have tidyverse, knitr and rmarkdown installed. Add statements like `library(tidyverse)`.
```{r setup}
library(pacman)
p_load(tidyverse) # library(tidyverse)
p_load(knitr)
p_load(rmarkdown)
p_load(janitor)
# add more p_load statements
# if you need to. Or you can do it in one go with
#
# p_load(tidyverse, knitr, rmarkdown, janitor)
```
You are expected to edit this R Markdown document, especially the R code chunks. Once you’re done, please change `eval=F` to `eval=T` in each chunk in order to make the code run. Most of the R chunks are fill-in-the-blanks, with a few left empty for you to completely fill in.
Your submission for this week will comprise 2 files:
my_variable <- 1:5
my_variable # Change 1 to i)
## [1] 1 2 3 4 5
mean(airquality$Wind) # Wind should be capitalized
table(iris$Sepal.Length) # The actual variable uses ., not _
data
folder of your RStudio Project for this class. `breast_cancer`.
# Code to import data file here
library(pacman)
p_load(here)
breast_cancer <- rio::import(here("data/clinical_data_breast_cancer_modified.csv"), check.names=TRUE)
## I've added check.names=TRUE to help with removing spaces in the names.
##
## Other alternatives are below
# breast_cancer <- data.table::fread("data/clinical_data_breast_cancer_modified.csv")
# breast_cancer <- readr::read_csv('data/clinical_data_breast_cancer_modified.csv')
# breast_cancer <- read.csv('data/clinical_data_breast_cancer_modified.csv',
# stringsAsFactors = FALSE)
# Check for data types
glimpse(breast_cancer)
# str(breast_cancer)
# Convert here
# breast_cancer$Gender <- as.factor(breast_cancer$Gender)
# breast_cancer$ER.Status <- as.factor(breast_cancer$ER.Status)
## You can repeat this for the other variables, making sure you spell them properly. Or you can
## proceed as follows:
# breast_cancer <- breast_cancer %>%
# mutate(Gender = as.factor(Gender),
# ER.Status= as.factor(ER.Status),
# ..... # fill this in with the other variables
# )
## Comment out one of the two strategies for your submission
## Instead of the repetition of code in the above two strategies,
## you can use the `across` function to make things more succinct
breast_cancer <- breast_cancer %>%
mutate(across(c(Gender, ER.Status, PR.Status, HER2.Final.Status), as.factor))
# Not correct (Why?):
# breast_cancer <- breast_cancer %>%
# mutate(across(c(Gender, ends_with("Status")), as.factor))
# Convert any new variables here. I don't think there is an entirely right
# answer here, but as long as the process was followed, I'm fine with different
# choices
breast_cancer <- breast_cancer %>%
mutate(Age.at.Initial.Pathologic.Diagnosis = as.numeric(Age.at.Initial.Pathologic.Diagnosis)) %>%
mutate(across(c(Tumor, Node, Metastasis),as.factor))
breast_cancer
match the modifications you made
# Don't edit this chunk. It is for verification.
breast_cancer %>%
dplyr::summarize_all(class) %>%
tidyr::pivot_longer(names_to = "Variable", values_to = "Class", cols = everything()) %>%
knitr::kable()
Variable | Class |
---|---|
Complete.TCGA.ID | character |
Gender | character |
Age.at.Initial.Pathologic.Diagnosis | numeric |
ER.Status | character |
PR.Status | character |
HER2.Final.Status | character |
Tumor | factor |
Tumor..T1.Coded | character |
Node | factor |
Node.Coded | character |
Metastasis | factor |
Metastasis.Coded | character |
AJCC.Stage | character |
Converted.Stage | character |
Survival.Data.Form | character |
Vital.Status | character |
Days.to.Date.of.Last.Contact | integer |
Days.to.date.of.Death | integer |
OS.event | integer |
OS.Time | integer |
`brca_data`.
brca_data <- rio::import(here('data/clinical_data_breast_cancer_hw.csv'))
glimpse(brca_data)
## Rows: 105
## Columns: 13
## $ Complete.TCGA.ID <chr> "TCGA-A2-A0T2", "TCGA-A2-A0CM", "T…
## $ Gender <chr> "FEMALE", "FEMALE", "FEMALE", "FEM…
## $ Age.at.Initial.Pathologic.Diagnosis <chr> "66", "40", "48", "56", "38", "57"…
## $ ER.Status <chr> "Negative", "Negative", "Negative"…
## $ PR.Status <chr> "Negative", "Negative", "Negative"…
## $ HER2.Final.Status <chr> "Negative", "Negative", "Negative"…
## $ Tumor <chr> "T3", "T2", "T2", "T2", "T3", "T2"…
## $ Node <chr> "N3", "N0", "N1", "N1", "N3", "N0"…
## $ Metastasis <chr> "M1", "M0", "M0", "M0", "M0", "M0"…
## $ AJCC.Stage <chr> "Stage IV", "Stage IIA", "Stage II…
## $ Vital.Status <chr> "DECEASED", "DECEASED", "DECEASED"…
## $ Days.to.Date.of.Last.Contact <int> 240, 754, 1555, 1692, 133, 309, 42…
## $ Days.to.date.of.Death <int> 240, 754, 1555, 1692, NA, NA, NA, …
## Add code here to correct any problems in the data set
brca_data <- brca_data %>%
mutate(across(c(Gender, ends_with("Status"), Tumor:AJCC.Stage), as.factor)) %>%
mutate(Age.at.Initial.Pathologic.Diagnosis = as.numeric(Age.at.Initial.Pathologic.Diagnosis))
This is a rather comprehensive answer, using information from StackOverflow.
This is a non-trivial question since the obvious way doesn’t quite work
brca_data1 <- brca_data %>%
mutate(ER.Status = ifelse(ER.Status=='Indeterminate', NA, ER.Status)) %>%
mutate(HER2.Final.Status = ifelse(HER2.Final.Status=='Equivocal', NA, HER2.Final.Status))
str(brca_data1 %>% select(ER.Status, HER2.Final.Status))
## 'data.frame': 105 obs. of 2 variables:
## $ ER.Status : int 2 2 2 2 2 2 2 2 2 2 ...
## $ HER2.Final.Status: int 2 2 2 2 2 2 2 2 2 2 ...
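Uh-oh! The type of the data changed to integers! `ifelse()` drops the factor attributes, so what comes back are the underlying integer level codes rather than the labels.
There are several ways to achieve this in a type-safe way. The one I like most is using `forcats::fct_recode`: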
brca_data1 <- brca_data %>%
mutate(ER.Status = fct_recode(ER.Status, NULL = 'Indeterminate')) %>%
mutate(HER2.Final.Status = fct_recode(HER2.Final.Status, NULL = 'Equivocal'))
str(brca_data1 %>% select(ER.Status, HER2.Final.Status))
## 'data.frame': 105 obs. of 2 variables:
## $ ER.Status : Factor w/ 2 levels "Negative","Positive": 1 1 1 1 1 1 1 1 1 1 ...
## $ HER2.Final.Status: Factor w/ 2 levels "Negative","Positive": 1 1 1 1 1 1 1 1 1 1 ...
Note that the forcats package is already loaded into R as part of the tidyverse
Other options include:
brca_data1 <- brca_data %>%
mutate(ER.Status = recode_factor(ER.Status, Indeterminate = NA_character_)) %>%
mutate(HER2.Final.Status = recode_factor(HER2.Final.Status,
'Equivocal' = NA_character_))
brca_data1 <- brca_data %>%
mutate(ER.Status = na_if(ER.Status, 'Indeterminate')) %>%
mutate(HER2.Final.Status = na_if(HER2.Final.Status, 'Equivocal'))
brca_data1 <- brca_data %>%
mutate(ER.Status = factor(ER.Status, exclude = 'Indeterminate')) %>%
mutate(HER2.Final.Status = factor(HER2.Final.Status, exclude = 'Equivocal'))
The other approach is to create a function that does the transformation and run it on all the columns:
# Keep only the "Positive" / "Negative" marker calls as factor levels; any
# other value (e.g. "Indeterminate", "Equivocal") is mapped to NA through
# `.default`.
clean_markers <- function(x) {
  recode_factor(
    x,
    Positive = 'Positive',
    Negative = 'Negative',
    .default = NA_character_
  )
}
brca_data2 <- brca_data %>%
mutate(across(c(ER.Status, PR.Status, HER2.Final.Status), clean_markers))
You can verify that the two data sets agree wherever both have non-missing values (missing values are ignored by `na.rm=TRUE`):
all(brca_data1 == brca_data2, na.rm=TRUE)
## [1] TRUE
brca_data3 <- select(brca_data, Complete.TCGA.ID, ER.Status, PR.Status, HER2.Final.Status)
# brca_data3 <- brca_data %>%
# select(Complete.TCGA.ID, ER.Status, PR.Status, HER2.Final.Status)
Creating new variables (what dplyr
function will you use for all of these?): mutate
Create a variable giving the TNM status of each patient. The T, N and M statuses are given separately. I want a single variable encoded as, for example, “T2N0M0”. [Hint: The function paste
is your friend]
brca_data <- brca_data %>%
mutate(tnm_status = paste(Tumor, Node, Metastasis, sep = ""))
brca_data3$tnm_status <- brca_data$tnm_status
brca_data <- unite(brca_data, tnm_status, c('Tumor','Node', 'Metastasis'),
sep='', remove=FALSE)
case_when
might help]:
- Luminal (ER positive and/or PR positive)
- HER2 (HER2 positive)
- Basal-like (ER, PR and HER2 negative)

brca_data3 <- brca_data3 %>%
mutate(mol_cat = case_when( # fill in the next 4 lines
(ER.Status =='Positive') | (PR.Status=="Positive") ~ "Luminal",
(HER2.Final.Status == 'Positive') ~ "HER2",
(ER.Status=='Negative') & (PR.Status=='Negative') & (HER2.Final.Status=="Negative") ~ 'Basal_like',
TRUE ~ 'Other'
))
Vital.Status
) or the time of last contact if they are alive. This is a common computation for survival analysis studies, and is called the overall survival time. [Hint: the function `ifelse` might be useful]
## Fill this in yourself. Time to start leaving the nest
brca_data <- brca_data %>%
mutate(event_time = ifelse(Vital.Status == 'DECEASED', Days.to.date.of.Death, Days.to.Date.of.Last.Contact))
brca_data3$event_time <- brca_data$event_time
Save the cleaned breast cancer dataset as brca_cleaned
. You can save this to your computer using saveRDS(brca_cleaned, file="<a filename of your choice>.rds")
. We’ll be using this dataset again when we do plots and modeling.
# NOTE(review): brca_data1 was last built from brca_data before tnm_status and
# event_time were added to brca_data, so the saved object presumably lacks
# those columns (and mol_cat, which lives only in brca_data3) -- confirm that
# this is the intended "cleaned" data set.
brca_cleaned <- brca_data1
# Persist the cleaned data as an .rds file; the path is built with here().
saveRDS(brca_cleaned, file = here('data/brca_data.rds'))