Read and load a data set in Excel

We will read a breast cancer data set available at http://www.araastat.com/BIOF339_PracticalR/Lectures/lecture2_files/BreastCancer.xlsx. This is an Microsoft Excel file of 569 subjects with breast tumor biopsies. The objective of this data set and study was to predict the class of tumor (malignant/benign) from several tumor characteristics.

  1. Use the function download.file to download this file to your computer, specifying where you save it.
download.file('http://www.araastat.com/BIOF339_PracticalR/Lectures/lecture2_files/BreastCancer.xlsx', 'BreastCancer.xlsx')
  1. Install the package readxl from your favorite CRAN mirror
install.packages('readxl', repos='http://cran.rstudio.com')
  1. Read the file in to R using the read_excel function from the readxl package.
library(readxl)
dat <- read_excel('BreastCancer.xlsx')
dat <- data.frame(dat, stringsAsFactors = F) # Convert file to a data frame, remove spaces from names

Data Exploration

  1. Find the structure of this data set
str(dat)
## 'data.frame':    569 obs. of  31 variables:
##  $ mean.radius            : num  18 20.6 19.7 11.4 20.3 ...
##  $ mean.texture           : num  10.4 17.8 21.2 20.4 14.3 ...
##  $ mean.perimeter         : num  122.8 132.9 130 77.6 135.1 ...
##  $ mean.area              : num  1001 1326 1203 386 1297 ...
##  $ mean.smoothness        : num  0.1184 0.0847 0.1096 0.1425 0.1003 ...
##  $ mean.compactness       : num  0.2776 0.0786 0.1599 0.2839 0.1328 ...
##  $ mean.concavity         : num  0.3001 0.0869 0.1974 0.2414 0.198 ...
##  $ mean.concave.points    : num  0.1471 0.0702 0.1279 0.1052 0.1043 ...
##  $ mean.symmetry          : num  0.242 0.181 0.207 0.26 0.181 ...
##  $ mean.fractal.dimension : num  0.0787 0.0567 0.06 0.0974 0.0588 ...
##  $ radius.error           : num  1.095 0.543 0.746 0.496 0.757 ...
##  $ texture.error          : num  0.905 0.734 0.787 1.156 0.781 ...
##  $ perimeter.error        : num  8.59 3.4 4.58 3.44 5.44 ...
##  $ area.error             : num  153.4 74.1 94 27.2 94.4 ...
##  $ smoothness.error       : num  0.0064 0.00522 0.00615 0.00911 0.01149 ...
##  $ compactness.error      : num  0.049 0.0131 0.0401 0.0746 0.0246 ...
##  $ concavity.error        : num  0.0537 0.0186 0.0383 0.0566 0.0569 ...
##  $ concave.points.error   : num  0.0159 0.0134 0.0206 0.0187 0.0188 ...
##  $ symmetry.error         : num  0.03 0.0139 0.0225 0.0596 0.0176 ...
##  $ fractal.dimension.error: num  0.00619 0.00353 0.00457 0.00921 0.00511 ...
##  $ worst.radius           : num  25.4 25 23.6 14.9 22.5 ...
##  $ worst.texture          : num  17.3 23.4 25.5 26.5 16.7 ...
##  $ worst.perimeter        : num  184.6 158.8 152.5 98.9 152.2 ...
##  $ worst.area             : num  2019 1956 1709 568 1575 ...
##  $ worst.smoothness       : num  0.162 0.124 0.144 0.21 0.137 ...
##  $ worst.compactness      : num  0.666 0.187 0.424 0.866 0.205 ...
##  $ worst.concavity        : num  0.712 0.242 0.45 0.687 0.4 ...
##  $ worst.concave.points   : num  0.265 0.186 0.243 0.258 0.163 ...
##  $ worst.symmetry         : num  0.46 0.275 0.361 0.664 0.236 ...
##  $ worst.fractal.dimension: num  0.1189 0.089 0.0876 0.173 0.0768 ...
##  $ Status                 : chr  "benign" "benign" "benign" "benign" ...
  1. Find the dimension of this data set
dim(dat)
## [1] 569  31
  1. Create new variables that are the differences between the worst recorded and mean values of each predictor (radius, texture, perimeter, etc) as new columns in the data
names(dat)
##  [1] "mean.radius"             "mean.texture"           
##  [3] "mean.perimeter"          "mean.area"              
##  [5] "mean.smoothness"         "mean.compactness"       
##  [7] "mean.concavity"          "mean.concave.points"    
##  [9] "mean.symmetry"           "mean.fractal.dimension" 
## [11] "radius.error"            "texture.error"          
## [13] "perimeter.error"         "area.error"             
## [15] "smoothness.error"        "compactness.error"      
## [17] "concavity.error"         "concave.points.error"   
## [19] "symmetry.error"          "fractal.dimension.error"
## [21] "worst.radius"            "worst.texture"          
## [23] "worst.perimeter"         "worst.area"             
## [25] "worst.smoothness"        "worst.compactness"      
## [27] "worst.concavity"         "worst.concave.points"   
## [29] "worst.symmetry"          "worst.fractal.dimension"
## [31] "Status"
dat[,'diff.radius'] <-  dat[,'worst.radius'] - dat[,'mean.radius']
  1. Create two subsets based on tumor status
dat_malignant <- dat[dat$Status=='malignant',]
dat_benign <- dat[dat$Status=='benign',]
  1. In each subset, compute summaries of the different mean variables
summary(dat_malignant[,1:9])
##   mean.radius      mean.texture   mean.perimeter     mean.area    
##  Min.   : 6.981   Min.   : 9.71   Min.   : 43.79   Min.   :143.5  
##  1st Qu.:11.080   1st Qu.:15.15   1st Qu.: 70.87   1st Qu.:378.2  
##  Median :12.200   Median :17.39   Median : 78.18   Median :458.4  
##  Mean   :12.147   Mean   :17.91   Mean   : 78.08   Mean   :462.8  
##  3rd Qu.:13.370   3rd Qu.:19.76   3rd Qu.: 86.10   3rd Qu.:551.1  
##  Max.   :17.850   Max.   :33.81   Max.   :114.60   Max.   :992.1  
##  mean.smoothness   mean.compactness  mean.concavity    mean.concave.points
##  Min.   :0.05263   Min.   :0.01938   Min.   :0.00000   Min.   :0.00000    
##  1st Qu.:0.08306   1st Qu.:0.05562   1st Qu.:0.02031   1st Qu.:0.01502    
##  Median :0.09076   Median :0.07529   Median :0.03709   Median :0.02344    
##  Mean   :0.09248   Mean   :0.08008   Mean   :0.04606   Mean   :0.02572    
##  3rd Qu.:0.10070   3rd Qu.:0.09755   3rd Qu.:0.05999   3rd Qu.:0.03251    
##  Max.   :0.16340   Max.   :0.22390   Max.   :0.41080   Max.   :0.08534    
##  mean.symmetry   
##  Min.   :0.1060  
##  1st Qu.:0.1580  
##  Median :0.1714  
##  Mean   :0.1742  
##  3rd Qu.:0.1890  
##  Max.   :0.2743
summary(dat_benign[,1:9])
##   mean.radius     mean.texture   mean.perimeter     mean.area     
##  Min.   :10.95   Min.   :10.38   Min.   : 71.90   Min.   : 361.6  
##  1st Qu.:15.07   1st Qu.:19.33   1st Qu.: 98.75   1st Qu.: 705.3  
##  Median :17.32   Median :21.46   Median :114.20   Median : 932.0  
##  Mean   :17.46   Mean   :21.60   Mean   :115.37   Mean   : 978.4  
##  3rd Qu.:19.59   3rd Qu.:23.77   3rd Qu.:129.93   3rd Qu.:1203.8  
##  Max.   :28.11   Max.   :39.28   Max.   :188.50   Max.   :2501.0  
##  mean.smoothness   mean.compactness  mean.concavity    mean.concave.points
##  Min.   :0.07371   Min.   :0.04605   Min.   :0.02398   Min.   :0.02031    
##  1st Qu.:0.09401   1st Qu.:0.10960   1st Qu.:0.10952   1st Qu.:0.06462    
##  Median :0.10220   Median :0.13235   Median :0.15135   Median :0.08628    
##  Mean   :0.10290   Mean   :0.14519   Mean   :0.16077   Mean   :0.08799    
##  3rd Qu.:0.11092   3rd Qu.:0.17240   3rd Qu.:0.20305   3rd Qu.:0.10318    
##  Max.   :0.14470   Max.   :0.34540   Max.   :0.42680   Max.   :0.20120    
##  mean.symmetry   
##  Min.   :0.1308  
##  1st Qu.:0.1741  
##  Median :0.1899  
##  Mean   :0.1929  
##  3rd Qu.:0.2099  
##  Max.   :0.3040
library(stringr)

mean_vars = str_detect(names(dat_malignant),'mean') # Gets column names with "mean" in it (what structure is this?)
summary(dat_malignant[,mean_vars])
##   mean.radius      mean.texture   mean.perimeter     mean.area    
##  Min.   : 6.981   Min.   : 9.71   Min.   : 43.79   Min.   :143.5  
##  1st Qu.:11.080   1st Qu.:15.15   1st Qu.: 70.87   1st Qu.:378.2  
##  Median :12.200   Median :17.39   Median : 78.18   Median :458.4  
##  Mean   :12.147   Mean   :17.91   Mean   : 78.08   Mean   :462.8  
##  3rd Qu.:13.370   3rd Qu.:19.76   3rd Qu.: 86.10   3rd Qu.:551.1  
##  Max.   :17.850   Max.   :33.81   Max.   :114.60   Max.   :992.1  
##  mean.smoothness   mean.compactness  mean.concavity    mean.concave.points
##  Min.   :0.05263   Min.   :0.01938   Min.   :0.00000   Min.   :0.00000    
##  1st Qu.:0.08306   1st Qu.:0.05562   1st Qu.:0.02031   1st Qu.:0.01502    
##  Median :0.09076   Median :0.07529   Median :0.03709   Median :0.02344    
##  Mean   :0.09248   Mean   :0.08008   Mean   :0.04606   Mean   :0.02572    
##  3rd Qu.:0.10070   3rd Qu.:0.09755   3rd Qu.:0.05999   3rd Qu.:0.03251    
##  Max.   :0.16340   Max.   :0.22390   Max.   :0.41080   Max.   :0.08534    
##  mean.symmetry    mean.fractal.dimension
##  Min.   :0.1060   Min.   :0.05185       
##  1st Qu.:0.1580   1st Qu.:0.05853       
##  Median :0.1714   Median :0.06154       
##  Mean   :0.1742   Mean   :0.06287       
##  3rd Qu.:0.1890   3rd Qu.:0.06576       
##  Max.   :0.2743   Max.   :0.09575

Notes

  1. If you have SAS or SPSS data sets then you can use the haven package to load them into R. You may have to install it.