Understanding your data

Reading & viewing your data

# Read the cytomegalovirus CSV from the r_basics_2024 directory
# (/varidata/researchtemp/hpctmp/r_basics_2024/) into a tibble.
cyto_data <- read_csv(
  "/varidata/researchtemp/hpctmp/r_basics_2024/cytomegalovirus.csv"
)
## Rows: 64 Columns: 26
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (3): sex, race, diagnosis
## dbl (23): ID, age, diagnosis.type, time.to.transplant, prior.radiation, prio...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# View(cyto_data)

What kind of object do we have?

# Class vector of the loaded object: tibble classes layered on data.frame.
class(cyto_data)
## [1] "spec_tbl_df" "tbl_df"      "tbl"         "data.frame"
# Dimensions as (rows, columns).
dim(cyto_data)
## [1] 64 26
# Row count on its own.
nrow(cyto_data)
## [1] 64
# Column count on its own.
ncol(cyto_data)
## [1] 26
# Disabled alternative to str() below.
#glimpse(cyto_data)
# Per-column listing: name, storage type and the first few values, followed by
# the column specification recorded when the file was read.
str(cyto_data)
## spc_tbl_ [64 × 26] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ ID                : num [1:64] 1 2 3 4 5 6 7 8 9 10 ...
##  $ age               : num [1:64] 61 62 63 33 54 55 67 51 44 59 ...
##  $ sex               : chr [1:64] "1" "1" "0" "0" ...
##  $ race              : chr [1:64] "0" "1" "1" "1" ...
##  $ diagnosis         : chr [1:64] "acute myeloid leukemia" "non-Hodgkin lymphoma" "non-Hodgkin lymphoma" "Hodgkin lymphoma" ...
##  $ diagnosis.type    : num [1:64] 1 0 0 0 0 1 1 1 0 0 ...
##  $ time.to.transplant: num [1:64] 5.16 79.05 35.58 33.02 11.4 ...
##  $ prior.radiation   : num [1:64] 0 1 0 1 0 0 0 0 1 0 ...
##  $ prior.chemo       : num [1:64] 2 3 4 4 5 0 2 0 3 2 ...
##  $ prior.transplant  : num [1:64] 0 0 0 0 0 0 0 1 1 0 ...
##  $ recipient.cmv     : num [1:64] 1 0 1 1 1 1 1 1 1 0 ...
##  $ donor.cmv         : num [1:64] 0 0 1 0 1 1 1 1 1 0 ...
##  $ donor.sex         : num [1:64] 0 1 0 1 0 1 1 0 1 0 ...
##  $ TNC.dose          : num [1:64] 18.31 4.26 8.09 21.02 14.7 ...
##  $ CD34.dose         : num [1:64] 2.29 2.04 6.97 6.09 2.36 6.91 3.66 3.9 7 2.52 ...
##  $ CD3.dose          : num [1:64] 3.21 NA 2.19 4.87 6.55 2.53 3.66 7.27 2.59 2.52 ...
##  $ CD8.dose          : num [1:64] 0.95 NA 0.59 2.32 2.4 0.86 0.17 1.95 NA 1.22 ...
##  $ TBI.dose          : num [1:64] 200 200 200 200 400 200 400 400 200 400 ...
##  $ C1/C2             : num [1:64] 0 1 0 0 0 0 0 0 1 1 ...
##  $ aKIRs             : num [1:64] 1 5 3 2 6 2 1 2 2 4 ...
##  $ cmv               : num [1:64] 1 0 0 0 0 1 0 0 1 0 ...
##  $ time.to.cmv       : num [1:64] 3.91 65.12 3.75 48.49 4.37 ...
##  $ agvhd             : num [1:64] 1 0 0 1 1 1 0 0 1 0 ...
##  $ time.to.agvhd     : num [1:64] 3.55 65.12 3.75 28.55 2.79 ...
##  $ cgvhd             : num [1:64] 0 0 0 1 0 0 0 0 1 1 ...
##  $ time.to.cgvhd     : num [1:64] 6.28 65.12 3.75 10.45 4.37 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   ID = col_double(),
##   ..   age = col_double(),
##   ..   sex = col_character(),
##   ..   race = col_character(),
##   ..   diagnosis = col_character(),
##   ..   diagnosis.type = col_double(),
##   ..   time.to.transplant = col_double(),
##   ..   prior.radiation = col_double(),
##   ..   prior.chemo = col_double(),
##   ..   prior.transplant = col_double(),
##   ..   recipient.cmv = col_double(),
##   ..   donor.cmv = col_double(),
##   ..   donor.sex = col_double(),
##   ..   TNC.dose = col_double(),
##   ..   CD34.dose = col_double(),
##   ..   CD3.dose = col_double(),
##   ..   CD8.dose = col_double(),
##   ..   TBI.dose = col_double(),
##   ..   `C1/C2` = col_double(),
##   ..   aKIRs = col_double(),
##   ..   cmv = col_double(),
##   ..   time.to.cmv = col_double(),
##   ..   agvhd = col_double(),
##   ..   time.to.agvhd = col_double(),
##   ..   cgvhd = col_double(),
##   ..   time.to.cgvhd = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

Categorical Variables

# nominal variable: count how many rows carry each distinct diagnosis label
table(cyto_data$diagnosis)
## 
## acute lymphoblastic leukemia       acute myeloid leukemia 
##                            1                           12 
##              aplastic anemia chronic lymphocytic leukemia 
##                            1                            5 
##     chronic myeloid leukemia            congenital anemia 
##                            4                            1 
##             Hodgkin lymphoma            multiple myelomas 
##                            3                            7 
##     myelodysplastic syndrome                myelofibrosis 
##                            9                            4 
##  myeloproliferative disorder         non-Hodgkin lymphoma 
##                            1                           11 
##         renal cell carcinoma 
##                            4
# ordinal variable: count how many rows fall at each prior.chemo value
table(cyto_data$prior.chemo)
## 
##  0  1  2  3  4  5  7  8 
## 10 15 17  8  6  5  1  1
# Store a factor copy of prior.chemo with the levels fixed to 0 through 8, so
# level 6 exists even though the table above shows no row with that value.
# NOTE(review): factor() is called without ordered = TRUE, so this is an
# unordered factor — confirm that is intended for an ordinal variable.
cyto_data$prior.chemo_ <-  factor(cyto_data$prior.chemo, levels = 0:8) # levels listed in increasing order

# The new column is a factor ...
class(cyto_data$prior.chemo_) # factor
## [1] "factor"
# ... while the original column is left untouched as numeric.
class(cyto_data$prior.chemo) # numeric
## [1] "numeric"

Continuous Variable

# Minimum, quartiles, median, mean and maximum of age.
summary(cyto_data$age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   29.00   46.00   55.00   67.17   61.00 1000.00

Missing data & Outlier Detection

# One row has no prior.radiation value; useNA = "ifany" makes table() report it.
table(cyto_data$prior.radiation, useNA = "ifany")
## 
##    0    1 <NA> 
##   52   11    1
# Attach DataExplorer so plot_missing() is available on the next line.
library(DataExplorer)
# NOTE(review): presumably plots the share of missing values per column —
# confirm against the DataExplorer documentation.
plot_missing(cyto_data)

# Re-check age: a maximum of 1000 against a 3rd quartile of 61 points to an
# implausible value that should be investigated as an outlier.
summary(cyto_data$age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   29.00   46.00   55.00   67.17   61.00 1000.00

The pipe operator %>% — keyboard shortcut: Cmd + Shift + M (Mac)

# Scatter plot of CD34 dose against CD8 dose, with points coloured by sex.
# The data frame is piped in as ggplot()'s first argument.
cyto_data %>%
  ggplot(aes(x = CD8.dose, y = CD34.dose, color = sex)) +
  geom_point()
## Warning: Removed 13 rows containing missing values (`geom_point()`).

# Bar chart of the prior.chemo_ factor. The filter keeps rows whose value is
# one of 0 through 8; %in% yields FALSE for NA, so rows with a missing value
# are dropped before plotting.
cyto_data %>%
  filter(prior.chemo_ %in% 0:8) %>%
  ggplot(aes(x = prior.chemo_)) +
  geom_bar()

# Recount prior.radiation with NA included before filtering on it for the plot.
table(cyto_data$prior.radiation, useNA = "ifany")
## 
##    0    1 <NA> 
##   52   11    1
# CD3 dose against TNC dose, coloured by sex, with one panel per
# prior.radiation value and a straight-line (lm) fit per colour group.
# The filter keeps only rows where prior.radiation is 0 or 1, which removes the
# row with a missing value so it does not get a facet of its own.
cyto_data %>%
  filter(prior.radiation %in% c(0, 1)) %>%
  ggplot(aes(x = TNC.dose, y = CD3.dose, color = sex)) +
  geom_point() +
  # se = FALSE hides the confidence ribbon; spelled out because F can be
  # reassigned, whereas FALSE cannot.
  geom_smooth(method = "lm", se = FALSE) +
  facet_grid(. ~ prior.radiation) +
  labs(
    title = "Dosage by Radiation Exposure",
    x = "TNC Dose",
    y = "CD3 Dose"
  ) +
  theme_bw()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 3 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 3 rows containing missing values (`geom_point()`).