# Workshop data directory (kept for reference only; the full path is spelled
# out in the call below).
# dir <- "/varidata/researchtemp/hpctmp/r_basics_2024/"

# Read the cytomegalovirus CSV into a tibble. readr guesses the column types
# and prints the column specification it settled on.
cyto_data <- read_csv(
  file = "/varidata/researchtemp/hpctmp/r_basics_2024/cytomegalovirus.csv"
)
## Rows: 64 Columns: 26
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): sex, race, diagnosis
## dbl (23): ID, age, diagnosis.type, time.to.transplant, prior.radiation, prio...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Quick structural inspection of the imported data.
# View(cyto_data)  # interactive viewer; left disabled here
# Class vector of the object returned by read_csv().
class(cyto_data)
## [1] "spec_tbl_df" "tbl_df" "tbl" "data.frame"
# Dimensions as (rows, columns).
dim(cyto_data)
## [1] 64 26
# Row count only.
nrow(cyto_data)
## [1] 64
# Column count only.
ncol(cyto_data)
## [1] 26
# glimpse(cyto_data)  # alternative compact overview; left disabled here
# Per-column overview: name, type, length and the first few values.
str(cyto_data)
## spc_tbl_ [64 × 26] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ ID : num [1:64] 1 2 3 4 5 6 7 8 9 10 ...
## $ age : num [1:64] 61 62 63 33 54 55 67 51 44 59 ...
## $ sex : chr [1:64] "1" "1" "0" "0" ...
## $ race : chr [1:64] "0" "1" "1" "1" ...
## $ diagnosis : chr [1:64] "acute myeloid leukemia" "non-Hodgkin lymphoma" "non-Hodgkin lymphoma" "Hodgkin lymphoma" ...
## $ diagnosis.type : num [1:64] 1 0 0 0 0 1 1 1 0 0 ...
## $ time.to.transplant: num [1:64] 5.16 79.05 35.58 33.02 11.4 ...
## $ prior.radiation : num [1:64] 0 1 0 1 0 0 0 0 1 0 ...
## $ prior.chemo : num [1:64] 2 3 4 4 5 0 2 0 3 2 ...
## $ prior.transplant : num [1:64] 0 0 0 0 0 0 0 1 1 0 ...
## $ recipient.cmv : num [1:64] 1 0 1 1 1 1 1 1 1 0 ...
## $ donor.cmv : num [1:64] 0 0 1 0 1 1 1 1 1 0 ...
## $ donor.sex : num [1:64] 0 1 0 1 0 1 1 0 1 0 ...
## $ TNC.dose : num [1:64] 18.31 4.26 8.09 21.02 14.7 ...
## $ CD34.dose : num [1:64] 2.29 2.04 6.97 6.09 2.36 6.91 3.66 3.9 7 2.52 ...
## $ CD3.dose : num [1:64] 3.21 NA 2.19 4.87 6.55 2.53 3.66 7.27 2.59 2.52 ...
## $ CD8.dose : num [1:64] 0.95 NA 0.59 2.32 2.4 0.86 0.17 1.95 NA 1.22 ...
## $ TBI.dose : num [1:64] 200 200 200 200 400 200 400 400 200 400 ...
## $ C1/C2 : num [1:64] 0 1 0 0 0 0 0 0 1 1 ...
## $ aKIRs : num [1:64] 1 5 3 2 6 2 1 2 2 4 ...
## $ cmv : num [1:64] 1 0 0 0 0 1 0 0 1 0 ...
## $ time.to.cmv : num [1:64] 3.91 65.12 3.75 48.49 4.37 ...
## $ agvhd : num [1:64] 1 0 0 1 1 1 0 0 1 0 ...
## $ time.to.agvhd : num [1:64] 3.55 65.12 3.75 28.55 2.79 ...
## $ cgvhd : num [1:64] 0 0 0 1 0 0 0 0 1 1 ...
## $ time.to.cgvhd : num [1:64] 6.28 65.12 3.75 10.45 4.37 ...
## - attr(*, "spec")=
## .. cols(
## .. ID = col_double(),
## .. age = col_double(),
## .. sex = col_character(),
## .. race = col_character(),
## .. diagnosis = col_character(),
## .. diagnosis.type = col_double(),
## .. time.to.transplant = col_double(),
## .. prior.radiation = col_double(),
## .. prior.chemo = col_double(),
## .. prior.transplant = col_double(),
## .. recipient.cmv = col_double(),
## .. donor.cmv = col_double(),
## .. donor.sex = col_double(),
## .. TNC.dose = col_double(),
## .. CD34.dose = col_double(),
## .. CD3.dose = col_double(),
## .. CD8.dose = col_double(),
## .. TBI.dose = col_double(),
## .. `C1/C2` = col_double(),
## .. aKIRs = col_double(),
## .. cmv = col_double(),
## .. time.to.cmv = col_double(),
## .. agvhd = col_double(),
## .. time.to.agvhd = col_double(),
## .. cgvhd = col_double(),
## .. time.to.cgvhd = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# Nominal variable: diagnosis is an unordered text label, so a frequency
# table per distinct value is the natural summary.
table(cyto_data[["diagnosis"]])
##
## acute lymphoblastic leukemia acute myeloid leukemia
## 1 12
## aplastic anemia chronic lymphocytic leukemia
## 1 5
## chronic myeloid leukemia congenital anemia
## 4 1
## Hodgkin lymphoma multiple myelomas
## 3 7
## myelodysplastic syndrome myelofibrosis
## 9 4
## myeloproliferative disorder non-Hodgkin lymphoma
## 1 11
## renal cell carcinoma
## 4
# Ordinal variable: prior.chemo holds whole-number values with a natural order.
table(cyto_data[["prior.chemo"]])
##
## 0 1 2 3 4 5 7 8
## 10 15 17 8 6 5 1 1
# Store a factor copy with every level from 0 to 8 declared in increasing
# order, so the unobserved value 6 still exists as a level.
cyto_data[["prior.chemo_"]] <- factor(cyto_data[["prior.chemo"]], levels = 0:8)
class(cyto_data[["prior.chemo_"]]) # factor
## [1] "factor"
class(cyto_data[["prior.chemo"]]) # numeric
## [1] "numeric"
# Numeric summary of age. The maximum of 1000 in the output below is not a
# plausible age and pulls the mean above the 3rd quartile; treat it as a
# data-entry problem to investigate.
summary(cyto_data[["age"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 29.00 46.00 55.00 67.17 61.00 1000.00
# Tabulate prior.radiation, listing NA as its own category when present:
# one patient has no prior.radiation value recorded.
table(cyto_data[["prior.radiation"]], useNA = "ifany")
##
## 0 1 <NA>
## 52 11 1
# Attach DataExplorer for its missing-data overview plot.
library("DataExplorer")
# Draw the missing-value overview for the columns of cyto_data.
plot_missing(cyto_data)
# Look at the age distribution again.
summary(cyto_data[["age"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 29.00 46.00 55.00 67.17 61.00 1000.00
# Pipe operator %>%: RStudio keyboard shortcut is Cmd + Shift + M (Mac)
# Scatter plot of CD34 dose against CD8 dose, one point per row, coloured by
# sex. Rows with a missing value in either dose are dropped with a warning.
ggplot(
  data = cyto_data,
  mapping = aes(x = CD8.dose, y = CD34.dose, color = sex)
) +
  geom_point()
## Warning: Removed 13 rows containing missing values (`geom_point()`).
# Bar chart of how many rows fall in each prior.chemo_ factor level, using
# only rows whose level is one of 0 through 8.
ggplot(
  data = filter(cyto_data, prior.chemo_ %in% 0:8),
  mapping = aes(x = prior.chemo_)
) +
  geom_bar()
# Recount prior.radiation (NA shown as its own category) before filtering on
# it in the next plot.
table(cyto_data[["prior.radiation"]], useNA = "ifany")
##
## 0 1 <NA>
## 52 11 1
# Scatter plot of CD3 dose against TNC dose, coloured by sex, with one panel
# per prior.radiation value and a straight-line fit for each sex.
cyto_data %>%
  # Keep only rows with a recorded prior.radiation value (drops the NA row).
  filter(prior.radiation %in% c(0, 1)) %>%
  ggplot(aes(x = TNC.dose, y = CD3.dose, color = sex)) +
  geom_point() +
  facet_grid(. ~ prior.radiation) +
  # Linear fit without the confidence band. Spell out FALSE: `F` is an
  # ordinary variable that can be reassigned.
  geom_smooth(method = "lm", se = FALSE) +
  ggtitle("Dosage by Radiation Exposure") +
  ylab("CD3 Dose") +
  xlab("TNC Dose") +
  theme_bw()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 3 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 3 rows containing missing values (`geom_point()`).