--- title: "Get BLUPs combining all trial data" author: "Marnin Wolfe" date: "2021-Aug-09" output: workflowr::wflow_html: toc: true editor_options: chunk_output_type: inline --- ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = F, tidy='styler', tidy.opts=list(strict=FALSE,width.cutoff=100), highlight=TRUE) ``` # Previous step 1. [Prepare training dataset](01-cleanTPdata.html): Download data from DB, "Clean" and format DB data. # Get multi-trial BLUPs from raw data (two-stage) Two-stage procedure: 1. Fit mixed-model to multi-trial dataset and extract BLUPs, de-regressed BLUPs and weights. Include two rounds of outlier removal. 2. Genomic prediction with drg-BLUPs from multi-trial analysis as input. **Work below represents Stage 1 of the Two-stage procedure.** To fit the mixed-model I used last year, I am again resorting to `asreml`. I fit random effects for rep and block only where complete and incomplete blocks, respectively are indicated in the trial design variables. `sommer` should be able to fit the same model via the `at()` function, _but_ I am having trouble with it's memory intnsity _and_ `sommer` is much slower even without a dense covariance (i.e. a kinship), compared to `lme4::lmer()` or `asreml()`. To use `asreml` I require to access the license available only on `cbsurobbins.biohpc.cornell.edu`. This is the only step like this in the pipeline. `cbsurobbins` is using a SLURM job scheduler now. According to instructions, start an interactive `bash` shell with requested amount of recources, as follows: ```{bash, eval=F} screen; cd ~/IITA_2021GS/; salloc -n 8 --mem=60G --time=06:00:00; # salloc: Granted job allocation 833 # salloc: Waiting for resource configuration # salloc: Nodes cbsurobbins are ready for job R; ``` ## Set-up training datasets ```{r, message=F} library(tidyverse); library(magrittr); library(genomicMateSelectR) dbdata<-readRDS(here::here("output","IITA_ExptDesignsDetected_2021Aug08.rds")) traits<-c("MCMDS","DM","PLTHT","BRNHT1","BRLVLS","HI", "logDYLD", # <-- logDYLD now included. "logFYLD","logTOPYLD","logRTNO","TCHART","LCHROMO","ACHROMO","BCHROMO") # **Nest by trait.** Need to restructure the data from per-trial by regrouping by trait. dbdata<-nestDesignsDetectedByTraits(dbdata,traits) ``` ```{r} dbdata %>% mutate(N_blups=map_dbl(MultiTrialTraitData,nrow)) %>% rmarkdown::paged_table() ``` ```{r asreml version} dbdata %<>% mutate(fixedFormula=ifelse(Trait %in% c("logDYLD","logFYLD","logRTNO","logTOPYLD"), "Value ~ yearInLoc + PropNOHAV","Value ~ yearInLoc"), randFormula=paste0("~idv(GID) + idv(trialInLocYr) + at(CompleteBlocks,'Yes'):repInTrial ", "+ at(IncompleteBlocks,'Yes'):blockInRep")) dbdata %>% mutate(Nobs=map_dbl(MultiTrialTraitData,nrow)) %>% select(Trait,Nobs,fixedFormula,randFormula) %>% rmarkdown::paged_table() ``` ## Function to run asreml Includes rounds of outlier removal and re-fitting. ```{r, eval=F} fitASfunc<-function(fixedFormula,randFormula,MultiTrialTraitData,...){ # test arguments for function # ---------------------- # MultiTrialTraitData<-dbdata$MultiTrialTraitData[[7]] # #Trait<-dbdata$Trait[[7]] # fixedFormula<-dbdata$fixedFormula[[7]] # randFormula<-dbdata$randFormula[[7]] #test<-fitASfunc(fixedFormula,randFormula,MultiTrialTraitData) # ---------------------- MultiTrialTraitData %<>% mutate(across(c(GID,yearInLoc, CompleteBlocks, IncompleteBlocks, trialInLocYr, repInTrial, blockInRep),as.factor)) %>% droplevels require(asreml); fixedFormula<-as.formula(fixedFormula) randFormula<-as.formula(randFormula) # fit asreml out<-asreml(fixed = fixedFormula, random = randFormula, data = MultiTrialTraitData, maxiter = 40, workspace=1000e6, na.method.X="omit") #### extract residuals - Round 1 outliers1<-which(abs(scale(out$residuals))>3.3) if(length(outliers1)>0){ x<-MultiTrialTraitData[-outliers1,] # re-fit out<-asreml(fixed = fixedFormula, random = randFormula, data = x, maxiter = 40, workspace=1000e6, na.method.X="omit") #### extract residuals - Round 2 outliers2<-which(abs(scale(out$residuals))>3.3) if(length(outliers2)>0){ #### remove outliers x<-x[-outliers2,] # final re-fit out<-asreml(fixed = fixedFormula, random = randFormula, data = x, maxiter = 40,workspace=1000e6, na.method.X="omit") } } if(length(outliers1)==0){ outliers1<-NULL } if(length(outliers2)==0){ outliers2<-NULL } ll<-summary(out,all=T)$loglik varcomp<-summary(out,all=T)$varcomp Vg<-varcomp["GID!GID.var","component"] Ve<-varcomp["R!variance","component"] H2=Vg/(Vg+Ve) blups<-summary(out,all=T)$coef.random %>% as.data.frame %>% rownames_to_column(var = "GID") %>% dplyr::select(GID,solution,`std error`) %>% filter(grepl("GID",GID)) %>% rename(BLUP=solution) %>% mutate(GID=gsub("GID_","",GID), PEV=`std error`^2, # asreml specific REL=1-(PEV/Vg), # Reliability drgBLUP=BLUP/REL, # deregressed BLUP WT=(1-H2)/((0.1 + (1-REL)/REL)*H2)) # weight for use in Stage 2 out<-tibble(loglik=ll,Vg,Ve,H2, blups=list(blups), varcomp=list(varcomp), outliers1=list(outliers1), outliers2=list(outliers2)) gc() return(out) } ``` ## Run asreml Ran in small chunks. Still learning SLURM scheduler used on server. ```{r, eval=F} library(asreml) require(furrr); plan(multisession, workers = 4) options(future.globals.maxSize=+Inf); options(future.rng.onMisuse="ignore") test<-dbdata %>% slice(1:4) %>% mutate(fitAS=future_pmap(.,fitASfunc)) saveRDS(test,file=here::here("output","test_2021Aug09.rds")) plan(sequential) rm(test); gc() require(furrr); plan(multisession, workers = 5) options(future.globals.maxSize=+Inf); options(future.rng.onMisuse="ignore") test1<-dbdata %>% slice(5:9) %>% mutate(fitAS=future_pmap(.,fitASfunc)) plan(sequential) saveRDS(test1,file=here::here("output","test1_2021Aug09.rds")) rm(test1); gc(); require(furrr); plan(multisession, workers = 5) options(future.globals.maxSize=+Inf); options(future.rng.onMisuse="ignore") test2<-dbdata %>% slice(10:14) %>% mutate(fitAS=future_pmap(.,fitASfunc)) plan(sequential) saveRDS(test2,file=here::here("output","test2_2021Aug09.rds")) dbdata<-readRDS(here::here("output","test_2021Aug09.rds")) %>% bind_rows(readRDS(here::here("output","test1_2021Aug09.rds"))) %>% bind_rows(readRDS(here::here("output","test2_2021Aug09.rds"))) %>% select(-fixedFormula,-randFormula,-MultiTrialTraitData) dbdata %<>% unnest(fitAS) ``` ## Output file ```{r, eval=F} saveRDS(dbdata,file=here::here("output","IITA_blupsForModelTraining_twostage_asreml_2021Aug09.rds")) ``` # Results See [Results](07-Results.html): Home for plots and summary tables. # Next step 3. [Validate the pedigree obtained from cassavabase](03-validatePedigree.html): Before setting up a cross-validation scheme for predictions that depend on a correct pedigree, add a basic verification step to the pipeline. Not trying to fill unknown or otherwise correct the pedigree. Assess evidence that relationship is correct, remove if incorrect. # sommer version attempt Set-up the singularity shell and R environment ```{bash, eval=F} # 1) start a screen shell screen; # or screen -r if re-attaching... # 2) start the singularity Linux shell inside that #singularity shell /workdir/$USER/rocker.sif; singularity pull rocker.sif docker://rocker/tidyverse:latest; singularity shell ~/rocker.sif; #singularity shell ~/rocker2.sif; # Project directory, so R will use as working dir. cd /home/mw489/IITA_2021GS/; # 3) Start R R # libPath<-"/home/mw489/R/x86_64-pc-linux-gnu-library/4.1" # withr::with_libpaths(new=libPath, devtools::install_github("wolfemd/genomicMateSelectR", ref = 'master')) ``` ```{r, message=F} # library(tidyverse); library(magrittr); # library(genomicMateSelectR) # dbdata<-readRDS(here::here("output","IITA_ExptDesignsDetected_2021Aug08.rds")) # traits<-c("MCMDS","DM","PLTHT","BRNHT1","BRLVLS","HI", # "logDYLD", # <-- logDYLD now included. # "logFYLD","logTOPYLD","logRTNO","TCHART","LCHROMO","ACHROMO","BCHROMO") # # # **Nest by trait.** Need to restructure the data from per-trial by regrouping by trait. # dbdata<-nestDesignsDetectedByTraits(dbdata,traits) ``` ```{r sommer version} # RhpcBLASctl::blas_set_num_threads(56) # # MultiTrialTraitData<-dbdata$MultiTrialTraitData[[2]] # fixedFormula<-"Value ~ yearInLoc" # randFormula<-paste0("~vs(GID) + vs(trialInLocYr) + vs(at(CompleteBlocks,'Yes'),repInTrial) + vs(at(IncompleteBlocks,'Yes'),blockInRep)") # library(sommer) # fit <- sommer::mmer(fixed = as.formula(fixedFormula), # random = as.formula(randFormula), # weights = WT, # data=MultiTrialTraitData, # date.warning = F, # getPEV = F) # MultiTrialTraitData %>% distinct(GID) # Error: cannot allocate vector of size 537.8 Gb ```