---
title: "Get BLUPs combining all trial data"
author: "Marnin Wolfe"
date: "2021-Aug-09"
output: 
  workflowr::wflow_html:
    toc: true
editor_options:
  chunk_output_type: inline
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = F, 
                      tidy='styler', tidy.opts=list(strict=FALSE,width.cutoff=100), highlight=TRUE)
```

# Previous step 

1. [Prepare training dataset](01-cleanTPdata.html): Download data from DB, "Clean" and format DB data.

# Get multi-trial BLUPs from raw data (two-stage)

Two-stage procedure: 

1. Fit mixed-model to multi-trial dataset and extract BLUPs, de-regressed BLUPs and weights. Include two rounds of outlier removal.
2. Genomic prediction with drg-BLUPs from multi-trial analysis as input. 

**Work below represents Stage 1 of the Two-stage procedure.**

To fit the mixed-model I used last year, I am again resorting to `asreml`. I fit random effects for rep and block only where complete and incomplete blocks, respectively are indicated in the trial design variables. `sommer` should be able to fit the same model via the `at()` function, _but_ I am having trouble with it's memory intnsity _and_ `sommer` is much slower even without a dense covariance (i.e. a kinship), compared to `lme4::lmer()` or `asreml()`. 

To use `asreml` I require to access the license available only on `cbsurobbins.biohpc.cornell.edu`. 

This is the only step like this in the pipeline.

`cbsurobbins` is using a SLURM job scheduler now. According to instructions, start an interactive `bash` shell with requested amount of recources, as follows:

```{bash, eval=F}
screen;
cd ~/IITA_2021GS/;
salloc -n 8 --mem=60G --time=06:00:00;
# salloc: Granted job allocation 833
# salloc: Waiting for resource configuration
# salloc: Nodes cbsurobbins are ready for job
R;
```

## Set-up training datasets

```{r, message=F}
library(tidyverse); library(magrittr);
library(genomicMateSelectR)
dbdata<-readRDS(here::here("output","IITA_ExptDesignsDetected_2021Aug08.rds"))
traits<-c("MCMDS","DM","PLTHT","BRNHT1","BRLVLS","HI",
          "logDYLD", # <-- logDYLD now included. 
          "logFYLD","logTOPYLD","logRTNO","TCHART","LCHROMO","ACHROMO","BCHROMO")

# **Nest by trait.** Need to restructure the data from per-trial by regrouping by trait. 
dbdata<-nestDesignsDetectedByTraits(dbdata,traits)
```

```{r}
dbdata %>% mutate(N_blups=map_dbl(MultiTrialTraitData,nrow)) %>% rmarkdown::paged_table()
```
```{r asreml version}
dbdata %<>%
  mutate(fixedFormula=ifelse(Trait %in% c("logDYLD","logFYLD","logRTNO","logTOPYLD"),
                             "Value ~ yearInLoc + PropNOHAV","Value ~ yearInLoc"),
         randFormula=paste0("~idv(GID) + idv(trialInLocYr) + at(CompleteBlocks,'Yes'):repInTrial ",
                            "+ at(IncompleteBlocks,'Yes'):blockInRep"))
dbdata %>%
  mutate(Nobs=map_dbl(MultiTrialTraitData,nrow)) %>%
  select(Trait,Nobs,fixedFormula,randFormula) %>%
  rmarkdown::paged_table()
```

## Function to run asreml
Includes rounds of outlier removal and re-fitting.
```{r, eval=F}
fitASfunc<-function(fixedFormula,randFormula,MultiTrialTraitData,...){
  # test arguments for function
  # ----------------------
  # MultiTrialTraitData<-dbdata$MultiTrialTraitData[[7]]
  # #Trait<-dbdata$Trait[[7]]
  # fixedFormula<-dbdata$fixedFormula[[7]]
  # randFormula<-dbdata$randFormula[[7]]
  #test<-fitASfunc(fixedFormula,randFormula,MultiTrialTraitData)
  # ----------------------
  MultiTrialTraitData %<>%
    mutate(across(c(GID,yearInLoc,
                    CompleteBlocks,
                    IncompleteBlocks,
                    trialInLocYr,
                    repInTrial,
                    blockInRep),as.factor)) %>% 
    droplevels
  
  require(asreml); 
  fixedFormula<-as.formula(fixedFormula)
  randFormula<-as.formula(randFormula)
  # fit asreml 
  out<-asreml(fixed = fixedFormula,
              random = randFormula,
              data = MultiTrialTraitData, 
              maxiter = 40, workspace=1000e6, 
              na.method.X="omit")
  #### extract residuals - Round 1
  
  outliers1<-which(abs(scale(out$residuals))>3.3)
  
  if(length(outliers1)>0){
    
    x<-MultiTrialTraitData[-outliers1,]
    # re-fit
    out<-asreml(fixed = fixedFormula,
                random = randFormula,
                data = x, 
                maxiter = 40, workspace=1000e6, 
                na.method.X="omit")
    #### extract residuals - Round 2
    outliers2<-which(abs(scale(out$residuals))>3.3)
    if(length(outliers2)>0){
      #### remove outliers
      x<-x[-outliers2,]
      # final re-fit
      out<-asreml(fixed = fixedFormula,
                  random = randFormula,
                  data = x, maxiter = 40,workspace=1000e6, 
                  na.method.X="omit")
    }
  }
  if(length(outliers1)==0){ outliers1<-NULL }
  if(length(outliers2)==0){ outliers2<-NULL }
  
  ll<-summary(out,all=T)$loglik
  varcomp<-summary(out,all=T)$varcomp
  Vg<-varcomp["GID!GID.var","component"]
  Ve<-varcomp["R!variance","component"]
  H2=Vg/(Vg+Ve)
  blups<-summary(out,all=T)$coef.random %>%
    as.data.frame %>%
    rownames_to_column(var = "GID") %>%
    dplyr::select(GID,solution,`std error`) %>%
    filter(grepl("GID",GID)) %>%
    rename(BLUP=solution) %>%
    mutate(GID=gsub("GID_","",GID),
           PEV=`std error`^2, # asreml specific
           REL=1-(PEV/Vg), # Reliability
           drgBLUP=BLUP/REL, # deregressed BLUP
           WT=(1-H2)/((0.1 + (1-REL)/REL)*H2)) # weight for use in Stage 2
  out<-tibble(loglik=ll,Vg,Ve,H2,
              blups=list(blups),
              varcomp=list(varcomp),
              outliers1=list(outliers1),
              outliers2=list(outliers2))
  gc()
  return(out) }
```

## Run asreml

Ran in small chunks. Still learning SLURM scheduler used on server.
```{r, eval=F}
library(asreml)

require(furrr); plan(multisession, workers = 4)
options(future.globals.maxSize=+Inf); options(future.rng.onMisuse="ignore")
test<-dbdata %>%
  slice(1:4) %>% 
  mutate(fitAS=future_pmap(.,fitASfunc))
saveRDS(test,file=here::here("output","test_2021Aug09.rds"))
plan(sequential)
rm(test); gc()

require(furrr); plan(multisession, workers = 5)
options(future.globals.maxSize=+Inf); options(future.rng.onMisuse="ignore")
test1<-dbdata %>%
  slice(5:9) %>% 
  mutate(fitAS=future_pmap(.,fitASfunc))
plan(sequential)
saveRDS(test1,file=here::here("output","test1_2021Aug09.rds"))
rm(test1); gc();

require(furrr); plan(multisession, workers = 5)
options(future.globals.maxSize=+Inf); options(future.rng.onMisuse="ignore")
test2<-dbdata %>%
  slice(10:14) %>% 
  mutate(fitAS=future_pmap(.,fitASfunc))
plan(sequential)
saveRDS(test2,file=here::here("output","test2_2021Aug09.rds"))

dbdata<-readRDS(here::here("output","test_2021Aug09.rds")) %>% 
  bind_rows(readRDS(here::here("output","test1_2021Aug09.rds"))) %>% 
  bind_rows(readRDS(here::here("output","test2_2021Aug09.rds"))) %>% 
  select(-fixedFormula,-randFormula,-MultiTrialTraitData)
dbdata %<>%
  unnest(fitAS)
```

## Output file
```{r, eval=F}
saveRDS(dbdata,file=here::here("output","IITA_blupsForModelTraining_twostage_asreml_2021Aug09.rds"))
```

# Results

See [Results](07-Results.html): Home for plots and summary tables.

# Next step

3. [Validate the pedigree obtained from cassavabase](03-validatePedigree.html): Before setting up a cross-validation scheme for predictions that depend on a correct pedigree, add a basic verification step to the pipeline. Not trying to fill unknown or otherwise correct the pedigree. Assess evidence that relationship is correct, remove if incorrect.

# sommer version attempt

Set-up the singularity shell and R environment
```{bash, eval=F}
# 1) start a screen shell 
screen; # or screen -r if re-attaching...
# 2) start the singularity Linux shell inside that
#singularity shell /workdir/$USER/rocker.sif; 
singularity pull rocker.sif docker://rocker/tidyverse:latest;
singularity shell ~/rocker.sif; 
#singularity shell ~/rocker2.sif; 
# Project directory, so R will use as working dir.
cd /home/mw489/IITA_2021GS/;
# 3) Start R
R
# libPath<-"/home/mw489/R/x86_64-pc-linux-gnu-library/4.1"
# withr::with_libpaths(new=libPath, devtools::install_github("wolfemd/genomicMateSelectR", ref = 'master'))
```

```{r, message=F}
# library(tidyverse); library(magrittr);
# library(genomicMateSelectR)
# dbdata<-readRDS(here::here("output","IITA_ExptDesignsDetected_2021Aug08.rds"))
# traits<-c("MCMDS","DM","PLTHT","BRNHT1","BRLVLS","HI",
#           "logDYLD", # <-- logDYLD now included. 
#           "logFYLD","logTOPYLD","logRTNO","TCHART","LCHROMO","ACHROMO","BCHROMO")
# 
# # **Nest by trait.** Need to restructure the data from per-trial by regrouping by trait. 
# dbdata<-nestDesignsDetectedByTraits(dbdata,traits)
```

```{r sommer version}
# RhpcBLASctl::blas_set_num_threads(56)
# 
# MultiTrialTraitData<-dbdata$MultiTrialTraitData[[2]]
# fixedFormula<-"Value ~ yearInLoc"
# randFormula<-paste0("~vs(GID) + vs(trialInLocYr) + vs(at(CompleteBlocks,'Yes'),repInTrial) + vs(at(IncompleteBlocks,'Yes'),blockInRep)")
# library(sommer)
# fit <- sommer::mmer(fixed = as.formula(fixedFormula),
#                     random = as.formula(randFormula),
#                     weights = WT,
#                     data=MultiTrialTraitData,
#                     date.warning = F,
#                     getPEV = F)
# MultiTrialTraitData %>% distinct(GID)
# Error: cannot allocate vector of size 537.8 Gb
```