DT_wheat: wheat lines dataset

Description

Information from a collection of 599 historical CIMMYT wheat lines. The wheat data set is from CIMMYT's Global Wheat Program. Historically, this program has conducted numerous international trials across a wide variety of wheat-producing environments. The environments represented in these trials were grouped into four basic target sets of environments comprising four main agroclimatic regions previously defined and widely used by CIMMYT's Global Wheat Breeding Program. The phenotypic trait considered here was the average grain yield (GY) of the 599 wheat lines evaluated in each of these four mega-environments.

A pedigree tracing back many generations was available, and the Browse application of the International Crop Information System (ICIS), as described in (McLaren et al. 2000, 2005) was used for deriving the relationship matrix A among the 599 lines; it accounts for selection and inbreeding.

Wheat lines were recently genotyped using 1447 Diversity Array Technology (DArT) generated by Triticarte Pty. Ltd. (Canberra, Australia; http://www.triticarte.com.au). The DArT markers may take on two values, denoted by their presence or absence. Markers with a minor allele frequency lower than 0.05 were removed, and missing genotypes were imputed with samples from the marginal distribution of marker genotypes, that is, \(x_{ij}=Bernoulli(\hat p_j)\), where \(\hat p_j\) is the estimated allele frequency computed from the non-missing genotypes. The number of DArT MMs after edition was 1279.

Usage

data(DT_wheat)

Arguments

Format

Matrix Y contains the average grain yield, column 1: Grain yield for environment 1 and so on.

References

Covarrubias-Pazaran G (2016) Genome assisted prediction of quantitative traits using the R package sommer. PLoS ONE 11(6): doi:10.1371/journal.pone.0156744

Giovanny Covarrubias-Pazaran (2024). lme4breeding: enabling genetic evaluation in the age of genomic data. To be submitted to Bioinformatics.

Douglas Bates, Martin Maechler, Ben Bolker, Steve Walker (2015). Fitting Linear Mixed-Effects Models Using lme4. Journal of Statistical Software, 67(1), 1-48.

McLaren, C. G., L. Ramos, C. Lopez, and W. Eusebio. 2000. ``Applications of the geneaology manegment system.'' In International Crop Information System. Technical Development Manual, version VI, edited by McLaren, C. G., J.W. White and P.N. Fox. pp. 5.8-5.13. CIMMyT, Mexico: CIMMyT and IRRI.

Giovanny Covarrubias-Pazaran (2024). evola: a simple evolutionary algorithm for complex problems. To be submitted to Bioinformatics.

Gaynor, R. Chris, Gregor Gorjanc, and John M. Hickey. 2021. AlphaSimR: an R package for breeding program simulations. G3 Gene|Genomes|Genetics 11(2):jkaa017. https://doi.org/10.1093/g3journal/jkaa017.

Chen GK, Marjoram P, Wall JD (2009). Fast and Flexible Simulation of DNA Sequence Data. Genome Research, 19, 136-142. http://genome.cshlp.org/content/19/1/136.

Examples

Run this code


data(DT_wheat)
DT <- DT_wheat
GT <- apply(GT_wheat,2,as.numeric)
rownames(GT) <- rownames(GT_wheat)
DT <- data.frame(pheno=as.vector(DT),
                 env=as.factor(paste0("e", sort(rep(1:4,nrow(DT))))),
                 id=rep(rownames(DT),4))

K <- tcrossprod(GT-1)
m <- sum(diag(K))/nrow(K)
K <- K/m
K <- K + diag(1e-05, ncol(K), ncol(K))
K[1:4,1:4]
##
head(DT)

# \donttest{

############## sommer ###############
if(requireNamespace("sommer")){
library(sommer)
mix0 <- mmes(pheno~1,
             random = ~vsm(ism(id),Gu=K),
             rcov=~units,
             data=DT[which(DT$env == "e1"),])
summary(mix0)$varcomp
# if using mmes=TRUE provide Gu as inverse
Ki <- solve(K + diag(1e-4,ncol(K),ncol(K)))
Ki <- as(as(as( Ki,  "dMatrix"), "generalMatrix"), "CsparseMatrix")
attr(Ki, 'inverse')=TRUE

}

############## lme4breeding ###############

if(requireNamespace("lme4breeding")){
library(lme4breeding)
#### main effect model
system.time(
mix0 <- lmeb(pheno ~ (1|id),
                 relmat = list(id=K),
                 data=DT)
                 )
vc <- VarCorr(mix0); print(vc,comp=c("Variance"))
sigma(mix0)^2 # error variance
BLUP <- ranef(mix0, condVar=TRUE)
condVAR <- lapply(BLUP, function(x){attr(x, which="postVar")}) # take sqrt() for SEs

#### unstructured model
DT <- DT[with(DT, order(env, id)), ] # sort for rotation
system.time(
  mix1 <- lmeb(pheno ~ (0 + env | id),
                   relmat = list(id=K),
                   rotation = TRUE,
                   data=DT)
)
vc <- VarCorr(mix1); print(vc,comp=c("Variance"))
sigma(mix1)^2 # error variance

}

############## evola ###############

if(requireNamespace("evola")){
library(evola)
DT <- as.data.frame(DT_wheat)
DT$id <- rownames(DT) # IDs
DT$occ <- 1; DT$occ[1]=0 # to track occurrences
DT$dummy <- 1; DT$dummy[1]=0 # dummy trait

# if genomic
G <- K
# if pedigree
A <- A_wheat
A[1:4,1:4]
##Perform eigenvalue decomposition for clustering
##And select cluster 5 as target set to predict
pcNum=25
svdWheat <- svd(A, nu = pcNum, nv = pcNum)
PCWheat <- A %*% svdWheat$v
rownames(PCWheat) <- rownames(A)
DistWheat <- dist(PCWheat)
TreeWheat <- cutree(hclust(DistWheat), k = 5 )
plot(PCWheat[,1], PCWheat[,2], col = TreeWheat, 
     pch = as.character(TreeWheat), xlab = "pc1", ylab = "pc2")
vp <- rownames(PCWheat)[TreeWheat == 3]; length(vp)
tp <- setdiff(rownames(PCWheat),vp)

As <- A[tp,tp]
DT2 <- DT[rownames(As),]
DT2$cov <- apply(A[tp,vp],1,mean)

# we assign a weight to x'Dx of (60*pi)/180=1
res<-evolafit(formula=cbind(cov, occ)~id, dt= DT2, 
              # constraints: if sum is greater than this ignore 
              constraintsUB = c(Inf, 100), 
              # constraints: if sum is smaller than this ignore
              constraintsLB= c(-Inf, -Inf), 
              # weight the traits for the selection
              b = c(1,0), 
              # population parameters
              nCrosses = 100, nProgeny = 10, 
              recombGens=1, nChr=1, mutRateAllele=0,
              # coancestry parameters
              D=As, lambda= (60*pi)/180 , nQtlStart = 90, 
              # selection parameters
              propSelBetween = 0.5, propSelWithin =0.5, 
              nGenerations = 30, verbose = TRUE) 

Q <- pullQtlGeno(res$pop, simParam = res$simParam, trait=1); Q <- Q/2
best <- bestSol(res$pop)[,"fitness"]
sum(Q[best,]) # total # of inds selected
pareto(res)

}

################ end ##################

# }

Run the code above in your browser using DataLab