#!/usr/bin/env Rscript
bindir <- ""
treecns <- paste0(bindir,"treecns")
rert <- paste0(bindir,"rert")
gfmix.lnl <- paste0(bindir,"gfmix_lnl_align")
anfreq <- paste0(bindir,"an_freq")
charfreq.taxa.prog <- paste0(bindir,"charfreq-taxa")
#==============================================================================
FnameE <- function(fname){
  if(!file.exists(fname)) stop(paste(fname,"does not exist"))
  return(fname)
}
rinterleave <- function(filename,as.int=FALSE,nchar=4,sortname=FALSE){
  # IN: as.int - T => change to integer codes 0,...,19. >19 means missing data
  #     nchar - only needed if as.int=TRUE. nchar=20 for amino acid data
  #     sortnames - for integer names. Sort them
  # OUT:
  # names
  # seq - nsite x ntaxa matrix of characters.
  ##
  ## Note: integer values differ from rinterleavef which uses 1,...,20
  x <- scan(filename,quiet=TRUE,what=character(),sep="\n")
  ntaxa <- unlist(strsplit(x[1],split=" "))
  if(sum(ntaxa=="")>0){ # sometimes "" is obtained after strsplit
    ntaxa <- ntaxa[ntaxa!=""]
  }
  ntaxa <- as.numeric(ntaxa)[1]
  x <- x[-1]
  names <-  substring(x[1:ntaxa],first=1,last=10)
  x[1:ntaxa] <- substring(x[1:ntaxa],11)
  x <- gsub(" ","",x)
  x <- matrix(x,nrow=ntaxa)
  x <- apply(x,1,paste,collapse="")
  seq <- NULL
  for(i in 1:ntaxa){
    seqc <- unlist(strsplit(x[i], split=NULL))
    ## If nsite=2 the string N & A will be read in as NA
    if(length(seqc)==1)
      if(is.na(seqc)) seqc <- c("N","A")
    seq <- cbind(seq,seqc)
  }
  if(as.int){
    if(nchar==4) seq <- l2i(seq)
    if(nchar==20) seq <- l2ip(seq)
  }
  if(sortname){
    o <- order(names)
    seq <- seq[,o]; names <- names[o]
  }
  return(list(seq=seq, names=names))
}
r.singleline <- function(seqfile="tmp.infile", as.int=FALSE, nchar=4, nexus=F,
                         sortname=F){
  # IN: as.int - T => change to integer codes 0,...,19
  #     nchar - only needed if as.int=T
  #     sortnames - for integer names. Sort them
  # OUT:
  # names
  # seq - nsite x ntaxa matrix of characters.
  ##
  ## Note: integer values differ from rinterleavef which uses 1,...,20
  x <- scan(seqfile, what=character(),quiet=T)
  if(nexus){
    i <- match("Dimensions",x)
    ntaxa <- as.numeric(gsub("NTAX=","",x[(i+1)]))
    i <- match("Matrix",x)
    x <- x[(i+1):(length(x)-2)]
  }
  else{
    ntaxa <- as.integer(x[1])
    x <- x[-c(1:2)]
  }
  x <- matrix(x,ncol=2,byrow=T)
  names <- x[,1]
  x <- x[,2]
  seq <- NULL
  for(i in 1:ntaxa){
    seqc <- unlist(strsplit(x[i], split=NULL))
    ## If nsite=2 the string N & A will be read in as NA
    if(length(seqc)==1)
      if(is.na(seqc)) seqc <- c("N","A")
    seq <- cbind(seq,seqc)
  }
  if(as.int){
    if(nchar==4) seq <- l2i(seq)
    if(nchar==20) seq <- l2ip(seq)
  }
  if(sortname){
    o <- order(names)
    seq <- seq[,o]; names <- names[o]
  }
  return(list(seq=seq, names=names))
}
w.singleline <- function(filename,seq,names){
  cat(dim(seq)[2],dim(seq)[1],"\n",file=filename)
  for(i in 1:length(names)){
    if(nchar(names[i])<10)
      cat(names[i],rep(" ",10-nchar(names[i])),file=filename,append=T,sep="")
    else
      cat(names[i],file=filename,append=T,sep="")
    cat(seq[,i],"\n",file=filename,append=T,sep="")
  }
}
charfreqc.taxa <- function(seqfile, nchar=4,
                           charfreq.taxa.prog="~/bio/prog/charfreq-taxa"){
  cmdline <- paste(charfreq.taxa.prog,nchar,"<",seqfile)
  fr <- system(cmdline,intern=TRUE)
  ntaxa <- scan(seqfile,quiet=TRUE,n=1)
  fr <- unlist(strsplit(fr,split=" "))
  fr <- fr[fr != ""]
  fr <- matrix(fr,nrow=ntaxa,byrow=TRUE)[,-1]
  fr <- matrix(as.numeric(fr),nrow=ntaxa)
  return(fr)
}
LocateRoot <- function(utreec,taxa){
  ##IN: taxa - labels 0,1,.... corresponding to one side of the split
  if(length(taxa)==1) return(taxa)
  ntaxa <- dim(utreec)[1]+1

  if(0 %in% taxa){
    spl.of.int <- rep(0,ntaxa)
    spl.of.int[(taxa+1)] <- 1
  }else{
    spl.of.int <- rep(1,ntaxa)
    spl.of.int[(taxa+1)] <- 0
  }
    
  splits <- utreec2splits(utreec)
  spl.label <- -1
  for(i in 1:dim(splits)[1]){
    if(sum(abs(splits[i,]-spl.of.int))==0){
      spl.label <- ntaxa+i-1
      break
    }
  }
  if(spl.label<0) stop("split not found")
  return(spl.label)
}
utreec2splits <- function(utreec){
  # OUT:
  # splits - (ntaxa-3)x ntaxa matrix of 1 and 0; 1 for taxa 1 
  ntaxa <- dim(utreec)[1]+1
  splits <- matrix(0,ncol=ntaxa,nrow=ntaxa-3)
  
  for(split in 1:(ntaxa-3))
    for(k in 1:2){
      b <- utreec[split,k]
      if (b < ntaxa)
        splits[split,(b+1)] <- 1
      else
        splits[split,] <- splits[split,] + splits[(b-ntaxa+1),]
    }

  splits <- split.leading1(splits)
  
  return(splits)
}
split.leading1 <- function(spls){
  for(spl in 1:dim(spls)[1])
    if (spls[spl,1] == 0){
      idx <- spls[spl,] == 0
      spls[spl,idx] <- 1
      spls[spl,!idx] <- 0
    }
  return(spls)
}
ut2nrt <- function(utreec,names,blabel){
  ntaxa <- dim(utreec)[1]+1
  tstring <- names
  for(join in 1:(ntaxa-1)){
    r <- utreec[join,1]+1; l <- utreec[join,2]+1
    lr <- utreec[join,3]; ll <- utreec[join,4]
    tstring <- c(tstring,
                 paste0("(",tstring[r],":",lr,",",tstring[l],":",ll,")"))
    if (!missing(blabel))
      tstring[(join+ntaxa)] <- paste(tstring[(join+ntaxa)],
                                     blabel[(join+ntaxa)],sep="")
    ## print(tstring)
  }
  return(paste0(tstring[(ntaxa-1+ntaxa)],";"))
}
dlistf <- function(ut){
  #OUT: dl - list dl[[b]]: descendents of branch b
  nt <- dim(ut)[1]+1
  dl <- vector("list",(2*nt-1))
  for(b in 1:nt) dl[[b]] <- b
  for(j in 1:(nt-1))
    dl[[(j+nt)]] <- c(dl[[(ut[j,1]+1)]], dl[[(ut[j,2]+1)]])
  ## for(j in 1:(2*nt-1)) dl[[j]] <- dl[[j]]-1
  return(dl)
}
#==============================================================================

args <- commandArgs()
iarg <- length(args)

## Process Command Line
seqfile <- long.seqfile <- treefile <- utreecfile <- taxafile <- NULL
garp <- "GARP"; fymink <- "FYMINK"
nclass <- -1; frfile <- NULL
alpha <- -1
exchangeability <- 9
deriv <- 0
oderiv <- -1
pr <- -1
multiplier.file <- wtfile <- uxparamfile <- yparamfile <- NULL
factr <- -1
nthreads <- 1
while(iarg>=6){
  if(substring(args[iarg],1,1)=='-'){
    opt <- args[iarg]
    is.opt <- TRUE
  }else{
    val <- args[iarg];
    is.opt <- FALSE
  }
  if(is.opt){	
    not.an.option <- TRUE
    
    ## Command line arguments    
    if(opt=="-s"){
      seqfile <- FnameE(val); not.an.option <- FALSE
    }   
    if(opt=="-l"){
      long.seqfile <- FnameE(val); not.an.option <- FALSE
    }
    if(opt=="-t"){
      treefile <- FnameE(val); not.an.option <- FALSE
    }
    if(opt=="-u"){
      utreecfile <- FnameE(val); not.an.option <- FALSE
    }
    if(opt=="-r"){
      taxafile <- FnameE(val); not.an.option <- FALSE
    }
    if(opt=="-g"){
      garp <- val; not.an.option <- FALSE
    }
    if(opt=="-k"){
      fymink <- val; not.an.option <- FALSE
    }
    if(opt=="-c"){
      nclass <- as.integer(val); not.an.option <- FALSE
    }
    if(opt=="-f"){
      frfile <- FnameE(val); not.an.option <- FALSE
    }
    if(opt=="-w"){
      wtfile <- FnameE(val); not.an.option <- FALSE
    }
    if(opt=="-a"){
      alpha <- as.double(val); not.an.option <- FALSE
    }    
    if(opt=="-e"){
      exchangeability <- as.integer(val); not.an.option <- FALSE
    }   
    if(opt=="-n"){
      deriv <- as.integer(val); not.an.option <- FALSE
    }
    if(opt=="-o"){
      oderiv <- as.integer(val); not.an.option <- FALSE
    }       
    if(opt=="-p"){
      pr <- as.integer(val); not.an.option <- FALSE
    }       
    if(opt=="-x"){
      uxparamfile <- val; not.an.option <- FALSE
    }       
    if(opt=="-y"){
      yparamfile <- val; not.an.option <- FALSE
    }
    if(opt=="-d"){
      factr <- as.double(val); not.an.option <- FALSE
    }
    if(opt=="-b"){
      nthreads <- as.integer(val); not.an.option <- FALSE
    }
    if(opt=="-m"){
      multiplier.file <- FnameE(val); not.an.option <- FALSE
    }
    
    if(not.an.option) stop(paste(opt,"is not an option\n"))  
  }
  iarg <- iarg-1
}

## Check options
## sequence file
if(!is.null(long.seqfile) && !is.null(seqfile))
  stop("Only one of -s seqfile -l seqfile is allowed")
if(is.null(long.seqfile) && is.null(seqfile))
  stop("Need sequence file: -s seqfile or -l seqfile.with.longnames")
## tree file
if(!is.null(treefile) && !is.null(utreecfile))
  stop("Only one of -u utreecfile -t treefile is allowed")
if(is.null(treefile) && is.null(utreecfile))
  stop("Need treefile file: -t treefile")
## rootfile
if(is.null(taxafile))
  stop("Need file with taxa on one side of root: -r rootfile")
## frequencies
if(is.null(frfile))stop("Need file with frequencies for mixture: -f frfile" )
if(nclass<0)stop("Need nclass>0: -c nclass")
## weights
if(!is.null(wtfile))
  if(length(scan(wtfile,quiet=TRUE))<nclass)
    stop("-w wtfile: wtfile should have at least as many entries as the number of classes")
## exchangeabilities
if(!(exchangeability %in% c(0,5,6,9)))
  stop("-e exchangeability: exchangeability should be 0=poisson, 5=jtt, 6=wag or 9=lg")
## optimization
if(!(deriv %in% c(0,1,2,3,4,5,6)))
  stop("-n deriv: deriv should be in {0,1,...6}")
if(!(oderiv %in% c(-1,0,1,2,3,4,5,6)))
  stop("-o opt: opt should be in {-1,1,2,...6}")

## Sequence file
if(!is.null(seqfile)){
  seq <- rinterleave(seqfile)
}else{
  seq <- r.singleline(long.seqfile) # seqfile w/ long names or single line
}
names <- trimws(seq$names,which="right")
seq <- seq$seq
## Write seqfile with integer names to tmp.seqfile
## winterleave a little slower but useful for checking seqfile
w.singleline("tmp.seqfile",seq,names=c(0:(length(names)-1)))
## winterleave(seq,names=c(0:(length(names)-1)),seqfile="tmp.seqfile")
ntaxa <- scan("tmp.seqfile",n=1,quiet=TRUE)

## Tree
if(!is.null(treefile)){
  ## Convert tree to utreec
  tstr <- scan(treefile,what=character(),quiet=TRUE)
  cmdline <- paste(treecns,treefile,"tmp.utreecu -1 <",seqfile,"> tmp.names")
  system(cmdline)
  utreec <- matrix(scan("tmp.utreecu",quiet=TRUE),ncol=4,byrow=TRUE)
  names.tree <- trimws(scan("tmp.names",what=character(),quiet=TRUE),
                       which="right")

  ## Process root file
  ## Need to warn about no spaces being allowed in names
  taxa <- trimws(scan(taxafile,what=character(),quiet=TRUE),"right")
  taxa.tmp <- match(taxa,names)
  not.present <- is.na(taxa.tmp)
  if(sum(not.present)>0)
    stop(paste("Taxa in rootfile but not sequence file:\n",
               taxa.tmp[not.present]))
  taxa <- taxa.tmp-1
  cmdline <- paste(rert,LocateRoot(utreec,taxa),dim(utreec)[1]+1,
                   "< tmp.utreecu > tmp.utreec")
  system(cmdline)
  utreec <- matrix(scan("tmp.utreec",quiet=TRUE),ncol=4,byrow=TRUE)
}

## GARP and FYMINK
aal <- c("A", "R", "N", "D", "C", "Q", "E", "G", "H", "I", "L", "K", "M", "F",
         "P", "S", "T", "W", "Y", "V")
GARPidx <- function(g){
  go <- g
  g <- unlist(strsplit(g,split=""))
  g <- match(g,aal)
  if(sum(is.na(g)))
    stop(paste("group",go,"is not all capitalized amino acid letter codes"))
  return(g)
}
cia <- rep(2,20)
cia[GARPidx(garp)] <- 0
cia[GARPidx(fymink)] <- 1
cat(cia,file="tmp.gpfile")

## Create command line
cmdline <- paste0(gfmix.lnl,
                  " -s tmp.seqfile",
                  " -c ",nclass,
                  " -f ",frfile)
if(!is.null(utreecfile)) cmdline <- paste0(cmdline," -u ",utreecfile)
if(!is.null(treefile)) cmdline <- paste0(cmdline," -u tmp.utreec")
cmdline <- paste0(cmdline," -g tmp.gpfile")
if(!is.null(multiplier.file))
  cmdline <- paste0(cmdline," -m ",multiplier.file)
if(!is.null(wtfile)) cmdline <- paste0(cmdline," -w ",wtfile)
if(alpha>0) cmdline <- paste0(cmdline," -a ",alpha)
cmdline <- paste0(cmdline," -e ",exchangeability)
if(oderiv<0 && deriv>0) cmdline <- paste0(cmdline," -n ",deriv)
if(oderiv>0) cmdline <- paste0(cmdline," -o ",oderiv)
if(pr>=0) cmdline <- paste0(cmdline," -p ",pr)
xparamfile <- ifelse(is.null(uxparamfile),"tmp.paramfile",uxparamfile)
cmdline <- paste0(cmdline," -x ",xparamfile)
if(factr>0) cmdline <- paste0(cmdline," -d ",factr)
if(nthreads>1) cmdline <- paste0(cmdline," -b ",nthreads)
## cat(cmdline,"\n")
system(cmdline)

## Parameter File
x <- scan(xparamfile,quiet=TRUE)
if(!is.null(yparamfile)){
  alpha <- x[1]
  cat(sprintf("alpha = %.3f\n\n",alpha),file=yparamfile)
  x <- x[-1]

  utreec <- matrix(x[1:(4*(ntaxa-1))],ncol=4,byrow=TRUE)
  cat("Estimated Tree\n",file=yparamfile,append=TRUE)
  cat(ut2nrt(utreec,names),"\n\n",file=yparamfile,append=TRUE)
  x <- x[-c(1:(4*(ntaxa-1)))]

  cat("Weights\n",file=yparamfile,append=TRUE)
  wtc <- x[1:nclass]
  wtc <- wtc/sum(wtc)
  for(ic in 1:nclass)
    cat(sprintf("%3i %.5e\n",ic,wtc[ic]),file=yparamfile,append=TRUE)
  cat("\n",file=yparamfile,append=TRUE)
  x <- x[-c(1:nclass)]

  ## GF ratio input files
  write(format(t(x),sci=TRUE,digits=16),ncol=2,file="tmp.bfile")
  write(format(t(utreec),sci=TRUE,digits=16),ncol=4,file="tmp.utreec")
  write(format(wtc,sci=TRUE,digits=16),ncol=1,file="tmp.wtc")

  ## GF ratio command line
  cmdline <- paste0(anfreq," -b tmp.bfile -c -u tmp.utreec",
                    " -f ",frfile," -w tmp.wtc -g tmp.gpfile")
  if(alpha>0) cmdline <- paste0(cmdline," -a ",alpha)
  cmdline <- paste0(cmdline," -e ",exchangeability," > tmp.mu")
  ## cat(cmdline,"\n")
  system(cmdline)

  ## GF ratio output
  mu <- matrix(scan("tmp.mu",quiet=TRUE),ncol=20,byrow=TRUE)
  ## cia=0<->garp, cia=1<->fymink
  gf.ratio <- apply(mu[,cia==0],1,sum)/apply(mu[,cia==1],1,sum)
  gf.taxa <- charfreqc.taxa("tmp.seqfile",nchar=20,
                            charfreq.taxa.prog=charfreq.taxa.prog)
  gf.taxa <- apply(gf.taxa[,cia==0],1,sum)/apply(gf.taxa[,cia==1],1,sum)
  
  cat("gamma(G), gamma(F), GF ratio each node and emprical GF for taxa\n",
      file=yparamfile,append=TRUE)
  x <- matrix(x,ncol=2,byrow=TRUE)
  for(i in 1:ntaxa)
    cat(sprintf("%3i %.5e %.5e %.5e %.5e\n",i-1,x[i,1],x[i,2],
                gf.ratio[i],gf.taxa[i]),file=yparamfile,append=TRUE)
  for(i in (ntaxa+1):dim(x)[1])
    cat(sprintf("%3i %.5e %.5e %.5e\n",i-1,x[i,1],x[i,2],gf.ratio[i]),
        file=yparamfile,append=TRUE)
  cat("\n",file=yparamfile,append=TRUE)

  cat("Edge/node labels. Taxa descending from edge/node are indicated\n",
      file=yparamfile,append=TRUE)
  dl <- dlistf(utreec)
  for(i in 1:length(dl))
    cat(i-1, names[dl[[i]]],"\n",file=yparamfile,append=TRUE)
  cat("\n",file=yparamfile,append=TRUE)

  cat("Newick treefile with node labels\n",file=yparamfile,append=TRUE)
  utop <- utreec
  utop[,3:4] <- 1
  cat(ut2nrt(utop,paste0(0:(ntaxa-1),"|",names),0:(2*ntaxa-2)),
      "\n\n",file=yparamfile,append=TRUE)
}

tmp.files <- c("tmp.seqfile","tmp.utreecu","tmp.names","tmp.utreec","tmp.gpfile",
                "tmp.bfile","tmp.wtc","tmp.mu")
if(is.null(uxparamfile)) tmp.files <- c(tmp.files,"tmp.paramfile")
l <- file.remove(tmp.files)
