# Parametric Estimation of the Expected Number of Genes

# The first two functions are the main ones of interest. They give
#   the parameter estimates for the parametric model (dnb.ml)
#   the expected number of new genes (deltat)

#= Obtain the fitted parameters for the truncated negative binomial model
dnb.ml <- function(k, Nk, x0){
  # arguments:
  #  k, Nk - vectors
  #          Nk[i] gives the number of genes that had k[i] hits
  #
  #  x0 - scalar truncation parameter
  #       the model is fit to the counts for k=1,...,x0 hits.

  # value:
  #  a list with elements
  #   alpha - the estimated alpha parameter
  #   beta - the estimated beta parameter (gamma = beta/(1+beta))
  
  # example:
  #  k <- c(1,2, 3, 4, 5, 6, 7, 9, 14)
  #  Nk <- c(200, 21, 14, 4, 3, 3, 1, 1, 1)
  #  params <- dnb.ml(k, Nk, 10)
  
  f <- function(v){
    return(-dnb.lik(k, Nk, x0, v[1], v[2]))
  }
  v <- optim(c(0,1), f, lower = c(-1.0+1.0e-8,1.0e-8), method = "L-BFGS-B")
  return(list(alpha = v$par[1], beta = v$par[2]))
}

#= The expected number of new genes for the trucated negative binomial model
deltat <- function(k, Nk, alpha, beta, tval = seq(0.01, 1.0, 0.01), x0 = 10){
  # arguments:
  #  k, Nk - vectors
  #          Nk[i] gives the number of genes that had k[i] hits
  #  
  #  alpha - the estimated alpha parameter
  #
  #  beta - the estimated beta parameter (gamma = beta/(1+beta))
  #
  #  tval - vector: estimates are of the expected numbers of new genes in
  #            a sample of size tval * n, where n is the original sample size

  # value:
  #  dtp - the expected numbers of new genes in
  #        a sample of size tval * n, where n is the original sample size
  
  # example:
  #  k <- c(1,2, 3, 4, 5, 6, 7, 9, 14)
  #  Nk <- c(200, 21, 14, 4, 3, 3, 1, 1, 1)
  #  params <- dnb.ml(k, Nk, 10)
  #  tval <- seq(0.01, 2, 0.01)
  #  dt <- deltat(k, Nk, params$alpha, params$beta, tval = tval)
  
  gam <- beta/(beta+1)
  if (alpha != 0)
    dtp <- Nk[1] * (1 - (1 + gam*tval)^(-alpha))/(alpha*gam)
  else
    dtp <- Nk[1] * log(1 + gam*tval)/gam
  return(dtp)
}

#= likelihood for the truncated negative binomial model
dnb.lik <- function(k, Nk, x0, alpha, beta){
  # arguments:
  #  k, Nk - vectors
  #          Nk[i] gives the number of genes that had k[i] hits
  #
  #  x0 - scalar truncation parameter
  #       the model is fit to the counts for k=1,...,x0 hits.
  #
  #  alpha - alpha parameter
  #
  #  beta - beta parameter (gamma = beta/(1+beta))

  # value:
  #  likelihood
  
  Nk <- Nk[k <= x0]
  N0 <- sum(Nk)
  k <- k[k <= x0]
  p <- dnb.ext(x0, alpha, beta)
  lik <- sum( Nk*log(p[k]) )
  return(lik)
}

#= vector of multinomial probs for negative binomial
dnb.ext <- function(x0, alpha, beta){
  # arguments:
  #  x0 - scalar truncation parameter
  #       the model is fit to the counts for k=1,...,x0 hits.
  #
  #  alpha - alpha parameter
  #
  #  beta - beta parameter (gamma = beta/(1+beta))

  # value:
  #  a vector giving the probabilities for x=1,...x_0
  
  y <- c(1:x0)
  gam <- beta/(beta+1)
  p <- gamma(y+alpha)*gam^(y-1)/(gamma(y+1)*gamma(1+alpha))
  p <- p/sum(p)
  return(p)
}