#' Simulate data set
#' @export
#' @param n number of data points
#' @param p number of covariates
#' @param setting (optional) a list (or named vector) of all the arguments
#' @param t.model propensity score model, see Details
#' @param t.model.x model on X before t.model, see Details
#' @param y.model outcome model, see Details
#' @param t.param parameters in t.model
#' @param y0.param parameters in y.model for Y0
#' @param y1.param parameters in y.model for Y1
#' @param sigma0 noise level in y.model for Y0, see Details
#' @param sigma1 noise level in y.model for Y1, see Details
#' @param sigma.normal noise level for the "normal" propensity score model
#' @param quadratic.offset the offset if t.model.x or y.model is quadratic,
#'                         see deatils (will appear as qo)
#' @return a "obdata" object
#' @details
#' The data is generated by the following models
#' \itemize {
#' \item t.mode.x = "linear": logit.true = X * theta; \cr
#'                = "quadratic": logit.true = (X+qo)^2 * theta; \cr
#'                = "cubic": logit.true = X^3 * theta; \cr
#' \item t.model = "logistic": logit(P(T=1)) = logit.true; \cr
#'               = "normal": T = (logit.true + sigma.norm * rnorm(n) > 0); \cr
#'               = "loglinear": log(P(T=1)) = logit.true; \cr
#' \item y.model = "linear: Y(0) = X * beta; \cr
#'               = "quadratic" : Y(0) = (X+qo)^2 * beta; \cr
#'               = "cubic": Y(0) = X^3 * beta; \cr
#' Y(1) and Y are always the same: \cr
#' Y(1) = 2 * Y(0) + 1 + sigma * N(0,1); \cr
#' Y = T * Y(1) + (1-T) * Y(0); \cr
#' }
generate.synthetic.data <- function(
    n,
    p = 2,
    setting = NULL,
    t.model = c("logistic", "normal", "loglinear"),
    t.model.x = c("linear", "quadratic", "cubic"),
    y.model = c("linear", "quadratic", "cubic"),
    t.param = c(0, rep(1, p)),
    y0.param = c(-2, rep(0.2, p)),
    y1.param = c(-2, rep(-2, p)),
    sigma0 = 0.2,
    sigma1 = 0.2,
    sigma.normal = 5,
    quadratic.offset = 1) {

    if (!is.null(setting)) { # if the parameters are passed through setting
        minimal.params <- c("t.model", "y.model")
        if (length(intersect(names(setting), minimal.params)) !=
            length(minimal.params)) {
            stop(paste("Parameter",
                       setdiff(minimal.params, names(setting)) ,"is missing!"))
        }
        for (param in names(setting)) {
            assign(param, setting[[param]])
        }
    }

                                        # check the arguments
    t.model <- match.arg(as.character(t.model),
                         c("logistic", "normal", "loglinear"))
    t.model.x <- match.arg(as.character(t.model.x),
                           c("linear", "quadratic", "cubic"))
    y.model <- match.arg(as.character(y.model),
                         c("linear", "quadratic", "cubic"))

    X <- cbind(rep(1, n), matrix(rnorm(n * p), n, p))
    ## X <- cbind(rep(1, n), matrix(runif(n * p, -3, 3), n, p))
    logit.true <- switch(t.model.x,
                         linear = X %*% t.param,
                         quadratic = (X + quadratic.offset)^2 %*% t.param,
                         cubic = X^3 %*% t.param)
    T <- switch(t.model,
                logistic = as.numeric(runif(n) < plogis(logit.true)),
                normal = as.numeric(logit.true + sigma.normal * rnorm(n) > 0),
                loglinear = as.numeric(runif(n) < exp(logit.true)))
    Y0 <- switch(y.model,
                 linear = X %*% y0.param,
                 quadratic = (X + quadratic.offset)^2 %*% y0.param,
                 cubic = X^3 %*% y0.param)
    Y0.true <- Y0
    Y0 <- Y0 + sigma0 * rnorm(n)
    Y1 <- switch(y.model,
                 linear = X %*% y1.param,
                 quadratic = (X + quadratic.offset)^2 %*% y1.param,
                 cubic = X^3 %*% y1.param)
    Y1.true <- Y1
    Y1 <- Y1 + sigma1 * rnorm(n)
    Y <- T * Y1 + (1 - T) * Y0
                                        # features are just X without intercept
    features <- X[, -1]

    return(list(X = X,
                features = features,
                T = T,
                Y = Y,
                Y0 = Y0,
                Y1 = Y1,
                logit.true = logit.true,
                Y0.true = Y0.true,
                Y1.true = Y1.true,
                t.model = t.model,
                y.model = y.model,
                t.param = t.param,
                y0.param = y0.param,
                y1.param = y1.param,
                sigma = c(sigma0 = sigma0,
                    sigma1 = sigma1)))

}


#' Simulate data set by Kang and Schafer (2007)
#' @description only X and Y should be considered observable.
#' @export
#' @inheritParams generate.synthetic.data
#' @param y.linear is Y linear in X or Z
#' @param t.linear is the true logit linear in X or Z
#' @param sigma.y the noise level of Y
#' @references J. Kang and J. Schafer, Demystifying Double Robustness:
#' A Comparison of Alternative Strategies for Estimating a Population Mean from
#' Incomplete Data, Statistical Science, 2007.
generate.synthetic.data.KS <- function(n, setting = NULL,
                                       y.linear = c("X", "Z"),
                                       y.param = c(210, 27.4, 13.7, 13.7, 13.7),
                                       t.linear = c("X", "Z"),
                                       t.param = c(0, -1, 0.5, -0.25, -0.1),
                                       sigma.y = 1) {

                                        # check arguments
    if (!is.null(setting)) { # if the parameters are passed through setting
        minimal.params <- c("t.linear", "y.linear")
        if (length(intersect(names(setting), minimal.params)) !=
            length(minimal.params)) {
            stop(paste("Parameter",
                       setdiff(minimal.params, names(setting)) ,"is missing!"))
        }
        for (param in names(setting)) {
            assign(param, setting[[param]])
        }
    }

    ## check the arguments
    t.linear <- match.arg(as.character(t.linear),
                          c("X", "Z"))
    y.linear <- match.arg(as.character(y.linear),
                          c("X", "Z"))

    X <- matrix(rnorm(n * 4), n, 4)
    ## X <- matrix(runif(n * 4, -3, 3), n, 4)
    Z <- matrix(0, n, 4)
    Z[, 1] <- exp(X[, 1] / 2)
    Z[, 2] <- X[, 2] / (1 + exp(X[, 1])) + 10
    Z[, 3] <- (X[, 1] * X[, 3] / 25 + 0.6)^3
    Z[, 4] <- (X[, 2] + X[, 4] + 20)^2
    ## X <- scale(X)
    Z <- scale(Z)

    features <- X
    X <- cbind(rep(1, n), X)
    Z <- cbind(rep(1, n), Z)

    Y.true <- switch(y.linear,
                X = X %*% y.param,
                Z = Z %*% y.param)
    Y <- Y.true + sigma.y * rnorm(n)
    logit.true <- switch(t.linear,
                         X = X %*% t.param,
                         Z = Z %*% t.param)
    T = as.numeric(runif(n) < plogis(logit.true))

    return(list(X = X,
                features = features,
                T = T,
                Y.true = Y.true,
                Y = Y,
                logit.true = logit.true,
                Z = Z,
                t.linear = t.linear,
                y.linear = y.linear,
                t.param = t.param,
                y.param = y.param,
                sigma.y = sigma.y))

}

generate.synthetic.data.KS.correct <- function(n,
                                               sigma.y = 1) {

    library(MASS)
    X <- mvrnorm(n, mu = rep(0, 4), Sigma = diag(4))
    prop <- 1 / (1 + exp(X[,1] - 0.5 * X[,2] +
                             0.25*X[,3] + 0.1 * X[,4]))
    treat <- rbinom(n, 1, prop)
    Y.true <- 210 + 27.4*X[,1] + 13.7*X[,2] + 13.7*X[,3] + 13.7*X[,4]
    Y <- Y.true + sigma.y * rnorm(n)
    X.mis <- cbind(exp(X[,1]/2), X[,2]*(1+exp(X[,1]))^(-1)+10,
                   (X[,1]*X[,3]/25+.6)^3, (X[,2]+X[,4]+20)^2)

    return(list(X = X.mis,
                T = treat,
                Y.true = Y.true,
                Y = Y,
                Z = X,
                sigma.y = sigma.y))

}


#' simulate data set in Lunceford & Davidian (2003)
#' @export
#' @param n data size
#' @param nu some Y parameters
#' @param T.assoc association between T and Z
#' @param Y.assoc association between Y and X
#'
generate.synthetic.data.LD <- function(n,
                                       nu = c(0, -1, 1, -1, 2),
                                       T.assoc = c("no", "moderate", "strong"),
                                       Y.assoc = c("no", "moderate", "strong")) {

    T.assoc <- match.arg(as.character(T.assoc),
                         c("no", "moderate", "strong"))
    Y.assoc <- match.arg(as.character(Y.assoc),
                         c("no", "moderate", "strong"))
    beta <- switch(T.assoc,
                   no = c(0, 0, 0, 0),
                   moderate = c(0, 0.3, -0.3, 0.3),
                   strong = c(0, 0.6, -0.6, 0.6))
    xi <- switch(Y.assoc,
                 no = c(0, 0, 0),
                 moderate = c(-0.5, 0.5, 0.5),
                 strong = c(1, 1, 1))

    require(MASS)
    X3 <- as.numeric(runif(n) < 0.2)
    Z3 <- as.numeric(runif(n) < 0.75 * X3 + 0.25 * (1 - X3))
    index <- which(X3 == 0)
    Sigma <- matrix(c(1, 0.5, -0.5, -0.5,
                      0.5, 1, -0.5, -0.5,
                      -0.5, -0.5, 1, 0.5,
                      -0.5, -0.5, 0.5, 1), 4, 4)
    XZ12 <- mvrnorm(n, mu = rep(0, 4), Sigma)
    XZ12[index, ] <- t(t(XZ12[index, ]) + c(-1, -1, 1, 1))
    XZ12[-index, ] <- t(t(XZ12[-index, ]) + c(1, 1, -1, -1))
    X <- cbind(XZ12[, c(1,3)], X3)
    Z <- cbind(XZ12[, c(2,4)], Z3)

    logit.true <- beta[1] + beta[2] * X[, 1] + beta[3] * X[, 2] + beta[4] * X[, 3]
    T <- as.numeric(runif(n) < plogis(logit.true))
    Y <- nu[1] + nu[2] * X[, 1] + nu[3] * X[, 2] + nu[4] * X[, 3] +
        nu[5] * T + xi[1] * Z[, 1] + xi[2] * Z[, 2] + xi[3] * Z[, 3] + rnorm(n)

    return(list(T = T,
                Y = Y,
                X = X,
                features = cbind(X, Z),
                logit.true = logit.true))
}


#' get target effect in the simulated data set
#' @export
#' @inheritParams generate.synthetic.data
#' @inheritParams get.all.weights
#' @return the target effect
get.target.effect <- function(effect = c("treated", "overall"),
                              setting = NULL, n = 1000000, ...) {

    effect <- match.arg(as.character(effect), c("treated", "overall"))

    data <- generate.synthetic.data(n = n, setting = setting, ...)

    switch(effect,
           treated = mean(data$Y1[data$T == 1]) - mean(data$Y0[data$T == 1]),
           overall = mean(data$Y1) - mean(data$Y0))
}

generate.nonlinear.data <- function(
    n,
    p = 2,
    X.dist = c("normal", "unif"),
    ps.link = c("logistic", "normal", "loglinear"),
    ps.order = 2,
    ps.knots = c(-2, -0.5, 0, 0.5, 2),
    y.order = 2,
    y.knots = ps.knots,
    sigma = 0.2,
    seed = 3) {

    library(splines)

    X.dist <- match.arg(X.dist, c("normal", "unif"))
    ps.link <- match.arg(ps.link, c("logistic", "normal", "loglinear"))

    X <- switch(X.dist,
                normal = rnorm(n * p),
                unif = 2.5 * runif(n * p))
    X <- matrix(X, n, p)
    X.expand.ps <- cbind(
        bs(X[, 1], degree = ps.order, knots = ps.knots),
        bs(X[, 2], degree = ps.order, knots = ps.knots))
    X.expand.y <- cbind(
        bs(X[, 1], degree = y.order, knots = y.knots),
        bs(X[, 2], degree = y.order, knots = y.knots))


    set.seed(seed)
    theta <- rnorm(ncol(X.expand.ps))
    beta <- exp(theta) * 2
    logit.true <- X.expand.ps %*% theta
    Y.true <- X.expand.y %*% beta

    set.seed(Sys.time())
    propensity.score <- switch(ps.link,
                               logistic = plogis(logit.true),
                               normal = pnorm(logit.true),
                               loglinear = log(logit.true))
    T <- as.numeric(runif(n) < propensity.score)
    Y <- Y.true + sigma * rnorm(n)
    Y[T == 0] <- NA

    return(list(T = T,
                Y = Y,
                X = X,
                Y.true = Y.true))

}
### not being used
## weighted.var <- function(x, weights, normalize = FALSE) {
##     if (normalize)
##         weights <- weights / sum(weights)
##     x.bar <- sum(x * weights)
##     x.var <- sum((x - x.bar)^2 * weights)
##     x.var
## }

generate.smooth.data <- function(n,
                                 p = 2,
                                 beta = 2,
                                 nbasis = 11,
                                 sigma.y = 2) {

    ## features <- matrix(runif(n * p, -3, 3), n, p)
    features <- matrix(rnorm(n * p), n, p)
    features <- pmin(features, 3)
    features <- pmax(features, -3)

    library(fda)
    interaction.degree <- 2
    phi <- create.fourier.basis(c(-3, 3), nbasis)
    uni.basis <- lapply(1:ncol(features),
                        function(j) eval.basis(features[, j], phi)[, -1])

    model.formula <- paste(paste0("uni.basis[[", 1:ncol(features), "]]"), collapse = " + ")
    model.formula <- paste0("~ (", model.formula, ")^", interaction.degree, " -1")
    X <- model.matrix(as.formula(model.formula))

    degree <- lapply(colnames(X),
                     function(s)
                         sapply(unlist(strsplit(s, ":")),
                                function(ss)
                                    as.numeric(substr(ss, regexpr("cos|sin", ss) + 3, nchar(ss)))) )
    degree <- lapply(degree, function(x) {x[is.na(x)] <- 0; x} )
    c <- sqrt(sapply(degree, function(x) sum(x^(2 * beta))))

    logit.true <- (features[, 1] + features[, 2] + features[, 1] * features[, 2])
    propensity.score <- plogis(logit.true)
    T <- as.numeric(runif(n) < propensity.score)
    ## c[seq(1, length(c), by = 2)] <- Inf
    Y <- 1 * rowSums(features) + 5 * rowSums(abs(sin(features))) + X %*% (0 / c^2) + sigma.y * rnorm(n)

    return(list(features = features,
                X = X,
                Y = Y,
                T = T,
                logit.true = logit.true,
                propensity.score = propensity.score))

}


## Simulation dataset for  "Weight trimming and propensity score weighting" # in PLoS ONE 2011.
## The study design is based on methods printed in
## Setoguchi et al., "Evaluating uses of data mining
## techniques in propensity score estimation: a
## simulation study." Pharmacoepi Drug Saf 2008.


#' Generate continuous random variable correlated to variable x by rho invoked by the "F.generate" function
#' @export
#' @param x data vector
#' @param rho correlation coefficient
#' @return a correlated data vector of the same length as x

F.sample.cor <- function(x, rho) {
    y <- (rho * (x - mean(x)))/sqrt(var(x)) + sqrt(1 - rho^2) * rnorm(length(x))
                                        #cat("Sample corr = ", cor(x, y), "\n")
    return(y)
}

#' Generate simulation datasets
#' @param size sample size
#' @param scenario scenario
#' @return a dataset of size N, with binary variables: w1, w3, w5, w6, w8, w9 and continous variables: w2, w4, w7, w10. Confounders: w1, w2, w3, w4, Exposure predictors only: w5, w6, w7, Outcome predictors only: w8, w9, w10, correlations: (w1,w5)=0.2, (w2,w6)=0.9, (w3,w8)=0.2, (w4,w9)=0.9.
#'
F.generate <- function(size, scenario,
                       b0 = 0, b1 = 0.8, b2 = -0.25, b3 = 0.6, b4 = -0.4, b5 = -0.8, b6 = -0.5, b7 = 0.7,
                       a0 = -3.85, a1 = 0.3, a2 = -0.36, a3 = -0.73, a4 = -0.2, a5 = 0.71, a6 = -0.19,
                       a7 = 0.26,
                       g1 = -0.4 # effect of exposure
                       ) {
    w1 <- rnorm(size, mean=0, sd=1)
    w2 <- rnorm(size, mean=0, sd=1)
    w3 <- rnorm(size, mean=0, sd=1)
    w4 <- rnorm(size, mean=0, sd=1)
    w5 <- F.sample.cor(w1, 0.2)
    w6 <- F.sample.cor(w2, 0.9)
    w7 <- rnorm(size, mean=0, sd=1)
    w8 <- F.sample.cor(w3, 0.2)
    w9 <- F.sample.cor(w4, 0.9)
    w10 <- rnorm(size, mean=0, sd=1)

    ##~~ dichotomize variables (will attenuate correlations above)
    w1 <- ifelse(w1 > mean(w1), 1, 0)
    w3 <- ifelse(w3 > mean(w3), 1, 0)
    w5 <- ifelse(w5 > mean(w5), 1, 0)
    w6 <- ifelse(w6 > mean(w6), 1, 0)
    w8 <- ifelse(w8 > mean(w8), 1, 0)
    w9 <- ifelse(w9 > mean(w9), 1, 0)

    ##~~ scenarios for data generation models
    ## A: model with additivity and linearity
    ## B: mild non-linearity
    ## C: moderate non-linearity
    ## D: mild non-additivity
    ## E: mild non-additivity and non-linearity
    ## F: moderate non-additivity
    ## G: moderate non-additivity and non-linearity

    ## binary exposure modeling
    if (scenario == "A") {
        z.a_trueps <- (1 + exp( -(0 + b1*w1 + b2*w2 + b3*w3 + b4*w4 + b5*w5 + b6*w6 + b7*w7) ) )^-1
    } else if (scenario == "B") {
            z.a_trueps <- (1 + exp( -(0 + b1*w1 + b2*w2 + b3*w3 + b4*w4 + b5*w5 + b6*w6 + b7*w7
                                      + b2*w2*w2) ) )^-1
        } else if (scenario == "C") {
            z.a_trueps <- (1 + exp( -(0 + b1*w1 + b2*w2 + b3*w3 + b4*w4 + b5*w5 + b6*w6 + b7*w7
                                      + b2*w2*w2 +b4*w4*w4 + b7*w7*w7) ) )^-1
        } else if (scenario == "D") {
            z.a_trueps <- (1 + exp( -(0 + b1*w1 + b2*w2 + b3*w3 + b4*w4 + b5*w5 + b6*w6 + b7*w7
                                      + b1*0.5*w1*w3 + b2*0.7*w2*w4 + b4*0.5*w4*w5 + b5*0.5*w5*w6) ) )^-1
        } else if (scenario == "E") {
            z.a_trueps <- (1 + exp( -(0 + b1*w1 + b2*w2 + b3*w3 + b4*w4 + b5*w5 + b6*w6 + b7*w7
                                      + b2*w2*w2 + b1*0.5*w1*w3 + b2*0.7*w2*w4 + b4*0.5*w4*w5 + b5*0.5*w5*w6) ) )^-1
        } else if (scenario == "F") {
            z.a_trueps <- (1 + exp( -(0 + b1*w1 + b2*w2 + b3*w3 + b4*w4 + b5*w5 + b6*w6 + b7*w7
                                      + b1*0.5*w1*w3 + b2*0.7*w2*w4 + b3*0.5*w3*w5 + b4*0.7*w4*w6 + b5*0.5*w5*w7
                                      + b1*0.5*w1*w6 + b2*0.7*w2*w3 + b3*0.5*w3*w4 + b4*0.5*w4*w5 + b5*0.5*w5*w6) ) )^-1
        } else { # scenario G
            z.a_trueps <- (1 + exp( -(0 + b1*w1 + b2*w2 + b3*w3 + b4*w4 + b5*w5 + b6*w6 + b7*w7
                                      + b2*w2*w2 + b4*w4*w4 + b7*w7*w7 + b1*0.5*w1*w3 + b2*0.7*w2*w4 +b3*0.5*w3*w5
                                      + b4*0.7*w4*w6 + b5*0.5*w5*w7 + b1*0.5*w1*w6 + b2*0.7*w2*w3 + b3*0.5*w3*w4
                                      + b4*0.5*w4*w5 + b5*0.5*w5*w6) ) )^-1
        }

    ## probability of exposure: random number betw 0 and 1
    ## if estimated true ps > prob.exposure, than received exposure (z.a=1)
    prob.exposure <- runif(size)
    z.a <- ifelse(z.a_trueps > prob.exposure, 1, 0)
    ## continuous outcome modeling
    y.a <- a0 + a1*w1 + a2*w2 + a3*w3 + a4*w4 +a5*w8 + a6*w9 + a7*w10 + g1*z.a
    ## create simulation dataset
    sim <- as.data.frame(cbind(w1, w2, w3 ,w4, w5, w6, w7, w8, w9, w10, z.a, y.a))
    return(sim)
}
