forward.stagewise <- function(T, X, alpha, beta,
                              epsilon = 0.01,
                              max.stage = 5000,
                              intercept = TRUE,
                              normalize = TRUE,
                              tol = 1e-4) {
    n <- length(T)
    if (intercept) {
        X <- cbind(rep(1, n), X)
    }
    m <- ncol(X)

    if (nrow(X) != n) {
        stop("The dimensions of T and X don't match.")
    }

    S.func <- function(f) {
        prob <- link(f)
        S <- rep(0, length(f))
        if (alpha == -1 && beta == -1) {
            S[T == 1] <- f[T == 1] - 1 / prob[T == 1]
            S[T == 0] <- - f[T == 0] - 1 / (1 - prob[T == 0])
        } else if (alpha == -1 && beta == 0) {
            S[T == 1] <- - 1 / prob[T == 1]
            S[T == 0] <- - f[T == 0]
        } else if (alpha == 0 && beta == -1) {
            S[T == 1] <- f[T == 1]
            S[T == 0] <- - 1 / (1 - prob[T == 0])
        } else if (alpha == 0 && beta == 0) {
            S[T == 1] <- log(prob[T == 1])
            S[T == 0] <- log(1 - prob[T == 0])
        } else {
            for (i in 1:length(prob)) {
                S[i] <- integrate(function(p) (T[i] - p) * p^(alpha - 1) * (1 - p)^(beta - 1), lower = 1/2, upper = prob[i])$value
            }
        }
        sum(S) / n
    }

    S.prime <- function(f) {
        prob <- link(f)
        Sp <- rep(0, length(f))
        Sp[T == 1] <- prob[T == 1]^alpha * (1 - prob[T == 1])^(beta + 1)
        Sp[T == 0] <- - prob[T == 0]^(alpha + 1) * (1 - prob[T == 0])^beta
        Sp
    }

    link <- function(f) {
        1 / (1 + exp(-f))
    }

    imbalance.intercept <- function(c, f, T) {
        sum((exp(c + f))^(alpha * (T == 1) + (1 + alpha) * (T == 0)) / (1 + exp(c + f))^(alpha + beta + 1) * (2 * T - 1)) / n
    }

    c0 <- log(mean(T) / (1 - mean(T)))
    c0 <- max(c0, 1)
    update.intercept <- function(f, T) {
        uniroot(function(c) imbalance.intercept(c, f = f, T = T),
                c(-2 * c0, 2 * c0), extendInt = "yes")
    }

    coef <- matrix(0, 1, m)
    prob <- matrix(0, 0, n)
    imba <- matrix(0, 0, m)
    grad <- 0
    hist <- c()

    if (intercept) {
        coef[1, 1] <- update.intercept(rep(0, n), T)$root
    }

    imba <- matrix(0, max.stage, ncol(X))
    prob <- matrix(0, max.stage, length(T))
    coef <- matrix(0, max.stage + 1, ncol(X))
    hist <- rep(0, max.stage)
    j <- 0
    while (j == 0 || max(abs(grad)) > tol) {
        j <- j + 1
        ## print(j)
        if (j > max.stage) {
            break
        }
        f <- X %*% coef[j, ]
        prob[j, ] <- c(link(f))
        grad <- t(X) %*% S.prime(f) / n
        imba[j, ] <- c(grad)
        i <- which.max(abs(grad))
        hist[j] <- i
        new.coef <- coef[j, ]
        new.coef[i] <- new.coef[i] + (2 * (grad[i] > 0) - 1) * epsilon
        if (intercept) {
            new.coef[1] <- 0
            new.coef[1] <- update.intercept(X %*% new.coef, T)$root
        }
        coef[j + 1, ] <- new.coef
        if (tryCatch(S.func(f) > S.func(X %*% new.coef), error = function(cond) {message(cond); return(TRUE) })) {
            break
        }
    }
    coef <- coef[-nrow(coef), ]

    return(list(history = hist[1:(j-1)],
                coef = coef[1:(j-1), ],
                prob = prob[1:(j-1), ],
                imba = imba[1:(j-1), ]))

}
