#' Annotate RNA duplexes with features
#'
#' Overlays RNA duplexes with a `GRanges` annotation object.
#'
#' @param gi `GInteractions` object to annotate
#' @param anno_gr `GRanges` object with the `keys` columns in the metadata
#' @param keys vector with the names of the features to use for annotation.
#' @param ambig_key which feature to use for recording the annotation
#' ambiguity. Determines the values in `ambig_list.A` and `ambig_list.B`.
#' @param save_ambig When an RNA duplex arm overlaps multiple features, the
#' distinct values of `ambig_key` are stored (comma-separated) in
#' `ambig_list.A` and `ambig_list.B`, and the `ambig.A` and `ambig.B` fields
#' are added as 0/1 labels. If `FALSE`, none of these four fields is added.
#' @param order_key Experimental. In case an RNA duplex overlaps multiple
#' features, this key will be used to sort the overlapping features.
#' @param order_vec Experimental. An ordered vector of values in the
#' `order_key` annotation feature, which sets the priority in case of
#' feature overlap.
#' @export
#' @return `GInteractions` object with new fields
#' @details
#' For each annotation feature in `keys`, i.e. if keys=c(keyname1),
#' then `<keyname1>.A`, `<keyname1>.B` annotation fields will be created,
#' containing the names of overlapping features.
#' If no overlap is found for the feature, then the field will have `NA`.
#' @examples
#' data("RNADuplexesSampleData")
#' annotateGI(gi = RNADuplexSampleDGs, anno_gr = SampleGeneAnnoGR)
#'
#' # Prioritisation of the snRNA and lncRNA before mRNA if genes overlap
#' annotateGI(
#'     gi = RNADuplexSampleDGs, anno_gr = SampleGeneAnnoGR,
#'     keys = c("gene_id", "gene_name", "gene_type"),
#'     order_key = "gene_type",
#'     order_vec = c("snRNA", "lncRNA", "protein_coding"),
#'     save_ambig = TRUE
#' )
annotateGI <- function(
        gi, anno_gr,
        keys = c("gene_name", "gene_type", "gene_id"),
        ambig_key = keys[1],
        save_ambig = TRUE,
        order_key = NULL,
        order_vec = NULL) {
    original_columns <- colnames(mcols(gi))

    missing_keys <- setdiff(keys, colnames(mcols(anno_gr)))
    if (length(missing_keys) > 0) {
        ms <- str_c(missing_keys, collapse = ", ")
        stop("Not all provided keys are found in the anotation GRanges \nmissing keys: ", ms)
    }
    anno_gr <- anno_gr[, keys]
    anno_gr$feature_id <- seq_len(length(anno_gr))
    # temporary id used to map per-region annotation back to the anchors;
    # removed again before returning
    gi@regions$region_id <- seq_len(length(gi@regions))
    po <- findOverlapPairs(gi@regions, anno_gr)

    # one row per (region, overlapping feature); stays grouped by region
    # until the single best row per region is picked below
    df <- tibble(
        "region_id" = po@first$region_id,
        "overlap_w" = width(pintersect(po))
    ) %>%
        bind_cols(data.frame(mcols(po@second))) %>%
        group_by(region_id) %>%
        mutate(ambig = vctrs::vec_cast(n_distinct(.data[[ambig_key]], na.rm = TRUE) > 1, integer()))


    # save ambig mapping regions (use pre-defined key to save ids)
    if (save_ambig) {
        message("Will save all redundant values in: ", ambig_key)
        ambig_save <- df %>%
            dplyr::filter(ambig == 1) %>%
            dplyr::filter(!is.na(.data[[ambig_key]])) %>%
            group_by(region_id) %>%
            dplyr::summarise(ambig_list = paste(unique(.data[[ambig_key]]), collapse = ",")) %>%
            ungroup()
    }

    # if we use some column to sort the annotation for resolution of ambiguities
    if (!is.null(order_key) && !is.null(order_vec)) {
        # Rows are arranged by the priority given in `order_vec`; values absent
        # from `order_vec` become NA factor levels and sort last.
        # NOTE(review): `overlap_w` is not used for ordering here.
        message("Sorting the annotation layers by: ", order_key)
        if ((order_key %in% colnames(df)) && (order_key %in% keys)) {
            df <- df %>%
                mutate(sort_col = factor(.data[[order_key]], levels = order_vec, ordered = TRUE)) %>%
                arrange(sort_col) %>%
                select(-sort_col) # sort by provided order
        } else {
            warning(
                "Provided annotation order key '", order_key,
                "' does not exist in annotation or is not part of the keys, skipping"
            )
        }
    }

    # within each region, pull the first non-NA value of every key up into
    # the first row, then keep only that row
    df <- df %>%
        tidyr::fill(all_of(keys), .direction = "up") %>%
        slice(1) %>%
        ungroup()

    dt <- left_join(as_tibble(mcols(gi@regions)), df, by = "region_id")

    if (save_ambig) {
        dt <- left_join(dt, ambig_save, by = "region_id")
    }

    dt <- dt %>%
        select(-c(feature_id, overlap_w, region_id))

    cnames <- c(str_c(colnames(dt), ".A"), str_c(colnames(dt), ".B"))

    # expand the per-region table to per-interaction, once for each anchor
    dt <- cbind(dt[gi@anchor1, ], dt[gi@anchor2, ])
    colnames(dt) <- cnames
    dt <- dt %>% dplyr::select(order(colnames(dt)))

    if (save_ambig) {
        dt <- dt %>%
            relocate(ambig.A, ambig.B, ambig_list.A, ambig_list.B, .after = last_col())
    } else {
        # `ambig_list.*` only exists when `save_ambig = TRUE`, so the removal
        # must tolerate absent columns
        dt <- dt %>%
            dplyr::select(-any_of(c("ambig.A", "ambig.B", "ambig_list.A", "ambig_list.B")))
    }

    dt <- dt %>% data.frame()
    mcols(gi) <- cbind(mcols(gi)[original_columns], dt)
    gi@regions$region_id <- NULL
    return(gi)
}



#' Annotate RNA-RNA interactions as cis- and trans-
#'
#' @description
#' Annotates each entry of the gi object as cis if the .A and .B arms
#' correspond to the same feature (i.e. transcript_id or gene_id).
#' If the values are equal, the entry is annotated with `cis` = 1; if they are
#' not equal or either is `NA`, then `cis` = 0.
#'
#' @param gi `GInteractions` object containing two metadata columns as feature annotation
#' @param id_col_base base name of the feature id columns to use. Function will
#' look for <id_col_base>.A and <id_col_base>.B columns and compare them
#' @keywords internal
#' @return gi `GInteractions` object containing `cis` field with 0/1 values.
#' No other metadata columns are added.
.annotateCisTrans <- function(gi, id_col_base = "gene_id") {
    id_cols <- paste0(id_col_base, c(".A", ".B"))
    missing_cols <- setdiff(id_cols, colnames(mcols(gi)))
    if (length(missing_cols) > 0) {
        stop(paste(missing_cols, collapse = ", "), " are not found in the input gi")
    }

    # compare as plain character vectors; no helper columns are written to
    # `gi`, so nothing can leak into the result on any branch
    id_a <- as.character(mcols(gi)[[id_cols[1]]])
    id_b <- as.character(mcols(gi)[[id_cols[2]]])

    # the is.na() guards make the mask NA-free: a missing id is never cis
    is_cis <- !is.na(id_a) & !is.na(id_b) & (id_a == id_b)
    gi$cis <- as.numeric(is_cis)

    if (!any(is_cis)) {
        # emitted whenever no cis pair was found at all
        message("Could not do cis/trans annotation because columns content is missing ")
    }
    return(gi)
}

#' Helper function to add count data to metadata of `GInteractions`
#'
#' Looks up the count of each arm's feature id in the count table and stores
#' it in the `gene_count.A` and `gene_count.B` metadata columns.
#' If the `<id_col>.A` / `<id_col>.B` columns are not found in the metadata,
#' an error is thrown.
#'
#' @param gi `GInteractions`
#' @param df_counts dataframe with read counts: ids in the first column,
#' counts in the second. If an id occurs more than once, the first
#' occurrence is used.
#' @param id_col key to use in merge
#' @keywords internal
#' @return `GInteractions` with added counts (`NA` for ids absent from
#' `df_counts`)
.addGeneCounts <- function(gi, df_counts, id_col = "gene_id") {
    df_counts <- as.data.frame(df_counts)
    count_ids <- df_counts[[1]]
    count_values <- df_counts[[2]]

    id_columns <- paste0(id_col, c(".A", ".B"))
    missing_cols <- setdiff(id_columns, colnames(mcols(gi)))
    if (length(missing_cols) > 0) {
        stop(paste(missing_cols, collapse = ", "), " are not found in the input gi")
    }

    # match() keeps exactly one value per interaction, so the result always
    # has the length of `gi`, even when the count table has repeated ids
    ids_a <- as.vector(mcols(gi)[[id_columns[1]]])
    ids_b <- as.vector(mcols(gi)[[id_columns[2]]])
    gi$gene_count.A <- count_values[match(ids_a, count_ids)]
    gi$gene_count.B <- count_values[match(ids_b, count_ids)]
    return(gi)
}

#' Calculate p-values and abundance fractions for RNA duplexes
#'
#' Calculates p-values for each gene/transcript pair from a binomial model of
#' random ligation (`pbinom()`).
#' Uses BH correction, outputs duplex abundance relative to the per-gene/transcript
#' count, and counts of other RNA duplexes formed by either or none gene/transcript
#' in this pair.
#'
#' @param gi `GInteraction` object annotated with gene/transcript names. Must
#' contain the `n_reads` metadata column.
#' @param df_counts `data.frame` A two-column dataframe with gene/transcript
#' counts. The first column should match the 'gene_id' feature in anno_gr.
#' The second column is the respective count. Ids of RNAs that take part in
#' duplexes must be unique.
#' @param id_col the prefix for gene/transcript metadata id fields in input gi.
#' Two fields of <id_col>.A and <id_col>.B are expected. Otherwise throws error.
#' @return `GInteractions` object with new fields: `p_val`, `p.adj`, `Pa`,
#' `Pb`, `count.chim.A.notB`, `count.chim.notA.B`, `count.chim.notA.notB`,
#' `gene_count.A` and `gene_count.B`
#' @export
#' @details
#'
#' H0: RNA duplex not existing and reported due to the random ligation of fragments
#' H1: RNA duplex is true and formed because of the existing RNA-RNA interaction
#'
#' The probability of random ligation is modeled as \eqn{P(a, b)},
#' defined as:
#'
#' \eqn{
#' P(a, b) \propto
#' \begin{cases}
#'     2 \cdot P(a) \cdot P(b) & \textnormal{if } a:b \textnormal{ is observed and } a \neq b \\
#'     P(a) \cdot P(b) & \textnormal{if } a:b \textnormal{ is observed and } a = b \\
#'     0 & \textnormal{else}
#' \end{cases}
#' }
#'
#'
#' where the probability \eqn{P(a)} (same as for \eqn{P(b)}) is calculated as:
#' \eqn{
#' P(a) = \frac{\textnormal{N reads(a)}}{\textnormal{total N reads}}
#' }
#'
#' The p-value is intended to compare the observed duplex abundance to the
#' expected one as the area under the distribution to the right of the observed
#' value. Note: the current implementation calls `pbinom()` with its default
#' `lower.tail = TRUE`.
#' P(a, b) is normalized to sum up to one.
#' @examples
#' data("RNADuplexesSampleData")
#' gi <- calculateLigationPvalues(RNADuplexSampleDGs, df_counts = RNADuplexesGeneCounts)
#' hist(gi$p.adj, breaks = 20)
calculateLigationPvalues <- function(gi, df_counts, id_col = "gene_id") {
    df_counts <- as.data.frame(df_counts)
    df_all_cts <- tibble("RNA" = df_counts[[1]], "n" = df_counts[[2]])

    id_columns <- str_c(id_col, c(".A", ".B"))
    required_cols <- c("n_reads", id_columns)
    missing_cols <- setdiff(required_cols, colnames(mcols(gi)))
    if (length(missing_cols) > 0) {
        stop(paste(missing_cols, collapse = ", "), " are not found in the input gi")
    }

    chim_df <- as_tibble(mcols(gi)) %>% dplyr::select(all_of(required_cols))
    colnames(chim_df) <- c("n", "Araw", "Braw")
    chim_df$chim_id <- seq_len(nrow(chim_df))

    unique_ids <- unique(df_all_cts$RNA)

    # Order-independent pair key: both arm ids sorted and joined with "<>".
    # sort() drops NA, so pairs with a missing id get a one-sided/empty key
    # and are removed by the `%in% unique_ids` filter below.
    chim_df$AB <- apply(chim_df[c("Araw", "Braw")], 1, function(row) {
        paste(sort(row), collapse = "<>")
    })
    chimdf_save <- chim_df[c("chim_id", "AB")]

    # one row per unordered pair with the summed read support
    chim_df <- chim_df %>%
        group_by(AB) %>%
        summarise(n_reads_chim_total = sum(n)) %>%
        ungroup()
    ssplit <- str_split_fixed(chim_df$AB, "<>", n = 2)
    # NOTE(review): A/B below follow the sorted pair key, not the original
    # .A/.B arm order of each interaction - confirm this is intended for the
    # reported Pa/Pb and count.chim.* fields.
    chim_df$A <- ssplit[, 1]
    chim_df$B <- ssplit[, 2]
    chim_df <- chim_df %>%
        left_join(tibble("idA" = df_all_cts$RNA, "nA" = df_all_cts$n),
            by = c("A" = "idA")
        )
    chim_df <- chim_df %>%
        left_join(tibble("idB" = df_all_cts$RNA, "nB" = df_all_cts$n),
            by = c("B" = "idB")
        )
    # filter not annotated chimeras and those for which we don't have record in counts df
    chim_df <- chim_df %>% dplyr::filter((A %in% unique_ids) &
        (B %in% unique_ids))

    # Repeated ids in df_counts multiply rows in the joins above; fail with a
    # clear message instead of producing inflated statistics.
    if (anyDuplicated(chim_df$AB) > 0) {
        stop(
            "df_counts contains duplicated ids for RNAs involved in duplexes; ",
            "ids in the first column must be unique"
        )
    }

    N_total <- sum(df_all_cts$n) + sum(chim_df$n_reads_chim_total)
    chim_df <- chim_df %>% mutate(
        Pa = nA / N_total,
        Pb = nB / N_total,
        Pab = ifelse(A != B, 2 * Pa * Pb, Pa * Pb)
    )

    # adding extra abundance counts
    # Every retained pair is unique, so the only pair containing both a and b
    # is the pair itself. With T(g) = reads summed over all pairs containing g:
    #   with a, without b : T(a) - n_ab            (0 for self pairs)
    #   with b, without a : T(b) - n_ab            (0 for self pairs)
    #   neither a nor b   : total - T(a) - (T(b) - n_ab)
    # Assumes read counts are not NA.
    pair_n <- chim_df$n_reads_chim_total
    not_self <- chim_df$A != chim_df$B
    # a self pair (a == b) must contribute to T(a) only once
    gene_totals <- rowsum(
        c(pair_n, pair_n[not_self]),
        c(chim_df$A, chim_df$B[not_self])
    )
    total_with_A <- unname(gene_totals[match(chim_df$A, rownames(gene_totals)), 1])
    total_with_B <- unname(gene_totals[match(chim_df$B, rownames(gene_totals)), 1])
    total_all <- sum(pair_n)

    chim_df$chim_withA_noB <- (total_with_A - pair_n) * not_self
    chim_df$chim_withB_noA <- (total_with_B - pair_n) * not_self
    chim_df$chim_noB_noA <- total_all - total_with_A -
        (total_with_B - pair_n) * not_self

    chim_df$Pab_norm <- chim_df$Pab / sum(chim_df$Pab)
    # NOTE(review): pbinom() defaults to lower.tail = TRUE, i.e. P(X <= n),
    # while the documentation describes a right-tail p-value; `size` is the
    # total read count rather than the number of chimeric reads. Left
    # unchanged to keep results stable - confirm the intended test.
    chim_df$pval <- pbinom(chim_df$n_reads_chim_total,
        size = N_total,
        prob = chim_df$Pab_norm
    )
    chim_df$p.adj <- p.adjust(chim_df$pval, method = "BH")

    counts_stats <- chim_df %>%
        dplyr::select(
            pval, p.adj, AB, Pa, Pb, chim_withA_noB,
            chim_withB_noA, chim_noB_noA
        ) %>%
        dplyr::rename(
            count.chim.notA.B = "chim_withB_noA",
            count.chim.A.notB = "chim_withA_noB",
            count.chim.notA.notB = chim_noB_noA
        )

    # map pair-level statistics back to the individual interactions
    chimdf_save <- chimdf_save %>%
        left_join(counts_stats, by = "AB")

    statcounts_final <- tibble("chim_id" = seq_len(length(gi))) %>%
        left_join(chimdf_save, by = "chim_id") %>%
        dplyr::rename(p_val = pval) %>% # back - compatibility
        select(-c(chim_id, AB)) %>%
        data.frame()

    # drop stale copies of the statistics so a re-run does not duplicate columns
    mcols_old <- mcols(gi) %>%
        data.frame() %>%
        as_tibble() %>%
        select(-any_of(colnames(statcounts_final))) %>%
        data.frame()

    mcols(gi) <- cbind(mcols_old, statcounts_final)

    # forward id_col so non-default id columns are used for the counts too
    gi <- .addGeneCounts(gi, df_counts, id_col = id_col)

    return(gi)
}


# Probe for the `RNAduplex` command-line tool by running `RNAduplex -h` with
# its output discarded.
# Returns 0 when the command exits with status 0, and 1 otherwise.
.checkRNAduplexinstalled <- function() {
    exit_status <- system2(
        command = "RNAduplex",
        args = "-h",
        stdout = NULL,
        stderr = NULL
    )
    if (exit_status == 0) 0 else 1
}

# Run `RNAduplex` on a pair of sequences.
# The two sequences are joined with a newline and passed to the tool on
# standard input; the captured standard output lines are returned.
.runRNAduplex <- function(RNA1, RNA2) {
    stdin_text <- paste(RNA1, RNA2, sep = "\n")
    system2(command = "RNAduplex", stdout = TRUE, input = stdin_text)
}

# Attach the sequences of both arms to `gi` as `seq1` and `seq2`.
# The fasta file is read as a BStringSet and converted to an RNAStringSet;
# record names are cut to their first 5 characters and whitespace-squished
# before the set is indexed with the arms returned by `get_arm_a()` /
# `get_arm_b()`.
# NOTE(review): presumably the 5-character prefix is expected to match the
# sequence names of the arms - confirm against the callers.
.getDuplexString <- function(gi, fafile) {
    rna_set <- Biostrings::RNAStringSet(Biostrings::readBStringSet(fafile))
    names(rna_set) <- str_squish(str_sub(names(rna_set), 1, 5))
    gi$seq1 <- rna_set[get_arm_a(gi)]
    gi$seq2 <- rna_set[get_arm_b(gi)]
    gi
}


# GC content (in percent) of every sequence in `seqrna`.
# `seqrna` is an RNAStringSet based on the arm; the per-sequence counts of
# "G" and "C" are summed, divided by the sequence width and scaled to 100.
.getGCContent <- function(seqrna) {
    gc_counts <- rowSums(Biostrings::letterFrequency(seqrna, c("G", "C")))
    gc_counts / width(seqrna) * 100
}
