#' Import and organise sRNAseq & mRNAseq data sets
#'
#' @description Load and organise either sRNAseq or mRNAseq pre-processing
#' results into a single dataframe containing all experimental replicates 
#' specified where rows represent either a sRNA cluster
#' (ie. sRNA producing-locus) or gene, respectively. Based on using the 
#' mobileRNA pre-processing method (See [mobileRNA::mapRNA()]). 
#'
#'
#' @details
#' The `RNAimport()` function requires the user to supply a directory path and 
#' a character vector. The path must be to the pre-processing output. 
#'
#' Following the `mobileRNA` method, for sRNA analysis, the path will be to the 
#' `2_alignment_results` folder. While for mRNA analysis, the path will be to 
#' the `2_raw_counts` folder. Both folders are generated by the 
#' [mobileRNA::mapRNA()] function. The vector should contain strings 
#' that represent and mirror the names of the sample replicate folders in the 
#' above directory. 
#'  
#'  Together this information allows the function to extract the information 
#'  stored in the "Results.txt" file of each sample. 
#'  
#'@references 
#' ShortStack \url{https://github.com/MikeAxtell/ShortStack},
#' HISAT2 \url{https://anaconda.org/bioconda/hisat2},
#' HTSeq \url{https://htseq.readthedocs.io/en/master/install.html},
#' SAMtools \url{https://anaconda.org/bioconda/samtools}
#' 
#'@param input string; define type of dataset.
#'"sRNA" for sRNAseq data and "mRNA" for mRNAseq data. 
#'
#' @param directory path; directory containing the sample folders generated 
#' during pre-processing (one folder per sample, each holding a "Results.txt" 
#' file).
#'
#' @param samples character; vector naming samples correlating
#' to outputted folders within the `directory` path.
#' 
#' @param featuretype character; type of feature. Default is "mRNA", 
#' only for mRNA data. 
#'@param FPKM logical; calculate the FPKM for each sample. Default is FALSE. 
#'
#'@param analysisType character; either "core" or "mobile" to represent the sRNA
#'analysis workflow. Where the "core" sRNA analysis imports all reads (unique
#'& multi-map), while "mobile" sRNA analysis imports only uniquely aligned read
#'counts. Only for sRNA data, default is "mobile". 
#'
#'@param annotation path; directory to genome annotation (GFF) file used for 
#'pre-processing. Only for mRNA data.
#'
#'@param idattr character; GFF attribute to be used as feature ID containing 
#'mRNA names. Several GFF lines with the same feature ID will be considered as 
#'parts of the same feature. The feature ID is used to identify the counts in 
#'the output table. Default is "Name". Only for mRNA data.
#'
#'@return 
#'**For sRNAseq:**
#'A dataframe where rows represent sRNA clusters and columns represent
#'replicate information extracted from the ShortStack output. Replicate 
#'information includes Dicercall, Counts, and MajorRNA sequence. Each replicate 
#'information is distinguishable as the replicate name is joined as a suffix to 
#'each column name. For example, for a sample called "Sample1", the columns will 
#'include DicerCall_Sample1, Count_Sample1, MajorRNA_Sample1 and RPM_Sample1. 
#'
#'The breakdown of each column:
#'
#'* `Locus` : sRNA cluster locus
#'* `chr` : Chromosome 
#'* `start` : start coordinate of cluster
#'* `end` : end coordinate of cluster
#'* `Cluster` : name of cluster 
#'* `DicerCall_` : the size in nucleotides of most abundant sRNA in the cluster
#'* `Count_` :  number of sRNA-seq reads that overlap the locus; uniquely
#'  aligned reads for the "mobile" analysis, all aligned reads for "core"
#'* `MajorRNA_` : RNA sequence of the most abundant sRNA in the cluster
#'* `RPM_` : reads per million
#'* `FPKM_` : Fragments Per Kilobase of transcript per Million mapped reads (only if option activated)
#'
#'  
#'**For mRNAseq:**
#'A dataframe where rows represent genes and columns represent replicate 
#'information extracted from HTseq result. Replicate information includes Counts 
#'and FPKM.  For example, for a sample called "Sample1", the columns will 
#'include Count_Sample1, and FPKM_Sample1. 
#'
#'The breakdown of each column:
#'* `mRNA` : Name of mRNA  
#'* `Locus`: Genomic loci of mRNA
#'* `chr` : Chromosome 
#'* `start` : start coordinate
#'* `end` : end coordinate
#'* `width`: width in nucleotides of regions
#'* `Count_` : number of uniquely aligned mRNA-seq reads that overlap the locus
#'* `FPKM_` : Fragments Per Kilobase of transcript per Million mapped reads 
#'
#'
#' @examples
#' \dontrun{
#' # import sRNAseq data
#' df_sRNA <- RNAimport(input = "sRNA",
#'                      directory = "./analysis/sRNA_mapping_results",
#'                      samples = c("heterograft_1", "heterograft_2",
#'                      "heterograft_3","selfgraft_1" , "selfgraft_2" ,
#'                      "selfgraft_3"))
#'
#'
#'# The output of this function can be explored in the data object sRNA_data
#' data("sRNA_data")
#' head(sRNA_data)
#'
#'
#' # import mRNAseq data
#' df_mRNA <- RNAimport(input = "mRNA",
#'                      directory = "./analysis/mRNA_mapping_results",
#'                      samples = c("heterograft_1", "heterograft_2",
#'                      "heterograft_3","selfgraft_1" , "selfgraft_2" ,
#'                      "selfgraft_3"), 
#'                      annotation = "./merged_annotation.gff3")
#'
#'
#'}
#'
#' @export
#' @importFrom data.table data.table
#' @importFrom data.table setnames
#' @importFrom utils read.table
#' @importFrom data.table fread
#' @importFrom dplyr mutate
#' @importFrom dplyr across
#' @importFrom dplyr contains
#' @importFrom dplyr rename
#' @importFrom tidyr replace_na
#' @importFrom data.table :=
#' @importFrom dplyr %>%
#' @importFrom dplyr filter
#' @importFrom dplyr if_any
#' @importFrom dplyr where
#' @importFrom stats setNames
#' @importFrom utils flush.console
RNAimport <- function(input = c("sRNA", "mRNA"),
                      directory,
                      samples,
                      analysisType = "mobile",
                      annotation,
                      idattr = "Name",
                      FPKM = FALSE,
                      featuretype = "mRNA") {
  # Import per-sample "Results.txt" files found at
  # <directory>/<sample>/Results.txt and combine them into one data frame:
  #   * input = "sRNA": one row per sRNA cluster (Locus) with Count_,
  #     DicerCall_, MajorRNA_, RPM_ (and optionally FPKM_) columns per sample.
  #   * input = "mRNA": one row per feature with annotation coordinates plus
  #     Count_ and FPKM_ columns per sample and a SampleCounts column.
  # Rows without any counts across the samples are dropped in both modes.
  # Stops with an error when a required argument is missing/invalid, when a
  # "Results.txt" file cannot be found, or when a file lacks required columns.

  # ---- Argument validation ----
  # The length checks keep `||` from being handed a vector, which is an error
  # in recent R versions rather than the intended message.
  if (base::missing(input) || length(input) != 1L ||
      !input %in% c("sRNA", "mRNA")) {
    stop("Please state the data-type to the `input` paramter.")
  }
  if (base::missing(directory)) {
    stop("Please specify an accessable directory where files are stored")
  }
  if (base::missing(samples) || length(samples) == 0L) {
    stop("Please specify a vector storing sample names matching files")
  }

  # Build the per-sample "Results.txt" paths (named by sample) and fail early
  # with a readable message when any of them does not exist.
  locate_results <- function() {
    sample_names <- unique(as.character(samples))
    paths <- file.path(directory, sample_names, "Results.txt")
    names(paths) <- sample_names
    absent <- paths[!file.exists(paths)]
    if (length(absent) > 0L) {
      stop("Cannot find the following file(s): ",
           paste(absent, collapse = ", "))
    }
    paths
  }

  # ---- sRNA import ----
  if (input == "sRNA") {
    if (is.null(analysisType) || length(analysisType) != 1L ||
        !analysisType %in% c("core", "mobile")) {
      stop("Please state the sRNA analysis type; whether it is the 'core' or 'mobile'.")
    }
    result_files <- locate_results()
    total_files <- length(result_files)

    # Load sample data as a list of data.tables, named by sample.
    message("Importing sRNA data into R...")
    dt_list <- stats::setNames(vector("list", total_files),
                               names(result_files))
    for (file_n in seq_along(result_files)) {
      # showProgress is passed per call so the user's global options are
      # left untouched.
      dt_list[[file_n]] <- data.table::fread(result_files[[file_n]],
                                             header = TRUE,
                                             showProgress = FALSE)
      message("\r", "---Importing file ", file_n, " of ", total_files,
              appendLF = FALSE)
    }
    # Remove any hashtags from the header - added by ShortStack.
    dt_list <- lapply(dt_list, function(x) {
      stats::setNames(x, gsub("#", "", names(x), fixed = TRUE))
    })

    # Check every sample for the columns that are actually used below.
    message(" \nChecking data content...")
    required_cols <- c("Locus", "Length", "DicerCall", "Reads", "MajorRNA")
    if (analysisType == "mobile") {
      required_cols <- c(required_cols, "UniqueReads")
    }
    for (sample_name in names(dt_list)) {
      absent_cols <- setdiff(required_cols, colnames(dt_list[[sample_name]]))
      if (length(absent_cols) > 0L) {
        stop("Sample data frame does not contain all required columns: ",
             paste(absent_cols, collapse = ", "), ". ",
             "Make sure there is not a hashtag or similar in the header ",
             "line of the input file(s). Sample: ", sample_name)
      }
    }
    message("---Data content is correct.")

    # Union of the loci seen in any sample; one row per locus.
    all_loci <- unlist(lapply(dt_list, function(x) x[["Locus"]]),
                       use.names = FALSE)
    loci_all <- data.table::data.table(Locus = unique(all_loci))

    if (analysisType == "core") {
      message("---Core sRNA analysis selected, Counts represent all counts")
    } else {
      message("---Mobile sRNA analysis selected, Counts represent unique mappers")
    }

    # Add each sample's values to `loci_all`, matching rows on Locus.
    join_cols <- "Locus"
    for (sample_name in names(dt_list)) {
      dt <- dt_list[[sample_name]]
      # "core" counts every aligned read, "mobile" only the unique mappers.
      if (analysisType == "core") {
        dt_agg <- dt[, .(Width = sum(Length),
                         Count = sum(Reads),
                         DicerCall = as.character(DicerCall),
                         MajorRNA = as.character(MajorRNA)),
                     by = join_cols]
      } else {
        dt_agg <- dt[, .(Width = sum(Length),
                         Count = sum(UniqueReads),
                         DicerCall = as.character(DicerCall),
                         MajorRNA = as.character(MajorRNA)),
                     by = join_cols]
      }
      # Reads per million, relative to this sample's total count.
      total_counts <- sum(dt_agg$Count)
      dt_agg$RPM <- (dt_agg$Count / total_counts) * 1e6
      col_names <- paste0(c("Count_", "DicerCall_", "MajorRNA_", "RPM_"),
                          sample_name)
      if (isTRUE(FPKM)) {
        dt_agg$FPKM <-
          (dt_agg$RPM / (dt_agg$Width / 1000)) / sum(dt_agg$RPM) * 1e6
        col_names <- c(col_names, paste0("FPKM_", sample_name))
      }
      # Column order of dt_agg is Locus, Width, Count, DicerCall, MajorRNA,
      # RPM (, FPKM), which is the order the new names are given in.
      data.table::setnames(dt_agg, c("Locus", "Width", col_names))
      # Update-join: copy the sample columns onto the matching loci.
      loci_all[dt_agg, on = join_cols, (col_names) := mget(col_names)]
    }

    # Loci absent from a sample: numeric columns become 0, labels become "N".
    # ShortStack's "*" placeholder is also converted to "N".
    res_data <- as.data.frame(loci_all) %>%
      dplyr::mutate(dplyr::across(
        dplyr::contains(c("Count_", "RPM_", "FPKM_")),
        ~ tidyr::replace_na(.x, 0)
      )) %>%
      dplyr::mutate(dplyr::across(
        dplyr::contains(c("DicerCall_", "MajorRNA_")),
        ~ tidyr::replace_na(.x, "N")
      )) %>%
      dplyr::mutate(dplyr::across(
        dplyr::where(is.character),
        ~ ifelse(.x == "*", "N", .x)
      ))

    # Split the Locus string ("chr:start-end") into three columns.
    locus_parts <- strsplit(res_data$Locus, split = ":", fixed = TRUE)
    range_parts <- strsplit(vapply(locus_parts, "[[", character(1), 2),
                            split = "-", fixed = TRUE)
    locus_cols <- data.frame(
      chr = vapply(locus_parts, "[[", character(1), 1),
      start = vapply(range_parts, "[[", character(1), 1),
      end = vapply(range_parts, "[[", character(1), 2)
    )
    df_final <- data.frame(Locus = res_data$Locus,
                           locus_cols,
                           res_data[, -1, drop = FALSE],
                           check.names = FALSE)
    # Order by chromosome, then name the clusters in that order.
    df_final <- df_final[order(df_final$chr), ]
    cluster_names <- paste0("cluster_", seq_len(nrow(df_final)))
    # check.names = FALSE keeps the sample names exactly as supplied, even
    # when they are not syntactic R names (e.g. contain "-").
    df_final <- as.data.frame(append(df_final, list(Cluster = cluster_names),
                                     after = 4),
                              check.names = FALSE)
    # Remove rows with no counts in any sample.
    count_columns <- grep("^Count_", names(df_final), value = TRUE)
    filtered_data <- df_final[rowSums(df_final[count_columns]) > 0, ]
    message("\n---Complete!")
    return(filtered_data)
  }

  # ---- mRNA import ----
  if (input == "mRNA") {
    if (base::missing(annotation)) {
      stop("Please specify a accessable path to a GFF file")
    }
    result_files <- locate_results()
    total_files <- length(result_files)

    message("Importing mRNA data into R:")
    sample_data <- stats::setNames(vector("list", total_files),
                                   names(result_files))
    for (file_n in seq_along(result_files)) {
      counts <- data.table::fread(result_files[[file_n]], header = TRUE)
      # Each file must hold exactly a feature column and a count column;
      # checked before renaming so a malformed file gives this message.
      if (ncol(counts) != 2L) {
        stop("mRNA dataset(s) does not contain required columns")
      }
      data.table::setnames(counts, c("mRNA", "Count"))
      # Drop the summary rows whose identifier starts with "__"
      # (presumably the HTSeq special counters - confirm against the
      # pre-processing output).
      sample_data[[file_n]] <- counts[!grepl("^__", counts$mRNA), ]
      message("\r", "---Importing file ", file_n, " of ", total_files, "\n",
              appendLF = FALSE)
    }

    # Union of the features seen in any sample (as in the sRNA branch);
    # samples lacking a feature get a zero count further down.
    all_genes <- unlist(lapply(sample_data, function(x) x[["mRNA"]]),
                        use.names = FALSE)
    genes_all <- data.table::data.table(mRNA = unique(all_genes))

    # Add locus, width etc. from the annotation for the requested feature type.
    annotation_file <- rtracklayer::import(annotation)
    genes_info <- as.data.frame(subset(annotation_file, type == featuretype))
    Locus <- paste0(genes_info$seqnames, ":", genes_info$start, "-",
                    genes_info$end)
    genes_info <- cbind(Locus, genes_info)
    colnames(genes_info)[colnames(genes_info) %in% idattr] <- "mRNA"
    if (!"mRNA" %in% colnames(genes_info)) {
      stop("The annotation does not contain the attribute given to `idattr`: ",
           paste(idattr, collapse = ", "))
    }

    # Merge the feature list with the annotation and keep the needed columns.
    merged_gene_info <- merge(genes_all, genes_info, by = "mRNA",
                              all.x = TRUE)
    keep_cols <- c("mRNA", "Locus", "seqnames", "start", "end", "width",
                   "strand", "type")
    merged_gene_info <- merged_gene_info[, keep_cols, with = FALSE]
    data.table::setnames(merged_gene_info, "seqnames", "chr")
    gene_widths <- merged_gene_info$width

    # Add one Count_<sample> column per sample.
    for (sample_name in names(sample_data)) {
      # Sum within the sample itself so that a feature ID occurring more
      # than once in the annotation cannot multiply its count.
      matches_values <- sample_data[[sample_name]][, .(Count = sum(Count)),
                                                   by = "mRNA"]
      col_name <- paste0("Count_", sample_name)
      data.table::setnames(matches_values, c("mRNA", col_name))
      merged_gene_info[matches_values, on = "mRNA",
                       (col_name) := mget(col_name)]
    }

    # Features missing from a sample get a zero count.
    mRNA_information <- as.data.frame(merged_gene_info) %>%
      dplyr::mutate(dplyr::across(dplyr::contains("Count_"),
                                  ~ tidyr::replace_na(.x, 0)))

    # FPKM per sample: 1e9 * count / feature width / total sample count.
    # A sample without any counts yields a full-length column of zeros.
    count_cols <- setdiff(names(mRNA_information),
                          c("mRNA", "Locus", "chr", "start", "end", "width",
                            "strand", "type"))
    fpkm <- lapply(count_cols, function(col) {
      x <- as.numeric(mRNA_information[[col]])
      sum_x <- sum(x)
      if (sum_x == 0) {
        rep(0, length(x))
      } else {
        10^9 * x / as.numeric(gene_widths) / sum_x
      }
    })
    names(fpkm) <- paste0("FPKM_", sub("^Count_", "", count_cols))
    # check.names = FALSE keeps FPKM_ suffixes identical to the Count_ ones.
    fpkm <- as.data.frame(fpkm, check.names = FALSE)
    mRNA_information <- cbind(mRNA_information, fpkm)

    # Number of samples with a non-zero count, per feature.
    t <- grep("^Count_", names(mRNA_information), value = TRUE)
    mRNA_information$SampleCounts <-
      as.integer(rowSums(mRNA_information[t] > 0))

    # Remove features without counts in any sample.
    mRNA_information <-
      mRNA_information[mRNA_information$SampleCounts != 0, , drop = FALSE]
    rownames(mRNA_information) <- NULL
    message("---Complete!")
    return(as.data.frame(mRNA_information))
  }
}

