% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/add_reanno.R
\name{add_reanno}
\alias{add_reanno}
\title{Generates biotype annotation from reannotation object}
\usage{
add_reanno(
  reanno,
  mismatches = 0,
  type = "genome",
  bio_search,
  bio_perfect = FALSE,
  genome_max = 10,
  merge_pac = NULL
)
}
\arguments{
\item{reanno}{A reannotation list object generated by
\code{\link{make_reanno}}, with an Overview data.frame and a Full_anno list.}

\item{mismatches}{Integer indicating the number of mismatches that should be
reported. Can never have a higher number than was originally specified with
the \code{\link{map_reanno}} function. While default=0, reporting the
maximum number is recommended. Note that the mismatch information is only
added to the report. Further classification can be generated with the
\code{\link{simplify_reanno}} function.}

\item{type}{Character indicating what type of mapping that has been performed. If
type="genome", the reanno object is expected to have been mapped against a
genome or larger fasta references like chromosomes. This may include
coordinates as controlled by \code{\link{map_reanno}} and the
\code{\link{import_reanno}} functions. See \code{genome_max} on how to
control how coordinates are reported. If coordinates are missing, only hit or
no hit will be reported. If type="biotype", then coordinates are not
expected. Reference hits will instead be classified according to the biotype
search terms in \code{bio_search}.}

\item{bio_search}{List of character vectors indicating search terms targeting
feature names in the original reference fasta files. These search terms will
be parsed to \code{\link{grepl}} as regular expressions. List names must
match names of the references in the reanno object. Classifications will be
reported with reference name + search term (e.g. "Ensembl_trna").}

\item{bio_perfect}{Logical whether the function should allow that not all hits
against the references must have a unique bio_search term. If perfect=FALSE
(default) all references hits that was not caught by a search term will be
annotated as reference + other (e.g. "Ensembl_other"). In case perfect=TRUE,
the function will throw an error if the search terms do not catch all reference 
hits. Can be used to ensure that all hits receive a classification.}

\item{genome_max}{Integer or character indicating the number of maximum
coordinates to be reported when type="genome". If the number of hits
exceeds \code{genome_max} it will be indicated in the classification and
only the first hits up to \code{max_hits} will be reported. This is useful to
handling sequences that multimap. If genome_max="all", all coordinates will
be reported which may dramatically affect performance. (default=10)}

\item{merge_pac}{PAC object. If a PAC object is provided in merge_pac, then
the function will automatically merge the Anno table of the PAC object with
the newly generated annotations. As default, merge_pac=NULL will return
a tibble data.frame instead.}
}
\value{
Data frame with mismatch and coordinate or biotype information (search
 term hits) that can directly be added to an Anno data.frame of a PAC object,
 given that the PAC object was used in the reannotation workflow. Annotations
 can be further simplified using the \code{\link{simplify_reanno}} function.
}
\description{
\code{add_reanno} adds biotypes from reannotation to PAC.
}
\details{
Given a reanno object generated by \code{\link{make_reanno}} this function
will either extract genome coordinates or classify biotypes using a list of
search terms (see details). Information will be compiled into a data frame with
the same order of sequences as in the original PAC master file.

Please rename the column names of your previously added columns if running 
\code{\link{map_reanno}} more than two times using the same \code{type} input 
("genome" or "biotype").
}
\examples{

######################################################### 
##### Example for reference mapping and classification 
closeAllConnections()
######################################################### 
##### Create an reanno object

### First, if you haven't already generated Bowtie indexes for the included
# fasta references you need to do so. If you are having problem see the small
# RNA guide (vignette) for more info.
                                                             
 ## tRNA:
 trna_file <- system.file("extdata/trna", "tRNA.fa", 
                          package = "seqpac", mustWork = TRUE)
 trna_dir <- dirname(trna_file)
 
 if(!sum(stringr::str_count(list.files(trna_dir), ".ebwt")) ==6){     
     Rbowtie::bowtie_build(trna_file, 
                           outdir=trna_dir, 
                           prefix="tRNA", force=TRUE)
                           }
 ## rRNA:
 rrna_file <- system.file("extdata/rrna", "rRNA.fa", 
                          package = "seqpac", mustWork = TRUE)
 rrna_dir <- dirname(rrna_file)
 
 if(!sum(stringr::str_count(list.files(rrna_dir), ".ebwt")) ==6){
     Rbowtie::bowtie_build(rrna_file, 
                           outdir=rrna_dir, 
                           prefix="rRNA", force=TRUE)
                           }
                           
                           
##  Then load a PAC-object and remove previous mapping from anno:
 load(system.file("extdata", "drosophila_sRNA_pac_filt_anno.Rdata", 
                   package = "seqpac", mustWork = TRUE))
 anno(pac) <- anno(pac)[,1, drop = FALSE]
 
 ref_paths <- list(trna= trna_file, rrna= rrna_file)

                                    
##  You may add an output path of your choice, but here we use a temp folder:
 output <- paste0(tempdir(),"/seqpac/test")


##  Then map the PAC-object against the fasta references. 
# Warning: if you use your own data, you may want to use override=FALSE, to avoid
# deleting previous mapping by mistake.

map_reanno(pac, input=ref_paths, output=output,
               type="internal", mismatches=0,  import="biotype", 
               threads=2, keep_temp=FALSE, override=TRUE)
 
##  Then import and generate a reanno-object of the temporary bowtie-files
reanno_biotype <- make_reanno(output, PAC=pac, mis_fasta_check = TRUE)
 
                                    
## Now make some search terms against reference names to create shorter names
# Theses can be used to create factors in downstream analysis
# One search hit (regular expressions) gives one new short name 

bio_search <- list(
              rrna=c("5S", "5.8S", "12S", "16S", "18S", "28S", "pre_S45"),
              trna =c("_tRNA", "mt:tRNA"))
 
 
## You can merge directly with your PAC-object by adding your original 
# PAC-object, that you used with map_reanno, to merge_pac option.

pac <- add_reanno(reanno_biotype, bio_search=bio_search, 
                       type="biotype", bio_perfect=FALSE, 
                       mismatches = 0, merge_pac=pac)

table(anno(pac)$mis0_bio) 

############################################################################ 
## Similarly, you can use Bowtie indexed genome fasta references
## But don't forget to use import="genome" for coordinate import
#   
# ref_paths <- list(genome="<path_to_bowtie_indexed_fasta>")
# output_genome <- "<your_path_to_output_folder>"
#
## We can actually map against several genomes simultaneously:
# (to illustrate the principle we run the mycoplasma genome twice)
# ref_paths <- list(genome1=mycoplasma_file, genome2=mycoplasma_file)
# ### Run map_reanno 
# map_reanno(PAC=pac, input=ref_paths, output=output, 
#            type="internal", mismatches=0, import="genome", 
#            threads=2, keep_temp=TRUE, override=TRUE)
# reanno_genome <- make_reanno(output, PAC=pac, mis_fasta_check = TRUE)
############################################################################
#### The trick to succeed with bio_perfect=TRUE 

## Run add_reanno with bio_perfect="FALSE" (look where "Other=XX" occurs)
# Use the bio
#' ## Add a search terms that catches the other rrna 

anno <- add_reanno(reanno_biotype, bio_search=bio_search, 
                   type="biotype", bio_perfect=FALSE, mismatches = 0)

## Find sequences that has been classified as other
other_seqs  <- anno[grepl("other", anno$mis0),]$seq
tab <- full(reanno_biotype)$mis0$trna
tab[tab$seq \%in\% other_seqs,]         #No other hit in trna

tab <- full(reanno_biotype)$mis0$rrna
tab[tab$seq \%in\% other_seqs,]         #A few in rrna


## Add a search terms that catches the other rrna 
bio_search <- list(
              rrna=c("5S", "5.8S", "12S", "16S", 
                     "18S", "28S", "pre_S45", "Other"),
              trna =c("_tRNA", "mt:tRNA"))
                
anno <- add_reanno(reanno_biotype, bio_search=bio_search, 
                   type="biotype", bio_perfect=FALSE, mismatches = 0)

table(anno$mis0)                 

## Repeat search until no "Other" appear when running add_reanno, 
## then run  bio_perfect=TRUE: 

anno <- add_reanno(reanno_biotype, bio_search=bio_search,  
                   type="biotype", bio_perfect=TRUE, mismatches = 0)
                   



}
\seealso{
\url{http://bowtie-bio.sourceforge.net/index.shtml} for information
 about Bowtie and \url{https://github.com/OestLab/seqpac} for updates on the
 current package.

Other PAC reannotation: 
\code{\link{PAC_annotate}()},
\code{\link{as.reanno}()},
\code{\link{import_reanno}()},
\code{\link{make_conv}()},
\code{\link{make_reanno}()},
\code{\link{map_reanno}()},
\code{\link{simplify_reanno}()}
}
\concept{PAC reannotation}
