% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/getGenesets.R
\name{getGenesets}
\alias{getGenesets}
\alias{get.go.genesets}
\alias{get.kegg.genesets}
\alias{parse.genesets.from.GMT}
\alias{showAvailableSpecies}
\alias{showAvailableCollections}
\alias{writeGMT}
\title{Definition of gene sets according to different sources}
\usage{
getGenesets(
  org,
  db = c("go", "kegg", "msigdb", "enrichr"),
  gene.id.type = "ENTREZID",
  cache = TRUE,
  return.type = c("list", "GeneSetCollection"),
  ...
)

showAvailableSpecies(db = c("go", "kegg", "msigdb", "enrichr"), cache = TRUE)

showAvailableCollections(
  org,
  db = c("go", "kegg", "msigdb", "enrichr"),
  cache = TRUE
)

writeGMT(gs, gmt.file)
}
\arguments{
\item{org}{An organism in (KEGG) three-letter code, e.g. \sQuote{hsa} for
\sQuote{Homo sapiens}. Alternatively, this can also be a text file storing 
gene sets in GMT format. See the description.}

\item{db}{Database from which gene sets should be retrieved. Currently, 
either 'go' (default), 'kegg', 'msigdb', or 'enrichr'.}

\item{gene.id.type}{Character. Gene ID type of the returned gene sets.
Defaults to \code{"ENTREZID"}. See \code{\link{idTypes}} for available
gene ID types.}

\item{cache}{Logical.  Should a locally cached version be used if available?
Defaults to \code{TRUE}.}

\item{return.type}{Character. Determines whether gene sets are returned
as a simple list of gene sets (each being a character vector of gene IDs), or
as an object of class \code{\linkS4class{GeneSetCollection}}.}

\item{...}{Additional arguments for individual gene set databases. 
For \code{db = "go"}:
\itemize{
\item onto: Character. Specifies one of the three GO ontologies: 'BP'
(biological process), 'MF' (molecular function), 'CC' (cellular component).
Defaults to 'BP'.
\item evid: Character. Specifies one or more GO evidence code(s) such as 
IEP (inferred from expression pattern) or TAS (traceable author statement).
Defaults to \code{NULL} which includes all annotations, i.e. does not filter by
evidence codes. See references for a list of available evidence codes. 
\item hierarchical: Logical. Incorporate hierarchical relationships between
GO terms ('is_a' and 'has_a') when collecting genes annotated to a GO term?
If set to \code{TRUE}, this will return all genes annotated to a GO term
\emph{or to one of its child terms} in the GO ontology.
Defaults to \code{FALSE}, which will then only collect genes directly
annotated to a GO term. 
\item mode: Character. Determines in which way the gene 
sets are retrieved. This can be either 'GO.db' or 'biomart'. 
The 'GO.db' mode creates the gene sets based on BioC annotation packages - 
which is fast, but does not necessarily represent the most up-to-date
mapping. In addition, this option is only
available for the currently supported model organisms in BioC.  The
'biomart' mode downloads the mapping from BioMart - which can be time
consuming, but allows selecting from a larger range of organisms and
contains the latest mappings.  Defaults to 'GO.db'.}
For \code{db = "msigdb"}: \itemize{ \item cat: Character. 
MSigDB collection category: 'H' (hallmark), 
'C1' (genomic position), 'C2' (curated databases), 'C3' (binding site motifs),
'C4' (computational cancer), 'C5' (Gene Ontology), 'C6' (oncogenic), 
'C7' (immunologic), 'C8' (cell type). Note that MSigDB has designated
collections for mouse named 'MH' (hallmark), 'M1' (genomic position),
'M2' (curated databases), and so on. See references.
\item subcat: Character. MSigDB collection subcategory. Depends on the
chosen MSigDB collection category. For example, 'MIR' to obtain microRNA targets
from the 'C3' collection. See references.}
For \code{db = "enrichr"}: \itemize{ \item lib: Character. Enrichr gene set 
library. For example, 'Genes_Associated_with_NIH_Grants' to obtain gene sets 
based on associations with NIH grants. See references.}}

\item{gs}{A list of gene sets (character vectors of gene IDs).}

\item{gmt.file}{Gene set file in GMT format. See the description.}
}
\value{
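For \code{getGenesets}: a list of gene sets (vectors of gene IDs), or an
object of class \code{\linkS4class{GeneSetCollection}} if
\code{return.type = "GeneSetCollection"}.

For \code{writeGMT}: none, writes to file.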

For \code{showAvailableSpecies} and \code{showAvailableCollections}: 
a \code{\linkS4class{DataFrame}}, displaying supported species and
available gene set collections for a gene set database of choice.
}
\description{
Functionality for retrieving gene sets for an organism under
investigation from databases such as GO and KEGG. Parsing and writing a list
of gene sets from/to a flat text file in GMT format is also supported.

The GMT (Gene Matrix Transposed) file format is a tab-delimited file format
that describes gene sets.  In the GMT format, each row represents a gene
set.  Each gene set is described by a name, a description, and the genes in
the gene set. See references.
}
\examples{

    # (1) Typical usage for gene set enrichment analysis with GO:
    # Biological process terms based on BioC annotation (for human)
    go.gs <- getGenesets(org = "hsa", db = "go")
    
    # equivalent to:
    # go.gs <- getGenesets(org = "hsa", db = "go", onto = "BP", mode = "GO.db")
    \donttest{
    # Alternatively:
    # downloading from BioMart 
    # this may take a few minutes ...
    go.gs <- getGenesets(org = "hsa", db = "go", mode = "biomart")

    # list supported species for obtaining gene sets from GO 
    showAvailableSpecies(db = "go")
    }
    # (2) Defining gene sets according to KEGG  
    kegg.gs <- getGenesets(org = "hsa", db = "kegg")
    \donttest{
    # list supported species for obtaining gene sets from KEGG 
    showAvailableSpecies(db = "kegg")

    # (3) Obtaining *H*allmark gene sets from MSigDB
    hall.gs <- getGenesets(org = "hsa", db = "msigdb", cat = "H")

    # list supported species for obtaining gene sets from MSigDB
    showAvailableSpecies(db = "msigdb")

    # list available gene set collections in the MSigDB
    showAvailableCollections(org = "mmu", db = "msigdb") 

    # (4) Obtaining gene sets from Enrichr
    tfppi.gs <- getGenesets(org = "hsa", db = "enrichr", 
                            lib = "Transcription_Factor_PPIs")

    # list supported species for obtaining gene sets from Enrichr
    showAvailableSpecies(db = "enrichr")

    # list available Enrichr gene set libraries
    showAvailableCollections(org = "hsa", db = "enrichr")        
    }
    # (5) parsing gene sets from GMT
    gmt.file <- system.file("extdata/hsa_kegg_gs.gmt",
                            package = "EnrichmentBrowser")
    gs <- getGenesets(gmt.file)     
    
    # (6) writing gene sets to file
    writeGMT(gs, gmt.file)

}
\references{
GO: \url{http://geneontology.org/}

GO evidence codes: \url{http://geneontology.org/docs/guide-go-evidence-codes/}

KEGG Organism code: \url{http://www.genome.jp/kegg/catalog/org_list.html}

MSigDB: \url{http://software.broadinstitute.org/gsea/msigdb/collections.jsp}

Enrichr: \url{https://maayanlab.cloud/Enrichr/#stats}

GMT file format:
\url{http://www.broadinstitute.org/cancer/software/gsea/wiki/index.php/Data_formats}
}
\seealso{
the \code{GO.db} package for GO2gene mapping used in
'GO.db' mode, and the biomaRt package for general queries to BioMart. 

\code{\link{keggList}} and \code{\link{keggLink}} for accessing the KEGG REST
server.

\code{msigdbr::msigdbr} for obtaining gene sets from the MSigDB.
}
\author{
Ludwig Geistlinger
}
