\name{getSNPlocs}

\alias{getSNPcount}
\alias{.loadLoc}
\alias{.loadAlleles}
\alias{getSNPlocs}
\alias{rsid2loc}
\alias{rsid2alleles}
\alias{rsidsToGRanges}


\title{Accessing the SNPs stored in SNPlocs.Hsapiens.dbSNP144.GRCh38}

\description{
  Functions for accessing the SNPs stored in the
  \pkg{SNPlocs.Hsapiens.dbSNP144.GRCh38} package.

  WARNING: All the functions described in this man page are defunct
  and will be removed at some point in the future.
  See \code{?\link[BSgenome]{snpcount}} in the \pkg{BSgenome} software
  package for the new preferred way to access the data stored in this
  package.
}

\usage{
## Count the SNPs on each chromosome, and load all the SNPs for a given chromosome:
getSNPcount()
getSNPlocs(seqname, as.GRanges=FALSE, caching=TRUE)

## Extract SNP information for a set of rs ids:
rsid2loc(rsids, caching=TRUE)
rsid2alleles(rsids, caching=TRUE)
rsidsToGRanges(rsids, caching=TRUE)
}

\arguments{
  \item{seqname}{
    The name of the sequence for which to get the SNP locations
    and alleles.

    If \code{as.GRanges} is \code{FALSE}, only one sequence can
    be specified (i.e. \code{seqname} must be a single string).
    If \code{as.GRanges} is \code{TRUE}, an arbitrary number of
    sequences can be specified (i.e. \code{seqname} can be
    a character vector of arbitrary length).
  }
  \item{as.GRanges}{
    \code{TRUE} or \code{FALSE}. If \code{TRUE}, then the SNP locations
    and alleles are returned in a \link[GenomicRanges]{GRanges} object.
    Otherwise (the default), they are returned in a data frame (see below).
  }
  \item{caching}{
    \code{TRUE} or \code{FALSE}. Should the loaded SNPs be cached in
    memory for faster subsequent retrieval, at the cost of increased
    memory usage?
  }
  \item{rsids}{
    A vector of rs ids. Can be an integer or character vector, with or
    without the \code{"rs"} prefix. NAs are not allowed.
  }
}

\details{
  See \link{SNPlocs.Hsapiens.dbSNP144.GRCh38} for general information
  about this package.

  The SNP data are split by chromosome (1-22, X, Y, MT), i.e. the
  package contains one data set per chromosome, each of them being a
  serialized data frame with 1 row per SNP and the 2 following columns:
  \itemize{
    \item \code{loc}: The 1-based location of the SNP relative to the
          first base at the 5' end of the plus strand of the reference
          sequence.
    \item \code{alleles}: A raw vector with no NAs which can be
          converted into a character vector containing the alleles
          for each SNP represented by an IUPAC nucleotide ambiguity
          code (see \code{?\link[Biostrings]{IUPAC_CODE_MAP}} in the
          \pkg{Biostrings} package for more information).
  }

  Note that these data sets are not intended to be used directly.
  Instead, use the \code{getSNPcount} and \code{getSNPlocs}
  convenience wrappers for loading the SNP data. When used with
  \code{as.GRanges=FALSE} (the default), \code{getSNPlocs} returns
  a data frame with 1 row per SNP and the 3 following columns:
  \itemize{
    \item \code{RefSNP_id}: RefSNP ID (aka "rs id") with \code{"rs"}
          prefix removed. Character vector with no NAs and no duplicates.
    \item \code{alleles_as_ambig}: A character vector with no NAs
          containing the alleles for each SNP represented by an IUPAC
          nucleotide ambiguity code.
    \item \code{loc}: Same as for the 2-col serialized data frame
          described previously.
  }
}

\value{
  \code{getSNPcount} returns a named integer vector containing the number
  of SNPs for each sequence in the reference genome.

  By default (\code{as.GRanges=FALSE}), \code{getSNPlocs} returns the
  3-col data frame described above containing the SNP data for the
  specified chromosome.
  Otherwise (\code{as.GRanges=TRUE}), it returns a
  \link[GenomicRanges]{GRanges} object with extra columns
  \code{"RefSNP_id"} and \code{"alleles_as_ambig"}.
  Note that all the elements (genomic ranges) in this
  \link[GenomicRanges]{GRanges} object have their strand set
  to \code{"+"} and that all the sequence lengths are set to \code{NA}.

  \code{rsid2loc} and \code{rsid2alleles} both return a named vector
  (integer vector for the former, character vector for the latter)
  where each (name, value) pair corresponds to a supplied rs id.
  For both functions the name in (name, value) is the chromosome
  of the rs id. The value in (name, value) is the position of the rs id
  on the chromosome for \code{rsid2loc}, and a single IUPAC code
  representing the associated alleles for \code{rsid2alleles}.

  \code{rsidsToGRanges} returns a \link[GenomicRanges]{GRanges} object
  similar to the one returned by \code{getSNPlocs} (when used with
  \code{as.GRanges=TRUE}) and where each element corresponds to a
  supplied rs id.
}

\author{H. Pagès}

\seealso{
  \itemize{
    \item \code{\link[BSgenome]{snpcount}} in the \pkg{BSgenome} software
          package for the new preferred way to access the data stored in
          this package.

    \item \link{SNPlocs.Hsapiens.dbSNP144.GRCh38}
  }
}

\keyword{data}
