\name{SNPlocs.Hsapiens.dbSNP144.GRCh37}
\docType{package}

\alias{SNPlocs.Hsapiens.dbSNP144.GRCh37-package}
\alias{SNPlocs.Hsapiens.dbSNP144.GRCh37}

\alias{COMPATIBLE_BSGENOMES}


\title{The SNPlocs.Hsapiens.dbSNP144.GRCh37 package}

\description{
  SNP positions and alleles for Homo sapiens extracted
  from NCBI dbSNP Build 144. The source data files used for this
  package were created by NCBI on May 29-30, 2015, and contain SNPs
  mapped to reference genome GRCh37.p13 (a patched version of GRCh37
  that doesn't alter chromosomes 1-22, X, Y, MT).
}

\details{
  SNPs from dbSNP were filtered to keep only those satisfying the
  following 3 criteria:
  \enumerate{
    \item The SNP is a single-base substitution i.e. its class is \emph{snp}.
          Other classes supported by dbSNP are: \emph{in-del},
          \emph{heterozygous}, \emph{microsatellite}, \emph{named-locus},
          \emph{no-variation}, \emph{mixed}, and
          \emph{multinucleotide-polymorphism}.
          SNPs of these other classes are NOT included in
          \pkg{SNPlocs.Hsapiens.dbSNP144.GRCh37}
          but are available in the separate
          \pkg{XtraSNPlocs.Hsapiens.dbSNP144.GRCh37} package.

    \item The SNP is marked as notwithdrawn.

    \item A \emph{single} position on the reference genome (GRCh37.p13)
          is reported for the SNP, and this position is on chromosome
          1-22, X, Y, or MT.
  }

  SNPlocs packages always store the alleles corresponding to the \emph{plus}
  strand, whatever the strand reported by dbSNP is (which is achieved by
  storing the complement of the alleles reported by dbSNP for SNPs located
  on the minus strand).
  In other words, in a SNPlocs package, all the SNPs are considered to be
  on the plus strand and everything is reported with respect to that strand. 
}

\note{
  WARNING: The SNPs in this package are mapped to reference genome GRCh37.p13.
  Note that the GRCh37.p13 genome is a patched version of GRCh37. However, the
  patch doesn't alter chromosomes 1-22, X, Y, MT.
  GRCh37 itself is the same as the hg19 genome from UCSC \emph{except} for the
  mitochondrion chromosome. Therefore, the SNPs in this package can be
  "injected" in BSgenome.Hsapiens.UCSC.hg19 and will land at the correct
  position but this injection will exclude chrM (i.e. nothing will be
  injected in that sequence).

  See \code{?\link[BSgenome]{injectSNPs}} in the \pkg{BSgenome} software
  package for more information about the SNP injection mechanism.

  See \url{https://www.ncbi.nlm.nih.gov/assembly/GCF_000001405.25/}
  for more information about the GRCh37.p13 assembly.

  See \url{http://genome.ucsc.edu/cgi-bin/hgGateway?db=hg19}
  for the UCSC Genome Browser based on the hg19 assembly.
  Note that chromosomes 1-22, X, and Y in hg19 and GRCh37.p13 are the same
  except that they are named differently (no \code{chr} prefix in GRCh37.p13).
}

\references{
  SNP Home at NCBI:
  \url{https://www.ncbi.nlm.nih.gov/snp}

  dbSNP Human BUILD 144 announcement:
  \url{https://www.ncbi.nlm.nih.gov/mailman/pipermail/dbsnp-announce/2015q2/000163.html}
}

\author{H. Pagès}

\seealso{
  \itemize{
    \item The \pkg{XtraSNPlocs.Hsapiens.dbSNP144.GRCh37} package for SNPs of
          class other than \emph{snp}.

    \item \code{\link[BSgenome]{snpcount}} in the \pkg{BSgenome} software
          package for how to access the data stored in this package.

    \item \code{\link[Biostrings]{IUPAC_CODE_MAP}} in the \pkg{Biostrings}
          package.

    \item The \code{\link[GenomicRanges]{GRanges}} class in the
          \pkg{GenomicRanges} package.

    \item \code{\link[BSgenome]{injectSNPs}} in the \pkg{BSgenome} software
          package for SNP injection.

    \item The \pkg{VariantAnnotation} software package to annotate variants
          with respect to location and amino acid coding.
  }
}

\examples{
## ---------------------------------------------------------------------
## A. BASIC USAGE
## ---------------------------------------------------------------------
## Bind the object named after this package to a shorter name. It is
## reused in all the sections below.
snps <- SNPlocs.Hsapiens.dbSNP144.GRCh37
## See ?snpcount in the BSgenome package for details about this call:
snpcount(snps)

## Get the positions and alleles of all SNPs on chromosome 22. Sequence
## names follow GRCh37.p13, that is, they have no "chr" prefix. The result
## is kept in chr22_snps because section D below uses it again:
chr22_snps <- snpsBySeqname(snps, "22")
chr22_snps

## Get the positions and alleles of all SNPs on chromosomes 22 and MT
## (several sequence names can be passed at once as a character vector):
snpsBySeqname(snps, c("22", "MT"))

## ---------------------------------------------------------------------
## B. EXTRACT SNP INFORMATION FOR A SET OF RS IDS
## ---------------------------------------------------------------------
## The rs ids to look up, given as a character vector:
my_rsids <- c("rs2639606", "rs75264089", "rs73396229", "rs55871206",
              "rs10932221", "rs56219727", "rs73709730", "rs55838886",
              "rs3734153", "rs79381275", "rs1516535")
## Look them up in the snps object created in section A:
my_snps <- snpsById(snps, my_rsids)
my_snps

## Translate the IUPAC ambiguity codes used to represent the alleles
## into nucleotides. The codes are stored in the alleles_as_ambig metadata
## column and IUPAC_CODE_MAP comes from the Biostrings package:
IUPAC_CODE_MAP[mcols(my_snps)$alleles_as_ambig]

## ---------------------------------------------------------------------
## C. INJECTION IN THE REFERENCE GENOME
## ---------------------------------------------------------------------
## As explained in the Note section of this man page, the SNPs in this
## package can be injected in the UCSC hg19 genome, but nothing is
## injected in chrM.
library(BSgenome.Hsapiens.UCSC.hg19)
genome <- BSgenome.Hsapiens.UCSC.hg19
genome

## The SNPlocs package is specified by name (a character string) here.
## See ?injectSNPs in the BSgenome package for the injection mechanism:
genome2 <- injectSNPs(genome, "SNPlocs.Hsapiens.dbSNP144.GRCh37")
genome2  # note the additional line "with SNPs injected from..."

## Compare the letter frequencies of chromosome 22 before and after the
## injection. Note the "chr" prefix used by hg19 sequence names (chr22),
## unlike the GRCh37.p13 names used in section A ("22"):
alphabetFrequency(genome$chr22)
alphabetFrequency(genome2$chr22)

## Get the number of nucleotides that were modified by this injection:
neditAt(genome$chr22, genome2$chr22)  # 1835910

## ---------------------------------------------------------------------
## D. SOME BASIC QUALITY CONTROL (WITH SURPRISING RESULTS!)
## ---------------------------------------------------------------------
## This section reuses chr22_snps from section A and genome from section C.

## Note that dbSNP can assign distinct ids to SNPs located at the same
## position:
any(duplicated(mcols(chr22_snps)$RefSNP_id))  # rs ids are all distinct...
any(duplicated(chr22_snps))  # but some positions are repeated!

## Indices of the first 2 SNPs that repeat the position of an earlier SNP,
## followed by a look at the SNPs around the first of them:
which(duplicated(chr22_snps))[1:2]  # 14, 20
chr22_snps[12:15]  # rs2186463 and rs146752890 share the same position
                   # (16050612) and alleles (S, i.e. C/G)

## Also note that not all SNP alleles are consistent with the GRCh37
## genomic sequences, that is, the alleles reported for a given SNP are
## not necessarily compatible with the nucleotide found at the SNP
## position in GRCh37. For example, to get the number of inconsistent
## SNPs on chr1:
chr1_snps <- snpsBySeqname(snps, "1")
chr1_alleles <- mcols(chr1_snps)$alleles_as_ambig
## Concatenate the one-letter allele codes into a single DNAString (one
## letter per SNP) so it can be compared with the reference nucleotides:
chr1_alleles <- DNAString(paste(chr1_alleles, collapse=""))
nchar(chr1_alleles)  # 10608552 SNPs on chr1
## Compare the reference nucleotide found at each SNP position with the
## corresponding allele code. NOTE(review): fixed=FALSE presumably makes
## an IUPAC ambiguity letter match any of the nucleotides it stands for;
## confirm with ?neditAt in the Biostrings package.
neditAt(genome$chr1[pos(chr1_snps)], chr1_alleles, fixed=FALSE)
## ==> 5724 SNPs (0.054\%) are inconsistent with GRCh37 chr1!
}

\keyword{package}
