## How it's built:
require(GEOquery)
hm450k <- getGEO('GPL13534')@dataTable@table
names(hm450k) <- toupper(names(hm450k))
hm27k <- getGEO('GPL8490')@dataTable@table
names(hm27k) <- toupper(names(hm27k))
hm450k$PLATFORM <- 'HM450'
hm450k$CHANNEL27 <- NA
hm450k$CHANNEL450 <- hm450k$COLOR_CHANNEL
hm450k$ADDRESSA_450 <- hm450k$ADDRESSA_ID
hm450k$ADDRESSB_450 <- hm450k$ADDRESSB_ID
hm450k$ADDRESSA_27 <- NA
hm450k$ADDRESSB_27 <- NA
hm27k$PLATFORM <- 'HM27'
hm450k$PLATFORM[ which(hm450k$NAME %in% hm27k$NAME) ] <- 'BOTH'
hm27k$PLATFORM[ which(hm27k$NAME %in% hm450k$NAME) ] <- 'BOTH'
hm27k$CHANNEL27 <- hm27k$COLOR_CHANNEL
hm27k$CHANNEL450 <- NA
hm27k$ADDRESSA_450 <- NA
hm27k$ADDRESSB_450 <- NA
hm27k$ADDRESSA_27 <- hm27k$ADDRESSA_ID
hm27k$ADDRESSB_27 <- hm27k$ADDRESSB_ID
hm27k$STRAND <- '*' ## this we do the brute-force way
hm27k$CHROMOSOME_36 <- hm27k$CHR
hm27k$COORDINATE_36 <- hm27k$MAPINFO
commonColumns <- intersect( names(hm450k), names(hm27k) )
rownames(hm450k) <- hm450k$ID
rownames(hm27k) <- hm27k$ID

## this makes it relatively painless to move from hg18 to hg19
df2GR <- function(df, keepColumns=FALSE, ignoreStrand=FALSE){ # {{{
  require(GenomicRanges)
  stopifnot(class(df) == "data.frame")
  subs <- list(chromStart='start', chromEnd='end', chrom='chr', seqnames='chr')
  for(s in names(subs)) names(df) = gsub(s, subs[[s]], names(df), ignore=TRUE)
  stopifnot(all(c("start", "end") %in% names(df)))
  if('genome' %in% names(attributes(df))) g <- attr(df, 'genome') else g <- NULL
  if(substr(df$chr, 1, 3)[1] != 'chr') df$chr <- paste('chr', df$chr, sep='')
  df <- subset(df, !is.na(start) & !is.na(end))
  if(!ignoreStrand && ("strand" %in% names(df))) {
    if(is.numeric(df$strand)) df$strand <- strandMe(df$strand)
    GR <- with(df, GRanges(chr, IRanges(start=start, end=end), strand=strand))
  } else {
    GR <- with(df, GRanges(chr, IRanges(start=start, end=end)))
  }
  if('name' %in% names(df)) {
    names(GR) <- df$name
    df$name <- NULL
  } else {
    names(GR) <- rownames(df)
  }
  if(keepColumns) {
    skipped = c("rangename","chr","start","end","width","strand")
    elementMetadata(GR) <- as(df[, setdiff(names(df), skipped), drop=F],
                              "DataFrame")
  }
  if('X' %in% names(elementMetadata(GR))) {
    if(all(is.na(GR$X))) {
      GR$X <- NULL
    } else {
      names(elementMetadata(GR))[which(names(elementMetadata(GR))=='X')]='score'
    }
  }
  if(!is.null(g)) genome(GR) <- g
  return(GR)
} # }}}

hm27k$start <- hm27k$end <- as.numeric(hm27k$MAPINFO)
hm27k$chrom <- hm27k$CHR
hm27k$name <- hm27k$ID 
hm27k.gr <- df2GR(hm27k)
genome(hm27k.gr) <- 'hg18'

kept <- match(rownames(hm27k)[which(hm27k$PLATFORM=='BOTH')],rownames(hm450k))
hm450k$CHANNEL27[ kept ] = hm27k$CHANNEL27[ which(hm27k$PLATFORM == 'BOTH') ]
hm450k$ADDRESSA_27[ kept ] = hm27k$ADDRESSA_27[which(hm27k$PLATFORM == 'BOTH')]
hm450k$ADDRESSB_27[ kept ] = hm27k$ADDRESSB_27[which(hm27k$PLATFORM == 'BOTH')]

hm27k.allProbes <- hm27k ## in case we need it later, if we screw up
hm27k <- hm27k[ which(hm27k$PLATFORM == 'HM27'), ] ## only old probes
keepColumns <- c('ADDRESSA_450','ADDRESSB_450', 'ADDRESSA_27', 'ADDRESSB_27',
                 'CHANNEL27','CHANNEL450', 'CHROMOSOME_36','COORDINATE_36',
                 'SOURCESEQ','STRAND','PLATFORM','NAME')
infMeth.hg18 <- rbind(hm450k[, keepColumns], hm27k[, keepColumns])


## note that SNP probes do not have MAPINFO; will fix this from dbSNP
noMap <- which(is.na(as.numeric(infMeth.hg18$COORDINATE_36)))
noMap.ids <- infMeth.hg18$NAME[ noMap ]
message(paste('Note: GEO is missing genomic coordinates for probes',
              paste(noMap.ids, collapse=', ')))
# data(SNPs.hg18) # extracted from SNAP
load("~/Dropbox/RDBMS/FDb.InfiniumMethylation.hg18/data/SNPs.hg18.rda")
mapped <- intersect(noMap.ids, names(SNPs.hg18))
map.ids <- match(mapped, rownames(infMeth.hg18))
infMeth.hg18$CHROMOSOME_36[map.ids] = 
  gsub('chr','', as.character(seqnames(SNPs.hg18[mapped])))
infMeth.hg18$COORDINATE_36[map.ids] = start(SNPs.hg18[mapped])
infMeth.hg18$STRAND[map.ids] = '*'
## 
## Zero out the rest
##
unmappable <- setdiff(noMap.ids, mapped)
unmap.ids <- match(unmappable, rownames(infMeth.hg18))
infMeth.hg18$COORDINATE_36[unmap.ids] <- 0

## chrMULTI or empty -> chrUn
##
infMeth.hg18$CHROMOSOME_36 <- gsub('MULTI','Un',
                                          infMeth.hg18$CHROMOSOME_36)
infMeth.hg18$CHROMOSOME_36 <- gsub('^$','Un',
                                          infMeth.hg18$CHROMOSOME_36)
droppedProbes <- which(infMeth.hg18$CHROMOSOME_36 == 'Un')
infMeth.hg18 <- infMeth.hg18[ -droppedProbes, ] 
##
## length(droppedProbes) == length(unmappable) ## TRUE 

##
## now build the GRanges that will become the FeatureDb:
##
## scaffolding: 
##
require(Biostrings)
require(GenomicRanges)
sourceSeq <- with(infMeth.hg18, DNAStringSet(SOURCESEQ))
gcContent <- round(letterFrequency(sourceSeq, letters='GC', as.prob=T), 2)
Infinium.GR.hg18 <- with(infMeth.hg18,
                 GRanges(paste0('chr', CHROMOSOME_36), 
                         IRanges(as.numeric(COORDINATE_36), width=1),
                         strand=as.factor(ifelse(STRAND == 'F','-','+')),
                         addressA_450=Rle(ADDRESSA_450),
                         addressB_450=Rle(ADDRESSB_450),
                         addressA_27=Rle(ADDRESSA_27),
                         addressB_27=Rle(ADDRESSB_27),
                         channel450=Rle(as.factor(CHANNEL450)),
                         channel27=Rle(as.factor(CHANNEL27)),
                         probeType=Rle(as.factor(substr(NAME, 1, 2))),
                         percentGC=as.vector(gcContent),
                         platform=Rle(as.factor(PLATFORM)),
                         sourceSeq=DNAStringSet(SOURCESEQ)
                        )
                 )
names(Infinium.GR.hg18) <- infMeth.hg18$NAME
both = setdiff(levels(values(Infinium.GR.hg18)$channel450)[1], c('Grn','Red'))
swap = which(levels(values(Infinium.GR.hg18)$channel450) == both)
levels(values(Infinium.GR.hg18)$channel450)[ swap ] <- 'Both'
genome(Infinium.GR.hg18) <- 'hg18'
gc(,T)

## for masking purposes, keep track of the 5' and 3' probe coordinates
##
values(Infinium.GR.hg18)$probeStart <- start(Infinium.GR.hg18)
values(Infinium.GR.hg18)$probeEnd <- end(resize(Infinium.GR.hg18, 50, 'end'))
values(Infinium.GR.hg18)$probeTarget <- start(Infinium.GR.hg18)
values(Infinium.GR.hg18)$probeExtension <- NA ## temporary
revMe = which(values(Infinium.GR.hg18)$probeStart == values(Infinium.GR.hg18)$probeEnd)
values(Infinium.GR.hg18[revMe])$probeStart=start(resize(Infinium.GR.hg18[revMe],49,'end'))
values(Infinium.GR.hg18[revMe])$probeEnd=end(resize(Infinium.GR.hg18[revMe],2,'start'))
extend = which(values(Infinium.GR.hg18)$probeEnd == start(Infinium.GR.hg18)+1)
values(Infinium.GR.hg18[extend])$probeExtension <- start(Infinium.GR.hg18[extend])+1
##
## e.g. for masking anything within the probe's target sequence:
##
## probeRanges <- GRanges( seqnames(Infinium.GR.hg18), 
##                        IRanges( values(Infinium.GR.hg18)$probeStart,
##                        values(Infinium.GR.hg18)$probeEnd ),
##                        strand='*' ) 


## fix stranding and check dinucleotide sequence in the reference genome
##
try(detach(package:BSgenome.Hsapiens.UCSC.hg19, unload=TRUE), silent=T)
require(BSgenome.Hsapiens.UCSC.hg18)
resizeToStart <- which(
  getSeq(Hsapiens, resize(Infinium.GR.hg18, 2, fix='start'), as.char=T) == 'CG'
)
resizeToEnd <- which(
  getSeq(Hsapiens, resize(Infinium.GR.hg18, 2, fix='end'), as.char=T) == 'CG'
)

## intersect(resizeToStart, resizeToEnd) ## == integer(0)

## Now fix the actual ranges so that the interrogated dinucleotide is reflected
##
Infinium.GR.hg18[resizeToStart] <- resize(Infinium.GR.hg18[resizeToStart], 2, fix='start')
Infinium.GR.hg18[ resizeToEnd ] <- resize(Infinium.GR.hg18[ resizeToEnd ], 2, fix='end')
restranded = length(resizeToEnd) + length(resizeToStart)
unstranded = length(Infinium.GR.hg18) - restranded # not bad! just CpHs + SNPs

## fix SNP probes (again)
##
start(Infinium.GR.hg18[ names(SNPs.hg18) ]) = start(SNPs.hg18)
width(Infinium.GR.hg18[ names(SNPs.hg18) ]) = 1
strand(Infinium.GR.hg18) = '*'

## It turns out that the HM27k chip DOES have SNP probes.  Label these:
##
## data(hm27.controls)
load("~/Dropbox/RDBMS/FDb.InfiniumMethylation.hg18/data/hm27.controls.rda")
## data(hm27.SNP.colors)
load("~/Dropbox/RDBMS/FDb.InfiniumMethylation.hg18/data/hm27.SNP.colors.rda")
hm27.SNP.controls <- hm27.controls[grep('^rs', hm27.controls$Name), ]
for(i in unique(hm27.SNP.controls$Name)) {
  i = gsub('_$','',i)
  if( i %in% names(Infinium.GR.hg18) & !is.na(hm27.SNP.colors[i]) ) {
    addresses = hm27.SNP.controls$Address[ grep(i, hm27.SNP.controls$Name) ]
    values(Infinium.GR.hg18[ i ])$addressA_27 = addresses[1]
    values(Infinium.GR.hg18[ i ])$addressB_27 = addresses[2]
    values(Infinium.GR.hg18[ i ])$channel27 = hm27.SNP.colors[i]
    values(Infinium.GR.hg18[ i ])$platform = 'BOTH'
  }
}
#Infinium.GR.hg18[na.omit(match(unique(hm27.SNP.controls$Name), names(Infinium.GR.hg18)))]

## Now order the probes based on their name
##
Infinium.GR.hg18 <- Infinium.GR.hg18[order(names(Infinium.GR.hg18))]
values(Infinium.GR.hg18)$name <- names(Infinium.GR.hg18)

## Create the FDb, and save it.
##
source('GenomicRangesToFeatureDb.R')
FDb.InfiniumMethylation.hg18 = GenomicRangesToFeatureDb(
  Infinium.GR.hg18, 
  URL='ftp://ftp.illumina.com', 
  tableName='InfiniumMethylation', 
  src='NCBI/GEO and dbSNP', 
  label='Illumina Infinium DNA methylation probes, aligned to hg18'
)
saveFeatures(FDb.InfiniumMethylation.hg18, 
             file='FDb.InfiniumMethylation.hg18.sqlite')

## to notify users at load time about dropped probes
##
save(unmappable, file="../../data/unmappable.rda")

## Now verify that it comes out the same as it went in (well, almost)
## The extra lines of code are, in my view, a bug in GenomicFeatures
## Also, the types of variables (Rle, numeric, etc.) are not retained
## Nonetheless, this is a pretty handy structure for storing the data.
##
FDb.InfiniumMethylation.hg18 <- loadDb('FDb.InfiniumMethylation.hg18.sqlite')
infi <- features(FDb.InfiniumMethylation.hg18) ## need to fetch features
meta <- metadata(FDb.InfiniumMethylation.hg18) ## need to fetch genome
genome(infi) <- meta[ which(meta[,'name'] == 'Genome'), 'value' ]
if(require(regulatoR)) infi <- addSeqinfo(infi)
show(FDb.InfiniumMethylation.hg18)
show(infi)
