.DB_COUNT_COLS <- c("EuropeanDiscovery", "AfricanDiscovery",
    "EastAsianDiscovery", "IndianSouthAsianDiscovery",
    "HispanicDiscovery", "NativeDiscovery", "MicronesianDiscovery",
    "ArabMEDiscovery", "MixedDiscovery", "UnspecifiedDiscovery",
    "FilipinoDiscovery", "IndonesianDiscovery", "EuropeanReplication",
    "AfricanReplication", "EastAsianReplication",
    "IndianSouthAsianReplication", "HispanicReplication",
    "NativeReplication", "MicronesianReplication", "ArabMEReplication",
    "MixedReplication", "UnspecifiedReplication", "FilipinoReplication",
    "IndonesianReplication")

.DB_STUDY_COLS <- c("PMID", "LastCurationDate", "CreationDate",
    "DatePub", "Journal", "Title", "PaperPhenotypeDescription",
    "PaperPhenotypeCategories", "InNHGRIcat_3_31_12",
    "IncludesMaleFemaleOnlyAnalyses", "ExclusivelyMaleFemale")

.DB_VARIANT_COLS <- c("NHLBIkey", "PMID", "HUPfield",
    "SNPid_dbSNP134", "chr_hg19", "pos_hg19", "SNPidInPaper",
    "LocationWithinPaper", "Pvalue", "NegativeLog10PBin", "Phenotype",
    "PlatformSNPsPassingQC", "GWASancestryDescription",
    "InGene", "InLincRNA", "InMiRNA", "InMiRNABS", "dbSNPfxn",
    "dbSNPMAF", "dbSNPallelesHetSe", "dbSNPvalidation", "dbSNPClinStatus",
    "ORegAnno", "ConservPredTFBS", "HumanEnhancer", "RNAedit",
    "PolyPhen2", "SIFT", "LS_SNP", "UniProt", "EqtlMethMetabStudy",
    "DiscoverySampleDescription", "ReplicationSampleDescription")
        
.db_clean_colnames <-
    function(inp)
{
    colnames <- names(inp)
    map <- setNames(
        c("SNPid_dbSNP134", "chr_hg19", "pos_hg19", "SNPid.in.paper", 
          "InNHGRIcat_3_31_12", "Discovery.Sample.Description",
          "TotalSamples.discovery.replication", "LS_SNP"),
        c("SNPid.dbSNP134.", "chr.hg19.", "pos.hg19.", "SNPid.in.paper.",
          "InNHGRIcat.as.of.3.31.12.", "Initial.Sample.Description",
          "TotalSamples.discovery.replication.", "LS.SNP"))
    colnames[match(names(map), colnames)] <- unname(map)
    ## CamelCase
    gsub("[\\.]+([[:alpha:]])?", "\\U\\1", colnames, perl=TRUE)
}

.db_clean_chunk <-
    function(inp, colnames)
{
    ## FIXME
    ## 
    ## - Phenotype (and others?) contain CP1250-encoded characters
    ## - Phenotype contains case-only variants, e.g., 'Foo' and 'foo'
    ##     P = iconv(Phenotype, "CP1250", "UTF-8")
    ##     p = tolower(P)
    ##     Phenotype = P[match(p, p)]
    ## 
    NHLBIkey.bad <- c("2.36501E+14", "2.29412E+14")
    names(inp) <- colnames

    ## bin P values for easy look-up
    inp[["NegativeLog10PBin"]] <-
        as.integer(round(-log10(inp[["Pvalue"]] + .Machine$double.eps), 0))

    ## basic clean-up
    ridx <- inp[["NHLBIkey"]] %in% NHLBIkey.bad
    if (any(ridx))
        inp <- inp[!ridx,, drop=FALSE]

    inp[["CreationDate"]] <- ifelse(inp[["CreationDate"]] == "8/17/12",
                                    "8/17/2012", inp[["CreationDate"]])

    inp[["LastCurationDate"]] <-
        ifelse(inp[["LastCurationDate"]] == "8/17/12",
               "8/17/2012", inp[["LastCurationDate"]])

    inp[["HUPfield"]] <- ifelse(inp[["HUPfield"]] == "9/15/2014",
                                inp[["HUPfield"]], "1/1/2014")

    inp[["LocationWithinPaper"]] <- local({
        x <- inp[["LocationWithinPaper"]]
        x <- sub("^(Table|Figure|Full)([[:alnum:]])", "\\1 \\2", x)
        sub(" table ", " Table ", x)
    })

    map <- setNames(c(NA, FALSE, TRUE), c("", "NO", "YES"))
    inp[["dbSNPvalidation"]] <- unname(map[inp[["dbSNPvalidation"]]])

    inp[["dbSNPClinStatus"]] <- tolower(inp[["dbSNPClinStatus"]])

    inp
}

.db_write_variant <-
    function(db, inp)
{
    cidx <- match(.DB_VARIANT_COLS, names(inp))
    dbWriteTable(db, "variant", inp[, cidx, drop=FALSE], append=TRUE)
}

.db_write_count <-
    function(db, inp)
{
    cidx <- match(.DB_COUNT_COLS, names(inp))
    m <- as.matrix(inp[, cidx, drop=FALSE])
    idx <- !is.na(m)
    re <- "(.*)(Discovery|Replication)$"
    sample <- sub(re, "\\2", colnames(m))
    population <- sub(re, "\\1", colnames(m))
    df <- data.frame(NHLBIkey=inp$NHLBIkey[row(m)[idx]],
                     Sample=sample[col(m)[idx]],
                     Population=population[col(m)[idx]],
                     Count=m[idx],
                     stringsAsFactors=FALSE)
    dbWriteTable(db, "count", df, append=TRUE)
}

.db_accumulate_study <-
    function(study, inp)
{
    ridx <- (!inp[["PMID"]] %in% study[["PMID"]]) &
        (!duplicated(inp[["PMID"]]))
    cidx <- match(.DB_STUDY_COLS, names(inp))
    rbind(study, inp[ridx, cidx, drop=FALSE])
}

.db_index <-
    function(db)
{
    message("indexing")
    idx <- c(
        "CREATE INDEX PMID_index ON variant (PMID);",
        "CREATE INDEX study_PMID_index ON study (PMID);",
        "CREATE INDEX NHLBIkey_index ON variant (NHLBIkey);",
        "CREATE INDEX count_NHLBIkey_index ON count (NHLBIkey);",

        "CREATE INDEX Phenotype_index ON variant (Phenotype);",
        "CREATE INDEX dbSNPid_index ON variant (SNPid_dbSNP134);",
        "CREATE INDEX chr_pos_index ON variant (chr_hg19, pos_hg19);",
        "CREATE INDEX NegativeLog10PBin_index ON variant (NegativeLog10PBin);")
    for (i in idx) {
        message(i)
        dbSendQuery(db, i)
    }
    db
}

.db_create <-
    function(fname, dbname, block=1000000L)
{
    ## initial cleaning, write to SQL
    con <- file(fname, "rt")
    inp <- read.delim(con, header=TRUE, sep="\t", stringsAsFactors=FALSE,
                      nrows=block)
    colClasses <- sapply(inp, class)
    colnames <- .db_clean_colnames(inp)
    ## drop easily calculated 'totals' columns, empty 'NearestGene' column
    colidx <- -grep("(Total|NearestGene)", colnames)

    ## clean up and output to SQLite
    n <- 0L
    db <- dbConnect(dbDriver("SQLite"), dbname)
    study <- NULL
    repeat{
        inp <- .db_clean_chunk(inp, colnames)
        .db_write_variant(db, inp[, colidx, drop=FALSE])
        .db_write_count(db, inp[, colidx, drop=FALSE])
        study <- .db_accumulate_study(study, inp[, colidx, drop=FALSE])
        n <- n + nrow(inp)
        message(n)
        inp <- read.delim(con, header=FALSE, sep="\t",
                          stringsAsFactors=FALSE, col.names=colnames,
                          colClasses=colClasses, nrows=block)
        if (nrow(inp) == 0L)
            break
    }

    dbWriteTable(db, "study", study)
    .db_index(db)
    dbDisconnect(db)
    close(con)
    dbname
}
