.loadLibrary <- function(x) {
	if(suppressWarnings(require(x , character.only = TRUE , quietly=TRUE))){
    	print( paste(x , "is loaded correctly") )
	} else {
    	
    	tryCatch( {
    			print( paste("Trying to install from CRAN:" , x) )
    			install.packages(x) 
    			}
    			, error=function() {
    				print("The library is not on CRAN")
    				print("Trying to install from Bioconductor")
    				source("http://bioconductor.org/biocLite.R")
					biocLite(x , suppressUpdates=TRUE , suppressAutoUpdate=TRUE)
    			}
    			, warning=function(w) {
    				if( grepl('not available' , w) ) {
	    				print("The library is not on CRAN")
	    				print("Trying to install from Bioconductor")
	    				source("http://bioconductor.org/biocLite.R")
						biocLite(x , suppressUpdates=TRUE , suppressAutoUpdate=TRUE)
    				} else {
    				print(w)
    				}}
    			)
    	if(require(x , character.only = TRUE , quietly=TRUE)){
        	print( paste(x , "installed and loaded") )
    	} else {
       		stop( paste("Could not install" , x) )
    	}
	}
}

.loadLibrary("cgdsr")
.loadLibrary("org.Hs.eg.db")
.loadLibrary("parallel")
.loadLibrary("stringr")
.loadLibrary("data.table")
.loadLibrary("stringr")
.loadLibrary("RCurl")
.loadLibrary("XML")


##############################################################
############# UNIPROT PARSER SECOND VERSION ##################
##############################################################
# FROM THIS FILE WE ARE GOING TO OBTAIN ALL THE FASTA SEQUENCES OF ALL THE PROTEIN IN UNIPROT
	
download.file("ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/proteomes/HUMAN.fasta.gz" , "HUMAN.fasta.gz")

	#R can read the file without uncompress it (cooool!!)
hs_fasta <- readLines("HUMAN.fasta.gz")
	
	#Tell me the lines of the protein entries
fasta_lines <- grep(">" , hs_fasta)
	#Create a table with entries and corresponding fasta (one line format)
fasta_list <- lapply(1:length(fasta_lines) , function(i) {
											if (i!=length(fasta_lines)) {
												entry <- hs_fasta[ fasta_lines[i] ]
												if ( (fasta_lines[i] + 1)==(fasta_lines[i+1]-1) ) {
													fasta <- hs_fasta[ fasta_lines[i] + 1 ]
												} else {
													fasta <- paste( hs_fasta[ (fasta_lines[i]+1):(fasta_lines[i+1]-1) ] , collapse="")
												}
											} else {
												entry <- hs_fasta[ fasta_lines[i] ]
												fasta <- paste( hs_fasta[ (fasta_lines[i]+1):length(hs_fasta) ] , collapse="")
											}
											return(c( entry , fasta) )

										})
fasta_df <- do.call("rbind" , fasta_list)
	
	#Split by pipe , OS (Organism)
fasta_df_name <- strsplit(fasta_df[ , 1] , split="\\|| OS=")
fasta_df_name <- t(sapply(fasta_df_name , function(x) {
											entry <- x[2]
											uniprot_and_name <- strsplit(x[3] , " ")[[1]]
											uniprot <- uniprot_and_name[1]
											name <- paste(uniprot_and_name[-1] , collapse=" ")
											return(c(entry , uniprot , name))
	}))
	#add fasta sequence
fasta_final <- cbind( fasta_df_name , fasta_df[ , 2] )
colnames(fasta_final) <- c("Entry" , "UNIPROT" , "Protein.name" , "AMINO_SEQ")


##############################################################
###################	HGNC PARSER ##############################
##############################################################
# FROM THIS PARSING WE OBTAIN ALL THE GENE SYMBOLS, HGNC IDs AND ENTREZ ASSOCIATED WITH UNIPROT PROTEIN NAMES

download.file("ftp://ftp.ebi.ac.uk/pub/databases/genenames/hgnc_complete_set.txt.gz" , "hgnc_complete_set.txt.gz")
hugo_custom <- read.table("hgnc_complete_set.txt.gz" , header=TRUE , sep="\t" , quote="" , comment.char="" , as.is=TRUE)
hugo_custom <- droplevels( hugo_custom[hugo_custom$Status=="Approved" , ] )
colnames(hugo_custom) <- c("HGNC"
							,"Approved_Symbol"
							,"Approved_Name"
							,"Status"
							,"Locus_Type"
							,"Locus_Group"
							,"Previous_Symbols"
							,"Previous_Names"
							,"Synonyms"
							,"Name_Synonyms"
							,"Chromosome"
							,"Date_Approved"
							,"Date_Modified"
							,"Date_Symbol_Changed"
							,"Date_Name_Changed"
							,"Accession_Numbers"
							,"Enzyme_IDs"
							,"Entrez_Gene_ID"
							,"Ensembl_Gene_ID"
							,"Mouse_Genome_Database_ID"
							,"Specialist_Database_Links"
							,"Specialist_Database_IDs"
							,"Pubmed_IDs"
							,"RefSeq_IDs"
							,"Gene_Family_Tag"
							,"Gene_family_description"
							,"Record_Type"
							,"Primary_IDs"
							,"Secondary_IDs"
							,"CCDS_IDs"
							,"VEGA_IDs"
							,"Locus_Specific_Databases"
							,"GeneID"
							,"OMIM_ID_NCBI"
							,"RefSeq_NCBI"
							,"UniProt_chosenByHGNC"
							,"Ensembl_ID_Ensembl"
							,"UCSC_ID_UCSC"
							,"Mouse_Genome_Database_ID_MGI"
							,"Rat_Genome_Database_ID_RGD"
							)

confr <- hugo_custom$Entrez_Gene_ID!=hugo_custom$'GeneID'
confr <- ifelse( is.na(confr) , FALSE , confr)
message( "We found a discrepancy between the HGNC Entrez (Entrez_Gene_ID) and the Entrez provided by NCBI (GeneID)")
print( hugo_custom[ confr , c("HGNC" , "Approved_Symbol" , "GeneID" , "Entrez_Gene_ID")] )

hugo_custom$GeneID <- ifelse( is.na(hugo_custom$GeneID) & !is.na(hugo_custom$Entrez_Gene_ID) , hugo_custom$Entrez_Gene_ID , hugo_custom$GeneID )

colsToKeep <- c("HGNC" 
				,"Approved_Symbol" 
				,"GeneID" 
				,"UniProt_chosenByHGNC" 
				,"Approved_Name" 
				,"Synonyms_and_Previous_Symbols" 
				,"Chromosome"
				)

hugo_custom$Synonyms_and_Previous_Symbols <- with(hugo_custom , paste(Previous_Symbols , Synonyms , sep=", ") )
hugo_custom$Synonyms_and_Previous_Symbols <- ifelse(hugo_custom$Synonyms_and_Previous_Symbols==", " , "" , hugo_custom$Synonyms_and_Previous_Symbols)
hugo_custom$Synonyms_and_Previous_Symbols <- ifelse( grepl("^, " , hugo_custom$Synonyms_and_Previous_Symbols) , sub(", " , "" , hugo_custom$Synonyms_and_Previous_Symbols) , hugo_custom$Synonyms_and_Previous_Symbols)
hugo_custom$Synonyms_and_Previous_Symbols <- ifelse( grepl(", $" , hugo_custom$Synonyms_and_Previous_Symbols) , gsub(", $" , "" , hugo_custom$Synonyms_and_Previous_Symbols) , hugo_custom$Synonyms_and_Previous_Symbols)
hugo_custom_red <- hugo_custom[ , colsToKeep]


	# take id mapping file as suggested by cbioportal
download.file("ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/HUMAN_9606_idmapping.dat.gz" , "HUMAN_9606_idmapping.dat.gz")
idmap2 <- read.table("HUMAN_9606_idmapping.dat.gz" , sep="\t" , as.is=TRUE)
colnames(idmap2) <- c("UniProtKB_AC"
					,"ID_type"
					,"ID"
					)
	# take out the reduntant entries
idmap2 <- droplevels(idmap2[ !grepl("-" , idmap2$UniProtKB_AC) , ])
idmap2_red <- droplevels( unique( idmap2[ grepl("UniProtKB-ID|HGNC|GeneID" , idmap2$ID_type) , ] ) )


	# create the second type of protein annotation (uniprotKB_ID)
idmap2_red_UniProtKB_ID <- data.frame( UniProtKB_AC=idmap2_red[ idmap2_red$ID_type=="UniProtKB-ID", "UniProtKB_AC" ]
										,UniProtKB_ID=idmap2_red[ idmap2_red$ID_type=="UniProtKB-ID", "ID" ]
										,stringsAsFactors=FALSE)
	# merge with the second type of protein annotation and then merge with HGNC
idmap2_red_merge <- merge(idmap2_red[ idmap2_red$ID_type=="GeneID" | idmap2_red$ID_type=="HGNC" ,  ] 
						, idmap2_red_UniProtKB_ID , by="UniProtKB_AC" 
						, all.x=TRUE)
idmap2_hugo <- merge(idmap2_red_merge , hugo_custom_red , by.x="ID" , by.y="HGNC" , all.x=TRUE)


idmap2_hugo_HGNC <- droplevels( idmap2_hugo[ idmap2_hugo$ID_type=="HGNC", ] )
idmap2_hugo_HGNC$ID_type <- NULL
colnames(idmap2_hugo_HGNC) <- c("HGNC"
								,"Entry"
								,"UNIPROT"
								,"Gene_Symbol"
								,"Entrez"
								,"UniProt_chosenByHGNC"
								,"Approved_Name"
								,"Synonyms_and_Previous_Symbols"
								,"Chromosome"
								)
idmap2_hugo_HGNC <- droplevels(idmap2_hugo_HGNC[ !is.na(as.character(idmap2_hugo_HGNC$Gene_Symbol)) | 
													as.character(idmap2_hugo_HGNC$Gene_Symbol)!="", ])



##############################################################
############# MERGING UNIPROT FASTA AND HGNC #################
##############################################################

	#we get rid of the UNIPROT column because is redundant. We use Entry ids as match
uniprot <- fasta_final[ , c(1,3,4)]
uni <- merge(idmap2_hugo_HGNC , uniprot , by="Entry" , all.x=TRUE)
uni <- uni[ !is.na(uni$AMINO_SEQ) , ]
for(i in colnames(uni)){
	print(class(uni[ , i]))
	if(class(uni[ , i])=="factor")
		uni[,i] <- as.character(uni[,i])

}

##############################################################
###################### CBIOPORTAL ANNOTATION #################
##############################################################

		# change Entrez column type to character
uni$'Entrez' <- as.character(uni$'Entrez')

	# find out proteins without a valid Entrez ID
entrezAvailable <- !is.na(uni$'Entrez')

	# Interrogate org.Hs db
x <- org.Hs.egSYMBOL2EG
	# Get the entrez gene identifiers that are mapped to a gene symbol
mapped_genes <- mappedkeys(x)
	# Convert to a list
xx <- as.list(x[mapped_genes])

	# change Entrez column type to character
entrezNOTAvailable <- !entrezAvailable
missingEntrez <- uni[entrezNOTAvailable , 'Gene_Symbol']
imputation <- lapply( xx[missingEntrez] , function(x) if(is.null(x)) return(NA) else return(x) )
names(imputation) <- missingEntrez
uni[entrezNOTAvailable , 'Entrez'] <- unlist(imputation)
	

	# filter out
entrezAvailable <- !is.na(uni$'Entrez')
nFiltered <- length(which(!entrezAvailable))
if(nFiltered > 0) {
	message( paste( nFiltered, 'proteins were filtered because they lack a valid Entrez ID. The proteins are:' ) )
	print( uni[!entrezAvailable, c('UNIPROT','Gene_Symbol')] )
}

#uni_original <- uni
uni <- uni[entrezAvailable, ]

eid2uniprot <- split(uni, uni$'Entrez')

eid2nprot <- sapply(eid2uniprot, nrow)
eidNotUnique <-  eid2nprot > 1
eidUnique <- !eidNotUnique
eid2cbpMatch <- as.list(rep(NA, length(eid2uniprot)))
names(eid2cbpMatch) <- names(eid2uniprot)
eid2cbpMatch[which(eidUnique)] <- sapply(eid2uniprot[eidUnique], function(x) as.character(x$UNIPROT))

genes2test <- names(eid2cbpMatch[eidNotUnique])
nOfLoads <- 1000
loadBalance <- ceiling(1:length(genes2test)/nOfLoads)
splittedGenes <- split(genes2test, loadBalance)
	
	# Function to retrive info from cBioportal
cores <- ifelse(Sys.info()['sysname']=="Windows" , 1 , 2)
mycgds <- CGDS("http://www.cbioportal.org/public-portal/")
all_cancer_studies <- getCancerStudies(mycgds)[,c(1,2)]
getGeneMutations <- function(gene, parallelize=FALSE)
{
	if(parallelize) applyfun <- mclapply else applyfun <- lapply
	out <- do.call('rbind', applyfun(all_cancer_studies[,1] , function(i) 
		{
			geneticProfile <- getGeneticProfiles(mycgds, i)[ ,c(1:2)]
			sel <- geneticProfile$genetic_profile_name=="Mutations"
			geneticProfile <- geneticProfile[sel, 1]
			caseList <- getCaseLists(mycgds, i)
			sel <- caseList$case_list_name=="Sequenced Tumors"
			caseListID <- caseList[sel, 1]					
			muts <- getMutationData( mycgds 
				, caseList=caseListID 
				, geneticProfile=geneticProfile 
				, genes=gene)
			letter <- substr(muts$amino_acid_change,1,1)
			position <- as.numeric(as.character(str_extract(
				string=muts$amino_acid_change,pattern="\\d+")))
			out <- data.frame(
				gene_id=muts$entrez_gene_id
				, gene_name=muts$gene_symbol
				, amino_acid_letter=letter
				, amino_acid_position=position
				)
			return(out)
		}))
	out <- unique(out)
	ord <- order(out[,2])
	out <- out[ord,]
	ix <- !is.na(out$amino_acid_position)
	return(out[ix,])
}

	# Function to interrogate the whole cBioportal and find the best gene-protein match
results <- lapply(splittedGenes, function(testGenes)
{
	t <- system.time(mut <- getGeneMutations(testGenes))
	print(t)
	mut_split <- split(mut, with(mut, gene_id))
	#mut_split <- mut
	seq_split <- eid2uniprot[testGenes]

	out <- lapply(testGenes, function(gene){
		seq_gene <- seq_split[[gene]]
		#mut_gene <- mut_split[ mut_split$gene_name==gene , ]
		mut_gene <- mut_split[[gene]]
		if(is.null(mut_gene)) return('not mutated')
		#if(nrow(mut_gene)==0) return('not mutated')
		matchpercs <- lapply(seq_gene$AMINO_SEQ, function(x) {
			x <- gsub(' ', '', x)
			x <- strsplit(x,'')[[1]]
			match <- x[mut_gene$amino_acid_position] == 
				mut_gene$amino_acid_letter
			length(which(match))/length(match)*100
			})
		names(matchpercs) <- seq_gene$UNIPROT
		matchpercs <- unlist(matchpercs)
		maxperc <- max(matchpercs)
		matchpercs <- matchpercs[matchpercs==maxperc]
		return(round(matchpercs))
	})
	names(out) <- testGenes
	return(out)
} 
#, mc.cores=cores
)

#save(results , file="results_for_debugging_v3.Rdata")

for(i in 1:length(splittedGenes)) 
	eid2cbpMatch[names(results[[i]])] <- results[[i]]

resolvedGenes <- sapply(eid2cbpMatch, length) == 1 & sapply(eid2cbpMatch, is.numeric)
resolvedGenes <- names(eid2cbpMatch[resolvedGenes])

eid2cbpMatch[resolvedGenes] <- sapply(eid2cbpMatch[resolvedGenes], names)
stopifnot( identical( names(eid2uniprot), names(eid2cbpMatch) ))

#save(eid2cbpMatch, file='eid2cbpMatch_v3.Rdata')

notResolvedGenes <- sapply(eid2cbpMatch, length) > 1 | sapply(eid2cbpMatch, function(x) x[1] == 'not mutated')
notResolvedGenes <- names(eid2cbpMatch[notResolvedGenes])

######################################################
################## FINAL FILTER ######################
######################################################

# LIST OF FILTERS
	# 1. Single gene-protein match or mutations that best fit to the protein fasta sequence (applied above)
	# 2. If a uniprot ID has Gene_Symbol = UNIPROT id, we select that transcript (like LCE6A and LCEA6_HUMAN)
	# 3. If all the entries have 'Fragment' except one, we choose this transcript	
	# 4. If for a given gene, there is only one reviewed transcript, we choose that transcript
	# 5. If for a given gene, HGNC chose a single protein, we trust their choice
	# 6. We accept a partial matching with a Levensthein distance <= 3 (e.g. TP53 and P53_HUMAN , distance=1) 
			#and in case of ties, the isoform 1 among them
	# 7. If there is still more than one match, we choose the longest transcript
	# 8. If none of the above methods works, we choose the first protein in alphabetical order

uni_plus_split <- eid2uniprot[notResolvedGenes]

message( paste("We have to find a 1 gene 1 protein match to" , length(uni_plus_split) , "genes") )

	# 2. If a uniprot ID has Gene_Symbol = UNIPROT id, we select that transcript (like LCE6A and LCEA6_HUMAN)
uni_plus_split2 <- lapply( uni_plus_split , function(df) {
													gene_prot_eq <- with(df , Gene_Symbol==gsub("_HUMAN" , "" , UNIPROT) )
													df2 <- df[ gene_prot_eq , ]
													if (nrow(df2)==1) {
														return(droplevels(df2))
													} else {
														return(droplevels(df))
													}
											})
n_match <- sapply(uni_plus_split2 , nrow)
uni_plus_split2_NOTsolved <- uni_plus_split2[ n_match!=1 ]
uni_plus_split2_solved <- as.data.frame(rbindlist(uni_plus_split2[ n_match==1 ]))

message( paste( "We solved" , nrow(uni_plus_split2_solved) , "Genes") )
message( paste( "We didn't solve" , length(uni_plus_split2_NOTsolved) , "Genes") )


solvedGenes <- uni_plus_split2_solved$'Entrez'
solvedProteins <- uni_plus_split2_solved$UNIPROT
eid2cbpMatch[solvedGenes] <- solvedProteins
#save(eid2cbpMatch, file='FilteringPart_v3.Rdata')

rm(uni_plus_split , n_match , uni_plus_split2)
gc()

	# 3. If all the entries have 'Fragment' except one, we choose that transcript
uni_plus_split3 <- lapply( uni_plus_split2_NOTsolved , function(df) {
															NO_fragment <- !grepl( "fragment" , as.character(df$Protein.name) , ignore.case=TRUE)
															df3 <- df[NO_fragment , ]
															if(nrow(df3)==1) {
																return(droplevels(df3))	
															} else {
																if(nrow(df3)==0) {
																	return(droplevels(df)) 
																} else {
																	return(droplevels(df3))
																}
															}
														})


n_match <- sapply(uni_plus_split3 , nrow)
uni_plus_split3_NOTsolved <- uni_plus_split3[ n_match!=1 ]
uni_plus_split3_solved <- as.data.frame(rbindlist(uni_plus_split3[ n_match==1 ]))

message( paste( "We solved" , nrow(uni_plus_split3_solved) , "Genes") )
message( paste( "We didn't solve" , length(uni_plus_split3_NOTsolved) , "Genes") )

solvedGenes <- uni_plus_split3_solved$'Entrez'
solvedProteins <- uni_plus_split3_solved$UNIPROT
eid2cbpMatch[solvedGenes] <- solvedProteins
#save(eid2cbpMatch, file='FilteringPart_v3.Rdata')

rm(uni_plus_split3 , n_match)
gc()

	# 4. If for a given gene, there is only one reviewed transcript, we choose that transcript. Otherwise, we filter out all the not reviewed transcript
		# This tba delimited file is obtained from uniprot website with this filter ' reviewed:yes AND organism:"Homo sapiens (Human) [9606]" '. downloaded as tab delimited 
download.file("http://www.uniprot.org/uniprot/?query=reviewed%3ayes+AND+organism%3a%22Homo+sapiens+(Human)+%5b9606%5d%22&force=yes&format=tab&columns=id,entry%20name,reviewed,protein%20names,genes,organism,length" , "uniprot_homo_reviewed.txt")
uni_rew <- read.table('uniprot_homo_reviewed.txt', sep='\t', quote='', comment.char='', header=TRUE , as.is=TRUE)
colnames(uni_rew) <- c("Entry"
						,"UNIPROT"
						,"Status"
						,"Protein.name"
						,"Gene_Symbol"
						,"Organism"
						,"Length"
						)
#uni_rew_forNOTsolved <- uni_rew[ uni_rew$Gene_Symbol %in% names(uni_plus_split3_NOTsolved) , ]

uni_plus_split4 <- lapply( uni_plus_split3_NOTsolved , function(df) {
															gene <- as.character(unique(df$Gene_Symbol))
															rew <- uni_rew[ uni_rew$Gene_Symbol==gene , ]
															if( nrow(rew)==1 ) {
																uniprot <- unique( as.character(rew$UNIPROT) )
																df4 <- df[ df$UNIPROT==uniprot , ]
																return(droplevels(df4))
															} else {
																if( nrow(rew)==0 ) {
																	return(droplevels(df))
																} else {
																	uniprot <- unique( as.character(rew$UNIPROT) )
																	df4 <- df[ df$UNIPROT %in% uniprot , ]																
																	return(droplevels(df4))
																}
															}
															})

n_match <- sapply(uni_plus_split4 , nrow)
uni_plus_split4_NOTsolved <- uni_plus_split4[ n_match!=1 ]
uni_plus_split4_solved <- as.data.frame(rbindlist(uni_plus_split4[ n_match==1 ]))

message( paste( "We solved" , nrow(uni_plus_split4_solved) , "Genes") )
message( paste( "We didn't solve" , length(uni_plus_split4_NOTsolved) , "Genes") )


solvedGenes <- uni_plus_split4_solved$'Entrez'
solvedProteins <- uni_plus_split4_solved$UNIPROT
eid2cbpMatch[solvedGenes] <- solvedProteins
#save(eid2cbpMatch, file='FilteringPart_v3.Rdata')

rm(uni_plus_split4 , n_match)
gc()



	# 5. If for a given gene, HGNC chose a single protein, we trust their choice
uni_plus_split5 <- lapply( uni_plus_split4_NOTsolved , function(df) {
															prot <- as.character(unique(df$UniProt_chosenByHGNC))
															if( prot=="" | is.na(prot) | length(prot)!=1 | !(prot %in% as.character(df$Entry)) ) {
																return(droplevels(df))
															} else {
																df5 <- df[ df$Entry==prot , ]
																return(droplevels(df5))
															}
															})

n_match <- sapply(uni_plus_split5 , nrow)
uni_plus_split5_NOTsolved <- uni_plus_split5[ n_match!=1 ]
uni_plus_split5_solved <- as.data.frame(rbindlist(uni_plus_split5[ n_match==1 ]))

message( paste( "We solved" , nrow(uni_plus_split5_solved) , "Genes") )
message( paste( "We didn't solve" , length(uni_plus_split5_NOTsolved) , "Genes") )


solvedGenes <- uni_plus_split5_solved$'Entrez'
solvedProteins <- uni_plus_split5_solved$UNIPROT
eid2cbpMatch[solvedGenes] <- solvedProteins
#save(eid2cbpMatch, file='FilteringPart_v3.Rdata')

rm(uni_plus_split5 , n_match)
gc()



	# 6. We accept a partial matching with a Levensthein distance <= 3 (e.g. TP53 and P53_HUMAN , distance=1) and in case of ties, the isoform 1 among them
uni_plus_split6 <- lapply( uni_plus_split5_NOTsolved , function(df) {
															Leven <- adist( as.character(unique(df$Gene_Symbol)) , gsub("_HUMAN" , "", df$UNIPROT) )
															if( min(Leven)<=3 ) {
																Leven_bool <- Leven==min(Leven)
																df6 <- df[ Leven_bool , ]
																if ( nrow(df6)==1 ) {
																	return(df6)
																} else {
																	df6_2 <- df6[ grepl("isoform 1|isoforms 1|isoform A|isoforms A" , df6$Protein.name) , ]
																	if ( nrow(df6_2)==1 ) {
																		return(df6_2)
																	} else {
																		return(df6)
																	}
																}
															} else {
																return( df )
															}
														})

n_match <- sapply(uni_plus_split6 , nrow)
uni_plus_split6_NOTsolved <- uni_plus_split6[ n_match!=1 ]
uni_plus_split6_solved <- as.data.frame(rbindlist(uni_plus_split6[ n_match==1 ]))

message( paste( "We solved" , nrow(uni_plus_split6_solved) , "Genes") )
message( paste( "We didn't solve" , length(uni_plus_split6_NOTsolved) , "Genes") )


solvedGenes <- uni_plus_split6_solved$'Entrez'
solvedProteins <- uni_plus_split6_solved$UNIPROT
eid2cbpMatch[solvedGenes] <- solvedProteins
#save(eid2cbpMatch, file='FilteringPart_v3.Rdata')

rm(n_match , uni_plus_split6)
gc()


	# 7. If there is still more than one match, we choose the longest transcript

		#nchar works just with single string so we create a vectorized version that works with vectors of strings
nchar_v <- Vectorize(function(x) nchar(as.character(x)))
uni_plus_split7 <- lapply( uni_plus_split6_NOTsolved , function(df) {
															df7 <- df[ nchar_v( df$AMINO_SEQ )==max( nchar_v( df$AMINO_SEQ ) ) , ]
															if (nrow(df7)==1) {
																return( droplevels(df7) )
															} else {
																return( df7 )
															}
														})

n_match <- sapply(uni_plus_split7 , nrow)
uni_plus_split7_NOTsolved <- uni_plus_split7[ n_match!=1 ]
uni_plus_split7_solved <- as.data.frame(rbindlist(uni_plus_split7[ n_match==1 ]))

message( paste( "We solved" , nrow(uni_plus_split7_solved) , "Genes") )
message( paste( "We didn't solve" , length(uni_plus_split7_NOTsolved) , "Genes") )


solvedGenes <- uni_plus_split7_solved$'Entrez'
solvedProteins <- uni_plus_split7_solved$UNIPROT
eid2cbpMatch[solvedGenes] <- solvedProteins
#save(eid2cbpMatch, file='FilteringPart_v3.Rdata')

rm(uni_plus_split7 , n_match)
gc()


	# 8. If none of the above methods works, we choose the first protein in alphabetical order
uni_plus_split8 <- lapply( uni_plus_split7_NOTsolved , function(df) {
															first_uniprot <- sort( as.character(df$UNIPROT) )[1]
															df7 <- df[ df$UNIPROT==first_uniprot , ]
															return(df7)
														})

n_match <- sapply(uni_plus_split8 , nrow)
uni_plus_split8_NOTsolved <- uni_plus_split8[ n_match!=1 ]
uni_plus_split8_solved <- as.data.frame(rbindlist(uni_plus_split8[ n_match==1 ]))

message( paste( "We solved" , nrow(uni_plus_split8_solved) , "Genes") )
message( paste( "We didn't solve" , length(uni_plus_split8_NOTsolved) , "Genes") )


solvedGenes <- uni_plus_split8_solved$'Entrez'
solvedProteins <- uni_plus_split8_solved$UNIPROT
eid2cbpMatch[solvedGenes] <- solvedProteins
#save(eid2cbpMatch, file='FilteringPart_v3.Rdata')

rm(uni_plus_split8 , n_match)
gc()

############################## AGGREGATE THE RESULTS ###################################

eid2cbpMatch_df <- data.frame(Entrez=names(eid2cbpMatch) 
							, UNIPROT=unlist(eid2cbpMatch)
							,stringsAsFactors=FALSE)

	#Control
stopifnot( all( grepl("HUMAN" , eid2cbpMatch_df$UNIPROT) ) )


uniprot_unique_match <- merge(uni , eid2cbpMatch_df , by=c("Entrez" , "UNIPROT") , all.y=TRUE)
uniprot_unique_match$Gene_Symbol <- toupper(as.character(uniprot_unique_match$Gene_Symbol))

myFinalCols <- c("Gene_Symbol" , "Entrez" , "UNIPROT" , "Entry" , "HGNC" , "Approved_Name" , "Protein.name" ,"Chromosome" , "AMINO_SEQ")
myUni <- uniprot_unique_match[ , myFinalCols]
for( i in colnames(myUni)) {
	if(class(myUni[ , i])=="factor")
		myUni[ , i] <- as.character(myUni[ , i])
}
myUni$Entrez <- as.integer(myUni$Entrez)


save(myUni , file="myUni.RData" , compress="xz")



######################################################
#################### ALIAS CHECKER ###################
######################################################

#Official_Alias_list <- strsplit(as.character(uniprot_unique_match$Synonyms_and_Previous_Symbols) , split=", ")
#Official_Alias_list <- lapply(Official_Alias_list , toupper)
#names(Official_Alias_list) <- toupper( uniprot_unique_match$Gene_Symbol )

#Alias_Official_df <- lapply(1:length(Official_Alias_list) , function(x) {
#						df <- data.frame(Alias=toupper(Official_Alias_list[[x]]) 
#										, Official_Gene_Symbol=rep(names(Official_Alias_list)[x] , length(Official_Alias_list[[x]]))
#										)
#						return(df)
#								})
#Alias_Official_df <- as.data.frame( rbindlist(Alias_Official_df) )
#Alias_Official_df[ is.na(Alias_Official_df$Alias) , "Alias"] <- "NA"
#for(i in colnames(Alias_Official_df))
#	Alias_Official_df[ , i] <- as.character(Alias_Official_df[ , i])
#myAlias <- Alias_Official_df

#save(myAlias , file="myAlias.RData" , compress="xz")

Official_Alias_list <- strsplit(as.character(hugo_custom$Synonyms_and_Previous_Symbols) , split=", ")
Official_Alias_list <- lapply(Official_Alias_list , toupper)
names(Official_Alias_list) <- toupper( hugo_custom$Approved_Symbol )
locus_type <- hugo_custom$Locus_Type
locus_group <- hugo_custom$Locus_Group

Alias_Official_df <- lapply(1:length(Official_Alias_list) , function(x) {
						df <- data.frame(Alias=toupper(Official_Alias_list[[x]]) 
										, Official_Gene_Symbol=rep(names(Official_Alias_list)[x] 
																, length(Official_Alias_list[[x]]))
										, Locus_Group=rep(locus_group[x] 
																, length(Official_Alias_list[[x]]))
										, Locus_Type=rep(locus_type[x] 
																, length(Official_Alias_list[[x]]))
										)
						return(df)
								})
Alias_Official_df <- as.data.frame( rbindlist(Alias_Official_df) )
Alias_Official_df[ is.na(Alias_Official_df$Alias) , "Alias"] <- "NA"
for(i in colnames(Alias_Official_df))
	Alias_Official_df[ , i] <- as.character(Alias_Official_df[ , i])
myAlias <- Alias_Official_df
myAlias$MappedByLowMACA <- ifelse(as.character(myAlias$Official_Gene_Symbol) %in% as.character(myUni$Gene_Symbol) , "yes" , "no")

save(myAlias , file="myAlias.RData" , compress="xz")

##############################################################
################ PFAM DOWNLOAD AND PARSING ###################
##############################################################

	# general url of the PFAM ftp data repositories
root_repository <- "ftp://ftp.ebi.ac.uk/pub/databases/Pfam/releases/"

	#PFAM-A archive
if( url.exists(root_repository) ) {
	doc <- getURL(root_repository)
	doc_parsed <- unlist( strsplit( strsplit(doc , "\n")[[1]] , "\\s+") )
	pfam_versions <- grep("Pfam" , doc_parsed , value=TRUE , ignore.case=TRUE)
	pfam_versions <- as.numeric( gsub("Pfam" , "" , pfam_versions))
	latestPFAM <- paste("Pfam" , format( max(pfam_versions) , nsmall=1) , sep="")
	latestPFAM_repository <- paste(root_repository , latestPFAM , sep="")
	if( url.exists(latestPFAM_repository) ) {

		# Download and parse pfam entries description
		download.file( paste( latestPFAM_repository , "Pfam-A.clans.tsv.gz" , sep="/") , "Pfam-A.clans.tsv.gz")
		clans <- readLines("Pfam-A.clans.tsv.gz")
		clans <- lapply( strsplit(clans , "\t") , function(x) return(x[1:5]) )
		clans <- do.call("rbind" , clans)
		clans <- ifelse(clans=="\\N" , NA , clans)
		clans <- ifelse(clans=="No_clan" , NA , clans)
		colnames(clans) <- c("Pfam_ID" , "Clan_ID", "Clan_Name", "Pfam_Name", "Pfam_Description")
		clans <- as.data.frame(clans)

		# Download and parse human pfam-A db
		download.file( paste( latestPFAM_repository , "proteomes" , "9606.tsv.gz" , sep="/") , "9606.tsv.gz")
			#This file has 2 rows of useless explanation and the header is the third row. 
			#But I don't like their header so I created one on my own
		pfam <- readLines("9606.tsv.gz")
		pfam <- as.data.frame(do.call("rbind" ,  strsplit(pfam[4:length(pfam)] , split="\t") ))
		colnames(pfam) <- c("Entry"
							,"Alignment_Start"
							,"Alignment_End"
							,"Envelope_Start"
							,"Envelope_End"
							,"Pfam_ID"
							,"Pfam_Name"
							,"Type"
							,"Hmm_Start"
							,"Hmm_End"
							,"Hmm_Length"
							,"Bit_Score"
							,"E_Value"
							,"Clan_ID"
							)
		pfam$Clan_ID <- ifelse( pfam$Clan_ID=="No_clan" , NA , pfam$Clan_ID)

		pfam_merge <- merge(pfam , myUni , by="Entry" , all.x=TRUE)
		pfam_merge <- pfam_merge[ !is.na(pfam_merge$Gene_Symbol) , ]
		pfam_merge$Pfam_Fasta <- substring(pfam_merge$AMINO_SEQ , as.integer( as.character(pfam_merge$Envelope_Start) ) , as.integer(as.character(pfam_merge$Envelope_End)) )
		
		pfam_merge$UniProt_chosenByHGNC <- NULL
	}
}
for( i in colnames(clans)) {
	if(class(clans[ , i])=="factor")
		clans[ , i] <- as.character(clans[ , i])
}
for( i in colnames(pfam_merge)) {
	if(class(pfam_merge[ , i])=="factor")
		pfam_merge[ , i] <- as.character(pfam_merge[ , i])
}
colsmyPfam <- c("Entry"
				,"Envelope_Start"
				,"Envelope_End"
				,"Pfam_ID"
				,"Pfam_Name"
				,"Type"
				,"Clan_ID"
				,"Entrez"
				,"UNIPROT"
				,"Gene_Symbol"
				,"Pfam_Fasta"
				)
myPfam <- unique(pfam_merge[ , colsmyPfam])
## remove domains without a fasta sequence:
## they can be generated when domain boundaries are greater than the
## protein length (for example domiain boudaries are: 157-180 and 
## protein sequence is shorter than 157)
myPfam <- myPfam[myPfam$Pfam_Fasta != "", ]
myClans <- clans
save(myPfam , file="myPfam.RData" , compress="xz")
save(myClans , file="myClans.RData" , compress="xz")

file_to_remove <- c("9606.tsv.gz"
					,"hgnc_complete_set.txt.gz"
					,"HUMAN_9606_idmapping.dat.gz"
					,"HUMAN.fasta.gz"
					,"Pfam-A.clans.tsv.gz"
					,"uniprot_homo_reviewed.txt")

file.remove(file_to_remove)