Sunday, April 10, 2016


This is a combined bash/R script that takes a file with human Ensembl gene IDs in its first column and prepends the matching gene name to each line, while keeping the structure of the file's other columns. Ensemble2genename sets the BioMart host explicitly, so it can be especially useful when the default biomaRt site is down.

# Ensemble2genename -- prepend gene names to a table keyed by human Ensembl
# gene IDs.
#
# Usage:   Ensemble2genename <input_file>
# Input:   <input_file>, whose first whitespace-separated column is an
#          Ensembl gene ID.
# Output:  <input_file>.genename -- every input line preceded by the gene
#          name and a single space (the name is empty when the ID is not in
#          the downloaded mapping).
# Needs:   Rscript with the biomaRt package, awk, mktemp, network access.
# Env:     ENSEMBL_HOST -- BioMart host to query. The original script left
#          the host empty; "www.ensembl.org" is assumed here as the default.
#          NOTE(review): confirm this is the intended host.
# Exit:    0 on success, 1 on a runtime failure, 2 on a usage error.

# Print an error message to stderr and abort.
die() {
  printf '%s: %s\n' "${0##*/}" "$*" >&2
  exit 1
}

if [ "$#" -lt 1 ]; then
  printf 'Usage: %s <input_file>\n' "${0##*/}" >&2
  exit 2
fi

input=$1
host=${ENSEMBL_HOST:-www.ensembl.org}

if [ ! -f "$input" ] || [ ! -r "$input" ]; then
  die "cannot read input file: $input"
fi

command -v Rscript >/dev/null 2>&1 \
  || die "Rscript not found in PATH (R with the biomaRt package is required)"

# Keep temporary files in a private directory instead of the current one, so
# that an existing script.r / id_merge.txt is never overwritten or deleted,
# and remove them on every exit path.
workdir=$(mktemp -d) || die "could not create a temporary directory"
trap 'rm -rf -- "$workdir"' EXIT

r_script=$workdir/fetch_gene_names.r
id_map=$workdir/id_merge.txt

# Write the R script. The quoted delimiter keeps the text literal, so "\t"
# reaches R unchanged and is interpreted there as a tab.
cat >"$r_script" <<'EOF'
# args[1] = BioMart host, args[2] = output path for the ID -> name table
suppressPackageStartupMessages(library(biomaRt))
args <- commandArgs(trailingOnly = TRUE)
ensembl <- useMart("ENSEMBL_MART_ENSEMBL", dataset = "hsapiens_gene_ensembl", host = args[1])
id_merge <- getBM(attributes = c("ensembl_gene_id", "external_gene_name"), mart = ensembl)
write.table(id_merge, file = args[2], sep = "\t", quote = FALSE, col.names = FALSE, row.names = FALSE)
EOF

# Run the R script (the original only made it executable and never ran it).
Rscript "$r_script" "$host" "$id_map" \
  || die "biomaRt query against '$host' failed"

# An empty mapping would make NR==FNR true for the input file as well and
# silently produce an empty result, so refuse to continue.
[ -s "$id_map" ] || die "no gene ID mapping was retrieved from '$host'"

# Use awk to prepend gene names. The first file fills the ID -> name table;
# the input is fed on stdin ("-") so that a file name containing "=" or a
# leading "-" is not misread by awk as an assignment or an option.
awk 'NR == FNR { name[$1] = $2; next } { print name[$1], $0 }' \
  "$id_map" - <"$input" >"$input.genename" \
  || die "failed to write $input.genename"

