Tuesday, June 9, 2015

Using biomaRt to match Affymetrix and Ensembl IDs

Lets say that you have two files that contain different gene/probe IDs and you want to match them.
First one e.g. contains Ensembl IDs, second Affy probe IDs in their first columns, respectively.

head M0_M1_M2
"ID"    "adj.P.Val"     "P.Value"       "F"     "Gene.symbol"   "Gene.title"
"1458381_at"    "6.98e-13"      "1.55e-17"      "4.82e+03"      "Clic5" "chloride intracellular channel 5"
"1455899_x_at"  "5.14e-12"      "2.28e-16"      "3.02e+03"      "Socs3" "suppressor of cytokine signaling 3"
"1422619_at"    "9.82e-12"      "7.82e-16"      "2.44e+03"      "Ppap2a"        "phosphatidic acid phosphatase type 2A"
"1433741_at"    "9.82e-12"      "8.95e-16"      "2.38e+03"      "Cd38"  "CD38 antigen"
"1434537_at"    "9.82e-12"      "1.09e-15"      "2.30e+03"      "Slco3a1"       "solute carrier organic anion transporter family, member 3a1"
"1424339_at"    "1.09e-11"      "1.44e-15"      "2.19e+03"      "Oasl1" "2'-5' oligoadenylate synthetase-like 1"
"1418652_at"    "1.62e-11"      "2.95e-15"      "1.93e+03"      "Cxcl9" "chemokine (C-X-C motif) ligand 9"
"1420549_at"    "1.62e-11"      "3.23e-15"      "1.90e+03"      "Gbp2b" "guanylate binding protein 2b"
"1448918_at"    "1.62e-11"      "3.73e-15"      "1.85e+03"      "Slco3a1"       "solute carrier organic anion transporter family, member 3a1"


head smc_chl
"ID"    "adj.P.Val"     "P.Value"       "t"     "B"     "logFC" "SPOT_ID"
"ENSMUSG00000029371_at" "5.25e-14"      "3.11e-18"      "214.094964"    "27.467113"     "9.0131515"     "ENSMUSG00000029371"
"ENSMUSG00000040026_at" "3.51e-12"      "4.16e-16"      "125.316488"    "25.629056"     "7.8254984"     "ENSMUSG00000040026"
"ENSMUSG00000029380_at" "2.89e-11"      "5.13e-15"      "95.181841"     "24.148872"     "5.1525411"     "ENSMUSG00000029380"
"ENSMUSG00000026822_at" "3.13e-11"      "7.43e-15"      "91.407928"     "23.899528"     "9.707142"      "ENSMUSG00000026822"
"ENSMUSG00000018920_at" "4.87e-11"      "1.45e-14"      "84.986439"     "23.431046"     "3.3194647"     "ENSMUSG00000018920"
"ENSMUSG00000021390_at" "5.17e-11"      "1.84e-14"      "-82.765402"    "23.254652"     "-5.7210961"    "ENSMUSG00000021390"
"ENSMUSG00000024030_at" "5.40e-11"      "2.24e-14"      "80.996784"     "23.10843"      "3.0392955"     "ENSMUSG00000024030"
"ENSMUSG00000019929_at" "5.46e-11"      "2.59e-14"      "79.724354"     "22.999927"     "5.0517855"     "ENSMUSG00000019929"
"ENSMUSG00000001349_at" "8.16e-11"      "4.41e-14"      "-75.214362"    "22.591543"     "-3.8921629"    "ENSMUSG00000001349"


If you need to extract and match identifiers from different databases it is best to do this via biomaRt Bioconductor package:

Install biomaRt:

source("http://bioconductor.org/biocLite.R") 
biocLite("biomaRt")

To extract mouse Affy IDs from Affymetrix GeneChip Mouse Genome 430 2.0 Array, and match this with Ensembl IDs use the following code. Note that Affy IDs are in Mart:ensemble Dataset:mmusculus_gene_ensembl Attributes:affy_mouse430_2, while Ensemble IDs are in Attributes:ensembl_gene_id. Note also that we pulled all the Affy IDs and Ensembl IDs from the first columns of the two previously defined files and used them as filters for getBM function.


library(biomaRt)
listMarts()

ensembl = useMart("ensembl",dataset="mmusculus_gene_ensembl")
listDatasets(ensembl)

affy_id = read.delim("M0_M1_M2", header =TRUE,quote= "\"", sep="\t")
affy_id_col1 = affy_id [,1]

ensembl_id = read.delim("smc_chl", header =TRUE,quote= "\"", sep="\t")
ensembl_id_col1 = ensembl_id [,1]

id_merge = getBM(attributes=c("affy_mouse430_2","ensembl_gene_id","external_gene_name"),filters="affy_mouse430_2",values=affy_id_col1,mart=ensembl)
head(id_merge, 50)


Output:

affy_mouse430_2    ensembl_gene_id external_gene_name
1        1432227_at ENSMUSG00000087193            Gm14820
2        1441786_at ENSMUSG00000063087            Gm10125
3        1420957_at ENSMUSG00000005871                Apc
4        1450056_at ENSMUSG00000005871                Apc
5        1435543_at ENSMUSG00000005871                Apc
6        1420956_at ENSMUSG00000005871                Apc
7        1428301_at ENSMUSG00000094811            Gm21103
8        1420614_at ENSMUSG00000031176             Dynlt3
9        1449929_at ENSMUSG00000031176             Dynlt3
10       1449928_at ENSMUSG00000031176             Dynlt3
11     1459854_s_at ENSMUSG00000031176             Dynlt3
12     1453241_a_at ENSMUSG00000092523            Gm18191
13     1415833_x_at ENSMUSG00000068243             Gm7079
14       1427542_at ENSMUSG00000038248               Sobp
15     1427382_a_at ENSMUSG00000039231            Suv39h1
16     1432236_a_at ENSMUSG00000039231            Suv39h1
17       1426107_at ENSMUSG00000051977              Prdm9
18     1450331_s_at ENSMUSG00000096707         Vmn2r-ps44
19     1436000_a_at ENSMUSG00000054115               Skp2
20     1437033_a_at ENSMUSG00000054115               Skp2
21       1418969_at ENSMUSG00000054115               Skp2
22     1460247_a_at ENSMUSG00000054115               Skp2
23     1449293_a_at ENSMUSG00000054115               Skp2
24       1425072_at ENSMUSG00000054115               Skp2
25     1415976_a_at ENSMUSG00000044434             Gm9791
26       1442926_at ENSMUSG00000102099      1700011B04Rik
27       1422182_at ENSMUSG00000017688              Hnf4g
28       1450518_at ENSMUSG00000017688              Hnf4g
29     1423696_a_at ENSMUSG00000021737              Psmd6
30       1423697_at ENSMUSG00000021737              Psmd6
31       1418815_at ENSMUSG00000024304               Cdh2
32       1449244_at ENSMUSG00000024304               Cdh2
33       1439307_at ENSMUSG00000024304               Cdh2
34       1422089_at ENSMUSG00000062524               Ncr1
35       1416335_at ENSMUSG00000059658            Gm16379
36       1439973_at ENSMUSG00000074365              Crxos
37       1460605_at ENSMUSG00000074365              Crxos
38       1427317_at ENSMUSG00000037262                Kin
39       1448167_at ENSMUSG00000020009             Ifngr1
40       1418190_at ENSMUSG00000002588               Pon1
41       1460281_at ENSMUSG00000029685              Asb15
42       1439836_at ENSMUSG00000029685              Asb15
43       1456774_at ENSMUSG00000040734           Ppp1r13l
44     1459592_a_at ENSMUSG00000040734           Ppp1r13l
45     1439530_a_at ENSMUSG00000040734           Ppp1r13l
46     1459593_x_at ENSMUSG00000040734           Ppp1r13l
47       1446394_at ENSMUSG00000025905              Oprk1
48       1451813_at ENSMUSG00000025905              Oprk1
49     1429993_s_at ENSMUSG00000033219             Gm9758
50     1435732_x_at ENSMUSG00000024121            Atp6v0c

No comments:

Post a Comment