Friday, November 4, 2016

Parallelize DESeq2 processing of RNA-Seq data with xargs. Code to perform multiple comparison of DE gene overlaps.

This is a code to automate the downstream comparisons of DE gene lists obtained using DESeq2. In this example, lists are located in various subfolders of the current folder.

Follow the code to calculate the number of DE genes in each subfolder, cut the first column with ReFSeq ID, save as separate file, remove quotations, use nested for loops to iterate through every combination of DE list pairs, compare each pair of DE gene lists using awk hash tables, save the overlap, calculate percentage of overlapping genes using the initial number of genes in each DE list in a pair.


#DESeq2 output file with ReFSeq IDs. In this example output files are located in multiple subfolders of the current folder
head ./CTR-TCF/DESEQ.RES.Ax1.Ex1.Annote_Filter.csv
"REFSEQ","baseMean","Ax1","Ex1","(X/Y)","LOG2(X/Y)","lfcSE","PVAL","PADJ","HGNC","ENTREZ"
"NM_001004366",52.8590278877529,6.79294520135589,1.80248640318681,45.9629650035973,5.52239996201874,1.4097928326663,8.95941020937956e-05,0.0164558142882275,"NM_001004366","NM_001004366"
"NM_001007223",41.7568221621828,8.16964446568062,1.80248640318681,118.282063619244,6.88608750868106,1.5958035927171,1.5951463982438e-05,0.0053387777564333,"NM_001007223","NM_001007223"
"NM_001008231",1696.15244217266,9.34709439325593,6.55127081394245,6.68546502082769,2.74102791333421,0.721750109768267,0.000146014310245597,0.0224400155776422,"NM_001008231","NM_001008231"
"NM_001025388",1817.64545598915,5.50851247905198,10.3459116079456,0.0362363048504379,-4.78642034382073,0.730277856835578,5.5923907091726e-11,1.16930170591509e-07,"NM_001025388","NM_001025388"
"NM_001033331",89.0716650432561,7.88692209047355,3.62921761184198,20.6510055475495,4.36814012656324,1.19391005598866,0.000253513276060988,0.0310419792744272,"NM_001033331","NM_001033331"
"NM_001037809",81.4745806890257,6.51501911066681,2.76040726966177,26.5682533972088,4.73163148274551,1.14953225287611,3.85276448555085e-05,0.0096669604860005,"NM_001037809","NM_001037809"
"NM_001037859",650.913051455931,11.2354640663983,7.20231733786188,13.4780131829126,3.75253593692515,0.949172065524538,7.7021632800311e-05,0.0152634580474406,"NM_001037859","NM_001037859"
"NM_001037865",39.3802742679229,7.4021164714241,1.80248640318681,60.2392280286425,5.9126313767434,1.51175340371297,9.18735384352608e-05,0.0166711730406441,"NM_001037865","NM_001037865"
"NM_001044751",23759.7730889387,10.6451432030245,14.122465637817,0.107058595986039,-3.22352745714107,0.869196041818117,0.000208382776310629,0.0275302894211788,"NM_001044751","NM_001044751"

#Calculate number of DE genes in each subfolder with xargs
find . -name "*Annote_Filter.csv" | xargs -I % sh -c 'wc -l %;'
1869 ./TCF-KDN/DESEQ.RES.Cx2.Ex1.Annote_Filter.csv
2926 ./HRT-LVR/DESEQ.RES.Bx2.Dx2.Annote_Filter.csv
157 ./CTR-TCF/DESEQ.RES.Ax1.Ex1.Annote_Filter.csv
384 ./TCF-HRT/DESEQ.RES.Bx2.Ex1.Annote_Filter.csv
2526 ./CTR-LVR/DESEQ.RES.Ax1.Dx2.Annote_Filter.csv
2515 ./HRT-KDN/DESEQ.RES.Bx2.Cx2.Annote_Filter.csv
1826 ./KDN-LVR/DESEQ.RES.Cx2.Dx2.Annote_Filter.csv
2053 ./TCF-LVR/DESEQ.RES.Dx2.Ex1.Annote_Filter.csv
175 ./CTR-HRT/DESEQ.RES.Ax1.Bx2.Annote_Filter.csv
2170 ./CTR-KDN/DESEQ.RES.Ax1.Cx2.Annote_Filter.csv

#Cut the first column with RefSeq ID in each DE gene list
find . -name "*Annote_Filter.csv" | xargs -I % sh -c 'cut -f1 -d "," % | head;'
"REFSEQ"
"NM_001001180"
"NM_001001444"
"NM_001001488"
"NM_001001489"
"NM_001001490"
"NM_001001979"
"NM_001001986"
"NM_001004141"
"NM_001004143"
"REFSEQ"
"NM_001001179"
"NM_001001309"
"NM_001001322"
"NM_001001892"
"NM_001001979"
"NM_001002012"
"NM_001002927"
"NM_001003719"
"NM_001003893"
"REFSEQ"
"NM_001004366"
"NM_001007223"
"NM_001008231"
"NM_001025388"
"NM_001033331"
"NM_001037809"
"NM_001037859"
"NM_001037865"
"NM_001044751"

...

#Cut first column on each DE list, save as separate file, filter quotations
find . -name "*Annote_Filter.csv" | xargs -I % sh -c 'cut -f1 -d "," % > %.cut;'

find . -name "*Annote_Filter.csv.cut" | xargs -I % sed -i 's/"//g' %

find . -name "*Annote_Filter.csv.cut" | xargs -I % sh -c 'head %;'
REFSEQ
NM_001001180
NM_001001444
NM_001001488
NM_001001489
NM_001001490
NM_001001979
NM_001001986
NM_001004141
NM_001004143
REFSEQ
NM_001001179
NM_001001309
NM_001001322
NM_001001892
NM_001001979
NM_001002012
NM_001002927
NM_001003719
NM_001003893
REFSEQ
NM_001004366
NM_001007223
NM_001008231
NM_001025388
NM_001033331
NM_001037809
NM_001037859
NM_001037865
NM_001044751

...

#List .cut files and save as file
find . -name "*Annote_Filter.csv.cut" > list

#Remove end of lines and substitute with space
tr '\n' ' ' < list > list2

mv list2 list

#Create two for loops and list through all combination of DE list pairs
#Compare each pair with awk, create hash tag from first file and use it to search second file
#Save comparison as .intersect file
#Count the number of line in each .intersect file

for i in $(cat list)
do for j in $(cat list)
do
awk 'NR==FNR {h[$1] = $1; next} {if (h[$1]) print $0}' $i $j > $(basename $(dirname $i)).$(basename $(dirname $j)).intersect
echo Total number of overlaps $(basename $(dirname $i)) - $(basename $(dirname $j)) $(wc -l $(basename $(dirname $i)).$(basename $(dirname $j)).intersect | cut -d " " -f1)
done
done

Total number of overlaps TCF-KDN - TCF-KDN 1869
Total number of overlaps TCF-KDN - HRT-LVR 933
Total number of overlaps TCF-KDN - CTR-TCF 71
Total number of overlaps TCF-KDN - TCF-HRT 187
Total number of overlaps TCF-KDN - CTR-LVR 887
Total number of overlaps TCF-KDN - HRT-KDN 1413
Total number of overlaps TCF-KDN - KDN-LVR 805
Total number of overlaps TCF-KDN - TCF-LVR 1077
Total number of overlaps TCF-KDN - CTR-HRT 84
Total number of overlaps TCF-KDN - CTR-KDN 1247
Total number of overlaps HRT-LVR - TCF-KDN 933
Total number of overlaps HRT-LVR - HRT-LVR 2926
Total number of overlaps HRT-LVR - CTR-TCF 76
Total number of overlaps HRT-LVR - TCF-HRT 221
Total number of overlaps HRT-LVR - CTR-LVR 1878
Total number of overlaps HRT-LVR - HRT-KDN 1411
Total number of overlaps HRT-LVR - KDN-LVR 1026
Total number of overlaps HRT-LVR - TCF-LVR 1589

...

#Create two nested for loops, iterate through all pairs of DE lists
#Calculate percent of overlapping genes using initial number of genes in each DE list in a pair

for i in $(cat list)
do for j in $(cat list)
do
echo Percent of overlaps $(basename $(dirname $i)) - $(basename $(dirname $j)) $(wc -l $(basename $(dirname $i)).$(basename $(dirname $j)).intersect | cut -d " " -f1)/$(wc -l $i | cut -d " " -f1) $(wc -l $(basename $(dirname $i)).$(basename $(dirname $j)).intersect | cut -d " " -f1)/$(wc -l $j |  cut -d " " -f1);
echo $(basename $(dirname $i))
echo "scale=4;$(wc -l $(basename $(dirname $i)).$(basename $(dirname $j)).intersect | cut -d " " -f1)/$(wc -l $i | cut -d " " -f1)" | bc
echo $(basename $(dirname $j))
echo "scale=4;$(wc -l $(basename $(dirname $i)).$(basename $(dirname $j)).intersect | cut -d " " -f1)/$(wc -l $j | cut -d " " -f1)" | bc
done
done

Percent of overlaps TCF-KDN - TCF-KDN 1869/1869 1869/1869
TCF-KDN
1.0000
TCF-KDN
1.0000
Percent of overlaps TCF-KDN - HRT-LVR 933/1869 933/2926
TCF-KDN
.4991
HRT-LVR
.3188
Percent of overlaps TCF-KDN - CTR-TCF 71/1869 71/157
TCF-KDN
.0379
CTR-TCF
.4522
Percent of overlaps TCF-KDN - TCF-HRT 187/1869 187/384
TCF-KDN
.1000
TCF-HRT
.4869
Percent of overlaps TCF-KDN - CTR-LVR 887/1869 887/2526
TCF-KDN
.4745
CTR-LVR
.3511
Percent of overlaps TCF-KDN - HRT-KDN 1413/1869 1413/2515
TCF-KDN
.7560
HRT-KDN
.5618

...

#Save output as a file

for i in $(cat list)
do for j in $(cat list)
do
echo Percent of overlaps $(basename $(dirname $i)) - $(basename $(dirname $j)) $(wc -l $(basename $(dirname $i)).$(basename $(dirname $j)).intersect | cut -d " " -f1)/$(wc -l $i | cut -d " " -f1) $(wc -l $(basename $(dirname $i)).$(basename $(dirname $j)).intersect | cut -d " " -f1)/$(wc -l $j |  cut -d " " -f1) >> output.txt 
echo $(basename $(dirname $i)) >> output.txt
echo "scale=4;$(wc -l $(basename $(dirname $i)).$(basename $(dirname $j)).intersect | cut -d " " -f1)/$(wc -l $i | cut -d " " -f1)" | bc >> output.txt
echo $(basename $(dirname $j)) >> output.txt
echo "scale=4;$(wc -l $(basename $(dirname $i)).$(basename $(dirname $j)).intersect | cut -d " " -f1)/$(wc -l $j | cut -d " " -f1)" | bc >> output.txt
done
done


No comments:

Post a Comment