Saturday, October 3, 2015

Some examples of file processing using awk scripting

Let's say that you have a file with a list of commands that you need to modify, and you don't want to edit each line separately. The example here is a list of commands for kallisto, a tool for RNA-Seq quantification, that you want to convert from the form 'kallisto quant -i index -o folder path/to/fastq_file' to the form 'kallisto h5dump -o folder path/to/h5file'.

mpjanic@valkyr:~/HCASMC_RNASeq$ cat commands_kallisto3
kallisto quant -i GENCODE_transcripts -o 1522_6hr_TGFB_1 --single -l 180 -s 18 /home/clint/RNAseq/HCASMC_1522_6hr_TGFB/Raw/RNA.IlluminaHiSeq2500.PolyA/HCASMC_1522_6hr_TGFB_ATGTCA_L008_R1_001.C5NHAACXX.fastq.gz
kallisto quant -i GENCODE_transcripts -o 1522_6hr_TGFB_2 --single -l 180 -s 18 /home/clint/RNAseq/HCASMC_1522_6hr_TGFB/Raw/RNA.IlluminaHiSeq2500.PolyA/HCASMC_1522_6hr_TGFB_ATGTCA_L007_R1_001.C5NHAACXX.fastq.gz
kallisto quant -i GENCODE_transcripts -o 1522_1hr_PDGF_1 --single -l 180 -s 18 /home/clint/RNAseq/HCASMC_1522_1hr_PDGF/Raw/RNA.IlluminaHiSeq2500.PolyA/HCASMC_1522_1hr_PDGF_CAAAAG_L004_R1_001.C5NHAACXX.fastq.gz
kallisto quant -i GENCODE_transcripts -o 1522_6hr_TNF_1 --single -l 180 -s 18 /home/clint/RNAseq/HCASMC_1522_6hr_TNF/Raw/RNA.IlluminaHiSeq2500.PolyA/HCASMC_1522_6hr_TNF_CACCGG_L001_R1_001.C5NHAACXX.fastq.gz
kallisto quant -i GENCODE_transcripts -o 1522_6hr_TNF_2 --single -l 180 -s 18 /home/clint/RNAseq/HCASMC_1522_6hr_TNF/Raw/RNA.IlluminaHiSeq2500.PolyA/HCASMC_1522_6hr_TNF_CACCGG_L002_R1_001.C5NHAACXX.fastq.gz
kallisto quant -i GENCODE_transcripts -o 2989_1hr_PMA --single -l 180 -s 18 /home/clint/RNAseq/HCASMC_2989_1hr_PMA/Raw/RNA.IlluminaHiSeq2500.PolyA/HCASMC_2989_1hr_PMA_CGATGT_L004_R1_001.C5NHAACXX.fastq.gz
kallisto quant -i GENCODE_transcripts -o 2989_1hr_TGFB_1 --single -l 180 -s 18 /home/clint/RNAseq/HCASMC_2989_1hr_TGFB/Raw/RNA.IlluminaHiSeq2500.PolyA/HCASMC_2989_1hr_TGFB_CTTGTA_L005_R1_001.C5NHAACXX.fastq.gz
kallisto quant -i GENCODE_transcripts -o 2989_1hr_TGFB_2 --single -l 180 -s 18 /home/clint/RNAseq/HCASMC_2989_1hr_TGFB/Raw/RNA.IlluminaHiSeq2500.PolyA/HCASMC_2989_1hr_TGFB_CTTGTA_L006_R1_001.C5NHAACXX.fastq.gz
kallisto quant -i GENCODE_transcripts -o 2989_6hr_SF_1 --single -l 180 -s 18 /home/clint/RNAseq/HCASMC_2989_6hr_SF/Raw/RNA.IlluminaHiSeq2500.PolyA/HCASMC_2989_6hr_SF_TCGGCA_L003_R1_001.C5NHAACXX.fastq
kallisto quant -i GENCODE_transcripts -o 2989_6hr_PDGF_1 --single -l 180 -s 18 /home/clint/RNAseq/HCASMC_2989_6hr_PDGF/Raw/RNA.IlluminaHiSeq2500.PolyA/HCASMC_2989_6hr_PDGF_TAATCG_L007_R1_001.C5NHAACXX.fastq.gz
kallisto quant -i GENCODE_transcripts -o 2989_6hr_PDGF_2 --single -l 180 -s 18 /home/clint/RNAseq/HCASMC_2989_6hr_PDGF/Raw/RNA.IlluminaHiSeq2500.PolyA/HCASMC_2989_6hr_PDGF_TAATCG_L008_R1_001.C5NHAACXX.fastq.gz
kallisto quant -i GENCODE_transcripts -o 2989_6hr_TNF_1 --single -l 180 -s 18 /home/clint/RNAseq/HCASMC_2989_6hr_TNF/Raw/RNA.IlluminaHiSeq2500.PolyA/HCASMC_2989_6hr_TNF_GTCCGC_L001_R1_001.C5NHAACXX.fastq.gz
kallisto quant -i GENCODE_transcripts -o 2989_6hr_TNF_2 --single -l 180 -s 18 /home/clint/RNAseq/HCASMC_2989_6hr_TNF/Raw/RNA.IlluminaHiSeq2500.PolyA/HCASMC_2989_6hr_TNF_GTCCGC_L002_R1_001.C5NHAACXX.fastq.gz
kallisto quant -i GENCODE_transcripts -o 1522_6hr_SF_1 --single -l 180 -s 18 /home/clint/RNAseq/HCASMC_1522_6hr_SF/Raw/RNA.IlluminaHiSeq2500.PolyA/HCASMC_1522_6hr_SF_ACTGAT_L002_R1_001.C5NHAACXX.fastq.gz
kallisto quant -i GENCODE_transcripts -o 1522_6hr_SF_2 --single -l 180 -s 18 /home/clint/RNAseq/HCASMC_1522_6hr_SF/Raw/RNA.IlluminaHiSeq2500.PolyA/HCASMC_1522_6hr_SF_ACTGAT_L001_R1_001.C5NHAACXX.fastq.gz
kallisto quant -i GENCODE_transcripts -o 1522_1hr_TGFB_1 --single -l 180 -s 18 /home/clint/RNAseq/HCASMC_1522_1hr_TGFB/Raw/RNA.IlluminaHiSeq2500.PolyA/HCASMC_1522_1hr_TGFB_GAGTGG_L006_R1_001.C5NHAACXX.fastq.gz
kallisto quant -i GENCODE_transcripts -o 1522_1hr_TGFB_2 --single -l 180 -s 18 /home/clint/RNAseq/HCASMC_1522_1hr_TGFB/Raw/RNA.IlluminaHiSeq2500.PolyA/HCASMC_1522_1hr_TGFB_GAGTGG_L005_R1_001.C5NHAACXX.fastq.gz
kallisto quant -i GENCODE_transcripts -o 2989_6hr_TGFB_1 --single -l 180 -s 18 /home/clint/RNAseq/HCASMC_2989_6hr_TGFB/Raw/RNA.IlluminaHiSeq2500.PolyA/HCASMC_2989_6hr_TGFB_CGGAAT_L007_R1_001.C5NHAACXX.fastq.gz
kallisto quant -i GENCODE_transcripts -o 2989_6hr_TGFB_2 --single -l 180 -s 18 /home/clint/RNAseq/HCASMC_2989_6hr_TGFB/Raw/RNA.IlluminaHiSeq2500.PolyA/HCASMC_2989_6hr_TGFB_CGGAAT_L008_R1_001.C5NHAACXX.fastq.gz
kallisto quant -i GENCODE_transcripts -o 1522_1hr_PMA_1 --single -l 180 -s 18 /home/clint/RNAseq/HCASMC_1522_1hr_PMA/Raw/RNA.IlluminaHiSeq2500.PolyA/HCASMC_1522_1hr_PMA_GTGGCC_L003_R1_001.C5NHAACXX.fastq.gz
kallisto quant -i GENCODE_transcripts -o 2989_1hr_PDGF_1 --single -l 180 -s 18 /home/clint/RNAseq/HCASMC_2989_1hr_PDGF/Raw/RNA.IlluminaHiSeq2500.PolyA/HCASMC_2989_1hr_PDGF_GATCAG_L005_R1_001.C5NHAACXX.fastq.gz
kallisto quant -i GENCODE_transcripts -o 2989_1hr_PDGF_2 --single -l 180 -s 18 /home/clint/RNAseq/HCASMC_2989_1hr_PDGF/Raw/RNA.IlluminaHiSeq2500.PolyA/HCASMC_2989_1hr_PDGF_GATCAG_L006_R1_001.C5NHAACXX.fastq.gz
kallisto quant -i GENCODE_transcripts -o 1522_6hr_PDGF_1 --single -l 180 -s 18 /home/clint/RNAseq/HCASMC_1522_6hr_PDGF/Raw/RNA.IlluminaHiSeq2500.PolyA/HCASMC_1522_6hr_PDGF_TACAGC_L008_R1_001.C5NHAACXX.fastq.gz
kallisto quant -i GENCODE_transcripts -o 1522_6hr_PDGF_2 --single -l 180 -s 18 /home/clint/RNAseq/HCASMC_1522_6hr_PDGF/Raw/RNA.IlluminaHiSeq2500.PolyA/HCASMC_1522_6hr_PDGF_TACAGC_L007_R1_001.C5NHAACXX.fastq.gz
kallisto quant -i GENCODE_transcripts -o 1522_1hr_TNF_1 --single -l 180 -s 18 /home/clint/RNAseq/HCASMC_1522_1hr_TNF/Raw/RNA.IlluminaHiSeq2500.PolyA/HCASMC_1522_1hr_TNF_CTATAC_L005_R1_001.C5NHAACXX.fastq.gz
kallisto quant -i GENCODE_transcripts -o 1522_1hr_TNF_2 --single -l 180 -s 18 /home/clint/RNAseq/HCASMC_1522_1hr_TNF/Raw/RNA.IlluminaHiSeq2500.PolyA/HCASMC_1522_1hr_TNF_CTATAC_L006_R1_001.C5NHAACXX.fastq.gz

You want to change 'kallisto quant -i GENCODE_transcripts' to 'kallisto h5dump' using the sed command. At the same time, pipe the output of sed to cut and keep only the first four fields, using ' ' (a single space) as the delimiter.

sed 's/quant\ -i\ GENCODE_transcripts/h5dump/' commands_kallisto3 | cut -f 1-4 -d ' ' > commands_kallisto3b


mpjanic@valkyr:~/HCASMC_RNASeq$ cat commands_kallisto3b
kallisto h5dump -o 1522_6hr_TGFB_1
kallisto h5dump -o 1522_6hr_TGFB_2
kallisto h5dump -o 1522_1hr_PDGF_1
kallisto h5dump -o 1522_6hr_TNF_1
kallisto h5dump -o 1522_6hr_TNF_2
kallisto h5dump -o 2989_1hr_PMA
kallisto h5dump -o 2989_1hr_TGFB_1
kallisto h5dump -o 2989_1hr_TGFB_2
kallisto h5dump -o 2989_6hr_SF_1
kallisto h5dump -o 2989_6hr_PDGF_1
kallisto h5dump -o 2989_6hr_PDGF_2
kallisto h5dump -o 2989_6hr_TNF_1
kallisto h5dump -o 2989_6hr_TNF_2
kallisto h5dump -o 1522_6hr_SF_1
kallisto h5dump -o 1522_6hr_SF_2
kallisto h5dump -o 1522_1hr_TGFB_1
kallisto h5dump -o 1522_1hr_TGFB_2
kallisto h5dump -o 2989_6hr_TGFB_1
kallisto h5dump -o 2989_6hr_TGFB_2
kallisto h5dump -o 1522_1hr_PMA_1
kallisto h5dump -o 2989_1hr_PDGF_1
kallisto h5dump -o 2989_1hr_PDGF_2
kallisto h5dump -o 1522_6hr_PDGF_1
kallisto h5dump -o 1522_6hr_PDGF_2
kallisto h5dump -o 1522_1hr_TNF_1
kallisto h5dump -o 1522_1hr_TNF_2

Next, you need to duplicate the last column. Do this with awk, specifying the field separator FS=" ".

mpjanic@valkyr:~/HCASMC_RNASeq$ awk 'BEGIN { FS=" "; OFS=" " } { $4=$4 " " $4 } 1' commands_kallisto3b > commands_kallisto3c


mpjanic@valkyr:~/HCASMC_RNASeq$ cat commands_kallisto3c
kallisto h5dump -o 1522_6hr_TGFB_1 1522_6hr_TGFB_1
kallisto h5dump -o 1522_6hr_TGFB_2 1522_6hr_TGFB_2
kallisto h5dump -o 1522_1hr_PDGF_1 1522_1hr_PDGF_1
kallisto h5dump -o 1522_6hr_TNF_1 1522_6hr_TNF_1
kallisto h5dump -o 1522_6hr_TNF_2 1522_6hr_TNF_2
kallisto h5dump -o 2989_1hr_PMA 2989_1hr_PMA
kallisto h5dump -o 2989_1hr_TGFB_1 2989_1hr_TGFB_1
kallisto h5dump -o 2989_1hr_TGFB_2 2989_1hr_TGFB_2
kallisto h5dump -o 2989_6hr_SF_1 2989_6hr_SF_1
kallisto h5dump -o 2989_6hr_PDGF_1 2989_6hr_PDGF_1
kallisto h5dump -o 2989_6hr_PDGF_2 2989_6hr_PDGF_2
kallisto h5dump -o 2989_6hr_TNF_1 2989_6hr_TNF_1
kallisto h5dump -o 2989_6hr_TNF_2 2989_6hr_TNF_2
kallisto h5dump -o 1522_6hr_SF_1 1522_6hr_SF_1
kallisto h5dump -o 1522_6hr_SF_2 1522_6hr_SF_2
kallisto h5dump -o 1522_1hr_TGFB_1 1522_1hr_TGFB_1
kallisto h5dump -o 1522_1hr_TGFB_2 1522_1hr_TGFB_2
kallisto h5dump -o 2989_6hr_TGFB_1 2989_6hr_TGFB_1
kallisto h5dump -o 2989_6hr_TGFB_2 2989_6hr_TGFB_2
kallisto h5dump -o 1522_1hr_PMA_1 1522_1hr_PMA_1
kallisto h5dump -o 2989_1hr_PDGF_1 2989_1hr_PDGF_1
kallisto h5dump -o 2989_1hr_PDGF_2 2989_1hr_PDGF_2
kallisto h5dump -o 1522_6hr_PDGF_1 1522_6hr_PDGF_1
kallisto h5dump -o 1522_6hr_PDGF_2 1522_6hr_PDGF_2
kallisto h5dump -o 1522_1hr_TNF_1 1522_1hr_TNF_1
kallisto h5dump -o 1522_1hr_TNF_2 1522_1hr_TNF_2


Next, you need to append text to the last column. Do this with awk, again specifying FS=" ".

mpjanic@valkyr:~/HCASMC_RNASeq$ awk 'BEGIN { FS=" "; OFS = " " } { $5 = $5"/abundance.h5"; print }' commands_kallisto3c > commands_kallisto3d


mpjanic@valkyr:~/HCASMC_RNASeq$ cat commands_kallisto3d
kallisto h5dump -o 1522_6hr_TGFB_1 1522_6hr_TGFB_1/abundance.h5
kallisto h5dump -o 1522_6hr_TGFB_2 1522_6hr_TGFB_2/abundance.h5
kallisto h5dump -o 1522_1hr_PDGF_1 1522_1hr_PDGF_1/abundance.h5
kallisto h5dump -o 1522_6hr_TNF_1 1522_6hr_TNF_1/abundance.h5
kallisto h5dump -o 1522_6hr_TNF_2 1522_6hr_TNF_2/abundance.h5
kallisto h5dump -o 2989_1hr_PMA 2989_1hr_PMA/abundance.h5
kallisto h5dump -o 2989_1hr_TGFB_1 2989_1hr_TGFB_1/abundance.h5
kallisto h5dump -o 2989_1hr_TGFB_2 2989_1hr_TGFB_2/abundance.h5
kallisto h5dump -o 2989_6hr_SF_1 2989_6hr_SF_1/abundance.h5
kallisto h5dump -o 2989_6hr_PDGF_1 2989_6hr_PDGF_1/abundance.h5
kallisto h5dump -o 2989_6hr_PDGF_2 2989_6hr_PDGF_2/abundance.h5
kallisto h5dump -o 2989_6hr_TNF_1 2989_6hr_TNF_1/abundance.h5
kallisto h5dump -o 2989_6hr_TNF_2 2989_6hr_TNF_2/abundance.h5
kallisto h5dump -o 1522_6hr_SF_1 1522_6hr_SF_1/abundance.h5
kallisto h5dump -o 1522_6hr_SF_2 1522_6hr_SF_2/abundance.h5
kallisto h5dump -o 1522_1hr_TGFB_1 1522_1hr_TGFB_1/abundance.h5
kallisto h5dump -o 1522_1hr_TGFB_2 1522_1hr_TGFB_2/abundance.h5
kallisto h5dump -o 2989_6hr_TGFB_1 2989_6hr_TGFB_1/abundance.h5
kallisto h5dump -o 2989_6hr_TGFB_2 2989_6hr_TGFB_2/abundance.h5
kallisto h5dump -o 1522_1hr_PMA_1 1522_1hr_PMA_1/abundance.h5
kallisto h5dump -o 2989_1hr_PDGF_1 2989_1hr_PDGF_1/abundance.h5
kallisto h5dump -o 2989_1hr_PDGF_2 2989_1hr_PDGF_2/abundance.h5
kallisto h5dump -o 1522_6hr_PDGF_1 1522_6hr_PDGF_1/abundance.h5
kallisto h5dump -o 1522_6hr_PDGF_2 1522_6hr_PDGF_2/abundance.h5
kallisto h5dump -o 1522_1hr_TNF_1 1522_1hr_TNF_1/abundance.h5
kallisto h5dump -o 1522_1hr_TNF_2 1522_1hr_TNF_2/abundance.h5

The commands are now ready to be sourced.

No comments:

Post a Comment