问题描述
我下面有一个fastq文件,我想按lane(即以":"分隔的第二个字段,lane=$2)
拆分文件。我的代码完成了拆分工作,但我还希望输出文件名以$SM变量的值作为前缀。有人可以告诉我我的命令中缺少什么吗?
# Original attempt from the question, kept exactly as posted.
# The awk program is wrapped in single quotes, so the shell never expands
# ${SM}; awk receives the literal text "${SM}." as part of the file name,
# which is why the output files are named ${SM}.<lane>.fastq.
SM="sample1"
awk 'BEGIN {FS = ":"} {lane=$2 ; print > "${SM}."lane".fastq" ; for (i = 1; i <= 3; i++) {getline ; print > "${SM}."lane".fastq"}}' < File.fastq
File.fastq
@HS2000-1015_160:7:1108:13370:100570/2
CTTGACTGCCAGAGACGCTCCTTTGCAATGCCTTCCGGTAACCAAATTTTTGGGCACAACACACAGCTGGCCTTCATTTCTTCAGGGGCTGGTAAACAGA
+
@@@ADFFFHHHFD=EF@:GHIIFHH<ECHGF@DDBB:6@D60?F=888)8='--(=5@EAE5?'(..((.;?@>>A>3;@####################
@HS2000-1015_160:5:2306:10070:71746/2
GAACCTCAAGGACTATTGGGAGAGCGGCGAGTGGGCCATCATCAAAGCCCCAGGCTACAAACACGACATCAAGTACAACTGCTGCGAGGAGATCTACCCC
+
@CCFFFFDHGHFHIJJJJJJGGGIIJJIGHI@FHGIIGHHEFGHHFFFFFBCddddDCdddddddd;@BDCCDACDD@>ACCddddBDB<BA?C@CC@BD
@HS2000-1015_160:6:2116:4077:79041/2
GGTCCCCGCCTACGCCCACTGGGTTGGTGCACCTGGTGGTGGTGGCCGCCAAGAAGCTGGTGAACCGCCTCCAAGTGGCTCCCAAGACGCAGCTGGATGA
+
CCCFFFFFHHFHHJHJJJJJJJJGGHJHIGAGIIJIFHJ;@F;CHHFHFddddDCDDCDD9CCCDDBDDBBDDCDACDD8@BD3>?BCDBDDDACCDC@>
@HS2000-1015_160:5:2113:11446:94436/2
CGTCAGGGCCAACCCCGCCCCACCCTGACCCTACCTGGCACCCCTCACCTGTGGCCTGCCAGCACAGCCTCGCCCCTGCTGGCCAATGTGTCCCCCGTCA
+
?@@DA@DDFHH?DHI)<@@FHDBGGCHCBDH;DFA<)6.=7D;@CBCHD)).7@=>;?==AABC95<(5(5309@D########################
@HS2000-1015_160:6:2209:18284:44195/2
TAAAATGTCACAAAGCTGGAAACTCTTCCCTATCACAAACCAAAACTTAAAAGGACGTTACCTGGCTGGGTCTAAACTCCACATAACTCGCTTGCAGTTG
+
CCCFFFFEHHHGHJIIIJJIJJHIIJEHJJHIJJJIIJJIJIJJIJIIHJJIJGGHGHGIIHHIIIIHFH@DFFFDEEEECDDDCddddBDDBBDCDACC
@HS2000-1015_160:7:1215:18781:100685/2
ATAAAACAGTAAACAAAATAAAGTCAGTTTTTTTTTTTTTTTTTAAAGAACAAAATGAAACTTGAGGGAAAACTTCATGGAGTTACAGTTTATCCTGATA
+
CCCFFFFFHFHHFJJJJIIGIGI<CFHHIIJJJJJIJJHFDDD=ACC(38+9CB?:(>C(+:@>(4?05<?C?###########################
@HS2000-1015_160:6:1215:6292:43622/2
GGGTCCTGAGACCTGAGGGACCATTGGCCCTCTTCTGGCTTGCTTATCCTTTGTACCTGATGGCCAATGAATGTCAGAGATGGTCCTGTCTCCATCCAGT
+
BCCDFFFFHGHHHJJJIIJJJJJIJJJGIJIJJIHIIJJIEfheIJJJJIGIGIIIIIJHFHIJJJJIHGHEC?BCEFFFEECCCEACCCCddddDDCCC
@HS2000-1015_160:7:2311:1291:4696/2
GATCTGGTGCTCGTATTCCATCCACCTCCCAAGCTATACATAATAACGGCCAAAGGACCTGGATGAAAGTGTCTGAAGCAGTTGTGTGTGTCTCACCTTC
+
?=?ABDDBCFDFHGGHBFCHHGD@GFDGCBDFGFFECCHHD@DDFHJEIIHGG3CE9C(7@E(.7=?;;@C?@ECA>@C3A(;A-5595<9:AC3@AC:A
@HS2000-1015_160:7:1205:18979:53766/2
TCTTGTTTTGACCAATAGTAAAGCACATTTCTCTAATTTGGATTTCTACAATATCCATATCTTGGTTTATGAAAGGTAGGGAAGAGACTTCAGGTACTGC
+
CCCFFDFFHHHHHIJIJJJIHIJHJJIJJJJIJIGIIIJJJJJHJIJJIJDHIJIIIIIJJJJIJGIJJJIIIGEEGCD@AHHFFEDFFCddddCCDD@C
@HS2000-1015_160:7:1205:5641:24287/2
ATAAGAAGGGAAGAATGATTAGGTGTCAAATGTTCTTTTTATTTTCTTTCAGTTCAATGCAAAAACTTTCCAGTGATTATGTAAATGCAGAATCATGTGG
+
CCCFFFFFHHHGHJIJJFJJGIGEHEHIJJJJGJGJJIJJJJJJJJJJJIJIIIJJJIJJIEHGIHGJJJJIGGGHIIIIEEEHCHHC>DFBEEA@CCCC
@HS2000-1015_160:7:1310:19879:73973/2
TTCTTGAGTTCTGATACCTGTTTCCACAATCGTTTCTGTTTCTGTTGTCTCCAGCCCATCCATGCTGTCCTCATCTTCCACTGCAGTTTTCACCCTACTT
+
@<@FFFDFHHH>FGGIJAEFHABHHIAGHAE=F@EF?FB@F:F<GGBGEHGGG9F=BGAGIIIHH;=.=CHG@CEHE3)7?=>)7@C>)(6(.6;A?ACC
@HS2000-1015_160:7:1215:4243:29984/2
ATCTACACCCAAAACAGAACTTTCACAAAAAAACTGTTGATACGAAGCTCATGAAAATCATGATGAATACTCCAACAATTAATGAATAAAACTATACAAT
+
;@@A;D;ADDFHFIIF3EG@A>ACEHE>EH=:DH@<9DB@F?B7C87'@)=)7@>@7==)7...).;?@C)6;((;(5;(>A:(:3;@3>:@>:@(4@::
@HS2000-1015_160:7:1314:6987:62989/2
ATAGCTGTCTGTTCAGAGTCTGATGTTTTCAGTAACACTCTTGATACATTAAGTGAGATAGAATGGAATCCAGCAACAAAGCTACTAAATCAGGTAACTT
+
C@CFFFFDHHHHHJIJJJBHHIIIIHJIJHGJIJJIEHGHJJIJJJJJJJJIGBGHHIJGHGIIHJJIJIIJIGIGHIGGGCHHHHBEFCCEFE>CCEEE
@HS2000-1015_160:6:1208:20370:97766/2
TTTACTTTTTCCCAAACAATAATGATGATAATGTGGCCATACTGGTGCATGAGGGCTCTTATTAAGGATAGGGGCCATGTCAGGCTCTATTGACTCCTAT
+
CCCFFFFFDHDFHJJJIJJJIIJGHJJJIIIIGHIJJIJJJIJIHIJJIIHGHIFHIFHJGIJJIJJJJJJJJHHHFFFFFEEEEEDDCDEddddDDCDD
@HS2000-1015_160:6:1108:20693:2521/2
CCCATTTTCTGATGAGGAAACAGGATCAGGGACATTGAGACCTACCAAAGTTACATAATACCAGTAGTAGAAATGGGACTTCAACACAGGCCTCTTGACT
+
7@@ddddDHHHBDIGIB@F?A+AF@3+2AFE@1:BFE??HH6?BG9BD99??F49BC=88=:;F8=77/@EH=EHF9)=A>C>7?;(6@???C?>@####
@HS2000-1015_160:6:1206:11472:64908/2
AGTTTGTTGGACATTTGAGACCCCAGGAAATCCCCTTTCTCGTAACGTTCTCCGCTTGGATCTGATCTCAACAGGGTGTCGTAGTCATTCTTCAGCACAA
+
B@BDFFFFHHHHHIJGIIJIJJIJJJJGEGHHIJJJJJJIJIFFHIIHCHHIJJJGIIJH:CHHFFFFFFFEEEDD=@BDDDAB@DCddddDDD>CCB<?
@HS2000-1015_160:7:1114:4995:49287/2
CCTCCGCTCAGCACTGGCATTGGCATCGGTTTCTATGGCAACAGTGAGACCAGTGATGGGGTGTCCCAGCTCAGCTCTGCGCTGCTGCACGCCAACCACA
+
BCCFDFFFHHHHHJJJJJGHEIIJHIGIIFGHGIIIGHEHIIJJDHIJJJJJJEGIGGIDE:?BCEEAE@CCDCDDCddddDDDBCCDDD85?9BB@BDD
@HS2000-1015_160:7:1206:16723:26612/2
TTAGATATGCTGTATGTGAAGAAGAGGAGGTTAAAGAACACTGTTTTATGTAAATGTCTCATTCCTTATCCTACAGAAATTGCATTTTTAATTAAATCTT
+
BC@FFFFFHHHHHICIGGHEIGJJIJIEGHGHIJJGGIIIIJIFGIJJIIJIIIJJIIJJJJJIHHGJJGIIIIGIIIHIIFHGHFADFFFDFDE(;@CE
@HS2000-1015_160:5:2101:1745:52266/2
CCCCAGAATTCTCTTGTTTTTTCCTTGGTGATCCAGGAAAACGAAGCCCCCTCCTGTATTGACAGCTGGGAATTGTGGAGTCCACCGTCCTCCACCTGAG
+
C@CFFFFFHHHHHJIJJIJJJJJIIICHCEGIIIEHGIIHIJIGGGIJCHGIHHHGEFHHHGHEEFFDEDAC?CDDCDCD>95>:,99@DCC?<AB9AC
我得到的结果文件名:
${SM}.5.fastq
${SM}.6.fastq
${SM}.7.fastq
我想要的结果文件名:
sample1.5.fastq
sample1.6.fastq
sample1.7.fastq
解决方法
编辑: 根据OP在评论中的要求添加了改进的解决方案,其中包括更改输出文件名。
SM="sample1"
# Split File.fastq into one file per run and lane: <SM>.<run>.<lane>.fastq
# The shell variable is handed to awk with -v because ${SM} written inside
# the single-quoted awk program would stay literal.
# NOTE: output is appended (>>), so remove stale ${SM}.*.fastq files before
# running this again, otherwise the reads are duplicated.
awk -v sm="$SM" '
BEGIN { FS = ":" }
# A FASTQ record is four lines long, so the header is line 1, 5, 9, ...
# Matching on /^@HS/ instead would also fire on a quality line that
# happens to begin with "@HS".
FNR % 4 == 1 {
  split($1, arr, "_")
  sub(/^@[a-zA-Z]+/, "", arr[1])      # "@HS2000-1015" -> "2000-1015"
  target = sm "." arr[1] "." $2 ".fastq"
  # Close the previous file only when the destination really changes;
  # this keeps a single descriptor open without reopening per record.
  if (target != outputFile) {
    if (outputFile != "") close(outputFile)
    outputFile = target
  }
}
{ print >> (outputFile) }' File.fastq
修复OP的尝试: 请尝试以下操作。它使用了我在评论中分享的链接里介绍的 -v awk_var_name="$shell_var" 写法把shell变量传给awk,
同时还修正了您代码中的其他几处问题。
SM="sample1"
# Repaired version of the getline-based attempt: one output file per lane,
# named <SM>.<lane>.fastq.
# NOTE: output is appended (>>), so remove stale ${SM}.*.fastq files before
# running this again, otherwise the reads are duplicated.
awk -v sm="$SM" '
BEGIN { FS = ":" }
{
  # The current line is a record header; field 2 is the lane.
  target = sm "." $2 ".fastq"
  # Close only when switching to another file, and always write with ">>".
  # Closing a file and reopening it with ">" truncates it, which would
  # leave only the last record of each lane in the output.
  if (target != outputFile) {
    if (outputFile != "") close(outputFile)
    outputFile = target
  }
  print >> (outputFile)
  # Copy the remaining three lines of the record. Stop on EOF or a read
  # error so that a truncated final record does not re-print the old $0.
  for (i = 1; i <= 3; i++) {
    if ((getline) <= 0) break
    print >> (outputFile)
  }
}' File.fastq
修正了OP的尝试:
- 创建了outputFile变量来保存输出文件名,使代码更清晰。
- 使用
close
命令关闭输出文件,这样就不会遇到"too many open files"(打开的文件过多)错误。
- 专家一般不推荐使用
getline
,因此更好的做法是改为通过类似FNR%4==0
这样的行号检查来判断记录边界。
理想的方法可能是:
SM="sample1"
awk -v sm="$SM" '
BEGIN{FS = ":"}
/^@HS/{
lane=$2
close(outputFile)
outputFile=sm count "."lane".fastq"
}
{
print >> (outputFile)
}' File.fastq
,
您的问题在于:${SM}
位于单引号(')
之内,shell不会对单引号中的内容做变量展开。
这是shell的设计行为。
一个简单粗暴的解决方案是,把脚本中所有出现${SM}
的地方替换为'${SM}'
(先结束单引号,让shell展开变量,再重新开始单引号):
SM="sample1"
# Quick-and-dirty variant: '"${SM}"' ends the single-quoted awk program,
# expands the variable inside double quotes (so spaces or glob characters in
# the value cannot split the command line), then resumes the single quotes.
# The value is still pasted into awk source code, so it must not contain
# a double quote or backslash; prefer awk -v for anything untrusted.
# The file name is built once in "out" so the redirection target is never an
# unparenthesised concatenation, and the loop stops if getline hits EOF.
awk 'BEGIN {FS = ":"} {out = "'"${SM}"'." $2 ".fastq" ; print > out ; for (i = 1; i <= 3; i++) {if ((getline) <= 0) break ; print > out}}' < File.fastq
通过这种方式,变量${SM}
会在shell把这个单行脚本交给awk之前被展开。
另一个选项:
将awk脚本写入文件,用-F
选项传入字段分隔符,并用-v
选项传入变量。如下:
script.awk
# script.awk -- copy each 4-line FASTQ record to <SM>.<lane>.fastq
# Expects SM to be set with -v and ":" to be the field separator, so that
# field 2 of the header line is the lane.
{
  lane = $2
  # Parenthesise the concatenation: "print > a b c" is parsed differently
  # by different awk implementations.
  print > (SM "." lane ".fastq")
  # Copy the other three lines of the record; stop on EOF or read error so
  # a truncated final record does not print the previous line again.
  for (i = 1; i <= 3; i++) {
    if ((getline) <= 0) break
    print > (SM "." lane ".fastq")
  }
}
运行script.awk
SM="sample1"
# The header fields are colon-separated, so the separator must be ":" for
# $2 to be the lane (with ";" the second field is empty).
# The -v assignment is quoted so a value with spaces stays one argument.
awk -F":" -v SM="${SM}" -f script.awk File.fastq
改进后的script.awk
# Improved script.awk -- copy each 4-line FASTQ record to <SM>.<lane>.fastq
# Expects SM to be set with -v and ":" to be the field separator.

# A FASTQ record is four lines long, so lines 1, 5, 9, ... are headers.
# Pick the output file there; field 2 of the header is the lane.
FNR % 4 == 1 { outFile = SM "." $2 ".fastq" }

# Every line, header included, goes to the file chosen by its header.
# This replaces the getline loop, so a truncated final record can no
# longer cause the previous line to be printed again.
{ print > (outFile) }